In [1]:
import igraph as ig
from igraph import Graph
import matplotlib.pyplot as plt
import re
import pandas as pd

In [2]:
path = "amazon-meta.txt"
file = open(path, 'r', encoding="utf8")

In [3]:
products = {}

(Id, ASIN, title, group, salesrank, similar, categories, total_reviews, avg_rating) = ("", "", "", "", 0, [], "", 0, 0.0)

for line in file:
    line = line.strip()
    
    if (line.startswith("Id")):
        Id = line[3:].strip()
        #print(Id)
    elif (line.startswith("ASIN")):
        ASIN = line[5:].strip()
        #print(ASIN)
    elif (line.startswith("title")):
        title = line[6:].strip()
        #print(title)
    elif (line.startswith("group")):
        group = line[6:].strip()
        #print(group)
    elif (line.startswith("salesrank")):
        salesrank = line[10:].strip()
        #print(salesrank)
    elif (line.startswith("similar")):
        similar = line[10:].strip()
        similar = similar.split()
        #print(similar)
    elif (line.startswith("categories")):
        a = line.split()
        categories = ' '.join((file.readline()) for i in range(int(a[1].strip())))
        categories = ' '.join(word for word in categories.split("|"))
        categories = ' '.join(word for word in categories.split())
        pattern = r'\[[0-9]+\]\s?'
        categories = '#'.join(word for word in set(re.split(pattern, categories)))
        categories = categories.strip()
        #print(categories)
    elif (line.startswith("reviews")):
        b = line.split()
        total_reviews = b[2].strip()
        avg_rating = b[7].strip()
        #print(total_reviews, avg_rating)
    elif(line==""):
        metadata = {}
        if(ASIN!=""):
            products[ASIN] = metadata
            metadata['Id'] = Id
            metadata['title'] = title
            metadata['group'] = group
            metadata['salesrank'] = int(salesrank)
            metadata['similar'] = similar
            metadata['categories'] = categories
            metadata['total_reviews'] = int(total_reviews)
            metadata['avg_rating'] = float(avg_rating)
        (Id, ASIN, title, group, salesrank, similar, categories, total_reviews, avg_rating) = ("", "", "", "", 0, [], "", 0, 0.0)
        
file.close()

In [4]:
print(products["6305692610"])

{'Id': '54437', 'title': 'My Life so Far', 'group': 'DVD', 'salesrank': 9456, 'similar': ['B00008L3S0', 'B000056N8V', 'B000069I02', 'B00006JDVX', 'B00015YVI6'], 'categories': "#Amazon.com Stores#Drama#Amazon.com Outlet#Hudson, Hugh#By Theme#Childhood Drama#Mastrantonio, Mary Elizabeth#Directors#Art House & International#( H )#Specialty Stores#Children#Special Features#United Kingdom#DVD Outlet#Genres#Studio Specials#Harris, Rosemary#Miramax Home Video#Actors & Actresses#Comedy#Family Interaction#( F )#Firth, Colin#Coming of Age#Today's Deals in DVD#All Titles#Deals Under $15#By Country#Period Piece#Titles#Family Life#( M )#Foreign Spotlight#MacDonald, Kelly#DVD#Categories#McDowell, Malcolm#General", 'total_reviews': 16, 'avg_rating': 3.5}


In [5]:
df = pd.DataFrame.from_dict(products)
df = df.transpose()
df = df.rename(columns={'Id':'id'})
df['id'] = df.index.values
df.reset_index(inplace=True)
df = df.drop('index', axis=1)
df = df[df["group"] != ""]
df = df[df["title"] != ""]
df = df[df["categories"] != ""]
df = df[df["salesrank"] != 0]
df = df[df["salesrank"] != -1]
df = df.reset_index(drop=True)
df.to_csv('products_metadata.csv', index=False)

In [6]:
print(df)

                id                                              title  group   
0       0827229534            Patterns of Preaching: A Sermon Sampler   Book  \
1       0738700797                         Candlemas: Feast of Flames   Book   
2       0486287785   World War II Allied Fighter Planes Trading Cards   Book   
3       0842328327  Life Application Bible Commentary: 1 and 2 Tim...   Book   
4       1577943082    Prayers That Avail Much for Business: Executive   Book   
...            ...                                                ...    ...   
519471  0520053311                                     Implementation   Book   
519472  6300186016                                 She Done Him Wrong  Video   
519473  0867200170  Aquatic Entomology: The Fishermen's Guide and ...   Book   
519474  158483000X                         Needless Casualties of War   Book   
519475  0972380108  Start Your Own Computer Business: Building a S...   Book   

       salesrank                       

In [7]:
def createSubset(subset, group):
    for key, value in products.items():
        if(value['group'] == group):
            subset[key] = products[key]
    for key, value in subset.items():
        subset[key]['similar'] = [similar for similar in value['similar'] if similar in products.keys()]

In [8]:
dvd = {}
books = {}
music = {}
videos = {}

createSubset(dvd, "DVD")
createSubset(books, "Book")
createSubset(music, "Music")
createSubset(videos, "Video")

In [9]:
g_dvd = Graph.TupleList([(k, v) for k, vs in dvd.items() for v in vs['similar']], directed=True)
ig.summary(g_dvd)

IGRAPH DN-- 19694 54388 -- 
+ attr: name (v)


In [10]:
g_books = Graph.TupleList([(k, v) for k, vs in books.items() for v in vs['similar']], directed=True)
ig.summary(g_books)

IGRAPH DN-- 272909 916404 -- 
+ attr: name (v)


In [11]:
g_music = Graph.TupleList([(k, v) for k, vs in music.items() for v in vs['similar']], directed=True)
ig.summary(g_music)

IGRAPH DN-- 63945 204262 -- 
+ attr: name (v)


In [12]:
g_videos = Graph.TupleList([(k, v) for k, vs in videos.items() for v in vs['similar']], directed=True)
ig.summary(g_videos)

IGRAPH DN-- 27652 56369 -- 
+ attr: name (v)


In [13]:
everything = {}
for key, value in products.items():
        everything[key] = products[key]
for key, value in everything.items():
    everything[key]['similar'] = [similar for similar in value['similar'] if similar in everything.keys()]

In [14]:
g = Graph.TupleList([(k, v) for k, vs in everything.items() for v in vs['similar']], directed=True)
ig.summary(g)

IGRAPH DN-- 366997 1231439 -- 
+ attr: name (v)


In [15]:
def createCSV(g, filename):
    vertex_names = {v.index: v["name"] for v in g.vs}
    edgelist = [(vertex_names[e[0]], vertex_names[e[1]]) for e in g.get_edgelist()]
    df = pd.DataFrame(edgelist, columns=['source', 'target'])
    df.to_csv(filename, index=False)

In [16]:
createCSV(g_dvd, 'dvd.csv')
createCSV(g_books, 'books.csv')
createCSV(g_music, 'music.csv')
createCSV(g_videos, 'videos.csv')
createCSV(g, 'products.csv')