In [185]:
import json

import pandas as pd
import numpy as np
import networkx as nx

import matplotlib.pyplot as plt

from collections import Counter


In [186]:
def ingest_review_data(data):
    reviews = {}
    for line in data.split("\n"):
        if line.startswith("product/productId:"):
            product_id = line.split(": ")[1]
            reviews[product_id] = {}
        elif line.startswith("review/"):
            parts = line.split(": ")
            if len(parts) == 2:
                review_key, review_value = parts
                review_key = review_key.split("/")[1]
                reviews[product_id][review_key] = review_value
            else:
            #handle lines that do not contain a colon
                pass
    return reviews

In [187]:
with open("./Desktop/notebook/foods.txt", "r") as f:
    review_data = f.read()

reviews = ingest_review_data(review_data)

In [188]:
reviews

{'B001E4KFG0': {'userId': 'A3SGXH7AUHU8GW',
  'profileName': 'delmartian',
  'helpfulness': '1/1',
  'score': '5.0',
  'time': '1303862400',
  'summary': 'Good Quality Dog Food',
  'text': 'I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.'},
 'B00813GRG4': {'userId': 'A1D87F6ZCVE5NK',
  'profileName': 'dll pa',
  'helpfulness': '0/0',
  'score': '1.0',
  'time': '1346976000',
  'summary': 'Not as Advertised',
  'text': 'Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as "Jumbo".'},
 'B000LQOCH0': {'userId': 'ABXLMWJIXXAIN',
  'profileName': 'Natalia Corres "Natalia Corres"',
  'helpfulness': '1/1',
  'score': '4.0',
  'time': '1219017600',
  'sum

In [189]:
df = pd.DataFrame.from_dict(reviews, orient='index')


In [190]:
df = df.reset_index()
df = df.rename(columns={'index': 'product_id'})
df

Unnamed: 0,product_id,userId,profileName,helpfulness,score,time,summary,text
0,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1/1,5.0,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0/0,1.0,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1/1,4.0,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,B000UA0QIQ,A395BORC6FGVXV,Karl,3/3,2.0,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,B006K2ZZ7K,A3JRGQVEQN31IQ,Pamela G. Williams,0/0,5.0,1336003200,"Wonderful, tasty taffy",This taffy is so good. It is very soft and ch...
...,...,...,...,...,...,...,...,...
74253,B000H7K114,A2AGSSZR9V7XST,Peter,0/0,5.0,1281744000,Excellent Tea,I love this tea. I first discovered the pleas...
74254,B001EO7N10,A28KG5XORO54AY,Lettie D. Carter,0/0,5.0,1299628800,Will not do without,Great for sesame chicken..this is a good if no...
74255,B003S1WTCU,A3I8AFVPEE8KI5,R. Sawyer,0/0,2.0,1331251200,disappointed,I'm disappointed with the flavor. The chocolat...
74256,B004I613EE,A3IBEVCTXKNOH,"Kathy A. Welch ""katwel""",1/1,5.0,1331596800,Favorite Training and reward treat,These are the BEST treats for training and rew...


In [191]:
avg_scores = df.groupby('product_id')['score'].mean().reset_index()
avg_scores.columns = ['product_id', 'avg_score']
avg_scores

Unnamed: 0,product_id,avg_score
0,0006641040,4.0
1,141278509X,5.0
2,2734888454,5.0
3,2841233731,5.0
4,7310172001,5.0
...,...,...
74253,B009UOFTUI,1.0
74254,B009UOFU20,1.0
74255,B009UUS05I,5.0
74256,B009WSNWC4,5.0


In [192]:
review_counts = df.groupby('product_id')['score'].count().reset_index()

review_counts.columns = ['product_id', 'review_count']
filtered_products = review_counts[review_counts['review_count'] >= 100]
product_ids = filtered_products['product_id'].tolist()

print(f'Number of products with at least 100 reviews: {len(product_ids)}')

Number of products with at least 100 reviews: 0


In [193]:
product_ids

[]

In [194]:
user_counts = df.groupby('userId')['score'].count().reset_index()

user_counts.columns = ['user_id', 'user_count']
filtered_users = user_counts[user_counts['user_count'] <= 50]
user_ids = filtered_users['user_id'].tolist()

print(f'Number of users with less than 50 reviews: {len(user_ids)}')


Number of users with less than 50 reviews: 55682


In [195]:
user_counts = df.groupby('userId')['score'].count().reset_index()

user_counts.columns = ['user_id', 'user_count']
filtered_users = user_counts[user_counts['user_count'] > 50]

user_ids = filtered_users['user_id'].tolist()

print(f'Number of users with less than 50 reviews: {len(user_ids)}')

Number of users with less than 50 reviews: 8


In [196]:
user_counts = df.groupby(['userId', 'product_id'])['score'].count().reset_index()

user_counts.columns = ['user_id', 'product_id', 'review_count']

filtered_users = user_counts[user_counts['review_count'] < 50]['user_id'].unique().tolist()

filtered_products = user_counts[user_counts['review_count'] < 100]['product_id'].unique().tolist()
filtered_products


['B0026LJ3EA',
 'B0026LIR60',
 'B006Q820X0',
 'B005DVVB9K',
 'B008I1XPKA',
 'B007OSBE1U',
 'B007OSBEV0',
 'B009PIAFTE',
 'B009PIEW3O',
 'B009PICJTS',
 'B007Q3P1GC',
 'B0024SGY8S',
 'B003ZURMFS',
 'B001RV8CGK',
 'B001RVCAAE',
 'B003F7YTK2',
 'B001RPO15C',
 'B005VUGT2W',
 'B0077DF4Q8',
 'B004ZBCJU4',
 'B000NSFGVM',
 'B001G6W0Y6',
 'B0013T5YO4',
 'B004CAOKSW',
 'B002WDCIXK',
 'B001CBHNPQ',
 'B0002XAFTG',
 'B000Q3AE2K',
 'B0020LP6T2',
 'B0078SITZK',
 'B0027E4OH2',
 'B00356BOKQ',
 'B00356FRR2',
 'B003YBJ9JA',
 'B0028FKPUU',
 'B00017L1UK',
 'B001EQ4DG6',
 'B005HY2BRO',
 'B001AEFUIM',
 'B004SKVWNW',
 'B0001BVJB2',
 'B000F3WSCU',
 'B000V36U3M',
 'B001IZHZYA',
 'B000LKU7IE',
 'B003OP8LMW',
 'B003SNX4YA',
 'B001GVIUKY',
 'B0057FSWP8',
 'B001PICXX8',
 'B0002ML1EA',
 'B006OP1Q8K',
 'B002HQIPJ8',
 'B0088A60PS',
 'B0001M0YYO',
 'B005EJJK6W',
 'B0015MFQS8',
 'B0014AUT60',
 'B000UUS0KC',
 'B0052Y15VW',
 'B00012PPSY',
 'B000ES1R1Y',
 'B004Y4Z9CC',
 'B003BI2HK4',
 'B001IW08M4',
 'B001GIE4YI',
 'B0028SY2

In [197]:
filtered_users

['#oc-R10LT57ZGIB140',
 '#oc-R3GE8AOJOSUJVX',
 '#oc-R3IX025L68IP59',
 '#oc-R3LI5WJM4WD02C',
 '#oc-R40PWE1BFHMT2',
 '#oc-RNMDR27B0MP1I',
 'A00489763J7YUCSN6CP7K',
 'A0849196AFU725N8S7RS',
 'A09229701Z8W88AD38877',
 'A1000X0NY39BA1',
 'A10012K7DF3SBQ',
 'A10023OS6MZUC6',
 'A1003YG3CPYQFS',
 'A1005LM7YYY7VG',
 'A1007OFJTJRYII',
 'A100B9UOLABJS8',
 'A100BT8Z05ASDC',
 'A100DR1K6IUZUV',
 'A100EBHBG1GF5',
 'A100KABITNCJQZ',
 'A100QR0G5OD7U5',
 'A100TCEB7KB4B',
 'A100WO06OQR8BQ',
 'A10103MJIKKIFE',
 'A1011I2PCI4K8B',
 'A101C8YPYJ29CG',
 'A101CCC619GN4S',
 'A101CEZ1RKB1T7',
 'A101F8M8DPFOM9',
 'A101FZLX30BZ1T',
 'A101K5ANDEC4N8',
 'A101N9IT72VFSU',
 'A101P5UBAWDB62',
 'A101QLG5FZ2BKS',
 'A101QYDENLLF4M',
 'A101XJR335QHU7',
 'A10216VMEXTY19',
 'A102714YQ4VYZP',
 'A1029CAC8JRPJX',
 'A102ALA6MI312B',
 'A102FG70Q4TGRT',
 'A102GP5X0OU8GM',
 'A102NBEATBBBO4',
 'A102NQYJ3LLR7V',
 'A102NX3G2DMOHH',
 'A102OYXZJJEAYM',
 'A102PC4WQJVR96',
 'A102PUUO2U0YBK',
 'A102R0714VSMB',
 'A102SH9AIMFJSS',
 'A102SLZLS

In [198]:
#while using all the data took too much time to compile, I worked with sampled data
subset = df.sample(frac=0.1)

pivot_table = subset.pivot_table(index='userId', columns='product_id', values='score')
pivot_table

product_id,7310172001,7310172101,B00005IX96,B00005IX98,B00006IDJO,B00006IDJU,B00008434E,B000084388,B000084DWM,B000084E9J,...,B009I31XNI,B009JYEBS0,B009LIK0FW,B009MRNOR8,B009NTCO4O,B009O0U2SW,B009O7DGEW,B009OM66IU,B009PCDDO4,B009QEBGIQ
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
#oc-R3LI5WJM4WD02C,,,,,,,,,,,...,,,,,,,,,,
#oc-R40PWE1BFHMT2,,,,,,,,,,,...,,,,,,,,,,
#oc-RNMDR27B0MP1I,,,,,,,,,,,...,,,,,,,,,,
A102714YQ4VYZP,,,,,,,,,,,...,,,,,,,,,,
A102NQYJ3LLR7V,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AZWF1BM21B1OK,,,,,,,,,,,...,,,,,,,,,,
AZX18A7F4ZBXB,,,,,,,,,,,...,,,,,,,,,,
AZYTCGDO3R4ZU,,,,,,,,,,,...,,,,,,,,,,
AZYU563S6AOK9,,,,,,,,,,,...,,,,,,,,,,


In [199]:
def normalize_rows(row):
    row_sum = int(row.sum())
    return row.div(row_sum)

result = pivot_table.apply(normalize_rows, axis=1)
#a memory problem occurs here
result


product_id,7310172001,7310172101,B00005IX96,B00005IX98,B00006IDJO,B00006IDJU,B00008434E,B000084388,B000084DWM,B000084E9J,...,B009I31XNI,B009JYEBS0,B009LIK0FW,B009MRNOR8,B009NTCO4O,B009O0U2SW,B009O7DGEW,B009OM66IU,B009PCDDO4,B009QEBGIQ
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
#oc-R3LI5WJM4WD02C,,,,,,,,,,,...,,,,,,,,,,
#oc-R40PWE1BFHMT2,,,,,,,,,,,...,,,,,,,,,,
#oc-RNMDR27B0MP1I,,,,,,,,,,,...,,,,,,,,,,
A102714YQ4VYZP,,,,,,,,,,,...,,,,,,,,,,
A102NQYJ3LLR7V,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AZWF1BM21B1OK,,,,,,,,,,,...,,,,,,,,,,
AZX18A7F4ZBXB,,,,,,,,,,,...,,,,,,,,,,
AZYTCGDO3R4ZU,,,,,,,,,,,...,,,,,,,,,,
AZYU563S6AOK9,,,,,,,,,,,...,,,,,,,,,,


In [200]:
result_T = result.T
user_similarity = result.dot(result_T)
product_similarity = result_T.dot(result)

user_similarity

userId,#oc-R3LI5WJM4WD02C,#oc-R40PWE1BFHMT2,#oc-RNMDR27B0MP1I,A102714YQ4VYZP,A102NQYJ3LLR7V,A10389W8OI4Q6N,A103OW9PCYAXXH,A105WK4IF15MXJ,A106MB2IVYFRWR,A106R34K3VRITJ,...,AZSE0K98A68HT,AZTO5GI7MFUAB,AZUSUNUP7S8U8,AZV26LP92E6WU,AZVWJW247I1HC,AZWF1BM21B1OK,AZX18A7F4ZBXB,AZYTCGDO3R4ZU,AZYU563S6AOK9,AZZNK89PXD006
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
#oc-R3LI5WJM4WD02C,,,,,,,,,,,...,,,,,,,,,,
#oc-R40PWE1BFHMT2,,,,,,,,,,,...,,,,,,,,,,
#oc-RNMDR27B0MP1I,,,,,,,,,,,...,,,,,,,,,,
A102714YQ4VYZP,,,,,,,,,,,...,,,,,,,,,,
A102NQYJ3LLR7V,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AZWF1BM21B1OK,,,,,,,,,,,...,,,,,,,,,,
AZX18A7F4ZBXB,,,,,,,,,,,...,,,,,,,,,,
AZYTCGDO3R4ZU,,,,,,,,,,,...,,,,,,,,,,
AZYU563S6AOK9,,,,,,,,,,,...,,,,,,,,,,


In [201]:
product_similarity

product_id,7310172001,7310172101,B00005IX96,B00005IX98,B00006IDJO,B00006IDJU,B00008434E,B000084388,B000084DWM,B000084E9J,...,B009I31XNI,B009JYEBS0,B009LIK0FW,B009MRNOR8,B009NTCO4O,B009O0U2SW,B009O7DGEW,B009OM66IU,B009PCDDO4,B009QEBGIQ
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7310172001,,,,,,,,,,,...,,,,,,,,,,
7310172101,,,,,,,,,,,...,,,,,,,,,,
B00005IX96,,,,,,,,,,,...,,,,,,,,,,
B00005IX98,,,,,,,,,,,...,,,,,,,,,,
B00006IDJO,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
B009O0U2SW,,,,,,,,,,,...,,,,,,,,,,
B009O7DGEW,,,,,,,,,,,...,,,,,,,,,,
B009OM66IU,,,,,,,,,,,...,,,,,,,,,,
B009PCDDO4,,,,,,,,,,,...,,,,,,,,,,


In [None]:
G_users = nx.from_pandas_adjacency(user_similarity)
G_products = nx.from_pandas_adjacency(product_similarity)

nx.draw(G_users)
nx.draw(G_products)

In [None]:
#skectching network took too much time to compile. Even after more than a hour, still trying to compile it. 
#I couldn't fix the NaN problem. I tried many methods but none of them worked.