In [1]:
# import os
import json
import gzip
import pandas as pd
from lxml import html,etree
import numpy as np

# from urllib.request import urlopen

In [2]:
#!wget https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFilesSmall/AMAZON_FASHION_5.json.gz
# Even if we use the link above, it will download the file to the local machine. So I put the file in the same directory as the notebook

In [3]:
# Data is in the format: 
# "overall": 4.0,
# "verified",
# "reviewTime",
# "reviewerID",
# "asin",
# "style": {"Format:"}
# "reviewerName",
# "reviewText"
# "summary",
# "unixReviewTime"

### load the meta data
data = []
with gzip.open('Dataset/Software.json.gz') as f:
    for l in f:
        data.append(json.loads(l.strip()))
    
# total length of list, this number equals total number of products
print("Total number of items in the dataset: ", len(data))

# first row of the list
print(data[0])

Total number of items in the dataset:  12805
{'overall': 4.0, 'verified': False, 'reviewTime': '10 20, 2010', 'reviewerID': 'A38NELQT98S4H8', 'asin': '0321719816', 'style': {'Format:': ' DVD-ROM'}, 'reviewerName': 'WB Halper', 'reviewText': "I've been using Dreamweaver (and it's predecessor Macromedia's UltraDev) for many years.  For someone who is an experienced web designer, this course is a high-level review of the CS5 version of Dreamweaver, but it doesn't go into a great enough level of detail to find it very useful.\n\nOn the other hand, this is a great tool for someone who is a relative novice at web design.  It starts off with a basic overview of HTML and continues through the concepts necessary to build a modern web site.  Someone who goes through this course should exit with enough knowledge to create something that does what you want it do do...within reason.  Don't expect to go off and build an entire e-commerce system with only this class under your belt.\n\nIt's important

In [4]:
# convert list into pandas dataframe

df = pd.DataFrame.from_dict(data)

print(len(df))
print(df)

12805
       overall  verified   reviewTime      reviewerID        asin  \
0          4.0     False  10 20, 2010  A38NELQT98S4H8  0321719816   
1          4.0     False  10 18, 2010  A3QJU4FEN8PQSZ  0321719816   
2          5.0     False  10 16, 2010   ACJT8MUC0LRF0  0321719816   
3          5.0     False  10 12, 2010   AYUF7YETYOLNX  0321719816   
4          5.0     False   10 7, 2010  A31ICLWQ9CSHRS  0321719816   
...        ...       ...          ...             ...         ...   
12800      4.0     False  07 16, 2016  A1E50L7PCVXLN4  B01FFVDY9M   
12801      3.0     False  06 17, 2017   AVU1ILDDYW301  B01HAP3NUG   
12802      4.0     False  01 24, 2017  A2LW5AL0KQ9P1M  B01HAP3NUG   
12803      3.0     False  06 14, 2018   AZ515FFZ7I2P7  B01HAP47PQ   
12804      4.0     False  04 16, 2018  A2WPL6Y08K6ZQH  B01HAP47PQ   

                            style       reviewerName  \
0         {'Format:': ' DVD-ROM'}          WB Halper   
1         {'Format:': ' DVD-ROM'}             Grimmy 

In [5]:
### load the for "Software" category:

data2 = []
with gzip.open('Dataset/meta_Software.json.gz') as f:
    for l in f:
        data2.append(json.loads(l.strip()))
    
# total length of list, this number equals total number of products
print("Total number of items in the dataset: ", len(data2))


Total number of items in the dataset:  26790


In [6]:
# convert list into pandas dataframe

df2 = pd.DataFrame.from_dict(data2)

# Features of the dataset
df2.columns

# Features are slightly different than 


Index(['category', 'tech1', 'description', 'fit', 'title', 'also_buy', 'tech2',
       'brand', 'feature', 'rank', 'also_view', 'main_cat', 'similar_item',
       'date', 'price', 'asin', 'imageURL', 'imageURLHighRes', 'details'],
      dtype='object')

In [7]:
df2.head()

Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes,details
0,[],,[],,HOLT PHYSICS LESSON PRESENTATION CD-ROM QUICK ...,[],,HOLT. RINEHART AND WINSTON,[],"25,550 in Software (",[],Software,,</div>,.a-box-inner{background-color:#fff}#alohaBuyBo...,0030672120,[],[],
1,[],,"[, <b>Latin rhythms that will get your kids si...",,"Sing, Watch, &amp; Learn Spanish (DVD + Guide)...",[],,McGraw Hill,[],"15,792 in Software (",[],Software,,</div>,,0071480935,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,
2,[],,[<b>Connect is the only integrated learning sy...,,Connect with LearnSmart Access Card for Microb...,[],,McGraw-Hill Science/Engineering/Math,[],"16,900 in Software (",[],Software,,</div>,,007329506X,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,
3,[],,[],,LearnSmart Standalone Access Card for Prescott...,[],,McGraw-Hill Education,[],"12,986 in Software (",[],Software,,</div>,,0073513458,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,
4,[],,[<i>Anatomy &amp; Physiology Revealed Cat</i> ...,,Anatomy &amp; Physiology Revealed Student Acce...,"[0323394612, 0323227937, 1118527488]",,McGraw-Hill Education,[],"14,861 in Software (",[],Software,,</div>,$4.83,0073525758,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,


In [8]:
# Some products have multiple descriptions containing information of for instance the author.
# Perhaps only the first description string should be analyzed for our purpose? This seems to be primarily for the product.
# For now i have just merged all descriptions into a single string for analysis.

df2.description = df2.description.apply(lambda x: "".join(x))

In [9]:
# A lot of the descriptions (and other features) contain HTML.
# The function parses and "translates" into plain text descriptions more suitable for analysis

def strip_html(s):
    if not s or s.isspace(): 
        return ''
    try:
        return str(html.fromstring(s).text_content())
    except etree.ParserError: # I am not able to find out why the error occur so i continued by catching the exception. Seem to happen on some empty description strings 
        return ''


In [10]:
# Applying strip_html to description
df2['description'] = df2['description'].apply(strip_html)

In [11]:
df2.head()

Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes,details
0,[],,,,HOLT PHYSICS LESSON PRESENTATION CD-ROM QUICK ...,[],,HOLT. RINEHART AND WINSTON,[],"25,550 in Software (",[],Software,,</div>,.a-box-inner{background-color:#fff}#alohaBuyBo...,0030672120,[],[],
1,[],,Latin rhythms that will get your kids singing ...,,"Sing, Watch, &amp; Learn Spanish (DVD + Guide)...",[],,McGraw Hill,[],"15,792 in Software (",[],Software,,</div>,,0071480935,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,
2,[],,Connect is the only integrated learning system...,,Connect with LearnSmart Access Card for Microb...,[],,McGraw-Hill Science/Engineering/Math,[],"16,900 in Software (",[],Software,,</div>,,007329506X,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,
3,[],,,,LearnSmart Standalone Access Card for Prescott...,[],,McGraw-Hill Education,[],"12,986 in Software (",[],Software,,</div>,,0073513458,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,
4,[],,Anatomy & Physiology Revealed Cat is the ultim...,,Anatomy &amp; Physiology Revealed Student Acce...,"[0323394612, 0323227937, 1118527488]",,McGraw-Hill Education,[],"14,861 in Software (",[],Software,,</div>,$4.83,0073525758,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,


In [12]:
# Merge description to reviews data using 'asin'

merged_df = df.merge(df2[['asin', 'description']], on='asin', how='left')

In [13]:
merged_df.iloc[15:200]

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image,description
15,5.0,False,"12 6, 2010",A10Y058K7B96C6,0321700945,{'Format:': ' DVD-ROM'},midnight122,"I am not an avid Lightroom user, although I ha...",Great for beginners or just about any Lightroo...,1291593600,,,This complete training program from Adobe Pres...
16,5.0,False,"11 15, 2010",A3V7D0LH8L7BG0,0321700945,{'Format:': ' DVD-ROM'},Spike D.,"I am a long time user of Photoshop Lightroom, ...",You need to have this..,1289779200,,,This complete training program from Adobe Pres...
17,4.0,False,"11 27, 2010",AGVWTYW0ULXHT,0321719824,{'Format:': ' DVD-ROM'},Nate,"The ""Learn by Video"" program for Flash CS5 wou...",A helpful introduction to Flash CS5 - a bit dr...,1290816000,,,This complete training program from Adobe Pres...
18,4.0,False,"10 21, 2010",A3QJU4FEN8PQSZ,0321719824,{'Format:': ' DVD-ROM'},Grimmy,The presenter here sounds much more natural th...,"A good introduction to Flash, with some UI wea...",1287619200,,,This complete training program from Adobe Pres...
19,4.0,False,"10 10, 2010",A31N0XY2UTB25C,0321719824,{'Format:': ' DVD-ROM'},Stephanie Sullivan,This certified associate courseware comes as a...,Learn Adobe Flash Professional CS5 by Video,1286668800,,,This complete training program from Adobe Pres...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,4.0,False,"12 12, 2009",A3ANETCTMAN1NP,B000050HEI,{'Format:': ' Video Game'},JJ,After success with the first two games in the ...,Another good Nancy Drew game,1260576000,,,"In this interactive mystery game, girls play a..."
196,4.0,False,"12 12, 2009",A3ANETCTMAN1NP,B000050HEI,{'Format:': ' Video Game'},JJ,After success with the first two games in the ...,Another good Nancy Drew game,1260576000,,,"In this interactive mystery game, girls play a..."
197,5.0,True,"07 21, 2006",A22F3BWM7RAHDA,B000050HEI,{'Format:': ' Video Game'},P. B. Sharp,"""Message"" is the first Nancy Drew Pc game that...",You are never too old to go sleuthing!,1153440000,4,,"In this interactive mystery game, girls play a..."
198,5.0,True,"07 21, 2006",A22F3BWM7RAHDA,B000050HEI,{'Format:': ' Video Game'},P. B. Sharp,"""Message"" is the first Nancy Drew Pc game that...",You are never too old to go sleuthing!,1153440000,4,,"In this interactive mystery game, girls play a..."
