# Extract lifespan information from Amazon reviews

In [125]:
import nltk
import json
import pandas as pd
import numpy as np
import gzip

First, let's create a very small subset of the reviews to work with.

In [20]:
# Full review dump (one JSON review per line) and the truncated copy written below.
bigfile = 'data/Electronics_5.json'
smallfile = 'data/Electronics_5_small.json'
n_reviews = 1000

# Copy the first n_reviews lines. readline() returns '' once the source is
# exhausted, so a shorter source simply yields a shorter copy.
with open(bigfile, 'r') as source, open(smallfile, 'w') as subset:
    subset.writelines(source.readline() for _ in range(n_reviews))


Read the data using their script (the `parse` and `getDF` helpers below)

In [147]:
def parse(path):
    """Yield one record per line of a gzip-compressed file.

    Parameters
    ----------
    path : str
        Path of a gzip file that holds one dict literal per line.

    Yields
    ------
    object
        The result of evaluating each line (a dict for the review dumps
        read in this notebook).
    """
    # The context manager closes the handle when the generator is exhausted,
    # closed, or garbage-collected after the consumer stops early; previously
    # the file was never closed.
    with gzip.open(path, 'rb') as g:
        for l in g:
            # SECURITY: eval() executes whatever expression the line contains.
            # Only feed it trusted dumps; if the lines are strict JSON,
            # json.loads(l) is the safe replacement.
            yield eval(l)

def getDF(path, i_max=100000):
    """Load the first records of a gzip review file into a DataFrame.

    Parameters
    ----------
    path : str
        Forwarded unchanged to parse().
    i_max : int, optional
        Largest row index to keep (inclusive), so at most ``i_max + 1``
        records are read. Defaults to 100000, the limit that was previously
        hard-coded inside the function.

    Returns
    -------
    pandas.DataFrame
        One row per record, labelled 0..n-1, with one column per record key.
    """
    df = {}
    for i, d in enumerate(parse(path)):
        df[i] = d
        # Stop as soon as the row with index i_max has been stored.
        if i >= i_max:
            break

    return pd.DataFrame.from_dict(df, orient='index')

# Load the Electronics reviews into a DataFrame and preview the first two rows.
reviews = getDF(path='data/reviews_Electronics_5.json.gz')
reviews.head(2)

Unnamed: 0,asin,reviewText,reviewTime,overall,unixReviewTime,helpful,reviewerID,summary,reviewerName
0,528881469,We got this GPS for my husband who is an (OTR)...,"06 2, 2013",5.0,1370131200,"[0, 0]",AO94DHGC771SJ,Gotta have GPS!,amazdnu
1,528881469,"I'm a professional OTR truck driver, and I bou...","11 25, 2010",1.0,1290643200,"[12, 15]",AMO214LNFCEI4,Very Disappointed,Amazon Customer


Filter reviews containing breakage-related phrases and extract the stated lifespan

In [161]:
term = 'worked for'
# Ordered tuple instead of a set: iterating a set of strings can change order
# between kernel runs, which made the row order of the result unstable.
timeperiod_terms = ('day', 'week', 'month', 'year')
max_break_reviews = 1000   # only the first matching reviews are scanned
n_words_after = 8          # number of words after `term` searched for a duration
days_per_period = {'day': 1, 'week': 7, 'year': 365}
days_per_month = 30


def extract_durations(review, term, timeperiod_terms, n_words=8):
    """Return the durations stated right after `term` in `review`.

    Looks at the `n_words` words that follow the first occurrence of `term`.
    For each time period whose singular or plural form appears there, the word
    immediately before its first occurrence is read as the count: either a
    number or the article 'a' (meaning 1).

    Returns a list of ``(count, period)`` tuples, with count as a float and
    period taken from `timeperiod_terms`. The list is empty when `term` is
    absent or no usable count is found.
    """
    start = review.find(term)
    if start < 0:
        return []

    # Skip the words of the term itself, then keep the following n_words words.
    # Surrounding punctuation is stripped so that "2 months." is recognised.
    n_skip = len(term.split())
    window = [word.strip('.,;:!?()"\'')
              for word in review[start:].split()[n_skip:n_skip + n_words]]

    durations = []
    for timeperiod in timeperiod_terms:
        # Exact word match only. The previous substring search could land on
        # an unrelated word such as 'today' when looking for 'day'.
        positions = [k for k, word in enumerate(window)
                     if word in (timeperiod, timeperiod + 's')]
        if not positions or positions[0] == 0:
            continue  # period not mentioned, or nothing precedes it
        quantity = window[positions[0] - 1]
        if quantity == 'a':
            durations.append((1.0, timeperiod))
            continue
        try:
            durations.append((float(quantity), timeperiod))
        except ValueError:
            continue  # the preceding word is not a number: no duration here
    return durations


def to_months(n_period, t_period):
    """Convert `n_period` units of `t_period` into months of 30 days.

    Raises KeyError for a period other than day, week, month or year.
    """
    if t_period == 'month':
        return n_period
    return n_period * days_per_period[t_period] / days_per_month


# regex=False: the term is a literal phrase; na=False: rows without text never match.
break_reviews = reviews[reviews['reviewText'].str.contains(term, regex=False, na=False)]

asins = []
reviewerIDs = []
n_periods = []
t_periods = []
ratings = []

for index, row in break_reviews.head(max_break_reviews).iterrows():
    found = extract_durations(row['reviewText'], term, timeperiod_terms, n_words_after)
    # A review that mentions several periods contributes one row per period.
    for n_period, t_period in found:
        asins.append(row['asin'])
        reviewerIDs.append(row['reviewerID'])
        n_periods.append(n_period)
        t_periods.append(t_period)
        ratings.append(row['overall'])

breakdowns = pd.DataFrame({'asin': asins, 'reviewerID': reviewerIDs,
                           'n_period': n_periods, 't_period': t_periods,
                           'rating': ratings})

# Lifespan expressed in months.
lifespans = [to_months(n_period, t_period)
             for n_period, t_period in zip(breakdowns['n_period'], breakdowns['t_period'])]

breakdowns['lifespan'] = lifespans
print(breakdowns.head())

         asin  n_period  rating      reviewerID t_period   lifespan
0  9888002198       1.0     1.0   ATXRV6Z3N08ZX      day   0.033333
1  9888002198       6.0     5.0   AA6BJGZG6XI05     week   1.400000
2  B00000J0D5       2.0     1.0   A83BIO3AO15LQ    month   2.000000
3  B00003006E       1.0     5.0  A2WYNTUJ91DLZW     year  12.166667
4  B00004Z5LR       8.0     5.0  A2BFZD9WRHVO0J    month   8.000000


Create item table

In [235]:
# One row per product seen in the breakdown table. Brand and model are unknown
# at this point, so both carry the placeholder string 'null'.
uniq_asins = breakdowns['asin'].unique()
breakdown_items = pd.DataFrame({'asin': uniq_asins}).assign(brand='null', model='null')
print(breakdown_items.head())

         asin brand model
0  9888002198  null  null
1  B00000J0D5  null  null
2  B00003006E  null  null
3  B00004Z5LR  null  null
4  B00005854F  null  null


Convert the pandas DataFrames to JSON

In [239]:
def write_to_json(items, reviews, filename):
    """Write per-product lifespan reviews to `filename` as one JSON object.

    The file maps each product id to
    ``{"brand": ..., "model": ..., "reviews": [{"lifespan": ..., "rating": ...}, ...]}``.
    The output is produced with the json module, so it is always valid JSON.
    The hand-written version left a trailing comma after the last product and
    wrote brand/model without quoting.

    Parameters
    ----------
    items : pandas.DataFrame
        One row per product with columns ``asin``, ``brand`` and ``model``.
        A missing value or the placeholder string ``'null'`` is written as
        JSON ``null``; any other value is written as a JSON string.
    reviews : pandas.DataFrame
        One row per review with at least the columns ``asin``, ``lifespan``
        and ``rating``. Rows with a missing value in any column are skipped.
    filename : str
        Output path; an existing file is overwritten.
    """
    def _nullable(value):
        # 'null' is the placeholder this notebook uses for an unknown brand/model.
        if pd.isnull(value) or value == 'null':
            return None
        return str(value)

    # Group the complete reviews once instead of re-filtering the whole frame
    # for every product.
    reviews_by_asin = {}
    for asin, group in reviews.dropna().groupby('asin', sort=False):
        reviews_by_asin[asin] = [
            {'lifespan': float(lifespan), 'rating': float(rating)}
            for lifespan, rating in zip(group['lifespan'], group['rating'])
        ]

    catalogue = {}
    for asin, brand, model in zip(items['asin'], items['brand'], items['model']):
        # JSON keys must be strings. If an id is listed on several rows, the
        # last row wins, which is how a JSON parser resolves duplicate keys.
        catalogue[str(asin)] = {
            'brand': _nullable(brand),
            'model': _nullable(model),
            'reviews': reviews_by_asin.get(asin, []),
        }

    with open(filename, 'w') as jsonfile:
        json.dump(catalogue, jsonfile, indent=4)
        jsonfile.write('\n')

# Serialise the extracted breakdown reviews, grouped per product.
filename = 'data/amazon.json'
write_to_json(items=breakdown_items, reviews=breakdowns, filename=filename)

Convert Young Mi's Amazon file

In [225]:
# Columns to keep from the CSV and the names they get in this notebook,
# paired by position.
original_columns = ['ASIN','Rating','Months','Brand_name']
new_columns = ['asin','rating','lifespan','brand']

youngmi_amazon = (
    pd.read_csv('data/ratings_Amazon.csv')
    .loc[:, original_columns]
    .rename(columns=dict(zip(original_columns, new_columns)))
)
youngmi_amazon.head()

Unnamed: 0,asin,rating,lifespan,brand
0,132793040,5,6,FRUYZ9646H
1,321732944,5,28,XXSKO4415Q
2,439886341,1,20,VKNEQ7868Y
3,439886341,3,1,XIWNE4324C
4,439886341,1,19,VXVKC6828Z


In [245]:
# One row per distinct (asin, brand) pair; the unknown model is stored as the
# placeholder string 'null'.
# NOTE(review): an asin that occurs with several brands therefore appears on
# several rows.
youngmi_amazon_products = (
    youngmi_amazon
    .loc[:, ['asin', 'brand']]
    .drop_duplicates()
    .assign(model='null')
)

youngmi_amazon_products.head()

Unnamed: 0,asin,brand,model
0,132793040,FRUYZ9646H,
1,321732944,XXSKO4415Q,
2,439886341,VKNEQ7868Y,
3,439886341,XIWNE4324C,
4,439886341,VXVKC6828Z,


In [244]:
def write_to_json_ym(items, reviews, filename):
    """Write per-product reviews (products with a known brand) to a JSON file.

    The file maps each product id to
    ``{"brand": ..., "model": ..., "reviews": [{"lifespan": ..., "rating": ...}, ...]}``.
    The output is produced with the json module, so it is always valid JSON.
    The hand-written version left a trailing comma after the last product,
    did not escape the brand, and failed on non-string product ids.

    Parameters
    ----------
    items : pandas.DataFrame
        One row per product with columns ``asin``, ``brand`` and ``model``.
        A missing value or the placeholder string ``'null'`` is written as
        JSON ``null``; any other value is written as a JSON string.
    reviews : pandas.DataFrame
        One row per review with at least the columns ``asin``, ``lifespan``
        and ``rating``. Rows with a missing value in any column are skipped.
    filename : str
        Output path; an existing file is overwritten.
    """
    def _nullable(value):
        # 'null' is the placeholder this notebook uses for an unknown model.
        if pd.isnull(value) or value == 'null':
            return None
        return str(value)

    # Group the complete reviews once instead of re-filtering the whole frame
    # for every product.
    reviews_by_asin = {}
    for asin, group in reviews.dropna().groupby('asin', sort=False):
        reviews_by_asin[asin] = [
            {'lifespan': float(lifespan), 'rating': float(rating)}
            for lifespan, rating in zip(group['lifespan'], group['rating'])
        ]

    catalogue = {}
    for asin, brand, model in zip(items['asin'], items['brand'], items['model']):
        # JSON keys must be strings (ids may be numeric). If an id is listed on
        # several rows, the last row wins, which is how a JSON parser resolves
        # duplicate keys.
        catalogue[str(asin)] = {
            'brand': _nullable(brand),
            'model': _nullable(model),
            'reviews': reviews_by_asin.get(asin, []),
        }

    with open(filename, 'w') as jsonfile:
        json.dump(catalogue, jsonfile, indent=4)
        jsonfile.write('\n')

# Serialise Young Mi's Amazon ratings, grouped per product.
youngmiamazon_filename = 'data/youngmi_amazon.json'
write_to_json_ym(items=youngmi_amazon_products,
                 reviews=youngmi_amazon,
                 filename=youngmiamazon_filename)

And convert Young Mi's lab data too

In [241]:
# Columns to keep from the CSV and the names they get in this notebook,
# paired by position.
original_columns = ['vwr.catalogue','months','ratings']
new_columns = ['asin','lifespan','rating']

# Select the source columns by name, as is done for the Amazon CSV, so the
# relabelling below cannot silently mislabel data if the file's column order
# or count differs. Previously original_columns was defined but never used.
# NOTE(review): assumes the CSV headers are exactly the names listed above;
# a KeyError here means the list needs correcting.
youngmi_lab = pd.read_csv('data/vwr.data.csv')[original_columns]
youngmi_lab.columns = new_columns
youngmi_lab.head()

Unnamed: 0,asin,lifespan,rating
0,129-8922,75,1
1,472-5350,29,2
2,527-6106,24,5
3,537-7921,14,4
4,665-1608,40,5


In [242]:
# One row per distinct catalogue id; brand and model are unknown for the lab
# data, so both carry the placeholder string 'null'.
youngmi_lab_products = (
    youngmi_lab
    .loc[:, ['asin']]
    .drop_duplicates()
    .assign(brand='null', model='null')
)

youngmi_lab_products.head()

Unnamed: 0,asin,brand,model
0,129-8922,,
1,472-5350,,
2,527-6106,,
3,537-7921,,
4,665-1608,,


In [243]:
# Serialise Young Mi's lab data, grouped per catalogue id.
youngmi_lab_filename = 'data/youngmi_lab.json'
write_to_json(items=youngmi_lab_products,
              reviews=youngmi_lab,
              filename=youngmi_lab_filename)