# Extract lifespan information from Amazon reviews

First, let's import the packages used throughout this notebook.

In [22]:
import pandas as pd
import numpy as np
import nltk
import json
import gzip

Read in the reviews for a subset of Amazon electronics products

In [23]:
def parse(path):
    """Lazily yield one parsed record per line of a gzip-compressed file.

    Parameters
    ----------
    path : str
        Location of the gzip file; every line must hold one Python literal
        (the review dump uses Python-style dicts rather than strict JSON).

    Yields
    ------
    object
        The evaluated content of each line, in file order.
    """
    # The context manager closes the handle as soon as the generator is
    # exhausted, closed, or abandoned after an error; previously the file
    # stayed open until it happened to be garbage collected.
    with gzip.open(path, 'rb') as g:
        for l in g:
            # SECURITY(review): eval() executes arbitrary code found in the
            # file. Only use this on trusted dumps; consider
            # ast.literal_eval() if the data source cannot be trusted.
            yield eval(l)

def getDF(path, i_max=500000):
    """Load at most ``i_max`` records from ``path`` into a DataFrame.

    Parameters
    ----------
    path : str
        File handed to ``parse``; each yielded record becomes one row.
    i_max : int, optional
        Maximum number of records to load (default 500000). The previous
        implementation stored one record more than its limit.

    Returns
    -------
    pandas.DataFrame
        One row per record, indexed 0..n-1 in file order. Empty when the
        file yields nothing or ``i_max`` is not positive.
    """
    records = {}
    for i, d in enumerate(parse(path)):
        # Stop *before* storing record number i_max + 1.
        if i >= i_max:
            break
        records[i] = d

        n_read = i + 1
        # Progress report every 50000 records, as a percentage of the cap.
        if n_read % 50000 == 0:
            print(str(int(100*n_read/i_max)) + '%')

    return pd.DataFrame.from_dict(records, orient='index')

# Load the electronics review dump and peek at the first two rows.
reviews = getDF(path='data/reviews_Electronics_5.json.gz')
reviews.head(n=2)

10%
20%
30%
40%
50%
60%
70%
80%
90%
100%


Unnamed: 0,reviewTime,asin,reviewerName,reviewerID,unixReviewTime,helpful,summary,reviewText,overall
0,"06 2, 2013",528881469,amazdnu,AO94DHGC771SJ,1370131200,"[0, 0]",Gotta have GPS!,We got this GPS for my husband who is an (OTR)...,5.0
1,"11 25, 2010",528881469,Amazon Customer,AMO214LNFCEI4,1290643200,"[12, 15]",Very Disappointed,"I'm a professional OTR truck driver, and I bou...",1.0


Extract reviews with information on items breaking down
For now, we only look for reviews containing phrases of the form "worked for X period" and the like.

Allowed time periods:
* Day
* Week
* Month
* Year

In [24]:
def get_lifespans(data):
    """Convert each (n_period, t_period) pair of ``data`` into months.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide an ``n_period`` column (number of periods) and a
        ``t_period`` column holding 'day', 'week', 'month' or 'year'.

    Returns
    -------
    list
        One lifespan in months per row of ``data``, in row order. A month is
        taken as 30 days and a year as 365 days. Rows whose period is not
        recognised yield NaN, so the list always has as many entries as
        ``data`` has rows and can be assigned back as a column.
    """
    # (multiplier, divisor) applied as n * multiplier / divisor, which keeps
    # the arithmetic identical to the explicit per-unit formulas.
    to_months = {
        'day': (1, 30),
        'week': (7, 30),
        'month': (1, 1),
        'year': (365, 30),
    }

    lifespans = []
    # Iterating the columns directly is independent of the frame's index
    # labels, unlike positional lookups through data[col][i].
    for n_period, t_period in zip(data['n_period'], data['t_period']):
        if t_period in to_months:
            multiplier, divisor = to_months[t_period]
            lifespans.append(n_period * multiplier / divisor)
        else:
            # Keep the row aligned with its source instead of skipping it.
            print('Unknown time period: ' + repr(t_period))
            lifespans.append(float('nan'))

    return lifespans

In [27]:
# Phrases that introduce a statement about how long an item lasted.
terms = ['worked for','broke after', 'broke down after', 'broken after', 'broken down after', 'worked only for']
# Units a lifespan may be expressed in (singular; a trailing 's' is also accepted).
timeperiod_terms = set(['day','week','month','year'])

# Parallel lists, one entry per extracted lifespan statement.
asins = []
reviewerIDs = []
n_periods = []
t_periods = []
ratings = []
words = []


def _parse_period_count(token):
    """Return the count expressed by ``token`` ('3' -> 3.0, 'a' -> 1.0), else None."""
    try:
        return float(token)
    except ValueError:
        return float(1) if token == 'a' else None


for term in terms:
    # Number of words the trigger phrase itself occupies at the start of the
    # window (the terms are two or three words long).
    n_term_words = len(term.split())

    # regex=False: the terms are literal phrases, not patterns.
    # na=False: reviews without text simply do not match.
    break_reviews = reviews[reviews['reviewText'].str.contains(term, regex=False, na=False)]
    for index, row in break_reviews.iterrows():
        review = row['reviewText']

        # Window of ten words starting at the first occurrence of the phrase.
        i_term = review.find(term)
        words_after = review[i_term:].split()[0:10]

        # sorted(): set iteration order varies between kernel sessions, which
        # made the row order of `breakdowns` non-reproducible.
        for timeperiod in sorted(timeperiod_terms):
            accepted = (timeperiod, timeperiod + 's')
            # Locate the unit by exact token match, the same criterion used to
            # decide that the unit is present; a substring search would stop
            # at words such as 'today' or 'yesterday' when looking for 'day'.
            i_period = next((k for k, word in enumerate(words_after) if word in accepted), None)
            if i_period is None:
                continue

            words_between = words_after[n_term_words:i_period]
            if len(words_between) == 0:
                continue

            # The count is the word directly in front of the unit.
            n_periods_float = _parse_period_count(words_between[-1])
            if n_periods_float is None:
                continue

            asins.append(row['asin'])
            reviewerIDs.append(row['reviewerID'])
            n_periods.append(n_periods_float)
            t_periods.append(timeperiod)
            ratings.append(row['overall'])
            words.append(' '.join(words_after))

breakdowns = pd.DataFrame({'asin': asins, 'reviewerID': reviewerIDs,
                          'n_period': n_periods, 't_period': t_periods,
                          'rating': ratings, 'words': words})

# Express every extracted statement in months.
lifespans = get_lifespans(breakdowns)
breakdowns['lifespan'] = lifespans
print(breakdowns.head(25))

          asin  n_period  rating      reviewerID t_period  \
0   9888002198       1.0     1.0   ATXRV6Z3N08ZX      day   
1   9888002198       6.0     5.0   AA6BJGZG6XI05     week   
2   B00000J0D5       2.0     1.0   A83BIO3AO15LQ    month   
3   B00003006E       1.0     5.0  A2WYNTUJ91DLZW     year   
4   B00004Z5LR       8.0     5.0  A2BFZD9WRHVO0J    month   
5   B00005854F       2.0     1.0   AFP499OYIMJ97    month   
6   B00005JAD8       1.0     5.0  A27G747VNYQ17V     year   
7   B00005T39Y       2.0     4.0  A2ZXBFBA7EKYNR     week   
8   B000067SMH       2.0     5.0  A3EEMEJH8QZPVR     year   
9   B000068U29       5.0     1.0  A1J595X1ANFHR7      day   
10  B000069EV0       1.0     1.0  A3OIE7IE9LTPJG      day   
11  B00006I53T       8.0     2.0   AUEMHQBBKUL4Q    month   
12  B00006RVPW       1.0     2.0  A1CYK2VYWWRL5X      day   
13  B00007056H      15.0     5.0  A37FBFYV7JGRVN     year   
14  B00007KLI4       1.0     1.0  A37Y2D14F5NJHW    month   
15  B00008RW8B       1.0

Create dataframe of unique items

In [28]:
# One row per distinct product; brand and model are not known for these
# items, so both columns carry the placeholder string 'null'.
uniq_asins = breakdowns['asin'].unique()
breakdown_items = pd.DataFrame({'asin': uniq_asins}).assign(brand='null', model='null')
print(breakdown_items.head(n=3))

         asin brand model
0  9888002198  null  null
1  B00000J0D5  null  null
2  B00003006E  null  null


Write data in json format

In [29]:
def write_to_json(items, reviews, filename):
    """Write every item together with its reviews to ``filename`` as JSON.

    The file holds one object keyed by asin::

        {"<asin>": {"brand": ..., "model": ...,
                    "reviews": [{"lifespan": ..., "rating": ...}, ...]}}

    Parameters
    ----------
    items : pandas.DataFrame
        One row per item with columns ``asin``, ``brand`` and ``model``.
        The placeholder string 'null' (or a missing value) is written as
        JSON ``null``; any other value is written as is.
    reviews : pandas.DataFrame
        One row per review with columns ``asin``, ``lifespan`` and
        ``rating``. Reviews lacking a lifespan or a rating are skipped.
    filename : str
        Destination path; an existing file is overwritten.

    Notes
    -----
    The document is produced by the ``json`` module. The earlier hand-built
    text left a trailing comma after the last item and wrote brand/model
    unquoted, so the result could not be parsed as JSON.
    """
    def as_json_value(value):
        # numpy scalars are not JSON serialisable; unwrap them first.
        if hasattr(value, 'item'):
            value = value.item()
        # NaN (missing) and the notebook's 'null' placeholder both mean "unknown".
        if isinstance(value, float) and value != value:
            return None
        if isinstance(value, str) and value == 'null':
            return None
        return value

    catalogue = {}
    for index, row in items.iterrows():
        # A boolean mask keeps every review of this item; only rows missing
        # one of the two exported fields are dropped.
        thisitem_reviews = reviews[reviews['asin'] == row['asin']].dropna(subset=['lifespan', 'rating'])

        catalogue[str(row['asin'])] = {
            'brand': as_json_value(row['brand']),
            'model': as_json_value(row['model']),
            'reviews': [
                {'lifespan': lifespan, 'rating': rating}
                for lifespan, rating in zip(thisitem_reviews['lifespan'].tolist(),
                                            thisitem_reviews['rating'].tolist())
            ],
        }

    with open(filename, 'w') as jsonfile:
        json.dump(catalogue, jsonfile, indent=4)
        jsonfile.write('\n')

# Export the items extracted from the Amazon reviews.
filename = 'data/amazon.json'
write_to_json(items=breakdown_items, reviews=breakdowns, filename=filename)

Convert mock Amazon data from Young Mi

In [246]:
# Columns to keep from the mock Amazon file, and the names used in this notebook.
original_columns = ['ASIN','Rating','Months','Brand_name']
new_columns = ['asin','rating','lifespan','brand']

youngmi_amazon = (
    pd.read_csv('data/ratings_Amazon_same_brandname.csv')
    .loc[:, original_columns]
    .rename(columns=dict(zip(original_columns, new_columns)))
)
youngmi_amazon.head(n=3)

Unnamed: 0,asin,rating,lifespan,brand
0,132793040,5,6,FRUYZ9646H
1,321732944,5,28,XXSKO4415Q
2,439886341,1,20,VKNEQ7868Y
3,439886341,3,1,VKNEQ7868Y
4,439886341,1,19,VKNEQ7868Y


In [247]:
# One row per distinct (asin, brand) pair; the model is unknown, so it
# carries the placeholder string 'null'.
youngmi_amazon_products = (
    youngmi_amazon.loc[:, ['asin', 'brand']]
    .drop_duplicates()
    .assign(model='null')
)

youngmi_amazon_products.head(n=3)

Unnamed: 0,asin,brand,model
0,132793040,FRUYZ9646H,
1,321732944,XXSKO4415Q,
2,439886341,VKNEQ7868Y,
5,511189877,XIWNE4324C,
11,528881469,VXVKC6828Z,


In [248]:
def write_to_json_ym(items, reviews, filename):
    """Write branded items together with their reviews to ``filename`` as JSON.

    The file holds one object keyed by asin::

        {"<asin>": {"brand": "...", "model": ...,
                    "reviews": [{"lifespan": ..., "rating": ...}, ...]}}

    Parameters
    ----------
    items : pandas.DataFrame
        One row per item with columns ``asin``, ``brand`` and ``model``.
        ``brand`` is always written as a JSON string; a ``model`` equal to
        the placeholder string 'null' is written as JSON ``null``. Missing
        values in either column are written as ``null``.
    reviews : pandas.DataFrame
        One row per review with columns ``asin``, ``lifespan`` and
        ``rating``. Reviews lacking a lifespan or a rating are skipped.
    filename : str
        Destination path; an existing file is overwritten.

    Notes
    -----
    The document is produced by the ``json`` module. The earlier hand-built
    text left a trailing comma after the last item, so the result could not
    be parsed as JSON.
    """
    def is_missing(value):
        # NaN is the only float that differs from itself.
        return isinstance(value, float) and value != value

    catalogue = {}
    for index, row in items.iterrows():
        # A boolean mask keeps every review of this item; only rows missing
        # one of the two exported fields are dropped.
        thisitem_reviews = reviews[reviews['asin'] == row['asin']].dropna(subset=['lifespan', 'rating'])

        brand = None if is_missing(row['brand']) else str(row['brand'])

        model = row['model']
        if hasattr(model, 'item'):
            # numpy scalars are not JSON serialisable; unwrap them first.
            model = model.item()
        if is_missing(model) or (isinstance(model, str) and model == 'null'):
            model = None

        catalogue[str(row['asin'])] = {
            'brand': brand,
            'model': model,
            'reviews': [
                {'lifespan': lifespan, 'rating': rating}
                for lifespan, rating in zip(thisitem_reviews['lifespan'].tolist(),
                                            thisitem_reviews['rating'].tolist())
            ],
        }

    with open(filename, 'w') as jsonfile:
        json.dump(catalogue, jsonfile, indent=4)
        jsonfile.write('\n')

# Export the mock Amazon data set.
youngmiamazon_filename = 'data/youngmi_amazon.json'
write_to_json_ym(items=youngmi_amazon_products, reviews=youngmi_amazon, filename=youngmiamazon_filename)

Convert mock lab equipment data from Young Mi

In [241]:
# Columns to keep from the mock lab-equipment file, and the names used in this notebook.
original_columns = ['vwr.catalogue','months','ratings']
new_columns = ['asin','lifespan','rating']

# Select the source columns by name before renaming, as done for the Amazon
# file. Renaming purely by position would silently mislabel the data (or
# fail) if the CSV had extra columns or a different column order.
# NOTE(review): assumes the CSV header uses exactly the names in
# original_columns -- confirm against the file.
youngmi_lab = pd.read_csv('data/vwr.data.csv')[original_columns]
youngmi_lab.columns = new_columns
youngmi_lab.head(3)

Unnamed: 0,asin,lifespan,rating
0,129-8922,75,1
1,472-5350,29,2
2,527-6106,24,5
3,537-7921,14,4
4,665-1608,40,5


In [242]:
# One row per distinct catalogue number; brand and model are unknown, so
# both carry the placeholder string 'null'.
youngmi_lab_products = (
    youngmi_lab.loc[:, ['asin']]
    .drop_duplicates()
    .assign(brand='null', model='null')
)

youngmi_lab_products.head(n=3)

Unnamed: 0,asin,brand,model
0,129-8922,,
1,472-5350,,
2,527-6106,,
3,537-7921,,
4,665-1608,,


In [243]:
# Export the mock lab-equipment data set.
youngmi_lab_filename = 'data/youngmi_lab.json'
write_to_json(items=youngmi_lab_products, reviews=youngmi_lab, filename=youngmi_lab_filename)

Extract an item with a few reviews to present

In [52]:
# Number of extracted lifespans and their mean (in months) per product,
# most frequently mentioned products first.
aggregated = (
    breakdowns
    .groupby('asin')['lifespan']
    .agg(['count', 'mean'])
    .sort_values('count', ascending=False)
)
aggregated.head()

Unnamed: 0_level_0,count,mean
asin,Unnamed: 1_level_1,Unnamed: 2_level_1
B000S5Q9CA,5,9.1
B000MMWT9Q,2,30.416667
B000089GN2,2,9.083333
B001F7AJKI,2,7.583333
B00009KH63,2,36.5


In [53]:
# The product with the most extracted lifespan statements.
chosen_id = aggregated.index[0]
# Select its rows with a boolean mask. The earlier where(...).dropna() also
# discarded matching rows that had a missing value in any column.
chosen_reviews = breakdowns[breakdowns['asin'] == chosen_id]
chosen_reviews

Unnamed: 0,asin,n_period,rating,reviewerID,t_period,words,lifespan
110,B000S5Q9CA,1.0,5.0,A97M5R3SWXB6C,year,worked for almost a year now. What more do you,12.166667
111,B000S5Q9CA,1.0,2.0,A3V37YH9NLG6PD,month,worked for a month then it would charge my phone,1.0
112,B000S5Q9CA,8.0,5.0,A2Y8I1M87H9XOR,month,worked for about 8 months but then started to ...,8.0
113,B000S5Q9CA,1.0,5.0,A2VMV6WU9K6HCU,year,worked for about a year of heavy use before it,12.166667
114,B000S5Q9CA,1.0,3.0,A30KZWMO16ONEY,year,worked for a while... about a year and then it,12.166667
