## Analysis of Amazon product reviews for the Electronics category (approx. 7.8M reviews/ratings)

In [1]:
import pandas as pd
import numpy as np
import matplotlib
import gzip
import json
import datetime
from IPython.display import display, display_pretty, Javascript, HTML
from pandas_highcharts.core import serialize
from pandas_highcharts.display import display_charts

### Parsing & unpacking the gzip-compressed JSON reviews file

In [2]:
def parse_zipfile(path):
    """Yield one decoded record per non-blank line of a gzip-compressed file.

    Each line is decoded as strict JSON first; if that fails it is parsed as a
    Python literal (e.g. a single-quoted dict). Unlike ``eval``, neither parser
    can execute code embedded in the file.

    Parameters
    ----------
    path : str
        Path to the gzip-compressed, line-delimited records file.

    Yields
    ------
    object
        The decoded value of each line (a dict for review records).

    Raises
    ------
    ValueError, SyntaxError
        If a line is neither valid JSON nor a valid Python literal.
    """
    import ast  # local import: only needed for the Python-literal fallback

    # The context manager closes the gzip handle even if the consumer stops
    # iterating early or a line fails to parse.
    with gzip.open(path, 'rt', encoding='utf-8') as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            try:
                record = json.loads(line)
            except ValueError:
                # SECURITY: the original used eval() here, which runs arbitrary
                # code from the data file. literal_eval accepts literals only.
                record = ast.literal_eval(line)
            yield record

def get_PD_dataframe(path):
    """Load every record yielded by ``parse_zipfile(path)`` into a DataFrame.

    Records are keyed by their 0-based position in the file, and that position
    becomes the row index of the returned frame.
    """
    # enumerate() supplies the running row number the frame is indexed by.
    records_by_position = dict(enumerate(parse_zipfile(path)))
    return pd.DataFrame.from_dict(records_by_position, orient='index')

In [3]:
# Load the full Electronics reviews file into a pandas DataFrame.
REVIEWS_PATH = 'reviews_Electronics.json.gz'
df = get_PD_dataframe(REVIEWS_PATH)

In [4]:
# Preview the first five reviews (head() defaults to n=5).
df.head()

Unnamed: 0,reviewerID,reviewerName,summary,reviewTime,asin,reviewText,overall,unixReviewTime,helpful
0,AKM1MP6P0OYPR,"Vicki Gibson ""momo4""",Very thorough,"04 13, 2013",132793040,Corey Barker does a great job of explaining Bl...,5.0,1365811200,"[1, 1]"
1,A2CX7LUOHB2NDG,Bernie,Adobe Photoshop CS5 Crash Course with master P...,"07 1, 2012",321732944,While many beginner DVDs try to teach you ever...,5.0,1341100800,"[0, 0]"
2,A2NWSAGRHCP8N5,bowmans2007,absolutely horrible,"04 29, 2013",439886341,It never worked. My daughter worked to earn th...,1.0,1367193600,"[1, 1]"
3,A2WNBOD3WNDNKT,JAL,Disappointing,"07 22, 2013",439886341,Some of the functions did not work properly. ...,3.0,1374451200,"[1, 1]"
4,A1GI0U4ZRJA8WN,Truthfull,TERRIBLE DONT WASTE YOUR MONEY,"04 18, 2012",439886341,Do not waste your money on this thing it is te...,1.0,1334707200,"[4, 4]"


In [5]:
# (rows, columns) of the loaded reviews frame.
df.shape

(7824482, 9)

In [6]:
# Convert the epoch-second review timestamps into pandas datetimes.
review_epoch_seconds = df['unixReviewTime']
df['reviewdate'] = pd.to_datetime(review_epoch_seconds, unit='s')

In [7]:
# Derive the calendar month of each review from its datetime.
df['month'] = df['reviewdate'].dt.month

In [8]:
# Derive the calendar year of each review from its datetime.
df['year'] = df['reviewdate'].dt.year

In [9]:
# Inspect the dtype of every column in the reviews frame.
df.dtypes

reviewerID                object
reviewerName              object
summary                   object
reviewTime                object
asin                      object
reviewText                object
overall                  float64
unixReviewTime             int64
helpful                   object
reviewdate        datetime64[ns]
month                      int32
year                       int32
dtype: object

In [10]:
# Summarize the mean rating and the review count for each year.
# String aggregation names replace np.mean / np.size: recent pandas warns about
# numpy callables here, and np.size runs as a slow per-group Python call.
# 'size' counts every row in the group, exactly as np.size did.
df2 = df.groupby('year').agg({'overall': 'mean', 'reviewerID': 'size'})

In [11]:
# Give the aggregated columns descriptive names.
yearly_column_names = {'overall': 'avg_rating', 'reviewerID': 'num_review'}
df2 = df2.rename(columns=yearly_column_names)

In [12]:
# Move the group key out of the index and back into an ordinary column.
df2 = df2.reset_index(drop=False)

In [13]:
# Preview the first five rows of the per-year summary.
df2.head(5)

Unnamed: 0,year,num_review,avg_rating
0,1998,4,4.5
1,1999,1212,4.066007
2,2000,9214,4.113957
3,2001,14753,3.928489
4,2002,18648,3.850118


In [14]:
# Summary statistics (count, mean, std, min, quartiles, max) of the per-year summary columns.
df2.describe()

Unnamed: 0,year,num_review,avg_rating
count,17.0,17.0,17.0
mean,2006.0,460263.6,3.944268
std,5.049752,741075.0,0.21257
min,1998.0,4.0,3.556025
25%,2002.0,18648.0,3.850118
50%,2006.0,86659.0,3.958765
75%,2010.0,475626.0,4.053685
max,2014.0,2626582.0,4.5


In [15]:
# Chart the yearly trend: average rating plus review count on a secondary axis.
df2 = df2.set_index('year', drop=False)  # 'year' stays available as a column too
display_charts(
    df2,
    kind='bar',
    y=['avg_rating', 'num_review'],
    secondary_y=['num_review'],
    title="Amazon Ratings - Electronics Category",
    zoom="xy",
)

### The average rating over the years is around 4, so the majority of product reviews are positive. How many reviews were negative in 2013?

In [16]:
# Bucket each star rating into a sentiment label: >= 4 is 'Positive', exactly 3
# is 'Neutral', and everything else (including missing values) is 'Negative'.
# np.select evaluates the conditions on the whole column at once instead of
# looping over every review in Python.
stars = df['overall'].to_numpy()
df['ratings'] = np.select(
    [stars >= 4, stars == 3],
    ['Positive', 'Neutral'],
    default='Negative',
)

In [17]:
# Mean rating and review count for every (year, sentiment label) pair.
# String aggregation names replace np.mean / np.size: recent pandas warns about
# numpy callables here. 'size' counts every row in the group, as np.size did.
df1 = df.groupby(['year', 'ratings']).agg({'overall': 'mean', 'reviewerID': 'size'})

In [18]:
# Give the aggregated columns descriptive names.
sentiment_column_names = {'overall': 'avg_rating', 'reviewerID': 'num_review'}
df1 = df1.rename(columns=sentiment_column_names)

In [19]:
# Turn the (year, ratings) group keys back into ordinary columns.
df1 = df1.reset_index(drop=False)

In [20]:
# Inspect the column dtypes of the per-year, per-sentiment summary.
df1.dtypes

year            int64
ratings        object
num_review      int64
avg_rating    float64
dtype: object

In [21]:
# Build a "<year>-<sentiment>" label per row with vectorized string
# concatenation rather than a row-wise apply(axis=1).
df1['average_ratings_in_year'] = df1['year'].astype(str) + '-' + df1['ratings']
# Use the label as the index while keeping it available as a column.
df1 = df1.set_index(df1['average_ratings_in_year'])
display_charts(
    df1,
    y=['avg_rating', 'num_review'],
    secondary_y=['num_review'],
    kind='bar',
    title='Average Ratings & Review Count by Year',
    zoom='xy',
)

In [22]:
# NOTE: plain assignment — df5 is a second name for the same DataFrame object as df1, not a copy.
df5=df1

In [23]:
# Pie chart of review counts, one slice per "<year>-<sentiment>" label.
df5 = df5.set_index('average_ratings_in_year', drop=False)
display_charts(
    df5,
    kind='pie',
    y=['num_review'],
    title='# Reviews by Rating',
)

### Why did the review count drop in 2014? Is there a seasonal trend in the number of reviews?

In [24]:
# Count reviews (non-null reviewerID values) for every year/month combination.
reviews_by_month = df.groupby(['year', 'month'])
df3 = reviews_by_month['reviewerID'].count()

In [25]:
# Turn the (year, month) group keys back into ordinary columns.
df3 = df3.reset_index(drop=False)

In [26]:
# Order the monthly counts chronologically.
df3 = df3.sort_values(by=['year', 'month'], ascending=True)

In [27]:
# Build a "<month>-<year>" label per row with vectorized string concatenation
# rather than a row-wise apply(axis=1).
df3['datekey'] = df3['month'].astype(str) + '-' + df3['year'].astype(str)

In [28]:
# Index the monthly counts by their "<month>-<year>" label, keeping the column too.
df3 = df3.set_index('datekey', drop=False)

In [29]:
# Label the count column and plot review volume over time.
# NOTE(review): the title says "by Year" although the index is a month-year key.
df3 = df3.rename(columns={'reviewerID': 'num_review'})
display_charts(
    df3,
    y=['num_review'],
    title="# Reviews by Year - Electronics Category",
    zoom="xy",
)