# Converting Amazon customer review data into .csv format
---

In [2]:
import pandas as pd
import gzip
import json

def parse(path):
  """Lazily yield one decoded JSON object per line of a gzip-compressed file.

  Args:
    path: Location of a gzip file containing one JSON document per line.

  Yields:
    The Python object produced by ``json.loads`` for each non-blank line,
    in file order.

  Raises:
    OSError: If the file cannot be opened or is not valid gzip data.
    json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  # The context manager closes the gzip handle when the generator is
  # exhausted, closed early, or garbage-collected, instead of leaking it.
  with gzip.open(path, 'rb') as g:
    for l in g:
      # Tolerate blank lines (e.g. a trailing newline at end of file).
      if not l.strip():
        continue
      yield json.loads(l)

def getDF(path):
  """Collect every record yielded by ``parse(path)`` into a DataFrame.

  Each record is keyed by its 0-based position in the stream; with
  ``orient='index'`` those positions become the row labels of the result.
  """
  records = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records, orient='index')

## Conversion for Kindle Store data
---

In [2]:
# Load every review from the gzipped, one-JSON-object-per-line file into a DataFrame.
df = getDF('Kindle_Store_5.json.gz')

In [4]:
# Preview the first rows to see which columns were parsed.
df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,4.0,True,"07 3, 2014",A2LSKD2H9U8N0J,B000FA5KK0,{'Format:': ' Kindle Edition'},sandra sue marsolek,"pretty good story, a little exaggerated, but I...",pretty good story,1404345600,,
1,5.0,True,"05 26, 2014",A2QP13XTJND1QS,B000FA5KK0,{'Format:': ' Kindle Edition'},Tpl,"If you've read other max brand westerns, you k...",A very good book,1401062400,,
2,5.0,True,"09 16, 2016",A8WQ7MAG3HFOZ,B000FA5KK0,{'Format:': ' Kindle Edition'},Alverne F. Anderson,"Love Max, always a fun twist",Five Stars,1473984000,,
3,5.0,True,"03 3, 2016",A1E0MODSRYP7O,B000FA5KK0,{'Format:': ' Kindle Edition'},Jeff,"As usual for him, a good book",a good,1456963200,,
4,5.0,True,"09 10, 2015",AYUTCGVSM1H7T,B000FA5KK0,{'Format:': ' Kindle Edition'},DEHS - EddyRapcon,MB is one of the original western writers and ...,A Western,1441843200,2.0,


The full dataset is 1.418 GB when converted to CSV. We therefore take a random 10% sample of the data instead. This yields 222,299 rows: more than enough for our purposes.

In [5]:
# Draw a reproducible 10% random sample of the full review table.
SAMPLE_FRACTION = 0.1
SAMPLE_SEED = 8
df_reduced = df.sample(frac=SAMPLE_FRACTION, random_state=SAMPLE_SEED)

In [6]:
# Persist the sample. The row index is written as well (to_csv's default), which
# is why an "Unnamed: 0" column shows up when this file is read back later.
df_reduced.to_csv("kindle_reduced.csv")

## Brief data cleaning
---

In [3]:
# Reload the saved sample. No index_col is given, so the index stored in the
# file comes back as an ordinary "Unnamed: 0" column.
df_new = pd.read_csv("kindle_reduced.csv")

In [4]:
# Preview the reloaded sample.
df_new.head()

Unnamed: 0.1,Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,1126873,5.0,True,"01 24, 2015",A8VYIGNN6SZKV,B00XUR3BTC,{'Format:': ' Kindle Edition'},Becca,Great Classic for free. only wish they had th...,Five Stars,1422057600,,
1,1725214,3.0,False,"03 26, 2014",A1H5WB9NOSRIC9,B00BFNJBC4,{'Format:': ' Kindle Edition'},Jiha&#039;s file,This is a sweet and pleasant love story about ...,My Rating 3.5/5 stars!,1395792000,2.0,
2,1972797,3.0,False,"08 5, 2015",APB49T60Q0BEK,B00R8Q0NWE,{'Format:': ' Kindle Edition'},Paul Guncheon,I received the series from Amazon in return fo...,Fast and Fun Read,1438732800,,
3,1709720,3.0,False,"03 3, 2016",A3DQWL3MH5BT9F,B00ANFPPAW,{'Format:': ' Kindle Edition'},Literate - usually,This was a very hard book to rate. Technically...,Holes! Why did it have to have plot holes?,1456963200,,
4,329074,4.0,True,"05 16, 2013",A1LTABLH0TCYXT,B00CMU1IN4,{'Format:': ' Kindle Edition'},Amazon Customer,"I liked it, wished for more about her grandpar...",Pretty good,1368662400,,


In [5]:
# Keep only the rows whose "verified" flag is True.
verified_mask = df_new["verified"].eq(True)
df_new = df_new.loc[verified_mask]

In [6]:
# Inspect row count, dtypes and non-null counts after the filter.
df_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 141992 entries, 0 to 222297
Data columns (total 13 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Unnamed: 0      141992 non-null  int64  
 1   overall         141992 non-null  float64
 2   verified        141992 non-null  bool   
 3   reviewTime      141992 non-null  object 
 4   reviewerID      141992 non-null  object 
 5   asin            141992 non-null  object 
 6   style           138138 non-null  object 
 7   reviewerName    141992 non-null  object 
 8   reviewText      141961 non-null  object 
 9   summary         141895 non-null  object 
 10  unixReviewTime  141992 non-null  int64  
 11  vote            18753 non-null   float64
 12  image           76 non-null      object 
dtypes: bool(1), float64(2), int64(2), object(8)
memory usage: 14.2+ MB


In [7]:
# Sanity check: after the filter, True should be the only remaining value.
df_new["verified"].value_counts()

True    141992
Name: verified, dtype: int64

In [8]:
# Narrow the frame to the rating, date and text columns; everything else is dropped.
columns_to_keep = ["overall", "reviewTime", "reviewText", "summary"]
df_new = df_new.loc[:, columns_to_keep]

In [9]:
# Preview the narrowed frame; note the row labels still carry gaps from the filter.
df_new.head()

Unnamed: 0,overall,reviewTime,reviewText,summary
0,5.0,"01 24, 2015",Great Classic for free. only wish they had th...,Five Stars
4,4.0,"05 16, 2013","I liked it, wished for more about her grandpar...",Pretty good
6,5.0,"01 2, 2013","I really liked this short story, but I really ...",Wished it was longer...
7,5.0,"03 23, 2016",Wow I love this series,Five Stars
8,5.0,"09 19, 2015",Great series. Can't wait to read more. Love t...,Love it


Filtering out the unverified reviews left gaps in the row index (0, 4, 6, 7, ...), so we reset it to a contiguous range:

In [13]:
# Renumber the rows 0..n-1; drop=True discards the old labels instead of
# keeping them as an extra column.
df_reset = df_new.reset_index(drop=True)

In [14]:
# Display the re-indexed frame (pandas elides the middle rows in the output).
df_reset

Unnamed: 0,overall,reviewTime,reviewText,summary
0,5.0,"01 24, 2015",Great Classic for free. only wish they had th...,Five Stars
1,4.0,"05 16, 2013","I liked it, wished for more about her grandpar...",Pretty good
2,5.0,"01 2, 2013","I really liked this short story, but I really ...",Wished it was longer...
3,5.0,"03 23, 2016",Wow I love this series,Five Stars
4,5.0,"09 19, 2015",Great series. Can't wait to read more. Love t...,Love it
...,...,...,...,...
141987,5.0,"12 24, 2014","It was a cute, short, sweet read. Very well wr...",Sweet
141988,5.0,"04 13, 2015",loved it,Five Stars
141989,5.0,"10 1, 2014",WOW - this was a 5++ rating. It was hard to p...,Fatal Flowers
141990,3.0,"08 20, 2017",Pretty good read.,Three Stars


In [15]:
# Write the cleaned data to disk.
# NOTE(review): the fresh 0..n-1 index is also written as an unnamed first
# column (to_csv's default); consider index=False if downstream readers do not
# rely on that column — confirm before changing.
df_reset.to_csv("kindle_cleaned.csv")