In [14]:
#This notebook will be used to explore and understand the 200000 Amazon Reviews Data set

import csv
from os import path

import matplotlib as matplot
import matplotlib.pyplot as plt
import pandas as pd

In [15]:
# Build the relative path to the processed reviews CSV, load it, and preview the first rows
dataDir = path.join("..", "datasets", "data")
filepath = path.join(dataDir, "amz200k_processed.csv.bak")

reviewsDF = pd.read_csv(filepath, encoding="utf-8")

reviewsDF.head()

Unnamed: 0.1,Unnamed: 0,Review Time,Item ID,Rating,Summary,Categories,Review Text
0,0,"07 8, 2014",I948571311,5.0,"Cute, retro dress!","[['Clothing, Shoes & Jewelry', 'Women'], ['Clo...",Beautiful red dress! Material is a little thin...
1,1,"12 26, 2012",I543938031,5.0,Great coat and great price,"[['Clothing, Shoes & Jewelry', 'Men', 'Surf, S...",I got this color for a very good price ($119)....
2,2,"12 15, 2013",I506749476,4.0,"Cute, good buy","[['Clothing, Shoes & Jewelry', 'Women'], ['Clo...","I have a very slender shape, size 1, and this ..."
3,3,"07 19, 2013",I969695447,5.0,Very Versatile Wallet for the man in your life,"[['Clothing, Shoes & Jewelry', 'Men', 'Accesso...",My boyfriend needed a new wallet for his birth...
4,4,"04 12, 2013",I225955774,5.0,compared to A&F Super Skinny 0S,"[['Clothing, Shoes & Jewelry', 'Women', 'Cloth...",I bought 3 pairs which are very different. Ind...


In [16]:
# Getting rid of the extra column 'Unnamed: 0' (presumably an index saved into the CSV).
# The column is dropped by label and the result is rebound, instead of dropping
# reviewsDF.columns[0] in place: a positional in-place drop removes a *different* column
# every time the cell is re-run ('Review Time' next, then 'Item ID', ...).
# errors="ignore" turns a repeated run into a no-op once the column is already gone.

reviewsDF = reviewsDF.drop(columns=["Unnamed: 0"], errors="ignore")

reviewsDF.head()


Unnamed: 0,Review Time,Item ID,Rating,Summary,Categories,Review Text
0,"07 8, 2014",I948571311,5.0,"Cute, retro dress!","[['Clothing, Shoes & Jewelry', 'Women'], ['Clo...",Beautiful red dress! Material is a little thin...
1,"12 26, 2012",I543938031,5.0,Great coat and great price,"[['Clothing, Shoes & Jewelry', 'Men', 'Surf, S...",I got this color for a very good price ($119)....
2,"12 15, 2013",I506749476,4.0,"Cute, good buy","[['Clothing, Shoes & Jewelry', 'Women'], ['Clo...","I have a very slender shape, size 1, and this ..."
3,"07 19, 2013",I969695447,5.0,Very Versatile Wallet for the man in your life,"[['Clothing, Shoes & Jewelry', 'Men', 'Accesso...",My boyfriend needed a new wallet for his birth...
4,"04 12, 2013",I225955774,5.0,compared to A&F Super Skinny 0S,"[['Clothing, Shoes & Jewelry', 'Women', 'Cloth...",I bought 3 pairs which are very different. Ind...


In [23]:
# Checking to see if the dataset has any NaN values: count() reports the non-null entries
# per column, so the counts before and after dropna() differ only if NaN values existed
print(reviewsDF.count())
# Rebind the result instead of using inplace=True so the cleaning step is an explicit assignment
reviewsDF = reviewsDF.dropna()
print("\n-- After --")
reviewsDF.count()

Review Time    1400
Item ID        1400
Rating         1400
Summary        1400
Categories     1400
Review Text    1400
dtype: int64

-- After --


Review Time    1400
Item ID        1400
Rating         1400
Summary        1400
Categories     1400
Review Text    1400
dtype: int64

In [24]:
# Summary statistics over every column (numeric and object alike) to see how the dataset is composed
reviewsDF.describe(include="all")

Unnamed: 0,Review Time,Item ID,Rating,Summary,Categories,Review Text
count,1400,1400,1400.0,1400,1400,1400
unique,676,1309,,1299,813,1400
top,"01 23, 2014",I784582779,,Perfect,"[['Clothing, Shoes & Jewelry', 'Women'], ['Clo...",these earrings aren't too big or too small i l...
freq,8,4,,7,23,1
mean,,,4.199286,,,
std,,,1.147892,,,
min,,,1.0,,,
25%,,,4.0,,,
50%,,,5.0,,,
75%,,,5.0,,,


In [36]:
# Getting summary statistics for the Rating column
ratingCol = reviewsDF['Rating']

print(f"Mode: {ratingCol.mode()}\n")
# The median does not depend on row order, so no sort_values() is needed before taking it
print(f"(ascending order) Median: {ratingCol.median()}\n")
print(f"{ratingCol.describe()}\n")
print(f"Unique Values are:\n{ratingCol.value_counts()}\n")

Mode: 0    5.0
dtype: float64

(ascending order) Median: 5.0

count    1400.000000
mean        4.199286
std         1.147892
min         1.000000
25%         4.000000
50%         5.000000
75%         5.000000
max         5.000000
Name: Rating, dtype: float64

Unique Values are:
5.0    806
4.0    285
3.0    156
2.0     88
1.0     65
Name: Rating, dtype: int64



# What have we learned up until now

## What type of data do we have?
```
- The dataset has 6 usable columns
- 1 'Unnamed: 0' column that needed to be cleaned out
- 5 columns (excluding Rating) are all text/object values
- The Categories column still has Python lists in raw text form that need to be cleaned
- The entire dataset downloaded from Kaggle has been pre-curated and contains NO NaN values
- We were able to obtain a dataset with 1400 Amazon product reviews
```


## Insights for the column Rating
````
- The Rating column is the only one with numerical values
- The Rating column has 5 discrete values (1 - 5) for its scoring
- We have a heavily skewed dataset leaning towards positive ratings (806 of the 1400 reviews are 5-star)
````


In [None]:
# Visualizing the values of the column Rating as a bar graph
ratingsX = [1, 2, 3, 4, 5]

# value_counts() orders its result by frequency, not by rating value, so sorting the counts
# (ascending=True) only lines up with ratingsX while the counts happen to grow with the rating.
# Re-indexing by ratingsX pins every count to its own rating and fills a rating that never
# occurs with 0, so the bar heights always have the same length and order as ratingsX.
ratingsTotal = (
    reviewsDF["Rating"]
    .value_counts()
    .reindex(ratingsX, fill_value=0)
    .tolist()
)

# Explicit figure/axes objects instead of the implicit pyplot state machine
fig, ax = plt.subplots()
fig.suptitle(' Distribution of values for Ratings', fontsize=20)
ax.bar(ratingsX, ratingsTotal, color="b", align="center")
ax.set_xlabel('Ratings', fontsize=18)
ax.set_ylabel('Total count (Ratings)', fontsize=16)

fig.savefig(path.join("..","images","RatingsBarGraph.png"))