In [1]:
import numpy as np
import pandas as pd
import pandas as pd
import re


import matplotlib.pyplot as plt


In [2]:
# Read the Amazon Fine Food reviews CSV from the working directory
# and preview its first three rows.
data_ori = pd.read_csv(filepath_or_buffer=r"./Reviews.csv")
data_ori.head(n=3)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...


In [3]:
# Neutral reviews (Score == 3) are excluded: the objective of this project is
# to predict whether a review is negative (Score 1 or 2) or positive
# (Score 4 or 5).
data_ori = data_ori.loc[data_ori["Score"].ne(3)]
# Sanity check: list the distinct scores that remain after the filter.
data_ori["Score"].unique()

array([5, 1, 4, 2])

In [4]:
"""
Some customers posted identical reviews for different products.
For example, the first two rows below have the same summary ('"Green" K-cup packaging sacrifices flavor') 
and the same text ('Overall its just OK when considering the price.'), were sent at the same time by Breyton 
(and therefore share the same UserId), and both scores are negative.

Although the list of product names could not be accessed, ProductId=B005ZBZLT4 and ProductId=B007Y59HVM 
represent two products of the same brand but with different flavors or quantities. Therefore, the inference 
is that reviews with different ProductIds but otherwise identical fields refer to the same product in 
different flavors or quantities.

As a result, the entries that have a different ProductId but are otherwise identical were deleted. 
Redundancy in the training data can bias the model.
"""

# Preview the reviews of users who have more than one review, ordered by UserId.
# A vectorised `duplicated(keep=False)` mask selects those rows directly; this
# avoids materialising one small DataFrame per user in a Python-level groupby
# loop and returns an empty frame (instead of raising) when no user has
# multiple reviews. Rows with a missing UserId are excluded, and the stable
# sort keeps each user's rows in their existing relative order.
(
    data_ori[data_ori["UserId"].notna() & data_ori["UserId"].duplicated(keep=False)]
    .sort_values("UserId", kind="mergesort")
    .head(5)
)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
83317,83318,B005ZBZLT4,#oc-R115TNMSPFT9I7,Breyton,2,3,2,1331510400,"""Green"" K-cup packaging sacrifices flavor",Overall its just OK when considering the price...
180871,180872,B007Y59HVM,#oc-R115TNMSPFT9I7,Breyton,2,3,2,1331510400,"""Green"" K-cup packaging sacrifices flavor",Overall its just OK when considering the price...
290947,290948,B005HG9ESG,#oc-R11D9D7SHXIJB9,"Louis E. Emory ""hoppy""",0,0,5,1342396800,Muscle spasms,"My wife has recurring extreme muscle spasms, u..."
455533,455534,B005HG9ERW,#oc-R11D9D7SHXIJB9,"Louis E. Emory ""hoppy""",0,0,5,1342396800,Muscle spasms,"My wife has recurring extreme muscle spasms, u..."
496893,496894,B005HG9ET0,#oc-R11D9D7SHXIJB9,"Louis E. Emory ""hoppy""",0,0,5,1342396800,Muscle spasms,"My wife has recurring extreme muscle spasms, u..."


In [5]:
"""
The approach used to eliminate entries of the same product listed under different ProductIds is to first 
sort the data by ProductId in ascending order, then keep the first review as the representative and delete 
the remaining reviews. Sorting before elimination makes sure that noncontiguous entries of the 
same product do not end up with more than one representative. 

"""

# Sort the data by ProductId in ascending order so that the subsequent
# drop_duplicates(keep='first') retains the row with the smallest ProductId.
# A stable sort (mergesort) is requested explicitly: the default quicksort may
# reorder rows that share a ProductId, which would make the retained row among
# such ties depend on the sort implementation instead of the existing order.
data_ori = data_ori.sort_values('ProductId', kind='mergesort')
# Summarise row count, dtypes and non-null counts after sorting.
data_ori.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 525814 entries, 150523 to 327600
Data columns (total 10 columns):
Id                        525814 non-null int64
ProductId                 525814 non-null object
UserId                    525814 non-null object
ProfileName               525798 non-null object
HelpfulnessNumerator      525814 non-null int64
HelpfulnessDenominator    525814 non-null int64
Score                     525814 non-null int64
Time                      525814 non-null int64
Summary                   525789 non-null object
Text                      525814 non-null object
dtypes: int64(5), object(5)
memory usage: 44.1+ MB


In [6]:
# Deduplication: rows sharing the same UserId, ProfileName and Text are
# treated as repeats, and only the first occurrence of each is retained.
# NOTE(review): the narrative earlier in this notebook describes duplicates as
# identical in every field except ProductId, but Score, Time and Summary are
# not part of this key -- confirm that is intended.
dedup_keys = ['UserId', 'ProfileName', 'Text']
is_repeat = data_ori.duplicated(subset=dedup_keys, keep='first')
data_ori = data_ori[~is_repeat]
data_ori.shape

(363899, 10)

In [7]:
"""
Erroneous entries in which HelpfulnessNumerator is greater than HelpfulnessDenominator were deleted, since this is impossible in real life.

As shown below, the entries with Id 64422 and 44737 are erroneous data where HelpfulnessNumerator is greater than HelpfulnessDenominator.

"""

# List the rows where HelpfulnessNumerator exceeds HelpfulnessDenominator.
data_ori.query("HelpfulnessNumerator > HelpfulnessDenominator")

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
64421,64422,B000MIDROQ,A161DK06JJMCYF,"J. E. Stephens ""Jeanne""",3,1,5,1224892800,Bought This for My Son at College,My son loves spaghetti so I didn't hesitate or...
44736,44737,B001EQ55RW,A2V0I904FH7ABY,Ram,3,2,4,1212883200,Pure cocoa taste with crunchy almonds inside,It was almost a 'love at first bite' - the per...


In [8]:
# Keep only the rows where HelpfulnessNumerator does not exceed
# HelpfulnessDenominator.
data_ori = data_ori.loc[
    data_ori["HelpfulnessNumerator"].le(data_ori["HelpfulnessDenominator"])
]

In [9]:
# Display the (rows, columns) dimensions of data_ori at this point.
data_ori.shape

(363897, 10)

In [10]:
"""
Detect whether any of the features used for training the model contain missing values. If missing values are
detected, print the number of missing values in each feature.

Missing values must be replaced because, in the later stages, a missing value that is not a string would cause a bug.
"""
# Number of missing entries in each column.
data_ori.isnull().sum()

Id                         0
ProductId                  0
UserId                     0
ProfileName               11
HelpfulnessNumerator       0
HelpfulnessDenominator     0
Score                      0
Time                       0
Summary                    1
Text                       0
dtype: int64

In [11]:
# Show how a missing Summary value prints: collect the index labels of the
# rows whose Summary is missing, then print each of those values (NaN).
summary_is_missing = data_ori['Summary'].isna()
index_missing = data_ori.index[summary_is_missing].tolist()
print('The missing values look like:')

for row_label in index_missing:
    print(data_ori.loc[row_label, 'Summary'], end = ' ')

The missing values look like:
nan 

In [12]:
# Fill missing values with a single blank-space string, but only in the
# non-numeric columns. Calling fillna(' ') on the whole frame would also write
# ' ' into numeric columns if they ever contained NaN, silently converting
# them to object dtype.
text_columns = data_ori.select_dtypes(exclude=['number']).columns
data_ori = data_ori.fillna({column: ' ' for column in text_columns})

In [13]:
# Re-count the missing entries per column after the fill step.
data_ori.isnull().sum()

Id                        0
ProductId                 0
UserId                    0
ProfileName               0
HelpfulnessNumerator      0
HelpfulnessDenominator    0
Score                     0
Time                      0
Summary                   0
Text                      0
dtype: int64