In [57]:
# Import libraries
import numpy as np
import pandas as pd 
import sys
import matplotlib.pyplot as plt

%matplotlib inline

In [38]:
# Load the dataset
df = pd.read_csv('data/reviews.csv')

df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [39]:
df.shape

(568454, 10)

In [40]:
df.isnull().sum()

Id                         0
ProductId                  0
UserId                     0
ProfileName               26
HelpfulnessNumerator       0
HelpfulnessDenominator     0
Score                      0
Time                       0
Summary                   27
Text                       0
dtype: int64

In [41]:
df = df.dropna()

In [42]:
df.isnull().sum()

Id                        0
ProductId                 0
UserId                    0
ProfileName               0
HelpfulnessNumerator      0
HelpfulnessDenominator    0
Score                     0
Time                      0
Summary                   0
Text                      0
dtype: int64

In [43]:
df.shape

(568401, 10)

In [44]:
df.columns

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')

### Select useful columns

In [45]:
useful_columns = ['UserId', 'ProductId', 'Time', 'Score', 'Summary', 'Text']
df = df[useful_columns]
df.head()

Unnamed: 0,UserId,ProductId,Time,Score,Summary,Text
0,A3SGXH7AUHU8GW,B001E4KFG0,1303862400,5,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,A1D87F6ZCVE5NK,B00813GRG4,1346976000,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,ABXLMWJIXXAIN,B000LQOCH0,1219017600,4,"""Delight"" says it all",This is a confection that has been around a fe...
3,A395BORC6FGVXV,B000UA0QIQ,1307923200,2,Cough Medicine,If you are looking for the secret ingredient i...
4,A1UQRSCLF8GW1T,B006K2ZZ7K,1350777600,5,Great taffy,Great taffy at a great price. There was a wid...


### Preprocess Summary and Text columns

In [46]:
# Lower case summary and text
df['Summary'] = df['Summary'].str.lower()
df['Text'] = df['Text'].str.lower()
print(df["Summary"][0])
print(df["Text"][0])

good quality dog food
i have bought several of the vitality canned dog food products and have found them all to be of good quality. the product looks more like a stew than a processed meat and it smells better. my labrador is finicky and she appreciates this product better than  most.


In [47]:
# Expand contractions
import contractions

df['Summary'] = df['Summary'].apply(lambda x: contractions.fix(x))
df['Text'] = df['Text'].apply(lambda x: contractions.fix(x))

In [48]:
# Remove punctuation
import re
import string

df["Summary"] = df["Summary"].apply(lambda x: re.sub('[%s]' % re.escape(string.punctuation), '' , str(x)))
df['text'] = df['Text'].apply(lambda x: re.sub('[%s]' % re.escape(string.punctuation), '' , str(x)))

In [49]:
# Remove words containing digits and digits
df['Summary'] = df['Summary'].apply(lambda x: re.sub(r'\b\S*\d\S*\b', '', str(x)))
df['Text'] = df['Text'].apply(lambda x: re.sub(r'\b\S*\d\S*\b', '', str(x)))

In [50]:
# Remove stopwords 
#import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
stop_words.add('subject')
stop_words.add('http')
def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in stop_words])
#here we have implemented a custom function that will split each word from the text and check whether it is a stopword or not.
#If not then pass as it is in string and if stopword then removes it.
df['Summary'] = df['Summary'].apply(lambda x: remove_stopwords(x))
df['Text'] = df['Text'].apply(lambda x: remove_stopwords(x))

In [53]:
#stemming
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
def stem_words(text):
    return " ".join([stemmer.stem(word) for word in text.split()])
df['Summary_stem'] = df['Summary'].apply(lambda x: stem_words(x))
df['Text_stem'] = df['Text'].apply(lambda x: stem_words(x))

In [55]:
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ADAMA\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\ADAMA\AppData\Roaming\nltk_data...


True

In [56]:
# Lemmatization
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])
df['Summary_lem'] = df['Summary'].apply(lambda x: lemmatize_words(x))
df['Text_lem'] = df['Text'].apply(lambda x: lemmatize_words(x))

In [58]:
df.head()

Unnamed: 0,UserId,ProductId,Time,Score,Summary,Text,text,Summary_stem,Text_stem,Summary_lem,Text_lem
0,A3SGXH7AUHU8GW,B001E4KFG0,1303862400,5,good qualiti dog food,bought several vitality canned dog food produc...,i have bought several of the vitality canned d...,good qualiti dog food,bought sever vital can dog food product found ...,good qualiti dog food,bought several vitality canned dog food produc...
1,A1D87F6ZCVE5NK,B00813GRG4,1346976000,1,advertis,product arrived labeled jumbo salted peanuts.....,product arrived labeled as jumbo salted peanut...,adverti,product arriv label jumbo salt peanuts...th pe...,advertis,product arrived labeled jumbo salted peanuts.....
2,ABXLMWJIXXAIN,B000LQOCH0,1219017600,4,delight say,"confection around centuries. light, pillowy ci...",this is a confection that has been around a fe...,delight say,"confect around centuries. light, pillowi citru...",delight say,"confection around centuries. light, pillowy ci..."
3,A395BORC6FGVXV,B000UA0QIQ,1307923200,2,cough medicin,looking secret ingredient robitussin believe f...,if you are looking for the secret ingredient i...,cough medicin,look secret ingredi robitussin believ found it...,cough medicin,looking secret ingredient robitussin believe f...
4,A1UQRSCLF8GW1T,B006K2ZZ7K,1350777600,5,great taffi,great taffy great price. wide assortment yummy...,great taffy at a great price there was a wide...,great taffi,great taffi great price. wide assort yummi taf...,great taffi,great taffy great price. wide assortment yummy...


In [59]:
df.to_csv('data/reviews_cleaned.csv')