##### Used the extra libraries - contractions and sklearn, for the homework implementation

##### Below is the Python version used

In [1]:
!python --version

Python 3.9.12


In [2]:
import pandas as pd
import numpy as np
import nltk
nltk.download('wordnet')
import re
from bs4 import BeautifulSoup
import contractions
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ragha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
#! pip install bs4 # in case you don't have it installed

# Dataset: https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Beauty_v1_00.tsv.gz

## Read Data

##### Reading the dataset using pandas

In [4]:
df = pd.read_csv('data.tsv', header=0, sep='\t', quotechar='"', on_bad_lines='skip', dtype='unicode')

In [5]:
df

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,1797882,R3I2DHQBR577SS,B001ANOOOE,2102612,The Naked Bee Vitmin C Moisturizing Sunscreen ...,Beauty,5,0,0,N,Y,Five Stars,"Love this, excellent sun block!!",2015-08-31
1,US,18381298,R1QNE9NQFJC2Y4,B0016J22EQ,106393691,"Alba Botanica Sunless Tanning Lotion, 4 Ounce",Beauty,5,0,0,N,Y,Thank you Alba Bontanica!,The great thing about this cream is that it do...,2015-08-31
2,US,19242472,R3LIDG2Q4LJBAO,B00HU6UQAG,375449471,"Elysee Infusion Skin Therapy Elixir, 2oz.",Beauty,5,0,0,N,Y,Five Stars,"Great Product, I'm 65 years old and this is al...",2015-08-31
3,US,19551372,R3KSZHPAEVPEAL,B002HWS7RM,255651889,"Diane D722 Color, Perm And Conditioner Process...",Beauty,5,0,0,N,Y,GOOD DEAL!,I use them as shower caps & conditioning caps....,2015-08-31
4,US,14802407,RAI2OIG50KZ43,B00SM99KWU,116158747,Biore UV Aqua Rich Watery Essence SPF50+/PA+++...,Beauty,5,0,0,N,Y,this soaks in quick and provides a nice base f...,This is my go-to daily sunblock. It leaves no ...,2015-08-31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5094302,US,50113639,RZ7RZ02MTP4SL,B000050B70,185454094,Conair NE150NSCS Cordless Nose and Ear Hair Tr...,Beauty,5,10,10,N,N,Great Little Grooming Tool,After watching my Dad struggle with his scisso...,2000-11-12
5094303,US,52940456,R2IRC0IZ8YCE5T,B000050FF2,678848064,Homedics Envirascape Sound Spa Alarm Clock Radio,Beauty,3,23,23,N,N,Not bad for the price,"Like most sound machines, the sounds choices a...",2000-11-07
5094304,US,47587881,R1U4ZSXOD228CZ,B000050B6U,862195513,Conair Instant Heat Curling Iron,Beauty,5,89,97,N,N,Best Curling Iron Ever,I bought this product because it indicated 30 ...,2000-11-02
5094305,US,53047750,R3SFJLZE09URWM,B000050FDE,195242894,Oral-B Professional Care 1000 Power Toothbrush,Beauty,5,10,10,N,N,"The best electric toothbrush ever, REALLY!",We have used Oral-B products for 15 years; thi...,2000-11-01


##### The dataset contains 5094307 reviews

## Keep Reviews and Ratings

In [6]:
df = pd.concat([df.iloc[:, 7], df.iloc[:, 13]], axis = 1)
df

Unnamed: 0,star_rating,review_body
0,5,"Love this, excellent sun block!!"
1,5,The great thing about this cream is that it do...
2,5,"Great Product, I'm 65 years old and this is al..."
3,5,I use them as shower caps & conditioning caps....
4,5,This is my go-to daily sunblock. It leaves no ...
...,...,...
5094302,5,After watching my Dad struggle with his scisso...
5094303,3,"Like most sound machines, the sounds choices a..."
5094304,5,I bought this product because it indicated 30 ...
5094305,5,We have used Oral-B products for 15 years; thi...


 ## We form three classes and select 20000 reviews randomly from each class.



##### Removing the rows from the dataset that contain missing values

In [7]:
df.isna().sum()

star_rating     10
review_body    400
dtype: int64

In [8]:
df = df.dropna(subset=['star_rating','review_body'])

In [9]:
df.isna().sum()

star_rating    0
review_body    0
dtype: int64

##### We are dividing the Amazon reviews in 3 classes - 
##### Class 1 - ratings with the values of 1 and 2<br>Class 2 - ratings with the value of 3<br>Class 3 - ratings with the values of 4 and 5

In [10]:
pd.options.mode.chained_assignment = None 
def labelClass(rating):
    if rating == "1" or rating == "2":
          return 1
    if rating == "3" :
          return 2
    if rating == "4" or rating  == "5":
          return 3
df['class'] = df['star_rating'].map(labelClass)

In [11]:
df

Unnamed: 0,star_rating,review_body,class
0,5,"Love this, excellent sun block!!",3
1,5,The great thing about this cream is that it do...,3
2,5,"Great Product, I'm 65 years old and this is al...",3
3,5,I use them as shower caps & conditioning caps....,3
4,5,This is my go-to daily sunblock. It leaves no ...,3
...,...,...,...
5094302,5,After watching my Dad struggle with his scisso...,3
5094303,3,"Like most sound machines, the sounds choices a...",2
5094304,5,I bought this product because it indicated 30 ...,3
5094305,5,We have used Oral-B products for 15 years; thi...,3


##### Selecting 20000 ratings from each class, randomly

In [12]:
class1 = df.loc[df['class'] == 1].sample(n=20000, random_state=1)
class2 = df.loc[df['class'] == 2].sample(n=20000, random_state=1)
class3 = df.loc[df['class'] == 3].sample(n=20000, random_state=1)
df = pd.concat([class1, class2, class3])

# Data Cleaning



### We perform the following data cleaning processes - 
#### 1. Convert all reviews to lowercase: This is done using the str.lower() function of Python <br>2. Remove all HTML and URLs: Used pandas.dataframe.apply on the row with a regex to remove HTML and URLs<br> 3. Remove non-alphabetical characters: Used pandas.dataframe.apply on the row with a regex to remove non-alphabetical characters<br>4. Remove extra spaces in reviews: Used pandas.dataframe.apply on the row with a regex to remove extra spaces<br>5. Perfom contractions on reviews: Used Python's contractions library to perform contractions

In [13]:
mean_length_before_clean = df['review_body'].apply(len).mean()

##### Converting all reviews into lowercase

In [14]:
df['review_body'] = df['review_body'].str.lower()
df

Unnamed: 0,star_rating,review_body,class
3957522,2,"i used this only for my eyebrows, as over time...",1
2728078,1,smelled more like pine sol. not happy with thi...,1
1437220,1,i used it and did not see any changes at,1
372183,1,i've used this on my body...no problems. then ...,1
774226,1,did not have safety seal on.. it could've just...,1
...,...,...,...
2530393,4,this product is awesome for the price. the wig...,3
6141,5,amazing results for a good price and customer ...,3
4408621,5,usually china glaze and other brands stick to ...,3
1518415,5,i use this as part of my recipe for homemade b...,3


In [15]:
df['review_body'] = df['review_body'].astype(str)

##### Removing the HTML and URLs from the reviews

In [16]:
df['review_body'] = df['review_body'].apply(lambda x: re.sub(re.compile('http\S+|https\S+'), "", x))
df['review_body'] = df['review_body'].apply(lambda x: re.sub(re.compile('<.*?>'), "", x))

##### Remove non-alphabetical characters

In [17]:
df['review_body'] = df['review_body'].apply(lambda x: re.sub(re.compile("[^A-Za-z]")," ", x))

##### Remove extra spaces

In [18]:
df['review_body'] = df['review_body'].apply(lambda x: re.sub(re.compile(' +'),' ', x))

##### Performing contractions on the reviews

In [19]:
df['review_body'] = df['review_body'].apply(lambda x: contractions.fix(x))

In [20]:
mean_length_after_clean = df['review_body'].apply(len).mean()

In [21]:
print('Average length of reviews before and after data cleaning is: ' + str(mean_length_before_clean), ',', str(mean_length_after_clean))

Average length of reviews before and after data cleaning is: 268.995 , 258.46111666666667


# Pre-processing

### Two types of preprocessing is done - 
#### 1. Stop word removal: They are removed below using the NLTK library in Python<br>2. Lemmatization: This procedure is also performed below using the NLTK library in Python

In [22]:
mean_length_before_preprocess = df['review_body'].apply(len).mean()

## remove the stop words 

##### Example of stop word removal - <br>Can listening be exhausting? ---> listening, exhausting

In [23]:
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
df['review_body'] = df['review_body'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ragha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## perform lemmatization  

##### Example of lemmatization - <br>went, gone, goes ---> go

In [24]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('omw-1.4')

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    return ' '.join([lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)])

df['review_body'] = df['review_body'].apply(lemmatize_text)

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\ragha\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [25]:
mean_length_after_preprocess = df['review_body'].apply(len).mean()

In [26]:
print('Average length of reviews before and after data preprocessing is: ' + str(mean_length_before_preprocess), ',', str(mean_length_after_preprocess))

Average length of reviews before and after data preprocessing is: 258.46111666666667 , 155.2794


# TF-IDF Feature Extraction

### TF-IDF reflects how significant a word is to an instance in the dataset 
##### This step is performed with the help of the sklearn library in Python

In [27]:
 # Creating X data and y labels
X = df['review_body']
y = df['class']
    
# Applying TFIDF feature extraction on X
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

##### Split the dataset, with 80% of it for training and 20% of it for testing

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=10)

In [29]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(48000, 31269) (12000, 31269) (48000,) (12000,)


# Perceptron

##### Applying the perceptron algorithm using Python's sklearn library.

In [30]:
params = {
    'max_iter': [50, 100, 500, 1000],
    'penalty': ['l2', 'l1', 'elasticnet'],
    'eta0': [0.0001, 0.001, 0.01, 0.1, 1.0, 1.5]
}

##### Applying grid search to find the best hyperparameters

In [31]:
model = GridSearchCV(Perceptron(), params, cv=5).fit(X_train, y_train)
best_penalty = model.best_params_['penalty']
best_max_iter = model.best_params_['max_iter']
best_eta0 = model.best_params_['eta0']

##### Applying the perceptron algorithm on the best parameters found after applying grid search, and then printing the classification report

In [32]:
final_model = Perceptron(max_iter = best_max_iter, eta0 = best_eta0, penalty = best_penalty, random_state = 500).fit(X_train, y_train)
y_predict = final_model.predict(X_test)
report1 = classification_report(y_predict, y_test, output_dict=True)
print(classification_report(y_predict,y_test))

              precision    recall  f1-score   support

           1       0.67      0.62      0.64      4290
           2       0.49      0.54      0.51      3687
           3       0.69      0.69      0.69      4023

    accuracy                           0.62     12000
   macro avg       0.62      0.61      0.61     12000
weighted avg       0.62      0.62      0.62     12000



In [33]:
print('Class\tPrecision\t\tRecall\t\t\tF1-score\n')
print('1     '+"\t"+ str(report1['1']['precision'])+",\t"+str(report1['1']['recall'])+",\t"+ str(report1['1']['f1-score']))
print('2     '+"\t"+ str(report1['2']['precision'])+",\t"+str(report1['2']['recall'])+",\t"+ str(report1['2']['f1-score']))
print('3     '+"\t"+ str(report1['3']['precision'])+",\t"+str(report1['3']['recall'])+",\t"+ str(report1['3']['f1-score']))
print('average' +"\t"+str((report1['1']['precision']+report1['2']['precision']+report1['3']['precision'])/3)+",\t"+str((report1['1']['recall']+report1['2']['recall']+report1['3']['recall'])/3)+",\t"+str((report1['1']['f1-score']+report1['2']['f1-score']+report1['3']['f1-score'])/3), '\n')

Class	Precision		Recall			F1-score

1     	0.6664147140337616,	0.6165501165501166,	0.6405133793437462
2     	0.49498243853487206,	0.5351234065636018,	0.5142708197575916
3     	0.6855377008652658,	0.6892866020382798,	0.6874070401586514
average	0.6156449511446332,	0.613653375050666,	0.6140637464199964 



# SVM

##### Applying the SVM algorithm using Python's sklearn library.

In [34]:
params = {"C": [0.01, 0.1, 1, 10, 100, 1000, 10000]}

##### Applying grid search to find the best hyperparameters

In [35]:
model = GridSearchCV(LinearSVC(), params, cv=5).fit(X_train, y_train)
best_c_val = model.best_params_['C']

##### Applying the SVM algorithm on the best parameters found after applying grid search, and then printing the classification report

In [36]:
final_model = LinearSVC(C = best_c_val, random_state = 42).fit(X_train, y_train)
y_predict = final_model.predict(X_test)
report2 = classification_report(y_predict,y_test, output_dict=True)
print(classification_report(y_predict,y_test))

              precision    recall  f1-score   support

           1       0.72      0.68      0.70      4198
           2       0.56      0.62      0.59      3632
           3       0.77      0.74      0.75      4170

    accuracy                           0.68     12000
   macro avg       0.68      0.68      0.68     12000
weighted avg       0.69      0.68      0.69     12000



In [37]:
print('Class\tPrecision\t\tRecall\t\t\tF1-score\n')
print('1     '+"\t"+ str(report2['1']['precision'])+",\t"+str(report2['1']['recall'])+",\t"+ str(report2['1']['f1-score']))
print('2     '+"\t"+ str(report2['2']['precision'])+",\t"+str(report2['2']['recall'])+",\t"+ str(report2['2']['f1-score']))
print('3     '+"\t"+ str(report2['3']['precision'])+",\t"+str(report2['3']['recall'])+",\t"+ str(report2['3']['f1-score']))
print('average' +"\t"+str((report2['1']['precision']+report2['2']['precision']+report2['3']['precision'])/3)+",\t"+str((report2['1']['recall']+report2['2']['recall']+report2['3']['recall'])/3)+",\t"+str((report2['1']['f1-score']+report2['2']['f1-score']+report2['3']['f1-score'])/3), '\n')

Class	Precision		Recall			F1-score

1     	0.7226001511715797,	0.6831824678418295,	0.7023386800538753
2     	0.5607124937280482,	0.615363436123348,	0.5867681806248358
3     	0.7663782447466008,	0.7434052757793765,	0.7547169811320755
average	0.6832302965487429,	0.6806503932481847,	0.681274613936929 



# Logistic Regression

##### Applying the Logistic Regression algorithm using Python's sklearn library.

In [38]:
params = {"penalty": ['l1', 'l2', 'elasticnet', None]}

##### Applying grid search to find the best hyperparameters

In [39]:
model = GridSearchCV(LogisticRegression(), params, cv=5).fit(X_train, y_train)
best_penalty = model.best_params_['penalty']

##### Applying the Logistic Regression algorithm on the best parameters found after applying grid search, and then printing the classification report

In [40]:
final_model = LogisticRegression(penalty = best_penalty, random_state = 42).fit(X_train, y_train)
y_predict = final_model.predict(X_test)
report3 = classification_report(y_predict,y_test, output_dict=True)
print(classification_report(y_predict,y_test))

              precision    recall  f1-score   support

           1       0.71      0.69      0.70      4088
           2       0.59      0.60      0.59      3911
           3       0.75      0.76      0.75      4001

    accuracy                           0.68     12000
   macro avg       0.68      0.68      0.68     12000
weighted avg       0.68      0.68      0.68     12000



In [41]:
print('Class\tPrecision\t\tRecall\t\t\tF1-score\n')
print('1     '+"\t"+ str(report3['1']['precision'])+",\t"+str(report3['1']['recall'])+",\t"+ str(report3['1']['f1-score']))
print('2     '+"\t"+ str(report3['2']['precision'])+",\t"+str(report3['2']['recall'])+",\t"+ str(report3['2']['f1-score']))
print('3     '+"\t"+ str(report3['3']['precision'])+",\t"+str(report3['3']['recall'])+",\t"+ str(report3['3']['f1-score']))
print('average' +"\t"+str((report3['1']['precision']+report3['2']['precision']+report3['3']['precision'])/3)+",\t"+str((report3['1']['recall']+report3['2']['recall']+report3['3']['recall'])/3)+",\t"+str((report3['1']['f1-score']+report3['2']['f1-score']+report3['3']['f1-score'])/3), '\n')

Class	Precision		Recall			F1-score

1     	0.708742756361804,	0.6881115459882583,	0.6982747921062431
2     	0.5888108379327647,	0.6001022756328305,	0.5944029378244904
3     	0.7495673671199011,	0.7578105473631592,	0.7536664180959483
average	0.6823736538048233,	0.6820081229947493,	0.682114716008894 



# Naive Bayes

##### Applying the Naive Bayes algorithm using Python's sklearn library.

In [42]:
params = {'alpha': [0.01, 0.1, 0, 1.0, 10.0]}

##### Applying grid search to find the best hyperparameters

In [43]:
model = GridSearchCV(MultinomialNB(), params, cv=5).fit(X_train, y_train)
best_alpha = model.best_params_['alpha']

##### Applying the Naive Bayes algorithm on the best parameters found after applying grid search, and then printing the classification report

In [44]:
final_model = MultinomialNB(alpha = best_alpha).fit(X_train, y_train)
y_predict = model.predict(X_test)
report4 = classification_report(y_predict,y_test, output_dict=True)
print(classification_report(y_predict,y_test))

              precision    recall  f1-score   support

           1       0.66      0.70      0.68      3741
           2       0.63      0.57      0.60      4443
           3       0.71      0.75      0.73      3816

    accuracy                           0.67     12000
   macro avg       0.67      0.67      0.67     12000
weighted avg       0.66      0.67      0.66     12000



In [45]:
print('Class\tPrecision\t\tRecall\t\t\tF1-score\n')
print('1     '+"\t"+ str(report4['1']['precision'])+",\t"+str(report4['1']['recall'])+",\t"+ str(report4['1']['f1-score']))
print('2     '+"\t"+ str(report4['2']['precision'])+",\t"+str(report4['2']['recall'])+",\t"+ str(report4['2']['f1-score']))
print('3     '+"\t"+ str(report4['3']['precision'])+",\t"+str(report4['3']['recall'])+",\t"+ str(report4['3']['f1-score']))
print('average' +"\t"+str((report4['1']['precision']+report4['2']['precision']+report4['3']['precision'])/3)+",\t"+str((report4['1']['recall']+report4['2']['recall']+report4['3']['recall'])/3)+",\t"+str((report4['1']['f1-score']+report4['2']['f1-score']+report4['3']['f1-score'])/3), '\n')

Class	Precision		Recall			F1-score

1     	0.6598639455782312,	0.7000801924619086,	0.6793774319066148
2     	0.6309583542398395,	0.5660589691649787,	0.5967493178312967
3     	0.7072929542645241,	0.7497379454926625,	0.7278972140948988
average	0.6660384180275316,	0.6719590357065166,	0.6680079879442701 

