In [1]:
import pandas as pd
import numpy as np
import re
import contractions
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC
import gensim.downloader as api
import gensim.models
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score
from torch.autograd import Variable

In [2]:
!python --version

Python 3.9.12


In [3]:
torch.__version__

'1.12.1+cu116'

##### VERSIONS OF OTHER LIBRARIES USED

scikit-image : 0.19.2 <br>
numpy : 1.21.5 <br>
pandas : 1.4.2 <br>
contractions : 0.1.73 <br>
gensim : 4.1.2 <br>

# Dataset Generation

In [4]:
df = pd.read_csv('data.tsv', header=0, sep='\t', quotechar='"', on_bad_lines='skip', dtype='unicode')

In [5]:
df

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,1797882,R3I2DHQBR577SS,B001ANOOOE,2102612,The Naked Bee Vitmin C Moisturizing Sunscreen ...,Beauty,5,0,0,N,Y,Five Stars,"Love this, excellent sun block!!",2015-08-31
1,US,18381298,R1QNE9NQFJC2Y4,B0016J22EQ,106393691,"Alba Botanica Sunless Tanning Lotion, 4 Ounce",Beauty,5,0,0,N,Y,Thank you Alba Bontanica!,The great thing about this cream is that it do...,2015-08-31
2,US,19242472,R3LIDG2Q4LJBAO,B00HU6UQAG,375449471,"Elysee Infusion Skin Therapy Elixir, 2oz.",Beauty,5,0,0,N,Y,Five Stars,"Great Product, I'm 65 years old and this is al...",2015-08-31
3,US,19551372,R3KSZHPAEVPEAL,B002HWS7RM,255651889,"Diane D722 Color, Perm And Conditioner Process...",Beauty,5,0,0,N,Y,GOOD DEAL!,I use them as shower caps & conditioning caps....,2015-08-31
4,US,14802407,RAI2OIG50KZ43,B00SM99KWU,116158747,Biore UV Aqua Rich Watery Essence SPF50+/PA+++...,Beauty,5,0,0,N,Y,this soaks in quick and provides a nice base f...,This is my go-to daily sunblock. It leaves no ...,2015-08-31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5094302,US,50113639,RZ7RZ02MTP4SL,B000050B70,185454094,Conair NE150NSCS Cordless Nose and Ear Hair Tr...,Beauty,5,10,10,N,N,Great Little Grooming Tool,After watching my Dad struggle with his scisso...,2000-11-12
5094303,US,52940456,R2IRC0IZ8YCE5T,B000050FF2,678848064,Homedics Envirascape Sound Spa Alarm Clock Radio,Beauty,3,23,23,N,N,Not bad for the price,"Like most sound machines, the sounds choices a...",2000-11-07
5094304,US,47587881,R1U4ZSXOD228CZ,B000050B6U,862195513,Conair Instant Heat Curling Iron,Beauty,5,89,97,N,N,Best Curling Iron Ever,I bought this product because it indicated 30 ...,2000-11-02
5094305,US,53047750,R3SFJLZE09URWM,B000050FDE,195242894,Oral-B Professional Care 1000 Power Toothbrush,Beauty,5,10,10,N,N,"The best electric toothbrush ever, REALLY!",We have used Oral-B products for 15 years; thi...,2000-11-01


In [6]:
df = pd.concat([df.iloc[:, 7], df.iloc[:, 13]], axis = 1)
df

Unnamed: 0,star_rating,review_body
0,5,"Love this, excellent sun block!!"
1,5,The great thing about this cream is that it do...
2,5,"Great Product, I'm 65 years old and this is al..."
3,5,I use them as shower caps & conditioning caps....
4,5,This is my go-to daily sunblock. It leaves no ...
...,...,...
5094302,5,After watching my Dad struggle with his scisso...
5094303,3,"Like most sound machines, the sounds choices a..."
5094304,5,I bought this product because it indicated 30 ...
5094305,5,We have used Oral-B products for 15 years; thi...


In [7]:
df = df.dropna(subset=['star_rating','review_body'])

In [8]:
# DIVIDED ALL THE REVIEWS INTO 3 CLASSES, AS WE DID IN HW1
pd.options.mode.chained_assignment = None 
def labelClass(rating):
    if rating == "1" or rating == "2":
          return 1
    if rating == "3" :
          return 2
    if rating == "4" or rating  == "5":
          return 3
df['class'] = df['star_rating'].map(labelClass)

In [9]:
# CREATE A BALANCED DATASET OF 60K REVIEWS, LIKE HW1
class1 = df.loc[df['class'] == 1].sample(n=20000, random_state=1)
class2 = df.loc[df['class'] == 2].sample(n=20000, random_state=1)
class3 = df.loc[df['class'] == 3].sample(n=20000, random_state=1)
df = pd.concat([class1, class2, class3])

In [10]:
df['review_body'] = df['review_body'].str.lower()  # CONVERT ALL REVEIWS TO LOWERCASE
df['review_body'] = df['review_body'].astype(str)  
df['review_body'] = df['review_body'].apply(lambda x: re.sub(re.compile('http\S+|https\S+'), "", x))  # REMOVE ALL URLs 
df['review_body'] = df['review_body'].apply(lambda x: re.sub(re.compile('<.*?>'), "", x))   # REMOVE ALL HTML TAGS
df['review_body'] = df['review_body'].apply(lambda x: re.sub(re.compile("[^A-Za-z]")," ", x))  # REMOVE ALL NON-ALPHABETICAL CHARACTERS
df['review_body'] = df['review_body'].apply(lambda x: re.sub(re.compile(' +'),' ', x))  # REMOVES EXTRA SPACES IN REVIEWS
df['review_body'] = df['review_body'].apply(lambda x: contractions.fix(x))  # PERFORM CONTRACTIONS ON REVIEWS

# Word Embedding

#### PART A (PRETRAINED WORD2VEC)

In [11]:
word_2_vec = api.load('word2vec-google-news-300')

In [12]:
# CHECK SIMILARITY BETWEEN THE FOLLOWING 2 WORDS
word_2_vec.similarity("gorgeous", "beautiful")

0.8353004

In [13]:
# CHECK SIMILARITY BETWEEN THE FOLLOWING 2 WORDS
word_2_vec.similarity("happy", "pleased")

0.66321707

In [114]:
neighbors = word_2_vec.most_similar(positive=['man', 'woman'], negative=['boy'], topn=5)
for n in neighbors:
    print(n)

('lady', 0.5354641079902649)
('person', 0.529635488986969)
('Woman', 0.513024628162384)
('men', 0.4956325590610504)
('policewoman', 0.4909151792526245)


#### PART B (WORD2VEC TRAINED ON OUR DATA)

In [15]:
# SPLIT THE REVIEWS INTO INDIVIDUAL WORDS
sentences = []
for i in range(len(df['review_body'])):
    sentences.append(df['review_body'].values[i].split(' '))

In [16]:
model = gensim.models.Word2Vec(sentences=sentences, vector_size=300, min_count=9, window=13)

In [17]:
# CHECK SIMILARITY BETWEEN THE FOLLOWING 2 WORDS
model.wv.similarity("gorgeous", "beautiful")

0.85005796

In [18]:
# CHECK SIMILARITY BETWEEN THE FOLLOWING 2 WORDS
model.wv.similarity("happy", "pleased")

0.90446115

In [113]:
neighbors = word_2_vec.most_similar(positive=['man', 'woman'], negative=['boy'], topn=5)
for n in neighbors:
    print(n)

('lady', 0.5354641079902649)
('person', 0.529635488986969)
('Woman', 0.513024628162384)
('men', 0.4956325590610504)
('policewoman', 0.4909151792526245)


##### The trained Word2Vec model on our dataset seems to encode semantic similarities between words better.

# Simple models

## Using TF-IDF Feature Extraction

In [20]:
 # Creating X data and y labels
X = df['review_body']
y = df['class']
    
# Applying TFIDF feature extraction on X
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=10)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(48000, 34598) (12000, 34598) (48000,) (12000,)


### Perceptron using TF-IDF

In [22]:
# FIT A SINGLE PERCEPTRON MODEL ON OUR DATASET
model1 = Perceptron(tol=1e-3, random_state=100)
model1.fit(X_train, y_train)
y_predict = model1.predict(X_test)

In [23]:
report1 = classification_report(y_predict, y_test, output_dict=True)
print(classification_report(y_predict,y_test))

              precision    recall  f1-score   support

           1       0.64      0.64      0.64      3981
           2       0.58      0.53      0.55      4318
           3       0.68      0.74      0.71      3701

    accuracy                           0.63     12000
   macro avg       0.63      0.64      0.63     12000
weighted avg       0.63      0.63      0.63     12000



In [24]:
print('Class\tPrecision\t\tRecall\t\t\tF1-score\n')
print('1     '+"\t"+ str(report1['1']['precision'])+",\t"+str(report1['1']['recall'])+",\t"+ str(report1['1']['f1-score']))
print('2     '+"\t"+ str(report1['2']['precision'])+",\t"+str(report1['2']['recall'])+",\t"+ str(report1['2']['f1-score']))
print('3     '+"\t"+ str(report1['3']['precision'])+",\t"+str(report1['3']['recall'])+",\t"+ str(report1['3']['f1-score']))
print('average' +"\t"+str((report1['1']['precision']+report1['2']['precision']+report1['3']['precision'])/3)+",\t"+str((report1['1']['recall']+report1['2']['recall']+report1['3']['recall'])/3)+",\t"+str((report1['1']['f1-score']+report1['2']['f1-score']+report1['3']['f1-score'])/3), '\n')

Class	Precision		Recall			F1-score

1     	0.6394557823129252,	0.6375282592313489,	0.6384905660377359
2     	0.5762669342699448,	0.531959240389069,	0.5532273603082851
3     	0.681087762669963,	0.7443934071872467,	0.7113348825200103
average	0.6322701597509442,	0.6379603022692216,	0.6343509362886771 



### SVM using TF-IDF

In [25]:
# FIT A SVM MODEL ON OUR DATASET
model2 = LinearSVC(random_state=42)
model2.fit(X_train, y_train)
y_predict = model2.predict(X_test)

In [26]:
report2 = classification_report(y_predict, y_test, output_dict=True)
print(classification_report(y_predict,y_test))

              precision    recall  f1-score   support

           1       0.71      0.70      0.71      4017
           2       0.60      0.61      0.61      3894
           3       0.78      0.77      0.78      4089

    accuracy                           0.70     12000
   macro avg       0.70      0.70      0.70     12000
weighted avg       0.70      0.70      0.70     12000



In [27]:
print('Class\tPrecision\t\tRecall\t\t\tF1-score\n')
print('1     '+"\t"+ str(report2['1']['precision'])+",\t"+str(report2['1']['recall'])+",\t"+ str(report2['1']['f1-score']))
print('2     '+"\t"+ str(report2['2']['precision'])+",\t"+str(report2['2']['recall'])+",\t"+ str(report2['2']['f1-score']))
print('3     '+"\t"+ str(report2['3']['precision'])+",\t"+str(report2['3']['recall'])+",\t"+ str(report2['3']['f1-score']))
print('average' +"\t"+str((report2['1']['precision']+report2['2']['precision']+report2['3']['precision'])/3)+",\t"+str((report2['1']['recall']+report2['2']['recall']+report2['3']['recall'])/3)+",\t"+str((report2['1']['f1-score']+report2['2']['f1-score']+report2['3']['f1-score'])/3), '\n')

Class	Precision		Recall			F1-score

1     	0.709498614260519,	0.7010206621857107,	0.7052341597796142
2     	0.5988459608630206,	0.6129943502824858,	0.6058375634517766
3     	0.7824474660074165,	0.7740278796771827,	0.7782149004179985
average	0.6969306803769854,	0.6960142973817932,	0.6964288745497965 



## Word2Vec for feature extraction

### Perceptron using Word2Vec

In [49]:
data = []
sent = []

for i in range(len(df['review_body'])):
    split_sent = df['review_body'].values[i].split(' ')
    for word in split_sent:
        try:
            sent.append(word_2_vec[word])
        except:
            sent.append(np.zeros(300))
    sent = np.array(sent)
    sent = np.mean(sent, axis=0)
    data.append(sent)
    sent = []

In [50]:
data = np.array(data)
data_label = df['class'].values
X_train1, X_test1, y_train1, y_test1 = train_test_split(data, data_label, test_size = 0.2, random_state=10)
print(X_train1.shape, X_test1.shape, y_train1.shape, y_test1.shape)

(48000, 300) (12000, 300) (48000,) (12000,)


In [30]:
# FIT A SINGLE PERCEPTRON MODEL ON OUR DATASET
model3 = Perceptron(tol=1e-3, random_state=10)
model3.fit(X_train1, y_train1)
y_predict = model3.predict(X_test1)

In [31]:
report3 = classification_report(y_predict, y_test1, output_dict=True)
print(classification_report(y_predict,y_test1))

              precision    recall  f1-score   support

           1       0.72      0.59      0.65      4776
           2       0.61      0.54      0.57      4526
           3       0.56      0.83      0.67      2698

    accuracy                           0.63     12000
   macro avg       0.63      0.66      0.63     12000
weighted avg       0.64      0.63      0.62     12000



In [32]:
print('Class\tPrecision\t\tRecall\t\t\tF1-score\n')
print('1     '+"\t"+ str(report3['1']['precision'])+",\t"+str(report3['1']['recall'])+",\t"+ str(report3['1']['f1-score']))
print('2     '+"\t"+ str(report3['2']['precision'])+",\t"+str(report3['2']['recall'])+",\t"+ str(report3['2']['f1-score']))
print('3     '+"\t"+ str(report3['3']['precision'])+",\t"+str(report3['3']['recall'])+",\t"+ str(report3['3']['f1-score']))
print('average' +"\t"+str((report3['1']['precision']+report3['2']['precision']+report3['3']['precision'])/3)+",\t"+str((report3['1']['recall']+report3['2']['recall']+report3['3']['recall'])/3)+",\t"+str((report3['1']['f1-score']+report3['2']['f1-score']+report3['3']['f1-score'])/3), '\n')

Class	Precision		Recall			F1-score

1     	0.7150415721844293,	0.5942211055276382,	0.6490566037735849
2     	0.6131460110386352,	0.5399911621741051,	0.5742481203007519
3     	0.5557478368355995,	0.8332097850259451,	0.6667655346285036
average	0.627978473352888,	0.6558073509092295,	0.6300234195676134 



### SVM using Word2Vec

In [33]:
# FIT A SVM MODEL ON OUR DATASET
model4 = LinearSVC(random_state=100)
model4.fit(X_train1, y_train1)
y_predict = model4.predict(X_test1)

In [34]:
report4 = classification_report(y_predict, y_test1, output_dict=True)
print(classification_report(y_predict,y_test1))

              precision    recall  f1-score   support

           1       0.70      0.66      0.68      4232
           2       0.55      0.59      0.57      3732
           3       0.72      0.72      0.72      4036

    accuracy                           0.66     12000
   macro avg       0.66      0.65      0.66     12000
weighted avg       0.66      0.66      0.66     12000



In [35]:
print('Class\tPrecision\t\tRecall\t\t\tF1-score\n')
print('1     '+"\t"+ str(report4['1']['precision'])+",\t"+str(report4['1']['recall'])+",\t"+ str(report4['1']['f1-score']))
print('2     '+"\t"+ str(report4['2']['precision'])+",\t"+str(report4['2']['recall'])+",\t"+ str(report4['2']['f1-score']))
print('3     '+"\t"+ str(report4['3']['precision'])+",\t"+str(report4['3']['recall'])+",\t"+ str(report4['3']['f1-score']))
print('average' +"\t"+str((report4['1']['precision']+report4['2']['precision']+report4['3']['precision'])/3)+",\t"+str((report4['1']['recall']+report4['2']['recall']+report4['3']['recall'])/3)+",\t"+str((report4['1']['f1-score']+report4['2']['f1-score']+report4['3']['f1-score'])/3), '\n')

Class	Precision		Recall			F1-score

1     	0.6984126984126984,	0.6550094517958412,	0.676015120107304
2     	0.5524335173105871,	0.590032154340836,	0.5706141487431978
3     	0.7181705809641533,	0.7197720515361744,	0.7189704244524192
average	0.6563389322291463,	0.6549378858909506,	0.6551998977676403 



In [36]:
pred = model1.predict(X_test)
acc = accuracy_score(y_test, pred) * 100
pred1 = model3.predict(X_test1)
acc1 = accuracy_score(y_test1, pred1) * 100
print("FOR PERCEPTRON -")
print("Test accuracy when using TF-IDF:", acc, "%")
print("Test accuracy when using Word2Vec:", acc1, "%")

FOR PERCEPTRON -
Test accuracy when using TF-IDF: 63.24999999999999 %
Test accuracy when using Word2Vec: 62.74999999999999 %


In [37]:
pred3 = model2.predict(X_test)
acc = accuracy_score(y_test, pred3) * 100
pred4 = model4.predict(X_test1)
acc1 = accuracy_score(y_test1, pred4) * 100
print("FOR SVM -")
print("Test accuracy when using TF-IDF:", acc, "%")
print("Test accuracy when using Word2Vec:", acc1, '%')

FOR SVM -
Test accuracy when using TF-IDF: 69.73333333333333 %
Test accuracy when using Word2Vec: 65.65833333333333 %


##### TF-IDF gives better results than Word2Vec for both Perceptron and SVM

# Feedforward Neural Networks

#### PART A

In [51]:
# CREATE ONE-HOT ENCODINGS OF THE 3 RATINGS
enc_data_label = np.zeros((len(data_label), 3))
for i in range(len(data_label)):
    if data_label[i] == 1:
        enc_data_label[i] = [0, 0, 1]
    elif data_label[i] == 2:
        enc_data_label[i] = [0, 1, 0]
    elif data_label[i] == 3:
        enc_data_label[i] = [1, 0, 0]

In [52]:
# TRAIN-TEST-SPLIT
X_train, X_test, y_train, y_test = train_test_split(data, enc_data_label, test_size = 0.2, random_state=10)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(48000, 300) (12000, 300) (48000, 3) (12000, 3)


In [62]:
# FNN IN PYTORCH
class FNN(nn.Module):
    def __init__(self):
        super().__init__()
        
        self.fc1 = nn.Linear(300, 100)   
        self.dropout = nn.Dropout(0.2)   
        self.fc2 = nn.Linear(100, 10)
        self.fc3 = nn.Linear(10, 3)
        
    def forward(self,x):
        x = nn.functional.relu(self.fc1(x))
        x = self.dropout(x)     # added dropout to reduce overfitting 
        x = nn.functional.relu(self.fc2(x))
#         x = self.dropout(x)   # added dropout to reduce overfitting 
        x = nn.functional.relu(self.fc3(x))
        
        return x

In [63]:
# USING GPU
print(torch.cuda.device_count())
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

1


device(type='cuda', index=0)

In [64]:
fnn_model = FNN().to(device)
fnn_model = fnn_model.to(device)   # using '.to(device)' to move the model from CPU to GPU

In [65]:
# CREATE DATASET CLASS FOR DATALOADERS
class Dataset(Dataset):
    def __init__(self, data1, data2):
        self.data1 = data1
        self.data2 = data2
        
    def __len__(self):
        return len(self.data1)
    
    def __getitem__(self, idx):
        X = self.data1[idx]
        y = self.data2[idx]

        return X, y

In [66]:
batch_size = 32  # BATCH SIZE FOR THIS MODEL
train_dataset = Dataset(X_train, y_train)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)

test_dataset = Dataset(X_test, y_test)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)

In [67]:
model_optim=torch.optim.Adam(fnn_model.parameters(), lr = 0.001)  
CEloss = nn.CrossEntropyLoss()

In [68]:
# TRAINING THE FNN MODEL
history_train = []
history_test = []
train_dataloader_len = len(train_dataloader)
test_dataloader_len = len(test_dataloader)
train_len = X_train.shape[0]
test_len = X_test.shape[0]

for epoch in range(30):  # loops over the complete dataset multiple times (which is the nummber of epochs)
    fnn_model.train()     
    train_loss = 0.0
    correct_train = 0
    train_accuracy = 0.0
    for i, data in enumerate(train_dataloader, 0):  # loops over complete training dataset once 
        
        inputs, label = data
        inputs = inputs.float()    # convert double values to float
        inputs, label = inputs.to(device), label.to(device)   

        model_optim.zero_grad()
        output = fnn_model(inputs)   # forward pass of model
        output = output.to(device)
        
        loss1 = CEloss(output, label)     # loss calculation
        loss1.backward()            # computes the gradient during the backward pass
        model_optim.step()   # performs single optimization step

        train_loss += loss1.item()   # adding accuracy values of all batches in an epoch
        _, output = torch.max(output, 1)     # storing the index of maximum value in prediction to the variable 'output'
        output = output.cpu().detach().numpy()     # loads the variable to cpu and converts it to a numpy array
        label = label.cpu().detach().numpy()        
        label = np.argmax(label, axis = 1)   # storing the index of maximum value in label to the variable 'label'
        train_accuracy += accuracy_score(label, output)  # adding accuracy values of all batches in training dataset in an epoch
    
    train_loss = train_loss/train_dataloader_len
    train_accuracy = train_accuracy/train_dataloader_len  # dividing accuracy by number of batches for training dataset
    history_train.append((train_loss, train_accuracy))
    
    fnn_model.eval()     # model evaluation on test dataset
    test_loss = 0.0
    test_accuracy = 0.0
    correct_test = 0
    with torch.no_grad():     # disables gradient calculation
        for i, data in enumerate(test_dataloader, 0):    # loops over complete test dataset once
            
            inputs, label = data
            inputs = inputs.float()
            inputs, label = inputs.to(device), label.to(device)

            pred = fnn_model(inputs)
            pred = pred.to(device)
            loss2 = CEloss(pred, label)

            test_loss += loss2.item()
            _, pred = torch.max(pred, 1)
            pred = pred.cpu().detach().numpy()
            label = label.cpu().detach().numpy()
            label = np.argmax(label, axis = 1)    
            test_accuracy += accuracy_score(label, pred)
            
        test_loss = test_loss/test_dataloader_len
        test_accuracy = test_accuracy/test_dataloader_len
        history_test.append((test_loss, test_accuracy))
    
#     print("Epoch:", (epoch+1))
#     print("Train Loss:", train_loss, "\tTrain Accuracy:", train_accuracy)
#     print("Test Loss:", test_loss, "\tTest Accuracy:", test_accuracy)
#     print("=======================================================================================")
    print("Train accuracy:", train_accuracy)

print("Final test accuracy:", test_accuracy)

Train accuracy: 0.4492083333333333
Train accuracy: 0.47072916666666664
Train accuracy: 0.4746458333333333
Train accuracy: 0.47654166666666664
Train accuracy: 0.47922916666666665
Train accuracy: 0.479625
Train accuracy: 0.48225
Train accuracy: 0.48033333333333333
Train accuracy: 0.4801458333333333
Train accuracy: 0.48185416666666664
Train accuracy: 0.48504166666666665
Train accuracy: 0.48585416666666664
Train accuracy: 0.48404166666666665
Train accuracy: 0.48702083333333335
Train accuracy: 0.4846875
Train accuracy: 0.4861666666666667
Train accuracy: 0.48685416666666664
Train accuracy: 0.48741666666666666
Train accuracy: 0.4875
Train accuracy: 0.4895833333333333
Train accuracy: 0.4875
Train accuracy: 0.48814583333333333
Train accuracy: 0.48820833333333336
Train accuracy: 0.49
Train accuracy: 0.4886041666666667
Train accuracy: 0.4888958333333333
Train accuracy: 0.562375
Train accuracy: 0.6599791666666667
Train accuracy: 0.6700416666666666
Train accuracy: 0.6755416666666667
Final test accu

#### PART B

In [73]:
# concatenate the first 10 Word2Vec vectors for each review
concat_data = []
sent = []
for i in range(len(df['review_body'])):
    split_sent = df['review_body'].values[i].split(' ')
    for word in split_sent[:10]:
        try:
            sent.append(word_2_vec[word])
        except:
            
            sent.append(list(np.zeros(300)))   
    if len(sent) < 10:
        sent = np.concatenate([sent,np.zeros((10-len(sent), 300))])
    sent = np.array(sent)
    sent = sent.flatten()
    concat_data.append(sent)
    sent = []
concat_data = np.array(concat_data)

In [74]:
class FNN1(nn.Module):
    def __init__(self):
        super().__init__()
        
        self.fc1 = nn.Linear(3000, 100)
        self.dropout = nn.Dropout(0.2)   
        self.fc2 = nn.Linear(100, 10)
        self.fc3 = nn.Linear(10, 3)
        
    def forward(self,x):
        x = nn.functional.relu(self.fc1(x))
        x = self.dropout(x)     # added dropout to reduce overfitting 
        x = nn.functional.relu(self.fc2(x))
        x = nn.functional.relu(self.fc3(x))
        
        return x

In [75]:
fnn_model1 = FNN1().to(device)
fnn_model1 = fnn_model1.to(device)

In [76]:
X_train, X_test, y_train, y_test = train_test_split(concat_data, enc_data_label, test_size = 0.2, random_state=10)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(48000, 3000) (12000, 3000) (48000, 3) (12000, 3)


In [77]:
batch_size = 32
train_dataset = Dataset(X_train, y_train)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)

test_dataset = Dataset(X_test, y_test)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)

In [78]:
model_optim=torch.optim.Adam(fnn_model1.parameters(), lr = 0.001)
CEloss = nn.CrossEntropyLoss()

In [79]:
history_train = []
history_test = []
train_dataloader_len = len(train_dataloader)
test_dataloader_len = len(test_dataloader)
train_len = X_train.shape[0]
test_len = X_test.shape[0]

for epoch in range(15):  # loops over only 15 epochs because the model starts overfitting heavily after that
    fnn_model1.train()
    train_loss = 0.0
    correct_train = 0
    train_accuracy = 0.0
    for i, data in enumerate(train_dataloader, 0):
        
        inputs, label = data
        inputs = inputs.float()
        inputs, label = inputs.to(device), label.to(device)

        model_optim.zero_grad()
        output = fnn_model1(inputs)
        output = output.to(device)
        loss1 = CEloss(output, label)
        loss1.backward()
        model_optim.step()

        train_loss += loss1.item()
        _, output = torch.max(output, 1)
        output = output.cpu().detach().numpy()
        label = label.cpu().detach().numpy()
        label = np.argmax(label, axis = 1)
        train_accuracy += accuracy_score(label, output)
    
    train_loss = train_loss/train_dataloader_len
    train_accuracy = train_accuracy/train_dataloader_len
    history_train.append((train_loss, train_accuracy))
    
    fnn_model1.eval()   
    test_loss = 0.0
    test_accuracy = 0.0
    correct_test = 0
    with torch.no_grad():
        for i, data in enumerate(test_dataloader, 0):
            
            inputs, label = data
            inputs = inputs.float()
            inputs, label = inputs.to(device), label.to(device)

            pred = fnn_model1(inputs)
            pred = pred.to(device)
            loss2 = CEloss(pred, label)

            test_loss += loss2.item()
            _, pred = torch.max(pred, 1)
            pred = pred.cpu().detach().numpy()
            label = label.cpu().detach().numpy()
            label = np.argmax(label, axis = 1)
            test_accuracy += accuracy_score(label, pred)
            
        test_loss = test_loss/test_dataloader_len
        test_accuracy = test_accuracy/test_dataloader_len
        history_test.append((test_loss, test_accuracy))
    
#     print("Epoch:", (epoch+1))
#     print("Train Loss:", train_loss, "\tTrain Accuracy:", train_accuracy)
#     print("Test Loss:", test_loss, "\tTest Accuracy:", test_accuracy)
#     print("=======================================================================================")
    print("Train accuracy:", train_accuracy)

print("Final test accuracy:", test_accuracy)

Train accuracy: 0.5382083333333333
Train accuracy: 0.6067291666666667
Train accuracy: 0.6413958333333334
Train accuracy: 0.6778958333333334
Train accuracy: 0.7139166666666666
Train accuracy: 0.7483541666666667
Train accuracy: 0.7807916666666667
Train accuracy: 0.80425
Train accuracy: 0.8216875
Train accuracy: 0.8404583333333333
Train accuracy: 0.8522708333333333
Train accuracy: 0.8615
Train accuracy: 0.874875
Train accuracy: 0.8785625
Train accuracy: 0.8843333333333333
Final test accuracy: 0.5588333333333333


# Recurrent Neural Networks

#### PART A (Simple RNN)

In [88]:
# Limiting the maximum review length to 20 by truncating longer reviews and padding shorter reviews with a null value 0 (for creating our dataset)
new_data = []
sent = []

for i in range(len(df['review_body'])):
    split_sent = df['review_body'].values[i].split(' ')
    for word in split_sent[:20]:
        sent.append(word)

    if len(split_sent) < 20:
        for i in range(20-len(split_sent)):
            sent.append('0')
    
    new_data.append(sent)
    sent = []

In [89]:
rnn_data = []
sent = []

for i in range(len(new_data)):
    sent_part = new_data[i]
    for word in sent_part:
        try:
            sent.append(word_2_vec[word])
        except:
            sent.append(np.zeros(300))
    sent = np.array(sent)
    rnn_data.append(sent)
    sent = []
rnn_data = np.array(rnn_data)

In [90]:
X_train, X_test, y_train, y_test = train_test_split(rnn_data, enc_data_label, test_size = 0.2, random_state=10)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(48000, 20, 300) (12000, 20, 300) (48000, 3) (12000, 3)


In [96]:
batch_size = 32
train_dataset = Dataset(X_train, y_train)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)

test_dataset = Dataset(X_test, y_test)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)

In [97]:
class RNN(nn.Module):
    def __init__(self):
        super().__init__()
        
        self.rnn = nn.RNN(300, 20, 1, batch_first = True, nonlinearity='relu')
        self.fc1 = nn.Linear(20, 3)
        
    def forward(self,x):
         
        x, hn = self.rnn(x)
        x = nn.functional.relu(self.fc1(x[:, -1, :]))
        
        return x

In [98]:
rnn_model = RNN().to(device)
rnn_model = rnn_model.to(device)

In [99]:
model_optim=torch.optim.Adam(rnn_model.parameters(), lr = 0.0001)
CEloss = nn.CrossEntropyLoss()

In [100]:
history_train = []
history_test = []
train_dataloader_len = len(train_dataloader)
test_dataloader_len = len(test_dataloader)
train_len = X_train.shape[0]
test_len = X_test.shape[0]

for epoch in range(40):  
    rnn_model.train()
    train_loss = 0.0
    correct_train = 0
    train_accuracy = 0.0
    for i, data in enumerate(train_dataloader, 0):
        
        inputs, label = data
        inputs = inputs.float()
        inputs, label = inputs.to(device), label.to(device)

        model_optim.zero_grad()
        output = rnn_model(inputs)
        output = output.to(device)
        loss1 = CEloss(output, label)
        loss1.backward()
        model_optim.step()

        train_loss += loss1.item()
        _, output = torch.max(output, 1)
        output = output.cpu().detach().numpy()
        label = label.cpu().detach().numpy()
        label = np.argmax(label, axis = 1)
        train_accuracy += accuracy_score(label, output)
    
    train_loss = train_loss/train_dataloader_len
    train_accuracy = train_accuracy/train_dataloader_len
    history_train.append((train_loss, train_accuracy))
    
    rnn_model.eval()   
    test_loss = 0.0
    test_accuracy = 0.0
    correct_test = 0
    with torch.no_grad():
        for i, data in enumerate(test_dataloader, 0):
            
            inputs, label = data
            inputs = inputs.float()
            inputs, label = inputs.to(device), label.to(device)

            pred = rnn_model(inputs)
            pred = pred.to(device)
            loss2 = CEloss(pred, label)

            test_loss += loss2.item()
            _, pred = torch.max(pred, 1)
            pred = pred.cpu().detach().numpy()
            label = label.cpu().detach().numpy()
            label = np.argmax(label, axis = 1)
            test_accuracy += accuracy_score(label, pred)
            
        test_loss = test_loss/test_dataloader_len
        test_accuracy = test_accuracy/test_dataloader_len
        history_test.append((test_loss, test_accuracy))
    
#     print("Epoch:", (epoch+1))
#     print("Train Loss:", train_loss, "\tTrain Accuracy:", train_accuracy)
#     print("Test Loss:", test_loss, "\tTest Accuracy:", test_accuracy)
#     print("=======================================================================================")
    print("Train accuracy:", train_accuracy)

print("Final test accuracy:", test_accuracy)

Train accuracy: 0.3328333333333333
Train accuracy: 0.33447916666666666
Train accuracy: 0.335375
Train accuracy: 0.336
Train accuracy: 0.338125
Train accuracy: 0.34270833333333334
Train accuracy: 0.35975
Train accuracy: 0.37164583333333334
Train accuracy: 0.3819375
Train accuracy: 0.39460416666666664
Train accuracy: 0.47091666666666665
Train accuracy: 0.4885833333333333
Train accuracy: 0.502875
Train accuracy: 0.5130208333333334
Train accuracy: 0.53025
Train accuracy: 0.5389166666666667
Train accuracy: 0.55425
Train accuracy: 0.56275
Train accuracy: 0.5678333333333333
Train accuracy: 0.5717708333333333
Train accuracy: 0.5772083333333333
Train accuracy: 0.58175
Train accuracy: 0.5845416666666666
Train accuracy: 0.5869583333333334
Train accuracy: 0.5897083333333333
Train accuracy: 0.5927291666666666
Train accuracy: 0.5946666666666667
Train accuracy: 0.5997083333333333
Train accuracy: 0.6
Train accuracy: 0.6030833333333333
Train accuracy: 0.6055833333333334
Train accuracy: 0.60704166666666

##### FNN gives better test accuracy than Simple RNN. But the dataset (input) is different for Simple RNN and FNN. If the dataset would have been same, RNN would have given better test accuracy.

#### PART B (GRU)

In [101]:
class GRU(nn.Module):
    def __init__(self):
        super().__init__()
        
        self.gru = nn.GRU(300, 20, 1, batch_first = True)
        self.fc1 = nn.Linear(20, 3)
        
    def forward(self,x):
         
        x, hn = self.gru(x)
        x = nn.functional.relu(self.fc1(x[:, -1, :]))
        
        return x

In [102]:
gru_model = GRU().to(device)
gru_model = gru_model.to(device)

In [103]:
model_optim=torch.optim.Adam(gru_model.parameters(), lr = 0.0001)
CEloss = nn.CrossEntropyLoss()

In [104]:
history_train = []
history_test = []
train_dataloader_len = len(train_dataloader)
test_dataloader_len = len(test_dataloader)
train_len = X_train.shape[0]
test_len = X_test.shape[0]

for epoch in range(40):  
    gru_model.train()
    train_loss = 0.0
    correct_train = 0
    train_accuracy = 0.0
    for i, data in enumerate(train_dataloader, 0):
        
        inputs, label = data
        inputs = inputs.float()
        inputs, label = inputs.to(device), label.to(device)

        model_optim.zero_grad()
        output = gru_model(inputs)
        output = output.to(device)
        loss1 = CEloss(output, label)
        loss1.backward()
        model_optim.step()

        train_loss += loss1.item()
        _, output = torch.max(output, 1)
        output = output.cpu().detach().numpy()
        label = label.cpu().detach().numpy()
        label = np.argmax(label, axis = 1)
        train_accuracy += accuracy_score(label, output)
    
    train_loss = train_loss/train_dataloader_len
    train_accuracy = train_accuracy/train_dataloader_len
    history_train.append((train_loss, train_accuracy))
    
    gru_model.eval()   
    test_loss = 0.0
    test_accuracy = 0.0
    correct_test = 0
    with torch.no_grad():
        for i, data in enumerate(test_dataloader, 0):
            
            inputs, label = data
            inputs = inputs.float()
            inputs, label = inputs.to(device), label.to(device)

            pred = gru_model(inputs)
            pred = pred.to(device)
            loss2 = CEloss(pred, label)

            test_loss += loss2.item()
            _, pred = torch.max(pred, 1)
            pred = pred.cpu().detach().numpy()
            label = label.cpu().detach().numpy()
            label = np.argmax(label, axis = 1)
            test_accuracy += accuracy_score(label, pred)
            
        test_loss = test_loss/test_dataloader_len
        test_accuracy = test_accuracy/test_dataloader_len
        history_test.append((test_loss, test_accuracy))
    
#     print("Epoch:", (epoch+1))
#     print("Train Loss:", train_loss, "\tTrain Accuracy:", train_accuracy)
#     print("Test Loss:", test_loss, "\tTest Accuracy:", test_accuracy)
#     print("=======================================================================================")
    print("Train accuracy:", train_accuracy)
print("Final test accuracy:", test_accuracy)

Train accuracy: 0.36666666666666664
Train accuracy: 0.39789583333333334
Train accuracy: 0.5310416666666666
Train accuracy: 0.5660416666666667
Train accuracy: 0.5832083333333333
Train accuracy: 0.5949791666666666
Train accuracy: 0.6066458333333333
Train accuracy: 0.6150625
Train accuracy: 0.6231458333333333
Train accuracy: 0.6290625
Train accuracy: 0.6328541666666667
Train accuracy: 0.637875
Train accuracy: 0.6425625
Train accuracy: 0.6444166666666666
Train accuracy: 0.6465833333333333
Train accuracy: 0.6493958333333333
Train accuracy: 0.6505416666666667
Train accuracy: 0.6530833333333333
Train accuracy: 0.654625
Train accuracy: 0.6564375
Train accuracy: 0.658375
Train accuracy: 0.658875
Train accuracy: 0.6618125
Train accuracy: 0.6625833333333333
Train accuracy: 0.6641458333333333
Train accuracy: 0.6662291666666667
Train accuracy: 0.6659166666666667
Train accuracy: 0.668375
Train accuracy: 0.6686041666666667
Train accuracy: 0.6692916666666666
Train accuracy: 0.6718958333333334
Train ac

#### PART C (LSTM)

In [109]:
class LSTM(nn.Module):
    def __init__(self):
        super().__init__()
        
        self.lstm = nn.LSTM(300, 20, 1, batch_first = True)
        self.fc1 = nn.Linear(20, 3)
        
    def forward(self,x):
         
        x, (hn, cn) = self.lstm(x)
        x = nn.functional.relu(self.fc1(x[:, -1, :]))
        
        return x

In [110]:
lstm_model = LSTM().to(device)
lstm_model = lstm_model.to(device)

In [111]:
model_optim=torch.optim.Adam(lstm_model.parameters(), lr = 0.0003)
CEloss = nn.CrossEntropyLoss()

In [112]:
history_train = []
history_test = []
train_dataloader_len = len(train_dataloader)
test_dataloader_len = len(test_dataloader)
train_len = X_train.shape[0]
test_len = X_test.shape[0]

for epoch in range(30): 
    lstm_model.train()
    train_loss = 0.0
    correct_train = 0
    train_accuracy = 0.0
    for i, data in enumerate(train_dataloader, 0):
        
        inputs, label = data
        inputs = inputs.float()
        inputs, label = inputs.to(device), label.to(device)

        model_optim.zero_grad()
        output = lstm_model(inputs)
        output = output.to(device)
        loss1 = CEloss(output, label)
        loss1.backward()
        model_optim.step()

        train_loss += loss1.item()
        _, output = torch.max(output, 1)
        output = output.cpu().detach().numpy()
        label = label.cpu().detach().numpy()
        label = np.argmax(label, axis = 1)
        train_accuracy += accuracy_score(label, output)
    
    train_loss = train_loss/train_dataloader_len
    train_accuracy = train_accuracy/train_dataloader_len
    history_train.append((train_loss, train_accuracy))
    
    lstm_model.eval()   
    test_loss = 0.0
    test_accuracy = 0.0
    correct_test = 0
    with torch.no_grad():
        for i, data in enumerate(test_dataloader, 0):
            
            inputs, label = data
            inputs = inputs.float()
            inputs, label = inputs.to(device), label.to(device)

            pred = lstm_model(inputs)
            pred = pred.to(device)
            loss2 = CEloss(pred, label)

            test_loss += loss2.item()
            _, pred = torch.max(pred, 1)
            pred = pred.cpu().detach().numpy()
            label = label.cpu().detach().numpy()
            label = np.argmax(label, axis = 1)
            test_accuracy += accuracy_score(label, pred)
            
        test_loss = test_loss/test_dataloader_len
        test_accuracy = test_accuracy/test_dataloader_len
        history_test.append((test_loss, test_accuracy))
    
#     print("Epoch:", (epoch+1))
#     print("Train Loss:", train_loss, "\tTrain Accuracy:", train_accuracy)
#     print("Test Loss:", test_loss, "\tTest Accuracy:", test_accuracy)
#     print("=======================================================================================")
    print("Train accuracy:", train_accuracy)

print("Final test accuracy:", test_accuracy)

Train accuracy: 0.4464375
Train accuracy: 0.5387916666666667
Train accuracy: 0.5848125
Train accuracy: 0.6065208333333333
Train accuracy: 0.621125
Train accuracy: 0.6314375
Train accuracy: 0.6374166666666666
Train accuracy: 0.6424791666666667
Train accuracy: 0.6488541666666666
Train accuracy: 0.65475
Train accuracy: 0.6585416666666667
Train accuracy: 0.6615416666666667
Train accuracy: 0.6640208333333333
Train accuracy: 0.6696666666666666
Train accuracy: 0.6710833333333334
Train accuracy: 0.6739375
Train accuracy: 0.6747083333333334
Train accuracy: 0.6781041666666666
Train accuracy: 0.6815625
Train accuracy: 0.6819166666666666
Train accuracy: 0.6850625
Train accuracy: 0.689
Train accuracy: 0.6889791666666667
Train accuracy: 0.6913958333333333
Train accuracy: 0.6914791666666666
Train accuracy: 0.695375
Train accuracy: 0.6964791666666666
Train accuracy: 0.697625
Train accuracy: 0.698875
Train accuracy: 0.7006875
Final test accuracy: 0.6625833333333333


##### We can conclude that GRU and LSTM give better results than Simple RNN for our dataset