In [1]:
# ! pip install contractions
# ! pip install scikit-learn
# ! pip install pandas
# ! pip install numpy
# ! pip install nltk
# ! pip install gensim
# ! pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu116
# # Dataset: https://s3.amazonaws.com/amazon-reviews-pds/tsv/
# #↪amazon_reviews_us_Beauty_v1_00.tsv.gz

In [2]:
import pandas as pd
import numpy as np
import re
from gensim.models import Word2Vec
import gensim.downloader as api
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import Perceptron
from sklearn.svm import LinearSVC
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\karav\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\karav\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# data = pd.read_csv('amazon_reviews_us_Beauty_v1_00.tsv', sep="\t", encoding='utf-8', on_bad_lines='skip')

In [4]:
# data['star_rating'] = data['star_rating'].astype('int',errors='ignore') 

# df = data[['review_body', 'star_rating']]
# df = df.dropna(thresh=2)

Form three classes of 20000 reviews randomly

In [5]:
# class_a = df.loc[df['star_rating'].isin([1,2])].sample(n=20000, random_state=4)
# class_b = df.loc[df['star_rating'].isin([3])].sample(n=20000,random_state=7)
# class_c = df.loc[df['star_rating'].isin([4,5])].sample(n=20000, random_state=21)

# df_sampled = pd.concat([class_a, class_b, class_c])
# df_sampled['star_rating'].value_counts()

In [6]:
# df_sampled['star_rating'] = df_sampled['star_rating'].replace([1,2],0)
# df_sampled['star_rating'] = df_sampled['star_rating'].replace([3],1)
# df_sampled['star_rating'] = df_sampled['star_rating'].replace([4,5],2)

# df_sampled.to_csv("data.tsv", sep='\t')

So far, I have read the dataset from the tsv file. I converted the data type of star_rating field to
int in order to enforce uniformity as there were some float and date values in them. After dropping
null values, sampled 20000 reviews for each class label and merged them into a single dataframe and then stored it for easier read access.

## Task 2

In [7]:
w2v = api.load('word2vec-google-news-300')

In [8]:
# Functions to clean dataset by removing html, urls, punctuation and multiple spaces

def remove_HTML(s):
    return re.sub(r'<.*?>',' ',s)

def remove_URL(s):
    return re.sub(r'https?:\/\/.*\/\w*',' ',s)

def remove_nonalphabets(s):
    return re.sub(r'[^a-zA-Z]',' ',s)

def remove_multispace(s):
    return re.sub(r'\s+',' ',s)

In [9]:
data = pd.read_csv('data.tsv', sep='\t', usecols=['review_body', 'star_rating'], dtype={'review_body' : str, 'star_rating': int})

data['review_body'] = data['review_body'].apply(lambda x:remove_HTML(x))
data['review_body'] = data['review_body'].apply(lambda x:remove_URL(x))
data['review_body'] = data['review_body'].apply(lambda x:remove_nonalphabets(x))
data['review_body'] = data['review_body'].apply(lambda x:remove_multispace(x))
tfidfdata = data.copy(deep=True)
data['review_body'] = data['review_body'].apply(lambda x: x.split())
tfidfdata['review_body'] = tfidfdata['review_body'].apply(lambda x: x.lower().split())
tfidfdata['review_body'] = tfidfdata['review_body'].apply(lambda x: ' '.join(x))

w2vdata = list(data['review_body'])

In [10]:
model = Word2Vec(w2vdata, min_count=9, vector_size=300, window=13, workers=6)

### 2(a)

In [11]:
#Testing examples on Google's word2vec

test_vec = w2v['China']-w2v['India']+w2v['Indian']
print(w2v.most_similar(positive=[test_vec],topn=3))
print(w2v.similarity('excellent','outstanding'))
print(w2v.similarity('excellent','poor'))
print(w2v.similarity('beautiful','horrible'))
print(w2v.doesnt_match(['excellent','outstanding','poor']))
print(w2v.doesnt_match(['excellent','bad','poor']))

[('Chinese', 0.8240145444869995), ('China', 0.6561532616615295), ('Indian', 0.643246054649353)]
0.55674857
0.37769592
0.38830176
poor
excellent


### 2(b)

In [12]:
#Testing examples on trained word2vec

test_vec = model.wv['China']-model.wv['India']+model.wv['Indian']
print(model.wv.most_similar(positive=[test_vec],topn=3))
print(model.wv.similarity('excellent','outstanding'))
print(model.wv.similarity('excellent','poor'))
print(model.wv.similarity('beautiful','horrible'))
print(model.wv.doesnt_match(['excellent','outstanding','poor']))
print(model.wv.doesnt_match(['excellent','bad','poor']))

[('China', 0.9744254350662231), ('china', 0.7366096377372742), ('USA', 0.7354326248168945)]
0.73620814
0.5710197
0.40271857
poor
bad


In [13]:
del model

The vectors generated by our dataset do not seem to capture the relationships between words accurately. Although it seems to place words that are relevant to our context in proximity in vector space, the relationships are not encoded properly. It is not capable of performing the vector algebra or picking odd one out as well as pretrained word2vec.

## Task 3

In [14]:
Vectorizer = TfidfVectorizer()
tfidf = Vectorizer.fit_transform(tfidfdata['review_body'])

TF_train, TF_test, tf_train, tf_test  = train_test_split(tfidf, 
                            tfidfdata['star_rating'],
                            stratify=tfidfdata['star_rating'],
                            test_size=0.2, random_state=1)

p = Perceptron(random_state=7)
p.fit(TF_train, tf_train)

print("--------------------TF-IDF Features--------------------")
print('Perceptron')
print(classification_report(tf_test, p.predict(TF_test)))

print('-------------------------------------------------------')
print('SVM')
s = LinearSVC(random_state=7, tol= 1e-5)
s.fit(TF_train, tf_train)
print(classification_report(tf_test, s.predict(TF_test)))

--------------------TF-IDF Features--------------------
Perceptron
              precision    recall  f1-score   support

           0       0.62      0.69      0.65      4000
           1       0.56      0.51      0.53      4000
           2       0.71      0.70      0.70      4000

    accuracy                           0.63     12000
   macro avg       0.63      0.63      0.63     12000
weighted avg       0.63      0.63      0.63     12000

-------------------------------------------------------
SVM
              precision    recall  f1-score   support

           0       0.71      0.71      0.71      4000
           1       0.61      0.59      0.60      4000
           2       0.76      0.79      0.77      4000

    accuracy                           0.70     12000
   macro avg       0.69      0.70      0.69     12000
weighted avg       0.69      0.70      0.69     12000



In [15]:
del tfidfdata, Vectorizer, tfidf, TF_train, tf_train, TF_test, tf_test

In [16]:
def transform_w2v(data):
    w2vfeats = []
    for sentence in data:
        word_vecs = [w2v[word] for word in sentence if word in w2v]
        if len(word_vecs):
            sent_vec = np.mean(word_vecs, axis=0)
        else:
            sent_vec = np.zeros(300)
        w2vfeats.append(sent_vec)
    return np.array(w2vfeats)

w2vfeats_np = transform_w2v(w2vdata)
w2vlabels_np = np.array(list(data['star_rating'].astype('int')))

I have averaged all the word vectors for each review. For reviews that have no words in the w2v vocabulary, the feature vector is a list of zeros.

In [17]:
X_train, X_test, y_train, y_test  = train_test_split(w2vfeats_np, 
                            w2vlabels_np,
                            stratify=w2vlabels_np,
                            test_size=0.2, random_state=1)

In [18]:
print("--------------------Word2Vec Features--------------------")
print('Perceptron')
p = Perceptron(random_state=7)
p.fit(X_train, y_train)
print(classification_report(y_test, p.predict(X_test)))

print("---------------------------------------------------------")
print('SVM')
s = LinearSVC(random_state=7, tol= 1e-5)
s.fit(X_train, y_train)
print(classification_report(y_test, s.predict(X_test)))


del p,s

--------------------Word2Vec Features--------------------
Perceptron
              precision    recall  f1-score   support

           0       0.79      0.19      0.30      4000
           1       0.39      0.93      0.55      4000
           2       0.87      0.33      0.48      4000

    accuracy                           0.48     12000
   macro avg       0.68      0.48      0.44     12000
weighted avg       0.68      0.48      0.44     12000

---------------------------------------------------------
SVM
              precision    recall  f1-score   support

           0       0.65      0.68      0.67      4000
           1       0.59      0.55      0.57      4000
           2       0.71      0.72      0.72      4000

    accuracy                           0.65     12000
   macro avg       0.65      0.65      0.65     12000
weighted avg       0.65      0.65      0.65     12000



TF-IDF features - 

Perceptron : 63% accuracy | SVM : 70% accuracy

Word2Vec features - 

Perceptron : 48% accuracy | SVM - 65% accuracy

The models were able to classify better based on TF-IDF features than the word2vec features. This could be due to few reasons - TF-IDF is a statistical method that is intended to improve metrics such as precision and recall. Word2Vec is aimed at capturing semantic relationships. Google's pretrained word2vec was trained on a wide context that may not have seen all these emotions to weigh them well in vector space.

In [19]:
# if torch.cuda.is_available():
#     device = torch.device("cuda")
#     print(device)
# else:
device = torch.device("cpu")

## Task 4

In [20]:
class TrainDataset(Dataset):
    def __init__(self, reviews, labels, transform=None):
        self.reviews = torch.from_numpy(reviews).float()
        self.labels = torch.from_numpy(labels).long()
        self.transform = transform

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, idx):
        review = self.reviews[idx]
        label = self.labels[idx]
        if self.transform:
            review = self.transform(review)
        return review, label

In [21]:
params = {
    'batch_size': 64,
    'shuffle': True,
    'num_workers': 0
}

train_data = TrainDataset(X_train, y_train)
train_loader = DataLoader(train_data, **params)

valid_data = TrainDataset(X_test, y_test)
valid_loader = DataLoader(valid_data, **params)

Created a custom Dataset class inheriting from torch's Dataset and used a dataloader for batching

### 4(a)

In [22]:
class MLP1(nn.Module):
    def __init__(self):
        super(MLP1, self).__init__()
        self.fc1 = nn.Linear(300, 100)
        self.fc2 = nn.Linear(100, 10)
        self.fc3 = nn.Linear(10, 3)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        x = F.log_softmax(self.fc3(x), dim=1)
        # x = self.fc3(x)
        return x

mlp1 = MLP1()
mlp1 = mlp1.to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(mlp1.parameters(), lr=0.01)

In [23]:
max_epochs = 30
valid_loss_min = np.Inf

for epoch in range(max_epochs):
    train_loss = 0
    valid_loss = 0
    mlp1.train()
    train_correct=0
    for batch, labels in train_loader:
        batch, labels = batch.to(device), labels.to(device)
        optimizer.zero_grad()
        output = mlp1(batch)
        loss = loss_fn(output, labels)
        predicted = torch.argmax(output, dim=1)
        loss.backward()
        optimizer.step()
        train_loss+= loss.item()
        train_correct+= (predicted==labels).sum().item()

    mlp1.eval()
    valid_correct=0
    with torch.no_grad():
        for batch, labels in valid_loader:
            batch, labels = batch.to(device), labels.to(device)
            output = mlp1(batch)
            loss = loss_fn(output, labels)
            predicted = torch.argmax(output, dim=1)
            valid_loss+= loss.item()
            valid_correct+=(predicted==labels).sum().item()
    
    train_loss = train_loss/len(train_loader)
    valid_loss = valid_loss/len(valid_loader)
    train_acc = 100 * train_correct / len(train_data)
    valid_acc = 100 * valid_correct / len(valid_data)
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f} \tTraining Accuracy: {:.6f} \t Validation Accuracy: {:.6f}'.format(
        epoch+1, 
        train_loss,
        valid_loss,
        train_acc,
        valid_acc
        ))

    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        valid_loss_min,
        valid_loss))
        torch.save(mlp1.state_dict(), 'mlp1.pt')
        valid_loss_min = valid_loss

del mlp1

Epoch: 1 	Training Loss: 0.882419 	Validation Loss: 0.805861 	Training Accuracy: 58.970833 	 Validation Accuracy: 64.666667
Validation loss decreased (inf --> 0.805861).  Saving model ...
Epoch: 2 	Training Loss: 0.838510 	Validation Loss: 0.840016 	Training Accuracy: 62.397917 	 Validation Accuracy: 62.116667
Epoch: 3 	Training Loss: 0.823994 	Validation Loss: 0.791876 	Training Accuracy: 62.995833 	 Validation Accuracy: 64.216667
Validation loss decreased (0.805861 --> 0.791876).  Saving model ...
Epoch: 4 	Training Loss: 0.820845 	Validation Loss: 0.786919 	Training Accuracy: 63.077083 	 Validation Accuracy: 64.991667
Validation loss decreased (0.791876 --> 0.786919).  Saving model ...
Epoch: 5 	Training Loss: 0.811417 	Validation Loss: 0.802333 	Training Accuracy: 63.672917 	 Validation Accuracy: 65.258333
Epoch: 6 	Training Loss: 0.803838 	Validation Loss: 0.788787 	Training Accuracy: 64.104167 	 Validation Accuracy: 64.850000
Epoch: 7 	Training Loss: 0.800308 	Validation Loss: 0.

In [24]:
#model.load_state_dict(torch.load('mlp1.pt')) to load best valid_loss model

### 4(b)

In [25]:
def transform_w2v_10words(data):
    w2vfeats = []
    for sentence in data:
        word_vecs = [w2v[word] for word in sentence if word in w2v]
        sent_vec = np.concatenate([word_vecs[i] if i < len(word_vecs) else np.zeros(300) for i in range(10)], axis=0)
        w2vfeats.append(sent_vec)
    return np.array(w2vfeats)

w2vfeats_np = transform_w2v_10words(w2vdata)

Concatenated only first 10 words to generate a vector for reviews.

In [26]:
X_train, X_test, y_train, y_test  = train_test_split(w2vfeats_np, 
                            w2vlabels_np,
                            stratify=w2vlabels_np,
                            test_size=0.2, random_state=1)

In [27]:
params = {
    'batch_size': 64,
    'shuffle': True,
    'num_workers': 0
}

train_data = TrainDataset(X_train, y_train)
train_loader = DataLoader(train_data, **params)

valid_data = TrainDataset(X_test, y_test)
valid_loader = DataLoader(valid_data, **params)

In [28]:
class MLP2(nn.Module):
    def __init__(self):
        super(MLP2, self).__init__()
        self.fc1 = nn.Linear(3000, 100)
        self.fc2 = nn.Linear(100, 10)
        self.fc3 = nn.Linear(10, 3)
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        x = F.log_softmax(self.fc3(x), dim=1)
        return x

mlp2 = MLP2()
mlp2 = mlp2.to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(mlp2.parameters(), lr=0.005)

In [29]:
max_epochs = 30
valid_loss_min = np.Inf

for epoch in range(max_epochs):
    train_loss = 0
    valid_loss = 0
    mlp2.train()
    train_correct=0
    for batch, labels in train_loader:
        batch, labels = batch.to(device), labels.to(device)
        optimizer.zero_grad()
        output = mlp2(batch)
        loss = loss_fn(output, labels)
        predicted = torch.argmax(output, dim=1)
        loss.backward()
        optimizer.step()
        train_loss+= loss.item()
        train_correct+= (predicted==labels).sum().item()

    mlp2.eval()
    valid_correct=0
    with torch.no_grad():
        for batch, labels in valid_loader:
            batch, labels = batch.to(device), labels.to(device)
            output = mlp2(batch)
            loss = loss_fn(output, labels)
            predicted = torch.argmax(output, dim=1)
            valid_loss+= loss.item()
            valid_correct+=(predicted==labels).sum().item()
    
    train_loss = train_loss/len(train_loader)
    valid_loss = valid_loss/len(valid_loader)
    train_acc = 100 * train_correct / len(train_data)
    valid_acc = 100 * valid_correct / len(valid_data)
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f} \tTraining Accuracy: {:.6f} \t Validation Accuracy: {:.6f}'.format(
        epoch+1, 
        train_loss,
        valid_loss,
        train_acc,
        valid_acc
        ))

    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        valid_loss_min,
        valid_loss))
        torch.save(mlp2.state_dict(), 'mlp2.pt')
        valid_loss_min = valid_loss

del mlp2

Epoch: 1 	Training Loss: 0.976678 	Validation Loss: 0.924451 	Training Accuracy: 50.704167 	 Validation Accuracy: 54.841667
Validation loss decreased (inf --> 0.924451).  Saving model ...
Epoch: 2 	Training Loss: 0.917402 	Validation Loss: 0.902421 	Training Accuracy: 56.052083 	 Validation Accuracy: 56.175000
Validation loss decreased (0.924451 --> 0.902421).  Saving model ...
Epoch: 3 	Training Loss: 0.884060 	Validation Loss: 0.900157 	Training Accuracy: 57.893750 	 Validation Accuracy: 56.175000
Validation loss decreased (0.902421 --> 0.900157).  Saving model ...
Epoch: 4 	Training Loss: 0.850590 	Validation Loss: 0.909461 	Training Accuracy: 59.993750 	 Validation Accuracy: 56.275000
Epoch: 5 	Training Loss: 0.820458 	Validation Loss: 0.920390 	Training Accuracy: 61.864583 	 Validation Accuracy: 56.175000
Epoch: 6 	Training Loss: 0.790396 	Validation Loss: 0.934739 	Training Accuracy: 63.789583 	 Validation Accuracy: 54.891667
Epoch: 7 	Training Loss: 0.762101 	Validation Loss: 0.

Observed that in the case of 4(b), the training loss decreases while validation loss increases. Experimented with dropout and lower learning rates but this phenomenon seems to occur regardless. The model in 4(a) performs significantly better than a single perceptron. Adding complexities to the model with non-linearity and layers has improved the performance from 48% to 66.1%. The disparity between 4(a) and 4(b) is possibly because 4(a) tries to capture the entirety of a review while still maintaining lower dimensions whereas 4(b) concatenates the first 10 words, which may not contain all necessary information to classify, and also increases the dimensions of features.

## Task 5

In [30]:
def transform_rnn(data):
    w2vfeats = []
    for sentence in data:    
        word_vecs = [w2v[word] for word in sentence if word in w2v]
        sent_vec = [word_vecs[i] if i < len(word_vecs) else np.zeros(300) for i in range(20)]
        w2vfeats.append(sent_vec)
    return np.array(w2vfeats)

w2vfeats_np = transform_rnn(w2vdata)

Created word2vec features for review by appending 20 word vectors individually without performing any mathematical operations. If there are less than 20 words, the feature is padded with zeros; If there are more than 20, review is truncated at 20 words. This is the input required for RNNs.

In [31]:
X_train, X_test, y_train, y_test  = train_test_split(w2vfeats_np, 
                            w2vlabels_np,
                            stratify=w2vlabels_np,
                            test_size=0.2, random_state=1)

In [32]:
params = {
    'batch_size': 64,
    'shuffle': True,
    'num_workers': 0
}

train_data = TrainDataset(X_train, y_train)
train_loader = DataLoader(train_data, **params)

valid_data = TrainDataset(X_test, y_test)
valid_loader = DataLoader(valid_data, **params)

### 5(a)

In [33]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size, hidden_size, 1, batch_first=True, nonlinearity='relu')
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        batch_size = x.size(0)
        hidden = torch.zeros(1, batch_size, self.hidden_size).to(device)
        out, hidden = self.rnn(x, hidden)
        out = F.log_softmax(self.fc(out[:,-1,:]), dim=1) #Last output alone to calculate loss
        return out

rnn = RNN(300, 20, 3)
rnn = rnn.to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(rnn.parameters(), lr=0.005)

In [34]:
max_epochs = 30
valid_loss_min = np.Inf

for epoch in range(max_epochs):
    train_loss = 0
    valid_loss = 0
    rnn.train()
    
    train_correct=0
    for batch, labels in train_loader:
        batch, labels = batch.to(device), labels.to(device)
        optimizer.zero_grad()
        output = rnn(batch)
        # print(output.shape, '|', labels.shape)
        loss = loss_fn(output, labels)
        loss.backward()
        optimizer.step()
        train_loss+= loss.item()
        train_correct+= (torch.argmax(output, dim=1)==labels).sum().item()

    rnn.eval()
    valid_correct=0
    with torch.no_grad():
        for batch, labels in valid_loader:
            batch, labels = batch.to(device), labels.to(device)
            output = rnn(batch)
            loss=loss_fn(output, labels)
            valid_loss+= loss.item()
            valid_correct+= (torch.argmax(output, dim=1)==labels).sum().item()
    
    train_loss = train_loss/len(train_loader)
    valid_loss = valid_loss/len(valid_loader)
    train_acc = 100 * train_correct / len(train_data)
    valid_acc = 100 * valid_correct / len(valid_data)
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f} \tTraining Accuracy: {:.6f} \t Validation Accuracy: {:.6f}'.format(
        epoch+1, 
        train_loss,
        valid_loss,
        train_acc,
        valid_acc
        ))

    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        valid_loss_min,
        valid_loss))
        torch.save(rnn.state_dict(), 'rnn.pt')
        valid_loss_min = valid_loss

Epoch: 1 	Training Loss: 0.973483 	Validation Loss: 0.924852 	Training Accuracy: 50.408333 	 Validation Accuracy: 55.008333
Validation loss decreased (inf --> 0.924852).  Saving model ...
Epoch: 2 	Training Loss: 0.884785 	Validation Loss: 0.866593 	Training Accuracy: 57.091667 	 Validation Accuracy: 57.658333
Validation loss decreased (0.924852 --> 0.866593).  Saving model ...
Epoch: 3 	Training Loss: 0.859548 	Validation Loss: 0.863384 	Training Accuracy: 59.220833 	 Validation Accuracy: 58.991667
Validation loss decreased (0.866593 --> 0.863384).  Saving model ...
Epoch: 4 	Training Loss: 0.841908 	Validation Loss: 0.880996 	Training Accuracy: 60.531250 	 Validation Accuracy: 58.741667
Epoch: 5 	Training Loss: 0.828599 	Validation Loss: 0.850196 	Training Accuracy: 61.337500 	 Validation Accuracy: 60.008333
Validation loss decreased (0.863384 --> 0.850196).  Saving model ...
Epoch: 6 	Training Loss: 0.822405 	Validation Loss: 0.841068 	Training Accuracy: 61.489583 	 Validation Accur

FFNN - 66.1% | RNN - 61.4% (Accuracy of models with best validation loss)

RNN accuracy is slightly lower as compared to FFNN. The training loss is decreasing while the validation loss is increasing. Experimented by adding dropout and reducing learning rate but RNN still seems to try to overfit on the data. The discrepancy in accuracy values could be possibly due to RNN only considering 20 words whereas the FFNN averages the entire review.

### 5(b)

In [35]:
class GRU(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(GRU, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True, dropout=0.2)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        batch_size = x.size(0)
        hidden = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(device)
        out, hidden = self.gru(x, hidden)
        out = F.log_softmax(self.fc(out[:,-1,:]), dim=1) #Taking the last output alone to calculate loss
        return out

gru = GRU(300, 20, 2, 3)
gru = gru.to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(gru.parameters(), lr=0.005)

In [36]:
max_epochs = 30
valid_loss_min = np.Inf

for epoch in range(max_epochs):
    train_loss = 0
    valid_loss = 0
    gru.train()
    
    train_correct=0
    for batch, labels in train_loader:
        batch, labels = batch.to(device), labels.to(device)
        optimizer.zero_grad()
        output = gru(batch)
        # print(output.shape, '|', labels.shape)
        loss = loss_fn(output, labels)
        loss.backward()
        optimizer.step()
        train_loss+= loss.item()
        train_correct+= (torch.argmax(output, dim=1)==labels).sum().item()

    gru.eval()
    valid_correct=0
    with torch.no_grad():
        for batch, labels in valid_loader:
            batch, labels = batch.to(device), labels.to(device)
            output = gru(batch)
            loss=loss_fn(output, labels)
            valid_loss+= loss.item()
            valid_correct+= (torch.argmax(output, dim=1)==labels).sum().item()
    
    train_loss = train_loss/len(train_loader)
    valid_loss = valid_loss/len(valid_loader)
    train_acc = 100 * train_correct / len(train_data)
    valid_acc = 100 * valid_correct / len(valid_data)
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f} \tTraining Accuracy: {:.6f} \t Validation Accuracy: {:.6f}'.format(
        epoch+1, 
        train_loss,
        valid_loss,
        train_acc,
        valid_acc
        ))

    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        valid_loss_min,
        valid_loss))
        torch.save(gru.state_dict(), 'gru.pt')
        valid_loss_min = valid_loss

Epoch: 1 	Training Loss: 0.852240 	Validation Loss: 0.782885 	Training Accuracy: 59.668750 	 Validation Accuracy: 63.900000
Validation loss decreased (inf --> 0.782885).  Saving model ...
Epoch: 2 	Training Loss: 0.758170 	Validation Loss: 0.759034 	Training Accuracy: 65.779167 	 Validation Accuracy: 65.200000
Validation loss decreased (0.782885 --> 0.759034).  Saving model ...
Epoch: 3 	Training Loss: 0.726154 	Validation Loss: 0.749555 	Training Accuracy: 67.587500 	 Validation Accuracy: 66.350000
Validation loss decreased (0.759034 --> 0.749555).  Saving model ...
Epoch: 4 	Training Loss: 0.703309 	Validation Loss: 0.748379 	Training Accuracy: 68.800000 	 Validation Accuracy: 66.275000
Validation loss decreased (0.749555 --> 0.748379).  Saving model ...
Epoch: 5 	Training Loss: 0.682284 	Validation Loss: 0.742501 	Training Accuracy: 69.950000 	 Validation Accuracy: 66.800000
Validation loss decreased (0.748379 --> 0.742501).  Saving model ...
Epoch: 6 	Training Loss: 0.665869 	Valid

GRU - 66.8% (Accuracy after 5 epochs with best validation loss)

Validation loss is increasing with more epochs while training loss is decreasing, showing signs of overfitting. At end of 30 epochs, validation accuracy is 64.25% which is still better than a simple RNN.

### 5(c)

In [39]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=0.2)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        batch_size = x.size(0)
        hidden = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(device)
        cell_state = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(device)
        out, (hidden, cell_state) = self.lstm(x, (hidden, cell_state))
        # print(out.shape)
        out = F.log_softmax(self.fc(out[:,-1,:]), dim=1) #Taking the last output alone to calculate loss
        return out

lstm = LSTM(300, 20, 2, 3)
lstm = lstm.to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(lstm.parameters(), lr=0.001)

In [40]:
max_epochs = 30
valid_loss_min = np.Inf

for epoch in range(max_epochs):
    train_loss = 0
    valid_loss = 0
    lstm.train()
    
    train_correct=0
    for batch, labels in train_loader:
        batch, labels = batch.to(device), labels.to(device)
        optimizer.zero_grad()
        output = lstm(batch)
        # print(output.shape, '|', labels.shape)
        loss = loss_fn(output, labels)
        loss.backward()
        optimizer.step()
        train_loss+= loss.item()
        train_correct+= (torch.argmax(output, dim=1)==labels).sum().item()

    lstm.eval()
    valid_correct=0
    with torch.no_grad():
        for batch, labels in valid_loader:
            batch, labels = batch.to(device), labels.to(device)
            output = lstm(batch)
            loss=loss_fn(output, labels)
            valid_loss+= loss.item()
            valid_correct+= (torch.argmax(output, dim=1)==labels).sum().item()
    
    train_loss = train_loss/len(train_loader)
    valid_loss = valid_loss/len(valid_loader)
    train_acc = 100 * train_correct / len(train_data)
    valid_acc = 100 * valid_correct / len(valid_data)
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f} \tTraining Accuracy: {:.6f} \t Validation Accuracy: {:.6f}'.format(
        epoch+1, 
        train_loss,
        valid_loss,
        train_acc,
        valid_acc
        ))

    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        valid_loss_min,
        valid_loss))
        torch.save(lstm.state_dict(), 'lstm.pt')
        valid_loss_min = valid_loss

Epoch: 1 	Training Loss: 0.943884 	Validation Loss: 0.885581 	Training Accuracy: 52.981250 	 Validation Accuracy: 57.966667
Validation loss decreased (inf --> 0.885581).  Saving model ...
Epoch: 2 	Training Loss: 0.834866 	Validation Loss: 0.825576 	Training Accuracy: 61.293750 	 Validation Accuracy: 61.766667
Validation loss decreased (0.885581 --> 0.825576).  Saving model ...
Epoch: 3 	Training Loss: 0.797658 	Validation Loss: 0.809182 	Training Accuracy: 63.581250 	 Validation Accuracy: 62.716667
Validation loss decreased (0.825576 --> 0.809182).  Saving model ...
Epoch: 4 	Training Loss: 0.777975 	Validation Loss: 0.789377 	Training Accuracy: 64.693750 	 Validation Accuracy: 63.858333
Validation loss decreased (0.809182 --> 0.789377).  Saving model ...
Epoch: 5 	Training Loss: 0.760450 	Validation Loss: 0.779643 	Training Accuracy: 65.758333 	 Validation Accuracy: 64.375000
Validation loss decreased (0.789377 --> 0.779643).  Saving model ...
Epoch: 6 	Training Loss: 0.745741 	Valid

RNN - Best model : 61.4% accuracy | GRU - Best model : 66.8% accuracy | LSTM - Best model : 66.37% accuracy

GRU offers significant improvement over RNN due to the mechanism of gates that are capable of learning which inputs are important and need to be remembered. This additional gate mechanism solves the vanishing gradient problem of RNN but is causing the GRU to overfit on training data.

LSTM is computationally slightly more expensive than GRU and offers solutions to the problem of overfitting by adding a forget gate. This results in better performance in both training and validation data. 

## References

1. https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html
2. https://www.kaggle.com/code/mishra1993/pytorch-multi-layer-perceptron-mnist/notebook
3. https://pytorch.org/docs/stable/generated/torch.nn.RNN.html
4. https://pytorch.org/docs/stable/generated/torch.nn.GRU.html
5. https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html