In [1]:
from typing import Optional, Callable
from typing_extensions import TypeAlias

import pandas as pd
import numpy as np
import nltk
import re
from bs4 import BeautifulSoup
import contractions

import os
from urllib import request
import gzip

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

In [2]:
import torch
import random

seed = 42
torch.manual_seed(seed)
random.seed(seed)
np.random.seed(0)

# Task 1: Dataset Generation

In [3]:
url = 'https://web.archive.org/web/20201127142707if_/https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Office_Products_v1_00.tsv.gz'

extracted_file = 'data.tsv'
compressed_file = extracted_file + '.gz'


# Retrieve the dataset from given url and store it in location specified by compressed_file
if not os.path.exists(extracted_file):
    request.urlretrieve(url, compressed_file)

    # extract the dataset from the gzipped file
    with gzip.open(compressed_file, 'rb') as f_in, open(extracted_file, 'wb') as f_out:
        for line in f_in:
            f_out.write(line)

    os.remove(compressed_file)
    

# read the extracted data into pandas dataframe
original_df = pd.read_csv(extracted_file, sep='\t', on_bad_lines='skip', low_memory=False)
print(original_df.head())

  marketplace  customer_id       review_id  product_id  product_parent  \
0          US     43081963  R18RVCKGH1SSI9  B001BM2MAC       307809868   
1          US     10951564  R3L4L6LW1PUOFY  B00DZYEXPQ        75004341   
2          US     21143145  R2J8AWXWTDX2TF  B00RTMUHDW       529689027   
3          US     52782374  R1PR37BR7G3M6A  B00D7H8XB6       868449945   
4          US     24045652  R3BDDDZMZBZDPU  B001XCWP34        33521401   

                                       product_title product_category  \
0     Scotch Cushion Wrap 7961, 12 Inches x 100 Feet  Office Products   
1          Dust-Off Compressed Gas Duster, Pack of 4  Office Products   
2  Amram Tagger Standard Tag Attaching Tagging Gu...  Office Products   
3  AmazonBasics 12-Sheet High-Security Micro-Cut ...  Office Products   
4  Derwent Colored Pencils, Inktense Ink Pencils,...  Office Products   

  star_rating  helpful_votes  total_votes vine verified_purchase  \
0           5            0.0          0.0    N  

In [4]:
# creating the dataframe by taking only review_body and star_rating columns
df = pd.DataFrame(original_df[['review_body', 'star_rating']])
print(df.head())

# we notice there are some erroneous values for the star_rating column
print(df['star_rating'].unique())

# converting the star_rating to numeric values and dropping erroneous columns
df['star_rating'] = pd.to_numeric(df['star_rating'], errors='coerce')
df.dropna(inplace=True)

print(df['star_rating'].unique())

                                         review_body star_rating
0                                     Great product.           5
1  What's to say about this commodity item except...           5
2    Haven't used yet, but I am sure I will like it.           5
3  Although this was labeled as &#34;new&#34; the...           1
4                    Gorgeous colors and easy to use           4
['5' '1' '4' '2' '3' '2015-06-05' '2015-02-11' nan '2014-02-14']
[5. 1. 4. 2. 3.]


In [5]:
# creating the target column: target = 1 if star_rating is 1, 2 or 3. target = 2 if star_rating is 4 or 5
df['star_rating'] = df['star_rating'].astype(int)
df['target'] = df['star_rating'].apply(lambda x: 0 if x <= 3 else 1)

sample_size = 50000

# creating a sample dataframe where target = 1 of size 50000 rows
class_1 = df.loc[df['target'] == 0].sample(n=sample_size, random_state=42)

# creating a sample dataframe where target = 2 of size 50000 rows
class_2 = df.loc[df['target'] == 1].sample(n=sample_size, random_state=42)

# merging the two sample dataframes
df_new = pd.concat([class_1, class_2], ignore_index=True)

In [6]:
def clean(review):
    """
    convert to lower-case
    remove html and urls
    remove non-alphabetical character
    remove extra spaces
    """
    
    # converting to lowercase
    review = review.lower()
    
    # removing htmls
    soup = BeautifulSoup(review, "html.parser")
    
    for a_tag in soup.find_all("a"):
        a_tag.decompose()
        
    review = soup.get_text()
    
    # removing urls
    review = re.sub(r'^https?:\/\/.*[\r\n]*', '', review)
    
    # removing non-alphabetical characters
    review = re.sub(r'[^a-zA-Z\s]', '', review)
    
    # removing extra spaces
    review = re.sub(r'\s+', ' ', review).strip()
        
    return review
    

df_new['review_body'] = df_new['review_body'].apply(clean)

  soup = BeautifulSoup(review, "html.parser")
  soup = BeautifulSoup(review, "html.parser")


# Task 2: Creating the Word2Vec Models

When comparing a pretrained model with a custom word2vec model, we find a substantial difference in vocabulary size: 3,000,000 unique words in the pretrained model versus 14,994 in the custom model. This suggests the pretrained model may handle out-of-vocabulary words better during testing.

In semantic similarity tests:
1. Outstanding and excellent show lower similarity in the pretrained model.
2. The arithmetic "King - Man + Woman" correctly yields "Queen" in the pretrained model, but not in the custom model.
3. The arithmetic "Doctor - Man + Woman" doesn't produce nurse-related results in the custom model.

Overall, the pretrained model performs better in some semantic tasks but struggles with similarity in specific cases.

## Word2Vec from pretrained

In [7]:
import gensim.downloader
from gensim.models import KeyedVectors

# downloading the pre-trained model if it is not available
if not os.path.exists('pretrained_w2v.model'):
    pretrained_w2v = gensim.downloader.load('word2vec-google-news-300')
    pretrained_w2v.save('pretrained_w2v.model')
# if available, we load the data from local storage
else:
    pretrained_w2v = KeyedVectors.load('pretrained_w2v.model')

In [8]:
print(len(pretrained_w2v), len(pretrained_w2v[0]))

3000000 300


In [9]:
# printing the similarity score for outstanding and excellent
print(pretrained_w2v.similarity('outstanding', 'excellent'))

0.55674857


In [10]:
# printing the most similar words matching the arithmetic king - man + woman
print(pretrained_w2v.most_similar(positive=['king', 'woman'], negative=['man'], topn=5))

[('queen', 0.7118193507194519), ('monarch', 0.6189674139022827), ('princess', 0.5902431011199951), ('crown_prince', 0.5499460697174072), ('prince', 0.5377321839332581)]


In [11]:
# printing the most similar words matching the arithmetic doctor - man + queen
print(pretrained_w2v.most_similar(positive=['doctor', 'woman'], negative=['man'], topn=5))

[('gynecologist', 0.7093892097473145), ('nurse', 0.6477287411689758), ('doctors', 0.6471460461616516), ('physician', 0.6438996195793152), ('pediatrician', 0.6249487996101379)]


In [12]:
# Creating the vocabulary from the pretrained w2v
vocab = list(pretrained_w2v.index_to_key)

# Initialize an embedding matrix with zeros
embedding_dim = pretrained_w2v.vector_size
embedding_matrix = np.zeros((len(vocab), embedding_dim), dtype=np.float32)

# Populate the embedding matrix with Word2Vec vectors
for i, word in enumerate(vocab):
    if word in pretrained_w2v:
        # print(word, i, custom_w2v.wv[word])
        embedding_matrix[i] = pretrained_w2v[word]

print(len(embedding_matrix), len(embedding_matrix[0]))

3000000 300


## From dataset (custom)

In [13]:
from gensim.models import Word2Vec

# getting the tokens
tokenized = [nltk.word_tokenize(review) for review in df_new['review_body']]

# if the model is not already available, then create from scratch
if not os.path.exists('custom_w2v.model'):
    custom_w2v = Word2Vec(tokenized, vector_size=300, window=13, min_count=9, sg=1, workers=1)
    custom_w2v.save('custom_w2v.model')
# if available, load the model from local storage
else:
    custom_w2v = Word2Vec.load('custom_w2v.model')

In [14]:
print(len(custom_w2v.wv), len(custom_w2v.wv[0]))

14994 300


In [15]:
# printing the similarity score for outstanding and excellent
print(custom_w2v.wv.similarity('outstanding', 'excellent'))

0.6205609


In [16]:
# printing the most similar words matching the arithmetic king - man + woman
print(custom_w2v.wv.most_similar(positive=['king', 'woman'], negative=['man'], topn=5))

[('Idea', 0.5137808918952942), ('Archival', 0.5026917457580566), ('fine-point', 0.49942412972450256), ('inherited', 0.4969730079174042), ('Flair', 0.4938381314277649)]


In [17]:
# printing the most similar words matching the arithmetic doctor - man + queen
print(custom_w2v.wv.most_similar(positive=['doctor', 'woman'], negative=['man'], topn=5))

[('assistant', 0.4972769320011139), ('visiting', 0.4651092290878296), ('deadlines', 0.45854732394218445), ('appointment', 0.45726412534713745), ('prayer', 0.4571249485015869)]


# Task 3: Simple Models


* **Perceptron** achieved **80.77% accuracy** on word embeddings, which is slightly better than **79.345%** using TF-IDF.
* **SVM** attained **82.665% accuracy** on word embeddings, slightly lower than **84.865%** using TF-IDF.

* In general, word embeddings exhibit competitive performance, however, in the SVM task, we notice slightly less performance.

**Note**: The two figures below show the accuracy obtained for perceptron and SVM when training on TF-IDF. These were obtained by modifying the metric of calculation in Homework 1 without any changes as to how the models were trained or features were extracted.

![Accuracy of perceptron on TF-IDF](img/perceptron_accuracy.png)
<center><strong>Accuracy of perceptron on TF-IDF</strong></center>

![Accuracy for SVM on TF-IDF](img/svm_accuracy.png)
<center><strong>Accuracy of SVM on TF-IDF</strong></center>

### Creating mean sentence embeddings

In [18]:
# a function to create mean embeddings for a sentence given a w2v model
def create_avg_embeddings(sentence, w2v):
    tokens = nltk.word_tokenize(sentence)
    vectors = [w2v[word] for word in tokens if word in w2v]
    
    if vectors:
        embedding = np.mean(vectors, axis=0, dtype=np.float32)
    else:
        embedding = np.zeros(w2v.vector_size, dtype=np.float32)
        
    return embedding

In [19]:
avg_embeddings = df_new['review_body'].apply(lambda x: create_avg_embeddings(x, pretrained_w2v))
avg_embeddings = np.array(avg_embeddings.tolist())

targets = df_new['target']

In [20]:
X_train_avg, X_test_avg, Y_train_avg, Y_test_avg = train_test_split(
    avg_embeddings,
    df_new['target'],
    shuffle=True,
    test_size=0.2,
    random_state=42
)

Y_train_avg = np.array(Y_train_avg.tolist())
Y_test_avg = np.array(Y_test_avg.tolist())

### Perceptron training

In [21]:
per_clf = Perceptron(penalty='elasticnet', l1_ratio=0.8, alpha=1e-5, tol=1e-4, random_state=42)
per_clf.fit(list(X_train_avg), Y_train_avg)

per_Y_preds = per_clf.predict(list(X_test_avg))
per_acc = accuracy_score(per_Y_preds, Y_test_avg)
print(per_acc)

0.8077


### SVM training

In [22]:
svc_clf = LinearSVC(dual=True, loss='hinge', C=0.8, max_iter=10000, random_state=42)
svc_clf.fit(list(X_train_avg), Y_train_avg)

svc_Y_preds = svc_clf.predict(list(X_test_avg))
svc_acc = accuracy_score(svc_Y_preds, Y_test_avg)
print(svc_acc)

0.82665


# Task 4: Feedforward Neural Network

In [23]:
# general purpose function for training a model with given training data 
def train(model, train_data, test_data, criterion, optimizer, n_epochs=10, verbose=True, device='cpu'):
    for epoch in range(n_epochs):
        train_loss = 0.
        test_loss = 0.
        
        model.train()
        for data, target in train_data:
            # shifting data and target to the appropriate device
            data = data.to(device)
            target = target.to(device)
            
            # setting the gradients to zero
            optimizer.zero_grad()
            
            # getting the output and calculating the loss
            out = model(data)
            loss = criterion(out, target)
            
            # performing the backward step and using the optimizer
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item()
            
        model.eval()
        with torch.no_grad():
            for data, target in test_data:
                # shifting data and target to appropriate device
                data = data.to(device)
                target = target.to(device)
                
                # getting the output and then getting the loss and updating the total test loss
                out = model(data)
                
                loss = criterion(out, target)
                test_loss += loss.item()
                
        if verbose:
            print(f'Epoch: {epoch+1} / {n_epochs}\tTraining Loss: {train_loss}\tTest Loss: {test_loss}')

In [24]:
# a function to calculate accuracy on test data for a given model
def accuracy(model, test_data, device='cpu'):
    correct, total = 0, 0
    
    with torch.no_grad():
        for data, target in test_data:
            # shifting data and target to appropriate device
            data = data.to(device)
            target = target.to(device)
            
            # getting the output and predictions
            out = model(data)
            _, preds = torch.max(out.data, 1)
            
            # updating the total and correct variables
            total += target.size(0)
            correct += (preds == target).sum().item()
            
    return (100*correct) / total

In [25]:
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
import torch.optim as optim

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# converting numpy arrays created for perceptron and svm to torch tensors
X_train_avg = torch.tensor(X_train_avg, dtype=torch.float32)
X_test_avg = torch.tensor(X_test_avg, dtype=torch.float32)

Y_train_avg = torch.tensor(Y_train_avg, dtype=torch.int64)
Y_test_avg = torch.tensor(Y_test_avg, dtype=torch.int64)

# generating dataset from created tensors
train_dataset1 = TensorDataset(X_train_avg, Y_train_avg)
test_dataset1 = TensorDataset(X_test_avg, Y_test_avg)

batch_size = 256
# creating train and test data loaders
train_loader1 = DataLoader(train_dataset1, batch_size=batch_size, shuffle=True)
test_loader1 = DataLoader(test_dataset1, batch_size=batch_size)

print(device)

cuda:0


## Task 4 (a)
* Here, we train a feedforward neural network on sentence embeddings obtained by calculating the mean of all word embeddings in the sentence.
* We train the neural network using AdamW optimizer with 1e-4 learning rate for 100 epochs with batch size 256.
* The accuracy on test set for this model is roughly between **82% - 84%**.

In [26]:
class NeuralNetwork1(nn.Module):
    def __init__(self):
        super(NeuralNetwork1, self).__init__()
        self.embedding_dim = 300
        self.hidden1 = 50
        self.hidden2 = 5
        self.out_dim = 2
        
        self.linear = nn.Sequential(
            nn.Linear(self.embedding_dim, self.hidden1),
            nn.ReLU(),
            nn.Linear(self.hidden1, self.hidden2),
            nn.ReLU(),
            nn.Linear(self.hidden2, self.out_dim),
            
        )
        
    def forward(self, x):
        x = self.linear(x)
        
        return x
    
model1 = NeuralNetwork1()
model1.to(device)
print(model1)

NeuralNetwork1(
  (linear): Sequential(
    (0): Linear(in_features=300, out_features=50, bias=True)
    (1): ReLU()
    (2): Linear(in_features=50, out_features=5, bias=True)
    (3): ReLU()
    (4): Linear(in_features=5, out_features=2, bias=True)
  )
)


In [27]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model1.parameters(), lr=1e-4)

In [28]:
# training the model
train(model1, train_loader1, test_loader1, criterion, optimizer, n_epochs=100, device=device)

Epoch: 1 / 100	Training Loss: 223.27994138002396	Test Loss: 54.716657280921936
Epoch: 2 / 100	Training Loss: 205.70290875434875	Test Loss: 48.29630637168884
Epoch: 3 / 100	Training Loss: 176.52640929818153	Test Loss: 41.207524448633194
Epoch: 4 / 100	Training Loss: 155.0030519068241	Test Loss: 37.60107463598251
Epoch: 5 / 100	Training Loss: 144.89616572856903	Test Loss: 35.95973256230354
Epoch: 6 / 100	Training Loss: 140.06317156553268	Test Loss: 35.0801557302475
Epoch: 7 / 100	Training Loss: 137.1439170241356	Test Loss: 34.57513916492462
Epoch: 8 / 100	Training Loss: 135.18273961544037	Test Loss: 34.08438017964363
Epoch: 9 / 100	Training Loss: 133.62331506609917	Test Loss: 33.74214479327202
Epoch: 10 / 100	Training Loss: 132.32582929730415	Test Loss: 33.46182382106781
Epoch: 11 / 100	Training Loss: 131.22946453094482	Test Loss: 33.21343466639519
Epoch: 12 / 100	Training Loss: 130.28965264558792	Test Loss: 32.998418152332306
Epoch: 13 / 100	Training Loss: 129.46428593993187	Test Loss: 

In [29]:
part_4a_accuracy = accuracy(model1, test_loader1, device)
print(part_4a_accuracy)

83.965


In [30]:
def create_pad_embeddings(sentence, w2v, max_len=10):
    tokens = nltk.word_tokenize(sentence)
    vec_size = w2v.vector_size
    
    embedding = np.zeros((max_len * vec_size), dtype=np.float32)
    for i, word in enumerate(tokens):
        if i >= max_len:
            break
            
        if word in w2v:
            embedding[i*vec_size: (i+1)*vec_size] = w2v[word]
                  
    return embedding

sentence = "This is a sample sentence"
sample_embedding = create_pad_embeddings(sentence, pretrained_w2v)
print(sample_embedding.shape)

print(create_avg_embeddings(sentence, pretrained_w2v).shape)

(3000,)
(300,)


In [31]:
pad_embeddings = df_new['review_body'].apply(lambda x: create_pad_embeddings(x, pretrained_w2v))
pad_embeddings = np.array(pad_embeddings.tolist())

In [32]:
X_train_pad, X_test_pad, Y_train_pad, Y_test_pad = train_test_split(
    pad_embeddings,
    targets,
    shuffle=True, 
    test_size=0.2, 
    random_state=42
)

Y_train_pad = np.array(Y_train_pad.tolist())
Y_test_pad = np.array(Y_test_pad.tolist())

In [33]:
X_train_pad = torch.tensor(X_train_pad, dtype=torch.float32)
X_test_pad = torch.tensor(X_test_pad, dtype=torch.float32)

Y_train_pad = torch.tensor(Y_train_pad, dtype=torch.int64)
Y_test_pad = torch.tensor(Y_test_pad, dtype=torch.int64)

train_dataset2 = TensorDataset(X_train_pad, Y_train_pad)
test_dataset2 = TensorDataset(X_test_pad, Y_test_pad)

batch_size = 256
train_loader2 = DataLoader(train_dataset2, batch_size=batch_size, shuffle=True)
test_loader2 = DataLoader(test_dataset2, batch_size=batch_size)

## Task 4 (b)

* Here, we train a feedforward neural network on sentence embeddings obtained by concatenating the first 10 word embeddings and applying padding if the sentence is smaller than 10 words.
* We use a ReLU activation layer between the linear layers.
* We train the neural network using AdamW optimizer with 1e-4 learning rate and 1e-4 weight decay for 30 epochs with batch size 256.
* Upon re-running the code several times, we observe the accuracy on test set for this model is roughly between **75.5% - 76.6%**.

In [34]:
class NeuralNetwork2(nn.Module):
    def __init__(self):
        super(NeuralNetwork2, self).__init__()
        self.embedding_dim = 3000
        self.hidden1 = 50
        self.hidden2 = 5
        self.out_dim = 2
        
        self.linear = nn.Sequential(
            nn.Linear(self.embedding_dim, self.hidden1),
            nn.ReLU(),
            nn.Linear(self.hidden1, self.hidden2),
            nn.ReLU(),
            nn.Linear(self.hidden2, self.out_dim),
        )
        
    def forward(self, x):
        x = self.linear(x)
        
        return x
    
model2 = NeuralNetwork2()
model2.to(device)
print(model2)

NeuralNetwork2(
  (linear): Sequential(
    (0): Linear(in_features=3000, out_features=50, bias=True)
    (1): ReLU()
    (2): Linear(in_features=50, out_features=5, bias=True)
    (3): ReLU()
    (4): Linear(in_features=5, out_features=2, bias=True)
  )
)


In [35]:
optimizer = torch.optim.AdamW(model2.parameters(), lr=1e-4, weight_decay=1e-4)

In [36]:
train(model2, train_loader2, test_loader2, criterion=criterion, optimizer=optimizer, n_epochs=30, device=device)

Epoch: 1 / 30	Training Loss: 194.17466282844543	Test Loss: 43.72030174732208
Epoch: 2 / 30	Training Loss: 165.8945385813713	Test Loss: 41.193147748708725
Epoch: 3 / 30	Training Loss: 159.00936275720596	Test Loss: 40.39856415987015
Epoch: 4 / 30	Training Loss: 155.71505045890808	Test Loss: 39.918146044015884
Epoch: 5 / 30	Training Loss: 153.38784858584404	Test Loss: 39.65698781609535
Epoch: 6 / 30	Training Loss: 151.5978156030178	Test Loss: 39.41517451405525
Epoch: 7 / 30	Training Loss: 149.9291484951973	Test Loss: 39.31141784787178
Epoch: 8 / 30	Training Loss: 148.38147443532944	Test Loss: 39.11142221093178
Epoch: 9 / 30	Training Loss: 146.90436762571335	Test Loss: 39.007114231586456
Epoch: 10 / 30	Training Loss: 145.52796256542206	Test Loss: 38.82473134994507
Epoch: 11 / 30	Training Loss: 144.09228175878525	Test Loss: 38.75255364179611
Epoch: 12 / 30	Training Loss: 142.6417380273342	Test Loss: 38.639875173568726
Epoch: 13 / 30	Training Loss: 141.23964083194733	Test Loss: 38.5982598364

In [37]:
part_4b_accuracy = accuracy(model2, test_loader2, device=device)
print(part_4b_accuracy)

76.29


# Task 5: Recurrent Neural Networks

In [38]:
def create_block_embeddings(sentence, w2v, max_len=10):
    tokens = nltk.word_tokenize(sentence)
    vec_size = w2v.vector_size
    
    embedding = np.zeros((max_len, vec_size), dtype=np.float32)
    for i, word in enumerate(tokens):
        if i >= max_len:
            break
            
        if word in w2v:
            embedding[i] = w2v[word]
                    
    return embedding

sentence = "This is a sample sentence"
sample_embedding = create_block_embeddings(sentence, pretrained_w2v)

In [39]:
block_embeddings = df_new['review_body'].apply(lambda x: create_block_embeddings(x, pretrained_w2v))
block_embeddings = np.array(block_embeddings.tolist())

In [40]:
X_train_block, X_test_block, Y_train_block, Y_test_block = train_test_split(
    block_embeddings,
    targets,
    shuffle=True,
    test_size=0.2,
    random_state=42
)

Y_train_block = np.array(Y_train_block.tolist())
Y_test_block = np.array(Y_test_block.tolist())

In [41]:
X_train_block = torch.tensor(X_train_block, dtype=torch.float32)
X_test_block = torch.tensor(X_test_block, dtype=torch.float32)

Y_train_block = torch.tensor(Y_train_block, dtype=torch.int64)
Y_test_block = torch.tensor(Y_test_block, dtype=torch.int64)

train_dataset3 = TensorDataset(X_train_block, Y_train_block)
test_dataset3 = TensorDataset(X_test_block, Y_test_block)

batch_size = 256
train_loader3 = DataLoader(train_dataset3, batch_size=batch_size, shuffle=True)
test_loader3 = DataLoader(test_dataset3, batch_size=batch_size)

## Task 5 (a)
* For this task, we train a RNN on sentence embeddings obtained by stacking the first 10 word embeddings and applying padding if the sentence is smaller than 10 words.
* We train the neural network using AdamW optimizer with 1e-3 learning rate with 1e-6 weight decay for 100 epochs with batch size 256.
* Upon re-running the code several times, we observe the accuracy on test set for this model is roughly around **77.7% - 78.5%**.

In [42]:
class RNNClf(nn.Module):
    def __init__(self):
        super(RNNClf, self).__init__()
        self.embed_size = 300
        self.hidden_size = 10
        self.out_size = 2
        
        self.rnn = nn.RNN(input_size=self.embed_size, hidden_size=self.hidden_size, batch_first=True)
        self.linear = nn.Linear(self.hidden_size, self.out_size)
        
    def forward(self, x):
        x, hidden = self.rnn(x)
        x = self.linear(x[:, -1, :])

        return x
    
rnnModel = RNNClf()
rnnModel.to(device)
print(rnnModel)

RNNClf(
  (rnn): RNN(300, 10, batch_first=True)
  (linear): Linear(in_features=10, out_features=2, bias=True)
)


In [43]:
optimizer = torch.optim.AdamW(rnnModel.parameters(), lr=1e-3, weight_decay=1e-6)

In [44]:
train(rnnModel, train_loader3, test_loader3, criterion, optimizer, n_epochs=100, device=device)

Epoch: 1 / 100	Training Loss: 189.24623772501945	Test Loss: 42.86137253046036
Epoch: 2 / 100	Training Loss: 165.2519319653511	Test Loss: 41.2786665558815
Epoch: 3 / 100	Training Loss: 160.46886545419693	Test Loss: 40.46919998526573
Epoch: 4 / 100	Training Loss: 158.11708521842957	Test Loss: 40.11415392160416
Epoch: 5 / 100	Training Loss: 156.1327583193779	Test Loss: 39.94943976402283
Epoch: 6 / 100	Training Loss: 154.3838656246662	Test Loss: 39.207602590322495
Epoch: 7 / 100	Training Loss: 153.1704162955284	Test Loss: 39.585414320230484
Epoch: 8 / 100	Training Loss: 151.9054266512394	Test Loss: 38.63105762004852
Epoch: 9 / 100	Training Loss: 150.69831427931786	Test Loss: 38.46751955151558
Epoch: 10 / 100	Training Loss: 149.9881165921688	Test Loss: 38.16771939396858
Epoch: 11 / 100	Training Loss: 149.20873829722404	Test Loss: 38.12427684664726
Epoch: 12 / 100	Training Loss: 148.06970876455307	Test Loss: 37.88624459505081
Epoch: 13 / 100	Training Loss: 147.25514441728592	Test Loss: 37.83

In [45]:
rnn_accuracy = accuracy(rnnModel, test_loader3, device)

print(rnn_accuracy)

77.955


## Task 5(b): GRU

* Here, we train a GRU on sentence embeddings obtained by stacking the first 10 word embeddings and applying padding if the sentence is smaller than 10 words.
* We train the neural network using AdamW optimizer with 1e-3 learning rate with 1e-2 weight decay for 30 epochs with batch size 256.
* Upon re-running the code several times, we observe the accuracy on test set for this model is roughly around **79.1% - 79.7%**.

In [46]:
class GRUClf(nn.Module):
    def __init__(self):
        super(GRUClf, self).__init__()
        self.embed_size = 300
    
        self.hidden_size = 10
        self.out_size = 2
        
        self.gru = nn.GRU(input_size=self.embed_size, hidden_size=self.hidden_size, batch_first=True)
        self.linear = nn.Linear(self.hidden_size, self.out_size)
        
    def forward(self, x):
        x, hidden = self.gru(x)
        x = self.linear(x[:, -1, :])
        
        return x
    
gruModel = GRUClf()
gruModel.to(device)
print(gruModel)

GRUClf(
  (gru): GRU(300, 10, batch_first=True)
  (linear): Linear(in_features=10, out_features=2, bias=True)
)


In [47]:
optimizer = torch.optim.AdamW(gruModel.parameters(), lr=1e-3, weight_decay=1e-2)

In [48]:
train(gruModel, train_loader3, test_loader3, criterion, optimizer, n_epochs=30, device=device)

Epoch: 1 / 30	Training Loss: 181.33681312203407	Test Loss: 40.29534995555878
Epoch: 2 / 30	Training Loss: 152.9514371752739	Test Loss: 38.26805377006531
Epoch: 3 / 30	Training Loss: 146.8739361166954	Test Loss: 37.00517484545708
Epoch: 4 / 30	Training Loss: 143.26274248957634	Test Loss: 36.36405465006828
Epoch: 5 / 30	Training Loss: 141.10489463806152	Test Loss: 35.926189661026
Epoch: 6 / 30	Training Loss: 139.08646640181541	Test Loss: 35.67848202586174
Epoch: 7 / 30	Training Loss: 137.7913582623005	Test Loss: 35.40585646033287
Epoch: 8 / 30	Training Loss: 136.35473904013634	Test Loss: 35.08504235744476
Epoch: 9 / 30	Training Loss: 135.20484054088593	Test Loss: 34.93747437000275
Epoch: 10 / 30	Training Loss: 134.1550863981247	Test Loss: 34.81758573651314
Epoch: 11 / 30	Training Loss: 133.29672893881798	Test Loss: 34.710423558950424
Epoch: 12 / 30	Training Loss: 132.54534032940865	Test Loss: 34.62228259444237
Epoch: 13 / 30	Training Loss: 131.5119285285473	Test Loss: 34.5435888171196
Ep

In [49]:
gru_accuracy = accuracy(gruModel, test_loader3, device)

print(gru_accuracy)

79.275


## Task 5(c): LSTM

* Here, we train a LSTM on sentence embeddings obtained by stacking the first 10 word embeddings and applying padding if the sentence is smaller than 10 words.
* We train the neural network using AdamW optimizer with 1e-3 learning rate and 1e-2 weight decay for 30 epochs with batch size 256.
* A relu activation layer between the lstm cell and the linear layer is applied. Also, the output is passed through a tanh layer before being returned by the model.
* Upon re-running the code several times, we observe the accuracy on test set for this model is roughly between **79.2% - 79.8%**.

In [50]:
class LSTMClf(nn.Module):
    def __init__(self):
        super(LSTMClf, self).__init__()
        self.embed_size = 300
        self.hidden_size = 10
        self.out_size = 2
        
        self.lstm = nn.LSTM(input_size=self.embed_size, hidden_size=self.hidden_size, batch_first=True)
        self.relu = nn.ReLU()
        self.linear = nn.Linear(self.hidden_size, self.out_size)
        self.tanh = nn.Tanh()
        
        
    def forward(self, x):
        x, (hidden, cell) = self.lstm(x)
        x = self.relu(x)
        x = self.linear(x[:, -1, :])
        x = self.tanh(x)
        
        return x
    
lstmModel = LSTMClf()
lstmModel.to(device)
print(lstmModel)

LSTMClf(
  (lstm): LSTM(300, 10, batch_first=True)
  (relu): ReLU()
  (linear): Linear(in_features=10, out_features=2, bias=True)
  (tanh): Tanh()
)


In [51]:
optimizer = torch.optim.AdamW(lstmModel.parameters(), lr=1e-3, weight_decay=1e-2)

In [52]:
train(lstmModel, train_loader3, test_loader3, criterion, optimizer, n_epochs=30, device=device)

Epoch: 1 / 30	Training Loss: 188.53236678242683	Test Loss: 42.60245108604431
Epoch: 2 / 30	Training Loss: 161.78118962049484	Test Loss: 40.11137869954109
Epoch: 3 / 30	Training Loss: 154.7561023235321	Test Loss: 39.04612699151039
Epoch: 4 / 30	Training Loss: 150.59949985146523	Test Loss: 38.261914163827896
Epoch: 5 / 30	Training Loss: 147.96168661117554	Test Loss: 37.8667289018631
Epoch: 6 / 30	Training Loss: 145.9180660545826	Test Loss: 37.60482919216156
Epoch: 7 / 30	Training Loss: 144.42314419150352	Test Loss: 37.16939628124237
Epoch: 8 / 30	Training Loss: 142.6743516921997	Test Loss: 36.983817517757416
Epoch: 9 / 30	Training Loss: 141.31478962302208	Test Loss: 36.718128740787506
Epoch: 10 / 30	Training Loss: 140.29806298017502	Test Loss: 36.9502349793911
Epoch: 11 / 30	Training Loss: 139.1070382297039	Test Loss: 36.48337149620056
Epoch: 12 / 30	Training Loss: 138.49619647860527	Test Loss: 36.24354547262192
Epoch: 13 / 30	Training Loss: 137.58472111821175	Test Loss: 36.2356537282466

In [53]:
lstm_accuracy = accuracy(lstmModel, test_loader3, device)

print(lstm_accuracy) 

79.56
