In [765]:
import os
import numpy as np
# import torchtext as text
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import matplotlib.pyplot as plt
# from torchtext.vocab import vocab, GloVe
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from transformers import BertModel, BertTokenizer, AutoTokenizer
from datasets import load_dataset, load_dataset_builder, Dataset
torch.manual_seed(42)
from sklearn.metrics import f1_score
# glove = GloVe(name='840B', dim=300)

In [766]:
torch.cuda.is_available()

True

In [767]:
device = torch.device('cuda')

In [768]:
device

device(type='cuda')

In [769]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [770]:
def tokenize(reviews):
    max_len = 0 # we need to defin the maximum length of a review in our dataset because cnns take fixed sized inputs
    tokenized_reviews = []

    # idx = 2
    for review in reviews:
        tokenized = word_tokenize(review)
        tokenized_reviews.append(tokenized)
        max_len = max(max_len, len(tokenized))
    
    return tokenized_reviews, max_len

In [771]:
# Convert tokens to their idx and padding the reviews
def convertidx(tokenized_reviews, max_len, word2ix):
    input_ids = []
    for review in tokenized_reviews:
        review += ['<PAD>'] * (max_len - len(review))
        input_id = [word2ix.get(token, word2ix['<UNK>']) for token in review]
        input_ids.append(input_id)
    return input_ids

In [871]:
# For training and collecting results (because ran code with option 'Run all cells')
DATASET = 'SST'
f = open("results.txt", "a")
f.write(f"\n{DATASET}")
f.close()

In [872]:
FILTER_SIZES = [2, 3, 5]
NO_OF_FILTERS = [200, 200, 400]
EMB = 'F'
FREEZE = False

In [873]:
f = open("results.txt", "a")
f.write(f"\n\nEmbeddings: {EMB}")
f.write(f"\nFilter sizes: {FILTER_SIZES}")
f.write(f"\nNo of filters: {NO_OF_FILTERS}")

f.close()

## MR Dataset

In [850]:
pos_arr = []
neg_arr = []

with open(r'rt-polaritydata\rt-polarity.pos', 'rb') as f:
    pos_count = 0
    for line in f:
        pos_count += 1
        pos_arr.append(line.decode(errors='ignore').lower().strip()) # strings are in base64 format, therefore need to decode it 
    print("Positive count: ", pos_count)

with open(r'rt-polaritydata\rt-polarity.neg', 'rb') as f:
    neg_count = 0
    for line in f:
        neg_count += 1
        neg_arr.append(line.decode(errors='ignore').lower().strip()) # strings are in base64 format, therefore need to decode it 
    print("Negative count: ", neg_count)

reviews = neg_arr.copy()
reviews.extend(pos_arr)
labels = [0] * len(neg_arr)
labels.extend([1] * len(pos_arr))

reviews = np.array(reviews)
labels = np.array(labels)

print(reviews.shape, labels.shape)

from sklearn.model_selection import train_test_split
train_mr_sentences, test_mr_sentences, train_mr_labels, test_mr_labels = train_test_split(reviews, labels, test_size=0.15, random_state=20)

train_mr_sentences, val_mr_sentences, train_mr_labels, val_mr_labels = train_test_split(train_mr_sentences, train_mr_labels, test_size=0.20, random_state= 8)

tokenized_mr_train, max_mr_train_len = tokenize(train_mr_sentences)
tokenized_mr_test, max_mr_test_len = tokenize(test_mr_sentences)
tokenized_mr_val, max_mr_val_len = tokenize(val_mr_sentences)

Positive count:  5331
Negative count:  5331
(10662,) (10662,)


### word2ix for MR

In [851]:
if DATASET == 'MR':
    mr_word2ix = {'<PAD>': 0, '<UNK>': 1}
    mr_ix2word = {0: '<PAD>', 1: '<UNK>'}
    idx = 2
    count = 0
    for review in tokenized_mr_train:
        count += 1
        for token in review:
            if token not in mr_word2ix:
                mr_word2ix[token] = idx
                mr_ix2word[idx] = token
                idx += 1
    print(len(mr_word2ix))

16689


### MR input to model

In [852]:
if DATASET == 'MR':
    mr_train_input_ids = convertidx(tokenized_mr_train, max_mr_train_len, mr_word2ix)
    mr_test_input_ids = convertidx(tokenized_mr_test, max_mr_test_len, mr_word2ix)
    mr_val_input_ids = convertidx(tokenized_mr_val, max_mr_val_len, mr_word2ix)

## SST Dataset

In [874]:
if DATASET == 'SST':
    dataset = load_dataset("gpt3mix/sst2")
    print(dataset)

    sst_train = dataset['train']
    sst_test = dataset['test']
    sst_val = dataset['validation']

    train_sentences = sst_train['text']
    train_labels = sst_train['label']
    test_sentences = sst_test['text']
    test_labels = sst_test['label']
    val_sentences = sst_val['text']
    val_labels = sst_val['label']

    tokenized_sst_train, max_sst_train_len = tokenize(train_sentences)
    tokenized_sst_test, max_sst_test_len = tokenize(test_sentences)
    tokenized_sst_val, max_sst_val_len = tokenize(val_sentences)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 6920
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 872
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1821
    })
})


### word2ix for SST

In [875]:
if DATASET == 'SST':
    sst_word2ix = {'<PAD>': 0, '<UNK>': 1}
    sst_ix2word = {0: '<PAD>', 1: '<UNK>'}
    idx = 2
    count = 0
    for sentence in tokenized_sst_train:
        count += 1
        for token in sentence:
            if token not in sst_word2ix:
                sst_word2ix[token] = idx
                sst_ix2word[idx] = token
                idx += 1
    print(len(sst_word2ix))

16274


### SST input to model

In [876]:
if DATASET == 'SST':
    sst_train_input_ids = convertidx(tokenized_sst_train, max_sst_train_len, sst_word2ix)
    sst_test_input_ids = convertidx(tokenized_sst_test, max_sst_test_len, sst_word2ix)
    sst_val_input_ids = convertidx(tokenized_sst_val, max_sst_val_len, sst_word2ix)

## Subj Dataset

In [775]:
subj_dataset = load_dataset("SetFit/subj")

subj_train = subj_dataset['train']
subj_test = subj_dataset['test'] 

Repo card metadata block was not found. Setting CardData to empty.


In [776]:
subj_train_text = subj_train['text']
subj_train_label = subj_train['label']
subj_test_text = subj_test['text']
subj_test_label = subj_test['label']

# creating val set
from sklearn.model_selection import train_test_split
subj_train_text, subj_val_text, subj_train_label, subj_val_label = train_test_split(subj_train_text, subj_train_label, test_size=0.20, random_state=20)

tokenized_subj_train, max_subj_train_len = tokenize(subj_train_text)
tokenized_subj_test, max_subj_test_len = tokenize(subj_test_text)
tokenized_subj_val, max_subj_val_len = tokenize(subj_val_text)

### word2ix for Subj

In [777]:
if DATASET == 'SUBJ':
    subj_word2ix = {'<PAD>': 0, '<UNK>': 1}
    subj_ix2word = {0: '<PAD>', 1: '<UNK>'}
    idx = 2
    count = 0
    for sentence in tokenized_subj_train:
        count += 1
        for token in sentence:
            if token not in subj_word2ix:
                subj_word2ix[token] = idx
                subj_ix2word[idx] = token
                idx += 1
    print(len(subj_word2ix))

18013


### Subj input to model

In [778]:
if DATASET == 'SUBJ':
    subj_train_input_ids = convertidx(tokenized_subj_train, max_subj_train_len, subj_word2ix)
    subj_test_input_ids = convertidx(tokenized_subj_test, max_subj_test_len, subj_word2ix)
    subj_val_input_ids = convertidx(tokenized_subj_val, max_subj_val_len, subj_word2ix)

In [779]:
fastText_path = r"C:\Users\akash\Downloads\crawl-300d-2M\crawl-300d-2M.vec"
gloVe_path = r".vector_cache\glove.840B.300d.txt"

## Creating embedding list (GloVe, fastText)

In [877]:
# creating an embedding dictionary which stores the embeddings for the words in our vocabulary (to feed it to emdedding layer), we could use embeddings directly but this was the recommendedway in many articles
def get_embeddings(emb, word2ix):
    embeddings = np.random.uniform(-0.25, 0.25, (len(word2ix), 300))
    embeddings[word2ix['<PAD>']] = np.zeros((300,))

    if emb == 'glove':
        fin = open(gloVe_path, 'r', encoding='utf-8', newline='\n', errors='ignore')
    else:
        fin = open(fastText_path, 'r', encoding='utf-8', newline='\n', errors='ignore')
    count = 0
    for line in tqdm(fin):
        tokens = line.rstrip().split(' ')
        word = tokens[0]
        if word in word2ix:
            count += 1
            embeddings[word2ix[word]] = np.array(tokens[1:], dtype=np.float32)
    embeddings = torch.tensor(embeddings)
    print(len(word2ix), count)
    return embeddings

### creating embeddings list (Bert) Try

In [878]:
if DATASET == 'MR':
    mr_glove_embeddings = get_embeddings("glove", mr_word2ix)
    mr_fast_embeddings = get_embeddings("fastText", mr_word2ix)

In [879]:
if DATASET == 'SST':
    sst_glove_embeddings = get_embeddings("glove", sst_word2ix)
    sst_fast_embeddings = get_embeddings("fastText", sst_word2ix)

2196017it [00:20, 105913.28it/s]


16274 15657


1999996it [00:18, 110739.11it/s]

16274 15669





In [783]:
if DATASET == 'SUBJ':
    subj_glove_embeddings = get_embeddings("glove", subj_word2ix)
    subj_fast_embeddings = get_embeddings("fastText", subj_word2ix)

2196017it [00:20, 105761.75it/s]


18013 16796


1999996it [00:18, 110652.75it/s]

18013 16690





# Datasets and Dataloaders

In [880]:
from torch.utils.data import (TensorDataset, DataLoader)

#### For MR dataset

In [881]:
if DATASET == 'MR':
    train_X = torch.tensor(mr_train_input_ids)
    train_y = torch.tensor(train_mr_labels)
    val_X = torch.tensor(mr_val_input_ids)
    val_y = torch.tensor(val_mr_labels)
    test_X = torch.tensor(mr_test_input_ids)
    test_y = torch.tensor(test_mr_labels)


#### For SST dataset

In [882]:
if DATASET == 'SST':
    train_X = torch.tensor(sst_train_input_ids)
    train_y = torch.tensor(train_labels)
    val_X = torch.tensor(sst_val_input_ids)
    val_y = torch.tensor(val_labels)
    test_X = torch.tensor(sst_test_input_ids)
    test_y = torch.tensor(test_labels)

### For Subj dataset

In [787]:
if DATASET == 'SUBJ':
    train_X = torch.tensor(subj_train_input_ids)
    train_y = torch.tensor(subj_train_label)
    val_X = torch.tensor(subj_val_input_ids)
    val_y = torch.tensor(subj_val_label)
    test_X = torch.tensor(subj_test_input_ids)
    test_y = torch.tensor(subj_test_label)

In [883]:
train_data = TensorDataset(train_X, train_y)
train_dataLoader = DataLoader(train_data, batch_size=50, shuffle=True)
test_data = TensorDataset(test_X, test_y)
test_dataLoader = DataLoader(test_data, batch_size=50, shuffle=True)
val_data = TensorDataset(val_X, val_y)
val_dataLoader = DataLoader(val_data, batch_size=50, shuffle=True)

In [884]:
class CNN(nn.Module):
    def __init__(self, pretrained_embedding=None, freeze_embedding=False, vocab_size=None, embed_dim=300, filter_sizes=[3, 4, 5], num_filters=[100, 100, 100], num_classes=2, dropout=0.5):
        super(CNN, self).__init__()

        if pretrained_embedding is not None:
            self.vocab_size, self.embed_dim = pretrained_embedding.shape
            self.embedding = nn.Embedding.from_pretrained(pretrained_embedding, freeze=freeze_embedding)
            # print(self.embedding.shape)
        else:
            self.embed_dim = embed_dim
            self.embedding = nn.Embedding(num_embeddings=vocab_size,embedding_dim=self.embed_dim,padding_idx=0,
                                          )
        # conv1d because we will convolve in only 1 direction, row wise, just as ngrams
        self.conv1d = nn.ModuleList([
            nn.Conv1d(in_channels=self.embed_dim,
                      out_channels= num_filters[i], # number of feature maps
                      kernel_size=filter_sizes[i]) # filter size
                      for i in range(len(filter_sizes))
        ])

        self.linear = nn.Linear(np.sum(num_filters), num_classes)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input_ids):
        x_emb = self.embedding(input_ids).float() # (batch_size, max_len, embed_dim)
        # print("emb_input: ", x_emb.shape)

        # Conv1d takes word embedding dimensions as input (in_channels)
        x_emb = x_emb.permute(0, 2, 1) # (batch_size, embed_dim, max_len)
        # print("reshaped_emb_input: ", x_emb.shape)

        # x_conv_list = [F.relu(conv1d(x_emb)) for conv1d in self.conv1d]

        x_conv_list = []
        '''
        contains feature maps for all the sentences in the batch
        Therefore, for every sentence, there will be 100 feature maps of a certain filter size
        suppose, 50 sentence, the output will be (supposing using filter size 3 and total filters = 3),
        (50, 100, 60) where 60 is the dimension of feature map (after performing convolution operation)
        
        '''
        for layer in self.conv1d: # self.conv1d will have no of layers equal to the filter sizes, i.e if f_sizes = [3, 4, 5], then no of layers = 3
            out = layer(x_emb) # convolving the sentences with a certain filter size
            # print(out.shape) (batch_size, num_filters, output of that filter size after convolution)
            out = F.relu(out) # applying relu on the output
            # print(out.shape)
            x_conv_list.append(out)
            
        # print("length of conv list: ",len(x_conv_list))
        # print(len(x_conv_list[0]))
        # print(x_conv_list[0].shape)
        # print(x_conv_list[1].shape)
        # print(x_conv_list[2].shape)
        # print(x_conv_list[3].shape)
        

        x_pool_list = [F.max_pool1d(x_conv, kernel_size=x_conv.shape[2])
            for x_conv in x_conv_list]

        
        # print("length of pool list: ", len(x_pool_list))
        # print(x_pool_list[0])
        # Concatenate x_pool_list to feed the fully connected layer.
        # Output shape: (b, sum(num_filters))
        x_fc_input = torch.cat([x_pool.squeeze(dim=2) for x_pool in x_pool_list],
                         dim=1)
        
        
        # Compute logits. Output shape: (batch_size, num_classes)
        logits = self.linear(self.dropout(x_fc_input))

        return logits

In [885]:
FREEZE = True # manually run this cell to train the model without finetuning

In [886]:
if DATASET == 'MR':
    if EMB == 'G':
        embeddings = mr_glove_embeddings
    else:
        embeddings = mr_fast_embeddings
    word2ix = mr_word2ix
elif DATASET == 'SST':
    if EMB == 'G':
        embeddings = sst_glove_embeddings
    else:
        embeddings = sst_fast_embeddings
    word2ix = sst_word2ix
else:
    if EMB == 'G':
        embeddings = subj_glove_embeddings
    else:
        embeddings = subj_fast_embeddings
    word2ix = subj_word2ix

cnn_model = CNN(pretrained_embedding=None,
                        freeze_embedding=FREEZE,
                        vocab_size=len(word2ix),
                        embed_dim=300,
                        filter_sizes=FILTER_SIZES,
                        num_filters=NO_OF_FILTERS,
                        num_classes=2,
                        dropout=0.6).to(device)

In [887]:
# device = 'cpu'

In [888]:
cnn_model.train()
criterion = nn.CrossEntropyLoss()
epochs = 10
learning_rates = [0.1, 0.15, 0.2]
max_accuracy = -1
max_accuracy_lr = -1
for learning_rate in learning_rates:
    print(f"\nLEARNING RATE:", learning_rate)
    optimizer = optim.Adadelta(cnn_model.parameters(),lr=learning_rate,rho=0.95)

    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}")
        train_loss = []
        cnn_model.train()
        for reviews, labels in tqdm(train_dataLoader):
                cnn_model.zero_grad()
                outputs = cnn_model(reviews.to(device))
                labels = labels.type(torch.LongTensor).to(device)

                loss = criterion(outputs, labels)

                train_loss.append(loss.item())

                loss.backward()
                optimizer.step()

        avg_train_loss = np.mean(train_loss)
        print("Train loss: ", avg_train_loss)

        cnn_model.eval()
        val_loss = []
        val_accuracy = []
        val_f1 = []
        for reviews, labels in tqdm(val_dataLoader):
            with torch.no_grad():
                outputs = cnn_model(reviews.to(device))
            labels = labels.type(torch.LongTensor).to(device)
            loss = criterion(outputs, labels)
            val_loss.append(loss.item())

            # Get the predictions
            preds = torch.argmax(outputs, dim=1).flatten()

            # Calculate the accuracy rate
            accuracy = (preds == labels).cpu().numpy().mean() * 100
            f1 = f1_score(labels.cpu(), preds.cpu(), average='macro').mean() * 100
            val_accuracy.append(accuracy)
            val_f1.append(f1)

        # Compute the average accuracy and loss over the validation set.
        val_loss = np.mean(val_loss)
        val_accuracy = np.mean(val_accuracy)
        val_f1 = np.mean(val_f1)

        if val_f1 > max_accuracy:
             max_accuracy = val_f1
             max_accuracy_lr = learning_rate

             torch.save({
            "model_param":cnn_model.state_dict(),
            "optim_param": optimizer.state_dict(),
            "lr": learning_rate,
            "acc":max_accuracy,
            }, f"saved_model/model")

        print("Val loss: ", val_loss)
        print("Val accuracy: ", val_accuracy)
        print("Val F1 accuracy: ", val_f1)

print(f"\nMaximum val accuracy of f1: {max_accuracy} was achieved with learning rate {max_accuracy_lr}.")



LEARNING RATE: 0.1
Epoch 1


100%|██████████| 139/139 [00:01<00:00, 89.44it/s]


Train loss:  0.7625382992861082


100%|██████████| 18/18 [00:00<00:00, 358.96it/s]


Val loss:  0.6374097433355119
Val accuracy:  64.20202020202021
Val F1 accuracy:  60.872814804239056
Epoch 2


100%|██████████| 139/139 [00:01<00:00, 100.49it/s]


Train loss:  0.637005295256059


100%|██████████| 18/18 [00:00<00:00, 360.15it/s]


Val loss:  0.6002695129977332
Val accuracy:  69.53535353535354
Val F1 accuracy:  67.7404934880622
Epoch 3


100%|██████████| 139/139 [00:01<00:00, 103.01it/s]


Train loss:  0.5509159957333435


100%|██████████| 18/18 [00:00<00:00, 364.17it/s]


Val loss:  0.5916820598973168
Val accuracy:  68.14141414141415
Val F1 accuracy:  66.14223636367508
Epoch 4


100%|██████████| 139/139 [00:01<00:00, 103.73it/s]


Train loss:  0.48009997606277466


100%|██████████| 18/18 [00:00<00:00, 364.41it/s]


Val loss:  0.5681101630131403
Val accuracy:  69.69696969696969
Val F1 accuracy:  69.13549661589525
Epoch 5


100%|██████████| 139/139 [00:01<00:00, 103.07it/s]


Train loss:  0.41523735879136503


100%|██████████| 18/18 [00:00<00:00, 363.90it/s]


Val loss:  0.5698237849606408
Val accuracy:  70.4040404040404
Val F1 accuracy:  68.61849705421798
Epoch 6


100%|██████████| 139/139 [00:01<00:00, 103.73it/s]


Train loss:  0.3591656040587871


100%|██████████| 18/18 [00:00<00:00, 403.76it/s]


Val loss:  0.5718346155352063
Val accuracy:  71.54545454545455
Val F1 accuracy:  70.87238353990014
Epoch 7


100%|██████████| 139/139 [00:01<00:00, 103.65it/s]


Train loss:  0.3078277327602716


100%|██████████| 18/18 [00:00<00:00, 403.97it/s]


Val loss:  0.5472906397448646
Val accuracy:  71.86868686868688
Val F1 accuracy:  71.41536618653605
Epoch 8


100%|██████████| 139/139 [00:01<00:00, 102.98it/s]


Train loss:  0.266539671009393


100%|██████████| 18/18 [00:00<00:00, 409.04it/s]


Val loss:  0.5457764400376214
Val accuracy:  72.7878787878788
Val F1 accuracy:  72.20013430078632
Epoch 9


100%|██████████| 139/139 [00:01<00:00, 103.62it/s]


Train loss:  0.22600880720846944


100%|██████████| 18/18 [00:00<00:00, 358.97it/s]


Val loss:  0.5950714631213082
Val accuracy:  70.04040404040404
Val F1 accuracy:  68.70320216775745
Epoch 10


100%|██████████| 139/139 [00:01<00:00, 104.36it/s]


Train loss:  0.1951811856074299


100%|██████████| 18/18 [00:00<00:00, 408.90it/s]


Val loss:  0.5711164044009315
Val accuracy:  73.30303030303031
Val F1 accuracy:  72.8160223279687

LEARNING RATE: 0.15
Epoch 1


100%|██████████| 139/139 [00:01<00:00, 105.61it/s]


Train loss:  0.19349815197985806


100%|██████████| 18/18 [00:00<00:00, 408.05it/s]


Val loss:  0.6865217222107781
Val accuracy:  68.33333333333333
Val F1 accuracy:  66.43000605290183
Epoch 2


100%|██████████| 139/139 [00:01<00:00, 105.62it/s]


Train loss:  0.183614582895375


100%|██████████| 18/18 [00:00<00:00, 412.07it/s]


Val loss:  0.6277393615908093
Val accuracy:  72.1010101010101
Val F1 accuracy:  71.37254236151225
Epoch 3


100%|██████████| 139/139 [00:01<00:00, 104.69it/s]


Train loss:  0.15465222071293447


100%|██████████| 18/18 [00:00<00:00, 386.81it/s]


Val loss:  0.631396492322286
Val accuracy:  71.39393939393939
Val F1 accuracy:  70.75141352365534
Epoch 4


100%|██████████| 139/139 [00:01<00:00, 105.20it/s]


Train loss:  0.15025160161496923


100%|██████████| 18/18 [00:00<00:00, 410.50it/s]


Val loss:  0.6197957371671995
Val accuracy:  70.85858585858585
Val F1 accuracy:  70.50910833054888
Epoch 5


100%|██████████| 139/139 [00:01<00:00, 105.14it/s]


Train loss:  0.12178701872555472


100%|██████████| 18/18 [00:00<00:00, 356.12it/s]


Val loss:  0.7425930549701055
Val accuracy:  70.73737373737373
Val F1 accuracy:  69.22792176443534
Epoch 6


100%|██████████| 139/139 [00:01<00:00, 104.99it/s]


Train loss:  0.11759113228256754


100%|██████████| 18/18 [00:00<00:00, 407.02it/s]


Val loss:  0.6765238493680954
Val accuracy:  72.0909090909091
Val F1 accuracy:  71.6283318528404
Epoch 7


100%|██████████| 139/139 [00:01<00:00, 105.53it/s]


Train loss:  0.09616686435912153


100%|██████████| 18/18 [00:00<00:00, 407.77it/s]


Val loss:  0.6548079401254654
Val accuracy:  71.26262626262627
Val F1 accuracy:  70.70630315112183
Epoch 8


100%|██████████| 139/139 [00:01<00:00, 105.13it/s]


Train loss:  0.08349720490493363


100%|██████████| 18/18 [00:00<00:00, 402.83it/s]


Val loss:  0.6665197511514028
Val accuracy:  71.70707070707071
Val F1 accuracy:  71.41279112602456
Epoch 9


100%|██████████| 139/139 [00:01<00:00, 104.83it/s]


Train loss:  0.07318937949997058


100%|██████████| 18/18 [00:00<00:00, 430.28it/s]


Val loss:  0.7154515948560503
Val accuracy:  70.92929292929293
Val F1 accuracy:  70.22060740390158
Epoch 10


100%|██████████| 139/139 [00:01<00:00, 104.91it/s]


Train loss:  0.07076273267592886


100%|██████████| 18/18 [00:00<00:00, 363.40it/s]


Val loss:  0.759763091802597
Val accuracy:  72.72727272727272
Val F1 accuracy:  72.37334142894808

LEARNING RATE: 0.2
Epoch 1


100%|██████████| 139/139 [00:01<00:00, 104.34it/s]


Train loss:  0.07708192226751674


100%|██████████| 18/18 [00:00<00:00, 404.97it/s]


Val loss:  0.7180887129571702
Val accuracy:  72.48484848484848
Val F1 accuracy:  71.82377269238795
Epoch 2


100%|██████████| 139/139 [00:01<00:00, 104.46it/s]


Train loss:  0.07662307976899173


100%|██████████| 18/18 [00:00<00:00, 350.55it/s]


Val loss:  0.7700362337960137
Val accuracy:  72.29292929292929
Val F1 accuracy:  71.75188546906054
Epoch 3


100%|██████████| 139/139 [00:01<00:00, 104.90it/s]


Train loss:  0.08334481114481422


100%|██████████| 18/18 [00:00<00:00, 402.30it/s]


Val loss:  0.7459264662530687
Val accuracy:  73.32323232323232
Val F1 accuracy:  72.88826473016135
Epoch 4


100%|██████████| 139/139 [00:01<00:00, 104.76it/s]


Train loss:  0.07122433878709193


100%|██████████| 18/18 [00:00<00:00, 418.35it/s]


Val loss:  0.8020736243989732
Val accuracy:  72.15151515151516
Val F1 accuracy:  71.74314706714921
Epoch 5


100%|██████████| 139/139 [00:01<00:00, 104.71it/s]


Train loss:  0.0756750688452622


100%|██████████| 18/18 [00:00<00:00, 355.03it/s]


Val loss:  0.7772238320774503
Val accuracy:  72.23232323232324
Val F1 accuracy:  71.86600645645365
Epoch 6


100%|██████████| 139/139 [00:01<00:00, 105.04it/s]


Train loss:  0.06837263066495816


100%|██████████| 18/18 [00:00<00:00, 406.50it/s]


Val loss:  0.8356140951315562
Val accuracy:  73.07070707070707
Val F1 accuracy:  72.39860301132836
Epoch 7


100%|██████████| 139/139 [00:01<00:00, 104.88it/s]


Train loss:  0.07298124550419638


100%|██████████| 18/18 [00:00<00:00, 396.25it/s]


Val loss:  0.8782962411642075
Val accuracy:  71.84848484848484
Val F1 accuracy:  71.23608217897895
Epoch 8


100%|██████████| 139/139 [00:01<00:00, 105.17it/s]


Train loss:  0.07064691850607344


100%|██████████| 18/18 [00:00<00:00, 409.82it/s]


Val loss:  0.8279485636287265
Val accuracy:  71.56565656565657
Val F1 accuracy:  71.12721444295205
Epoch 9


100%|██████████| 139/139 [00:01<00:00, 105.09it/s]


Train loss:  0.06350174850764141


100%|██████████| 18/18 [00:00<00:00, 403.66it/s]


Val loss:  0.9040196273061964
Val accuracy:  71.83838383838383
Val F1 accuracy:  71.26954320248194
Epoch 10


100%|██████████| 139/139 [00:01<00:00, 105.10it/s]


Train loss:  0.06637013650988396


100%|██████████| 18/18 [00:00<00:00, 401.64it/s]

Val loss:  0.8537693371375402
Val accuracy:  72.37373737373737
Val F1 accuracy:  71.95967088102755

Maximum val accuracy of f1: 72.88826473016135 was achieved with learning rate 0.2.





In [889]:
model_path =  r"saved_model\model"
checkpoint = torch.load(model_path)
best_model = CNN(pretrained_embedding=None,
                        freeze_embedding=FREEZE,
                        vocab_size=len(word2ix),
                        embed_dim=300,
                        filter_sizes=FILTER_SIZES,
                        num_filters=NO_OF_FILTERS,
                        num_classes=2,
                        dropout=0.5).to(device)
optimizer = optim.Adadelta(cnn_model.parameters(),lr=checkpoint['lr'],rho=0.95)
best_model.load_state_dict(checkpoint["model_param"])
optimizer.load_state_dict(checkpoint["optim_param"])
print(f"""Best model learning rate: {checkpoint["lr"]} and accuracy on val set: {checkpoint["acc"]}""")


Best model learning rate: 0.2 and accuracy on val set: 72.88826473016135


### Test set

In [890]:
best_model.eval()
test_accuracy = []
test_f1 = []
for reviews, labels in tqdm(test_dataLoader):
    with torch.no_grad():
        outputs = best_model(reviews.to(device))
        labels = labels.type(torch.LongTensor).to(device)

        # Get the predictions
        preds = torch.argmax(outputs, dim=1).flatten()
        accuracy = (preds == labels).cpu().numpy().mean() * 100
        f1 = f1_score(labels.cpu(), preds.cpu(), average='macro').mean() * 100
        test_accuracy.append(accuracy)
        test_f1.append(f1)

test_accuracy = np.mean(test_accuracy)
test_f1 = np.mean(test_f1)

print("Accuracy on test set: ", test_accuracy)
print("F1 accuracy on test set: ", test_f1)

100%|██████████| 37/37 [00:00<00:00, 99.12it/s] 

Accuracy on test set:  73.93822393822394
F1 accuracy on test set:  73.41054700518366





In [891]:
print("Embeddings ", EMB)
print("Finetuned: ", not FREEZE)
print("Filter sizes: ", FILTER_SIZES)
print("No of filters: ", NO_OF_FILTERS)

Embeddings  F
Finetuned:  False
Filter sizes:  [2, 3, 5]
No of filters:  [200, 200, 400]


In [892]:
f = open("results.txt", "a")
f.write(f"\n\nFinetuned: {not FREEZE}")
f.write(f"\nTest accuracy: {test_accuracy}")
f.close()


### Baseline Accuracy

In [893]:
sum = torch.sum(train_y) # labels are 0 and 1 so if sum is greater than half the length of the training set, then majority label is 1
majority_label = 0
if sum > (train_y.shape[0] / 2):
    majority_label = 1

In [894]:
majority_label

0

In [895]:
majority_label_test_y = torch.full((test_y.shape[0],), majority_label)

In [896]:
majority_label_test_y.shape

torch.Size([1821])

In [897]:
baseline_dataset = TensorDataset(test_X, majority_label_test_y)
baseline_dataLoader = DataLoader(baseline_dataset, batch_size=50, shuffle=True)

In [899]:
best_model.eval()
baseline_accuracy = []
baseline_f1 = []
for reviews, labels in tqdm(baseline_dataLoader):
    with torch.no_grad():
        outputs = best_model(reviews.to(device))
        labels = labels.type(torch.LongTensor).to(device)

        # Get the predictions
        preds = torch.argmax(outputs, dim=1).flatten()
        accuracy = (preds == labels).cpu().numpy().mean() * 100
        f1 = f1_score(labels.cpu(), preds.cpu(), average='macro').mean() * 100
        baseline_accuracy.append(accuracy)
        baseline_f1.append(f1)

baseline_accuracy = np.mean(baseline_accuracy)
baseline_f1 = np.mean(baseline_f1)

print("Baseline accuracy: ", baseline_accuracy)
print("Baseline f1 acccuracy: ", baseline_f1)

In [722]:
f = open("results.txt", "a")
f.write(f"\n\nBaseline accuracy: {baseline_accuracy}")
f.close()

### Prediction

In [None]:
# text = "Regrettably, this film, despite a promising premise and commendable performances, flounders in execution. Its narrative lacks coherence, characters lack depth, and thematic resonance is absent. Visual flair and a talented cast can't salvage this disjointed endeavor, leaving audiences longing for substance and cohesion in a cinematic experience."
# tokens = word_tokenize(text.lower())
# padded_tokens = tokens + ['<PAD>'] * (max_mr_train_len - len(tokens))
# input_id = [mr_word2ix.get(token, mr_word2ix['<UNK>']) for token in padded_tokens]
# input_id = torch.tensor(input_id)
# print(input_id.shape)
# input_id = input_id.unsqueeze(dim=0)
# print(input_id.shape)
# # Compute logits
# with torch.no_grad():
#     logits = best_model.forward(input_id.to(device))
# print(logits.shape)
# print(logits)

#     #  Compute probability
# probs = F.softmax(logits, dim=1)
# print(probs.shape)
# print(probs)
# probs = probs.squeeze(dim=0)
# print(probs.shape)
# print(probs)

# if probs[1] > 0.5:
#     print(f"This review is positive.")
# else:
#     print(f"This review is negative.")

## Debugging