# Assignment 5

Build CNN model for sentiment analysis (binary classification) of IMDB Reviews (https://www.kaggle.com/utathya/imdb-review-dataset). You can use data with label="unsup" for pretraining of embeddings. Here you are forbidden to use test dataset for pretraining of embeddings.
Your quality metric is accuracy score on test dataset. Look at "type" column for train/test split.
You can use pretrained embeddings from external sources.
You have to provide data for trials with different hyperparameter values.

You have to beat following baselines:
[3 points] acc = 0.75
[5 points] acc = 0.8
[8 points] acc = 0.9

[2 points] for using unsupervised data

In [39]:
import pandas as pd

import re

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
en_stopwords = stopwords.words('english')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

from collections import Counter
import itertools

import torch as tt
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from torch.optim import Adam

from sklearn import model_selection 
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [40]:
tt.cuda.is_available()

True

In [6]:
from google.colab import files
uploaded = files.upload()

Saving imdb-review-dataset.zip to imdb-review-dataset.zip


In [7]:
!unzip imdb-review-dataset.zip

Archive:  imdb-review-dataset.zip
  inflating: imdb_master.csv         


In [41]:
data = pd.read_csv('imdb_master.csv', encoding='latin-1')
data = data.drop('Unnamed: 0', axis = 1)
data

Unnamed: 0,type,review,label,file
0,test,Once again Mr. Costner has dragged out a movie...,neg,0_2.txt
1,test,This is an example of why the majority of acti...,neg,10000_4.txt
2,test,"First of all I hate those moronic rappers, who...",neg,10001_1.txt
3,test,Not even the Beatles could write songs everyon...,neg,10002_3.txt
4,test,Brass pictures (movies is not a fitting word f...,neg,10003_3.txt
...,...,...,...,...
99995,train,"Delightfully awful! Made by David Giancola, a ...",unsup,9998_0.txt
99996,train,"Watching Time Chasers, it obvious that it was ...",unsup,9999_0.txt
99997,train,At the beginning we can see members of Troma t...,unsup,999_0.txt
99998,train,"The movie was incredible, ever since I saw it ...",unsup,99_0.txt


In [0]:
clean_data = data[(data['type'] == 'train') & (data['label'] != 'unsup')]
withunsup_data  =   data[(data['type'] == 'train') ]
test_data = data[(data['type'] == 'test')]
train_data, val_data = model_selection.train_test_split(clean_data, test_size=0.05, stratify=clean_data.label)

In [0]:
class InputFeatures(object): 
    def __init__(self, input_ids, label_id):
        self.input_ids = input_ids
        self.label_id = label_id

In [0]:
class Vocab:
    def __init__(self, itos, unk_index):
        self._itos = itos
        self._stoi = {word:i for i, word in enumerate(itos)}
        self._unk_index = unk_index
        
    def __len__(self):
        return len(self._itos)
    
    def word2id(self, word):
        idx = self._stoi.get(word)
        if idx is not None:
            return idx
        return self._unk_index
    
    def id2word(self, idx):
        return self._itos[idx]

In [0]:
class PreProc():
    def __init__(self, max_vocab_size):
        self.special_words = ['<PAD>', '</UNK>', '<S>', '</S>']
        self.unk_index = 1
        self.pad_index = 0
        self.vocab = None
        self.max_vocab_size = max_vocab_size
        
    def tokenize(self, text):
        text = re.sub(r'[^\w\s]','',text, re.UNICODE)
        text = text.lower()
        text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
        text = [lemmatizer.lemmatize(token, "v") for token in text]
        text = [word for word in text if word not in en_stopwords]
        return text
        
    def build_vocab(self, tokens):
        itos = []
        itos.extend(self.special_words)
        
        token_counts = Counter(tokens)
        for word, _ in token_counts.most_common(self.max_vocab_size - len(self.special_words)):
            itos.append(word)
            
        self.vocab = Vocab(itos, self.unk_index)
    
    def transform(self, texts):
        result = []
        for text in texts:
            tokens = ['<S>'] + self.tokenize(text) + ['</S>']
            ids = [self.vocab.word2id(token) for token in tokens]
            result.append(ids)
        return result
    
    def fit_transform(self, texts):
        result = []
        tokenized_texts = [self.tokenize(text) for text in texts]
        self.build_vocab(itertools.chain(*tokenized_texts))

        for tokens in tokenized_texts:
            tokens = ['<S>'] + tokens + ['</S>']
            ids = [self.vocab.word2id(token) for token in tokens]
            result.append(ids)

        return result

In [0]:
def build_features(token_ids, label, max_seq_len, pad_index, label_encoding):
    if len(token_ids) >= max_seq_len:
        ids = token_ids[:max_seq_len]
    else:
        ids = token_ids + [pad_index for _ in range(max_seq_len - len(token_ids))]
    return InputFeatures(ids, label_encoding[label])


def features_to_tensor(features):
    text_tensor = tt.tensor([example.input_ids for example in features], dtype=tt.long)
    labels_tensor = tt.tensor([example.label_id for example in features], dtype=tt.long)
    return text_tensor, labels_tensor

In [0]:
max_seq_len=200
classes = {'neg': 0, 'pos' : 1}
text2id = PreProc(10000)

train_ids = text2id.fit_transform(train_data['review'])
val_ids = text2id.transform(val_data['review'])
test_ids = text2id.transform(test_data['review'])

In [0]:
train_features = [build_features(token_ids, label,max_seq_len, text2id.pad_index, classes) 
                  for token_ids, label in zip(train_ids, train_data['label'])]

val_features = [build_features(token_ids, label,max_seq_len, text2id.pad_index, classes) 
                  for token_ids, label in zip(val_ids, val_data['label'])]

test_features = [build_features(token_ids, label,max_seq_len, text2id.pad_index, classes) 
                  for token_ids, label in zip(test_ids, test_data['label'])]

In [0]:
batch_size = 64

In [0]:
train_tensor, train_labels = features_to_tensor(train_features)
val_tensor, val_labels = features_to_tensor(val_features)
test_tensor, test_labels = features_to_tensor(test_features)

train_dataset = TensorDataset(train_tensor, train_labels)
val_dataset = TensorDataset(val_tensor, val_labels)
test_dataset = TensorDataset(test_tensor, test_labels)

train_loader = DataLoader(train_dataset, batch_size = batch_size)
val_loader = DataLoader(val_dataset, batch_size = batch_size)
test_loader = DataLoader(test_dataset, batch_size = batch_size)

In [0]:
class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        self.preproc = nn.Sequential(
            nn.Embedding(10000,50)
        )
        self.hidden = nn.Sequential(
            nn.Conv1d(in_channels=50, out_channels=60, kernel_size=3), 
            nn.ReLU(), nn.MaxPool1d(3,stride=2),      
            nn.Conv1d(in_channels=60, out_channels=100, kernel_size=3), 
            nn.ReLU(), nn.MaxPool1d(3, stride=2))
        
        self.output = nn.Sequential(
            nn.Linear(4700,1),
            nn.Sigmoid()
        )
    def forward(self, x):
        batch = x.size(0)
        x = self.preproc(x)
        x = x.transpose(2,1)
        
        y = self.hidden(x).view(batch, -1)
        return  self.output(y)

In [0]:
def fit(net,criterion,train_loader,val_loader,optimizer, epochs):
    best=0
    net.cuda()

    for i in range(epochs):
        tr_loss = 0
        val_loss = 0
        val_accuracy =0

        for xx,yy in train_loader:
            xx, yy = xx.cuda(), yy.cuda()
            optimizer.zero_grad()
            y = net.forward(xx)
            loss = criterion(y,yy.float().view(len(yy),-1))
            tr_loss += loss
            loss.backward()
            optimizer.step()

        tr_loss /= len(train_loader)
    
        with tt.no_grad():
            for xx,yy in val_loader:
    
                all_preds = []
                xx, yy = xx.cuda(), yy.cuda()
                y = net.forward(xx)
                loss = criterion(y,yy.float().view(len(yy),-1))
                val_loss += loss
  
                for index in y:
                    if index>0.5:
                        all_preds.append(1)
                    else:
                        all_preds.append(0)
      
                yy = yy.cpu().numpy()
                val_accuracy += accuracy_score(all_preds,yy)

            val_accuracy /= len(val_loader)
            if val_accuracy>best:
                best = val_accuracy
                tt.save(net.state_dict(), "../model.py")

        print((f"epoch: {i}, train: {tr_loss.item()}, val: {val_accuracy.item()}"))
    net.cpu()

In [60]:
model = MyModel()
criterion = nn.BCELoss()
optimizer = Adam(model.parameters(), lr=0.001)
fit(model,criterion,train_loader,val_loader,optimizer,70)

epoch: 0, train: 0.5845698118209839, val: 0.7923253676470587
epoch: 1, train: 0.37915271520614624, val: 0.8358915441176471
epoch: 2, train: 0.2965511381626129, val: 0.8522977941176471
epoch: 3, train: 0.2357231080532074, val: 0.8568933823529411
epoch: 4, train: 0.18305544555187225, val: 0.8591452205882353
epoch: 5, train: 0.1462704837322235, val: 0.8574908088235293
epoch: 6, train: 0.14867553114891052, val: 0.8459558823529412
epoch: 7, train: 0.112944595515728, val: 0.8489889705882353
epoch: 8, train: 0.07815824449062347, val: 0.8551470588235294
epoch: 9, train: 0.05115180462598801, val: 0.8489889705882353
epoch: 10, train: 0.04412904754281044, val: 0.8496783088235293
epoch: 11, train: 0.05954860523343086, val: 0.8465533088235293
epoch: 12, train: 0.04992455244064331, val: 0.8512408088235294
epoch: 13, train: 0.03099685162305832, val: 0.8512408088235294
epoch: 14, train: 0.026197992265224457, val: 0.8225183823529412
epoch: 15, train: 0.010923964902758598, val: 0.8301470588235293
epoch:

In [0]:
y_pred = []
y_true = []
with tt.no_grad():
    model.eval()
    for texts, labels in test_loader:
        model.cuda()
        texts = texts.cuda()
        output = model.forward(texts)
        for i in output:
            if i>0.5:
                y_pred.append(1)
            else:
                y_pred.append(0)
        y_true.extend(labels.tolist())

In [64]:
accuracy_score(y_pred, y_true)

0.8396