# Importing

In [1]:
import numpy as np
import pandas as pd
import os
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader,Dataset
import torch.nn as nn
import re
import matplotlib.pyplot as plt
import spacy
from collections import Counter
from sklearn.metrics import accuracy_score
from tqdm.notebook import tqdm
spacy_eng = spacy.load("en_core_web_sm")
nltk.download('omw-1.4')
nltk.download('stopwords')

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Loading data

In [2]:
pos_train = '/kaggle/input/aclimdb/aclImdb/train/pos'
pos_df = pd.DataFrame(columns=['text', 'score'])
for idx, file in enumerate(os.listdir(pos_train)):
    file_path = os.path.join(pos_train, file)
    with open(file_path, 'r', errors="ignore") as r:
        text = r.read()
        pos_df.loc[idx, 'text'] = text
        pos_df.loc[idx, 'score'] = int(file_path[-6:-4]) if file_path[-5] == '0' else int(file_path[-5])

In [3]:
neg_train = '/kaggle/input/aclimdb/aclImdb/train/neg'
neg_df = pd.DataFrame(columns=['text', 'score'])
for idx, file in enumerate(os.listdir(neg_train)):
    file_path = os.path.join(neg_train, file)
    with open(file_path, 'r', errors="ignore") as r:
        text = r.read()
        neg_df.loc[idx, 'text'] = text
        neg_df.loc[idx, 'score'] = int(file_path[-6:-4]) if file_path[-5] == '0' else int(file_path[-5])

In [4]:
df = pd.concat([pos_df, neg_df])
df = df.sample(frac=1).reset_index(drop=True)
df.head()
len(df)

25000

In [5]:
pos_test = '/kaggle/input/aclimdb/aclImdb/test/pos'
pos_df = pd.DataFrame(columns=['text', 'score'])
for idx, file in enumerate(os.listdir(pos_test)):
    file_path = os.path.join(pos_test, file)
    with open(file_path, 'r', errors="ignore") as r:
        text = r.read()
        pos_df.loc[idx, 'text'] = text
        pos_df.loc[idx, 'score'] = int(file_path[-6:-4]) if file_path[-5] == '0' else int(file_path[-5])

In [6]:
neg_test = '/kaggle/input/aclimdb/aclImdb/test/neg'
neg_df = pd.DataFrame(columns=['text', 'score'])
for idx, file in enumerate(os.listdir(neg_test)):
    file_path = os.path.join(neg_test, file)
    with open(file_path, 'r', errors="ignore") as r:
        text = r.read()
        neg_df.loc[idx, 'text'] = text
        neg_df.loc[idx, 'score'] = int(file_path[-6:-4]) if file_path[-5] == '0' else int(file_path[-5])

In [7]:
df_test = pd.concat([pos_df, neg_df])
df_test = df_test.sample(frac=1).reset_index(drop=True)
df_test.head()
len(df_test)

25000

# Data preprocessing

In [8]:
def lemmatize_text(text):
    lemm=WordNetLemmatizer()
    text = text.split()
    text = list(map(lemm.lemmatize, text))
    return ' '.join(text)

def remove_stopwords(text):
    stop_words = stopwords.words("english")
    no_stop = []
    for word in text.split(' '):
        if word not in stop_words:
            no_stop.append(word)
    return " ".join(no_stop)

def remove_punctuation_func(text):
    return re.sub(r'[^a-zA-Z0-9]', ' ', text)

def text_preporation(text):
    text = text.lower()
    text = remove_stopwords(text)
    text = remove_punctuation_func(text)
    text = lemmatize_text(text)
    text = re.sub(r'\bbr\b', '', text)
    text = re.sub(r"\s+", " ", text)
    return text

In [9]:
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/LICENSE  
  inflating: /usr/share/nltk_data/corpora/wordnet/citation.bib  
  inflating: /usr/share/nltk_data/corpora/wordnet/noun.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/verb.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/README  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.sense  
  inflating: /usr/share/nltk_data

In [10]:
df['new_text'] = df['text'].map(text_preporation)
df['score'] = df['score'] - 1
df = df.drop(['text'], axis=1)
df.head()

Unnamed: 0,score,new_text
0,7,paul lukas played russian intellectual making ...
1,6,film like the texas chainsaw massacre suspiria...
2,0,ever hear three word uttered you joe baker afr...
3,1,believe lame pointless wa basically nothing la...
4,7,minimal budget running time eight minute great...


In [11]:
df.score.value_counts()

0    5100
9    4732
7    3009
3    2696
6    2496
2    2420
1    2284
8    2263
Name: score, dtype: int64

In [12]:
df_test['new_text'] = df_test['text'].map(text_preporation)
df_test['score'] = df_test['score'] - 1
df_test = df_test.drop(['text'], axis=1)
df_test.head()

Unnamed: 0,score,new_text
0,1,another son sam definitely oscar winner techni...
1,3,dull acting weak script worst spanish movie ye...
2,6,i ve seen movie quite time time watch it quirk...
3,1,interesting piece bruce weber s like dislike m...
4,9,first saw movie mid 80 s thought funny movie g...


In [13]:
df_test.score.value_counts()

0    5022
9    4999
7    2850
3    2635
2    2541
8    2344
6    2307
1    2302
Name: score, dtype: int64

In [14]:
class Vocabulary:
    def __init__(self,freq_threshold):
        #setting the pre-reserved tokens int to string tokens
        self.itos = {0:"<PAD>",1:"<SOS>",2:"<EOS>",3:"<UNK>"}
        
        #string to int tokens
        #its reverse dict self.itos
        self.stoi = {v:k for k,v in self.itos.items()}
        
        self.freq_threshold = freq_threshold
        
    def __len__(self): 
        return len(self.itos)
    
    @staticmethod
    def tokenize(text):
        return [token.text.lower() for token in spacy_eng.tokenizer(text)]
    
    def save_vocab(self):
        with open('itos.txt', 'w') as f:
            for key in self.itos.keys():
                f.write(f'{key}:{self.itos[key]}')
                f.write('\n')
                        
        with open('stoi.txt', 'w') as f:
            for key in self.stoi.keys():
                f.write(f'{key}:{self.stoi[key]}')
                f.write('\n')
    
    def build_vocab(self, sentence_list):
        frequencies = Counter()
        idx = 4
        
        for sentence in sentence_list:
            for word in self.tokenize(sentence):
                frequencies[word] += 1
                #add the word to the vocab if it reaches minum frequecy threshold
                if frequencies[word] == self.freq_threshold:
                    self.stoi[word] = idx
                    self.itos[idx] = word
                    idx += 1
    
    def numericalize(self,text):
        """ For each word in the text corresponding index token for that 
        word form the vocab built as list """
        tokenized_text = self.tokenize(text)
        return [ self.stoi[token] if token in self.stoi else self.stoi["<UNK>"] for token in tokenized_text ]

# Dataset


In [15]:
class MovieDataset(Dataset):
    
    def __init__(self, df, valid_df=pd.DataFrame(), freq_threshold=5):
        self.df = df
        self.freq_threshold = freq_threshold
        self.vocab = Vocabulary(freq_threshold)
        self.vocab.build_vocab(self.df['new_text'].tolist())
        if not (valid_df.empty):
            self.df = valid_df
        
        
    def __len__(self):
        return len(self.df.new_text)
    
    def __getitem__(self, idx):
        caption_vec = []
        caption_vec += [self.vocab.stoi["<SOS>"]]
        caption_vec += self.vocab.numericalize(self.df.loc[idx, 'new_text'])
        caption_vec += [self.vocab.stoi["<EOS>"]]
        return (torch.LongTensor(caption_vec), self.df.loc[idx, 'score'])

In [16]:
train_dataset = MovieDataset(df)
valid_dataset = MovieDataset(df, df_test)

In [17]:
def collate_fn(batch):
    sentences = [x[0] for x in batch]
    scores = [x[1] for x in batch]
    sentences = pad_sequence(sentences, batch_first=True)
    return sentences, torch.tensor(scores)

In [18]:
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)
valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

# Model

In [19]:
class Model(nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_size, n_layers=1, bidirectional=False, dropout=0.2):
        super().__init__()
        if bidirectional:
            self.bi = 2
        else:
            self.bi = 1
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, n_layers, batch_first=True, 
                           bidirectional=bidirectional)
        self.dropout = nn.Dropout(dropout)
        self.attention = nn.Linear(hidden_size*self.bi, 1)
        self.fc = nn.Linear(hidden_size*self.bi, 10)
    
    def forward(self, x, hidden=None):
        x = self.embedding(x)
        out, (ht1, ct1) = self.lstm(x)
        attention_weights = torch.softmax(self.attention(out), dim=1)
        attended_vectors = attention_weights * out
        context_vector = torch.sum(attended_vectors, dim=1)
        output = self.fc(context_vector)
        return output

In [20]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = Model(len(train_dataset.vocab.itos), 256, 256, 4, True, 0.4)
model.to(device)

Model(
  (embedding): Embedding(26001, 256)
  (lstm): LSTM(256, 256, num_layers=4, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0.4, inplace=False)
  (attention): Linear(in_features=512, out_features=1, bias=True)
  (fc): Linear(in_features=512, out_features=10, bias=True)
)

In [21]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Train and test model

In [22]:
n_epochs = 5
for epoch in range(n_epochs):
    total_loss = 0
    train_accuracy = 0
    valid_loss = 0
    valid_accuracy = 0 
    sentiment_train = 0
    sentiment_valid = 0
    model.train()
    print("-"*5 + f'EPOCH_{epoch}' + '-'*5)
    for sentences, scores in tqdm(train_loader):
        optimizer.zero_grad()
        sentences = sentences.to(device)
        scores = scores.to(device)
        output = model(sentences)
        
        loss = criterion(output, scores)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        
        output = output.argmax(1).cpu().detach().numpy()
        scores = scores.cpu().detach().numpy()
        accuracy = accuracy_score(output, scores)
        train_accuracy += accuracy
        
        output = list(map(lambda x: 0 if x<=3 else 1, output))
        scores = list(map(lambda x: 0 if x<=3 else 1, scores))
        accuracy = accuracy_score(output, scores)
        sentiment_train += accuracy
    model.eval()
    for sentences, scores in tqdm(valid_loader):
        sentences = sentences.to(device)
        scores = scores.to(device)
        output = model(sentences)
        loss = criterion(output, scores)
        valid_loss += loss.item()
        
        output = output.argmax(1).cpu().detach().numpy()
        scores = scores.cpu().detach().numpy()
        accuracy = accuracy_score(output, scores)
        valid_accuracy += accuracy
        
        output = list(map(lambda x: 0 if x<=3 else 1, output))
        scores = list(map(lambda x: 0 if x<=3 else 1, scores))
        accuracy = accuracy_score(output, scores)
        sentiment_valid += accuracy
        
    print(f"Train loss: {(total_loss/len(train_loader)):.3f}", 
          f"Train accuracy: {(train_accuracy/len(train_loader)):.3f}",
          f"Sentiment accuracy: {(sentiment_train/len(train_loader)):.3f}")
    print(f"Validation loss: {(valid_loss/len(valid_loader)):.3f}", 
          f"Validation accuracy: {(valid_accuracy/len(valid_loader)):.3f}",
          f"Sentiment accuracy: {(sentiment_valid/len(valid_loader)):.3f}")
        


-----EPOCH_0-----


  0%|          | 0/391 [00:00<?, ?it/s]

  0%|          | 0/782 [00:00<?, ?it/s]

Train loss: 1.801 Train accuracy: 0.309 Sentiment accuracy: 0.715
Validation loss: 1.581 Validation accuracy: 0.394 Sentiment accuracy: 0.842
-----EPOCH_1-----


  0%|          | 0/391 [00:00<?, ?it/s]

  0%|          | 0/782 [00:00<?, ?it/s]

Train loss: 1.449 Train accuracy: 0.432 Sentiment accuracy: 0.892
Validation loss: 1.516 Validation accuracy: 0.410 Sentiment accuracy: 0.872
-----EPOCH_2-----


  0%|          | 0/391 [00:00<?, ?it/s]

  0%|          | 0/782 [00:00<?, ?it/s]

Train loss: 1.228 Train accuracy: 0.509 Sentiment accuracy: 0.937
Validation loss: 1.474 Validation accuracy: 0.453 Sentiment accuracy: 0.874
-----EPOCH_3-----


  0%|          | 0/391 [00:00<?, ?it/s]

  0%|          | 0/782 [00:00<?, ?it/s]

Train loss: 1.039 Train accuracy: 0.583 Sentiment accuracy: 0.965
Validation loss: 1.660 Validation accuracy: 0.434 Sentiment accuracy: 0.869
-----EPOCH_4-----


  0%|          | 0/391 [00:00<?, ?it/s]

  0%|          | 0/782 [00:00<?, ?it/s]

Train loss: 0.847 Train accuracy: 0.658 Sentiment accuracy: 0.978
Validation loss: 1.906 Validation accuracy: 0.431 Sentiment accuracy: 0.852


In [23]:
torch.save(model.state_dict(), 'model_weights.pth')