<a href="https://colab.research.google.com/github/AchrafAsh/ml_projects/blob/main/nlp/rnn_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

In [3]:
import os
import pandas as pd
import spacy  # for tokenization

from torch.nn.utils.rnn import pad_sequence  # padding of every batch
from torch.utils.data import Dataset, DataLoader

spacy_eng = spacy.load("en_core_web_sm")


class Vocabulary:
    def __init__(self, freq_threshold):
        self.freq_threshold = freq_threshold
        self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        self.stoi = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}

    def __len__(self): return len(self.itos)

    @staticmethod
    def tokenizer_en(text):
        return [tok.text.lower() for tok in spacy_eng.tokenizer(text)]

    def build_vocab(self, sentences):
        frequencies = {}  # store the frequency of each word encountered
        idx = 4  # 0, 1, 2 and 3 are already set

        for sentence in sentences:
            for word in self.tokenizer_en(sentence):
                if word not in frequencies:
                    frequencies[word] = 1
                else:
                    frequencies[word] += 1

                if frequencies[word] == self.freq_threshold:
                    self.stoi[word] = idx
                    self.itos[idx] = word
                    idx += 1
        return

    def toint(self, word):
        if word in self.stoi:
            return self.stoi[word]
        else:
            return self.stoi["<UNK>"]

    def __getitem__(self, idx):
        if idx in self.itos:
            return self.itos[idx]
        else:
            return "<UNK>"

    def encode(self, sentence):
        tokens = self.tokenizer_en(sentence)
        return [
            self.stoi[token] if token in self.stoi else self.stoi["<UNK>"]
            for token in tokens
        ]


class SentimentDataset(Dataset):
    def __init__(self, root_dir, filename, freq_threshold=1):
        self.root_dir = root_dir
        self.df = pd.read_csv(os.path.join(root_dir, filename))

        self.sentiments = self.df["sentiment"]
        self.texts = self.df["text"]

        self.vocab = Vocabulary(freq_threshold)
        self.vocab.build_vocab(self.texts.tolist())

        self.vocab_size = len(self.vocab)

    def __len__(self): return len(self.df)

    def one_hot_tensor(self, idx):
        tensor = np.zeros(self.vocab_size)
        tensor[idx] = 1
        return tensor

    def __getitem__(self, idx):
        encoded_text = [self.one_hot_tensor(self.vocab.stoi["<SOS>"])]
        encoded_text += [self.one_hot_tensor(encoded_token)
                         for encoded_token in self.vocab.encode(self.texts[idx])]
        encoded_text.append(self.one_hot_tensor(self.vocab.stoi["<EOS>"]))

        return {"text": torch.tensor(encoded_text).float(), "sentiment": torch.tensor(self.sentiments[idx])}


class CollateBatch:
    def __init__(self, pad_idx):
        self.pad_idx = pad_idx

    def __call__(self, batch):
        sentiments = [item["sentiment"] for item in batch]
        texts = [item["text"] for item in batch]
        texts = pad_sequence(texts, batch_first=False,
                             padding_value=self.pad_idx)

        return texts, torch.tensor(sentiments)


def get_loader(root_dir, filename, batch_size=10, num_workers=1, shuffle=True, pin_memory=True):
    dataset = SentimentDataset(root_dir, filename)
    pad_idx = dataset.vocab.stoi["<PAD>"]

    loader = DataLoader(dataset=dataset, batch_size=batch_size, num_workers=num_workers,
                        shuffle=shuffle, pin_memory=pin_memory, collate_fn=CollateBatch(pad_idx=pad_idx))
    return loader, dataset

In [13]:
class RNNCell(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(RNNCell, self).__init__()
        self.hidden_size = hidden_size
        self.whh = nn.Linear(hidden_size, hidden_size)
        self.wxh = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
    
    def forward(self, x, hidden_state): # x (batch_size, input_size)
        return self.relu(self.whh(hidden_state) + self.wxh(x))

In [25]:
class SentimentClassifier(nn.Module):
    def __init__(self, input_size=1, hidden_size=64, output_size=2):
        super(SentimentClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.rnn1 = RNNCell(input_size, hidden_size)
        self.rnn2 = RNNCell(hidden_size, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)
#         self.whh = nn.Linear(hidden_size, hidden_size)
#         self.wxh = nn.Linear(input_size, hidden_size)
        
    
    def init_hidden_state(self, batch_size):
        return torch.zeros(batch_size, self.hidden_size)
    
    def forward(self, x, hidden_state): # x (seq_len, batch_size, input_size)
        h1 = hidden_state
        h2 = hidden_state

        for i in range(x.shape[0]):
            h1 = self.rnn1(x[i], h1)
            h2 = self.rnn2(h1, h2)
        
        output = self.fc(h2)
        return output, hidden_state
    
    def fit(self, dataset, batch_size, epochs, lr=0.001):
        optimizer = torch.optim.SGD(self.parameters(), lr=lr)
        criterion = torch.nn.CrossEntropyLoss() # ignore_index=pad_idx?
        
        for epoch in range(epochs):
            total_loss = 0
            for idx, (texts, sentiments) in enumerate(dataset):
                hidden_state = self.init_hidden_state(batch_size=batch_size)
                
                # forward
                for i in range(texts.shape[0]):
                    output, hidden_state = self.forward(texts[i], hidden_state)
                
                loss = criterion(output, sentiments)
                
                # backward
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.parameters(), max_norm=1)
                
                # gradient descent or Adam step
                optimizer.step()
                
                total_loss += loss
            
            if (epoch % 10 == 0): print(f"epoch [{epoch+1} / {epochs}] | total loss: {total_loss}")

In [19]:
BATCH_SIZE=1
HIDDEN_SIZE=124
OUTPUT_SIZE=2
LR=0.001
NUM_EPOCHS=200

In [26]:
# LOADING THE DATA
dataloader,dataset = get_loader(".", "small_sentiments.csv", batch_size=BATCH_SIZE)

In [27]:
classifier = SentimentClassifier(input_size=dataset.vocab_size, hidden_size=HIDDEN_SIZE,
                                output_size=OUTPUT_SIZE)

In [28]:
# TRAINING
classifier.fit(dataloader, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE, lr=LR)

epoch [1 / 200] | total loss: 2.7797744274139404
epoch [11 / 200] | total loss: 2.7797744274139404
epoch [21 / 200] | total loss: 2.7797744274139404
epoch [31 / 200] | total loss: 2.779665946960449
epoch [41 / 200] | total loss: 2.7793493270874023
epoch [51 / 200] | total loss: 2.7797741889953613
epoch [61 / 200] | total loss: 2.7794530391693115
epoch [71 / 200] | total loss: 2.7795591354370117
epoch [81 / 200] | total loss: 2.7794530391693115
epoch [91 / 200] | total loss: 2.779665470123291
epoch [101 / 200] | total loss: 2.779665231704712
epoch [111 / 200] | total loss: 2.7795591354370117
epoch [121 / 200] | total loss: 2.7795588970184326
epoch [131 / 200] | total loss: 2.7794528007507324
epoch [141 / 200] | total loss: 2.7793490886688232
epoch [151 / 200] | total loss: 2.779348850250244
epoch [161 / 200] | total loss: 2.779348850250244
epoch [171 / 200] | total loss: 2.7794525623321533
epoch [181 / 200] | total loss: 2.779773473739624
epoch [191 / 200] | total loss: 2.77977323532104

In [29]:
def get_sentiment(classifier, vocab, sentence):
    vocab_size = len(vocab)
    
    def one_hot_tensor(idx):
        tensor = [0] * vocab_size
        tensor[idx] = 1
        return tensor
    
    encoded_text = []
    encoded_text.append([one_hot_tensor(vocab.stoi["<SOS>"])])
    encoded_text += [[one_hot_tensor(encoded_token)]
                     for encoded_token in vocab.encode(sentence)]
    encoded_text.append([one_hot_tensor(vocab.stoi["<EOS>"])])

    encoded_text = torch.tensor(encoded_text).float()
    print(encoded_text.shape)
    h0 = classifier.init_hidden_state(batch_size=1)

    for i in range(encoded_text.shape[0]):
        output, h0 = classifier(encoded_text[i], h0)
    return output

In [31]:
classifier.eval()
get_sentiment(classifier, dataset.vocab, "I hate you")

torch.Size([5, 1, 17])


tensor([[-0.0996, -0.0016]], grad_fn=<AddmmBackward>)