## Custom dataset
1. Vocabulary mapping each word to an index
2. Pytorch Dataset to load the data
3. Padding of every batch to have fixed sequence length in a given batch and setup dataloader

In [2]:
import os
import pandas as pd
import spacy # for tokenization

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence # padding of every batch
from torch.utils.data import Dataset, DataLoader

spacy_eng = spacy.load("en_core_web_sm")

In [3]:
class Vocabulary:
    def __init__(self, freq_threshold):
        self.freq_threshold = freq_threshold
        self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        self.stoi = {"<PAD>": 0, "<SOS>":1, "<EOS>":2, "<UNK>":3}
        
    def __len__(self): return len(self.itos)
    
    @staticmethod
    def tokenizer_en(text):
        return [tok.text.lower() for tok in spacy_eng.tokenizer(text)]
    
    def build_vocab(self, sentences):
        frequencies = {} # store the frequency of each word encountered
        idx = 4 # 0, 1, 2 and 3 are already set
        
        for sentence in sentences:
            for word in self.tokenizer_en(sentence):
                if word not in frequencies: frequencies[word] = 1
                else: frequencies[word] += 1

                if frequencies[word] == self.freq_threshold:
                    self.stoi[word] = idx
                    self.itos[idx] = word
                    idx += 1
        
    def stoi(self, word): 
        if word in self.stoi: return self.stoi[word]
        else: return "<UNK>"
    
    def __getitem__(self, idx): 
        if idx in self.itos: return self.itos[idx]
        else: return -1
    
    def encode(self, sentence):
        tokens = self.tokenizer_en(sentence)
        return [
            self.stoi[token] if token in self.stoi else self.stoi["<UNK>"]
            for token in tokens 
        ]

In [4]:
class SentimentDataset(Dataset):
    def __init__(self, root_dir, filename, freq_threshold=1):
        self.root_dir = root_dir
        self.df = pd.read_csv(os.path.join(root_dir,filename))
        
        self.sentiments = self.df["sentiment"]
        self.texts = self.df["text"]
        
        self.vocab = Vocabulary(freq_threshold)
        self.vocab.build_vocab(self.texts.tolist())
        
    def __len__(self): return len(self.df)
    
    def __getitem__(self, idx):
        encoded_text = [[self.vocab.stoi["<SOS>"]]]
        encoded_text += [[encoded_token] for encoded_token in self.vocab.encode(self.texts[idx])]
        encoded_text.append([self.vocab.stoi["<EOS>"]])
        
        return { "text": torch.tensor(encoded_text).float(), "sentiment": self.sentiments[idx] }

In [5]:
class CollateBatch:
    def __init__(self, pad_idx):
        self.pad_idx = pad_idx
    
    def __call__(self, batch):
        sentiments = [item["sentiment"] for item in batch]
        texts = [item["text"] for item in batch]
        texts = pad_sequence(texts, batch_first=False, padding_value=self.pad_idx)
        
        return texts, torch.tensor(sentiments)

In [6]:
def get_loader(root_dir, filename, batch_size=10, num_workers=1, shuffle=True, pin_memory=True):
    dataset = SentimentDataset(root_dir, filename)
    pad_idx = dataset.vocab.stoi["<PAD>"]
    
    loader = DataLoader(dataset=dataset, batch_size=batch_size, num_workers=num_workers, 
                        shuffle=shuffle, pin_memory=pin_memory, collate_fn=CollateBatch(pad_idx=pad_idx))
    return loader, dataset

In [34]:
class SentimentClassifier(nn.Module):
    def __init__(self, input_size=1, hidden_size=64, output_size=2):
        super(SentimentClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.wh = nn.Linear(hidden_size, hidden_size)
        self.wx = nn.Linear(input_size, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def init_hidden_state(self, batch_size):
        return torch.zeros(batch_size, self.hidden_size)
    
    def forward(self, x): # x (seq_len, batch_size, input_size)
        hidden_state = self.init_hidden_state(batch_size=x.shape[1])
        
        for i in range(x.shape[0]):
            hidden_state = torch.sigmoid(self.wh(hidden_state) + self.wx(x[i])) # (batch_size, hidden_size)
        
        return F.softmax(self.fc(hidden_state), dim=1)
    
    def overfit(self, text, sentiment, epochs, lr=0.001):
        optimizer = torch.optim.Adam(self.parameters(), lr=lr)
        criterion = torch.nn.CrossEntropyLoss() # ignore_index=pad_idx?
        
        for epoch in range(epochs):
            total_loss = 0

            # forward
            output = self.forward(text)

            loss = criterion(output, sentiment)

            # backward
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.parameters(), max_norm=1)

            # gradient descent or Adam step
            optimizer.step()

            total_loss += loss
            if (epoch % 10 == 0): print(f"epoch [{epoch+1} / {epochs}] | total loss: {total_loss}")
    
    
    def fit(self, dataset, batch_size, epochs, lr=0.001):
        optimizer = torch.optim.Adam(self.parameters(), lr=lr)
        criterion = torch.nn.CrossEntropyLoss() # ignore_index=pad_idx?
        
        for epoch in range(epochs):
            total_loss = 0
            for idx, (texts, sentiments) in enumerate(dataset):
                # forward
                output = self.forward(texts)
                
                loss = criterion(output, sentiments)
                
                # backward
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.parameters(), max_norm=1)
                
                # gradient descent or Adam step
                optimizer.step()
                
                total_loss += loss
            
            if (epoch % 10 == 0): print(f"epoch [{epoch+1} / {epochs}] | total loss: {total_loss}")
    
    def get_sentiment(self, vocab, sentence):
        encoded_text = [[vocab.stoi["<SOS>"]]]
        encoded_text += [[encoded_token] for encoded_token in vocab.encode(sentence)]
        encoded_text.append([vocab.stoi["<EOS>"]])
        encoded_text = torch.tensor(encoded_text).float()

        return classifier(encoded_text)

In [36]:
BATCH_SIZE=2
HIDDEN_SIZE=64
OUTPUT_SIZE=2
INPUT_SIZE=1
LR=0.1
NUM_EPOCHS=200

In [37]:
classifier = SentimentClassifier(input_size=INPUT_SIZE, hidden_size=HIDDEN_SIZE,
                                output_size=OUTPUT_SIZE)

In [None]:
# TRY OVERFITTING ONE SAMPLE OF DATA
text, sentiment = dataset[0].values()
sentiment = torch.tensor([sentiment])

# print the sentence
for i in range(text.shape[0]):
    print(dataset.vocab.itos[text[i].item()])

# train on this single sentence
classifier.overfit(text=text, sentiment=sentiment, epochs=NUM_EPOCHS, lr=LR)

In [30]:
# LOADING THE DATA
dataloader,dataset = get_loader("./data/", "small_sentiments.csv", batch_size=BATCH_SIZE)

In [38]:
# TRAINING
classifier.fit(dataloader, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE, lr=LR)

epoch [1 / 200] | total loss: 3.6414928436279297
epoch [11 / 200] | total loss: 3.7530465126037598
epoch [21 / 200] | total loss: 3.7530465126037598
epoch [31 / 200] | total loss: 3.753046751022339
epoch [41 / 200] | total loss: 3.753046751022339
epoch [51 / 200] | total loss: 3.7530465126037598
epoch [61 / 200] | total loss: 3.7530465126037598
epoch [71 / 200] | total loss: 3.7530465126037598
epoch [81 / 200] | total loss: 3.7530465126037598
epoch [91 / 200] | total loss: 3.7530465126037598
epoch [101 / 200] | total loss: 3.7530465126037598
epoch [111 / 200] | total loss: 3.753046751022339
epoch [121 / 200] | total loss: 3.7530465126037598
epoch [131 / 200] | total loss: 3.7530465126037598
epoch [141 / 200] | total loss: 3.7530465126037598
epoch [151 / 200] | total loss: 3.753046751022339
epoch [161 / 200] | total loss: 3.7530465126037598
epoch [171 / 200] | total loss: 3.7530465126037598
epoch [181 / 200] | total loss: 3.7530465126037598
epoch [191 / 200] | total loss: 3.753046512603

In [41]:
classifier.get_sentiment(dataset.vocab, "I hate rain")

tensor([[1.0000e+00, 2.7247e-17]], grad_fn=<SoftmaxBackward>)