## Custom dataset
1. Vocabulary mapping each word to an index
2. Pytorch Dataset to load the data
3. Padding of every batch to have fixed sequence length in a given batch and setup dataloader

In [26]:
import os
import pandas as pd
import spacy # for tokenization

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence # padding of every batch
from torch.utils.data import Dataset, DataLoader

spacy_eng = spacy.load("en_core_web_sm")

In [10]:
class Vocabulary:
    def __init__(self, freq_threshold):
        self.freq_threshold = freq_threshold
        self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        self.stoi = {"<PAD>": 0, "<SOS>":1, "<EOS>":2, "<UNK>":3}
        
    def __len__(self): return len(self.itos)
    
    @staticmethod
    def tokenizer_en(text):
        return [tok.text.lower() for tok in spacy_eng.tokenizer(text)]
    
    def build_vocab(self, sentences):
        frequencies = {} # store the frequency of each word encountered
        idx = 4 # 0, 1, 2 and 3 are already set
        
        for sentence in sentences:
            for word in self.tokenizer_en(sentence):
                if word not in frequencies: frequencies[word] = 1
                else: frequencies[word] += 1
            
            if frequencies[word] == self.freq_threshold:
                self.stoi[word] = idx
                self.itos[idx] = word
                idx += 1
        
    def stoi(self, word): 
        if word in self.stoi: return self.stoi[word]
        else: return "<UNK>"
    
    def __getitem__(self, idx): 
        if idx in self.itos: return self.itos[idx]
        else: return -1
    
    def encode(self, sentence):
        tokens = self.tokenizer_en(sentence)
        return [
            self.stoi[token] if token in self.stoi else self.stoi["<UNK>"]
            for token in tokens 
        ]

In [128]:
class SentimentDataset(Dataset):
    def __init__(self, root_dir, filename, freq_threshold=1):
        self.root_dir = root_dir
        self.df = pd.read_csv(os.path.join(root_dir,filename))
        
        self.sentiments = self.df["sentiment"]
        self.texts = self.df["text"]
        
        self.vocab = Vocabulary(freq_threshold)
        self.vocab.build_vocab(self.texts.tolist())
        
    def __len__(self): return len(self.df)
    
    def __getitem__(self, idx):
        encoded_text = [[self.vocab.stoi["<SOS>"]]]
        encoded_text += [[encoded_token] for encoded_token in self.vocab.encode(self.texts[idx])]
        encoded_text.append([self.vocab.stoi["<EOS>"]])
        
        return { "text": torch.tensor(encoded_text).float(), "sentiment": self.sentiments[idx] }

In [129]:
class CollateBatch:
    def __init__(self, pad_idx):
        self.pad_idx = pad_idx
    
    def __call__(self, batch):
        sentiments = [item["sentiment"] for item in batch]
        texts = [item["text"] for item in batch]
        texts = pad_sequence(texts, batch_first=False, padding_value=self.pad_idx)
        
        return texts, torch.tensor(sentiments)

In [130]:
def get_loader(root_dir, filename, batch_size=10, num_workers=1, shuffle=True, pin_memory=True):
    dataset = SentimentDataset(root_dir, filename)
    pad_idx = dataset.vocab.stoi["<PAD>"]
    
    loader = DataLoader(dataset=dataset, batch_size=batch_size, num_workers=num_workers, 
                        shuffle=shuffle, pin_memory=pin_memory, collate_fn=CollateBatch(pad_idx=pad_idx))
    return loader, dataset

In [131]:
dataloader,dataset = get_loader("./data/", "small_sentiments.csv", batch_size=2)

In [142]:
class SentimentClassifier(nn.Module):
    def __init__(self, input_size=1, hidden_size=64):
        super(SentimentClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.wh = nn.Linear(hidden_size, hidden_size)
        self.wx = nn.Linear(input_size, hidden_size)
        
        self.hidden_state = None
    
    def init_hidden_state(self, batch_size):
        self.hidden_state = torch.zeros(batch_size, self.hidden_size)
    
    def forward(self, x): # x (batch_size, input_size)
        self.hidden_state = torch.sigmoid(self.wh(self.hidden_state) + self.wx(x)) # (batch_size, hidden_size)
        return self.hidden_state
    
    def fit(self, dataset, batch_size=2, epochs=10, lr=0.001):
        self.init_hidden_state(batch_size)
        optimizer = torch.optim.Adam(self.parameters(), lr=lr)
        criterion = torch.nn.CrossEntropyLoss() # ignore_index=pad_idx?
        
        for epoch in range(epochs):
            total_loss = 0
            for idx, (texts, sentiments) in enumerate(dataset):
                optimizer.zero_grad()
                
                for i in range(texts[0].shape[0]):
                    output = self(texts[0][i])
                
                loss = criterion(output, sentiments)
                loss.backward(retain_graph=True)
                optimizer.step()
                
                total_loss += loss
            
            print(f"epoch [{epoch+1} / {epochs}] | total loss: {total_loss}")

In [113]:
items = iter(dataloader)
x = items.next()

classifier = SentimentClassifier(input_size=1, hidden_size=2) # 2 classes: positive / negative
classifier.init_hidden_state(2)

for i in range(x[0].shape[0]):
    classifier(x[0][i])

classifier.hidden_state

tensor([[0.1828, 0.6293],
        [0.0513, 0.3389]], grad_fn=<SigmoidBackward>)

In [143]:
classifier = SentimentClassifier(input_size=1, hidden_size=2)
classifier.fit(dataloader, epochs=100, lr=.1)

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [1, 2]], which is output 0 of TBackward, is at version 2; expected version 1 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).

In [115]:
classifier.hidden_state, torch.tensor(x[1])

(tensor([[0.1828, 0.6293],
         [0.0513, 0.3389]], grad_fn=<SigmoidBackward>), tensor([1, 0]))

In [112]:
input = torch.randn(3, 5, requires_grad=True)
target = torch.empty(3, dtype=torch.long).random_(5)
nn.CrossEntropyLoss()(input, target)

tensor(0.8524, grad_fn=<NllLossBackward>)

In [None]:
class LSTMFromScratch(nn.Module):
    def __init__(self,):
        super(ScratchLSTM, self).__init__()
        
    def forward(self, x, y):
        pass