In [23]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from rnn_dataset import Vocabulary
from torch.nn.utils.rnn import pad_sequence  # padding of every batch
from torch.utils.data import Dataset, DataLoader

from tqdm import tqdm

In [2]:
class SentimentDataset(Dataset):
    def __init__(self, root_dir, filename, freq_threshold=1):
        self.root_dir = root_dir
        self.df = pd.read_csv(os.path.join(root_dir, filename))

        self.sentiments = self.df["sentiment"]
        self.texts = self.df["text"]

        self.vocab = Vocabulary(freq_threshold)
        self.vocab.build_vocab(self.texts.tolist())

        self.vocab_size = len(self.vocab)

    def __len__(self): return len(self.df)

    def __getitem__(self, idx):
        encoded_text = [self.vocab.stoi["<SOS>"]]
        encoded_text += [encoded_token for encoded_token 
                         in self.vocab.encode(self.texts[idx])]
        encoded_text.append(self.vocab.stoi["<EOS>"])

        return {"text": torch.tensor(encoded_text).long(), "sentiment": torch.tensor(self.sentiments[idx])}

class CollateBatch:
    def __init__(self, padding_idx):
        self.padding_idx = padding_idx

    def __call__(self, batch):
        sentiments = [item["sentiment"] for item in batch]
        texts = [item["text"] for item in batch]
        texts = pad_sequence(texts, batch_first=False,
                             padding_value=self.padding_idx)

        return texts, torch.tensor(sentiments)


def get_loader(root_dir, filename, batch_size=10, num_workers=1, shuffle=True, pin_memory=True):
    dataset = SentimentDataset(root_dir, filename)
    padding_idx = dataset.vocab.stoi["<PAD>"]

    loader = DataLoader(dataset=dataset, batch_size=batch_size, num_workers=num_workers,
                        shuffle=shuffle, pin_memory=pin_memory, collate_fn=CollateBatch(padding_idx=padding_idx))
    return loader, dataset


In [3]:
class RNNCell(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(RNNCell, self).__init__()
        self.hidden_size = hidden_size
        self.whh = nn.Linear(hidden_size, hidden_size)
        self.wxh = nn.Linear(input_size, hidden_size)
    
    def forward(self, x, hidden_state): # x (batch_size, input_size)
        return torch.sigmoid(self.whh(hidden_state) + self.wxh(x))

In [46]:
class SentimentClassifier(nn.Module):
    def __init__(self, input_size, vocab_size, embedding_size, hidden_size, output_size=2, padding_idx=0):
        super(SentimentClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.embeddings = nn.Embedding(vocab_size, embedding_size, padding_idx=padding_idx)
        self.rnn1 = RNNCell(embedding_size, hidden_size)
        self.rnn2 = RNNCell(hidden_size, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)

    def init_hidden_state(self, batch_size):
        return torch.zeros(batch_size, self.hidden_size)
    
    def forward(self, x, hidden_state): # x (seq_len, batch_size, input_size)
        h1 = hidden_state
        h2 = hidden_state
        x = self.embeddings(x)

        for i in range(x.shape[0]):
            h1 = self.rnn1(x[i], h1)
            h2 = self.rnn2(h1, h2)
        
        output = F.softmax(self.fc(h2), dim=1)
        return output, hidden_state
    
    def fit(self, dataset, batch_size, epochs, lr=0.001):
        self.train()
        optimizer = torch.optim.SGD(self.parameters(), lr=lr)
        criterion = torch.nn.CrossEntropyLoss() # ignore_index=pad_idx?
        
        for epoch in range(epochs):
            total_loss = 0
            for idx, (texts, sentiments) in enumerate(dataset):
                hidden_state = self.init_hidden_state(batch_size=batch_size)
                
                # forward
                for i in range(texts.shape[0]):
                    output, hidden_state = self.forward(texts[i], hidden_state)
                
                loss = criterion(output, sentiments)
                
                # backward
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.parameters(), max_norm=1)
                
                # gradient descent or Adam step
                optimizer.step()
                
                total_loss += loss
            
            accuracy = self.evaluate(dataset)
            if (epoch % 10 == 0): print(f"epoch [{epoch+1} / {epochs}] | total loss: {total_loss} | accuracy: {accuracy} ")
    
    def evaluate(self, dataset):
        self.eval()
        
        accuracy = 0
        
        for idx, (texts, sentiments) in enumerate(dataset):
            hidden_state = self.init_hidden_state(batch_size=1)
            
            # forward
            for i in range(texts.shape[0]):
                output, hidden_state = self.forward(texts[i], hidden_state)
            
            if (torch.argmax(output).item() == sentiments.item()): accuracy += 1
        
        return accuracy / len(dataset)

In [47]:
BATCH_SIZE=1
HIDDEN_SIZE=12
EMBED_SIZE=12
OUTPUT_SIZE=2
LR=0.01
NUM_EPOCHS=400

In [40]:
# LOADING THE DATA
dataloader,dataset = get_loader("../data/", "small_sentiments.csv", batch_size=BATCH_SIZE)

In [48]:
classifier = SentimentClassifier(input_size=dataset.vocab_size, hidden_size=HIDDEN_SIZE,
                                vocab_size=len(dataset.vocab), embedding_size=EMBED_SIZE)

In [49]:
# TRAINING
classifier.fit(dataloader, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE, lr=LR)

epoch [1 / 400] | total loss: 2.8278090953826904 | accuracy: 0.5 
epoch [11 / 400] | total loss: 2.814012050628662 | accuracy: 0.5 
epoch [21 / 400] | total loss: 2.8038220405578613 | accuracy: 0.5 
epoch [31 / 400] | total loss: 2.797112226486206 | accuracy: 0.5 
epoch [41 / 400] | total loss: 2.792570114135742 | accuracy: 0.5 
epoch [51 / 400] | total loss: 2.7894845008850098 | accuracy: 0.5 
epoch [61 / 400] | total loss: 2.7876052856445312 | accuracy: 0.5 
epoch [71 / 400] | total loss: 2.786376476287842 | accuracy: 0.5 
epoch [81 / 400] | total loss: 2.7855350971221924 | accuracy: 0.5 
epoch [91 / 400] | total loss: 2.7850241661071777 | accuracy: 0.5 
epoch [101 / 400] | total loss: 2.78464937210083 | accuracy: 0.5 
epoch [111 / 400] | total loss: 2.784529447555542 | accuracy: 0.5 
epoch [121 / 400] | total loss: 2.7843949794769287 | accuracy: 0.5 
epoch [131 / 400] | total loss: 2.784329414367676 | accuracy: 0.5 
epoch [141 / 400] | total loss: 2.784271001815796 | accuracy: 0.5 


KeyboardInterrupt: 

Exception in thread Thread-2677:
Traceback (most recent call last):
  File "/home/achraf/anaconda3/envs/pytorch/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/home/achraf/anaconda3/envs/pytorch/lib/python3.6/threading.py", line 864, in run
    self._target(*self._args, **self._kwargs)
  File "/home/achraf/anaconda3/envs/pytorch/lib/python3.6/site-packages/torch/utils/data/_utils/pin_memory.py", line 25, in _pin_memory_loop
    r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
  File "/home/achraf/anaconda3/envs/pytorch/lib/python3.6/multiprocessing/queues.py", line 113, in get
    return _ForkingPickler.loads(res)
  File "/home/achraf/anaconda3/envs/pytorch/lib/python3.6/site-packages/torch/multiprocessing/reductions.py", line 282, in rebuild_storage_fd
    fd = df.detach()
  File "/home/achraf/anaconda3/envs/pytorch/lib/python3.6/multiprocessing/resource_sharer.py", line 57, in detach
    with _resource_sharer.get_connection(self._id) as conn:
  Fi

In [15]:
def get_sentiment(classifier, vocab, sentence):
    vocab_size = len(vocab)
    
    
    encoded_text = []
    encoded_text.append([vocab.stoi["<SOS>"]])
    encoded_text += [[encoded_token]
                     for encoded_token in vocab.encode(sentence)]
    encoded_text.append([vocab.stoi["<EOS>"]])

    encoded_text = torch.tensor(encoded_text).long()
    print(encoded_text.shape)
    h0 = classifier.init_hidden_state(batch_size=1)

    for i in range(encoded_text.shape[0]):
        output, h0 = classifier(encoded_text[i], h0)
    return output

In [19]:
classifier.eval()
print(get_sentiment(classifier, dataset.vocab, "I hate you"))
print(torch.argmax(get_sentiment(classifier, dataset.vocab, "I love you")))

torch.Size([5, 1])
tensor([[0.4992, 0.5008]], grad_fn=<SoftmaxBackward>)
torch.Size([5, 1])
tensor(1)
