In [1]:
import os

In [2]:
import numpy as np
import pandas as pd

In [3]:
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.stem.porter import PorterStemmer

In [4]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader

In [5]:
# Prefer the GPU whenever CUDA is usable; otherwise run everything on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

### Model implementation

In [6]:
class EncoderNN(nn.Module):
    """Single-layer, batch-first GRU encoder.

    Args:
        input_size: size of each input feature vector.
        hidden_size: size of the GRU hidden state.
        device: device that holds the parameters and the tensors created by
            ``init_hidden``.
    """

    def __init__(self, input_size, hidden_size, device):
        super(EncoderNN, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.device = device

        self.gru = nn.GRU(input_size, hidden_size, batch_first=True)

        # init_hidden() allocates on `device`; the GRU weights must live there
        # too, otherwise the first forward pass on a GPU raises a device mismatch.
        if device is not None:
            self.to(device)

    def forward(self, input_seq, hidden):
        """Run the GRU over ``input_seq`` starting from ``hidden``.

        Returns:
            The ``(output, hidden)`` pair produced by ``nn.GRU``.
        """
        output, hidden = self.gru(input_seq, hidden)

        return output, hidden

    def init_hidden(self, batch_size):
        """Return an all-zero initial hidden state of shape (1, batch_size, hidden_size)."""
        return torch.zeros(1, batch_size, self.hidden_size, device=self.device)

In [7]:
class MLPClassifier(nn.Module):
    """Two-hidden-layer perceptron that outputs a probability per class.

    Args:
        input_size: number of input features.
        h0_size: width of the first hidden layer.
        h1_size: width of the second hidden layer.
        out_size: number of classes.
    """

    def __init__(self, input_size, h0_size, h1_size, out_size):
        super(MLPClassifier, self).__init__()

        self.fcl1 = nn.Linear(input_size, h0_size)
        self.fcl2 = nn.Linear(h0_size, h1_size)
        self.out = nn.Linear(h1_size, out_size)

    def forward(self, x):
        """Map features of shape (..., input_size) to probabilities (..., out_size)."""
        # torch.sigmoid replaces the deprecated F.sigmoid alias.
        x = torch.sigmoid(self.fcl1(x))
        x = torch.sigmoid(self.fcl2(x))
        # NOTE(review): squashing the logits with a sigmoid before the softmax
        # limits how confident the output can get; kept to preserve the
        # existing model definition -- confirm whether it is intended.
        x = torch.sigmoid(self.out(x))

        # Normalise over the class axis explicitly. The implicit-dim form is
        # deprecated and silently picks dim 0 for 3-D input.
        return F.softmax(x, dim=-1)

In [8]:
class DecoderNN(nn.Module):
    """Single-layer, batch-first GRU decoder with a softmax output layer.

    Args:
        input_size: size of each input feature vector.
        hidden_size: size of the GRU hidden state.
        output_size: number of output classes.
        device: device that holds the parameters and the tensors created by
            ``init_hidden``. When omitted, CUDA is used if available and the
            CPU otherwise.
    """

    def __init__(self, input_size, hidden_size, output_size, device=None):
        super(DecoderNN, self).__init__()

        # Resolve the default at call time instead of binding a notebook global
        # when the class is defined (which depends on cell execution order).
        if device is None:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.device = device

        self.gru = nn.GRU(input_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.Softmax(dim=1)

        # Keep the weights on the same device as the tensors from init_hidden().
        self.to(device)

    def forward(self, x, hidden):
        """Decode ``x`` starting from ``hidden``.

        Returns:
            ``(output, hidden)`` where ``output`` has one row of class
            probabilities per (batch, step) position, i.e. shape
            (batch * seq_len, output_size), and ``hidden`` is the GRU state.
        """
        output, hidden = self.gru(x, hidden)
        # A batch-first GRU output is not guaranteed to be contiguous, so
        # .view() can fail when seq_len > 1; reshape copies only if required.
        output = output.reshape(-1, self.hidden_size)
        output = self.softmax(self.out(output))

        return output, hidden

    def init_hidden(self, batch_size):
        """Return an all-zero initial hidden state of shape (1, batch_size, hidden_size)."""
        return torch.zeros(1, batch_size, self.hidden_size, device=self.device)

In [10]:
# TODO(eldmitro): Implement DataLoader for texts
# TODO(eldmitro): Implement train runner
# TODO(eldmitro): Implement test runner

---

### Preprocessing

In [115]:
# Sentinel tokens added to the vocabulary: start-of-sequence and end-of-sequence.
SOS_TOKEN = '$SOS$'
EOS_TOKEN = '$EOS$'
special_tokens = (SOS_TOKEN, EOS_TOKEN)

def seq_one_hot_encode(sentence, dim_size):
    """One-hot encode a tensor of token indices.

    Args:
        sentence: integer tensor of token indices, of any shape.
        dim_size: size of the one-hot axis (the vocabulary size).

    Returns:
        An int8 tensor of shape ``sentence.shape + (dim_size,)`` on the same
        device as ``sentence``.
    """
    flat = sentence.reshape(-1)
    count = flat.numel()
    # Allocate on the device of the indices: indexing a CPU tensor with CUDA
    # indices raises, which broke the transform for datasets stored on the GPU.
    e_seq = torch.zeros(count, dim_size, dtype=torch.int8, device=sentence.device)
    rows = torch.arange(count, dtype=torch.int64, device=sentence.device)
    e_seq[rows, flat] = 1

    # Restore the original leading shape so batched input works too.
    return e_seq.view(*sentence.shape, dim_size)

def to_float(sentence, dim_size):
    """Return ``sentence`` converted to float32.

    ``dim_size`` is not used; it is accepted so the function has the same
    two-argument signature as the other sequence transforms.
    """
    return sentence.to(dtype=torch.float32)

class SentimentDataset(Dataset):
    """Two-class text dataset built from one file of examples per class.

    Every line of each file is one example. Lines are tokenized, each token is
    normalised, and every sentence is right-padded with ``EOS_TOKEN`` to the
    length of the longest sentence plus one, so each sentence ends with at
    least one ``EOS_TOKEN``.

    Args:
        negative_examples_path: path of the file with class-0 examples.
        positive_examples_path: path of the file with class-1 examples.
        tokenizer: object exposing ``tokenize(text) -> list of tokens``.
        normalizer: object exposing ``stem(word) -> normalised word``.
        device: device on which the index and label tensors are stored.
        transforms: optional list of callables ``f(tensor, dict_size)`` applied
            in order to every sentence returned by ``__getitem__``.

    Attributes set here: ``dictionary`` (set of tokens), ``dict_size``,
    ``word2idx``, ``idx2word``, ``data`` (int64 token indices), ``styles``
    (int8 labels), ``transforms`` and ``device``.
    """

    def __init__(
        self,
        negative_examples_path,
        positive_examples_path,
        tokenizer,
        normalizer,
        device,
        transforms=None,
    ):
        super(SentimentDataset, self).__init__()

        # Read, tokenize and normalise both corpora. The `normalizer` argument
        # is used here; previously a notebook-global `stemmer` was used and
        # the argument was ignored.
        negative_data = self._read_corpus(negative_examples_path, tokenizer, normalizer)
        positive_data = self._read_corpus(positive_examples_path, tokenizer, normalizer)

        # Vocabulary: every corpus token plus the special tokens.
        self.dictionary = self._get_dictionary(negative_data) | self._get_dictionary(positive_data)
        d = len(self.dictionary)

        self.dictionary.update(special_tokens)
        # Fails if a special token already occurs as a regular corpus token.
        assert d + len(special_tokens) == len(self.dictionary), \
            'special tokens must not occur in the corpus'
        self.dict_size = len(self.dictionary)

        # Sort before numbering: set iteration order for strings changes from
        # one interpreter run to the next, which would make indices (and any
        # saved model) irreproducible.
        self.word2idx = {word: idx for idx, word in enumerate(sorted(self.dictionary))}
        self.idx2word = {idx: word for word, idx in self.word2idx.items()}

        # Labels: 0 for every negative example, then 1 for every positive one.
        styles = [0] * len(negative_data) + [1] * len(positive_data)

        data = negative_data + positive_data
        assert len(data) == len(styles)

        self._align_corpus(data, EOS_TOKEN, self._get_corpus_max_sent_len(data) + 1)
        data = self._map_corpus(data, self.word2idx)
        self.data = torch.tensor(data, dtype=torch.int64, device=device)
        self.styles = torch.tensor(styles, dtype=torch.int8, device=device)
        if transforms is None:
            transforms = []
        self.transforms = transforms
        self.device = device

    def _read_corpus(self, path, tokenizer, normalizer):
        """Read ``path`` line by line and return a list of normalised token lists."""
        with open(path) as input_stream:
            lines = input_stream.readlines()

        return [
            [normalizer.stem(word) for word in tokenizer.tokenize(line)]
            for line in lines
        ]

    def _get_dictionary(self, corpus):
        """Return the set of distinct tokens occurring in ``corpus``."""
        dictionary = set()
        for text in corpus:
            # In-place update avoids rebuilding the whole set for every sentence.
            dictionary.update(text)

        return dictionary

    def _get_corpus_max_sent_len(self, corpus):
        """Return the length of the longest sentence, or 0 for an empty corpus."""
        return max((len(text) for text in corpus), default=0)

    def _align_corpus(self, corpus, align_token, max_length):
        """Right-pad every sentence of ``corpus`` in place up to ``max_length``."""
        for text in corpus:
            text.extend([align_token] * max(0, max_length - len(text)))

    def _map_corpus(self, corpus, mapper):
        """Return ``corpus`` with every token replaced by ``mapper[token]``."""
        return [
            [mapper[word] for word in text]
            for text in corpus
        ]

    def __getitem__(self, index):
        """Return ``(transformed sentence, label)`` for ``index``."""
        t_data = self.data[index]
        for transform in self.transforms:
            t_data = transform(t_data, self.dict_size)

        return t_data, self.styles[index]

    def __len__(self):
        """Return the number of examples."""
        return self.data.size(0)

In [53]:
# Text-processing helpers handed to SentimentDataset: a tweet-aware tokenizer
# and a Porter stemmer.
tokenizer, stemmer = TweetTokenizer(), PorterStemmer()

In [13]:
# Location of the raw sentiment corpora, relative to the notebook.
data_path = './data'

negative_examples_filename = 'sentiment_negative.raw'
positive_examples_filename = 'sentiment_positive.raw'

negative_examples_path, positive_examples_path = (
    os.path.join(data_path, filename)
    for filename in (negative_examples_filename, positive_examples_filename)
)

In [116]:
# Per-sentence pipeline: token indices -> one-hot rows -> float32.
seq_transforms = [seq_one_hot_encode, to_float]

dataset = SentimentDataset(
    negative_examples_path=negative_examples_path,
    positive_examples_path=positive_examples_path,
    tokenizer=tokenizer,
    normalizer=stemmer,
    device=device,
    transforms=seq_transforms,
)

In [117]:
# Shuffled mini-batches of three sentences each.
loader = DataLoader(dataset=dataset, batch_size=3, shuffle=True)

In [118]:
# Must match the batch_size given to the DataLoader.
batch_size = 3
# Vocabulary size; this is also the width of every one-hot token vector.
d = dataset.dict_size

---

### Models tests

In [None]:
# Pull a single batch for the smoke tests below. `x` and `y` are bound here and
# reused by the following cells; `a` and `b` alias the same batch.
a, b = None, None
first_batch = next(iter(loader), None)
if first_batch is not None:
    x, y = first_batch
    a, b = x, y

##### Encoder

In [120]:
# Width of the encoder GRU hidden state used in the smoke tests below.
e_hidden_size = 32

In [121]:
# Encoder over one-hot token vectors, with its all-zero starting state.
encoder = EncoderNN(input_size=d, hidden_size=e_hidden_size, device=device)
hidden_init = encoder.init_hidden(batch_size=batch_size)

In [122]:
# Call the module itself rather than .forward(): Module.__call__ runs the
# registered hooks, which a direct .forward() call silently skips.
output, hidden = encoder(x, hidden_init)

In [123]:
output.size()

torch.Size([3, 21, 32])

In [124]:
hidden.size()

torch.Size([1, 3, 32])

In [129]:
hidden

tensor([[[ 0.4151,  0.1735, -0.2012,  0.0544, -0.1354,  0.0955,  0.0560,
           0.1517, -0.0641, -0.1201,  0.3866, -0.0194,  0.0447, -0.1052,
          -0.2339, -0.0320, -0.0233, -0.1572,  0.2762,  0.2425,  0.0557,
           0.0448, -0.1122, -0.1686,  0.3160,  0.1523, -0.0463, -0.0954,
          -0.0688, -0.2863,  0.3466,  0.0466],
         [ 0.4180,  0.1745, -0.2068,  0.0593, -0.1351,  0.0971,  0.0598,
           0.1493, -0.0623, -0.1143,  0.3882, -0.0188,  0.0410, -0.1022,
          -0.2366, -0.0274, -0.0216, -0.1535,  0.2765,  0.2502,  0.0495,
           0.0449, -0.1169, -0.1687,  0.3120,  0.1490, -0.0411, -0.0963,
          -0.0717, -0.2824,  0.3487,  0.0447],
         [ 0.4180,  0.1745, -0.2068,  0.0593, -0.1350,  0.0971,  0.0598,
           0.1493, -0.0623, -0.1143,  0.3882, -0.0188,  0.0410, -0.1022,
          -0.2366, -0.0275, -0.0216, -0.1535,  0.2766,  0.2502,  0.0495,
           0.0449, -0.1169, -0.1687,  0.3120,  0.1490, -0.0410, -0.0963,
          -0.0716, -0.2824,  0

##### MLP

In [131]:
# Widths of the two hidden layers of the style classifier.
h0_size = 32
h1_size = 16
# One output per style label (0 = negative, 1 = positive).
out_size = 2

In [127]:
# Flatten the encoder's final hidden state to one row per sentence:
# [1, 3, 32] -> [3, 32] for the batch used here.
content_batch = hidden.view(batch_size, -1)

In [130]:
content_batch.size()

torch.Size([3, 32])

In [132]:
# Style classifier fed with the encoder's final hidden state.
mlp = MLPClassifier(
    input_size=e_hidden_size,
    h0_size=h0_size,
    h1_size=h1_size,
    out_size=out_size,
)

In [139]:
# Call the module itself rather than .forward() so registered hooks run.
output = mlp(content_batch)

  


In [140]:
output.size()

torch.Size([3, 2])

---

##### Decoder

In [143]:
# The decoder starts from the encoder's hidden state, so the widths must match.
d_hidden_size = e_hidden_size

# Consumes one-hot tokens and emits a distribution over the same vocabulary.
decoder = DecoderNN(input_size=d, hidden_size=d_hidden_size, output_size=d)

In [212]:
# Add back the leading layer axis expected by the GRU: (batch, hidden) -> (1, batch, hidden).
d_content_batch = content_batch.unsqueeze(0)

In [213]:
# First decoder input: a one-hot start-of-sequence token for every sentence in
# the batch, shaped (batch, 1, vocabulary).
SOS_IDX = dataset.word2idx[SOS_TOKEN]
# Create the indices on `device` rather than the default device, so the input
# is requested on the same device as the encoder state fed to the decoder.
output_init = torch.tensor([SOS_IDX] * batch_size, device=device)
output_init = seq_one_hot_encode(output_init, d)
output_init = output_init.view(batch_size, 1, -1)
output_init = to_float(output_init, d)

In [214]:
# Greedy decoding smoke test: feed the decoder its own output distribution
# until every sentence has produced EOS_TOKEN or the step limit is reached.
MAX_DECODE_STEPS = 3  # hard cap so an untrained decoder cannot loop forever

outputs = []
hiddens = []
anss = []

hidden = d_content_batch
output = output_init
ans = np.array([SOS_TOKEN] * batch_size)
k = 0
while (ans != EOS_TOKEN).any() and k < MAX_DECODE_STEPS:
    # Call the module itself rather than .forward() so registered hooks run.
    output, hidden = decoder(output, hidden)

    k += 1

    # Most probable token per sentence at this step.
    ans = np.array([dataset.idx2word[int(t)] for t in output.argmax(dim=1)])
    outputs.append(output)
    hiddens.append(hidden)
    anss.append(ans)
    # Restore the (batch, 1, vocabulary) layout expected as the next input.
    output = output.view(batch_size, 1, -1)

In [215]:
anss

[array(['stylist', 'stylist', 'stylist'], dtype='<U7'),
 array(['kill', 'kill', 'kill'], dtype='<U4'),
 array(['kill', 'kill', 'kill'], dtype='<U4')]

In [216]:
outputs

[tensor([[0.0001, 0.0001, 0.0002,  ..., 0.0002, 0.0001, 0.0001],
         [0.0001, 0.0001, 0.0002,  ..., 0.0002, 0.0001, 0.0001],
         [0.0001, 0.0001, 0.0002,  ..., 0.0002, 0.0001, 0.0001]],
        grad_fn=<SoftmaxBackward>),
 tensor([[0.0001, 0.0001, 0.0002,  ..., 0.0002, 0.0001, 0.0001],
         [0.0001, 0.0001, 0.0002,  ..., 0.0002, 0.0001, 0.0001],
         [0.0001, 0.0001, 0.0002,  ..., 0.0002, 0.0001, 0.0001]],
        grad_fn=<SoftmaxBackward>),
 tensor([[0.0001, 0.0001, 0.0002,  ..., 0.0002, 0.0001, 0.0001],
         [0.0001, 0.0001, 0.0002,  ..., 0.0002, 0.0001, 0.0001],
         [0.0001, 0.0001, 0.0002,  ..., 0.0002, 0.0001, 0.0001]],
        grad_fn=<SoftmaxBackward>)]

---

### Experiments

In [218]:
class SentimentMultiDecoder:
    """Bundle of a shared encoder, a style classifier and one decoder per style.

    Args:
        input_size: width of the one-hot token vectors (vocabulary size).
        encoder_hidden_size: width of the encoder and decoder hidden states.
        mlp_h0_size: width of the classifier's first hidden layer.
        mlp_h1_size: width of the classifier's second hidden layer.
        device: device passed on to the encoder and both decoders.
    """

    def __init__(
        self,
        input_size,
        encoder_hidden_size,
        mlp_h0_size,
        mlp_h1_size,
        device
    ):
        self.input_size = input_size
        self.encoder_hidden_size = encoder_hidden_size
        self.mlp_h0_size = mlp_h0_size
        self.mlp_h1_size = mlp_h1_size
        self.device = device

        self.encoder = EncoderNN(input_size, encoder_hidden_size, device)
        # The classifier reads the encoder's hidden state, so its input width
        # is the encoder hidden size, not the vocabulary size.
        self.mlp = MLPClassifier(encoder_hidden_size, mlp_h0_size, mlp_h1_size, 2)
        self.positive_decoder = DecoderNN(input_size, encoder_hidden_size, input_size, device)
        self.negative_decoder = DecoderNN(input_size, encoder_hidden_size, input_size, device)

    def forward(self, X, y):
        """Not implemented yet; raises instead of silently returning None."""
        # The original stub had no `self` and no body, so the cell did not parse.
        raise NotImplementedError("SentimentMultiDecoder.forward is not implemented yet")

    def backward(self):
        # TODO: placeholder, currently a no-op.
        pass

    def zero_grad(self):
        # TODO: placeholder, currently a no-op (gradients are NOT cleared).
        pass

In [None]:
# TODO(eldmitro): review the assumptions marked NOTE(review) in the docstring
def train(
    dataloader,
    encoder_config,
    style_classifier_config,
    decoder_config,
    sos_index=None,
):
    """Run one optimisation pass over ``dataloader``.

    For every batch the encoder's final hidden state is classified (style
    loss) and used as the decoder's initial state; the decoder is trained
    with teacher forcing to reproduce the input sequence (reconstruction
    loss). Both losses are summed and back-propagated through all three
    models in a single step.

    Args:
        dataloader: iterable of ``(X, y)`` batches.
            NOTE(review): assumes X is (batch, seq_len, vocabulary) float
            one-hot data and y is whatever ``criteria`` expects -- confirm.
        encoder_config: ``(encoder, optimizer)``.
        style_classifier_config: ``(classifier, optimizer, criteria)``;
            ``criteria(prediction, y)`` returns a scalar loss tensor.
        decoder_config: ``(decoder, optimizer, decoder_criteria)``;
            ``decoder_criteria(step_prediction, X[:, i, :])`` returns a scalar
            loss tensor.
        sos_index: optional vocabulary index of the start-of-sequence token.
            When given, the first decoder input is its one-hot vector;
            otherwise an all-zero vector is used.
            NOTE(review): the all-zero start is a placeholder -- confirm.

    Returns:
        The sum of the per-batch losses as a Python float (0.0 when
        ``dataloader`` yields nothing).
    """
    encoder, encoder_opt = encoder_config
    style_classifier, style_classifier_opt, criteria = style_classifier_config
    decoder, decoder_opt, decoder_criteria = decoder_config
    optimizers = (encoder_opt, style_classifier_opt, decoder_opt)

    total_loss = 0.0
    for X_tensor, y_tensor in dataloader:
        batch_size, seq_len = X_tensor.size(0), X_tensor.size(1)

        for optimizer in optimizers:
            optimizer.zero_grad()

        # The encoder's final hidden state is the "content" representation.
        encoder_hidden = encoder.init_hidden(batch_size)
        _, content = encoder(X_tensor, encoder_hidden)

        style_output = style_classifier(content.reshape(batch_size, -1))
        loss = criteria(style_output, y_tensor)

        # Teacher forcing: at step i the decoder sees the gold token i - 1
        # (or the start vector for i == 0) and must predict gold token i.
        d_hidden = content
        d_input = torch.zeros_like(X_tensor[:, :1, :])
        if sos_index is not None:
            d_input[:, 0, sos_index] = 1
        for i in range(seq_len):
            d_output, d_hidden = decoder(d_input, d_hidden)
            loss = loss + decoder_criteria(d_output, X_tensor[:, i, :])
            d_input = X_tensor[:, i:i + 1, :]

        loss.backward()
        for optimizer in optimizers:
            optimizer.step()

        total_loss += loss.item()

    return total_loss

---