In [1]:
import os

In [2]:
import numpy as np
import pandas as pd

In [3]:
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.stem.porter import PorterStemmer

In [4]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Model implementation

In [6]:
class EncoderNN(nn.Module):
    def __init__(self, input_size, hidden_size, device):
        super(EncoderNN, self).__init__()
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.device = device
        
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True)
    
    def forward(self, input_seq, hidden):
        output, hidden = self.gru(input_seq, hidden)
        
        return output, hidden
    
    def init_hidden(self, batch_size):
        return torch.zeros(1, batch_size, self.hidden_size, device=self.device)

In [7]:
class MLPClassifier(nn.Module):
    def __init__(self, input_size, h0_size, h1_size, out_size):
        super(MLPClassifier, self).__init__()
        
        self.fcl1 = nn.Linear(input_size, h0_size)
        self.fcl2 = nn.Linear(h0_size, h1_size)
        self.out = nn.Linear(h1_size, out_size)
        self.lsoftmax = nn.LogSoftmax(dim=1)
        
    def forward(self, x):
        x = F.sigmoid(self.fcl1(x))
        x = F.sigmoid(self.fcl2(x))
        x = F.sigmoid(self.out(x))
        
        y = self.lsoftmax(x)
        return y

In [8]:
class DecoderNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, device=device):
        super(DecoderNN, self).__init__()
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.device = device
        
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.lsoftmax = nn.LogSoftmax(dim=1)
        
    def forward(self, x, hidden):
        output, hidden = self.gru(x, hidden)
        output = output.view(-1, self.hidden_size)
        output = self.lsoftmax(self.out(output))
        
        return output, hidden
    
    def init_hidden(self, batch_size):
        return torch.zeros(1, batch_size, self.hidden_size, device=self.device)

In [220]:
class SentimentMultiDecoder:
    def __init__(
        self,
        input_size,
        encoder_hidden_size,
        mlp_h0_size,
        mlp_h1_size,
        special_tokens_config,
        decoder_max_iter,
        device
    ):
        self.input_size = input_size
        self.encoder_hidden_size = encoder_hidden_size
        self.mlp_h0_size = mlp_h0_size
        self.mlp_h1_size = mlp_h1_size
        self.special_tokens_config = special_tokens_config
        self.decoder_max_iter = decoder_max_iter
        self.device = device
        
        self.encoder = EncoderNN(input_size, encoder_hidden_size, device)
        self.mlp_true = MLPClassifier(encoder_hidden_size, mlp_h0_size, mlp_h1_size, 2)
        self.mlp_false = MLPClassifier(encoder_hidden_size, mlp_h0_size, mlp_h1_size, 2)
        self.positive_decoder = DecoderNN(input_size, encoder_hidden_size, input_size, device)
        self.negative_decoder = DecoderNN(input_size, encoder_hidden_size, input_size, device)
        
        self.optimizers = [
            optim.Adam(self.encoder.parameters()),
            optim.Adam(self.mlp_true.parameters()),
            optim.Adam(self.mlp_false.parameters()),
            optim.Adam(self.positive_decoder.parameters()),
            optim.Adam(self.negative_decoder.parameters()),
        ]
    
    def forward(self, X, y, fit=True):
        batch_size = X.size(0)
        
        hidden_init = self.encoder.init_hidden(X.size(0))
        e_output, e_hidden = self.encoder.forward(X, hidden_init)
        content_batch = e_hidden.view(batch_size, -1)
        
        mlp_true_out = self.mlp_true.forward(content_batch)
        mlp_false_out = self.mlp_false.forward(content_batch)
        
        hidden_init = e_hidden
        output_init = self._get_decoder_init(batch_size, self.input_size)
        positive_mask = y == 1
        negative_mask = ~positive_mask
        
        X_positive = X[positive_mask]
        y_positive = y[positive_mask]
        hidden_init_positive = hidden_init[:, positive_mask, :]
        output_init_positive = output_init[positive_mask]
        
        X_negative = X[negative_mask]
        y_negative = y[negative_mask]
        hidden_init_negative = hidden_init[:, negative_mask, :]
        output_init_negative = output_init[negative_mask]
        output_positive = []
        output_negative = []
        
        if X_positive.size(0) > 0:
            output_positive = self.decoder_forward(
                self.positive_decoder,
                X_positive,
                y_positive,
                hidden_init_positive,
                output_init_positive,
                fit
            )
            
        if X_negative.size(0) > 0:
            output_negative = self.decoder_forward(
                self.negative_decoder,
                X_negative,
                y_negative,
                hidden_init_negative,
                output_init_negative,
                fit
            )
        
        return mlp_true_out, mlp_false_out, output_positive, output_negative
            
            
    def decoder_forward(self, decoder, X, y, hidden_init, output_init, fit):
        _, sos_idx = self.special_tokens_config['SOS']
        _, eos_idx = self.special_tokens_config['EOS']
        output = output_init
        hidden = hidden_init
        
        anses = torch.tensor([sos_idx] * X.size(0))
        
        output_by_batch = [[] for i in range(X.size(0))]
        if fit:
            for i in range(X.size(1)):
                output, hidden = decoder.forward(output, hidden)
                
                for j, out in enumerate(output):
                    output_by_batch[j].append(out)
                    
                output = output.view(X.size(0), 1, -1)
        else:
            k = 0
            # (anses != eos_idx).any() and 
            while (k < self.decoder_max_iter):
                output, hidden = decoder.forward(output, hidden)
                
                anses = output.argmax(axis=1)
                for j, out in enumerate(output):
                    output_by_batch[j].append(out)
                    
                output = output.view(X.size(0), 1, -1)
                k += 1
                
        return output_by_batch
    
    def zero_grad(self):
        for opt in self.optimizers:
            opt.zero_grad()
    
    def opt_step(self):
        for opt in self.optimizers:
            opt.step()
        
    
    def _get_decoder_init(self, batch_size, d):
        sos_token, sos_idx = self.special_tokens_config['SOS']
        output_init = torch.tensor([sos_idx] * batch_size)
        output_init = seq_one_hot_encode(output_init, d)
        output_init = output_init.view(batch_size, 1, -1)
        output_init = to_float(output_init, d)
        
        return output_init

In [162]:
SOS_TOKEN = '$SOS$'
EOS_TOKEN = '$EOS$'
special_tokens = (
    SOS_TOKEN,
    EOS_TOKEN,
)

def seq_one_hot_encode(sentence, dim_size):
    N = sentence.size(0)
    e_seq = torch.zeros(N, dim_size, dtype=torch.int8)
    e_seq[torch.arange(N, dtype=torch.int64), sentence] = 1
    
    return e_seq

def to_float(sentence, dim_size):
    sentence = sentence.type(torch.float32)
    
    return sentence

class SentimentDataset(Dataset):
    def __init__(
        self,
        negative_examples_path,
        positive_examples_path,
        tokenizer,
        normalizer,
        device,
        transforms=None,
    ):
        super(SentimentDataset, self).__init__()
        
        # data reading from file
        negative_data = []
        with open(negative_examples_path) as input_stream:
            negative_data = input_stream.readlines()

        positive_data = []
        with open(positive_examples_path) as input_stream:
            positive_data = input_stream.readlines()
        # ------------------------------------
            
        # data processing. Norm + tokenization
        negative_data = [tokenizer.tokenize(text) for text in negative_data]
        negative_data = [[stemmer.stem(word) for word in text] for text in negative_data]

        positive_data = [tokenizer.tokenize(text) for text in positive_data]
        positive_data = [[stemmer.stem(word) for word in text] for text in positive_data]
        # ------------------------------------
        
        # dictionary processing
        negative_dictionary = self._get_dictionary(negative_data)
        positive_dictionary = self._get_dictionary(positive_data)

        self.dictionary = negative_dictionary.union(positive_dictionary)
        d = len(self.dictionary)
        
        for token in special_tokens:
            self.dictionary.add(token)
        assert d + len(special_tokens) == len(self.dictionary)
        self.dict_size = len(self.dictionary)
        
        self.word2idx = dict(zip(
            self.dictionary,
            range(self.dict_size)
        ))

        self.idx2word = {value: key for key, value in self.word2idx.items()}
        # ------------------------------------
        
        # target processing
        self.styles = np.array([0] * len(negative_data) + [1] * len(positive_data))

        data = negative_data + positive_data
        assert len(data) == len(self.styles)
        # ------------------------------------
        
        self._align_corpus(data, EOS_TOKEN, self._get_corpus_max_sent_len(data) + 1)
        data = self._map_corpus(data, self.word2idx)
        self.data = torch.tensor(data, dtype=torch.int64, device=device)
        self.styles = torch.tensor(self.styles, dtype=torch.int8, device=device)
        if transforms is None:
            transforms = []
        self.transforms = transforms
        self.device = device
        
    def _get_dictionary(self, corpus):
        dictionary = set()
        for text in corpus:
            dictionary = dictionary.union(set(np.unique(text)))

        return dictionary
    
    def _get_corpus_max_sent_len(self, corpus):
        text_max_length = 0
        for text in corpus:
            if len(text) > text_max_length:
                text_max_length = len(text)
                
        return text_max_length
    
    def _align_corpus(self, corpus, align_token, max_length):
        for i in range(len(corpus)):
            corpus[i] += [align_token] * max(0, (max_length - len(corpus[i])))

    def _map_corpus(self, corpus, mapper):
        return [
            [mapper[word] for word in text]
            for text in corpus
        ]
    
    def __getitem__(self, index):
        t_data = self.data[index]
        for transform in self.transforms:
            t_data = transform(t_data, self.dict_size)
            
        return t_data, self.styles[index]
    
    def __len__(self):
        return self.data.size(0)

---

### Preprocessing

In [11]:
tokenizer = TweetTokenizer()
stemmer = PorterStemmer()

In [12]:
data_path = './data'

negative_examples_filename = 'sentiment_negative.raw'
positive_examples_filename = 'sentiment_positive.raw'

negative_examples_path = os.path.join(data_path, negative_examples_filename)
positive_examples_path = os.path.join(data_path, positive_examples_filename)

In [13]:
test_negative_examples_filename = 'sentiment_negative.raw'
test_positive_examples_filename = 'sentiment_positive.raw'

test_negative_examples_path = os.path.join(data_path, negative_examples_filename)
test_positive_examples_path = os.path.join(data_path, positive_examples_filename)

In [133]:
seq_transforms = [
    seq_one_hot_encode,
    to_float
]

dataset = SentimentDataset(
    negative_examples_path,
    positive_examples_path,
    tokenizer,
    stemmer,
    device,
    seq_transforms
)

In [134]:
loader = DataLoader(dataset, batch_size=32, shuffle=True)

In [17]:
d = dataset.dict_size
batch_size = 3

---

### Experiments

In [135]:
dict_size = dataset.dict_size
encoder_hidden_size = 64
mlp_h0_size = 64
mlp_h1_size = 32
special_tokens_config = {
    'SOS': (SOS_TOKEN, dataset.word2idx[SOS_TOKEN]),
    'EOS': (EOS_TOKEN, dataset.word2idx[EOS_TOKEN]),
}
decoder_max_iter = 40

In [307]:
multidecoder = SentimentMultiDecoder(
    dict_size,
    encoder_hidden_size,
    mlp_h0_size,
    mlp_h1_size,
    special_tokens_config,
    decoder_max_iter,
    device
)

In [302]:
def train(
    multidecoder,
    dataloader,
    dataset,
    max_iter=10000
):
    nll = nn.NLLLoss()
    kld = nn.KLDivLoss()
    for i, (X, y) in enumerate(dataloader):
        if i >= max_iter:
            break
            
        loss = 0
        y = y.type(torch.int64)
        multidecoder.zero_grad()
        
        positive_mask = y == 1
        negative_mask = ~positive_mask
        X_pos = X[positive_mask]
        X_neg = X[negative_mask]
        
        y_pos = y[positive_mask]
        y_neg = y[negative_mask]
        
        mlp_true_out, mlp_false_out, pos_output, neg_output = multidecoder.forward(X, y, True)
        loss += nll(mlp_true_out, y)
        oh_y = seq_one_hot_encode(y, 2)
        oh_y = oh_y.type(torch.float32)
        loss += kld(mlp_false_out, oh_y)
        if X_pos.size(0) > 0:
            X_pos = X_pos.argmax(axis=2)
            for target_sent, predicted_sent in zip(X_pos, pos_output):    
                predicted_sent = torch.cat(predicted_sent).view(-1, d)
                loss += nll(predicted_sent, target_sent)
                    
        if X_neg.size(0) > 0:
            X_neg = X_neg.argmax(axis=2)
            for target_sent, predicted_sent in zip(X_neg, neg_output):
                predicted_sent = torch.cat(predicted_sent).view(-1, d)
                loss += nll(predicted_sent, target_sent)
                    
        loss.backward()
        multidecoder.opt_step()
        print(loss.item())

In [310]:
train(multidecoder, loader, dataset)

289.7613220214844
277.7751159667969
280.43463134765625
276.170166015625
275.0789794921875
273.52056884765625
274.1607971191406
271.34466552734375
269.54473876953125
269.3447265625
264.5732116699219
262.3013916015625
265.3811340332031
257.9037780761719
263.032470703125
256.74859619140625
255.5208740234375
257.49884033203125
250.32980346679688
250.71661376953125
246.67308044433594
246.31976318359375
241.7596893310547
237.9642333984375
238.8245391845703
229.4404296875
230.3970489501953
225.7128448486328
217.74765014648438
218.75592041015625
210.05630493164062
202.6265106201172
200.57308959960938
190.33163452148438
188.17918395996094
183.77688598632812
186.59107971191406
174.8270263671875
167.34835815429688
165.73011779785156
165.13507080078125
163.8677215576172
166.7075653076172
151.84718322753906
156.5559539794922
158.12347412109375
152.97943115234375
144.72132873535156
146.49191284179688
138.73182678222656
139.24417114257812
126.36003875732422
129.1744842529297
125.14354705810547
127.64

KeyboardInterrupt: 

In [314]:
for x, y in loader:
    break

In [315]:
mlp_t_out, mlp_f_out, pos_output, neg_output = multidecoder.forward(x, y, False)

In [317]:
pos_output

[[tensor([ -9.6577, -10.0166,  -9.0906,  ...,  -9.4352, -10.0754,  -9.9089],
         grad_fn=<SelectBackward>),
  tensor([-10.1815, -10.8787,  -8.8608,  ..., -10.6048, -10.8379, -10.7827],
         grad_fn=<SelectBackward>),
  tensor([-10.1815, -10.8787,  -8.8608,  ..., -10.6048, -10.8379, -10.7827],
         grad_fn=<SelectBackward>),
  tensor([-10.1815, -10.8787,  -8.8608,  ..., -10.6048, -10.8379, -10.7827],
         grad_fn=<SelectBackward>),
  tensor([-10.1815, -10.8787,  -8.8608,  ..., -10.6048, -10.8379, -10.7827],
         grad_fn=<SelectBackward>),
  tensor([-10.1815, -10.8787,  -8.8608,  ..., -10.6048, -10.8379, -10.7827],
         grad_fn=<SelectBackward>),
  tensor([-10.1815, -10.8787,  -8.8608,  ..., -10.6048, -10.8379, -10.7827],
         grad_fn=<SelectBackward>),
  tensor([-10.1815, -10.8787,  -8.8608,  ..., -10.6048, -10.8379, -10.7827],
         grad_fn=<SelectBackward>),
  tensor([-10.1815, -10.8787,  -8.8608,  ..., -10.6048, -10.8379, -10.7827],
         grad_fn=<S

In [318]:
[t.argmax() for t in pos_output[0]]

[tensor(5044),
 tensor(5429),
 tensor(5429),
 tensor(5429),
 tensor(5429),
 tensor(5429),
 tensor(5429),
 tensor(5429),
 tensor(5429),
 tensor(5429),
 tensor(5429),
 tensor(5429),
 tensor(5429),
 tensor(5429),
 tensor(5429),
 tensor(5429),
 tensor(5429),
 tensor(5429),
 tensor(5429),
 tensor(5429),
 tensor(5429),
 tensor(5429),
 tensor(5429),
 tensor(5429),
 tensor(5429),
 tensor(5429),
 tensor(5429),
 tensor(5429),
 tensor(5429),
 tensor(5429),
 tensor(5429),
 tensor(5429),
 tensor(5429),
 tensor(5429),
 tensor(5429),
 tensor(5429),
 tensor(5429),
 tensor(5429),
 tensor(5429),
 tensor(5429)]

In [319]:
dataset.idx2word[5044]

'$SOS$'

In [320]:
dataset.idx2word[5429]

'$EOS$'