In [1]:
import os

In [2]:
import numpy as np
import pandas as pd

In [3]:
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.stem.porter import PorterStemmer

In [4]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader

In [5]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

### Model implementation

In [6]:
class EncoderNN(nn.Module):
    """Single-layer GRU encoder over batch-first sequences.

    Args:
        input_size: Size of each input feature vector.
        hidden_size: Size of the GRU hidden state.
        device: ``torch.device`` the module and its initial hidden states
            are placed on (CPU or CUDA).
    """

    def __init__(self, input_size, hidden_size, device):
        super(EncoderNN, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.device = device

        self.gru = nn.GRU(input_size, hidden_size, batch_first=True)
        # `.to()` accepts both CPU and CUDA devices; `.cuda(device)` raises
        # when handed the CPU device, which is the notebook's fallback.
        self.to(device)

    def forward(self, input_seq, hidden):
        """Run the GRU over ``input_seq`` starting from ``hidden``.

        Args:
            input_seq: Tensor of shape (batch, seq_len, input_size).
            hidden: Initial hidden state of shape (1, batch, hidden_size).

        Returns:
            ``(output, hidden)``: the per-step GRU outputs and the final
            hidden state.
        """
        output, hidden = self.gru(input_seq, hidden)

        return output, hidden

    def init_hidden(self, batch_size):
        """Return a randomly initialised hidden state of shape (1, batch_size, hidden_size)."""
        return torch.randn(1, batch_size, self.hidden_size, device=self.device)

In [7]:
class MLPClassifier(nn.Module):
    """Two-hidden-layer perceptron that returns class log-probabilities.

    Args:
        input_size: Size of each input feature vector.
        h0_size: Width of the first hidden layer.
        h1_size: Width of the second hidden layer.
        out_size: Number of classes.
    """

    def __init__(self, input_size, h0_size, h1_size, out_size):
        super(MLPClassifier, self).__init__()

        self.fcl1 = nn.Linear(input_size, h0_size)
        self.fcl2 = nn.Linear(h0_size, h1_size)
        self.out = nn.Linear(h1_size, out_size)
        self.lsoftmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Map a (batch, input_size) tensor to (batch, out_size) log-probabilities."""
        # torch.sigmoid replaces the deprecated F.sigmoid.
        x = torch.sigmoid(self.fcl1(x))
        x = torch.sigmoid(self.fcl2(x))
        # The output layer feeds log-softmax with raw logits. Squashing them
        # through a sigmoid first would confine every logit to (0, 1) and cap
        # the achievable class probability (about 0.73 for two classes).
        logits = self.out(x)

        return self.lsoftmax(logits)

In [23]:
class DecoderNN(nn.Module):
    """Single-layer GRU decoder with a linear + log-softmax output head.

    Args:
        input_size: Size of each input feature vector.
        hidden_size: Size of the GRU hidden state.
        output_size: Number of output classes per step.
        device: ``torch.device`` to place the module on. When omitted, CUDA
            is used if available and the CPU otherwise.
    """

    def __init__(self, input_size, hidden_size, output_size, device=None):
        super(DecoderNN, self).__init__()

        # Resolve the default lazily so the class definition does not depend
        # on a notebook-level `device` variable existing beforehand.
        if device is None:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.device = device

        self.gru = nn.GRU(input_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.lsoftmax = nn.LogSoftmax(dim=1)

        # `.to()` accepts both CPU and CUDA devices; `.cuda(device)` raises
        # when handed the CPU device.
        self.to(device)

    def forward(self, x, hidden):
        """Advance the decoder and return per-step log-probabilities.

        Args:
            x: Tensor of shape (batch, steps, input_size).
            hidden: Hidden state of shape (1, batch, hidden_size).

        Returns:
            ``(output, hidden)`` where ``output`` has shape
            (batch * steps, output_size) and holds log-probabilities, and
            ``hidden`` is the updated GRU state.
        """
        output, hidden = self.gru(x, hidden)
        # reshape (rather than view) also handles a non-contiguous GRU output.
        output = output.reshape(-1, self.hidden_size)
        output = self.lsoftmax(self.out(output))

        return output, hidden

    def init_hidden(self, batch_size):
        """Return a randomly initialised hidden state of shape (1, batch_size, hidden_size)."""
        return torch.randn(1, batch_size, self.hidden_size, device=self.device)

In [24]:
# Sentinel tokens added to the vocabulary: start-of-sequence and
# end-of-sequence (the latter is also used to pad sentences).
SOS_TOKEN, EOS_TOKEN = '$SOS$', '$EOS$'
special_tokens = (SOS_TOKEN, EOS_TOKEN)

def seq_one_hot_encode(sentence, dim_size):
    """One-hot encode a 1-D tensor of class indices.

    Args:
        sentence: 1-D integer tensor of indices in ``[0, dim_size)``.
        dim_size: Width of each one-hot row.

    Returns:
        An int8 tensor of shape (len(sentence), dim_size), on the same device
        as ``sentence``, with a single 1 per row.
    """
    n_rows = sentence.size(0)
    # Allocate next to the input instead of on a notebook-level global device.
    target_device = sentence.device
    e_seq = torch.zeros(n_rows, dim_size, dtype=torch.int8, device=target_device)
    rows = torch.arange(n_rows, dtype=torch.int64, device=target_device)
    # Advanced indexing needs integer indices; int64 is always accepted.
    e_seq[rows, sentence.to(torch.int64)] = 1

    return e_seq

def to_float(sentence, dim_size):
    """Return ``sentence`` converted to float32.

    ``dim_size`` is accepted only so the function matches the
    ``transform(data, dim_size)`` call signature; it is not used.
    """
    return sentence.type(torch.float32)

In [103]:
class SentimentMultiDecoder:
    """Shared GRU encoder with two classifier heads and two style-specific decoders.

    The encoder's final hidden state is fed to two MLP classifiers and used
    as the initial hidden state of the decoders. Examples labelled 1 are
    decoded by ``positive_decoder``; all others by ``negative_decoder``.

    Args:
        input_size: Size of each input vector (also the decoder output size).
        encoder_hidden_size: Hidden size shared by the encoder and decoders.
        mlp_h0_size: Width of the classifiers' first hidden layer.
        mlp_h1_size: Width of the classifiers' second hidden layer.
        special_tokens_config: Mapping with a ``'SOS'`` entry of the form
            ``(token, index)``.
        decoder_max_iter: Number of decoding steps when ``fit`` is False.
        device: ``torch.device`` all sub-modules are placed on.
    """

    def __init__(
        self,
        input_size,
        encoder_hidden_size,
        mlp_h0_size,
        mlp_h1_size,
        special_tokens_config,
        decoder_max_iter,
        device
    ):
        self.input_size = input_size
        self.encoder_hidden_size = encoder_hidden_size
        self.mlp_h0_size = mlp_h0_size
        self.mlp_h1_size = mlp_h1_size
        self.special_tokens_config = special_tokens_config
        self.decoder_max_iter = decoder_max_iter
        self.device = device

        self.encoder = EncoderNN(input_size, encoder_hidden_size, device)
        # The classifier heads must live on the same device as the encoder
        # output they consume, so move them explicitly.
        self.mlp_true = MLPClassifier(encoder_hidden_size, mlp_h0_size, mlp_h1_size, 2).to(device)
        self.mlp_false = MLPClassifier(encoder_hidden_size, mlp_h0_size, mlp_h1_size, 2).to(device)
        self.positive_decoder = DecoderNN(input_size, encoder_hidden_size, input_size, device)
        self.negative_decoder = DecoderNN(input_size, encoder_hidden_size, input_size, device)

        # One Adam optimizer per sub-module; see zero_grad() / opt_step().
        self.optimizers = [
            optim.Adam(self.encoder.parameters()),
            optim.Adam(self.mlp_true.parameters()),
            optim.Adam(self.mlp_false.parameters()),
            optim.Adam(self.positive_decoder.parameters()),
            optim.Adam(self.negative_decoder.parameters()),
        ]

    def forward(self, X, y, fit=True, teacher_forcing=False):
        """Encode a batch, classify it, and decode each example with its style's decoder.

        Args:
            X: Tensor of shape (batch, seq_len, input_size).
            y: Tensor of shape (batch,); entries equal to 1 select the
                positive decoder, everything else the negative decoder.
            fit: If True decode ``seq_len`` steps, otherwise
                ``decoder_max_iter`` steps.
            teacher_forcing: If True feed ground-truth vectors from ``X`` as
                the next decoder input.

        Returns:
            ``(mlp_true_out, mlp_false_out, output_positive, output_negative)``.
            Each decoder output is a (n, steps, input_size) tensor, or an
            empty list when the batch has no example of that style.
        """
        batch_size = X.size(0)

        hidden_init = self.encoder.init_hidden(batch_size)
        e_output, e_hidden = self.encoder.forward(X, hidden_init)
        # Flatten the (1, batch, hidden) state into one row per example.
        content_batch = e_hidden.view(batch_size, -1)

        mlp_true_out = self.mlp_true.forward(content_batch)
        mlp_false_out = self.mlp_false.forward(content_batch)

        hidden_init = e_hidden
        output_init = self._get_decoder_init(batch_size, self.input_size)
        positive_mask = y == 1
        negative_mask = ~positive_mask

        X_positive = X[positive_mask]
        y_positive = y[positive_mask]
        hidden_init_positive = hidden_init[:, positive_mask, :]
        output_init_positive = output_init[positive_mask]

        X_negative = X[negative_mask]
        y_negative = y[negative_mask]
        hidden_init_negative = hidden_init[:, negative_mask, :]
        output_init_negative = output_init[negative_mask]
        output_positive = []
        output_negative = []

        if X_positive.size(0) > 0:
            output_positive = self.decoder_forward(
                self.positive_decoder,
                X_positive,
                y_positive,
                hidden_init_positive,
                output_init_positive,
                fit,
                teacher_forcing
            )

        if X_negative.size(0) > 0:
            output_negative = self.decoder_forward(
                self.negative_decoder,
                X_negative,
                y_negative,
                hidden_init_negative,
                output_init_negative,
                fit,
                teacher_forcing
            )

        return mlp_true_out, mlp_false_out, output_positive, output_negative

    def decoder_forward(self, decoder, X, y, hidden_init, output_init, fit, teacher_forcing=False):
        """Unroll ``decoder`` step by step and stack its outputs.

        Args:
            decoder: Object whose ``forward(x, hidden)`` returns
                ``(output, hidden)``.
            X: Tensor of shape (batch, seq_len, input_size).
            y: Labels for the batch (accepted for symmetry; not used here).
            hidden_init: Initial decoder hidden state.
            output_init: First decoder input, shape (batch, 1, input_size).
            fit: If True run ``seq_len`` steps, otherwise
                ``self.decoder_max_iter`` steps.
            teacher_forcing: If True, the next decoder input is the
                ground-truth vector ``X[:, step]`` for as long as ``X`` has
                one; afterwards the decoder's own output is fed back.

        Returns:
            Tensor of shape (batch, steps, input_size) with the decoder
            output of every step.
        """
        batch_size, seq_len, vocab_size = X.size(0), X.size(1), X.size(2)
        n_steps = seq_len if fit else self.decoder_max_iter

        output = output_init
        hidden = hidden_init

        decoder_seq_output = []
        for step in range(n_steps):
            output, hidden = decoder.forward(output, hidden)

            output = output.view(batch_size, 1, vocab_size)
            decoder_seq_output.append(output)
            # Guarding on seq_len keeps inference (which may run longer than
            # the input) from indexing past the end of X.
            if teacher_forcing and step < seq_len:
                output = X[:, step:step + 1, :]

        if not decoder_seq_output:
            # Zero decoding steps: return an empty sequence instead of
            # letting torch.cat fail on an empty list.
            return X.new_zeros((batch_size, 0, vocab_size))

        return torch.cat(decoder_seq_output, dim=1)

    def zero_grad(self):
        """Clear the gradients held by every sub-module optimizer."""
        for opt in self.optimizers:
            opt.zero_grad()

    def opt_step(self):
        """Apply one optimization step with every sub-module optimizer."""
        for opt in self.optimizers:
            opt.step()

    def _get_decoder_init(self, batch_size, d):
        """Build the first decoder input: one-hot SOS vectors.

        Returns:
            A float32 tensor of shape (batch_size, 1, d) on ``self.device``
            with a 1 at the SOS index of every row.
        """
        _, sos_idx = self.special_tokens_config['SOS']
        output_init = torch.zeros(batch_size, 1, d, dtype=torch.float32, device=self.device)
        output_init[:, 0, sos_idx] = 1.0

        return output_init

In [15]:
class SentimentDataset(Dataset):
    """Two-class sentence dataset built from a negative and a positive text file.

    Every line of each file is one example. Lines are tokenized, every token
    is normalized, sentences are padded with ``EOS_TOKEN`` to a common length
    and mapped to vocabulary indices. Negative examples get style 0 and
    positive examples style 1.

    Args:
        negative_examples_path: Path of the file with negative examples.
        positive_examples_path: Path of the file with positive examples.
        tokenizer: Object with ``tokenize(text) -> list of tokens``.
        normalizer: Object with ``stem(word) -> str`` applied to every token.
        device: ``torch.device`` the index and style tensors are stored on.
        transforms: Optional list of callables ``f(data, dict_size)`` applied
            in order by ``__getitem__``.

    Attributes set here: ``dictionary``, ``dict_size``, ``word2idx``,
    ``idx2word``, ``data``, ``styles``, ``transforms``, ``device``.
    """

    def __init__(
        self,
        negative_examples_path,
        positive_examples_path,
        tokenizer,
        normalizer,
        device,
        transforms=None,
    ):
        super(SentimentDataset, self).__init__()

        # Read + tokenize + normalize. The `normalizer` argument is used here
        # rather than a notebook-level stemmer object.
        negative_data = self._read_corpus(negative_examples_path, tokenizer, normalizer)
        positive_data = self._read_corpus(positive_examples_path, tokenizer, normalizer)

        # Vocabulary: every corpus token plus the special tokens.
        negative_dictionary = self._get_dictionary(negative_data)
        positive_dictionary = self._get_dictionary(positive_data)

        self.dictionary = negative_dictionary.union(positive_dictionary)
        d = len(self.dictionary)

        for token in special_tokens:
            self.dictionary.add(token)
        # Fails if a special token already occurs in the corpus.
        assert d + len(special_tokens) == len(self.dictionary)
        self.dict_size = len(self.dictionary)

        # Sort before numbering: set iteration order varies between
        # interpreter runs, which would make token indices irreproducible.
        self.word2idx = {
            word: idx for idx, word in enumerate(sorted(self.dictionary))
        }
        self.idx2word = {value: key for key, value in self.word2idx.items()}

        # Targets: 0 for every negative example, 1 for every positive one.
        self.styles = np.array([0] * len(negative_data) + [1] * len(positive_data))

        data = negative_data + positive_data
        assert len(data) == len(self.styles)

        # Pad to (longest sentence + 1) so every sentence ends with EOS.
        self._align_corpus(data, EOS_TOKEN, self._get_corpus_max_sent_len(data) + 1)
        data = self._map_corpus(data, self.word2idx)
        self.data = torch.tensor(data, dtype=torch.int64, device=device)
        self.styles = torch.tensor(self.styles, dtype=torch.int8, device=device)
        if transforms is None:
            transforms = []
        self.transforms = transforms
        self.device = device

    def _read_corpus(self, path, tokenizer, normalizer):
        """Read ``path`` line by line and return a list of normalized token lists."""
        with open(path) as input_stream:
            lines = input_stream.readlines()

        return [
            [normalizer.stem(word) for word in tokenizer.tokenize(text)]
            for text in lines
        ]

    def _get_dictionary(self, corpus):
        """Return the set of distinct tokens appearing in ``corpus``."""
        dictionary = set()
        for text in corpus:
            # In-place update avoids rebuilding the set for every sentence.
            dictionary.update(text)

        return dictionary

    def _get_corpus_max_sent_len(self, corpus):
        """Return the length of the longest sentence (0 for an empty corpus)."""
        return max((len(text) for text in corpus), default=0)

    def _align_corpus(self, corpus, align_token, max_length):
        """Pad every sentence of ``corpus`` in place with ``align_token`` up to ``max_length``."""
        for text in corpus:
            text += [align_token] * max(0, max_length - len(text))

    def _map_corpus(self, corpus, mapper):
        """Return ``corpus`` with every token replaced by ``mapper[token]``."""
        return [
            [mapper[word] for word in text]
            for text in corpus
        ]

    def __getitem__(self, index):
        """Return ``(transformed data, style)`` for the example at ``index``."""
        t_data = self.data[index]
        for transform in self.transforms:
            t_data = transform(t_data, self.dict_size)

        return t_data, self.styles[index]

    def __len__(self):
        """Return the number of examples."""
        return self.data.size(0)

---

### Preprocessing

In [16]:
# NLTK helpers used for preprocessing: `tokenizer` is passed to
# SentimentDataset as its tokenizer and `stemmer` as its normalizer.
tokenizer = TweetTokenizer()
stemmer = PorterStemmer()

In [17]:
# Location of the raw corpora: one file per sentiment class under ./data.
data_path = './data'
negative_examples_filename, positive_examples_filename = (
    'sentiment_negative.raw',
    'sentiment_positive.raw',
)

negative_examples_path, positive_examples_path = (
    os.path.join(data_path, filename)
    for filename in (negative_examples_filename, positive_examples_filename)
)

In [19]:
# Per-item transforms, applied in order by SentimentDataset.__getitem__:
# token indices -> one-hot rows -> float32.
seq_transforms = [seq_one_hot_encode, to_float]

dataset = SentimentDataset(
    negative_examples_path=negative_examples_path,
    positive_examples_path=positive_examples_path,
    tokenizer=tokenizer,
    normalizer=stemmer,
    device=device,
    transforms=seq_transforms,
)

In [21]:
# Mini-batch size for the DataLoader, and the vocabulary size `d`, which is
# also the width of every one-hot token vector.
batch_size = 32
d = dataset.dict_size

In [20]:
# Shuffled mini-batch iterator over the dataset; needs `batch_size` to be
# defined first.
loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

---

### Autoencoder test

In [None]:
# Each entry maps a role to a (token string, vocabulary index) pair.
special_tokens_config = {
    'SOS': (SOS_TOKEN, dataset.word2idx[SOS_TOKEN]),
    'EOS': (EOS_TOKEN, dataset.word2idx[EOS_TOKEN]),
}

def get_decoder_init(batch_size, d):
    """Build the first decoder input: one float one-hot SOS vector per example.

    Returns a tensor shaped (batch_size, 1, d).
    """
    _, sos_idx = special_tokens_config['SOS']

    # One SOS index per example, expanded to one-hot rows of width d.
    sos_ids = torch.tensor([sos_idx] * batch_size, device=device)
    one_hot = seq_one_hot_encode(sos_ids, d).view(batch_size, 1, -1)

    return to_float(one_hot, d)

In [60]:
def autoencoder_forward(
    encoder,
    decoder,
    X,
):
    """Encode ``X`` and decode it again with teacher forcing.

    The decoder starts from the encoder's final hidden state and an SOS
    vector. After every step its next input is the ground-truth vector
    ``X[:, step]``.

    Returns a tensor shaped (batch, seq_len, input_size) holding the decoder
    output of every step.
    """
    batch_size, seq_len, input_size = X.size(0), X.size(1), X.size(2)

    _, state = encoder.forward(X, encoder.init_hidden(batch_size))
    step_input = get_decoder_init(batch_size, input_size)

    steps = []
    for pos in range(seq_len):
        step_output, state = decoder.forward(step_input, state)
        steps.append(step_output.view(batch_size, 1, input_size))
        # Teacher forcing: the next input is the true token at this position.
        step_input = X[:, pos:pos + 1, :]

    return torch.cat(steps, dim=1)

In [63]:
def autoencoder_train(
    encoder,
    encoder_opt,
    decoder,
    decoder_opt,
    loader,
    criterion
):
    """Run one pass over ``loader``, training the encoder/decoder pair to reconstruct its input.

    The batch loss is the sum of ``criterion`` over the sentences in the
    batch. Every 20th iteration the loss divided by the batch size is printed.
    """
    optimizers = (encoder_opt, decoder_opt)

    for iter_num, (X, y) in enumerate(loader):
        for opt in optimizers:
            opt.zero_grad()

        decoder_output = autoencoder_forward(encoder, decoder, X)
        # Targets are the token indices recovered from the one-hot input.
        idx_X = X.argmax(dim=2)
        loss = sum(
            criterion(predicted, target)
            for predicted, target in zip(decoder_output, idx_X)
        )

        loss.backward()
        for opt in optimizers:
            opt.step()

        if iter_num % 20 == 0:
            print(iter_num, 'Loss: ', loss.item() / X.size(0))

In [84]:
# Autoencoder dimensions: inputs and outputs are both one-hot vectors over
# the vocabulary, so their sizes are equal.
input_size = d
encoder_hidden_size = 16
output_size = input_size

In [85]:
# Encoder and decoder share the hidden size because the encoder's final
# hidden state is used as the decoder's initial hidden state.
encoder = EncoderNN(
    input_size,
    encoder_hidden_size,
    device
)

decoder = DecoderNN(
    input_size,
    encoder_hidden_size,
    output_size,
    device
)

# One Adam optimizer per module.
encoder_optimizer = optim.Adam(encoder.parameters(), lr=0.01)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=0.01)

# NLLLoss pairs with the decoder's LogSoftmax output.
nllloss = nn.NLLLoss()

In [86]:
# Train for a single pass over the loader; the loss is printed every 20 iterations.
autoencoder_train(encoder, encoder_optimizer, decoder, decoder_optimizer, loader, nllloss)

0 Loss:  8.699359893798828
20 Loss:  4.352877140045166
40 Loss:  3.438206195831299
60 Loss:  2.701504707336426
80 Loss:  2.4488210678100586
100 Loss:  2.742182493209839
120 Loss:  2.2706680297851562
140 Loss:  2.5827114582061768
160 Loss:  2.466979503631592
180 Loss:  2.3361964225769043
200 Loss:  2.576852560043335
220 Loss:  2.245347023010254
240 Loss:  2.2665770053863525
260 Loss:  2.455225944519043
280 Loss:  2.3000247478485107
300 Loss:  2.3237955570220947
320 Loss:  2.243147850036621
340 Loss:  1.9281333684921265
360 Loss:  2.0472254753112793
380 Loss:  1.6999162435531616
400 Loss:  2.2242350578308105
420 Loss:  2.0805490016937256
440 Loss:  2.1297738552093506
460 Loss:  2.0762240886688232
480 Loss:  2.013298511505127
500 Loss:  2.3340320587158203
520 Loss:  1.8765000104904175
540 Loss:  2.1659328937530518
560 Loss:  2.182204246520996
580 Loss:  2.207395076751709
600 Loss:  2.0523924827575684
620 Loss:  2.104949712753296
640 Loss:  2.722355604171753
660 Loss:  2.0577659606933594
6

KeyboardInterrupt: 

In [87]:
# Pull a single (X, y) batch from the loader for manual inspection; the loop
# exits after its first iteration.
for X, y in loader:
    break

In [88]:
# Reconstruct the sampled batch with the trained autoencoder.
decoder_output = autoencoder_forward(encoder, decoder, X)

In [100]:
# Pick one sentence of the batch and keep its per-step decoder outputs.
test_sent_idx = 10
sent = decoder_output[test_sent_idx]

In [101]:
# Original sentence: argmax over the one-hot axis recovers token indices,
# which idx2word maps back to tokens.
print([dataset.idx2word[int(t)] for t in X.argmax(dim=2)[test_sent_idx]])

['she', 'wa', 'rude', 'and', 'useless', '.', '$EOS$', '$EOS$', '$EOS$', '$EOS$', '$EOS$', '$EOS$', '$EOS$', '$EOS$', '$EOS$', '$EOS$', '$EOS$', '$EOS$', '$EOS$', '$EOS$', '$EOS$']


In [102]:
# Reconstruction: the highest-scoring token at every decoding step.
print([dataset.idx2word[int(t)] for t in sent.argmax(dim=1)])

['we', 'wa', 'veri', 'and', 'the', '.', '$EOS$', '$EOS$', '$EOS$', '$EOS$', '$EOS$', '$EOS$', '$EOS$', '$EOS$', '$EOS$', '$EOS$', '$EOS$', '$EOS$', '$EOS$', '$EOS$', '$EOS$']


In [82]:
# Look up the token stored at vocabulary index 777.
dataset.idx2word[777]

'the'

---

### Experiments

In [None]:
# Hyperparameters for the multi-decoder experiment.
dict_size = dataset.dict_size
encoder_hidden_size = 64
mlp_h0_size = 64
mlp_h1_size = 32
# Each entry maps a role to a (token string, vocabulary index) pair.
special_tokens_config = {
    'SOS': (SOS_TOKEN, dataset.word2idx[SOS_TOKEN]),
    'EOS': (EOS_TOKEN, dataset.word2idx[EOS_TOKEN]),
}
# Number of decoding steps SentimentMultiDecoder runs when fit=False.
decoder_max_iter = 40

In [None]:
# Build the model: shared encoder, two classifier heads, and one decoder per
# sentiment class.
multidecoder = SentimentMultiDecoder(
    dict_size,
    encoder_hidden_size,
    mlp_h0_size,
    mlp_h1_size,
    special_tokens_config,
    decoder_max_iter,
    device
)

In [None]:
def train(
    multidecoder,
    dataloader,
    dataset,
    max_iter=10000
):
    """Train ``multidecoder`` for one pass over ``dataloader`` (at most ``max_iter`` batches).

    The per-batch loss is the sum of:
      * NLL of the ``mlp_true`` head against the style labels,
      * KL divergence between the ``mlp_false`` head and the one-hot labels,
      * NLL reconstruction loss of every sentence, taken from the decoder
        that matches its style.

    The loss value is printed after every batch.

    Args:
        multidecoder: Model exposing ``zero_grad()``, ``opt_step()`` and
            ``forward(X, y, fit)``.
        dataloader: Iterable of ``(X, y)`` batches; ``X`` is
            (batch, seq_len, vocab) one-hot and ``y`` holds 0/1 labels.
        dataset: Unused; kept so existing calls keep working.
        max_iter: Maximum number of batches to process.
    """
    nll = nn.NLLLoss()
    # 'batchmean' matches the mathematical definition of KL divergence; the
    # default 'mean' averages over every element instead of over the batch.
    kld = nn.KLDivLoss(reduction='batchmean')
    for i, (X, y) in enumerate(dataloader):
        if i >= max_iter:
            break

        # NLLLoss and one_hot both require int64 class indices.
        y = y.type(torch.int64)
        multidecoder.zero_grad()

        positive_mask = y == 1
        X_pos = X[positive_mask]
        X_neg = X[~positive_mask]

        mlp_true_out, mlp_false_out, pos_output, neg_output = multidecoder.forward(X, y, True)

        loss = nll(mlp_true_out, y)
        oh_y = F.one_hot(y, num_classes=2).type(torch.float32)
        loss = loss + kld(mlp_false_out, oh_y)

        # Reconstruction targets are the token indices recovered from the
        # one-hot input; each style is scored against its own decoder.
        if X_pos.size(0) > 0:
            X_pos = X_pos.argmax(dim=2)
            for target_sent, predicted_sent in zip(X_pos, pos_output):
                loss = loss + nll(predicted_sent, target_sent)

        if X_neg.size(0) > 0:
            X_neg = X_neg.argmax(dim=2)
            for target_sent, predicted_sent in zip(X_neg, neg_output):
                loss = loss + nll(predicted_sent, target_sent)

        loss.backward()
        multidecoder.opt_step()
        print(loss.item())

In [None]:
# Train the multi-decoder model; the loss is printed after every batch.
train(multidecoder, loader, dataset)

In [None]:
# Pull a single (x, y) batch from the loader for manual inspection; the loop
# exits after its first iteration.
for x, y in loader:
    break

In [None]:
# Inference pass (fit=False): the decoders run for decoder_max_iter steps
# instead of the input length.
mlp_t_out, mlp_f_out, pos_output, neg_output = multidecoder.forward(x, y, False)

In [None]:
# Style label of the first example in the sampled batch.
y[0]

In [None]:
# Positive-decoder output for the batch (an empty list when the batch has no
# positive example).
pos_output

In [None]:
# Highest-scoring token index at every decoding step for the first negative
# example; raises IndexError when the batch has no negative example.
[t.argmax() for t in neg_output[0]]

In [None]:
# Look up the token stored at vocabulary index 5044.
dataset.idx2word[5044]

In [None]:
# Look up the token stored at vocabulary index 5429.
dataset.idx2word[5429]