In [34]:
import torch
import time
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils import rnn
from ast import literal_eval
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from gensim.models import KeyedVectors
from gensim.models.keyedvectors import Word2VecKeyedVectors
from sklearn.model_selection import train_test_split

In [2]:
# Load the preprocessed reviews. The `title`, `pos` and `neg` columns hold token
# lists serialised as strings (they are parsed later with `literal_eval`).
df = pd.read_csv("data/odesa_reviews_preprocessed.csv")

In [3]:
# Preview the first rows to check the column layout.
df.head()

Unnamed: 0.1,Unnamed: 0,title,pos,neg,ratingValue,bestRating,rate
0,0,"['Ідеально', 'для', 'ділової', 'поїздки']","['Ідеально', 'для', 'ділової', 'поїздки', 'Гос...",['Nan'],10.0,10.0,1
1,1,"['Затишний', 'чистий', 'номер', 'з', 'усіма', ...","['Затишний', 'чистий', 'номер', 'з', 'усіма', ...","['При', 'бронюванні', 'вказала', 'час', 'прибу...",9.2,10.0,1
2,2,"['Все', 'сподобалося', 'Рекомендую']","['Чисто', 'тихо', 'комфортно', 'Зустрів', 'і',...","['На', 'барі', 'кава', 'тільки', '3', 'в', '1'...",9.6,10.0,1
3,3,"['Зручне', 'розташування', 'чудовий', 'вигляд'...","['Зручне', 'розташування', 'чудовий', 'вигляд'...",['Nan'],10.0,10.0,1
4,4,"['Все', 'чудово', '9,9', 'балів']","['Нові', 'апартаменти', 'на', 'останньому', 'п...","['Немає', 'терміналу', 'для', 'оплати', 'креди...",10.0,10.0,1


In [4]:
# Location of the pretrained word vectors (300-dimensional, judging by the file name).
embeddings_file = "./embeddings/ubercorpus.cased.tokenized.word2vec.300d"

In [5]:
# Read the word2vec-format file into a gensim KeyedVectors object.
word2vec = KeyedVectors.load_word2vec_format(embeddings_file)

In [6]:
def create_word2idx_and_embeddings(word2vec):
    """Build a word -> row-index lookup and the matching embedding matrix.

    Args:
        word2vec: a gensim ``KeyedVectors``-like object. It must expose
            ``vector_size``, support ``word2vec[word]`` lookups and provide its
            vocabulary either as ``key_to_index`` (gensim >= 4) or as ``vocab``
            (gensim < 4).

    Returns:
        tuple: ``(word2idx, word_embeddings)``. ``word2idx`` maps every
        vocabulary word to an index assigned in vocabulary iteration order.
        ``word_embeddings`` is a ``torch.float32`` tensor of shape
        ``[len(vocabulary), vector_size]`` whose row ``word2idx[w]`` holds the
        vector of ``w``.
    """
    # gensim 4 removed `.vocab`, so look for the newer attribute first and only
    # fall back to the legacy one when it is absent.
    vocabulary = getattr(word2vec, "key_to_index", None)
    if vocabulary is None:
        vocabulary = word2vec.vocab

    word2idx = {word: idx for idx, word in enumerate(vocabulary)}

    # Allocate float32 up front: the result is float32 anyway, and a float64
    # staging matrix would double the memory needed for a large vocabulary.
    word_embeddings = np.zeros((len(word2idx), word2vec.vector_size), dtype=np.float32)
    for word, idx in word2idx.items():
        word_embeddings[idx] = word2vec[word]

    # The array is local to this function, so sharing its memory with the
    # tensor is safe and avoids one more full copy.
    return word2idx, torch.from_numpy(word_embeddings)

In [7]:
# Build the vocabulary index and the embedding tensor from the loaded vectors.
word2idx, word_embeddings = create_word2idx_and_embeddings(word2vec)

In [8]:
# Sanity check: an absent review part is stored as the one-token list ['Nan'].
literal_eval(df.iloc[0]['neg'])[0] == 'Nan'

True

In [9]:
def _parse_token_cell(cell, missing_marker='Nan'):
    """Turn one `title`/`pos`/`neg` cell into a list of tokens.

    A string cell is parsed with ``literal_eval``; a list/tuple is copied;
    anything else (e.g. a float NaN left by pandas for an empty cell) is
    treated as "no text". A token list whose first element equals
    ``missing_marker`` also means "no text" and yields ``[]``.
    """
    if isinstance(cell, str):
        tokens = literal_eval(cell)
    elif isinstance(cell, (list, tuple)):
        tokens = list(cell)
    else:
        return []

    if len(tokens) > 0 and tokens[0] == missing_marker:
        return []
    return tokens


def get_features_and_target(data):
    """Extract token sequences and class labels from the review frame.

    Args:
        data: DataFrame with the columns ``title``, ``pos``, ``neg`` (token
            lists serialised as strings) and ``rate`` (numeric label).

    Returns:
        tuple: ``(X, y)``. Each element of ``X`` is the concatenation
        ``title + pos + neg`` of one review, with missing parts removed.
        Reviews that end up with no tokens at all are skipped. ``y`` holds
        ``rate + 1`` for every kept review, in the same order as ``X``.
    """
    X = []
    y = []

    # Iterate the four columns in lock-step instead of materialising a row
    # Series via `data.iloc[i]` four times per review.
    rows = zip(data['title'], data['pos'], data['neg'], data['rate'])
    for title, pos, neg, rate in rows:
        tokens = _parse_token_cell(title) + _parse_token_cell(pos) + _parse_token_cell(neg)
        if len(tokens) == 0:
            continue

        X.append(tokens)
        # Shift the label by one; presumably this maps rates -1/0/1 onto the
        # class ids 0/1/2 expected by the loss -- confirm against the data.
        y.append(rate + 1)

    return X, y

In [13]:
# Look at a few more rows, including one with a non-positive rating.
df[5:10]

Unnamed: 0.1,Unnamed: 0,title,pos,neg,ratingValue,bestRating,rate
5,5,['Все'],['Все'],['Nan'],10.0,10.0,1
6,6,"['Дуже', 'привітний', 'власник']","['Дуже', 'привітний', 'власник', 'Можна', 'бут...",['Nan'],9.2,10.0,1
7,7,"['Уважний', 'та', 'приємний', 'господар']","['Уважний', 'та', 'приємний', 'господар', 'Зру...",['Nan'],10.0,10.0,1
8,8,"['Буду', 'рекомендувати', 'ці', 'апартаменти',...","['Сподобалось', 'розміщення', 'чистота', 'номе...","['Нема', 'зауважень']",10.0,10.0,1
9,9,"['Хороша', 'квартира', 'не', 'дуже', 'приємний...","['Хороший', 'ремонт', 'красивий', 'вид', 'з', ...","['Були', 'проблеми', 'з', 'заселенням', 'Госпо...",8.8,10.0,0


In [10]:
# Token sequences and shifted labels for the whole data set.
X, y = get_features_and_target(df)

In [15]:
def seq_to_idxs(seq, mapping):
    """Convert a token sequence into a 1-D tensor of vocabulary indices.

    Args:
        seq: iterable of tokens.
        mapping: dict from token to integer index.

    Returns:
        torch.Tensor: ``torch.long`` tensor holding ``mapping[token]`` for every
        token of ``seq`` found in ``mapping``, in order. Unknown tokens are
        dropped. If no token is known the result is an empty tensor of shape
        ``(0,)`` (``torch.stack`` on an empty list would raise instead).
    """
    # Collect plain ints and build a single tensor, rather than allocating one
    # 0-d tensor per token and stacking them.
    idxs = [mapping[el] for el in seq if el in mapping]
    return torch.tensor(idxs, dtype=torch.long)



In [18]:
# Try the index conversion on the first review.
seq_to_idxs(X[0], word2idx)

tensor([168077,     14,  17873,   4318, 168077,     14,  17873,   4318,  25320,
         11908,    304,  36298,   2091, 115031,    364,   8524,   1708,      4,
          3879,      1,   3417,   1557,   3443,  39801,  21722,  19767,   2710,
        103133,  21526,    110,  37735,  90110,  19697, 207722,    233,    969,
          1030])

In [20]:
# Raw tokens of the first review, for comparison with its index tensor.
seq = X[0]
seq

['Ідеально',
 'для',
 'ділової',
 'поїздки',
 'Ідеально',
 'для',
 'ділової',
 'поїздки',
 'Господар',
 'зустрів',
 'о',
 'шостій',
 'ранку',
 'привітний',
 '22',
 'поверх',
 'вид',
 'з',
 'вікна',
 'на',
 'море',
 'Все',
 'поряд',
 'супермаркет',
 'розваги',
 'ресторани',
 'Було',
 'пізнє',
 'виселення',
 'без',
 'доплати',
 'Обов',
 'язково',
 'зупинюся',
 'тут',
 'наступного',
 'разу']

In [22]:
# Build a toy batch from the first two reviews.
inputs_batch = [seq_to_idxs(seq, word2idx) for seq in X[:2]]
inputs_batch

[tensor([168077,     14,  17873,   4318, 168077,     14,  17873,   4318,  25320,
          11908,    304,  36298,   2091, 115031,    364,   8524,   1708,      4,
           3879,      1,   3417,   1557,   3443,  39801,  21722,  19767,   2710,
         103133,  21526,    110,  37735,  90110,  19697, 207722,    233,    969,
           1030]),
 tensor([292605,   6225,   2413,      4,   3143, 100935, 292605,   6225,   2413,
              4,   3143, 100935,  64468,  52530,      4,   3879,    611,    346,
         127045,   5995,      8,    438,   9561,  16947,    168, 275763,  31865,
             33,   6693,    304,    148,   2091,      1,      5,   1042,    887,
              9,   3673,     42,     67,      5,    174,    169,     54, 317296,
              6,  16595,  19896,     96,     80,   1964,   9484,  12687,    374,
             10,     16,    186,    304, 283193,  38665,  10455,     80,     54,
             45,  31586,      3,  44386,   9913,      5,    895,  60320,    453,
         

In [23]:
# Pair every sequence with its original position and sort longest-first;
# `rnn.pack_sequence` expects decreasing lengths by default.
order = sorted(enumerate(inputs_batch), key=lambda x: len(x[1]), reverse=True)
order

[(1,
  tensor([292605,   6225,   2413,      4,   3143, 100935, 292605,   6225,   2413,
               4,   3143, 100935,  64468,  52530,      4,   3879,    611,    346,
          127045,   5995,      8,    438,   9561,  16947,    168, 275763,  31865,
              33,   6693,    304,    148,   2091,      1,      5,   1042,    887,
               9,   3673,     42,     67,      5,    174,    169,     54, 317296,
               6,  16595,  19896,     96,     80,   1964,   9484,  12687,    374,
              10,     16,    186,    304, 283193,  38665,  10455,     80,     54,
              45,  31586,      3,  44386,   9913,      5,    895,  60320,    453,
             118,      9,    169,   1230,     55,     78,    346,  85315,   2217,
             166,    374,      6,   2184,    453,     54,      2,  14426,   4061,
            4383,     14,   3447,  14683])),
 (0,
  tensor([168077,     14,  17873,   4318, 168077,     14,  17873,   4318,  25320,
           11908,    304,  36298,   2091, 1

In [24]:
# Reorder the batch using the original positions stored in `order`.
inputs_batch = [inputs_batch[order_[0]] for order_ in order]
inputs_batch

[tensor([292605,   6225,   2413,      4,   3143, 100935, 292605,   6225,   2413,
              4,   3143, 100935,  64468,  52530,      4,   3879,    611,    346,
         127045,   5995,      8,    438,   9561,  16947,    168, 275763,  31865,
             33,   6693,    304,    148,   2091,      1,      5,   1042,    887,
              9,   3673,     42,     67,      5,    174,    169,     54, 317296,
              6,  16595,  19896,     96,     80,   1964,   9484,  12687,    374,
             10,     16,    186,    304, 283193,  38665,  10455,     80,     54,
             45,  31586,      3,  44386,   9913,      5,    895,  60320,    453,
            118,      9,    169,   1230,     55,     78,    346,  85315,   2217,
            166,    374,      6,   2184,    453,     54,      2,  14426,   4061,
           4383,     14,   3447,  14683]),
 tensor([168077,     14,  17873,   4318, 168077,     14,  17873,   4318,  25320,
          11908,    304,  36298,   2091, 115031,    364,   8524,  

In [26]:
# Apply the same permutation to the labels so they stay aligned with the inputs.
targets_batch = torch.tensor([y[:2][order_[0]] for order_ in order])

In [27]:
# Show the reordered labels.
targets_batch

tensor([2, 2])

In [28]:
# Pack the variable-length sequences into a single PackedSequence.
inputs_packed = rnn.pack_sequence(inputs_batch)
inputs_packed

PackedSequence(data=tensor([292605, 168077,   6225,     14,   2413,  17873,      4,   4318,   3143,
        168077, 100935,     14, 292605,  17873,   6225,   4318,   2413,  25320,
             4,  11908,   3143,    304, 100935,  36298,  64468,   2091,  52530,
        115031,      4,    364,   3879,   8524,    611,   1708,    346,      4,
        127045,   3879,   5995,      1,      8,   3417,    438,   1557,   9561,
          3443,  16947,  39801,    168,  21722, 275763,  19767,  31865,   2710,
            33, 103133,   6693,  21526,    304,    110,    148,  37735,   2091,
         90110,      1,  19697,      5, 207722,   1042,    233,    887,    969,
             9,   1030,   3673,     42,     67,      5,    174,    169,     54,
        317296,      6,  16595,  19896,     96,     80,   1964,   9484,  12687,
           374,     10,     16,    186,    304, 283193,  38665,  10455,     80,
            54,     45,  31586,      3,  44386,   9913,      5,    895,  60320,
           453,    1

In [30]:
# The flat index tensor stored inside the PackedSequence.
inputs_packed.data

tensor([292605, 168077,   6225,     14,   2413,  17873,      4,   4318,   3143,
        168077, 100935,     14, 292605,  17873,   6225,   4318,   2413,  25320,
             4,  11908,   3143,    304, 100935,  36298,  64468,   2091,  52530,
        115031,      4,    364,   3879,   8524,    611,   1708,    346,      4,
        127045,   3879,   5995,      1,      8,   3417,    438,   1557,   9561,
          3443,  16947,  39801,    168,  21722, 275763,  19767,  31865,   2710,
            33, 103133,   6693,  21526,    304,    110,    148,  37735,   2091,
         90110,      1,  19697,      5, 207722,   1042,    233,    887,    969,
             9,   1030,   3673,     42,     67,      5,    174,    169,     54,
        317296,      6,  16595,  19896,     96,     80,   1964,   9484,  12687,
           374,     10,     16,    186,    304, 283193,  38665,  10455,     80,
            54,     45,  31586,      3,  44386,   9913,      5,    895,  60320,
           453,    118,      9,    169, 

In [35]:
# Frozen embedding layer initialised from the pretrained matrix (scratch copy for this experiment).
word_embeddings1 = nn.Embedding.from_pretrained(word_embeddings, freeze=True)

In [36]:
# Embed the flat packed indices and re-wrap them with the same batch sizes.
embeds = rnn.PackedSequence(word_embeddings1(inputs_packed.data), inputs_packed.batch_sizes)



In [37]:
# Inspect the packed embeddings and their per-step batch sizes.
embeds

PackedSequence(data=tensor([[ 0.4800,  0.0810,  0.2980,  ..., -0.2726, -0.0319,  0.1398],
        [ 0.3919, -0.3237, -0.4925,  ...,  0.4331, -0.0101, -0.3867],
        [-1.0846,  1.0709, -0.2157,  ...,  0.7148, -0.4288,  0.4748],
        ...,
        [-1.0628,  0.9156,  3.7243,  ..., -0.3702,  0.3234,  1.7743],
        [-0.2340, -2.3419,  1.9534,  ..., -0.5497,  1.0787, -1.0878],
        [ 0.4855, -0.9194,  0.5627,  ..., -0.7201,  0.9286, -0.1993]]), batch_sizes=tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), sorted_indices=None, unsorted_indices=None)

In [None]:
# Prototype of the batching logic on generic names. `x_batch` / `y_batch` were
# never defined in the notebook, so bind them to the same two-review toy batch
# used in the cells above; this keeps the cell runnable on a fresh kernel.
x_batch, y_batch = X[:2], y[:2]

inputs_batch = [seq_to_idxs(seq, word2idx) for seq in x_batch]

# Sort longest-first and permute the labels identically so they stay aligned.
order = sorted(enumerate(inputs_batch), key=lambda x: len(x[1]), reverse=True)
inputs_batch = [inputs_batch[order_[0]] for order_ in order]
targets_batch = torch.tensor([y_batch[order_[0]] for order_ in order])


In [None]:
# Number of usable (non-empty) reviews.
len(X)

In [None]:
# Hold out 20% for testing, then 20% of the remainder for validation
# (64% train / 16% validation / 20% test overall). Fixed seeds keep the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [None]:
class LSTMClassifier(nn.Module):
    """Bidirectional multi-layer LSTM classifier over packed token sequences.

    The final hidden states of every layer and both directions are
    concatenated, passed through dropout and a linear layer, and returned as
    log-probabilities (suitable for ``nn.NLLLoss``).

    Args:
        embedding_dim: size of one word vector.
        hidden_dim: combined (forward + backward) hidden size per layer; must
            be even because each direction gets ``hidden_dim // 2`` units.
        vocab_size: vocabulary size (only used when no pretrained matrix is given).
        num_classes: number of output classes.
        num_layers: number of stacked LSTM layers.
        pretrained_embeddings: ``None`` or a ``[vocab_size, embedding_dim]``
            tensor; when given, the embedding layer is frozen.
        device: device the module is moved to.

    Raises:
        ValueError: if ``hidden_dim`` is odd.
    """
    def __init__(self, embedding_dim, hidden_dim, vocab_size, num_classes,
                 num_layers=2,
                 pretrained_embeddings=None, device='cpu'):

        super(LSTMClassifier, self).__init__()
        if hidden_dim % 2 != 0:
            # With an odd value the concatenated hidden state would be smaller
            # than the classifier input and forward() would fail on a shape error.
            raise ValueError(f"hidden_dim must be even, got {hidden_dim}")

        self.hidden_dim = hidden_dim
        self.num_classes = num_classes
        self.num_layers = num_layers

        # Create exactly one embedding table; allocating a random
        # [vocab_size, embedding_dim] matrix just to replace it wastes memory.
        if pretrained_embeddings is not None:
            self.word_embeddings = nn.Embedding.from_pretrained(pretrained_embeddings,
                                                                freeze=True)
        else:
            self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # batch_first has no effect on PackedSequence inputs; kept as in the
        # original configuration.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim//2,
                            num_layers=self.num_layers,
                            bidirectional=True, batch_first=True)
        self.cls = nn.Linear(hidden_dim*self.num_layers, self.num_classes)
        self.dropout = nn.Dropout(p=0.4)
        self.device = device
        self.to(self.device)

    def forward(self, inputs):
        """Score a batch.

        Args:
            inputs: ``PackedSequence`` of ``torch.long`` vocabulary indices.

        Returns:
            torch.Tensor: ``[batch, num_classes]`` log-probabilities, one row
            per sequence in the order the sequences were packed.
        """
        # Carry over the (un)sorting indices so batches packed with
        # enforce_sorted=False come back in their original order.
        embeds = rnn.PackedSequence(self.word_embeddings(inputs.data),
                                    inputs.batch_sizes,
                                    inputs.sorted_indices,
                                    inputs.unsorted_indices)

        # No explicit (h0, c0): the LSTM then starts from zeros. Random initial
        # states made every forward pass -- including evaluation -- noisy.
        _, (hidden, _) = self.lstm(embeds)

        # hidden: [num_layers * 2, batch, hidden_dim // 2] -> [batch, num_layers * hidden_dim]
        batch_size = hidden.size(1)
        hidden = hidden.permute(1, 0, 2).contiguous().view(batch_size, -1)
        scores = F.log_softmax(self.cls(self.dropout(hidden)), dim=1)
        return scores

In [None]:
class Trainer:
    """Mini-batch training / evaluation loop for a packed-sequence classifier.

    Args:
        model: module mapping a ``PackedSequence`` of token indices to
            ``[batch, num_classes]`` scores.
        loss_fn: callable ``loss_fn(outputs, targets)``; assumed to return the
            mean loss over the batch (the default of ``nn.NLLLoss``).
        optimizer: optimizer over the model parameters.
        word_to_idx: dict from token to vocabulary index.
        device: device inputs and targets are moved to before the forward pass.
    """
    def __init__(self, model, loss_fn, optimizer, word_to_idx, device='cpu'):
        self.model = model
        self.loss_fn = loss_fn
        self.optimizer = optimizer
        self.word_to_idx = word_to_idx
        self.device = device

    def fit(self, num_epochs,
            X_train, y_train, train_batch_size,
            X_val, y_val, val_batch_size,
            log_interval=50):
        """Train for ``num_epochs`` epochs, validating after each one.

        ``log_interval`` is the number of training batches between progress lines.
        """
        for epoch in range(1, num_epochs+1):
            self.__train(epoch, X_train, y_train, train_batch_size, log_interval)
            self.__validate(epoch, X_val, y_val, val_batch_size)

    def __train(self, epoch, X, y, batch_size, log_interval=5):
        """Run one pass over ``X``/``y`` in order, updating the model per batch."""
        # Enable dropout; validation/test switch the model to eval mode.
        self.model.train()

        batch_idx = 0
        for batch_start in range(0, len(X), batch_size):
            start = time.time()
            batch_end = batch_start + batch_size
            batch_idx += 1

            inputs_packed, targets_batch = self.__create_batch(X[batch_start:batch_end],
                                                               y[batch_start:batch_end])
            if inputs_packed is None:
                # Every sequence of this batch was out of vocabulary.
                continue

            self.optimizer.zero_grad()
            inputs_packed = inputs_packed.to(self.device)
            targets_batch = targets_batch.to(self.device)
            outputs = self.model(inputs_packed)

            loss = self.loss_fn(outputs, targets_batch)
            loss.backward()
            self.optimizer.step()

            exec_time = time.time() - start

            if batch_idx % log_interval == 0:
                print(f"Train Epoch: {epoch} {min(batch_end, len(X))}/{len(X)}. Loss: {loss.item()}. Time: {exec_time} s")

    def __validate(self, epoch, X, y, batch_size):
        """Print the mean loss and weighted F1 over the whole validation set."""
        start = time.time()
        # Disable dropout so the reported metrics are deterministic.
        self.model.eval()

        loss_sum = 0.0
        sample_count = 0
        all_outputs = []
        all_targets = []
        with torch.no_grad():
            for batch_start in range(0, len(X), batch_size):
                batch_end = batch_start + batch_size
                inputs_packed, targets_batch = self.__create_batch(X[batch_start:batch_end],
                                                                   y[batch_start:batch_end])
                if inputs_packed is None:
                    continue

                inputs_packed = inputs_packed.to(self.device)
                targets_batch = targets_batch.to(self.device)
                outputs = self.model(inputs_packed)

                # Weight each batch mean by its size so that the final value is
                # the mean over samples, not only the last batch's loss.
                batch_count = targets_batch.size(0)
                loss_sum += self.loss_fn(outputs, targets_batch).item() * batch_count
                sample_count += batch_count
                all_outputs.append(outputs.cpu())
                all_targets.append(targets_batch.cpu())

        exec_time = time.time() - start
        if sample_count == 0:
            print(f"=====> Validation. Epoch: {epoch}. No usable validation samples. Time: {exec_time} s")
            return

        mean_loss = loss_sum / sample_count
        F1 = self.__calc_F1_score_avg(torch.cat(all_outputs), torch.cat(all_targets))
        print(f"=====> Validation. Epoch: {epoch}. Loss: {mean_loss}, F-1 score: {F1}. Time: {exec_time} s")

    def test(self, X_test, y_test, target_names, batch_size=64):
        """Evaluate on a held-out set and return sklearn's classification report.

        Args:
            X_test: list of token sequences.
            y_test: class ids aligned with ``X_test``; assumed to be
                ``0 .. len(target_names) - 1``.
            target_names: display name per class id.
            batch_size: number of sequences scored per forward pass, so the
                whole test set does not have to fit in memory at once.

        Returns:
            str: the text report produced by ``classification_report``.

        Raises:
            ValueError: if no test sequence contains an in-vocabulary token.
        """
        self.model.eval()

        y_true = []
        y_pred = []
        with torch.no_grad():
            for batch_start in range(0, len(X_test), batch_size):
                batch_end = batch_start + batch_size
                inputs_packed, targets_batch = self.__create_batch(X_test[batch_start:batch_end],
                                                                   y_test[batch_start:batch_end])
                if inputs_packed is None:
                    continue

                outputs = self.model(inputs_packed.to(self.device))
                # Move to CPU / plain lists: sklearn cannot consume CUDA tensors.
                y_pred.extend(outputs.max(dim=1)[1].cpu().tolist())
                y_true.extend(targets_batch.tolist())

        if not y_true:
            raise ValueError("no test sequence contains an in-vocabulary token")

        # Passing `labels` keeps the report aligned with `target_names` even
        # when some class does not occur in this particular test set.
        return classification_report(y_true, y_pred,
                                     labels=list(range(len(target_names))),
                                     target_names=target_names)

    def __seq_to_idxs(self, seq, mapping):
        """Return a 1-D ``torch.long`` tensor of ``mapping[token]`` for known tokens.

        Unknown tokens are dropped; the tensor is empty if none is known.
        """
        idxs = [mapping[el] for el in seq if el in mapping]
        return torch.tensor(idxs, dtype=torch.long)

    def __create_batch(self, x_batch, y_batch):
        """Convert raw sequences and labels into a packed batch.

        Sequences without any in-vocabulary token are dropped together with
        their label, because a zero-length sequence cannot be packed. The rest
        is sorted longest-first (stable), as ``rnn.pack_sequence`` requires.

        Returns:
            tuple: ``(inputs_packed, targets_batch)``, or ``(None, None)`` when
            nothing in the batch is usable.
        """
        pairs = []
        for seq, target in zip(x_batch, y_batch):
            # Use the trainer's own vocabulary rather than notebook globals.
            idxs = self.__seq_to_idxs(seq, self.word_to_idx)
            if len(idxs) > 0:
                pairs.append((idxs, target))

        if not pairs:
            return None, None

        pairs.sort(key=lambda pair: len(pair[0]), reverse=True)
        inputs_packed = rnn.pack_sequence([idxs for idxs, _ in pairs])
        targets_batch = torch.tensor([target for _, target in pairs])

        return inputs_packed, targets_batch

    def __calc_F1_score_avg(self, outputs, targets):
        """Weighted F1 of the arg-max predictions in ``outputs`` against ``targets``."""
        pred = outputs.max(dim=1)[1]
        # sklearn's signature is (y_true, y_pred); the weighting uses the
        # support of the true labels, so the order matters.
        F1 = f1_score(targets.cpu().numpy(), pred.cpu().numpy(), average='weighted')
        return F1


In [None]:
# Seed PyTorch so weight initialisation and dropout start from a fixed RNG
# state; the data split above already uses the same seed value.
SEED = 42
torch.manual_seed(SEED)

hidden_dim = 128
learning_rate = 0.01
num_epochs = 20
log_interval = 1  # print the training loss after every batch

vocab_size = len(word2idx)
num_classes = 3
device = 'cpu' if not torch.cuda.is_available() else 'cuda'

model = LSTMClassifier(embedding_dim=word_embeddings.shape[1],
                       hidden_dim=hidden_dim,
                       vocab_size=vocab_size,
                       num_classes=num_classes,
                       pretrained_embeddings=word_embeddings,
                       device=device)
# The model returns log-probabilities, which is the input NLLLoss expects.
loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

train_batch_size = 16
val_batch_size = 16

trainer = Trainer(model, loss_function, optimizer, word2idx, device=device)
trainer.fit(num_epochs, X_train, y_train, train_batch_size, X_val, y_val, val_batch_size, log_interval)

In [None]:
# Per-class precision / recall / F1 on the held-out test split.
# Class ids 0, 1, 2 are displayed as 'neg', 'neutral', 'pos'.
report = trainer.test(X_test, y_test, ['neg', 'neutral', 'pos'])
print(report)