# Imports

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext.data import Field, LabelField
from torchtext.data import BucketIterator

import utils

In [2]:
curr_dir = Path('.').resolve()
twitter_dir = curr_dir / "SentiRuEval_twitter"
reviews_dir = curr_dir / "SentiRuEval-2015"

# Experiments

## reading data 

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [5]:
# Takes up to 10 minutes  
gen_ft = utils.get_pretrained_embeddings()

In [6]:
ds_train = utils.get_dataset(reviews_dir/"SentiRuEval_rest_markup_train.xml")
ds_test = utils.get_dataset(reviews_dir / "SentiRuEval_rest_markup_test.xml")

  aspects_df = aspects_df[['review_id', 'term', 'sentiment', 'category', 'from', 'to']][aspects_df['type']=='explicit'][aspects_df['mark']=='Rel']


Text here is normalized: punctuation is thrown away, text is lowered and reduced to "normal form" using pymorphy2 package

In [7]:
ds_train.sample(3)

Unnamed: 0,text,aspects,categories,sentiments,mask_asp
99,"[я, быть, здесь, с, подруга, пока, один, раз, ...","[живую музыку, Заказали столик, первом этаже, ...","[Interior, Service, Interior, Whole, Interior,...","[positive, neutral, neutral, neutral, neutral,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
184,"[хотеться, поделиться, с, весь, пользователь, ...","[ресторане Duplex, ресторан, ресторан, качеств...","[Whole, Whole, Whole, Food, Interior, Interior...","[neutral, neutral, neutral, neutral, neutral, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
20,"[отличный, кафе, отмечать, в, немой, свой, ден...","[кафе, еда, блюда, обстановка, администратор, ...","[Whole, Food, Food, Interior, Service, Whole, ...","[positive, positive, positive, positive, posit...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [9]:
counts = np.unique(np.hstack(ds_train.mask_asp.values), return_counts = True)

We have three possible results for each word: "aspect (first word)", "non-aspect", "aspect (not first word)". Let's see if these classes are balanced, which they're probably not. And calculate weights for them, if they're unbalanced.

In [10]:
print(f'0: {counts[1][0]/np.sum(counts[1]):.2f}%,\n1: {counts[1][1]/np.sum(counts[1]):.2f}%,\n2: {counts[1][2]/np.sum(counts[1]):.2f}%')

0: 0.88%,
1: 0.10%,
2: 0.01%


In [11]:
#Weights:
1/0.89, 1/0.1, 1/0.01    

(1.1235955056179776, 10.0, 100.0)

Number of words of these classes differs significantly, so we will calculate our loss function weighed accordingly:

 - "0" with 1.2
 - "1" with 10
 - "2" with 100

Let's see if our vocabulary is ok for our task

In [12]:
unks = [0]
for word in utils.get_ds_vocab(ds_train):
    if word not in gen_ft.index2entity:
        unks.append(word)
print(f'{len(unks)/len(utils.get_ds_vocab(ds_train))*100:.2f}% unknown words. examples:\n', unks[::10])

4.67% unknown words. examples:
 [0, '250р', 'albertone', 'банкеть', 'вразный', 'девчёнка', 'закусочка', 'кафитерия', 'марчеллис', 'необычненький', 'оринтироваться', 'панакот', 'приветсвый', 'сумотоха', 'халапена']


for fasttext from wiki & lenta < 5% unknown, that's ok, I guess

In [13]:
voc_ft = dict(zip(gen_ft.index2entity, range(len(gen_ft.index2entity))))

In [15]:
TEXT = Field(sequential = True, use_vocab = False, batch_first=True, unk_token = 977837, pad_token = 977838) 
TARGET = Field(sequential = True, use_vocab = False, batch_first=True, is_target = True, unk_token = 3, pad_token = 4)
train_ds = utils.DataFrameDataset(utils.text2ind(ds_train, voc_ft), {'text': TEXT, 'mask_asp': TARGET})
valid_ds = utils.DataFrameDataset(utils.text2ind(ds_test, voc_ft), {'text': TEXT, 'mask_asp': TARGET})

In [16]:
train_iter, val_iter = BucketIterator.splits(
        (train_ds, valid_ds),
        batch_sizes=(64, 64),
        sort=True,
        sort_key= lambda x: len(x.text),
        sort_within_batch=False,
        device=device,
        repeat=False, 
)

## training model

In [19]:
weights_ft = torch.FloatTensor(gen_ft.vectors)

In [None]:
dom_emb_dim = 100
voc_size = weights_ft.shape[0]
hid_dim = 200
encoder = utils.EncoderRNN(hid_dim, dom_emb_dim, voc_size,weights_ft )
encoder.to(device)
decoder = utils.DecoderRNN(hid_dim, dom_emb_dim, voc_size, weights_ft, output_dim = 5)
decoder.to(device)

In [None]:
model = utils.Seq2Seq(encoder, decoder, device)

In [None]:
model(batch.text, batch.mask_asp).shape

### The training loop

In [None]:
opt = optim.Adam(model.parameters(), lr=1e-4)
loss_func = nn.CrossEntropyLoss(weight = torch.tensor([1., 10, 10, 10, 10]), ignore_index = 4)
epochs = 30

In [144]:
%%time
for epoch in range(1, epochs + 1):
    running_loss = 0.0
    running_corrects = 0
    model.train() 
    for batch in train_iter: 
        
        src = batch.text
        trg = batch.mask_asp

        opt.zero_grad()
        
        output = model(src, trg, 0.7)
        
        output_dim = output.shape[-1]
        
        output = output.view(-1, output_dim)
        trg = trg.view(-1)
        
        loss = loss_func(output, trg)
        loss.backward()
        

        
        opt.step()
        
        running_loss += loss.item()

    epoch_loss = running_loss / len(train_iter)
    
    val_loss = 0.0
    model.eval()
    for batch in val_iter:
        
        src = batch.text
        trg = batch.mask_asp
        
        output = model(src, trg)
        
        output_dim = output.shape[-1]
        
        output = output.view(-1, output_dim)
        trg = trg.view(-1)
        
        loss = loss_func(output, trg)
        
        val_loss += loss.item()  
        
    val_loss /= len(val_iter)
    
    print('Epoch: {}, Training Loss: {}, Validation Loss: {}'.format(epoch, epoch_loss, val_loss))

Epoch: 1, Training Loss: 1.4911468625068665, Validation Loss: 1.3939503133296967
Epoch: 2, Training Loss: 1.320342093706131, Validation Loss: 1.291944146156311
Epoch: 3, Training Loss: 1.2084271609783173, Validation Loss: 1.2319333255290985
Epoch: 4, Training Loss: 1.1336623132228851, Validation Loss: 1.1764418184757233
Epoch: 5, Training Loss: 1.0854232609272003, Validation Loss: 1.1433232128620148
Epoch: 6, Training Loss: 1.029188871383667, Validation Loss: 1.1067364513874054
Epoch: 7, Training Loss: 1.0009105801582336, Validation Loss: 1.080284595489502
Epoch: 8, Training Loss: 0.9670122116804123, Validation Loss: 1.036013275384903
Epoch: 9, Training Loss: 0.9518527686595917, Validation Loss: 1.0009718835353851
Epoch: 10, Training Loss: 0.9164797365665436, Validation Loss: 1.0004175454378128
Epoch: 11, Training Loss: 0.895934596657753, Validation Loss: 0.9864820688962936
Epoch: 12, Training Loss: 0.8673616349697113, Validation Loss: 0.9540105909109116
Epoch: 13, Training Loss: 0.876

In [None]:
torch.save(model.state_dict(), 'model.pt')

## results

In [0]:
batch = next(val_iter.__iter__())

In [146]:
batch.mask_asp

tensor([[0, 1, 0,  ..., 4, 4, 4],
        [0, 0, 0,  ..., 4, 4, 4],
        [0, 0, 1,  ..., 4, 4, 4],
        ...,
        [0, 0, 0,  ..., 4, 4, 4],
        [0, 0, 0,  ..., 4, 4, 4],
        [0, 0, 0,  ..., 0, 0, 0]])

In [0]:
results = torch.argmax(F.softmax(model(batch.text, torch.LongTensor(64, batch.mask_asp.shape[1]).fill_(4) , 0.), dim = 2), axis = 2)

In [157]:
results[0]

tensor([0, 1, 2, 1, 2, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 2, 1, 1,
        2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], grad_fn=<SelectBackward>)

# What has been done so far

My team (and also I as part of that team) wasn't very organized. So, what I thought was to be done by 4 people, I ended up doing alone. I don't blame anyone, but that's what I've got so far on my own.

1. *understanding the task*.

   I have no prior experience other than our homeworks in this kind of tasks, so it took a lot of time just to understand the purpose and the structure of the problem. I googled some methods and decided to start with some simple ones so that process of debugging wouldn't be too exhausting. And because big model require much more data to be trained.
   
   
2. *data preparation* 

    To understand in what format the data should be stored, how to parse it from xml and so on also took more than a few days. Found pretrained embeddings (our corpus is not big enough to obtain good ones with an at leas ok number of words)
    
    
3. *coding*

    That part was the most fun: trying to remember how seq2seq worked and how to use attention there. Some ideas to try out little bit later:
    - use not general word collection (pre-trained on wiki), but pretrained on twitter. 
    - use BERT for embeddings
    - use more sophisticated approaches than vanilla seq2seq
    - maybe add some attention here (?)
    
   
    
4. *results*

    My model sucks. It is learning. But either it has too few data to train on, or it is just not good enough for this task. Also different weights in loss gives different results, but I couldn't find the optimal ones.

# further work

This part was supposed to be the first step to solve the task of aspect based sentiment analysis.

1. Aspect extraction [Done]
2. Aspect categorization [To-do]
3. Understanding sentiment toward each aspect [To-do]
4. Summarize sentiments to aspects inside each category, report on sentiment of each category. [To-do]


Second task is simple classification. For each aspect, using embedding, bigrams and rhreegrams we'll do CNN classification

On the 3rd task I hope to hear from my team. Or do some more research and implement some clean and simple algorithm.

4th is just summarization.