In [67]:
#!g1.1
from string import punctuation
import pandas as pd
import numpy as np
from tqdm import tqdm
import json

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoModel
from transformers import AutoTokenizer

In [68]:
#!g1.1
# Labelled training sentences; the location is kept in a variable so it is
# easy to spot and change.
train_csv_path = "datasets/hse_data_science_hack/train.csv"

# index_col=False stops pandas from promoting the first CSV column to the
# DataFrame index.
data = pd.read_csv(train_csv_path, index_col=False)

# Preview the first ten rows as the cell output.
data.head(10)

Unnamed: 0.1,Unnamed: 0,sentence,1category,2category,sentiment
0,4754,При этом всегда получал качественные услуги.,Communication,,+
1,4417,"Не вижу, за что хотя бы 2 поставить, сервис на 1!",?,,−
2,3629,"Вот так ""Мой любимый"" банк МКБ меня обманул.",?,,−
3,11640,Отвратительное отношение к клиентам.,Communication,,−
4,5571,"Всегда в любое время дня и ночи помогут, ответ...",Communication,,+
5,5254,"Все время согласовывалось, всё делалось быстро.",Communication,,+
6,16243,Абсолютное бездействие и нежелание банка работ...,Quality,,−
7,20223,Первая операция на внесение 122 000 руб. была ...,?,,?
8,9383,Ну почему я опять должен звонить и платить ден...,Communication,,−
9,5185,"Получив карту ""Кредит в кармане"" и две бесплат...",Communication,,+


In [69]:
#!g1.1
SEED = 42

# Seed every RNG this notebook draws from. NumPy's global generator is
# included because sklearn utilities called without an explicit `random_state`
# fall back to it, so leaving it unseeded makes the data split differ per run.
np.random.seed(SEED)
torch.manual_seed(SEED)
# `manual_seed_all` covers every visible GPU and is a silent no-op without CUDA.
torch.cuda.manual_seed_all(SEED)

# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [70]:
#!g1.1
# Tokenizer and encoder are loaded from the same checkpoint so their
# vocabularies match.
pretrained_name = "DeepPavlov/rubert-base-cased"

tokenizer = AutoTokenizer.from_pretrained(pretrained_name)

# Freeze all encoder weights (requires_grad=False on every parameter) and move
# the encoder to the selected device. Both calls return the module itself, so
# they chain; the assignment keeps the cell from printing the module repr.
model_bert = AutoModel.from_pretrained(pretrained_name).requires_grad_(False).to(device)

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=24.0), HTML(value='')))

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=642.0), HTML(value='')))

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1649718.0), HTML(value='')))

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=112.0), HTML(value='')))

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=714355318.0), HTML(value='')))








Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [71]:
#!g1.1
class Dataset(torch.utils.data.Dataset):
    """Pre-tokenised text classification dataset.

    Every text is encoded once at construction time and all sequences are
    right-padded with the tokenizer's pad id to the length of the longest
    sequence in this collection, so `__getitem__` is a plain tensor lookup.

    Args:
        tokenizer: object exposing `encode(text, truncation=..., max_length=...)`
            returning a list of token ids, and a `pad_token_id` attribute.
        texts: sequence of strings to encode.
        targets: sequence of integer class ids, one per text.
        max_length: upper bound on the number of token ids kept per text.
            Defaults to 512; longer inputs are truncated so they cannot exceed
            the encoder's positional limit.
            NOTE(review): 512 is assumed to match the encoder used in this
            notebook; confirm against the model config.

    Raises:
        ValueError: if `texts` and `targets` have different lengths.
    """

    def __init__(self, tokenizer, texts, targets, max_length=512):
        if len(texts) != len(targets):
            raise ValueError(
                f"texts and targets must have the same length, "
                f"got {len(texts)} and {len(targets)}"
            )

        self.tokenizer = tokenizer

        # Truncate while encoding: without it a single long text yields more
        # ids than the encoder can embed and the forward pass fails.
        encoded = [
            torch.LongTensor(
                tokenizer.encode(t, truncation=True, max_length=max_length)
            )
            for t in texts
        ]
        self.texts = torch.nn.utils.rnn.pad_sequence(
            encoded,
            batch_first=True,
            padding_value=self.tokenizer.pad_token_id,
        )
        self.length = len(texts)
        self.target = torch.LongTensor(targets)

    def __len__(self):
        """Return the number of examples."""
        return self.length

    def __getitem__(self, index):
        """Return `(padded_token_ids, target)` for the example at `index`."""
        ids = self.texts[index]
        y = self.target[index]
        return ids, y

    def preprocess(self, text):
        """Lower-case `text`, split on whitespace and strip edge punctuation.

        Tokens that become empty after stripping are dropped. This helper is
        not used by `__init__`; encoding is done by the tokenizer.
        """
        tokens = text.lower().split()
        tokens = [token.strip(punctuation) for token in tokens]
        tokens = [token for token in tokens if token]
        return tokens

In [72]:
#!g1.1
texts = data.sentence.values

# Build the label <-> id maps from a SORTED list of the distinct sentiment
# values. Enumerating a raw `set` of strings gives an order that can change
# between Python processes (string hashing is randomised), which would silently
# remap class ids between kernel sessions and make saved checkpoints
# undecodable. `key=str` keeps the sort well-defined even if a non-string
# value (e.g. a missing label) is present.
unique_labels = sorted(set(data.sentiment.values), key=str)
id2label = {i: l for i, l in enumerate(unique_labels)}
label2id = {l: i for i, l in id2label.items()}

# Integer class id for every row, aligned with `texts`.
targets = [label2id[l] for l in data.sentiment.values]

In [73]:
#!g1.1
# Hold out 5% of the data for validation, stratified by class id.
# `random_state=SEED` pins the split: without it every run (and every re-run
# of this cell) validates on a different subset.
train_texts, valid_texts, train_targets, valid_targets = train_test_split(
    texts,
    targets,
    test_size=0.05,
    stratify=targets,
    random_state=SEED,
)

In [74]:
#!g1.1
# Training data: tokenised once, reshuffled every epoch.
training_set = Dataset(tokenizer, train_texts, train_targets)
training_generator = torch.utils.data.DataLoader(training_set, batch_size=16, shuffle=True)

# Validation data: iterated in a fixed order. Shuffling adds nothing at
# evaluation time and makes any batch-dependent metric vary from epoch to
# epoch even when the weights are identical.
valid_set = Dataset(tokenizer, valid_texts, valid_targets)
valid_generator = torch.utils.data.DataLoader(valid_set, batch_size=16, shuffle=False)

In [76]:
#!g1.1
class CLF(torch.nn.Module):
    """Classification head on top of a pretrained encoder.

    The encoder output at sequence position 0 is passed through two linear
    layers (768 -> 256 -> num_classes) followed by log-softmax, so the model
    returns log-probabilities suitable for `nn.NLLLoss`.

    Args:
        pretrained_model: module called as `model(ids, attention_mask=mask)`
            whose first output is indexed as `[batch, position, feature]`;
            the feature size must be 768 to match `linear_1`.
        num_classes: number of output classes.
        pad_token_id: id used for padding in the input batches. When omitted,
            the notebook-level `tokenizer` is used (previous behaviour).
    """

    def __init__(self, pretrained_model, num_classes, pad_token_id=None):
        super().__init__()
        if pad_token_id is None:
            # Backward-compatible default: take the pad id from the
            # module-level tokenizer, once, at construction time.
            self.tokenizer = tokenizer
            pad_token_id = tokenizer.pad_token_id
        else:
            self.tokenizer = None
        # Stored on the instance so `forward` no longer depends on whatever
        # the global `tokenizer` happens to be when the model is called.
        self.pad_token_id = pad_token_id
        self.pretrained_model = pretrained_model
        self.linear_1 = nn.Linear(768, 256)
        self.linear_2 = nn.Linear(256, num_classes)
        self.activation = nn.LogSoftmax(1)

    def forward(self, texts):
        """Return log-probabilities of shape `(batch, num_classes)`.

        Args:
            texts: integer tensor of padded token ids, one row per example.
        """
        # Attention mask: 1 for real tokens, 0 for padding.
        mask = (texts != self.pad_token_id).long()
        hidden = self.pretrained_model(texts, attention_mask=mask)[0]
        # Use the representation at position 0 as the sequence summary.
        dense_outputs_1 = self.linear_1(hidden[:, 0])
        # NOTE(review): there is no nonlinearity between linear_1 and
        # linear_2, so the head is equivalent to a single linear map. The
        # original computed a log-softmax of the hidden layer here but never
        # used it; that dead computation is removed. Adding an activation
        # would change what existing checkpoints predict, so it is left as is.
        dense_outputs = self.linear_2(dense_outputs_1)
        return self.activation(dense_outputs)

In [77]:
#!g1.1
def train(model, iterator, optimizer, criterion):
    """Run one training epoch.

    Args:
        model: module mapping a batch of token ids to per-class scores of
            shape `(batch, num_classes)`.
        iterator: iterable yielding `(texts, ys)` tensor pairs.
        optimizer: optimizer stepping the model's trainable parameters.
        criterion: loss taking `(predictions, targets)`.

    Returns:
        `(mean_batch_loss, macro_f1)`. The F1 score is computed once over all
        predictions of the epoch.
    """
    batch_losses = []
    # Predictions and labels are accumulated so that macro-F1 is computed over
    # the whole epoch. Averaging per-batch macro-F1 is biased: with small
    # batches a class is often absent from a batch, which distorts its score.
    all_preds = []
    all_true = []

    model.train()

    for texts, ys in tqdm(iterator):

        optimizer.zero_grad()
        # No `.squeeze()` here: for a final batch of size 1 it would collapse
        # (1, num_classes) to (num_classes,), breaking both the loss call and
        # `argmax(1)` below.
        predictions = model(texts.to(device))
        loss = criterion(predictions, ys.to(device))
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())
        all_preds.extend(predictions.detach().to("cpu").numpy().argmax(1).tolist())
        all_true.extend(ys.tolist())

    # An empty iterator yields NaN for both values instead of raising.
    score = f1_score(all_true, all_preds, average="macro") if all_true else float("nan")
    return np.mean(batch_losses), score


def evaluate(model, iterator, criterion):
    """Evaluate the model on a dataset without updating weights.

    Args:
        model: module mapping a batch of token ids to per-class scores of
            shape `(batch, num_classes)`.
        iterator: iterable yielding `(texts, ys)` tensor pairs.
        criterion: loss taking `(predictions, targets)`.

    Returns:
        `(mean_batch_loss, macro_f1)`. The F1 score is computed once over all
        predictions, so it does not depend on how examples are batched.
    """
    batch_losses = []
    all_preds = []
    all_true = []

    model.eval()
    with torch.no_grad():
        for texts, ys in tqdm(iterator):
            # No `.squeeze()`: it would turn a single-example batch into a
            # 1-D tensor and break the loss call and `argmax(1)`.
            predictions = model(texts.to(device))
            loss = criterion(predictions, ys.to(device))

            batch_losses.append(loss.item())
            all_preds.extend(predictions.to("cpu").numpy().argmax(1).tolist())
            all_true.extend(ys.tolist())

    # An empty iterator yields NaN for both values instead of raising.
    score = f1_score(all_true, all_preds, average="macro") if all_true else float("nan")
    return np.mean(batch_losses), score

In [78]:
#!g1.4
# Classifier: the pretrained encoder plus a head with one output per label.
# `.to(device)` moves the module in place and returns it.
model = CLF(model_bert, len(label2id)).to(device)

# The model emits log-probabilities, hence negative log-likelihood loss.
criterion = nn.NLLLoss().to(device)

# Adam over the model's parameters; the learning rate is multiplied by 0.05
# after every 10 scheduler steps.
optimizer = optim.Adam(model.parameters(), lr=2e-6)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.05)

In [None]:
#!g1.4
EPOCHS = 200

# All artefacts of this run (best checkpoint, optimizer state, metric
# history) are written under one directory.
OUTPUT_DIR = "/home/jupyter/mnt/s3/cheliki/rubert_finetuned"

# Per-epoch metric history.
train_losses = []
train_evals = []
valid_losses = []
valid_evals = []

# Start from +inf so the first epoch always produces a checkpoint, whatever
# the scale of the loss.
best_valid_loss = float("inf")

for i in range(EPOCHS):

    print(f"Epoch: {i+1}")

    train_loss, train_score = train(model, training_generator, optimizer, criterion)
    train_losses.append(train_loss)
    train_evals.append(train_score)
    print(f"Train loss: {train_loss}, Train score: {train_score}")

    val_loss, val_score = evaluate(model, valid_generator, criterion)
    valid_losses.append(val_loss)
    valid_evals.append(val_score)
    print(f"Valid loss: {val_loss}, Valid score: {val_score}")

    scheduler.step()

    # Keep only the weights (and optimizer state) of the best epoch so far,
    # judged by validation loss.
    if val_loss < best_valid_loss:
        best_valid_loss = val_loss
        torch.save(model.state_dict(), f"{OUTPUT_DIR}/state_dict_model.pth")
        torch.save(optimizer.state_dict(), f"{OUTPUT_DIR}/state_dict_optimizer.pth")

    # Rewrite the full history after every epoch so progress survives an
    # interrupted run. The validation curves are stored next to the training
    # ones; they were collected before but never written out.
    info = {
        'train_losses': train_losses,
        'train_evals': train_evals,
        'valid_losses': valid_losses,
        'valid_evals': valid_evals,
    }
    with open(f"{OUTPUT_DIR}/info.json", 'w') as file_object:
        json.dump(info, file_object, indent=2)