# Train new model

### Installing the required libraries

In [2]:
!pip install transformers sentencepiece -q


[notice] A new release of pip available: 22.2.2 -> 23.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


### Load datasets for model training

In [5]:
# Set the flag to True if notebook running in colab
is_google_colab = False

# Main directory path
path = ''
if is_google_colab:
    path = '/content/drive/MyDrive/'

In [6]:
if is_google_colab:
    from google.colab import drive
    drive.mount('/content/drive')

In [8]:
import pandas as pd

train_dataset = pd.read_csv(path + 'Dataset/train.csv')
test_dataset = pd.read_csv(path + 'Dataset/test.csv')
val_dataset = pd.read_csv(path + 'Dataset/validation.csv')

train_dataset.shape[0], test_dataset.shape[0], val_dataset.shape[0]

(26945, 3367, 3294)

In [9]:
# Convert string to lists in `labels` column
train_dataset.labels = train_dataset.labels.apply(lambda x: eval(x))
test_dataset.labels = test_dataset.labels.apply(lambda x: eval(x))
val_dataset.labels = val_dataset.labels.apply(lambda x: eval(x))

In [10]:
LABELS = ['радость', 'интерес', 'удивление', 'печаль', 'гнев', 'отвращение', 'страх', 'вина', 'нейтрально']

# Let's derive 3 examples for each emotion
for i in range(len(LABELS)):
    print(f"\nEMOTION: {LABELS[i]}")
    for t in train_dataset[train_dataset.labels.apply(lambda x: x==[i])].text.sample(3):
        print(t)


EMOTION: радость
Глюкоза хранитель!!!! Салмонблю, ты творишь чудеса, дорогая.
[ИМЯ] это правда? Я так рада, что отписалась от фото
Я рад, что не слишком много времени было потрачено на это

EMOTION: интерес
Зачем кому-то нужен год, чтобы удовлетворить ваши потребности? Не теряйте ни минуты.
Что он сказал?? Я хочу быть причастным к его позору!
>но частью общей картины неравенства является то, что женщины получают более низкую заработную плату за ту же работу. Честно? Не бывает.

EMOTION: удивление
*Какашка* Оооооооо!
если бы у него были хоть какие-то товарные навыки, увы я думаю ему предстоит нелегкий срок
хо мальчик тебя ждет сюрприз

EMOTION: печаль
Ветка комментариев — сплошной беспорядок, и в 2016 году она была бы уместна. Как же все не меняется….
Я сейчас буквально в слезах. Все, что вы сказали, правда, боже мой.
Я беременна, и моя собака находится с моими родителями по всей стране, пока мы ремонтируем. Ощущений было слишком много 😭

EMOTION: гнев
>поэтому это не похоже на то, что

In [12]:
# Binarize indices using OneHotEncoding
def binarize_labels(labels):
    return [float(i in labels) for i in range(9)]

print(binarize_labels([0]))
print(binarize_labels([1]))
print(binarize_labels([1, 4]))
print(binarize_labels([8]))

[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]


### Load the model

In [13]:
from transformers import BertForSequenceClassification, AutoTokenizer

base_model = 'cointegrated/rubert-tiny2'

tokenizer = AutoTokenizer.from_pretrained(base_model)
model = BertForSequenceClassification.from_pretrained(base_model, num_labels=9, problem_type='multi_label_classification')

Downloading (…)okenizer_config.json:   0%|          | 0.00/401 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.74M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/715 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/118M [00:00<?, ?B/s]

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not 

In [14]:
model.cuda() # Give the model access to the GPU

model.config.label2id = {label: i for i, label in enumerate(LABELS)}
model.config.id2label = {i: label for i, label in enumerate(LABELS)}

In [17]:
%%time

# Preparing datasets for training
TrainDict = [{**tokenizer(train_dataset.text[i], truncation=True), 'label': binarize_labels(train_dataset.labels[i])} for i in range(train_dataset.shape[0])]
TestDict = [{**tokenizer(test_dataset.text[i], truncation=True), 'label': binarize_labels(test_dataset.labels[i])} for i in range(test_dataset.shape[0])]
ValDict = [{**tokenizer(val_dataset.text[i], truncation=True), 'label': binarize_labels(val_dataset.labels[i])} for i in range(val_dataset.shape[0])]

CPU times: total: 9.44 s
Wall time: 9.77 s


In [18]:
import gc
import torch

# Cleaning unnecessary data during training
def cleanup():
    gc.collect()
    torch.cuda.empty_cache()

cleanup()

In [19]:
from tqdm.auto import tqdm, trange
import numpy as np
from sklearn.metrics import roc_auc_score, precision_recall_fscore_support
from IPython.display import display

# Check the accuracy of the model on validate/test data
def evaluate_model(model, dev_dataloader, verbose=False, labels=None):
    facts, preds = predict_with_model(model, dev_dataloader)
    aucs = get_classification_report(facts, preds, labels)
    if verbose:
        print('aucs:', aucs, np.mean(aucs))
    return np.mean(aucs)

# Get model prediction
def predict_with_model(model, dataloader):
    preds = []
    facts = []

    for batch in tqdm(dataloader):
        facts.append(batch.labels.cpu().numpy())
        batch = batch.to(model.device)
        with torch.no_grad():
            pr = model(input_ids=batch.input_ids, attention_mask=batch.attention_mask, token_type_ids=batch.token_type_ids)
        preds.append(torch.softmax(pr.logits, -1).cpu().numpy())
    facts = np.concatenate(facts)
    preds = np.concatenate(preds)
    return facts, preds

# Get results of the model
def get_classification_report(facts, preds, labels=None):
    print(facts.shape, preds.shape)
    aucs = [roc_auc_score(facts[:, i], preds[:, i]) for i in range(9)]
    return aucs

In [20]:
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader

data_collator = DataCollatorWithPadding(tokenizer)
batch_size = 64

# Create training and test (validation) data for model training
train_dataloader = DataLoader(
    TrainDict,
    batch_size=batch_size, drop_last=False, shuffle=True, num_workers=0, collate_fn=data_collator
)
dev_dataloader = DataLoader(
    ValDict,
    batch_size=batch_size, drop_last=False, shuffle=True, num_workers=0, collate_fn=data_collator
)

In [21]:
# Testing model accuracy
evaluate_model(model, dev_dataloader, verbose=True)

  0%|          | 0/52 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


(3294, 9) (3294, 9)
aucs: [0.6579395957132441, 0.2854773647842456, 0.5087267472476334, 0.4601284503265094, 0.46081782989733094, 0.5449795166252611, 0.5910863154312936, 0.2626116396634738, 0.46739448865119915] 0.4710179942600213


0.4710179942600213

### Train the model

In [22]:
# Initializing training parameters

optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-5)

ewm_loss = 0

window = 500
cleanup_step = 100

In [23]:
%%time

# Training model
for _ in range(8):
    model.train()
    cleanup()

    # Делаем 5 итераций и тестируем модель
    for epoch in trange(5):
        tq = tqdm(train_dataloader)

        for i, batch in enumerate(tq):
            # Calculate the loss function
            try:
                batch = batch.to(model.device)
                output = model(**batch)
                loss = output.loss
                loss.backward()
            except RuntimeError as e:
                print('error on step', i, e)
                loss = None
                cleanup()
                continue

            # Update weights
            optimizer.step()
            optimizer.zero_grad()

            if i % cleanup_step == 0:
                cleanup()

            w = 1 / min(i+1, window)
            ewm_loss = ewm_loss * (1-w) + loss.item() * w
            tq.set_description(f'loss: {ewm_loss:4.4f}')

    # Testing model performance
    model.eval()
    eval_loss = evaluate_model(model, dev_dataloader, verbose=True)
    print(f'epoch {epoch + 1}, step {i}: train loss: {ewm_loss:4.4f}  val auc: {eval_loss}\n\n')

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/422 [00:00<?, ?it/s]

  0%|          | 0/422 [00:00<?, ?it/s]

  0%|          | 0/422 [00:00<?, ?it/s]

  0%|          | 0/422 [00:00<?, ?it/s]

  0%|          | 0/422 [00:00<?, ?it/s]

  0%|          | 0/52 [00:00<?, ?it/s]

(3294, 9) (3294, 9)
aucs: [0.8739239558936441, 0.8800166313580297, 0.7312637006012956, 0.7423811706828669, 0.7370489390600475, 0.8087626067356575, 0.8437163273407816, 0.9287957485581082, 0.8332669769061115] 0.8199084507929492
epoch 5, step 421: train loss: 0.2312  val auc: 0.8199084507929492




  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/422 [00:00<?, ?it/s]

  0%|          | 0/422 [00:00<?, ?it/s]

  0%|          | 0/422 [00:00<?, ?it/s]

  0%|          | 0/422 [00:00<?, ?it/s]

  0%|          | 0/422 [00:00<?, ?it/s]

  0%|          | 0/52 [00:00<?, ?it/s]

(3294, 9) (3294, 9)
aucs: [0.904615166237604, 0.8965260677779944, 0.8352400896432638, 0.8019725982996871, 0.8076130906016494, 0.8490482223063128, 0.9055570521181876, 0.9629848896206912, 0.8400860720019703] 0.8670714720674845
epoch 5, step 421: train loss: 0.1766  val auc: 0.8670714720674845




  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/422 [00:00<?, ?it/s]

  0%|          | 0/422 [00:00<?, ?it/s]

  0%|          | 0/422 [00:00<?, ?it/s]

  0%|          | 0/422 [00:00<?, ?it/s]

  0%|          | 0/422 [00:00<?, ?it/s]

  0%|          | 0/52 [00:00<?, ?it/s]

(3294, 9) (3294, 9)
aucs: [0.8859248075669887, 0.8865878934967846, 0.8609451731021223, 0.8221015148302849, 0.8094511920651632, 0.8537668021256642, 0.90953751488686, 0.9662965001456923, 0.8236307274084143] 0.8686935695142193
epoch 5, step 421: train loss: 0.1413  val auc: 0.8686935695142193




  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/422 [00:00<?, ?it/s]

  0%|          | 0/422 [00:00<?, ?it/s]

  0%|          | 0/422 [00:00<?, ?it/s]

  0%|          | 0/422 [00:00<?, ?it/s]

  0%|          | 0/422 [00:00<?, ?it/s]

  0%|          | 0/52 [00:00<?, ?it/s]

(3294, 9) (3294, 9)
aucs: [0.8699623844272819, 0.8518954642035028, 0.8705634544497104, 0.8141455200562109, 0.8016721400025834, 0.8434806929797141, 0.9036465717688426, 0.9667081388841353, 0.8044634110660298] 0.8585041975375568
epoch 5, step 421: train loss: 0.1153  val auc: 0.8585041975375568




  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/422 [00:00<?, ?it/s]

  0%|          | 0/422 [00:00<?, ?it/s]

  0%|          | 0/422 [00:00<?, ?it/s]

  0%|          | 0/422 [00:00<?, ?it/s]

  0%|          | 0/422 [00:00<?, ?it/s]

  0%|          | 0/52 [00:00<?, ?it/s]

(3294, 9) (3294, 9)
aucs: [0.8645103846079399, 0.8215817922059788, 0.8710826995848489, 0.8111878991178753, 0.8007914482877396, 0.8420953916519963, 0.8988118868031532, 0.9650199575410829, 0.791522241131232] 0.8518448556590941
epoch 5, step 421: train loss: 0.0953  val auc: 0.8518448556590941




  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/422 [00:00<?, ?it/s]

  0%|          | 0/422 [00:00<?, ?it/s]

  0%|          | 0/422 [00:00<?, ?it/s]

  0%|          | 0/422 [00:00<?, ?it/s]

  0%|          | 0/422 [00:00<?, ?it/s]

  0%|          | 0/52 [00:00<?, ?it/s]

(3294, 9) (3294, 9)
aucs: [0.8666821516365678, 0.8331591347942391, 0.8826824399622812, 0.8205863515423031, 0.7981572015140068, 0.8454122176338, 0.8998220665797085, 0.9626010018084353, 0.7754024440152367] 0.8538338899429532
epoch 5, step 421: train loss: 0.0801  val auc: 0.8538338899429532




  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/422 [00:00<?, ?it/s]

  0%|          | 0/422 [00:00<?, ?it/s]

  0%|          | 0/422 [00:00<?, ?it/s]

  0%|          | 0/422 [00:00<?, ?it/s]

  0%|          | 0/422 [00:00<?, ?it/s]

  0%|          | 0/52 [00:00<?, ?it/s]

(3294, 9) (3294, 9)
aucs: [0.8550077747453044, 0.8030861798208274, 0.8565119952974025, 0.8030237775185289, 0.779616488114576, 0.8297461377733172, 0.8800260874496683, 0.9576335860209334, 0.7727841737143366] 0.837492911161655
epoch 5, step 421: train loss: 0.0685  val auc: 0.837492911161655




  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/422 [00:00<?, ?it/s]

  0%|          | 0/422 [00:00<?, ?it/s]

  0%|          | 0/422 [00:00<?, ?it/s]

  0%|          | 0/422 [00:00<?, ?it/s]

  0%|          | 0/422 [00:00<?, ?it/s]

  0%|          | 0/52 [00:00<?, ?it/s]

(3294, 9) (3294, 9)
aucs: [0.8481169631393196, 0.7997703288653059, 0.8491935780153569, 0.794299101003116, 0.7664061123919196, 0.8241424129250917, 0.8715866557023763, 0.9523562848910082, 0.7723754798495591] 0.8309163240870059
epoch 5, step 421: train loss: 0.0563  val auc: 0.8309163240870059




### Testing the model

In [24]:
# Loading test data
test_dataloader = DataLoader(
    TestDict,
    batch_size=batch_size, drop_last=False, shuffle=True, num_workers=0, collate_fn=data_collator
)

In [25]:
# Testing the model with new test data
model.eval()
eval_loss = evaluate_model(model, dev_dataloader, verbose=True)
test_loss = evaluate_model(model, test_dataloader, verbose=True)
model.train()
print(f'epoch {epoch}, step {i}: train loss: {ewm_loss:4.4f}  val auc: {eval_loss} test auc: {test_loss}')

  0%|          | 0/52 [00:00<?, ?it/s]

(3294, 9) (3294, 9)
aucs: [0.8481169631393196, 0.7997703288653059, 0.8491935780153569, 0.794299101003116, 0.7664053295548398, 0.8241424129250916, 0.8715866557023764, 0.9523562848910082, 0.7723754798495591] 0.8309162371051082


  0%|          | 0/53 [00:00<?, ?it/s]

(3367, 9) (3367, 9)
aucs: [0.8611487007747546, 0.8081942167101349, 0.8164954074386741, 0.7850328544381334, 0.7451383274787697, 0.8379773039407337, 0.9042158449453283, 0.9497562238426025, 0.7661326579016305] 0.8304546152745291
epoch 4, step 421: train loss: 0.0563  val auc: 0.8309162371051082 test auc: 0.8304546152745291


In [26]:
# Get the predictions for each test
model.eval()
facts, preds = predict_with_model(model, test_dataloader)

  0%|          | 0/53 [00:00<?, ?it/s]

In [28]:
from sklearn.metrics import f1_score

# Calculate the f1-score for each emotions
pd.DataFrame([
    {av: f1_score(facts[:, i], preds[:, i] > 0.5, average=av) for av in ['binary', 'micro', 'macro']}
    for i in range(9)
]).round(4)

Unnamed: 0,binary,micro,macro
0,0.6145,0.9415,0.7914
1,0.476,0.896,0.7092
2,0.5148,0.9658,0.7485
3,0.4405,0.9162,0.6976
4,0.4602,0.8488,0.6862
5,0.3617,0.9644,0.6717
6,0.6164,0.9834,0.804
7,0.7059,0.9896,0.8503
8,0.7202,0.7048,0.7039


In [30]:
# Calculate the average f1-score for all emotions
pd.DataFrame([
    {av: f1_score(facts[:, i], preds[:, i] > 0.5, average=av) for av in ['binary', 'micro', 'macro']}
    for i in range(6)
]).mean().round(4)

binary    0.4780
micro     0.9221
macro     0.7174
dtype: float64

### Save the trained model

In [31]:
model.save_pretrained(path + 'emotion_detection_model')
tokenizer.save_pretrained(path + 'emotion_detection_model')

# Load our model

In [32]:
!pip install transformers -q


[notice] A new release of pip available: 22.2.2 -> 23.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
# Set the flag to True if notebook running in colab
is_google_colab = False

# Main directory path
path = ''
if is_google_colab:
    path = '/content/drive/MyDrive/'

In [33]:
if is_google_colab:
    from google.colab import drive
    drive.mount('/content/drive')

In [34]:
import torch
from transformers import BertForSequenceClassification, AutoTokenizer

model = BertForSequenceClassification.from_pretrained(path + 'emotion_detection_model')
tokenizer = AutoTokenizer.from_pretrained(path + 'emotion_detection_model')

### Implement the prediction function

In [35]:
LABELS = ['радость', 'интерес', 'удивление', 'печаль', 'гнев', 'отвращение', 'страх', 'вина', 'нейтрально']

# Predicting emotion in text
@torch.no_grad()
def predict_emotion(text):
    inputs = tokenizer(text, truncation=True, return_tensors='pt')
    inputs = inputs.to(model.device)
    
    outputs = model(**inputs)
    
    pred = torch.nn.functional.softmax(outputs.logits, dim=1)
    pred = pred.argmax(dim=1)
    
    return LABELS[pred[0]]


# Probabilistic prediction of emotion in a text
@torch.no_grad()
def predict_emotions(text):
    inputs = tokenizer(text, truncation=True, return_tensors='pt')
    inputs = inputs.to(model.device)

    outputs = model(**inputs)

    pred = torch.nn.functional.softmax(outputs.logits, dim=1)

    emotions_list = {}
    for i in range(len(pred[0].tolist())):
        emotions_list[LABELS[i]] = round(pred[0].tolist()[i], 4)
    return emotions_list

### Тестируем модель

In [53]:
print(predict_emotion('Обожаю цветы'))
print(predict_emotion('Ненавижу цветы'))
print(predict_emotion('Мне страшно'))
print(predict_emotion('Куда мне стоит сегодня сходить?'))

радость
гнев
страх
интерес


In [51]:
print(predict_emotions('Обожаю цветы'))
print(predict_emotions('Ненавижу цветы'))
print(predict_emotions('Мне страшно'))
print(predict_emotions('Куда мне стоит сегодня сходить?'))

{'радость': 0.88, 'интерес': 0.1065, 'удивление': 0.0039, 'печаль': 0.0009, 'гнев': 0.0005, 'отвращение': 0.0002, 'страх': 0.0011, 'вина': 0.0006, 'нейтрально': 0.0063}
{'радость': 0.0001, 'интерес': 0.0003, 'удивление': 0.0001, 'печаль': 0.0003, 'гнев': 0.9972, 'отвращение': 0.0011, 'страх': 0.0, 'вина': 0.0, 'нейтрально': 0.0009}
{'радость': 0.0008, 'интерес': 0.0038, 'удивление': 0.0018, 'печаль': 0.002, 'гнев': 0.0003, 'отвращение': 0.0045, 'страх': 0.9828, 'вина': 0.0006, 'нейтрально': 0.0034}
{'радость': 0.0002, 'интерес': 0.951, 'удивление': 0.0005, 'печаль': 0.0003, 'гнев': 0.0006, 'отвращение': 0.0001, 'страх': 0.0003, 'вина': 0.0001, 'нейтрально': 0.047}
