# Train new model

### Import libraries

In [248]:
import torch
import pandas as pd
from collections import Counter

### Load datasets for model training

In [249]:
# GLOBAL VARIABLES
LABELS = ['neutral', 'joy', 'sadness', 'anger', 'enthusiasm', 'surprise', 'disgust', 'fear', 'guilt', 'shame']
LABELS_RU = ['нейтрально', 'радость', 'грусть', 'гнев', 'интерес', 'удивление', 'отвращение', 'страх', 'вина', 'стыд']

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Current Device: `{device}`')

Current Device: `cuda`


In [250]:
# Set the flag to True if notebook running in colab
is_google_colab = False

# Main directory path
path = ''
if is_google_colab:
    path = '/content/drive/MyDrive/'

In [251]:
if is_google_colab:
    from google.colab import drive
    drive.mount('/content/drive')

### Create Dataset

In [252]:
def preprocess(show_distribution: bool = False):
    # Import and Clean Dataset
    dataset_fp = path + 'dataset/emotion_dataset_1.csv'
    dataset = pd.read_csv(dataset_fp)
    dataset = dataset[dataset.example_very_unclear == False]
    dataset = dataset.drop(columns=['example_very_unclear', 'id', 'author', 'subreddit', 'link_id', 'parent_id', 'created_utc', 'rater_id'])
    print(f'Old Dataset size: {dataset.shape[0]:_}\n')

    # Print distribution of emotion classes
    labels = ['admiration', 'amusement', 'anger', 'annoyance', 'approval',
              'caring', 'confusion', 'curiosity', 'desire', 'disappointment',
              'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear',
              'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride',
              'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']
    if show_distribution:
        print_emotion_distribution(labels, dataset)

    dataset = regroup_dataset(dataset)
    dataset2 = get_second_dataset()

    dataset = pd.concat([dataset, dataset2]).drop_duplicates()

    neutrals = dataset[dataset.neutral == 1][:15_000]
    dataset = pd.concat([dataset[dataset.neutral == 0], neutrals])
    dataset = dataset.sample(frac=1, random_state=41).reset_index(drop=True)

    ### <Concat Dublicates>
    dataset_ = dataset.groupby('text', sort=False, as_index=False).sum()

    # Normalize labels data
    for c in LABELS:
        dataset_[c] = dataset_[c].apply(lambda x: int(bool(x)))

    dublicates = dataset[dataset.text.duplicated()]
    dataset = dataset[~dataset.index.isin(dublicates.index)].reset_index(drop=True)
    dataset[LABELS] = dataset_.values[:, 1:]
    ### </Concat Dublicates>

    # Remove outliers
    drop_indices = [1099, 4575, 14120, 16033, 16122]
    dataset = dataset.drop(drop_indices).reset_index(drop=True)

    print(f'New Dataset size: {dataset.shape[0]:_}\n')

    # Print distribution of new labels
    if show_distribution:
        print_emotion_distribution(LABELS, dataset)

    return dataset

def print_emotion_distribution(labels, dataset):
    count = {}
    for label in labels:
        count[label] = dataset[label].sum()

    print('Class Distribution:')
    for label, count in Counter(count).most_common():
        print(f"{(label+':').ljust(15)} {count}")
    print()

def regroup_dataset(dataset):
    # Grouping classes
    new_dataset = dataset[['text', 'neutral']].copy()
    new_dataset['joy'] = dataset['excitement'] + dataset['joy'] + dataset['optimism']
    new_dataset['sadness'] = dataset['disappointment'] + dataset['grief'] + dataset['remorse'] + dataset['sadness']
    new_dataset['anger'] = dataset['anger'] + dataset['annoyance']
    new_dataset['enthusiasm'] = dataset['curiosity'] + dataset['desire']
    new_dataset['surprise'] = dataset['surprise']
    new_dataset['disgust'] = dataset['disgust']
    new_dataset['fear'] = dataset['fear']
    new_dataset['guilt'] = dataset['grief'] + dataset['remorse']
    new_dataset['shame'] = dataset['embarrassment']

    # Clear rows with no labels
    new_dataset = new_dataset[new_dataset.values[:, 1:].sum(axis=1) > 0]

    # Normalize labels data
    for c in LABELS:
        new_dataset[c] = new_dataset[c].apply(lambda x: int(bool(x)))
    return new_dataset

def get_second_dataset():
    dataset2 = pd.read_csv(path + 'dataset/emotion_dataset_2.csv')
    dataset2['text'] = dataset2['Text']
    dataset2 = dataset2[(dataset2.Emotion == 'fear') | (dataset2.Emotion == 'surprise')]
    dataset2['fear'] = (dataset2.Emotion == 'fear').astype('int')
    dataset2['surprise'] = (dataset2.Emotion == 'surprise').astype('int')
    dataset2 = dataset2.drop(columns=['Text', 'Emotion'])
    dataset2[['neutral', 'joy', 'sadness', 'anger', 'enthusiasm', 'disgust', 'guilt', 'shame']] = 0
    return dataset2

In [254]:
# dataset = preprocess(show_distribution=True)
# dataset.to_csv('dataset/goemotion.csv', index=False)

In [25]:
# dataset.to_csv('dataset/goemotion.csv', index=False)
# dataset.head()

In [26]:
# print_emotion_distribution(labels, dataset)

In [4]:
# N = 30000
# for i in range(N, N+1000):
#     print(f"\"{dataset.text[i]}\"\n")

In [463]:
# dataset_ru = dataset.copy()

In [5]:
# text = text.split('\n\n')
# text = [x[1:-1] for x in text]
# text

In [6]:
# len(text), (N, N+1000)

In [7]:
# dataset_ru.text[N:N+1000] = text

In [8]:
# dataset_ru.text = dataset_ru.text.apply(lambda x: x.replace('[ФАМИЛИЯ]', '[NAME]'))
# dataset_ru.to_csv('dataset/ru_goemotion.csv', index=False)
# dataset_ru[N-10:N+50]

In [9]:
# dataset[N-10:N+50]

In [256]:
# dataset_ru = dataset_ru.drop([16122]).reset_index(drop=True)
# dataset_ru.to_csv('dataset/ru_goemotion.csv', index=False)

In [258]:
# dataset_ru[24_850:]

In [260]:
# dataset[24_850:]

In [712]:
# dubs = dataset_ru[dataset_ru.text.duplicated()].text.values
# for text in dubs:
#     print(text, dataset_ru[dataset_ru.text == text].shape[0]-1)

### Load Preprocessed Data

In [261]:
dataset = pd.read_csv('dataset/goemotion.csv')
dataset_ru = pd.read_csv('dataset/ru_goemotion.csv')
df = dataset_ru[:24891]
df.head()

Unnamed: 0,text,neutral,joy,sadness,anger,enthusiasm,surprise,disgust,fear,guilt,shame
0,Разбит и ревнив,1,0,1,1,0,0,0,0,1,0
1,Мне все равно.,0,0,0,1,0,0,0,0,0,0
2,"Когда она сверлила стол [ИМЯ], она даже не зна...",1,0,1,0,0,0,0,0,0,0
3,"Мне кажется, это от Shitpostbot 5000.",1,1,0,0,0,0,0,0,0,0
4,Этот парень должен отправиться в АД.,0,0,0,1,0,0,1,0,0,0


In [262]:
# Let's derive 3 examples for each emotion
for i in range(len(LABELS_RU)):
    print(f"\nEMOTION: {LABELS_RU[i]}")
    for t in df[df.iloc[:, i+1] == 1].text.sample(3):
        print(t)


EMOTION: нейтрально
А, старый трюк со скрещиванием ног, чтобы казалось, что у тебя есть бедра
Мужчины, не получившие других совпадений, были гипергамны, подбирая себе пару с [NAME]
LOOOOOOOOOO" - "Почему они освистывают???" -OOOOCH "oohhhh that". Я всегда забываю, каждый раз"

EMOTION: радость
Я люблю [ФАМИЛИЮ] как игрока. Я по-прежнему считаю, что лучшее у него еще впереди. Просто молюсь, чтобы он оставался на поле
Кого больше волнует Суперкубок. Не могу дождаться, чтобы проверить эту лигу XFL...
Лол, заблуждение сильное

EMOTION: грусть
Число педофилов чертовски велико!
Бывали случаи, когда обходились без презервативов. Без глубокого проникновения, но все же. Немного. Я чувствую себя ужасно
"Но [NAME] хочет, чтобы у тебя были дети!"" Надо было [ИМЯ] тогда дать мне яичники lol

EMOTION: гнев
[NAME] - не очень умный человек. Дать мошеннику карточку, блин. Типа wtf
Ай, нижняя - моя!
Эх. Я говорю, что это точно

EMOTION: интерес
Это потрясающе! Кто-нибудь знает, есть ли версия для EST?


### Split Data

In [263]:
test_size = df.shape[0] // 10
df_train, df_test = df[:df.shape[0]-test_size].reset_index(drop=True), df[-test_size:].reset_index(drop=True)

In [264]:
N_FOLDS = 10

fold = 1 # Folds from 1 to 10 (by default)
fold_size = df_train.shape[0] // N_FOLDS

val_df = df_train[fold_size * (fold-1):fold_size * fold].reset_index(drop=True)
train_df = df_train.loc[~df_train.index.isin(range(fold_size * (fold - 1), fold_size * fold))].reset_index(drop=True)

In [732]:
# print_emotion_distribution(LABELS, val_df)

In [734]:
# print_emotion_distribution(LABELS, train_df)

In [265]:
X_train, y_train = train_df.text, train_df.iloc[:, 1:].values
X_test, y_test = df_test.text, df_test.iloc[:, 1:].values
X_val, y_val = val_df.text, val_df.iloc[:, 1:].values

In [293]:
# for i in range(30_000, 40_000):
#     print(i, dataset[dataset.text == dataset_ru.text[i]].index.values)

In [322]:
# dataset.text[6189]
# dataset_ru.text[6189]

In [323]:
# dataset

### Load the model

In [1026]:
from transformers import BertForSequenceClassification, AutoTokenizer

base_model = 'cointegrated/rubert-tiny2'

tokenizer = AutoTokenizer.from_pretrained(base_model)
model = BertForSequenceClassification.from_pretrained(base_model, num_labels=10, problem_type='multi_label_classification')

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2

In [266]:
model.to(device) # Give the model access to the GPU

model.config.label2id = {label: i for i, label in enumerate(LABELS)}
model.config.id2label = {i: label for i, label in enumerate(LABELS)}

In [267]:
%%time

# Preparing datasets for training
train_dict = [{**tokenizer(X_train[i], truncation=True), 'label': y_train[i].astype(float)} for i in range(train_df.shape[0])]
test_dict = [{**tokenizer(X_test[i], truncation=True), 'label': y_test[i].astype(float)} for i in range(df_test.shape[0])]
val_dict = [{**tokenizer(X_val[i], truncation=True), 'label': y_val[i].astype(float)} for i in range(val_df.shape[0])]

CPU times: total: 2.48 s
Wall time: 2.49 s


In [270]:
import gc
import torch

# Cleaning unnecessary data during training
def cleanup():
    gc.collect()
    torch.cuda.empty_cache()

cleanup()

In [271]:
from tqdm.auto import tqdm, trange
import numpy as np
from sklearn.metrics import roc_auc_score, precision_recall_fscore_support
from IPython.display import display

# Check the accuracy of the model on validate/test data
def evaluate_model(model, dev_dataloader, verbose=False, labels=None):
    facts, preds = predict_with_model(model, dev_dataloader, verbose)
    aucs = get_classification_report(facts, preds, labels)
    if verbose:
        print('aucs:', aucs, round(np.mean(aucs), 4))
    return round(np.mean(aucs), 4)

# Get model prediction
def predict_with_model(model, dataloader, verbose=False):
    preds = []
    facts = []

    tq = dataloader
    if verbose:
        tq = tqdm(dataloader)

    for batch in tq:
        facts.append(batch.labels.cpu().numpy())
        batch = batch.to(model.device)
        with torch.no_grad():
            pr = model(input_ids=batch.input_ids, attention_mask=batch.attention_mask, token_type_ids=batch.token_type_ids)
        preds.append(torch.softmax(pr.logits, -1).cpu().numpy())
    facts = np.concatenate(facts)
    preds = np.concatenate(preds)
    return facts, preds

# Get results of the model
def get_classification_report(facts, preds, labels=None):
    # print(facts.shape, preds.shape)
    aucs = [round(roc_auc_score(facts[:, i], preds[:, i]), 4) for i in range(10)]
    return aucs

In [272]:
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader

data_collator = DataCollatorWithPadding(tokenizer)
batch_size = 32

# Create training and test (validation) data for model training
train_dataloader = DataLoader(train_dict, batch_size=batch_size, drop_last=False, shuffle=True, num_workers=0, collate_fn=data_collator)
dev_dataloader = DataLoader(val_dict, batch_size=batch_size, drop_last=False, shuffle=False, num_workers=0, collate_fn=data_collator)

In [273]:
# Testing model accuracy
evaluate_model(model, dev_dataloader, verbose=True)

  0%|          | 0/70 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  tensor = as_tensor(value)


aucs: [0.7062, 0.8328, 0.7984, 0.784, 0.8452, 0.805, 0.7985, 0.8761, 0.8535, 0.7472] 0.8047


0.8047

### Train the model

In [1033]:
# Initializing training parameters

optimizer = torch.optim.AdamW(params=model.parameters(), lr=5e-5)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda epoch: .5 ** epoch)

# ewm_loss = 0
# window = 500

In [1034]:
%%time

cleanup_step = 100

# Training model
for epoch in trange(10):
    model.train()
    cleanup()

    print('LR:', scheduler.get_last_lr())
    tq = tqdm(train_dataloader)
    for i, batch in enumerate(tq):
        # Calculate the loss function
        batch = batch.to(model.device)
        output = model(**batch)
        loss = output.loss
        loss.backward()

        # Update weights
        optimizer.step()
        optimizer.zero_grad()

        if i % cleanup_step == 0:
             # Testing model performance
            model.eval()
            eval_loss = evaluate_model(model, dev_dataloader, verbose=False)
            val_loss = []
            for batch in dev_dataloader:
                batch = batch.to(model.device)
                output = model(**batch)
                val_loss.append(output.loss.item())
            val_loss = sum(val_loss) / len(val_loss)
            print(f'epoch {epoch + 1}, step {i} train loss: {loss.item():.4f} val loss: {val_loss:.4f} val auc: {eval_loss}')
            model.train()
            cleanup()

        tq.set_description(f'loss: {loss.item():.4f}')

    scheduler.step()
    # Testing model performance
    model.eval()
    eval_loss = evaluate_model(model, dev_dataloader, verbose=True)
    print(f'epoch {epoch + 1}, step {i} val auc: {eval_loss}\n\n')

  0%|          | 0/20 [00:00<?, ?it/s]

LR: [5e-05]


  0%|          | 0/631 [00:00<?, ?it/s]

epoch 1, step 0 train loss: 0.6971 val loss: 0.6912 val auc: 0.485
epoch 1, step 100 train loss: 0.4696 val loss: 0.4630 val auc: 0.6444
epoch 1, step 200 train loss: 0.4426 val loss: 0.4455 val auc: 0.694
epoch 1, step 300 train loss: 0.4430 val loss: 0.4193 val auc: 0.7378
epoch 1, step 400 train loss: 0.3896 val loss: 0.4026 val auc: 0.7558
epoch 1, step 500 train loss: 0.3656 val loss: 0.3908 val auc: 0.7659
epoch 1, step 600 train loss: 0.3005 val loss: 0.3823 val auc: 0.7756


  0%|          | 0/70 [00:00<?, ?it/s]

aucs: [0.6916, 0.8126, 0.7865, 0.7591, 0.8337, 0.6993, 0.7668, 0.8564, 0.8338, 0.7174] 0.7757
epoch 1, step 630 val auc: 0.7757


LR: [2.5e-05]


  0%|          | 0/631 [00:00<?, ?it/s]

epoch 2, step 0 train loss: 0.3372 val loss: 0.3809 val auc: 0.776
epoch 2, step 100 train loss: 0.3902 val loss: 0.3765 val auc: 0.7819
epoch 2, step 200 train loss: 0.2945 val loss: 0.3742 val auc: 0.7848
epoch 2, step 300 train loss: 0.3482 val loss: 0.3711 val auc: 0.7896
epoch 2, step 400 train loss: 0.3258 val loss: 0.3692 val auc: 0.7919
epoch 2, step 500 train loss: 0.3208 val loss: 0.3672 val auc: 0.795
epoch 2, step 600 train loss: 0.3577 val loss: 0.3668 val auc: 0.7973


  0%|          | 0/70 [00:00<?, ?it/s]

aucs: [0.7016, 0.8264, 0.7982, 0.7747, 0.8464, 0.7803, 0.7941, 0.8649, 0.8501, 0.747] 0.7984
epoch 2, step 630 val auc: 0.7984


LR: [1.25e-05]


  0%|          | 0/631 [00:00<?, ?it/s]

epoch 3, step 0 train loss: 0.3300 val loss: 0.3649 val auc: 0.7985
epoch 3, step 100 train loss: 0.3028 val loss: 0.3640 val auc: 0.7985
epoch 3, step 200 train loss: 0.3319 val loss: 0.3639 val auc: 0.7992
epoch 3, step 300 train loss: 0.3337 val loss: 0.3634 val auc: 0.8001
epoch 3, step 400 train loss: 0.2991 val loss: 0.3627 val auc: 0.8014
epoch 3, step 500 train loss: 0.3083 val loss: 0.3620 val auc: 0.8018
epoch 3, step 600 train loss: 0.2765 val loss: 0.3619 val auc: 0.8025


  0%|          | 0/70 [00:00<?, ?it/s]

aucs: [0.7066, 0.83, 0.7984, 0.7823, 0.8439, 0.7956, 0.7978, 0.8723, 0.8516, 0.7438] 0.8022
epoch 3, step 630 val auc: 0.8022


LR: [6.25e-06]


  0%|          | 0/631 [00:00<?, ?it/s]

epoch 4, step 0 train loss: 0.2767 val loss: 0.3614 val auc: 0.8023
epoch 4, step 100 train loss: 0.3442 val loss: 0.3609 val auc: 0.8037
epoch 4, step 200 train loss: 0.2893 val loss: 0.3610 val auc: 0.8036
epoch 4, step 300 train loss: 0.2966 val loss: 0.3609 val auc: 0.8028
epoch 4, step 400 train loss: 0.2868 val loss: 0.3613 val auc: 0.8031
epoch 4, step 500 train loss: 0.3027 val loss: 0.3613 val auc: 0.8038
epoch 4, step 600 train loss: 0.3398 val loss: 0.3611 val auc: 0.804


  0%|          | 0/70 [00:00<?, ?it/s]

aucs: [0.7065, 0.8321, 0.7984, 0.784, 0.8442, 0.8027, 0.7972, 0.8752, 0.8537, 0.7486] 0.8043
epoch 4, step 630 val auc: 0.8043


LR: [3.125e-06]


  0%|          | 0/631 [00:00<?, ?it/s]

epoch 5, step 0 train loss: 0.3363 val loss: 0.3610 val auc: 0.8043
epoch 5, step 100 train loss: 0.3026 val loss: 0.3608 val auc: 0.8043
epoch 5, step 200 train loss: 0.2918 val loss: 0.3609 val auc: 0.8041
epoch 5, step 300 train loss: 0.3056 val loss: 0.3610 val auc: 0.8045
epoch 5, step 400 train loss: 0.3115 val loss: 0.3608 val auc: 0.8046
epoch 5, step 500 train loss: 0.3374 val loss: 0.3606 val auc: 0.8045
epoch 5, step 600 train loss: 0.2986 val loss: 0.3609 val auc: 0.8043


  0%|          | 0/70 [00:00<?, ?it/s]

aucs: [0.7063, 0.8327, 0.798, 0.7835, 0.8457, 0.8033, 0.7975, 0.8754, 0.8532, 0.7474] 0.8043
epoch 5, step 630 val auc: 0.8043


LR: [1.5625e-06]


  0%|          | 0/631 [00:00<?, ?it/s]

epoch 6, step 0 train loss: 0.2739 val loss: 0.3607 val auc: 0.8043
epoch 6, step 100 train loss: 0.3327 val loss: 0.3608 val auc: 0.8046
epoch 6, step 200 train loss: 0.2637 val loss: 0.3606 val auc: 0.8045
epoch 6, step 300 train loss: 0.3405 val loss: 0.3607 val auc: 0.8047
epoch 6, step 400 train loss: 0.2929 val loss: 0.3607 val auc: 0.8046
epoch 6, step 500 train loss: 0.3710 val loss: 0.3607 val auc: 0.8045
epoch 6, step 600 train loss: 0.2917 val loss: 0.3607 val auc: 0.8045


  0%|          | 0/70 [00:00<?, ?it/s]

aucs: [0.7062, 0.8326, 0.7982, 0.7837, 0.8451, 0.8044, 0.7981, 0.8757, 0.8534, 0.748] 0.8045
epoch 6, step 630 val auc: 0.8045


LR: [7.8125e-07]


  0%|          | 0/631 [00:00<?, ?it/s]

epoch 7, step 0 train loss: 0.3077 val loss: 0.3607 val auc: 0.8045
epoch 7, step 100 train loss: 0.2633 val loss: 0.3607 val auc: 0.8044
epoch 7, step 200 train loss: 0.2717 val loss: 0.3607 val auc: 0.8044
epoch 7, step 300 train loss: 0.2987 val loss: 0.3607 val auc: 0.8046
epoch 7, step 400 train loss: 0.3176 val loss: 0.3607 val auc: 0.8047
epoch 7, step 500 train loss: 0.3058 val loss: 0.3607 val auc: 0.8046
epoch 7, step 600 train loss: 0.2158 val loss: 0.3608 val auc: 0.8047


  0%|          | 0/70 [00:00<?, ?it/s]

aucs: [0.7063, 0.8327, 0.7983, 0.7838, 0.8451, 0.8049, 0.7983, 0.876, 0.8536, 0.7476] 0.8047
epoch 7, step 630 val auc: 0.8047


LR: [3.90625e-07]


  0%|          | 0/631 [00:00<?, ?it/s]

epoch 8, step 0 train loss: 0.2861 val loss: 0.3608 val auc: 0.8046
epoch 8, step 100 train loss: 0.3130 val loss: 0.3608 val auc: 0.8047
epoch 8, step 200 train loss: 0.3024 val loss: 0.3608 val auc: 0.8046
epoch 8, step 300 train loss: 0.3504 val loss: 0.3608 val auc: 0.8046
epoch 8, step 400 train loss: 0.3482 val loss: 0.3608 val auc: 0.8047
epoch 8, step 500 train loss: 0.2586 val loss: 0.3607 val auc: 0.8046
epoch 8, step 600 train loss: 0.2790 val loss: 0.3607 val auc: 0.8046


  0%|          | 0/70 [00:00<?, ?it/s]

aucs: [0.7063, 0.8328, 0.7984, 0.784, 0.8453, 0.8049, 0.7983, 0.876, 0.8536, 0.7472] 0.8047
epoch 8, step 630 val auc: 0.8047


LR: [1.953125e-07]


  0%|          | 0/631 [00:00<?, ?it/s]

epoch 9, step 0 train loss: 0.2992 val loss: 0.3607 val auc: 0.8047
epoch 9, step 100 train loss: 0.2832 val loss: 0.3607 val auc: 0.8047
epoch 9, step 200 train loss: 0.3122 val loss: 0.3607 val auc: 0.8046
epoch 9, step 300 train loss: 0.3127 val loss: 0.3607 val auc: 0.8047
epoch 9, step 400 train loss: 0.2929 val loss: 0.3607 val auc: 0.8047
epoch 9, step 500 train loss: 0.2932 val loss: 0.3607 val auc: 0.8047
epoch 9, step 600 train loss: 0.3029 val loss: 0.3607 val auc: 0.8047


  0%|          | 0/70 [00:00<?, ?it/s]

aucs: [0.7062, 0.8328, 0.7984, 0.784, 0.8452, 0.805, 0.7984, 0.876, 0.8536, 0.7473] 0.8047
epoch 9, step 630 val auc: 0.8047


LR: [9.765625e-08]


  0%|          | 0/631 [00:00<?, ?it/s]

epoch 10, step 0 train loss: 0.2849 val loss: 0.3607 val auc: 0.8047
epoch 10, step 100 train loss: 0.2914 val loss: 0.3608 val auc: 0.8047
epoch 10, step 200 train loss: 0.2674 val loss: 0.3608 val auc: 0.8047
epoch 10, step 300 train loss: 0.2702 val loss: 0.3607 val auc: 0.8047
epoch 10, step 400 train loss: 0.3314 val loss: 0.3607 val auc: 0.8047
epoch 10, step 500 train loss: 0.2739 val loss: 0.3607 val auc: 0.8047
epoch 10, step 600 train loss: 0.3596 val loss: 0.3607 val auc: 0.8047


  0%|          | 0/70 [00:00<?, ?it/s]

aucs: [0.7063, 0.8329, 0.7984, 0.784, 0.8452, 0.805, 0.7984, 0.8761, 0.8535, 0.7473] 0.8047
epoch 10, step 630 val auc: 0.8047


LR: [4.8828125e-08]


  0%|          | 0/631 [00:00<?, ?it/s]

epoch 11, step 0 train loss: 0.3257 val loss: 0.3608 val auc: 0.8047
epoch 11, step 100 train loss: 0.3111 val loss: 0.3607 val auc: 0.8047
epoch 11, step 200 train loss: 0.3203 val loss: 0.3607 val auc: 0.8047
epoch 11, step 300 train loss: 0.3136 val loss: 0.3607 val auc: 0.8047
epoch 11, step 400 train loss: 0.3374 val loss: 0.3608 val auc: 0.8047
epoch 11, step 500 train loss: 0.3177 val loss: 0.3608 val auc: 0.8047
epoch 11, step 600 train loss: 0.3481 val loss: 0.3608 val auc: 0.8047


  0%|          | 0/70 [00:00<?, ?it/s]

aucs: [0.7062, 0.8328, 0.7984, 0.784, 0.8452, 0.805, 0.7985, 0.8761, 0.8535, 0.7472] 0.8047
epoch 11, step 630 val auc: 0.8047


LR: [2.44140625e-08]


  0%|          | 0/631 [00:00<?, ?it/s]

epoch 12, step 0 train loss: 0.3206 val loss: 0.3608 val auc: 0.8047
epoch 12, step 100 train loss: 0.2680 val loss: 0.3608 val auc: 0.8047
epoch 12, step 200 train loss: 0.2924 val loss: 0.3608 val auc: 0.8047
epoch 12, step 300 train loss: 0.2739 val loss: 0.3608 val auc: 0.8047
epoch 12, step 400 train loss: 0.2711 val loss: 0.3608 val auc: 0.8047
epoch 12, step 500 train loss: 0.2474 val loss: 0.3608 val auc: 0.8047
epoch 12, step 600 train loss: 0.2905 val loss: 0.3608 val auc: 0.8047


  0%|          | 0/70 [00:00<?, ?it/s]

aucs: [0.7062, 0.8328, 0.7984, 0.784, 0.8452, 0.805, 0.7985, 0.8761, 0.8535, 0.7472] 0.8047
epoch 12, step 630 val auc: 0.8047


LR: [1.220703125e-08]


  0%|          | 0/631 [00:00<?, ?it/s]

epoch 13, step 0 train loss: 0.3478 val loss: 0.3608 val auc: 0.8047
epoch 13, step 100 train loss: 0.2537 val loss: 0.3608 val auc: 0.8047


KeyboardInterrupt: 

In [843]:
# train_df.to_csv('train.csv', index=False)
# val_df.to_csv('val.csv', index=False)
# df_test.to_csv('test.csv', index=False)

### Testing the model

In [275]:
# Loading test data
test_dataloader = DataLoader(test_dict, batch_size=batch_size, drop_last=False, shuffle=False, num_workers=0, collate_fn=data_collator)

In [299]:
# Testing the model with new test data
model.eval()
eval_loss = evaluate_model(model, dev_dataloader, verbose=True)
test_loss = evaluate_model(model, test_dataloader, verbose=True)
model.train()
print(f'val auc: {eval_loss} test auc: {test_loss}')

  0%|          | 0/70 [00:00<?, ?it/s]

aucs: [0.7062, 0.8328, 0.7984, 0.784, 0.8452, 0.805, 0.7985, 0.8761, 0.8535, 0.7472] 0.8047


  0%|          | 0/78 [00:00<?, ?it/s]

aucs: [0.7319, 0.8234, 0.8069, 0.7884, 0.8493, 0.8047, 0.8147, 0.9034, 0.8528, 0.7145] 0.809
val auc: 0.8047 test auc: 0.809


In [282]:
# Get the predictions for each test
model.eval()
facts, preds = predict_with_model(model, test_dataloader)

In [284]:
from sklearn.metrics import f1_score

# Calculate the f1-score for each emotions
pd.DataFrame([
    {av: f1_score(facts[:, i], preds[:, i] > 0.5, average=av) for av in ['binary', 'micro', 'macro']}
    for i in range(10)
]).round(4)

Unnamed: 0,binary,micro,macro
0,0.3863,0.7192,0.6021
1,0.5833,0.7951,0.7237
2,0.4157,0.8204,0.6548
3,0.4016,0.7642,0.6274
4,0.5386,0.863,0.7291
5,0.194,0.9032,0.5712
6,0.0,0.9156,0.478
7,0.6596,0.9482,0.8158
8,0.0,0.9526,0.4879
9,0.0,0.9606,0.49


In [287]:
# Calculate the average f1-score for all emotions
pd.DataFrame([
    {av: f1_score(facts[:, i], preds[:, i] > 0.5, average=av) for av in ['binary', 'micro', 'macro']}
    for i in range(10)
]).mean().round(4)

binary    0.3179
micro     0.8642
macro     0.6180
dtype: float64

### Save the trained model

In [1043]:
# model.save_pretrained(path + 'emotion_detection')
# tokenizer.save_pretrained(path + 'emotion_detection')

# Load our model

In [2]:
# !pip install transformers -q

In [3]:
# Set the flag to True if notebook running in colab
is_google_colab = False

# Main directory path
path = ''
if is_google_colab:
    path = '/content/drive/MyDrive/'

In [4]:
if is_google_colab:
    from google.colab import drive
    drive.mount('/content/drive')

### Implement the prediction function

In [247]:
import torch
from transformers import BertForSequenceClassification, AutoTokenizer

LABELS = ['neutral', 'joy', 'sadness', 'anger', 'enthusiasm', 'surprise', 'disgust', 'fear', 'guilt', 'shame']
LABELS_RU = ['нейтрально', 'радость', 'грусть', 'гнев', 'интерес', 'удивление', 'отвращение', 'страх', 'вина', 'стыд']

model = BertForSequenceClassification.from_pretrained(path + 'emotion_detection')
tokenizer = AutoTokenizer.from_pretrained(path + 'emotion_detection')

# Predicting emotion in text
@torch.no_grad()
def predict_emotion(text: str, labels: list = LABELS) -> str:
    inputs = tokenizer(text, truncation=True, return_tensors='pt')
    inputs = inputs.to(model.device)

    outputs = model(**inputs)

    pred = torch.nn.functional.sigmoid(outputs.logits)
    pred = pred.argmax(dim=1)

    return labels[pred[0]]


# Probabilistic prediction of emotion in a text
@torch.no_grad()
def predict_emotions(text: str, labels: list = LABELS) -> dict:
    inputs = tokenizer(text, truncation=True, return_tensors='pt')
    inputs = inputs.to(model.device)

    outputs = model(**inputs)

    pred = torch.nn.functional.sigmoid(outputs.logits)

    emotions_list = {}
    for i in range(len(pred[0].tolist())):
        emotions_list[labels[i]] = round(pred[0].tolist()[i], 3)
    return emotions_list

### Тестируем модель

In [58]:
print(predict_emotion('Обожаю цветы'))
print(predict_emotion('Ненавижу цветы'))
print(predict_emotion('Мне страшно'))
print(predict_emotion('Куда мне стоит сегодня сходить?'))

joy
anger
fear
enthusiasm


In [59]:
def print_predict(prompt):
    emotions = predict_emotions(prompt)
    for label in sorted(emotions, key=lambda x: -emotions[x]):
        print(label.title().ljust(10), emotions[label])

print_predict('Как дела?')

Enthusiasm 0.825
Neutral    0.413
Surprise   0.184
Anger      0.155
Joy        0.091
Sadness    0.049
Fear       0.033
Disgust    0.026
Shame      0.023
Guilt      0.009


In [295]:
# NEUTRAL    - Сейчас ровно час дня
# JOY        - Сегодня такой прекрасный день!
# SADNESS    - Жалею что вчера сходил на этот концерт
# ANGER      - Что за бред я только что посмотрел...
# ENTHUSIASM - Куда бы сегодня сходить?
# SURPRISE   - Воу, это было так неожиданно
# DISGUST    - Фу, эта еда просто отвратительна!
# FEAR       - В темной комнате услышал тихий посторонний шорох
# GUILT      - Извини, я не хотел чтобы так все произошло

print_predict('Сегодня такой замечательный день!')

Joy        0.873
Neutral    0.229
Surprise   0.153
Enthusiasm 0.099
Anger      0.047
Sadness    0.045
Fear       0.023
Disgust    0.015
Shame      0.012
Guilt      0.008


In [297]:
print(predict_emotions('Обожаю цветы'))
print(predict_emotions('Ненавижу цветы'))
print(predict_emotions('Мне страшно'))
print(predict_emotions('Сегодня такой прекрасный день!'))
print(predict_emotions('Сегодня такой замечательный день!'))

{'neutral': 0.229, 'joy': 0.827, 'sadness': 0.036, 'anger': 0.039, 'enthusiasm': 0.217, 'surprise': 0.143, 'disgust': 0.011, 'fear': 0.021, 'guilt': 0.007, 'shame': 0.01}
{'neutral': 0.187, 'joy': 0.079, 'sadness': 0.243, 'anger': 0.833, 'enthusiasm': 0.06, 'surprise': 0.05, 'disgust': 0.394, 'fear': 0.037, 'guilt': 0.022, 'shame': 0.108}
{'neutral': 0.097, 'joy': 0.078, 'sadness': 0.128, 'anger': 0.086, 'enthusiasm': 0.081, 'surprise': 0.119, 'disgust': 0.066, 'fear': 0.875, 'guilt': 0.036, 'shame': 0.047}
{'neutral': 0.265, 'joy': 0.865, 'sadness': 0.044, 'anger': 0.047, 'enthusiasm': 0.101, 'surprise': 0.124, 'disgust': 0.014, 'fear': 0.021, 'guilt': 0.008, 'shame': 0.01}
{'neutral': 0.229, 'joy': 0.873, 'sadness': 0.045, 'anger': 0.047, 'enthusiasm': 0.099, 'surprise': 0.153, 'disgust': 0.015, 'fear': 0.023, 'guilt': 0.008, 'shame': 0.012}
