# Baseline solution

In [None]:
!pip3 install emoji transformers

In [None]:
import torch

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Release cached GPU memory left over from earlier runs in this kernel.
torch.cuda.empty_cache()

## Read

In [None]:
from pathlib import Path

# Location of the training CSV, relative to the notebook's working directory.
data_directory_path = Path("..") / "data"
data_path = data_directory_path.joinpath("train_dataset_train.csv")

In [None]:
import pandas as pd
import numpy as np

In [None]:
# The file is semicolon-separated. For quick experiments a small subset can be
# taken by appending `.tail(100)` to the call.
data = pd.read_csv(data_path, delimiter=';')

In [None]:
# Show the loaded frame as the cell output.
data

Все тексты инцидентов начинаются с "'"

In [None]:
# True only if every incident text starts with an apostrophe. The vectorised
# string accessor does not raise on empty strings, and `na=False` makes missing
# values count as "does not start with it" instead of propagating NaN.
data["Текст инцидента"].str.startswith("'", na=False).all()

Каждой теме соответствует только одна группа тем:

In [None]:
# Count the distinct theme groups observed for each theme; any count above one
# would mean a theme is mapped to several groups.
groups_per_theme = data.groupby(["Тема"])[["Группа тем"]].nunique()
groups_per_theme.gt(1).any()

Нет тем, которые со звёздочкой и без одновременно:

In [None]:
# Distinct theme names present in the data.
themes = {theme for theme in data["Тема"].unique()}

In [None]:
# Drop the first two characters of every theme name and look for collisions
# with the full names; an empty set means no collisions.
themes & {theme[2:] for theme in data["Тема"].unique()}

## Preprocess

In [None]:
from utils import preprocess

In [None]:
# Replace the raw frame with the output of utils.preprocess.preprocess_data.
# NOTE(review): `data` is rebound here, so re-running this cell feeds already
# preprocessed data back in — confirm preprocess_data is safe to apply twice.
data = preprocess.preprocess_data(data)

In [None]:
# Incident texts that contain the phrase "в шоке".
data.loc[data["Текст инцидента"].str.contains("в шоке"), "Текст инцидента"]

## Dataset

In [None]:
def tokenize(x):
    """Tokenize ``x`` into padded, truncated PyTorch tensors moved to ``device``.

    ``tokenizer`` and ``device`` are resolved as notebook globals at call time,
    so ``tokenizer`` must be defined before the first call.
    """
    encoded = tokenizer(x, padding=True, truncation='only_first', return_tensors="pt")
    return encoded.to(device)

In [None]:
from utils.dataset import prepare_data_for_dataset, MessagesDataset

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# One label encoder per categorical target column, fitted on the full frame.
executor_encoder = LabelEncoder()
executor_encoder.fit(data["Исполнитель"])

group_encoder = LabelEncoder()
group_encoder.fit(data["Группа тем"])

theme_encoder = LabelEncoder()
theme_encoder.fit(data["Тема"])

In [None]:
# Replace each target column with its integer class ids.
for column, encoder in (
    ("Исполнитель", executor_encoder),
    ("Группа тем", group_encoder),
    ("Тема", theme_encoder),
):
    data[column] = encoder.transform(data[column])

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split every array returned by prepare_data_for_dataset with the same 75/25
# train/test partition; the fixed random_state keeps the split reproducible.
# NOTE(review): the unpacking order assumes the helper returns
# (executor, group, message, theme) in that order — confirm in utils.dataset.
train_executor, test_executor, train_group, test_group, train_message, test_message, train_theme, test_theme = train_test_split(
    *prepare_data_for_dataset(data, device),
    test_size=0.25,
    random_state=42,
)

In [None]:
# Wrap the split arrays in dataset objects. The argument order here is
# (message, executor, theme, group), which differs from the order in which the
# split results were unpacked.
train_dataset = MessagesDataset(train_message, train_executor, train_theme, train_group)
test_dataset  = MessagesDataset(test_message, test_executor, test_theme, test_group)

## Model

Используется двуголовая модель.

In [None]:
# Directory holding saved weights, the checkpoint file to read from it, and a
# switch selecting local weights instead of the pretrained hub weights.
models_path = Path("..") / "models"
local_weights_filename = "distilbert.pt"
load_local_weights = False

In [None]:
from transformers import AutoConfig, AutoModel, AutoTokenizer

# Candidate checkpoint names:
#   "sberbank-ai/ruRoberta-large"
#   "DeepPavlov/distilrubert-base-cased-conversational"
#   "xlm-roberta-base"
language_model_name = "DeepPavlov/distilrubert-base-cased-conversational"

if not load_local_weights:
    # Start from the published pretrained weights.
    language_model = AutoModel.from_pretrained(language_model_name)
else:
    # Build the architecture only; the weights are loaded from a local state
    # dict once the classifier has been constructed.
    model_config = AutoConfig.from_pretrained(language_model_name)
    language_model = AutoModel.from_config(model_config)

tokenizer = AutoTokenizer.from_pretrained(language_model_name, model_max_length=512)

In [None]:
# Freeze the encoder: none of its parameters will receive gradients.
for parameter in language_model.parameters():
    parameter.requires_grad = False

In [None]:
from utils.classifier import Classifier, evaluate_classifier

In [None]:
# Build the classifier on top of the language model.
# NOTE(review): hid_size=768 presumably matches the encoder's hidden width — confirm.
classifier = Classifier(
    language_model,
    tokenizer,
    preprocess.theme_to_group(data),
    executor_encoder,
    theme_encoder,
    hid_size=768,
).to(device)

if load_local_weights:
    # map_location remaps tensors that were saved on another device, so a
    # checkpoint written on a GPU can still be opened on a CPU-only machine.
    state_dict = torch.load(models_path / local_weights_filename, map_location=device)
    classifier.load_state_dict(state_dict)
    classifier = classifier.to(device)

In [None]:
# Training batches are reshuffled on every pass; evaluation keeps a fixed order
# and uses a larger batch size.
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=32,
)
test_dataloader = torch.utils.data.DataLoader(
    test_dataset,
    shuffle=False,
    batch_size=128,
)

In [None]:
nllloss = torch.nn.NLLLoss()


def classifier_loss(y_pred, y):
    """Weighted sum of the two heads' NLL losses.

    ``y_pred`` and ``y`` are pairs; the first elements get weight 0.2 and the
    second elements get weight 0.8.
    """
    first_head_loss = nllloss(y_pred[0], y[0])
    second_head_loss = nllloss(y_pred[1], y[1])
    return 0.2 * first_head_loss + 0.8 * second_head_loss


# Baseline metrics of the classifier before any training.
evaluate_classifier(classifier, test_dataloader, classifier_loss, device)

In [None]:
from IPython.display import clear_output
import matplotlib.pyplot as plt

In [None]:
from sklearn.metrics import f1_score
from tqdm import tqdm

def train_classifier(classifier, train_dataloader, test_dataloader, classifier_loss, device, lr=1e-6, n_epochs: int=5):
    """Train ``classifier`` with Adam and redraw the metric curves after every epoch.

    Args:
        classifier: model exposing ``tokenizer`` and ``parameters()``; calling it
            on a tokenized batch must return a pair of predictions.
        train_dataloader: yields ``(message, executor, theme, group)`` batches.
        test_dataloader: passed unchanged to ``evaluate_classifier``.
        classifier_loss: callable taking ``(y_pred_1, y_pred_2)`` and
            ``(executor, theme)`` and returning a scalar loss tensor.
        device: device the tokenized batch is moved to.
        lr: Adam learning rate.
        n_epochs: number of passes over ``train_dataloader``.

    Returns:
        dict mapping ``train_loss``, ``test_loss``, ``test_f1_score_et`` and
        ``test_f1_score_tg`` to lists with one value per epoch.

    NOTE(review): the training/eval mode of ``classifier`` is never set here;
    if ``evaluate_classifier`` switches the model to eval mode, later epochs
    train in that mode — confirm against utils.classifier.
    """
    # The keys must be exactly the ones appended to after each epoch; a
    # mismatch raises KeyError at the end of the first epoch.
    classifier_metrics = {
        "train_loss": [],
        "test_loss": [],
        "test_f1_score_et": [],
        "test_f1_score_tg": [],
    }

    opt = torch.optim.Adam(classifier.parameters(), lr=lr)

    for _epoch in range(n_epochs):
        avg_loss = 0.0
        total_samples = 0

        for batch in tqdm(train_dataloader):
            # The group label is unpacked but not used by the loss.
            message, executor, theme, _group = batch
            del batch

            batch_size = theme.shape[0]

            tokens = classifier.tokenizer(message, padding=True, truncation='only_first',
                                          return_tensors="pt").to(device)
            del message

            y_pred_1, y_pred_2 = classifier(tokens)
            _loss = classifier_loss((y_pred_1, y_pred_2), (executor, theme))
            del y_pred_1, y_pred_2, executor, theme

            opt.zero_grad()
            _loss.backward()
            opt.step()

            # Weight the batch loss by its size so the epoch value is a
            # per-sample mean even when the last batch is smaller.
            avg_loss += _loss.item() * batch_size
            total_samples += batch_size

        avg_loss /= total_samples
        classifier_metrics["train_loss"].append(avg_loss)

        test_loss, test_f1_score_et, test_f1_score_tg = evaluate_classifier(classifier, test_dataloader, classifier_loss, device)
        classifier_metrics["test_loss"].append(test_loss)
        classifier_metrics["test_f1_score_et"].append(test_f1_score_et)
        classifier_metrics["test_f1_score_tg"].append(test_f1_score_tg)

        # Replace the previous epoch's figure with an updated one: one subplot
        # per metric, in alphabetical order of the metric name.
        clear_output(True)
        plt.figure(figsize=(18,4))
        for position, (name, history) in enumerate(sorted(classifier_metrics.items()), start=1):
            plt.subplot(1, len(classifier_metrics), position)
            plt.title(name)
            plt.plot(range(1, len(history) + 1), history)
            plt.grid()

        plt.show()

    return classifier_metrics

In [None]:
# Stage 1: the language model's parameters were frozen earlier
# (requires_grad=False), so this run with a larger learning rate only updates
# the parameters that still require gradients.
train_classifier(classifier, train_dataloader, test_dataloader, classifier_loss, device, lr=1e-3, n_epochs=6)

In [None]:
# Re-enable gradients for every parameter of the classifier, including the
# language model that was frozen earlier.
for parameter in classifier.parameters():
    parameter.requires_grad = True

In [None]:
# Stage 2: with all parameters trainable again, continue training with a much
# smaller learning rate for more epochs.
train_classifier(classifier, train_dataloader, test_dataloader, classifier_loss, device, lr=1e-6, n_epochs=30)

In [None]:
# Final evaluation on the held-out split; the returned triple is unpacked
# elsewhere in this notebook as (test_loss, f1_score_et, f1_score_tg).
evaluate_classifier(classifier, test_dataloader, classifier_loss, device)

In [None]:
# Spot check: predictions for the first ten incident texts of the frame.
# These rows come from the full data, so some may belong to the training split.
classifier.predict(data["Текст инцидента"][:10].tolist(), device=device)

In [None]:
# Original theme names for the same ten rows, decoded from the label-encoded
# "Тема" column.
theme_encoder.inverse_transform(data["Тема"][:10].tolist())

## Save model

In [None]:
# Persist the classifier's weights as a state dict.
# NOTE(review): the file is written as "distilbert2.pt", while
# local_weights_filename (used for loading) is "distilbert.pt" — confirm the
# different name is intentional.
torch.save(classifier.state_dict(), models_path / "distilbert2.pt")

In [None]:
import pickle

# Pickle the fitted executor and theme encoders next to the model weights.
for encoder_filename, fitted_encoder in (
    ("executor_encoder.obj", executor_encoder),
    ("theme_encoder.obj", theme_encoder),
):
    with open(models_path / encoder_filename, 'wb') as file:
        pickle.dump(fitted_encoder, file)