# Run model inference on test data

In [None]:
!pip3 install emoji transformers

In [None]:
import torch

In [None]:
# Inference is pinned to the CPU; the commented-out expression would pick a
# GPU whenever one is available.
device = "cpu"  # 'cuda' if torch.cuda.is_available() else 'cpu'
# Ask PyTorch to release any cached, currently unused CUDA memory.
torch.cuda.empty_cache()

## Read

In [None]:
from pathlib import Path

# Input locations, relative to the notebook's working directory.
data_directory_path = Path("../data/")
data_path = data_directory_path.joinpath("train_dataset_train.csv")
# The validation rows are currently read from the training file itself;
# the commented-out path is the dedicated validation file.
val_data_path = data_path  # data_directory_path / "val.csv"

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Both files are semicolon-separated.
data = pd.read_csv(data_path, sep=";")
# Only the last 90 rows of the validation file are kept for inference.
val_data = pd.read_csv(val_data_path, sep=";").iloc[-90:]

In [None]:
data

In [None]:
# Reduce the validation frame to its single text column (still a DataFrame).
val_data = val_data["Текст инцидента"].to_frame()

In [None]:
val_data

## Preprocess

In [None]:
from utils import preprocess

In [None]:
# Run the same project preprocessing over the training and validation frames.
data, val_data = (
    preprocess.preprocess_data(frame) for frame in (data, val_data)
)

## Dataset

In [None]:
from utils.dataset import prepare_data_for_dataset, MessagesDataset

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# One label encoder per target column, each fitted on that column of `data`.
executor_encoder, group_encoder, theme_encoder = (
    LabelEncoder().fit(data[column_name])
    for column_name in ("Исполнитель", "Группа тем", "Тема")
)

In [None]:
# Overwrite each target column of `data` with the output of its fitted encoder.
for column_name, column_encoder in (
    ("Исполнитель", executor_encoder),
    ("Группа тем", group_encoder),
    ("Тема", theme_encoder),
):
    data[column_name] = column_encoder.transform(data[column_name])

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Keep only the held-out 1% of every array produced by prepare_data_for_dataset;
# the training parts are bound to the throwaway name `_`.
# The fixed random_state makes the split reproducible between runs.
# NOTE(review): the unpacking order assumes prepare_data_for_dataset returns
# (executor, group, message, theme) — confirm against utils.dataset.
_, test_executor, _, test_group, _, test_message, _, test_theme = train_test_split(
    *prepare_data_for_dataset(data),
    test_size=0.01,
    random_state=42,
)

In [None]:
# `_` still references the last discarded training part of the split; drop it.
del _

In [None]:
# Only the held-out split is wrapped in a dataset; the training dataset line
# below is kept commented out.
#train_dataset = MessagesDataset(train_message, train_executor, train_theme, train_group)
test_dataset  = MessagesDataset(test_message, test_executor, test_theme, test_group)

In [None]:
class InferenceDataset(torch.utils.data.Dataset):
    """Dataset of unlabeled messages, used when only predictions are needed."""

    def __init__(self, messages) -> None:
        """Store ``messages``, a sized collection that supports indexing."""
        self.messages = messages

    def __len__(self) -> int:
        """Return how many messages the dataset holds."""
        return len(self.messages)

    def __getitem__(self, idx):
        """Return the message stored under ``idx``."""
        return self.messages[idx]

In [None]:
# Feed the dataset a plain Python list of the validation texts.
val_messages = val_data["Текст инцидента"].tolist()
val_dataset = InferenceDataset(val_messages)

## Model

Используется двуголовая модель.

In [None]:
# Directory that holds the saved checkpoints.
models_path = Path("../models")
# File name of the weights to load; it is also reused as the prefix of the
# prediction CSV file name.
local_weights_filename = "full_ruroberta_5.pt"

In [None]:
from transformers import AutoConfig, AutoModel, AutoTokenizer

# "sberbank-ai/ruRoberta-large"
# "DeepPavlov/distilrubert-base-cased-conversational"
# "xlm-roberta-base"
language_model_name = "sberbank-ai/ruRoberta-large"

# Build the backbone from its configuration only: `from_config` creates the
# architecture without loading pretrained weights. The trained weights are
# loaded later through `load_state_dict`.
language_model_config = AutoConfig.from_pretrained(language_model_name)
language_model = AutoModel.from_config(language_model_config)
tokenizer = AutoTokenizer.from_pretrained(
    language_model_name,
    model_max_length=512,
)
#tokenizer.add_special_tokens({'pad_token': '[PAD]'})

In [None]:
#train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True)
# Both loaders iterate in dataset order (no shuffling), so predictions line up
# with the input rows.
loader_options = {"batch_size": 128, "shuffle": False}
test_dataloader = torch.utils.data.DataLoader(test_dataset, **loader_options)
val_dataloader = torch.utils.data.DataLoader(val_dataset, **loader_options)

In [None]:
nllloss = torch.nn.NLLLoss()


def classifier_loss(y_pred, y, first_head_weight=0.0, second_head_weight=1.0):
    """Return the weighted sum of the NLL losses of the two classifier heads.

    Args:
        y_pred: pair of per-head outputs; each is passed to ``NLLLoss``,
            which expects log-probabilities.
        y: pair of per-head target class indices, in the same order as
            ``y_pred``.
        first_head_weight: multiplier of the first head's loss. The default
            of 0.0 keeps that head out of the reported value.
        second_head_weight: multiplier of the second head's loss.

    Returns:
        ``first_head_weight * loss_1 + second_head_weight * loss_2``. Passing
        1.0 for both weights gives the plain sum of the two losses.
    """
    # Both losses are always computed (even with a zero weight) so that a
    # malformed first-head output still fails loudly, as it did originally.
    first_head_loss = nllloss(y_pred[0], y[0])
    second_head_loss = nllloss(y_pred[1], y[1])
    return first_head_weight * first_head_loss + second_head_weight * second_head_loss

In [None]:
from utils.classifier import Classifier, evaluate_classifier

In [None]:
# Assemble the classifier around the (still untrained) backbone and move it to
# the target device.
classifier = Classifier(
    language_model,
    tokenizer,
    preprocess.theme_to_group(data),
    executor_encoder,
    theme_encoder,
    hid_size=1024,
)
classifier = classifier.to(device)

# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted
# source. The call stays the last expression of the cell so that the notebook
# still shows the result of load_state_dict.
classifier.load_state_dict(
    torch.load(models_path / local_weights_filename, map_location=device)
)

In [None]:
# Check the loaded weights on the held-out split before running inference.
# NOTE(review): what is reported depends on utils.classifier.evaluate_classifier,
# which is not visible here.
evaluate_classifier(classifier, test_dataloader, classifier_loss, device)

In [None]:
# The evaluation objects are no longer needed; drop their names.
# Note that `test_dataset` itself is not deleted here.
del test_dataloader, test_executor, test_group, test_message, test_theme

## Inference

In [None]:
from tqdm import tqdm

In [None]:
def inference_classifier(classifier, dataloader, device) -> "tuple[np.ndarray, np.ndarray, np.ndarray]":
    """
    Run the classifier over a dataloader and collect its class predictions.

    The classifier is switched to evaluation mode for the duration of the
    call and is put back into its previous mode afterwards, even when a
    batch raises.

    Args:
        classifier: module exposing ``tokenizer`` and ``theme_to_group``
            (an object with a dict-like ``get``); calling it on a tokenized
            batch must return two score tensors, one per head.
        dataloader: iterable of batches in a form the tokenizer accepts
            (the notebook passes batches of raw message strings).
        device: device the tokenized batch is moved to.

    Returns:
        Three arrays with one entry per message: the argmax class index of
        the first head, the argmax class index of the second head, and the
        second-head index mapped through ``classifier.theme_to_group``.
        All three are empty integer arrays when the dataloader yields
        no batches.
    """
    # Remember the current mode so it can be restored afterwards.
    was_in_training = classifier.training
    classifier.eval()

    # Per-batch argmax indices of the two heads.
    y_pred_all_1 = []
    y_pred_all_2 = []

    try:
        with torch.no_grad():
            for message in tqdm(dataloader):
                tokens = classifier.tokenizer(
                    message,
                    padding=True,
                    truncation='only_first',
                    return_tensors="pt",
                ).to(device)

                y_pred_1, y_pred_2 = classifier(tokens)

                y_pred_all_1.append(np.argmax(y_pred_1.detach().cpu().numpy(), axis=1))
                y_pred_all_2.append(np.argmax(y_pred_2.detach().cpu().numpy(), axis=1))
    finally:
        # Return to the original mode, also when a batch failed.
        classifier.train(was_in_training)

    if not y_pred_all_1:
        # np.concatenate and np.vectorize both reject empty input, so an
        # empty dataloader is answered explicitly.
        return (
            np.empty(0, dtype=np.intp),
            np.empty(0, dtype=np.intp),
            np.empty(0, dtype=np.intp),
        )

    y_pred_all_1 = np.concatenate(y_pred_all_1)
    y_pred_all_2 = np.concatenate(y_pred_all_2)
    # Derive the third output from the second head via the lookup.
    y_pred_all_3 = np.vectorize(classifier.theme_to_group.get)(y_pred_all_2)

    return y_pred_all_1, y_pred_all_2, y_pred_all_3

In [None]:
# Run inference once, then decode the class indices back to their labels.
# Element 1 holds the theme indices and element 2 the group indices; element 0
# (first head) is not written to the output.
prediction_columns = inference_classifier(classifier, val_dataloader, device)
prediction = pd.DataFrame(
    {
        "Тема": theme_encoder.inverse_transform(prediction_columns[1]),
        "Группа тем": group_encoder.inverse_transform(prediction_columns[2]),
    }
)

In [None]:
# The predictions are stored next to the input data, in a file named after the
# weights file with ".csv" appended.
prediction_file_name = f"{local_weights_filename}.csv"
prediction_path = data_directory_path / prediction_file_name
prediction.to_csv(prediction_path, sep=";")

In [None]:
prediction