# Генерация релевантных ответов представителей органов государственной власти

## Импорты

In [None]:
import os
from pathlib import Path

import pandas as pd
import numpy as np


In [None]:
from utils import *
from model import Config, CustomGPT2Model
from trainer import Trainer, TrainingManager
from custom_data import CustomDataset, CategoricalLabelEncoder

In [None]:
# Show the configured image and data locations before anything is loaded.
# Each label keeps its trailing space so the printed text is unchanged.
for label, location in (('IMAGES_PATH ', IMAGES_PATH), ('DATA_PATH ', DATA_PATH)):
    print(label, location)

# Fixed seed for the run (seed_all is a project helper; presumably it seeds
# every random generator in use — confirm in utils).
seed_all(42)

In [None]:
# Project plotting helper (not defined in this notebook, presumably exported by
# `utils`); it takes no arguments, so what it draws cannot be told from here.
plot_graph()

# Загрузка данных

In [None]:
# Read the source spreadsheet from the data directory and report its size.
dataset_file = os.path.join(DATA_PATH, "data_1.xlsx")
dataframe = load_data(dataset_file)
print("Number of rows and columns in the train data set:", dataframe.shape)

In [None]:
# Preview the first five rows of the loaded data.
dataframe.head(5)

In [None]:
# Print the frame summary (presumably a pandas DataFrame: column dtypes and
# non-null counts), useful for spotting columns with missing values.
dataframe.info()

In [None]:
# Descriptive statistics for the loaded data.
dataframe.describe()

## Анализ данных

In [None]:
# Project helper; presumably draws the value distribution of the
# 'type_problem' column — confirm in utils.
plot_hist(dataframe, 'type_problem')

In [None]:
# Same helper applied to the 'categoria' column (presumably its value
# distribution — confirm in utils).
plot_hist(dataframe, 'categoria')

## Препроцессинг

### Удаление пропусков

In [None]:
# Drop every row that lacks a value in 'target' or 'source', then print the
# frame summary again to check the remaining row count.
required_columns = ['target', 'source']
dataframe = dataframe.dropna(subset=required_columns)
dataframe.info()

### Кодирование меток

In [None]:
# Work on a copy so the cleaned `dataframe` stays as it was loaded.
work_dataframe = dataframe.copy()

# Columns handed to the label encoder.
categorical_columns = ['responsible_person', 'type_problem', 'topic', 'categoria', 'region']

# NOTE(review): `datafarme_encoders` is a misspelling of "dataframe", but later
# cells reference this exact name, so it is kept unchanged here.
# Presumably the encoder replaces these columns in `work_dataframe` with integer
# codes — TODO confirm in custom_data.
datafarme_encoders = CategoricalLabelEncoder(work_dataframe, categorical_columns)

# A bare expression in the middle of a cell is evaluated but never displayed,
# so the classes are printed explicitly.
print(datafarme_encoders.get_classes('type_problem'))

# Sanity check: decode code 10 of 'type_problem'.
# NOTE(review): assumes the column has at least 11 classes — confirm.
print(datafarme_encoders.decode('type_problem', 10))

In [None]:
# Preview the first four rows of the working copy after the encoder was built.
work_dataframe.head(4)

# Гиперпараметры

In [None]:
# Keep only the first rows of the working frame.
# NOTE(review): this looks like a quick-run limiter — everything downstream
# (training, evaluation) sees just these rows; confirm before a full run.
row_limit = 10
work_dataframe = work_dataframe[:row_limit]

In [None]:
# Start from the project defaults and override the settings used in this run.
config = Config()

# Settings are applied in the listed order; setattr(obj, name, value) is the
# same operation as `obj.name = value`.
run_settings = {
    'learning_rate': 1e-5,
    'num_epochs': 10,
    'max_length': 64,
    'temperature': 0.9,
    'batch_size': 16,
    'test_size': 0.1,
    'uniq_name': 'custom_gpt2_model',
    'special_eval': False,
}
for setting_name, setting_value in run_settings.items():
    setattr(config, setting_name, setting_value)

# Обучение

In [None]:
# Hand the working data, the label encoders and the run configuration to the
# manager, together with the data and image directories.
manager_paths = {'data_path': DATA_PATH, 'imgs_path': IMAGES_PATH}
training_manager = TrainingManager(work_dataframe, datafarme_encoders, config, **manager_paths)

In [None]:
# Start training; the call takes no arguments, so all settings come from what
# the manager received at construction.
training_manager.fit()

In [None]:
# Plot the main metrics held by the manager (presumably recorded during
# fit() — confirm in trainer).
training_manager.plot_main_metrics()

In [None]:
# Plot the special metrics held by the manager.
# NOTE(review): config.special_eval is set to False in the hyperparameter cell —
# confirm these metrics are still collected, otherwise this plot may be empty.
training_manager.plot_special_metrics()

In [None]:
# Saving is switched off for this run; uncomment the call below to enable it
# (presumably it persists the trained model — confirm in trainer).
# training_manager.save()

# Предсказание

In [None]:
# Take the frame behind the training dataloader and renumber its rows from 0;
# reset_index() keeps the previous index as a regular column.
testing_train_data: pd.DataFrame = (
    training_manager.train_dataloader.dataset.dataframe.reset_index()
)
testing_train_data.head()


In [None]:
# Pull the three fields of the row labelled 0 (the first row after the index
# reset) in one pass: problem-type code, input text and reference text.
type_message, source, target = (
    testing_train_data[column][0]
    for column in ('type_problem', 'source', 'target')
)

# Show the problem type as its decoded label rather than the raw code.
print(datafarme_encoders.decode('type_problem', type_message))

for caption, text in (('Source - ', source), ('Target - ', target)):
    print(caption, text)

In [None]:
def _generate(method, **decoding_options):
    """Generate text for the selected sample with the given decoding method.

    The source text, problem-type code and length limit are the same for every
    call; only the method name and its own options vary.
    """
    return training_manager.generate_text(
        source,
        type_message,
        max_length=config.max_length,
        method=method,
        **decoding_options,
    )


# Run the same sample through each decoding method and print the results
# one after another for comparison.
generated_text_argmax = _generate('argmax')
print('argmax -', generated_text_argmax)

# NOTE(review): temperature here is 0.7 while config.temperature is 0.9 —
# confirm the difference is intended.
generated_text_temperature = _generate('sampling', temperature=0.7)
print('sampling -', generated_text_temperature)

generated_text_top_k = _generate('top_k', top_k=11)
print('top_k -', generated_text_top_k)

generated_text_top_p = _generate('top_p', top_p=0.9)
print('top_p -', generated_text_top_p)

In [None]:
# Score the sampled generation against the reference text of the same row,
# using the evaluator owned by the trainer.
evaluator = training_manager.trainer.evaluator
special_metrics = evaluator.evaluate(hypotheses=[generated_text_temperature], references=[target])
print(special_metrics)