### Эксперимент с предобученной моделью их DeepPavlov.

#### Для подготовки данных использовалась только токенизация. Исходила из предположения, что лемматизация и удаление стоп-слов усложнят задачу обнаружения плохо сгенерированного текста.

In [None]:
# Пути к файлам с данными
data_file_path = '/kaggle/input/pp2-dataset/hackaton_result_dataset.xlsx'

In [14]:
# Необходимые модули
import pandas as pd
import json
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback
)

from sklearn.model_selection import train_test_split
from datasets import Dataset
import pandas as pd
import numpy as np
import evaluate
import torch

from sklearn.metrics import roc_auc_score, balanced_accuracy_score, f1_score, precision_score, recall_score, accuracy_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import classification_report

In [5]:
# Данные
df = pd.read_excel(data_file_path)
df = df[['model_annotation', 'label']]
df

Unnamed: 0,model_annotation,label
0,давай по россии значит на коленях быстро блять...,1
1,ну разве можно так с телефоном поступает,0
2,у меня нет с собой в полном адресе я щас дома ...,0
3,а я здесь кто я санитар,0
4,дежурный по кузьминскому военнокомату,0
...,...,...
6503,это студия,1
6504,потише говори у меня рядом течение вдруг сидит...,0
6505,если в поймаю дай бог а зачем тогда будешь рез...,1
6506,а ты все удобром что ли а че будет алло алло т...,1


In [6]:
# Срздание обучающей и валидационной выборок
train_df, valid_df = train_test_split(df, test_size=0.2, random_state=42)


In [7]:
# Токенизация, используется тот же токенизатор, что и в выбранной модели
tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")
tokenized_train = tokenizer(train_df['model_annotation'].tolist(), padding=True, truncation=True, return_tensors="pt").to("cuda")
tokenized_test = tokenizer(valid_df['model_annotation'].tolist(), padding=True, truncation=True, return_tensors="pt").to("cuda")

tokenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [8]:
# Формирование датасетов
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, tokenized_texts, labels):
        self.tokenized_texts = tokenized_texts
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            "input_ids": self.tokenized_texts["input_ids"][idx],
            "attention_mask": self.tokenized_texts["attention_mask"][idx],
            "labels": torch.tensor(self.labels[idx]).to('cuda')
        }

train_labels = train_df['label'].tolist()
test_labels = valid_df['label'].tolist()
train_dataset = CustomDataset(tokenized_train, train_labels)
eval_dataset = CustomDataset(tokenized_test, test_labels)

In [74]:
# Аргументы для обучения модели
training_args = TrainingArguments(
    output_dir="output",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    warmup_steps=100,
    weight_decay=0.02,
#     evaluation_strategy="epoch",
#     save_strategy='epoch',
    evaluation_strategy="steps",
    eval_steps=50,
    save_steps=150,
    remove_unused_columns=True,
    load_best_model_at_end=True,
    logging_steps=50,
    report_to='none',
    metric_for_best_model="roc_auc"
)

In [75]:
# Функция подсчёта меткири
def compute_metrics(eval_pred):
    labels = eval_pred.label_ids
    probabilities = eval_pred.predictions[:, 1]  # Assuming the second column contains the probabilities for class 1
    roc_auc = roc_auc_score(labels, probabilities)
    return {"roc_auc": roc_auc}

In [76]:
# Предобученная модель
model = AutoModelForSequenceClassification.from_pretrained(
    "DeepPavlov/rubert-base-cased", num_labels=2).to("cuda")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [77]:
# Гиперпараметры для обучения модели
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer = tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [78]:
# Обучение модели
trainer.train()



Step,Training Loss,Validation Loss,Roc Auc
50,0.6703,0.668175,0.63462
100,0.6094,0.604002,0.729847
150,0.6266,0.639463,0.734489
200,0.5376,0.611068,0.746775
250,0.4994,0.619693,0.765972
300,0.5101,0.614261,0.766037
350,0.436,0.651965,0.773783
400,0.3234,0.708034,0.774195
450,0.3193,0.704598,0.777074
500,0.3071,0.793847,0.765053




TrainOutput(global_step=600, training_loss=0.43405691146850583, metrics={'train_runtime': 306.5248, 'train_samples_per_second': 84.92, 'train_steps_per_second': 2.659, 'total_flos': 699438601794600.0, 'train_loss': 0.43405691146850583, 'epoch': 3.68})

#### Здесь у меня немного перемешались ячейки. Сначала я находила лучшую модель, потом её сжимала в zip и создавала ссылку для загрузкм.

In [36]:
!zip -r best3.zip /kaggle/working/output/best_model

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  adding: kaggle/working/output/best_model/ (stored 0%)
  adding: kaggle/working/output/best_model/tokenizer.json (deflated 73%)
  adding: kaggle/working/output/best_model/scheduler.pt (deflated 56%)
  adding: kaggle/working/output/best_model/training_args.bin (deflated 51%)
  adding: kaggle/working/output/best_model/tokenizer_config.json (deflated 75%)
  adding: kaggle/working/output/best_model/optimizer.pt (deflated 53%)
  adding: kaggle/working/output/best_model/vocab.txt (deflated 64%)
  adding: kaggle/working/output/best_model/rng_state.pth (deflated 25%)
  adding: kaggle/working/output/best_model/config.json (deflated 53%)
  adding: kaggle/working/output/best_model/trainer_state.json (deflated 76%)
  adding: kaggle/working/output/best_model/model.safetensors (deflated 7%)
  adding: kaggle/working/output/best_model/special_tokens_map.json (deflated 42%)


In [37]:
from IPython.display import FileLink
FileLink(r'best3.zip')

In [35]:
import os

# Среди сохраненных чекпоинтов выбирается в наилучшей метрикой
checkpoint_files = [f for f in os.listdir(training_args.output_dir) if f.startswith("checkpoint-")]

best_metric = float('-inf')
best_checkpoint = None
for checkpoint_file in checkpoint_files:
    
    trainer.model = AutoModelForSequenceClassification.from_pretrained(os.path.join(training_args.output_dir, checkpoint_file)).to('cuda')

    result = trainer.evaluate()

    if result["eval_roc_auc"] > best_metric:
        best_metric = result["eval_roc_auc"]
        best_checkpoint = checkpoint_file

best_checkpoint_path = os.path.join(training_args.output_dir, best_checkpoint)
best_model_path = os.path.join(training_args.output_dir, "best_model")
os.rename(best_checkpoint_path, best_model_path)





