## Модель

In [1]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score, accuracy_score

In [1]:
import numpy as np
import pandas as pd

In [3]:
model_name = 'DeepPavlov/rubert-base-cased-conversational'

In [4]:
tokenizer = BertTokenizer.from_pretrained(model_name)

In [5]:
model = BertForSequenceClassification.from_pretrained(model_name)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased-conversational and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Данные

In [2]:
# df = pd.read_csv("train.csv")
# df_val = pd.read_csv("val.csv")
# df_test = pd.read_csv("test.csv")

# df = pd.read_csv("train_aggreem.csv")
# df_val = pd.read_csv("val_aggreem.csv")
# df_test = pd.read_csv("test_aggreem.csv")

# RST = 1
# RST = 2
# RST = 3
RST = 4
df = pd.read_csv('./splits/train_randst{}.csv'.format(RST))
df_val = pd.read_csv('./splits/val_randst{}.csv'.format(RST))
df_test = pd.read_csv('./splits/test_randst{}.csv'.format(RST))
                     
# train_df = train_df[["text", "inappropriate"]]
# val_df = test_df[["text", "inappropriate"]]
# test_df = test_df[["text", "inappropriate"]]

for d in [df, df_val, df_test]:
    d['inappropriate'] = d['inappropriate'].apply(round)


In [4]:
len(df)+len(df_val)+len(df_test)

124597

In [7]:
df.columns

Index(['text', 'inappropriate', 'offline_crime', 'online_crime', 'drugs',
       'gambling', 'pornography', 'prostitution', 'slavery', 'suicide',
       'terrorism', 'weapons', 'body_shaming', 'health_shaming', 'politics',
       'racism', 'religion', 'sexual_minorities', 'sexism', 'social_injustice',
       'human_labeled', 'toxic_auto'],
      dtype='object')

In [8]:
def round_val(val):
    return round(val)

In [9]:
# label_name = 'inappropriate'
# threshold = 0
# df = df[(df[label_name] >= 1-threshold)|(df[label_name] <=threshold)]
# df_val = df_val[(df_val[label_name] >= 1-threshold)|(df_val[label_name] <=threshold)]
# df_test = df_test[(df_test[label_name] >= 1-threshold) | (df_test[label_name] <=threshold)]

# df[label_name] = df[label_name].apply(round_val)
# df_val[label_name] = df_val[label_name].apply(round_val)
# df_test[label_name] = df_test[label_name].apply(round_val)

In [10]:
df.columns

Index(['text', 'inappropriate', 'offline_crime', 'online_crime', 'drugs',
       'gambling', 'pornography', 'prostitution', 'slavery', 'suicide',
       'terrorism', 'weapons', 'body_shaming', 'health_shaming', 'politics',
       'racism', 'religion', 'sexual_minorities', 'sexism', 'social_injustice',
       'human_labeled', 'toxic_auto'],
      dtype='object')

In [11]:
class UnsafeDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

In [12]:
train_dataset = UnsafeDataset(tokenizer(df.text.tolist(),
                                        max_length=64,
                                        truncation=True,
                                        padding='longest'), df.inappropriate.tolist())

In [13]:
eval_dataset = UnsafeDataset(tokenizer(df_val.text.tolist(),
                                       max_length=64,
                                       truncation=True,
                                       padding='longest'), df_val.inappropriate.tolist())


In [14]:
test_dataset = UnsafeDataset(tokenizer(df_test.text.tolist(),
                                       max_length=64,
                                       truncation=True,
                                       padding='longest'), df_test.inappropriate.tolist())

## Обучение

In [15]:
import os
os.environ['CUDA_VISIBLE_DEVICES']= '6'

In [16]:
from transformers import Trainer, TrainingArguments
from transformers.file_utils import cached_property
from typing import Tuple

device = torch.device('cuda')

class TrAr(TrainingArguments):
    @cached_property
    def _setup_devices(self) -> Tuple["torch.device", int]:
        return device

In [17]:
# torch.cuda.set_device(device)
model.to(device);

In [18]:
for param in model.bert.parameters():
    param.requires_grad=True

In [19]:
training_args = TrAr(
    output_dir='./unsafe/FINAL_VERS',   # output directory
    overwrite_output_dir=True,
    num_train_epochs=5,            # total # of training epochs
    per_device_train_batch_size=32,  # batch size per device during training
    per_device_eval_batch_size=32,   # batch size for evaluation
    warmup_steps=0,               # number of warmup steps for learning rate scheduler
    weight_decay=1e-8,              # strength of weight decay
    learning_rate=2e-5,
    save_total_limit=2,
    logging_dir='./logs',           # directory for storing logs
    logging_steps=2500,
    eval_steps=2500,
    save_steps=2500,
    evaluation_strategy='steps',metric_for_best_model = 'f1',greater_is_better = True, load_best_model_at_end = True, report_to='none'
)

In [20]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import roc_auc_score
def compute_metrics(pred):
    labels = pred.label_ids
    preds_prob = pred.predictions[:,-1]
    rauc = roc_auc_score(labels, preds_prob)
    
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        "roc_auc":rauc
    }

In [21]:
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=eval_dataset,           # evaluation dataset
    tokenizer=tokenizer,
    compute_metrics  = compute_metrics
)

In [22]:
from transformers.trainer_callback import EarlyStoppingCallback
trainer.add_callback(EarlyStoppingCallback(2)) 

In [23]:
training_args.device

device(type='cuda')

In [24]:
trainer.train()

***** Running training *****
  Num examples = 99677
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 15575


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
2500,0.335,0.290585,0.875923,0.873164,0.872847,0.875923,0.931783
5000,0.2038,0.367803,0.870626,0.865481,0.867373,0.870626,0.929167
7500,0.1263,0.526586,0.874318,0.87273,0.871977,0.874318,0.926649


***** Running Evaluation *****
  Num examples = 12460
  Batch size = 32
Saving model checkpoint to ./unsafe/FINAL_VERS/checkpoint-2500
Configuration saved in ./unsafe/FINAL_VERS/checkpoint-2500/config.json
Model weights saved in ./unsafe/FINAL_VERS/checkpoint-2500/pytorch_model.bin
tokenizer config file saved in ./unsafe/FINAL_VERS/checkpoint-2500/tokenizer_config.json
Special tokens file saved in ./unsafe/FINAL_VERS/checkpoint-2500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 12460
  Batch size = 32
Saving model checkpoint to ./unsafe/FINAL_VERS/checkpoint-5000
Configuration saved in ./unsafe/FINAL_VERS/checkpoint-5000/config.json
Model weights saved in ./unsafe/FINAL_VERS/checkpoint-5000/pytorch_model.bin
tokenizer config file saved in ./unsafe/FINAL_VERS/checkpoint-5000/tokenizer_config.json
Special tokens file saved in ./unsafe/FINAL_VERS/checkpoint-5000/special_tokens_map.json
Deleting older checkpoint [unsafe/FINAL_VERS/checkpoint-7500] due to args.save

TrainOutput(global_step=7500, training_loss=0.22169110107421874, metrics={'train_runtime': 1187.4157, 'train_samples_per_second': 419.722, 'train_steps_per_second': 13.117, 'total_flos': 7893134327508480.0, 'train_loss': 0.22169110107421874, 'epoch': 2.41})

## Evaluation

In [25]:
trainer.evaluate(test_dataset)

***** Running Evaluation *****
  Num examples = 12460
  Batch size = 32


{'eval_loss': 0.2874436378479004,
 'eval_accuracy': 0.8794542536115569,
 'eval_f1': 0.8768171192254086,
 'eval_precision': 0.8765662325398438,
 'eval_recall': 0.8794542536115569,
 'eval_roc_auc': 0.9338746374606537,
 'eval_runtime': 16.4635,
 'eval_samples_per_second': 756.825,
 'eval_steps_per_second': 23.689,
 'epoch': 2.41}

In [1]:
data = [[0.8765, 0.879, 0.876, 0.933],
        [0.872, 0.873, 0.8730, 0.9302],
        [0.8735,0.87399,0.8737,0.9300],
       [0.8724,0.87399,0.8727,0.93009]]

In [4]:
import numpy as np

In [5]:
np.mean(data, axis= 0)

array([0.8736   , 0.874995 , 0.87385  , 0.9308225])

In [6]:
np.std(data, axis= 0)

array([0.0017621 , 0.00234734, 0.00129325, 0.00125917])

In [None]:
rst4
{'eval_loss': 0.2874436378479004,
 'eval_accuracy': 0.8794542536115569,
 'eval_f1': 0.8768171192254086,
 'eval_precision': 0.8765662325398438,
 'eval_recall': 0.8794542536115569,
 'eval_roc_auc': 0.9338746374606537,
 'eval_runtime': 16.4635,
 'eval_samples_per_second': 756.825,
 'eval_steps_per_second': 23.689,
 'epoch': 2.41}

In [None]:
pred = trainer.predict(test_dataset)

In [None]:
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score, accuracy_score

In [None]:
import numpy as np
from scipy.special import softmax
from sklearn.metrics import precision_recall_fscore_support, classification_report, roc_auc_score
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score, accuracy_score
# Function to calculate the accuracy of our predictions vs labels
def get_metrics(preds):
    preds, labels = preds.predictions, preds.label_ids
    #standard round approach    
    pred_flat = np.argmax(preds, axis=1).flatten()    
    pr, rec, f, _ = precision_recall_fscore_support(labels, pred_flat, average='weighted')  
    
    print("precision", pr)
    print("recall", rec)
    print("fscore_weighted", f)
    
    #adjust threshold approach
    preds_adj = np.array([[float(el1),float(el2)] for el1,el2 in preds])
    preds_adj = softmax(preds_adj, axis = 1)
    roc_auc = roc_auc_score(labels, preds_adj[:, 1])
    print("roc_auc", roc_auc)
    
    all_metrcis = []
    for threshold in [0.01,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1]:
        metrcis = []
        pred_labels = (preds_adj[:, 1] >= threshold).astype(int)
        metrcis.append(threshold)
        metrcis.append(round(f1_score(labels, pred_labels, average='weighted'),2))  
        metrcis.append(round(precision_score(labels, pred_labels),2))  
        metrcis.append(round(recall_score(labels, pred_labels),2))  
        metrcis.append(round(accuracy_score(labels, pred_labels),2))  
        all_metrcis.append(metrcis)

    df_metrics = pd.DataFrame(data = all_metrcis, columns = ['threshold','f1','prec','rec','acc'])
    df_metrics = df_metrics.sort_values(by='f1', ascending=False)
    
    print(classification_report(labels, pred_flat))
    
    print(df_metrics.head())
    
    return f

get_metrics(pred)