## Модель

In [1]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score, accuracy_score

In [2]:
import numpy as np
import pandas as pd

In [3]:
model_name = 'DeepPavlov/rubert-base-cased-conversational'

In [4]:
tokenizer = BertTokenizer.from_pretrained(model_name)

In [5]:
model = BertForSequenceClassification.from_pretrained(model_name)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased-conversational and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Данные

In [6]:
df1 = pd.read_csv("../../../DATA/toxic/labeled.csv")
df1.head()

Unnamed: 0,comment,toxic
0,"Верблюдов-то за что? Дебилы, бл...\n",1.0
1,"Хохлы, это отдушина затюканого россиянина, мол...",1.0
2,Собаке - собачья смерть\n,1.0
3,"Страницу обнови, дебил. Это тоже не оскорблени...",1.0
4,"тебя не убедил 6-страничный пдф в том, что Скр...",1.0


In [7]:
df1.columns = ['text','toxic']

In [8]:
set(df1['toxic'])

{0.0, 1.0}

In [9]:
collected_data = []
with open("../../../DATA/toxic/dataset.txt") as f:
    for l in f.readlines():
        split_line = l.split()
        label = split_line[0]
        text = ' '.join(split_line[1:])
        collected_data.append([text,label])
#         break

In [10]:
df2 = pd.DataFrame(data = collected_data, columns = ['text','toxic'])

In [11]:
df2.head()

Unnamed: 0,text,toxic
0,скотина! что сказать,__label__INSULT
1,я сегодня проезжала по рабочей и между домами ...,__label__NORMAL
2,очередной лохотрон. зачем придумывать очередно...,__label__NORMAL
3,"ретро дежавю ... сложно понять чужое сердце , ...",__label__NORMAL
4,а когда мы статус агрогородка получили?,__label__NORMAL


In [12]:
df2['toxic'] = df2['toxic'].apply(lambda x: 1 if x != '__label__NORMAL' else 0)

In [13]:
df2.head()

Unnamed: 0,text,toxic
0,скотина! что сказать,1
1,я сегодня проезжала по рабочей и между домами ...,0
2,очередной лохотрон. зачем придумывать очередно...,0
3,"ретро дежавю ... сложно понять чужое сердце , ...",0
4,а когда мы статус агрогородка получили?,0


In [14]:
df_concat = pd.concat([df1,df2])
df_concat = df_concat.drop_duplicates(subset = ['text'])
df_concat = df_concat.sample(frac=1, random_state = 1).reset_index(drop=True)
df_concat.dropna(inplace = True)
len(df_concat)

262695

In [16]:
set(df_concat['toxic'])

{0.0, 1.0}

In [18]:
df_concat['toxic'] = df_concat['toxic'].apply(round)

In [19]:
df_concat.head()

Unnamed: 0,text,toxic
0,мои помидори без никакого ускорения роста исоз...,0
1,они не понимают что это музыка для души,0
2,"по-моему, это женщина.",0
3,"а майонез, можно заменить сметаной",0
4,кого чьи-то принципы и взгляды не нравится - н...,1


In [20]:
len(df_concat)

262695

In [21]:
tr_ind = int(len(df_concat)*0.8)
val_ind = int(len(df_concat)*0.9)

df_train = df_concat[:tr_ind]
df_val = df_concat[tr_ind:val_ind]
df_test = df_concat[val_ind:]

In [22]:
from collections import Counter
def get_proprtion(dfc):
    return round(len(dfc[dfc['toxic']==0])/len(dfc),2), round(len(dfc[dfc['toxic']==1])/len(dfc),2)

get_proprtion(df_train)

(0.81, 0.19)

In [23]:
get_proprtion(df_val)

(0.81, 0.19)

In [24]:
get_proprtion(df_test)

(0.81, 0.19)

In [25]:
assert len(df_train) + len(df_val) +  len(df_test) == len(df_concat)

In [26]:
df_sanity = pd.concat([df_train,df_val,df_test])

In [27]:
assert len(df_sanity.drop_duplicates(subset = ['text'])) == len(df_concat)

In [28]:
class UnsafeDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

In [29]:
train_dataset = UnsafeDataset(tokenizer(df_train.text.tolist(),
                                        max_length=64,
                                        truncation=True,
                                        padding='longest'), df_train.toxic.tolist())

In [30]:
eval_dataset = UnsafeDataset(tokenizer(df_val.text.tolist(),
                                       max_length=64,
                                       truncation=True,
                                       padding='longest'), df_val.toxic.tolist())


In [31]:
test_dataset = UnsafeDataset(tokenizer(df_test.text.tolist(),
                                       max_length=64,
                                       truncation=True,
                                       padding='longest'), df_test.toxic.tolist())

## Обучение

In [32]:
import os
os.environ['CUDA_VISIBLE_DEVICES']= '7'

In [33]:
from transformers import Trainer, TrainingArguments
from transformers.file_utils import cached_property
from typing import Tuple

device = torch.device('cuda')

class TrAr(TrainingArguments):
    @cached_property
    def _setup_devices(self) -> Tuple["torch.device", int]:
        return device

In [170]:
# torch.cuda.set_device(device)
# model.to(device);

In [34]:
for param in model.bert.parameters():
    param.requires_grad=True

In [35]:
training_args = TrAr(
    output_dir='./unsafe/FINAL_VERS',   # output directory
    overwrite_output_dir=True,
    num_train_epochs=5,            # total # of training epochs
    per_device_train_batch_size=32,  # batch size per device during training
    per_device_eval_batch_size=32,   # batch size for evaluation
    warmup_steps=0,               # number of warmup steps for learning rate scheduler
    weight_decay=1e-8,              # strength of weight decay
    learning_rate=2e-5,
    save_total_limit=2,
    logging_dir='./logs',           # directory for storing logs
    logging_steps=2500,
    eval_steps=2500,
    save_steps=2500,
    evaluation_strategy='steps',metric_for_best_model = 'f1',greater_is_better = True, load_best_model_at_end = True, report_to='none'
)

In [36]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import roc_auc_score
def compute_metrics(pred):
    labels = pred.label_ids
    preds_prob = pred.predictions[:,-1]
    rauc = roc_auc_score(labels, preds_prob)
    
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        "roc_auc":rauc
    }

In [37]:
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=eval_dataset,           # evaluation dataset
    tokenizer=tokenizer,
    compute_metrics  = compute_metrics
)

In [38]:
from transformers.trainer_callback import EarlyStoppingCallback
trainer.add_callback(EarlyStoppingCallback(3)) 

In [176]:
training_args.device

device(type='cuda')

In [39]:
dl = trainer.get_train_dataloader()

In [49]:
model.device

device(type='cuda', index=0)

In [55]:
for b in dl:
    break
with torch.no_grad():
    b = {k:v.to(device) for k,v in b.items()}
    out = model(**b)

In [57]:
trainer.train()

***** Running training *****
  Num examples = 210156
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 32840


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
2500,0.1061,0.100015,0.970117,0.970193,0.970288,0.970117,0.991574
5000,0.084,0.080221,0.975637,0.97549,0.975451,0.975637,0.994551
7500,0.0683,0.102341,0.974495,0.974321,0.974284,0.974495,0.993905
10000,0.0487,0.113212,0.974609,0.974503,0.974449,0.974609,0.993022
12500,0.0502,0.08852,0.974951,0.975074,0.97527,0.974951,0.994008


***** Running Evaluation *****
  Num examples = 26269
  Batch size = 32
Saving model checkpoint to ./unsafe/FINAL_VERS/checkpoint-2500
Configuration saved in ./unsafe/FINAL_VERS/checkpoint-2500/config.json
Model weights saved in ./unsafe/FINAL_VERS/checkpoint-2500/pytorch_model.bin
tokenizer config file saved in ./unsafe/FINAL_VERS/checkpoint-2500/tokenizer_config.json
Special tokens file saved in ./unsafe/FINAL_VERS/checkpoint-2500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 26269
  Batch size = 32
Saving model checkpoint to ./unsafe/FINAL_VERS/checkpoint-5000
Configuration saved in ./unsafe/FINAL_VERS/checkpoint-5000/config.json
Model weights saved in ./unsafe/FINAL_VERS/checkpoint-5000/pytorch_model.bin
tokenizer config file saved in ./unsafe/FINAL_VERS/checkpoint-5000/tokenizer_config.json
Special tokens file saved in ./unsafe/FINAL_VERS/checkpoint-5000/special_tokens_map.json
Deleting older checkpoint [unsafe/FINAL_VERS/checkpoint-2500] due to args.save

TrainOutput(global_step=12500, training_loss=0.07146613830566406, metrics={'train_runtime': 2101.0369, 'train_samples_per_second': 500.124, 'train_steps_per_second': 15.63, 'total_flos': 1.31548949903616e+16, 'train_loss': 0.07146613830566406, 'epoch': 1.9})

## Evaluation

In [58]:
trainer.evaluate(test_dataset)

***** Running Evaluation *****
  Num examples = 26270
  Batch size = 32


{'eval_loss': 0.07844485342502594,
 'eval_accuracy': 0.974571754853445,
 'eval_f1': 0.9744987502580922,
 'eval_precision': 0.9744493570735744,
 'eval_recall': 0.974571754853445,
 'eval_roc_auc': 0.994756509968624,
 'eval_runtime': 33.6315,
 'eval_samples_per_second': 781.113,
 'eval_steps_per_second': 24.412,
 'epoch': 1.9}

In [59]:
pred = trainer.predict(test_dataset)

***** Running Prediction *****
  Num examples = 26270
  Batch size = 32


In [60]:
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score, accuracy_score

In [61]:
import numpy as np
from scipy.special import softmax
from sklearn.metrics import precision_recall_fscore_support, classification_report, roc_auc_score
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score, accuracy_score
# Function to calculate the accuracy of our predictions vs labels
def get_metrics(preds):
    preds, labels = preds.predictions, preds.label_ids
    #standard round approach    
    pred_flat = np.argmax(preds, axis=1).flatten()    
    pr, rec, f, _ = precision_recall_fscore_support(labels, pred_flat, average='weighted')  
    
    print("precision", pr)
    print("recall", rec)
    print("fscore_weighted", f)
    
    #adjust threshold approach
    preds_adj = np.array([[float(el1),float(el2)] for el1,el2 in preds])
    preds_adj = softmax(preds_adj, axis = 1)
    roc_auc = roc_auc_score(labels, preds_adj[:, 1])
    print("roc_auc", roc_auc)
    
    all_metrcis = []
    for threshold in [0.01,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1]:
        metrcis = []
        pred_labels = (preds_adj[:, 1] >= threshold).astype(int)
        metrcis.append(threshold)
        metrcis.append(round(f1_score(labels, pred_labels, average='weighted'),2))  
        metrcis.append(round(precision_score(labels, pred_labels),2))  
        metrcis.append(round(recall_score(labels, pred_labels),2))  
        metrcis.append(round(accuracy_score(labels, pred_labels),2))  
        all_metrcis.append(metrcis)

    df_metrics = pd.DataFrame(data = all_metrcis, columns = ['threshold','f1','prec','rec','acc'])
    df_metrics = df_metrics.sort_values(by='f1', ascending=False)
    
    print(classification_report(labels, pred_flat))
    
    print(df_metrics.head())
    
    return f

get_metrics(pred)

precision 0.9744493570735744
recall 0.974571754853445
fscore_weighted 0.9744987502580922
roc_auc 0.9947765325133202
              precision    recall  f1-score   support

           0       0.98      0.99      0.98     21384
           1       0.94      0.92      0.93      4886

    accuracy                           0.97     26270
   macro avg       0.96      0.96      0.96     26270
weighted avg       0.97      0.97      0.97     26270

   threshold    f1  prec   rec   acc
1        0.1  0.97  0.91  0.95  0.97
2        0.2  0.97  0.92  0.94  0.97
3        0.3  0.97  0.93  0.93  0.97
4        0.4  0.97  0.93  0.93  0.97
5        0.5  0.97  0.94  0.92  0.97


  _warn_prf(average, modifier, msg_start, len(result))


0.9744987502580922

In [65]:
import os
path = "../../../toxic_classifier_rus_ok_2ch"
os.listdir(path)

['.git', '.gitattributes']

In [66]:
trainer.save_model(path)

Saving model checkpoint to ../../../toxic_classifier_rus_ok_2ch
Configuration saved in ../../../toxic_classifier_rus_ok_2ch/config.json
Model weights saved in ../../../toxic_classifier_rus_ok_2ch/pytorch_model.bin
tokenizer config file saved in ../../../toxic_classifier_rus_ok_2ch/tokenizer_config.json
Special tokens file saved in ../../../toxic_classifier_rus_ok_2ch/special_tokens_map.json


In [67]:
tokenizer.save_pretrained(path)

tokenizer config file saved in ../../../toxic_classifier_rus_ok_2ch/tokenizer_config.json
Special tokens file saved in ../../../toxic_classifier_rus_ok_2ch/special_tokens_map.json


('../../../toxic_classifier_rus_ok_2ch/tokenizer_config.json',
 '../../../toxic_classifier_rus_ok_2ch/special_tokens_map.json',
 '../../../toxic_classifier_rus_ok_2ch/vocab.txt',
 '../../../toxic_classifier_rus_ok_2ch/added_tokens.json')

In [68]:
from transformers import TFBertForSequenceClassification

In [69]:
tf_model = TFBertForSequenceClassification.from_pretrained(path, from_pt=True)

loading configuration file ../../../toxic_classifier_rus_ok_2ch/config.json
Model config BertConfig {
  "_name_or_path": "DeepPavlov/rubert-base-cased-conversational",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.9.2",
  "type_vocab_size": 2,
  "use_ca

In [70]:
tf_model.save_pretrained(path)


Configuration saved in ../../../toxic_classifier_rus_ok_2ch/config.json
Model weights saved in ../../../toxic_classifier_rus_ok_2ch/tf_model.h5
