In [1]:
import pandas as pd
import numpy as np
import random
import torch
import transformers
import torch.nn as nn
from transformers import AutoModel, BertTokenizer, BertForSequenceClassification
from transformers import TrainingArguments, Trainer
from datasets import load_metric, Dataset
from sklearn.metrics import classification_report, f1_score

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
def seed_all(seed_value):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.benchmark = True
        torch.backends.cudnn.deterministic = False
seed_all(200)

In [6]:
model = BertForSequenceClassification.from_pretrained('cointegrated/rubert-tiny2', num_labels=2).to("cuda")
tokenizer = BertTokenizer.from_pretrained('cointegrated/rubert-tiny2')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
from data_module import CustomDataset
device = 'cuda' if torch.cuda.is_available() else 'cpu'
    
# Creating the dataset and dataloader for the neural network
df = pd.read_csv("out_data/ToxicRussianComments.csv")

MAX_LEN = max([len(comment.strip().split(' ')) for comment in df['comment']])
MAX_LEN    

267

In [8]:
train_size = 0.8
train_dataset=df.sample(frac=train_size,random_state=200)
test_dataset=df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (248290, 2)
TRAIN Dataset: (198632, 2)
TEST Dataset: (49658, 2)


In [9]:
from sklearn.metrics import f1_score
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds)
    return {'F1': f1}

In [10]:
training_args = TrainingArguments(
    output_dir = './bert_results', #–í—ã—Ö–æ–¥–Ω–æ–π –∫–∞—Ç–∞–ª–æ–≥
    num_train_epochs = 3, #–ö–æ–ª-–≤–æ —ç–ø–æ—Ö –¥–ª—è –æ–±—É—á–µ–Ω–∏—è
    per_device_train_batch_size = 8, #–†–∞–∑–º–µ—Ä –ø–∞–∫–µ—Ç–∞ –¥–ª—è –∫–∞–∂–¥–æ–≥–æ —É—Å—Ç—Ä–æ–π—Å—Ç–≤–∞ –≤–æ –≤—Ä–µ–º—è –æ–±—É—á–µ–Ω–∏—è
    per_device_eval_batch_size = 8, #–†–∞–∑–º–µ—Ä –ø–∞–∫–µ—Ç–∞ –¥–ª—è –∫–∞–∂–¥–æ–≥–æ —É—Å—Ç—Ä–æ–π—Å—Ç–≤–∞ –≤–æ –≤—Ä–µ–º—è –≤–∞–ª–∏–¥–∞—Ü–∏–∏
    weight_decay =0.01, #–ü–æ–Ω–∏–∂–µ–Ω–∏–µ –≤–µ—Å–æ–≤
    logging_dir = './bert_logs', #–ö–∞—Ç–∞–ª–æ–≥ –¥–ª—è —Ö—Ä–∞–Ω–µ–Ω–∏—è –∂—É—Ä–Ω–∞–ª–æ–≤
    load_best_model_at_end = True, #–ó–∞–≥—Ä—É–∂–∞—Ç—å –ª–∏ –ª—É—á—à—É—é –º–æ–¥–µ–ª—å –ø–æ—Å–ª–µ –æ–±—É—á–µ–Ω–∏—è
    learning_rate = 1e-5, #–°–∫–æ—Ä–æ—Å—Ç—å –æ–±—É—á–µ–Ω–∏—è
    evaluation_strategy ='epoch', #–í–∞–ª–∏–¥–∞—Ü–∏—è –ø–æ—Å–ª–µ –∫–∞–∂–¥–æ–π —ç–ø–æ—Ö–∏ (–º–æ–∂–Ω–æ —Å–¥–µ–ª–∞—Ç—å –ø–æ—Å–ª–µ –∫–æ–Ω–∫—Ä–µ—Ç–Ω–æ–≥–æ –∫–æ–ª-–≤–∞ —à–∞–≥–æ–≤)
    logging_strategy = 'epoch', #–õ–æ–≥–∏—Ä–æ–≤–∞–Ω–∏–µ –ø–æ—Å–ª–µ –∫–∞–∂–¥–æ–π —ç–ø–æ—Ö–∏
    save_strategy = 'epoch', #–°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ –ø–æ—Å–ª–µ –∫–∞–∂–¥–æ–π —ç–ø–æ—Ö–∏
    save_total_limit = 1,
    seed=200)

In [11]:
trainer = Trainer(model=model,
                  tokenizer = tokenizer,
                  args = training_args,
                  train_dataset = training_set,
                  eval_dataset = testing_set,
                  compute_metrics = compute_metrics)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)
Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [12]:
trainer.train()



Epoch,Training Loss,Validation Loss,F1
1,0.1387,0.099204,0.904767
2,0.0856,0.084746,0.921848
3,0.0756,0.085862,0.92299




TrainOutput(global_step=18624, training_loss=0.09997392929706377, metrics={'train_runtime': 2495.7919, 'train_samples_per_second': 238.76, 'train_steps_per_second': 7.462, 'total_flos': 2291538159229536.0, 'train_loss': 0.09997392929706377, 'epoch': 3.0})

In [13]:
torch.save(model.state_dict(), 'bert_ckpt.pt')

In [14]:
model_path = "fine-tune-bert"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('fine-tune-bert/tokenizer_config.json',
 'fine-tune-bert/special_tokens_map.json',
 'fine-tune-bert/vocab.txt',
 'fine-tune-bert/added_tokens.json')

In [15]:
def get_prediction():
    test_pred = trainer.predict(testing_set)
    labels = np.argmax(test_pred.predictions, axis = -1)
    return labels
pred = get_prediction()



In [21]:
print(classification_report(testing_set.targets, pred))
print(f1_score(testing_set.targets, pred))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98     40803
           1       0.92      0.93      0.92      8855

    accuracy                           0.97     49658
   macro avg       0.95      0.95      0.95     49658
weighted avg       0.97      0.97      0.97     49658

0.9218477465818938


In [27]:
# example
example = test_dataset.loc[0:10]
print(example)


                                              comment  label
0                                —Å–∫–æ—Ç–∏–Ω–∞! —á—Ç–æ —Å–∫–∞–∑–∞—Ç—å      1
1             –∞ –∫–æ–≥–¥–∞ –º—ã —Å—Ç–∞—Ç—É—Å –∞–≥—Ä–æ–≥–æ—Ä–æ–¥–∫–∞ –ø–æ–ª—É—á–∏–ª–∏?      0
2   –∫—Ä–∞—Å–æ—Ç–∞..!! –µ—Å–ª–∏ –µ—Å—Ç—å, —á—Ç–æ –ø–æ–∫–∞–∑–∞—Ç—å??!! –ø–æ—á–µ–º—É...      0
3   —ç—Ç–∏ —Å–∫–∞—Ç—ã ,–Ω–∞ –≥–µ—Ä–æ–µ–≤ –ø–∏–æ–Ω–µ—Ä–æ–≤, –∞–Ω–µ–∫–¥–æ—Ç–æ–≤ –Ω–∞–≤—ã–¥...      0
4                           –∫—Ç–æ –∑–∞—Å–µ–¥–∞—Ç—å –±—É–¥–µ—Ç ..????      0
5   –¥–∞. —è –Ω–∞ –Ω–µ–º –∑–∞ 21 –¥–µ–Ω—å –º–∞—Ä–∞—Ñ–æ–Ω–∞ —Å–∫–∏–Ω—É–ª–∞ 5–∫–≥ –∏...      0
6                                   —Å–µ–∫—Å –º–æ—Å–∞–∂ —á—Ç–æ–ª–∏?      0
7                                        –∫–∞–∫–∏–µ –∂–∞–ª–∫–∏–µ      0
8                —Å–∞–º—ã–µ —É–º–Ω—ã–µ –±—ã—Å—Ç—Ä–µ–Ω—å–∫–æ —Ä–∞–∑–≤–µ—Ä–Ω—É–ª–∏—Å—å!      0
9                      —ç—Ç–æ –∂–µ –Ω–∞–¥–æ —Ç–∞–∫ –Ω–∞—É—á–∏—Ç—å!!!üëçüëçüòÇüòÇ      0
10  –±—Ä–∞—Ç–∫–∞ –ø–æ–∑–¥—Ä–∞–≤–ª—è—é —Ç–µ–±—è —Å –¥–Ω—ë–º —Ä–æ–∂–¥–µ–Ω–∏

In [28]:
example = CustomDataset(example, tokenizer, MAX_LEN)

In [30]:
example_pred = trainer.predict(example)
labels = np.argmax(example_pred.predictions, axis = -1)
print(labels)



[1 0 0 0 0 0 0 0 0 0 0]
