In [None]:
!pip install datasets



In [None]:
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader
from datasets import Dataset
from tqdm.auto import tqdm, trange

from torch.optim import Adam
import tensorflow as tf
import gc
from google.colab import files

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.metrics import root_mean_squared_error, mean_absolute_error
from sklearn.metrics import cohen_kappa_score

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

In [None]:
# Report how many CUDA devices torch can see.
gpu_count = torch.cuda.device_count()
print('There are %d GPU(s) available.' % gpu_count)

There are 1 GPU(s) available.


In [None]:
# Fix the torch RNG before the classification head is randomly initialised,
# so a Restart & Run All reproduces the same starting weights.
torch.manual_seed(42)

# Checkpoint shared by the tokenizer and the 3-class sequence classifier.
base_model = 'ai-forever/ruBert-base'
tokenizer = AutoTokenizer.from_pretrained(base_model)
# Collator used by every DataLoader below to pad tokenized examples into batches.
collator = DataCollatorWithPadding(tokenizer=tokenizer)
model = AutoModelForSequenceClassification.from_pretrained(base_model, num_labels=3)
# Softmax over dim 0, i.e. over the class logits of a single example.
softmax0 = torch.nn.Softmax(dim=0)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ai-forever/ruBert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Load the labelled comments; rows from position 8000 onward form the held-out test split.
df = pd.read_excel('marked_data.xlsx')
# Take an explicit copy so later column assignments on df_test neither touch `df`
# nor raise pandas' SettingWithCopyWarning.
df_test = df[8000:].copy()

Изначально 10000 комментариев уже рандомизированы внутри датасета, поэтому можно поделить на train и test просто по строкам

В модели RuBERT для анализа настроений метки должны быть: 0,1,2. Поэтому метки {-1,0,1} были трансформированы в {0,1,2} с сохранением порядка классов.

In [None]:
# Shift labels {-1, 0, 1} -> {0, 1, 2}. The raw values are read from `df` and a new
# frame is built, so re-running this cell does not shift the labels a second time
# and no SettingWithCopyWarning is raised.
df_test = df_test.assign(labels=df.loc[df_test.index, 'labels'] + 1)
data_test = Dataset.from_pandas(df_test[['text', 'labels']])
# Tokenize in batches, truncating to 512 tokens; the raw text column is dropped afterwards.
data_test_tokenized = data_test.map(
    lambda x: tokenizer(x['text'], truncation=True, max_length=512),
    batched=True,
    remove_columns=['text'],
)
data_test_tokenized

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['labels'] = df_test['labels'] + 1


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2000
})

In [None]:
# Test batches of 32 in dataset order, so predictions line up with df_test rows.
test_dataloader = DataLoader(
    data_test_tokenized,
    batch_size=32,
    shuffle=False,
    collate_fn=collator,
)

## До fine-tuning

In [None]:
# Free unreferenced objects and cached CUDA memory, then move the model to the
# device selected earlier (works on CPU-only runtimes, unlike a hard-coded .cuda()).
gc.collect()
torch.cuda.empty_cache()
model.to(device);  # trailing ';' hides the long module repr

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(120138, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [None]:
# Baseline: evaluate the classifier on the test split before any fine-tuning.
model.eval()
eval_losses_0 = []   # mean loss of each batch
eval_prob_0 = []     # per-example probability of class 0 (0-dim CPU tensors)
eval_prob_1 = []     # per-example probability of class 1
eval_prob_2 = []     # per-example probability of class 2
eval_preds_0 = []    # argmax class per example
eval_targets_0 = []  # gold labels per example
for batch in tqdm(test_dataloader):
    with torch.no_grad():
        out = model(**batch.to(model.device))
    eval_losses_0.append(out.loss.item())
    # One softmax over the class axis for the whole batch instead of three per row;
    # moving the result to the CPU avoids keeping thousands of tiny tensors on the GPU.
    probs = torch.softmax(out.logits, dim=1).cpu()
    eval_prob_0.extend(probs[:, 0])
    eval_prob_1.extend(probs[:, 1])
    eval_prob_2.extend(probs[:, 2])
    eval_preds_0.extend(out.logits.argmax(1).tolist())
    eval_targets_0.extend(batch['labels'].tolist())

  0%|          | 0/63 [00:00<?, ?it/s]

In [None]:
# Attach baseline predictions and class probabilities to a copy of the test frame.
# torch.stack merges the list of 0-dim tensors into one float array, so the new
# columns get a numeric dtype instead of `object`.
df_test0 = df_test.assign(
    predict=eval_preds_0,
    Pneg=torch.stack(eval_prob_0).cpu().numpy(),
    Pneutral=torch.stack(eval_prob_1).cpu().numpy(),
    Ppos=torch.stack(eval_prob_2).cpu().numpy(),
)
# Continuous sentiment score in [-1, 1].
df_test0['Ppos-Pneg'] = df_test0['Ppos'] - df_test0['Pneg']
df_test0

Unnamed: 0,post_id,text,labels,predict,Pneg,Pneutral,Ppos,Ppos-Pneg
8000,3700646,Они уху ели,0,0,0.3967407,0.26086214,0.34239715,-0.054344
8001,14957701,ну на самом деле то что индия нам дружественны...,2,0,0.39507672,0.27516237,0.32976094,-0.065316
8002,12145864,Выгода только государствам . Людей что те что ...,0,0,0.37751713,0.3756032,0.24687964,-0.130637
8003,3324391,"Рублёв - в Барселоне, Медведев - в Монако, но ...",0,0,0.43512735,0.3513296,0.21354298,-0.221584
8004,1592388,Занесут ему чемодан ..он успокоится,2,0,0.41620028,0.26867762,0.31512213,-0.101078
...,...,...,...,...,...,...,...,...
9995,1139301,"Не, ну это вообще пи.дец...!😡😤😡😤",2,0,0.40936616,0.33119792,0.25943592,-0.14993
9996,5343030,"Чего он сказал-то? Как назвал , оскорбил? Ниче...",0,0,0.44106147,0.30534002,0.25359848,-0.187463
9997,5595571,"Они все , включая и эту НаЕбулину , живут в Пу...",0,0,0.45907998,0.30341345,0.23750655,-0.221573
9998,6590679,"ну так власятм спасибо скажите , это их рук дело",0,0,0.3904332,0.30655333,0.30301344,-0.08742


In [None]:
# Classification metrics use the shifted labels {0, 1, 2}.
print('accuracy', accuracy_score(eval_targets_0, eval_preds_0))
print('macro_precision', precision_score(eval_targets_0, eval_preds_0, average='macro'))
print('macro_recall', recall_score(eval_targets_0, eval_preds_0, average='macro'))
print('macro_f1', f1_score(eval_targets_0, eval_preds_0, average='macro'))
# Ppos-Pneg lies in [-1, 1], so it must be compared with the original {-1, 0, 1}
# scale (labels - 1), exactly as the post-fine-tuning metric cells do; comparing
# against {0, 1, 2} inflates both errors.
signed_targets_0 = df_test0['labels'] - 1
print('RMSE for Ppos-Pneg', root_mean_squared_error(signed_targets_0, df_test0['Ppos-Pneg']))
print('MAE for Ppos-Pneg', mean_absolute_error(signed_targets_0, df_test0['Ppos-Pneg']))

accuracy 0.4635
macro_precision 0.3157020827334677
macro_recall 0.3370181372006991
macro_f1 0.2781991011071215
RMSE for Ppos-Pneg 1.1376465171711139
MAE for Ppos-Pneg 0.8385811259821058


In [None]:
# Per-class precision / recall / F1 (average=None returns one value per label).
for title, metric in (
    ('precision by class', precision_score),
    ('recall by class', recall_score),
    ('f1 by class', f1_score),
):
    print(title, metric(eval_targets_0, eval_preds_0, average=None))

precision by class [0.5        0.20869565 0.2384106 ]
recall by class [0.8819939  0.03883495 0.09022556]
f1 by class [0.6382039  0.06548431 0.13090909]


In [None]:
# Confusion matrix with rows/columns in the fixed label order 0, 1, 2.
confusion_matrix(y_true=eval_targets_0, y_pred=eval_preds_0, labels=[0, 1, 2])

array([[867,  58,  58],
       [537,  24,  57],
       [330,  33,  36]])

In [None]:
# Quadratically weighted Cohen's kappa between gold labels and predictions.
cohen_kappa_score(eval_targets_0, eval_preds_0, weights='quadratic')

0.04504876782689482

# Fine-tuning RuBERT

# Fine-tuning на Train размера 1000 с регуляризацией

In [None]:
# Training split: first 1000 rows, labels shifted by +1 on a fresh copy of the slice.
df_train = df[:1000].assign(labels=lambda frame: frame['labels'] + 1)
data_train = Dataset.from_pandas(df_train[['text', 'labels']])
# Batched tokenization with truncation at 512 tokens; the text column is removed.
data_train_tokenized = data_train.map(
    lambda x: tokenizer(x['text'], truncation=True, max_length=512),
    batched=True,
    remove_columns=['text'],
)
train_dataloader = DataLoader(
    data_train_tokenized,
    batch_size=32,
    shuffle=False,
    collate_fn=collator,
)
data_train_tokenized

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1000
})

In [None]:
# Validation split: rows 1000-1999, labels shifted by +1 on a fresh copy of the slice.
df_val = df[1000:2000].assign(labels=lambda frame: frame['labels'] + 1)
data_val = Dataset.from_pandas(df_val[['text', 'labels']])
# Batched tokenization with truncation at 512 tokens; the text column is removed.
data_val_tokenized = data_val.map(
    lambda x: tokenizer(x['text'], truncation=True, max_length=512),
    batched=True,
    remove_columns=['text'],
)
val_dataloader = DataLoader(
    data_val_tokenized,
    batch_size=32,
    shuffle=False,
    collate_fn=collator,
)
data_val_tokenized

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1000
})

In [None]:
# Adam over all model parameters; weight_decay supplies the regularization for this run.
optimizer = Adam(
    model.parameters(),
    lr=3e-5,
    weight_decay=0.01,
)

In [None]:
# Free unreferenced objects and cached CUDA memory, then make sure the model sits on
# the device selected earlier (works on CPU-only runtimes, unlike a hard-coded .cuda()).
gc.collect()
torch.cuda.empty_cache()
model.to(device);  # trailing ';' hides the long module repr

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(120138, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [None]:
# Fine-tune for 3 epochs; after every epoch score the model on the validation split.
losses = []  # training loss of every batch, across all epochs
for epoch in trange(3):
    model.train()
    pbar = tqdm(train_dataloader)
    for batch in pbar:
        loss = model(**batch.to(model.device)).loss
        loss.backward()
        # Parameters are updated after every batch (no gradient accumulation).
        optimizer.step()
        optimizer.zero_grad()
        losses.append(loss.item())
        # Progress bar shows the mean loss over the last 100 batches.
        pbar.set_description(f'loss: {np.mean(losses[-100:]):2.2f}')

    model.eval()
    eval_losses, eval_preds, eval_targets = [], [], []
    with torch.no_grad():
        for batch in tqdm(val_dataloader):
            out = model(**batch.to(model.device))
            eval_losses.append(out.loss.item())
            eval_preds.extend(out.logits.argmax(1).tolist())
            eval_targets.extend(batch['labels'].tolist())
    val_accuracy = np.mean(np.array(eval_targets) == eval_preds)
    print('recent train loss', np.mean(losses[-100:]), 'eval loss', np.mean(eval_losses), 'accuracy', val_accuracy)

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

recent train loss 0.9837000034749508 eval loss 0.9462793935090303 accuracy 0.558


  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

recent train loss 0.9228106695227325 eval loss 0.8605404607951641 accuracy 0.602


  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

recent train loss 0.8518640639570852 eval loss 0.8006778191775084 accuracy 0.663


In [None]:
# Evaluate the model fine-tuned on 1000 examples on the held-out test split.
model.eval()
eval_losses_0 = []   # mean loss of each test batch
eval_prob_0 = []     # per-example probability of class 0 (0-dim CPU tensors)
eval_prob_1 = []     # per-example probability of class 1
eval_prob_2 = []     # per-example probability of class 2
eval_preds_0 = []    # argmax class per example
eval_targets_0 = []  # gold labels per example
for batch in tqdm(test_dataloader):
    with torch.no_grad():
        out = model(**batch.to(model.device))
    eval_losses_0.append(out.loss.item())
    # One softmax over the class axis for the whole batch instead of three per row;
    # moving the result to the CPU avoids keeping thousands of tiny tensors on the GPU.
    probs = torch.softmax(out.logits, dim=1).cpu()
    eval_prob_0.extend(probs[:, 0])
    eval_prob_1.extend(probs[:, 1])
    eval_prob_2.extend(probs[:, 2])
    eval_preds_0.extend(out.logits.argmax(1).tolist())
    eval_targets_0.extend(batch['labels'].tolist())
# Report the TEST results gathered above. The previous version printed eval_losses /
# eval_targets / eval_preds, which still held the last validation epoch's values.
print('recent train loss', np.mean(losses[-100:]), 'test loss', np.mean(eval_losses_0), 'test accuracy', np.mean(np.array(eval_targets_0) == eval_preds_0))

  0%|          | 0/63 [00:00<?, ?it/s]

recent train loss 0.8518640639570852 eval loss 0.8006778191775084 accuracy 0.663


In [None]:
# Attach the 1000-example model's predictions and class probabilities to a copy of
# the test frame. torch.stack merges the list of 0-dim tensors into one float array,
# so the new columns get a numeric dtype instead of `object`.
df_test1 = df_test.assign(
    predict=eval_preds_0,
    Pneg=torch.stack(eval_prob_0).cpu().numpy(),
    Pneutral=torch.stack(eval_prob_1).cpu().numpy(),
    Ppos=torch.stack(eval_prob_2).cpu().numpy(),
)
# Continuous sentiment score in [-1, 1].
df_test1['Ppos-Pneg'] = df_test1['Ppos'] - df_test1['Pneg']
df_test1

Unnamed: 0,post_id,text,labels,predict,Pneg,Pneutral,Ppos,Ppos-Pneg
8000,3700646,Они уху ели,0,1,0.24016039,0.6570206,0.10281899,-0.137341
8001,14957701,ну на самом деле то что индия нам дружественны...,2,2,0.08217983,0.19841982,0.7194003,0.63722
8002,12145864,Выгода только государствам . Людей что те что ...,0,0,0.90146977,0.05289949,0.045630753,-0.855839
8003,3324391,"Рублёв - в Барселоне, Медведев - в Монако, но ...",0,0,0.5834239,0.32324904,0.09332709,-0.490097
8004,1592388,Занесут ему чемодан ..он успокоится,2,0,0.41721496,0.2755832,0.3072019,-0.110013
...,...,...,...,...,...,...,...,...
9995,1139301,"Не, ну это вообще пи.дец...!😡😤😡😤",2,0,0.54432106,0.39956498,0.056113925,-0.488207
9996,5343030,"Чего он сказал-то? Как назвал , оскорбил? Ниче...",0,0,0.8713712,0.071334064,0.057294782,-0.814076
9997,5595571,"Они все , включая и эту НаЕбулину , живут в Пу...",0,0,0.8546813,0.10117449,0.04414422,-0.810537
9998,6590679,"ну так власятм спасибо скажите , это их рук дело",0,2,0.11899983,0.1675825,0.7134177,0.594418


In [None]:
# Classification metrics on the shifted labels {0, 1, 2}.
print('accuracy', accuracy_score(eval_targets_0, eval_preds_0))
for title, metric in (
    ('macro_precision', precision_score),
    ('macro_recall', recall_score),
    ('macro_f1', f1_score),
):
    print(title, metric(eval_targets_0, eval_preds_0, average='macro'))
# Regression-style errors of the Ppos-Pneg score against labels moved back to {-1, 0, 1}.
signed_targets = df_test1['labels'] - 1
sentiment_score = df_test1['Ppos-Pneg']
print('RMSE for Ppos-Pneg', root_mean_squared_error(signed_targets, sentiment_score))
print('MAE for Ppos-Pneg', mean_absolute_error(signed_targets, sentiment_score))

accuracy 0.6575
macro_precision 0.6472683356087973
macro_recall 0.6083093589636864
macro_f1 0.6211980873711268
RMSE for Ppos-Pneg 0.6510700450442162
MAE for Ppos-Pneg 0.5184975135419517


In [None]:
# Per-class precision / recall / F1 (average=None returns one value per label).
for title, metric in (
    ('precision by class', precision_score),
    ('recall by class', recall_score),
    ('f1 by class', f1_score),
):
    print(title, metric(eval_targets_0, eval_preds_0, average=None))

precision by class [0.69592199 0.57921635 0.66666667]
recall by class [0.79857579 0.55016181 0.47619048]
f1 by class [0.74372335 0.56431535 0.55555556]


In [None]:
# Confusion matrix with rows/columns in the fixed label order 0, 1, 2.
confusion_matrix(y_true=eval_targets_0, y_pred=eval_preds_0, labels=[0, 1, 2])

array([[785, 151,  47],
       [230, 340,  48],
       [113,  96, 190]])

In [None]:
# Quadratically weighted Cohen's kappa between gold labels and predictions.
cohen_kappa_score(eval_targets_0, eval_preds_0, weights='quadratic')

0.4940695395989383

In [None]:
# Export the test-set predictions of the 1000-example model to Excel.
df_test1.to_excel(
    excel_writer='Test_RuBERT-base_sentiment_classifier_3_labels_1000_3_epoch.xlsx',
)

In [None]:
# Persist the 1000-example model twice: as a pickled module (.pth) and, together
# with the tokenizer, as a Hugging Face directory.
torch.save(obj=model, f='RuBERT-base_sentiment_classifier_3_labels_1000_3_epoch.pth')
model.save_pretrained(save_directory='RuBERT-base_sentiment_classifier_3_labels_1000_3_epoch')
tokenizer.save_pretrained(save_directory='RuBERT-base_sentiment_classifier_3_labels_1000_3_epoch')

In [None]:
files.download('RuBERT-base_sentiment_classifier_3_labels_1000_3_epoch.tar.gz')
files.download('RuBERT-base_sentiment_classifier_3_labels_1000_3_epoch/model.safetensors')
files.download('RuBERT-base_sentiment_classifier_3_labels_1000_3_epoch.pth')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Unlabelled comments from ec_comments.csv: tokenize the text column for inference.
df_ec = pd.read_csv('ec_comments.csv')
data_ec = Dataset.from_pandas(df_ec[['text']])
data_ec_tokenized = data_ec.map(
    lambda x: tokenizer(x['text'], truncation=True, max_length=512),
    batched=True,
    remove_columns=['text'],
)
# Batches of 4 in file order, so predictions line up with df_ec rows.
ec_dataloader = DataLoader(
    data_ec_tokenized,
    batch_size=4,
    shuffle=False,
    collate_fn=collator,
)
data_ec_tokenized

Map:   0%|          | 0/200471 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 200471
})

In [None]:
# Score every comment in ec_dataloader with the current model (no labels, so no loss).
model.eval()
eval_prob_0 = []   # per-example probability of class 0 (0-dim CPU tensors)
eval_prob_1 = []   # per-example probability of class 1
eval_prob_2 = []   # per-example probability of class 2
eval_preds_0 = []  # argmax class per example
for batch in tqdm(ec_dataloader):
    with torch.no_grad():
        out = model(**batch.to(model.device))
    # One softmax over the class axis for the whole batch instead of three per row;
    # moving the result to the CPU avoids keeping one tiny GPU tensor per value.
    probs = torch.softmax(out.logits, dim=1).cpu()
    eval_prob_0.extend(probs[:, 0])
    eval_prob_1.extend(probs[:, 1])
    eval_prob_2.extend(probs[:, 2])
    eval_preds_0.extend(out.logits.argmax(1).tolist())

  0%|          | 0/50118 [00:00<?, ?it/s]

In [None]:
# Attach predictions and class probabilities to a copy of the ec_comments frame.
# torch.stack merges the list of 0-dim tensors into one float array, so the new
# columns get a numeric dtype instead of `object`.
df_ec1 = df_ec.assign(
    predict=eval_preds_0,
    Pneg=torch.stack(eval_prob_0).cpu().numpy(),
    Pneutral=torch.stack(eval_prob_1).cpu().numpy(),
    Ppos=torch.stack(eval_prob_2).cpu().numpy(),
)
# Continuous sentiment score in [-1, 1].
df_ec1['Ppos-Pneg'] = df_ec1['Ppos'] - df_ec1['Pneg']
df_ec1

Unnamed: 0,post_id,text,comment_time,from_id,group,time,date,year,predict,Pneg,Pneutral,Ppos,Ppos-Pneg
0,1138693,что уже Дальневосточный льготный гектар никому...,1584453896,11298783,aif_ru,2020-03-17 14:04:56,2020-03-17,2020,0,0.8824342,0.08136601,0.036199745,-0.846234
1,2236421,"Ввезти ведь,а не вывезти!",1692304548,691425151,aif_ru,2023-08-17 20:35:48,2023-08-17,2023,0,0.67246246,0.25292945,0.07460811,-0.597854
2,2236421,"Не хотят наши власти, чтоб людям жилось хорошо...",1692321591,548839550,aif_ru,2023-08-18 01:19:51,2023-08-18,2023,0,0.8756214,0.05914013,0.06523842,-0.810383
3,2236421,"Клоун что хотел кого-то подкупать тут, очередн...",1692322774,676449624,aif_ru,2023-08-18 01:39:34,2023-08-18,2023,0,0.47533432,0.41774747,0.10691824,-0.368416
4,2236421,Где-то заработал и решил ввести?----Надо разби...,1692326408,710233104,aif_ru,2023-08-18 02:40:08,2023-08-18,2023,0,0.7828083,0.18355308,0.03363862,-0.74917
...,...,...,...,...,...,...,...,...,...,...,...,...,...
200466,8994804,"Вроде мелочь, а приятно. Я смогу в месяц на эт...",1721797660,248106797,vesti,2024-07-24 05:07:40,2024-07-24,2024,0,0.79804367,0.14169222,0.060264125,-0.73778
200467,8994804,1 \nНам бодро вещают с экрана - \nв России у н...,1721803777,527537411,vesti,2024-07-24 06:49:37,2024-07-24,2024,0,0.7631065,0.11148833,0.12540516,-0.637701
200468,8994804,Поживём ‐ увидим...🤗,1721825504,34849496,vesti,2024-07-24 12:51:44,2024-07-24,2024,1,0.06647796,0.64142585,0.29209617,0.225618
200469,8994804,Где они их берут в таком колличесиве???!!! Мне...,1722172307,756763017,vesti,2024-07-28 13:11:47,2024-07-28,2024,0,0.8663659,0.0998627,0.033771355,-0.832595


In [None]:
# Export the ec_comments predictions of the 1000-example model to Excel.
df_ec1.to_excel(
    excel_writer='EC_RuBERT-base_sentiment_classifier_3_labels_1000_3_epoch.xlsx',
)

In [None]:
# Unlabelled comments from comments_with_cb.csv: tokenize the text column for inference.
df_cb = pd.read_csv('comments_with_cb.csv')
data_cb = Dataset.from_pandas(df_cb[['text']])
data_cb_tokenized = data_cb.map(
    lambda x: tokenizer(x['text'], truncation=True, max_length=512),
    batched=True,
    remove_columns=['text'],
)
# Batches of 4 in file order, so predictions line up with df_cb rows.
cb_dataloader = DataLoader(
    data_cb_tokenized,
    batch_size=4,
    shuffle=False,
    collate_fn=collator,
)
data_cb_tokenized

Map:   0%|          | 0/74533 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 74533
})

In [None]:
# Score every comment in cb_dataloader with the current model (no labels, so no loss).
model.eval()
eval_prob_0 = []   # per-example probability of class 0 (0-dim CPU tensors)
eval_prob_1 = []   # per-example probability of class 1
eval_prob_2 = []   # per-example probability of class 2
eval_preds_0 = []  # argmax class per example
for batch in tqdm(cb_dataloader):
    with torch.no_grad():
        out = model(**batch.to(model.device))
    # One softmax over the class axis for the whole batch instead of three per row;
    # moving the result to the CPU avoids keeping one tiny GPU tensor per value.
    probs = torch.softmax(out.logits, dim=1).cpu()
    eval_prob_0.extend(probs[:, 0])
    eval_prob_1.extend(probs[:, 1])
    eval_prob_2.extend(probs[:, 2])
    eval_preds_0.extend(out.logits.argmax(1).tolist())

  0%|          | 0/18634 [00:00<?, ?it/s]

In [None]:
# Attach predictions and class probabilities to a copy of the comments_with_cb frame.
# torch.stack merges the list of 0-dim tensors into one float array, so the new
# columns get a numeric dtype instead of `object`.
df_cb1 = df_cb.assign(
    predict=eval_preds_0,
    Pneg=torch.stack(eval_prob_0).cpu().numpy(),
    Pneutral=torch.stack(eval_prob_1).cpu().numpy(),
    Ppos=torch.stack(eval_prob_2).cpu().numpy(),
)
# Continuous sentiment score in [-1, 1].
df_cb1['Ppos-Pneg'] = df_cb1['Ppos'] - df_cb1['Pneg']
df_cb1

Unnamed: 0,post_id,text,comment_time,from_id,group,time,date,year,predict,Pneg,Pneutral,Ppos,Ppos-Pneg
0,1148930,ЦБ не государственный ! По этому и живём в нищ...,1587720062,564715902,aif_ru,2020-04-24 09:21:02,2020-04-24,2020,0,0.84809905,0.096775554,0.055125337,-0.792974
1,1148930,А если заканчивается действие карты 30 июня? Т...,1587724523,132854477,aif_ru,2020-04-24 10:35:23,2020-04-24,2020,1,0.11886659,0.8296932,0.051440172,-0.067426
2,1409029,Вот я дурень старый Уазик продал!,1649912634,269778992,aif_ru,2022-04-14 05:03:54,2022-04-14,2022,0,0.6016232,0.25584584,0.1425309,-0.459092
3,1409029,Почему не делать аналоги самим?,1649912734,198909762,aif_ru,2022-04-14 05:05:34,2022-04-14,2022,0,0.50835377,0.42328802,0.06835821,-0.439996
4,1409029,А с какой это стати прямой импорт из Китая пер...,1649913632,50852415,aif_ru,2022-04-14 05:20:32,2022-04-14,2022,0,0.7316302,0.15841255,0.109957285,-0.621673
...,...,...,...,...,...,...,...,...,...,...,...,...,...
74528,9137126,До СВО Набиуллиной говорили верни активы из за...,1729880283,49800246,vesti,2024-10-25 18:18:03,2024-10-25,2024,0,0.8415063,0.089732446,0.06876124,-0.772745
74529,9137126,В 2014 году Чернобыль еще не входил в состав Р...,1729880968,814800282,vesti,2024-10-25 18:29:28,2024-10-25,2024,0,0.47443506,0.42897835,0.0965865,-0.377849
74530,9137126,"Все нормально, тонем.",1729886884,741226289,vesti,2024-10-25 20:08:04,2024-10-25,2024,2,0.29957148,0.32880625,0.3716223,0.072051
74531,9137126,"Дешевых денег нет, раньше можно было за кордон...",1729914648,757640064,vesti,2024-10-26 03:50:48,2024-10-26,2024,0,0.8173918,0.11740697,0.06520117,-0.752191


In [None]:
# Export the comments_with_cb predictions of the 1000-example model to Excel.
df_cb1.to_excel(
    excel_writer='CB_RuBERT-base_sentiment_classifier_3_labels_1000_3_epoch.xlsx',
)

# Fine-tuning на Train размера 7000 с регуляризацией

In [None]:
# Training split: first 7000 rows, labels shifted by +1 on a fresh copy of the slice.
df_train = df[:7000].assign(labels=lambda frame: frame['labels'] + 1)
data_train = Dataset.from_pandas(df_train[['text', 'labels']])
# Batched tokenization with truncation at 512 tokens; the text column is removed.
data_train_tokenized = data_train.map(
    lambda x: tokenizer(x['text'], truncation=True, max_length=512),
    batched=True,
    remove_columns=['text'],
)
train_dataloader = DataLoader(
    data_train_tokenized,
    batch_size=32,
    shuffle=False,
    collate_fn=collator,
)
data_train_tokenized

Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 7000
})

In [None]:
# Validation split: rows 7000-7999, labels shifted by +1 on a fresh copy of the slice.
df_val = df[7000:8000].assign(labels=lambda frame: frame['labels'] + 1)
data_val = Dataset.from_pandas(df_val[['text', 'labels']])
# Batched tokenization with truncation at 512 tokens; the text column is removed.
data_val_tokenized = data_val.map(
    lambda x: tokenizer(x['text'], truncation=True, max_length=512),
    batched=True,
    remove_columns=['text'],
)
val_dataloader = DataLoader(
    data_val_tokenized,
    batch_size=32,
    shuffle=False,
    collate_fn=collator,
)
data_val_tokenized

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1000
})

In [None]:
# Fresh Adam optimizer for the 7000-example run (lower learning rate than the first run).
# NOTE(review): `model` is not re-created in the cells shown before this run, so training
# appears to continue from the weights left by the 1000-example run — confirm this is intended.
optimizer = Adam(
    model.parameters(),
    lr=1e-5,
    weight_decay=0.01,
)

In [None]:
# Free unreferenced objects and cached CUDA memory, then make sure the model sits on
# the device selected earlier (works on CPU-only runtimes, unlike a hard-coded .cuda()).
gc.collect()
torch.cuda.empty_cache()
model.to(device);  # trailing ';' hides the long module repr

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(120138, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [None]:
# Fine-tune for 2 epochs; after every epoch score the model on the validation split.
losses = []  # training loss of every batch, across all epochs
for epoch in trange(2):
    model.train()
    pbar = tqdm(train_dataloader)
    for batch in pbar:
        loss = model(**batch.to(model.device)).loss
        loss.backward()
        # Parameters are updated after every batch (no gradient accumulation).
        optimizer.step()
        optimizer.zero_grad()
        losses.append(loss.item())
        # Progress bar shows the mean loss over the last 100 batches.
        pbar.set_description(f'loss: {np.mean(losses[-100:]):2.2f}')

    model.eval()
    eval_losses, eval_preds, eval_targets = [], [], []
    with torch.no_grad():
        for batch in tqdm(val_dataloader):
            out = model(**batch.to(model.device))
            eval_losses.append(out.loss.item())
            eval_preds.extend(out.logits.argmax(1).tolist())
            eval_targets.extend(batch['labels'].tolist())
    val_accuracy = np.mean(np.array(eval_targets) == eval_preds)
    print('recent train loss', np.mean(losses[-100:]), 'eval loss', np.mean(eval_losses), 'accuracy', val_accuracy)

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/219 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

recent train loss 0.848661795258522 eval loss 0.8242383468896151 accuracy 0.643


  0%|          | 0/219 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

recent train loss 0.7291153383255005 eval loss 0.8033180497586727 accuracy 0.653


In [None]:
# Evaluate the model after the 7000-example run on the held-out test split.
model.eval()
eval_losses_0 = []   # mean loss of each test batch
eval_prob_0 = []     # per-example probability of class 0 (0-dim CPU tensors)
eval_prob_1 = []     # per-example probability of class 1
eval_prob_2 = []     # per-example probability of class 2
eval_preds_0 = []    # argmax class per example
eval_targets_0 = []  # gold labels per example
for batch in tqdm(test_dataloader):
    with torch.no_grad():
        out = model(**batch.to(model.device))
    eval_losses_0.append(out.loss.item())
    # One softmax over the class axis for the whole batch instead of three per row;
    # moving the result to the CPU avoids keeping thousands of tiny tensors on the GPU.
    probs = torch.softmax(out.logits, dim=1).cpu()
    eval_prob_0.extend(probs[:, 0])
    eval_prob_1.extend(probs[:, 1])
    eval_prob_2.extend(probs[:, 2])
    eval_preds_0.extend(out.logits.argmax(1).tolist())
    eval_targets_0.extend(batch['labels'].tolist())
# Report the TEST results gathered above. The previous version printed eval_losses /
# eval_targets / eval_preds, which still held the last validation epoch's values.
print('recent train loss', np.mean(losses[-100:]), 'test loss', np.mean(eval_losses_0), 'test accuracy', np.mean(np.array(eval_targets_0) == eval_preds_0))

  0%|          | 0/63 [00:00<?, ?it/s]

recent train loss 0.7291153383255005 eval loss 0.8033180497586727 accuracy 0.653


In [None]:
# Attach the 7000-example model's predictions and class probabilities to a copy of
# the test frame. torch.stack merges the list of 0-dim tensors into one float array,
# so the new columns get a numeric dtype instead of `object`.
df_test2 = df_test.assign(
    predict=eval_preds_0,
    Pneg=torch.stack(eval_prob_0).cpu().numpy(),
    Pneutral=torch.stack(eval_prob_1).cpu().numpy(),
    Ppos=torch.stack(eval_prob_2).cpu().numpy(),
)
# Continuous sentiment score in [-1, 1].
df_test2['Ppos-Pneg'] = df_test2['Ppos'] - df_test2['Pneg']
df_test2

Unnamed: 0,post_id,text,labels,predict,Pneg,Pneutral,Ppos,Ppos-Pneg
8000,3700646,Они уху ели,0,1,0.12594551,0.74936515,0.124689326,-0.001256
8001,14957701,ну на самом деле то что индия нам дружественны...,2,2,0.036992107,0.08087258,0.8821353,0.845143
8002,12145864,Выгода только государствам . Людей что те что ...,0,0,0.8360057,0.085141115,0.07885316,-0.757153
8003,3324391,"Рублёв - в Барселоне, Медведев - в Монако, но ...",0,0,0.5983551,0.27937454,0.12227034,-0.476085
8004,1592388,Занесут ему чемодан ..он успокоится,2,2,0.11214641,0.3561073,0.5317463,0.4196
...,...,...,...,...,...,...,...,...
9995,1139301,"Не, ну это вообще пи.дец...!😡😤😡😤",2,0,0.67965007,0.2686157,0.051734272,-0.627916
9996,5343030,"Чего он сказал-то? Как назвал , оскорбил? Ниче...",0,0,0.7761875,0.16566163,0.058150858,-0.718037
9997,5595571,"Они все , включая и эту НаЕбулину , живут в Пу...",0,0,0.88046485,0.06491537,0.05461972,-0.825845
9998,6590679,"ну так власятм спасибо скажите , это их рук дело",0,2,0.14459601,0.21289878,0.6425052,0.497909


In [None]:
# Classification metrics on the shifted labels {0, 1, 2}.
print('accuracy', accuracy_score(eval_targets_0, eval_preds_0))
for title, metric in (
    ('macro_precision', precision_score),
    ('macro_recall', recall_score),
    ('macro_f1', f1_score),
):
    print(title, metric(eval_targets_0, eval_preds_0, average='macro'))
# Regression-style errors of the Ppos-Pneg score against labels moved back to {-1, 0, 1}.
signed_targets = df_test2['labels'] - 1
sentiment_score = df_test2['Ppos-Pneg']
print('RMSE for Ppos-Pneg', root_mean_squared_error(signed_targets, sentiment_score))
print('MAE for Ppos-Pneg', mean_absolute_error(signed_targets, sentiment_score))

accuracy 0.6755
macro_precision 0.6555902527557583
macro_recall 0.6393771947110746
macro_f1 0.6457651767129198
RMSE for Ppos-Pneg 0.6334986202570728
MAE for Ppos-Pneg 0.49181675277836623


In [None]:
# Per-class precision / recall / F1 (average=None returns one value per label).
for title, metric in (
    ('precision by class', precision_score),
    ('recall by class', recall_score),
    ('f1 by class', f1_score),
):
    print(title, metric(eval_targets_0, eval_preds_0, average=None))

precision by class [0.72099448 0.61594203 0.62983425]
recall by class [0.7965412  0.55016181 0.57142857]
f1 by class [0.75688739 0.58119658 0.59921156]


In [None]:
# Confusion matrix with rows/columns in the fixed label order 0, 1, 2.
confusion_matrix(y_true=eval_targets_0, y_pred=eval_preds_0, labels=[0, 1, 2])

array([[783, 134,  66],
       [210, 340,  68],
       [ 93,  78, 228]])

In [None]:
# Quadratically weighted Cohen's kappa between gold labels and predictions.
cohen_kappa_score(eval_targets_0, eval_preds_0, weights='quadratic')

0.5322335143470545

In [None]:
# Export the test-set predictions of the 7000-example model to Excel.
df_test2.to_excel(
    excel_writer='Test_RuBERT-base_sentiment_classifier_3_labels_7000_2_epoch.xlsx',
)

In [None]:
# Pickle the whole fine-tuned module to a .pth file.
torch.save(obj=model, f='RuBERT-base_sentiment_classifier_3_labels_7000_2_epoch.pth')

In [None]:
# Write model and tokenizer into one Hugging Face directory; the tokenizer call is
# last so that its returned tuple of file paths is displayed.
model.save_pretrained(save_directory='RuBERT-base_sentiment_classifier_3_labels_7000_2_epoch')
tokenizer.save_pretrained(save_directory='RuBERT-base_sentiment_classifier_3_labels_7000_2_epoch')

('RuBERT-base_sentiment_classifier_3_labels_7000_2_epoch/tokenizer_config.json',
 'RuBERT-base_sentiment_classifier_3_labels_7000_2_epoch/special_tokens_map.json',
 'RuBERT-base_sentiment_classifier_3_labels_7000_2_epoch/vocab.txt',
 'RuBERT-base_sentiment_classifier_3_labels_7000_2_epoch/added_tokens.json',
 'RuBERT-base_sentiment_classifier_3_labels_7000_2_epoch/tokenizer.json')

In [None]:
# Pack the saved model directory into a gzip-compressed tarball (verbose file listing).
!tar -czvf RuBERT-base_sentiment_classifier_3_labels_7000_2_epoch.tar.gz RuBERT-base_sentiment_classifier_3_labels_7000_2_epoch/

RuBERT-base_sentiment_classifier_3_labels_7000_2_epoch/
RuBERT-base_sentiment_classifier_3_labels_7000_2_epoch/tokenizer_config.json
RuBERT-base_sentiment_classifier_3_labels_7000_2_epoch/tokenizer.json
RuBERT-base_sentiment_classifier_3_labels_7000_2_epoch/config.json
RuBERT-base_sentiment_classifier_3_labels_7000_2_epoch/model.safetensors
RuBERT-base_sentiment_classifier_3_labels_7000_2_epoch/special_tokens_map.json
RuBERT-base_sentiment_classifier_3_labels_7000_2_epoch/vocab.txt


In [None]:
# Download the tarball of the saved model directory through the Colab files helper.
files.download('RuBERT-base_sentiment_classifier_3_labels_7000_2_epoch.tar.gz')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Download the model.safetensors file from the saved model directory.
files.download('RuBERT-base_sentiment_classifier_3_labels_7000_2_epoch/model.safetensors')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Download the pickled .pth copy of the model.
files.download('RuBERT-base_sentiment_classifier_3_labels_7000_2_epoch.pth')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Reload and re-tokenize ec_comments.csv for inference with the 7000-example model.
df_ec = pd.read_csv('ec_comments.csv')
data_ec = Dataset.from_pandas(df_ec[['text']])
data_ec_tokenized = data_ec.map(
    lambda x: tokenizer(x['text'], truncation=True, max_length=512),
    batched=True,
    remove_columns=['text'],
)
# Batches of 4 in file order, so predictions line up with df_ec rows.
ec_dataloader = DataLoader(
    data_ec_tokenized,
    batch_size=4,
    shuffle=False,
    collate_fn=collator,
)
data_ec_tokenized

Map:   0%|          | 0/200471 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 200471
})

In [None]:
# Score every comment in ec_dataloader with the current model (no labels, so no loss).
model.eval()
eval_prob_0 = []   # per-example probability of class 0 (0-dim CPU tensors)
eval_prob_1 = []   # per-example probability of class 1
eval_prob_2 = []   # per-example probability of class 2
eval_preds_0 = []  # argmax class per example
for batch in tqdm(ec_dataloader):
    with torch.no_grad():
        out = model(**batch.to(model.device))
    # One softmax over the class axis for the whole batch instead of three per row;
    # moving the result to the CPU avoids keeping one tiny GPU tensor per value.
    probs = torch.softmax(out.logits, dim=1).cpu()
    eval_prob_0.extend(probs[:, 0])
    eval_prob_1.extend(probs[:, 1])
    eval_prob_2.extend(probs[:, 2])
    eval_preds_0.extend(out.logits.argmax(1).tolist())

  0%|          | 0/50118 [00:00<?, ?it/s]

In [None]:
# Attach the model outputs to a copy of the EC comments.
df_ec2 = df_ec.copy()

# The eval_* lists are reassigned by every inference cell (EC and CB alike), so
# make sure they were produced from this dataset before using them.
assert len(eval_preds_0) == len(df_ec2), (
    'eval_* lists do not match df_ec - re-run the EC inference cell first'
)

df_ec2['predict'] = eval_preds_0
# Store probabilities as real float32 columns. Converting each 0-d tensor with
# .cpu().numpy() produced 0-d ndarrays and therefore object-dtype columns, which
# are not numeric for pandas. float() accepts 0-d tensors on any device as well
# as plain Python floats.
df_ec2['Pneg'] = np.asarray([float(p) for p in eval_prob_0], dtype=np.float32)
df_ec2['Pneutral'] = np.asarray([float(p) for p in eval_prob_1], dtype=np.float32)
df_ec2['Ppos'] = np.asarray([float(p) for p in eval_prob_2], dtype=np.float32)
# Signed sentiment score in [-1, 1]: positive minus negative probability.
df_ec2['Ppos-Pneg'] = df_ec2['Ppos'] - df_ec2['Pneg']
df_ec2

Unnamed: 0,post_id,text,comment_time,from_id,group,time,date,year,predict,Pneg,Pneutral,Ppos,Ppos-Pneg
0,1138693,что уже Дальневосточный льготный гектар никому...,1584453896,11298783,aif_ru,2020-03-17 14:04:56,2020-03-17,2020,0,0.84649724,0.124360204,0.029142624,-0.817355
1,2236421,"Ввезти ведь,а не вывезти!",1692304548,691425151,aif_ru,2023-08-17 20:35:48,2023-08-17,2023,0,0.52809864,0.39526272,0.07663861,-0.45146
2,2236421,"Не хотят наши власти, чтоб людям жилось хорошо...",1692321591,548839550,aif_ru,2023-08-18 01:19:51,2023-08-18,2023,0,0.86906767,0.061434742,0.069497585,-0.79957
3,2236421,"Клоун что хотел кого-то подкупать тут, очередн...",1692322774,676449624,aif_ru,2023-08-18 01:39:34,2023-08-18,2023,0,0.7296417,0.23232645,0.038031865,-0.69161
4,2236421,Где-то заработал и решил ввести?----Надо разби...,1692326408,710233104,aif_ru,2023-08-18 02:40:08,2023-08-18,2023,0,0.7111372,0.26129055,0.027572278,-0.683565
...,...,...,...,...,...,...,...,...,...,...,...,...,...
200466,8994804,"Вроде мелочь, а приятно. Я смогу в месяц на эт...",1721797660,248106797,vesti,2024-07-24 05:07:40,2024-07-24,2024,2,0.2673429,0.23768777,0.49496937,0.227626
200467,8994804,1 \nНам бодро вещают с экрана - \nв России у н...,1721803777,527537411,vesti,2024-07-24 06:49:37,2024-07-24,2024,0,0.8401136,0.08855728,0.07132916,-0.768784
200468,8994804,Поживём ‐ увидим...🤗,1721825504,34849496,vesti,2024-07-24 12:51:44,2024-07-24,2024,2,0.053516116,0.41252473,0.5339592,0.480443
200469,8994804,Где они их берут в таком колличесиве???!!! Мне...,1722172307,756763017,vesti,2024-07-28 13:11:47,2024-07-28,2024,0,0.8434717,0.13415873,0.022369577,-0.821102


In [None]:
# Write the scored EC comments to an Excel workbook in the working directory.
# No index=False is passed, so the row index is written as the first column.
df_ec2.to_excel('EC_RuBERT-base_sentiment_classifier_3_labels_7000_2_epoch.xlsx')

In [None]:
# Load the CB comments and prepare them for batched inference.
df_cb = pd.read_csv('comments_with_cb.csv')

# Only the raw text is handed to the tokenizer; every other column stays in df_cb.
data_cb = Dataset.from_pandas(df_cb[['text']])
data_cb_tokenized = data_cb.map(
    lambda rows: tokenizer(rows['text'], truncation=True, max_length=512),
    batched=True,
    remove_columns=['text'],
)

# shuffle=False keeps the batches in df_cb row order, so predictions can later be
# attached to the frame by position; the collator pads each batch.
cb_dataloader = DataLoader(
    data_cb_tokenized,
    batch_size=4,
    shuffle=False,
    collate_fn=collator,
)
data_cb_tokenized

Map:   0%|          | 0/74533 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 74533
})

In [None]:
# Score every CB comment with the classifier.
#
# Results (read by the cell that builds df_cb2), all in dataloader order:
#   eval_prob_0 / eval_prob_1 / eval_prob_2 - softmax probability of class
#       index 0 / 1 / 2 for each comment, stored as 0-d CPU tensors
#   eval_preds_0 - argmax class index for each comment, as plain ints
# These names are shared with the EC run, so this cell overwrites its results.
model.eval()
eval_prob_0 = []
eval_prob_1 = []
eval_prob_2 = []
eval_preds_0 = []
for batch in tqdm(cb_dataloader):
    with torch.no_grad():
        out = model(**batch.to(model.device))
    # One softmax over the class axis for the whole batch, copied to the CPU once.
    # The previous version ran softmax three times per row and kept every value
    # as a separate CUDA tensor, which pinned GPU memory and forced one
    # device-to-host copy per element later on.
    probs = torch.softmax(out.logits, dim=1).cpu()
    # Iterating a 1-D tensor yields 0-d tensors, so downstream code that calls
    # .cpu().numpy() on each element keeps working.
    eval_prob_0.extend(probs[:, 0])
    eval_prob_1.extend(probs[:, 1])
    eval_prob_2.extend(probs[:, 2])
    eval_preds_0.extend(out.logits.argmax(1).tolist())

  0%|          | 0/18634 [00:00<?, ?it/s]

In [None]:
# Attach the model outputs to a copy of the CB comments.
df_cb2 = df_cb.copy()

# The eval_* lists are reassigned by every inference cell (EC and CB alike), so
# make sure they were produced from this dataset before using them.
assert len(eval_preds_0) == len(df_cb2), (
    'eval_* lists do not match df_cb - re-run the CB inference cell first'
)

df_cb2['predict'] = eval_preds_0
# Store probabilities as real float32 columns. Converting each 0-d tensor with
# .cpu().numpy() produced 0-d ndarrays and therefore object-dtype columns, which
# are not numeric for pandas. float() accepts 0-d tensors on any device as well
# as plain Python floats.
df_cb2['Pneg'] = np.asarray([float(p) for p in eval_prob_0], dtype=np.float32)
df_cb2['Pneutral'] = np.asarray([float(p) for p in eval_prob_1], dtype=np.float32)
df_cb2['Ppos'] = np.asarray([float(p) for p in eval_prob_2], dtype=np.float32)
# Signed sentiment score in [-1, 1]: positive minus negative probability.
df_cb2['Ppos-Pneg'] = df_cb2['Ppos'] - df_cb2['Pneg']
df_cb2

Unnamed: 0,post_id,text,comment_time,from_id,group,time,date,year,predict,Pneg,Pneutral,Ppos,Ppos-Pneg
0,1148930,ЦБ не государственный ! По этому и живём в нищ...,1587720062,564715902,aif_ru,2020-04-24 09:21:02,2020-04-24,2020,0,0.85569924,0.08490294,0.059397824,-0.796301
1,1148930,А если заканчивается действие карты 30 июня? Т...,1587724523,132854477,aif_ru,2020-04-24 10:35:23,2020-04-24,2020,1,0.15057716,0.8033686,0.04605421,-0.104523
2,1409029,Вот я дурень старый Уазик продал!,1649912634,269778992,aif_ru,2022-04-14 05:03:54,2022-04-14,2022,0,0.63037694,0.29874176,0.0708813,-0.559496
3,1409029,Почему не делать аналоги самим?,1649912734,198909762,aif_ru,2022-04-14 05:05:34,2022-04-14,2022,1,0.31233904,0.63013697,0.057524,-0.254815
4,1409029,А с какой это стати прямой импорт из Китая пер...,1649913632,50852415,aif_ru,2022-04-14 05:20:32,2022-04-14,2022,0,0.50169593,0.2975515,0.20075254,-0.300943
...,...,...,...,...,...,...,...,...,...,...,...,...,...
74528,9137126,До СВО Набиуллиной говорили верни активы из за...,1729880283,49800246,vesti,2024-10-25 18:18:03,2024-10-25,2024,0,0.775259,0.1586137,0.06612723,-0.709132
74529,9137126,В 2014 году Чернобыль еще не входил в состав Р...,1729880968,814800282,vesti,2024-10-25 18:29:28,2024-10-25,2024,1,0.3658613,0.47324476,0.16089393,-0.204967
74530,9137126,"Все нормально, тонем.",1729886884,741226289,vesti,2024-10-25 20:08:04,2024-10-25,2024,2,0.07756513,0.19884132,0.7235936,0.646028
74531,9137126,"Дешевых денег нет, раньше можно было за кордон...",1729914648,757640064,vesti,2024-10-26 03:50:48,2024-10-26,2024,0,0.72705215,0.1615159,0.11143188,-0.61562


In [None]:
# Write the scored CB comments to an Excel workbook in the working directory.
# No index=False is passed, so the row index is written as the first column.
df_cb2.to_excel('CB_RuBERT-base_sentiment_classifier_3_labels_7000_2_epoch.xlsx')

In [None]:
# Send the CB results workbook from the Colab VM to the local machine.
files.download('CB_RuBERT-base_sentiment_classifier_3_labels_7000_2_epoch.xlsx')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Send the EC results workbook from the Colab VM to the local machine.
files.download('EC_RuBERT-base_sentiment_classifier_3_labels_7000_2_epoch.xlsx')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>