In [1]:
!pip install wandb
!pip install 'transformers[torch]'
!pip install datasets 



In [2]:
import wandb
import torch
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random

import numpy as np

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, precision_recall_curve, confusion_matrix

from transformers import AutoTokenizer, AutoModelForSequenceClassification, default_data_collator, XLMRobertaForSequenceClassification, XLMRobertaConfig

from torch.utils.data import DataLoader, TensorDataset

from tqdm import tqdm

from IPython.display import clear_output
from datasets import load_dataset, Dataset

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

2024-05-06 14:02:36.347829: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-06 14:02:36.347957: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-06 14:02:36.490591: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [27]:
def calc_metrics(y_true, y_pred):
    y_true = y_true.cpu()
    y_pred = y_pred.cpu()
    return accuracy_score(y_true, y_pred)

@torch.no_grad()
def test(model, loader, device, tqdm_desc):
    loss_log = []
    acc_log = []
    
    model.eval()
    loss_func = nn.CrossEntropyLoss()

    for input_ids, attention_mask, labels in loader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        out = model(input_ids, attention_mask=attention_mask)
        loss = loss_func(out.logits, labels)

        loss_log.append(loss.item())

        pred = torch.argmax(out.logits, dim=1)
        acc_log.append(calc_metrics(labels, pred))

    return loss_log, acc_log


def train(model, optimizer, n_epochs, train_loader, val_loader, batch_size, scheduler=None, log_batch_count=500, preval=True):    
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    
    train_loss = []
    val_loss_hyst = []
    train_acc = []
    
    run = wandb.init(project='xlm-roberta-multiclass', reinit=True)
    wandb.watch(model, nn.CrossEntropyLoss(), log="all", log_freq=100)
    model.train()

    batch = 0
    loss_func = nn.CrossEntropyLoss()
    
    
    if preval:
        print(f"Init loss:")
        val_loss, val_acc = test(model, val_loader, device, tqdm_desc='Validating')
        val_loss_hyst.append(np.mean(val_loss))
        model.train()

        wandb.log({"val": {"acc": np.mean(val_acc), "batch_loss": np.mean(val_loss), "mean_loss": np.mean(val_loss_hyst)}})
        
        print(f" val loss: {np.mean(val_loss)}, val acc: {np.mean(val_acc)}\n")
    
    for epoch in range(n_epochs):
        for input_ids, attention_mask, labels in tqdm(train_loader, desc=f'Training {epoch}/{n_epochs}'):
            batch += 1
            
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            out = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = loss_func(out.logits, labels)
            loss.backward()
            optimizer.step()

            train_loss.append(loss.item())

            pred = torch.argmax(out.logits, dim=1)
            train_acc.append(calc_metrics(labels, pred))
            
            # train log
            if batch % log_batch_count == 0:
                val_loss, val_acc = test(model, val_loader, device, tqdm_desc='Validating')
                val_loss_hyst.append(np.mean(val_loss))
                model.train()
                
                wandb.log({"val": {"acc": np.mean(val_acc), "batch_loss": np.mean(val_loss), "mean_loss": np.mean(val_loss_hyst)},
                           "train": {"acc": np.mean(train_acc), "batch_loss": loss.item(), "mean_loss": np.mean(train_loss)}})
                
                train_acc = []

        if scheduler is not None:
            scheduler.step()
            
    # последние батчи
    val_loss, val_acc = test(model, val_loader, device, tqdm_desc='Validating')
    val_loss_hyst.append(np.mean(val_loss))
                
    wandb.log({"val": {"acc": np.mean(val_acc), "batch_loss": np.mean(val_loss), "mean_loss": np.mean(val_loss_hyst)},
                "train": {"acc": np.mean(train_acc), "batch_loss": loss.item(), "mean_loss": np.mean(train_loss)}})
    print(f"Last batches:")
    print(f" train loss: {np.mean(train_loss)}, train acc: {np.mean(train_acc)}")
    print(f" val loss: {np.mean(val_loss)}, val acc: {np.mean(val_acc)}\n")

    wandb.unwatch()
    run.finish()

In [4]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


In [5]:
wandb.login(key="eba16103be2afd0b5c96243771d60f5d7e562f68")

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [6]:
table_path = "/kaggle/input/course-work/full_processed_dataset.csv"
data = pd.read_csv(table_path)
data.dropna(axis=0, how='any', inplace=True)
data.parsed_text = data.parsed_text.astype(str)

print(data.shape)
data.head()

(37119, 3)


Unnamed: 0,url,base_category_nm,parsed_text
0,https://aspect-school.ru,Образование,курс маникюр школа обучение ученик мастер ногт...
1,https://rlagency.ru,Развлечения,билет купить руб подробный любовь дк павлово с...
2,https://kozlovatravel.u-on.ru,Развлечения,Автоматизированная система U ON Travel Забыли ...
3,https://rootsyou.tilda.ws,Красота,крем мыло баттеры аромароллер косметика душа п...
4,https://esk.one/p/wadgpcigy2vveo,Образование,прототип блок схема поддержка mind esk one сер...


In [7]:
lol = []
for i in range(data.shape[0]):
    if data.loc[i, 'parsed_text'].find('Страница входа Восстановить Зарегистрироваться') != -1:
        lol.append(i)
data.drop(index=lol, inplace=True)
print(data.shape)

(36891, 3)


In [8]:
data.rename(columns={"parsed_text": "text"}, inplace=True)

In [9]:
data['base_category_nm'].unique()

array(['Образование', 'Развлечения', 'Красота',
       'Продажа различных товаров', 'Авиабилеты', 'Одежда',
       'Дом и ремонт', 'Одежда и аксессуары', 'Связь и телеком',
       'Медицина', 'Салоны красоты и здоровья',
       'Электроника и бытовая техника', 'Различные услуги',
       'Подарки и сувениры', 'Цифровые услуги',
       'Заведения общественного питания', 'Автозапчасти и автосервисы',
       'Спорт', 'Зоомагазины и ветеринарные клиники', 'Транспорт',
       'Продукты', 'Рукоделие и творчество',
       'Адвокаты и юридические услуги', 'Ювелирные изделия и часы',
       'Книги и канцтовары', 'Отели', 'Печать и сканирование',
       'Танцевальные залы, школы и студии', 'Финансовые услуги', 'Цветы',
       'Магазины игрушек и хобби-товаров', 'Ремонт компьютеров',
       'Жилищно-коммунальные услуги', 'Аптеки', 'Спорттовары',
       'Курьерские услуги', 'Фотостудии', 'Оптика', 'Топливо',
       'Химчистка', 'Ателье и ремонт одежды', 'Ломбарды',
       'Клубы видеоигр', 'Бары, п

In [10]:
num_labels = len(data['base_category_nm'].unique())
cat_to_num = {cat : i for i, cat in enumerate(data['base_category_nm'].unique())}

In [11]:
cat_to_num

{'Образование': 0,
 'Развлечения': 1,
 'Красота': 2,
 'Продажа различных товаров': 3,
 'Авиабилеты': 4,
 'Одежда': 5,
 'Дом и ремонт': 6,
 'Одежда и аксессуары': 7,
 'Связь и телеком': 8,
 'Медицина': 9,
 'Салоны красоты и здоровья': 10,
 'Электроника и бытовая техника': 11,
 'Различные услуги': 12,
 'Подарки и сувениры': 13,
 'Цифровые услуги': 14,
 'Заведения общественного питания': 15,
 'Автозапчасти и автосервисы': 16,
 'Спорт': 17,
 'Зоомагазины и ветеринарные клиники': 18,
 'Транспорт': 19,
 'Продукты': 20,
 'Рукоделие и творчество': 21,
 'Адвокаты и юридические услуги': 22,
 'Ювелирные изделия и часы': 23,
 'Книги и канцтовары': 24,
 'Отели': 25,
 'Печать и сканирование': 26,
 'Танцевальные залы, школы и студии': 27,
 'Финансовые услуги': 28,
 'Цветы': 29,
 'Магазины игрушек и хобби-товаров': 30,
 'Ремонт компьютеров': 31,
 'Жилищно-коммунальные услуги': 32,
 'Аптеки': 33,
 'Спорттовары': 34,
 'Курьерские услуги': 35,
 'Фотостудии': 36,
 'Оптика': 37,
 'Топливо': 38,
 'Химчистка

In [12]:
data['target'] = data['base_category_nm'].apply(lambda x: cat_to_num[x])

In [13]:
data.head()

Unnamed: 0,url,base_category_nm,text,target
0,https://aspect-school.ru,Образование,курс маникюр школа обучение ученик мастер ногт...,0
1,https://rlagency.ru,Развлечения,билет купить руб подробный любовь дк павлово с...,1
2,https://kozlovatravel.u-on.ru,Развлечения,Автоматизированная система U ON Travel Забыли ...,1
3,https://rootsyou.tilda.ws,Красота,крем мыло баттеры аромароллер косметика душа п...,2
4,https://esk.one/p/wadgpcigy2vveo,Образование,прототип блок схема поддержка mind esk one сер...,0


In [43]:
from collections import Counter

lol = Counter(data.target)
lol

Counter({0: 8364,
         1: 4110,
         6: 3933,
         12: 2312,
         7: 1808,
         16: 1427,
         15: 1323,
         11: 1192,
         2: 966,
         8: 958,
         14: 933,
         20: 932,
         3: 693,
         29: 633,
         4: 602,
         17: 581,
         5: 580,
         25: 569,
         10: 491,
         13: 444,
         22: 432,
         9: 425,
         21: 422,
         32: 320,
         24: 313,
         30: 254,
         23: 228,
         28: 226,
         18: 207,
         36: 190,
         34: 180,
         19: 168,
         27: 164,
         26: 150,
         35: 102,
         37: 55,
         40: 47,
         39: 33,
         31: 28,
         38: 22,
         42: 22,
         41: 20,
         33: 18,
         43: 9,
         44: 5})

In [28]:
# Load model directly

config = XLMRobertaConfig().from_pretrained("FacebookAI/xlm-roberta-base")
config.num_labels = num_labels

tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
model = XLMRobertaForSequenceClassification(config).to(device)

In [15]:
model.classifier

XLMRobertaClassificationHead(
  (dense): Linear(in_features=768, out_features=768, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (out_proj): Linear(in_features=768, out_features=45, bias=True)
)

In [16]:
%%time

TEST_SIZE = 0.3
SPLIT_RANDOM_SEED = 42
MAX_LENGTH = 512

def encode(examples):
    result = tokenizer(examples["text"], truncation=True, max_length=MAX_LENGTH, padding="max_length")
    return result

dataset = Dataset.from_pandas(data)
tokenized_datasets = dataset.map(encode, batched=True, remove_columns="text")

Map:   0%|          | 0/36891 [00:00<?, ? examples/s]

CPU times: user 34.3 s, sys: 457 ms, total: 34.7 s
Wall time: 16.3 s


In [38]:
%%time


input_ids_train, input_ids_v, attention_mask_train, attention_mask_v, label_train, label_v = train_test_split(torch.tensor(tokenized_datasets['input_ids']), 
                                                                                                  torch.tensor(tokenized_datasets['attention_mask']), 
                                                                                                  torch.tensor(tokenized_datasets['target']), 
                                                                                                  random_state=SPLIT_RANDOM_SEED, test_size=TEST_SIZE, shuffle=True)

input_ids_val, input_ids_test, attention_mask_val, attention_mask_test, label_val, label_test = train_test_split(input_ids_v, attention_mask_v, label_v, 
                                                                                                  random_state=SPLIT_RANDOM_SEED, test_size=0.33, shuffle=True)

train_dataset = TensorDataset(input_ids_train, attention_mask_train, label_train)
val_dataset = TensorDataset(input_ids_val, attention_mask_val, label_val)
test_dataset = TensorDataset(input_ids_test, attention_mask_test, label_test)

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [36]:
np.bincount(label_train.numpy())

array([5884, 2866,  678,  477,  411,  407, 2746, 1215,  668,  293,  339,
        836, 1638,  307,  659,  940, 1007,  398,  136,  106,  667,  311,
        300,  165,  225,  405,  103,  118,  162,  442,  182,   18,  227,
         10,  129,   66,  137,   37,   15,   24,   34,    7,   18,    7,
          3])

In [37]:
np.bincount(label_v.numpy())

array([2480, 1244,  288,  216,  191,  173, 1187,  593,  290,  132,  152,
        356,  674,  137,  274,  383,  420,  183,   71,   62,  265,  111,
        132,   63,   88,  164,   47,   46,   64,  191,   72,   10,   93,
          8,   51,   36,   53,   18,    7,    9,   13,   13,    4,    2,
          2])

In [18]:
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [29]:
optimizer = optim.AdamW(model.parameters(), lr=2e-5, weight_decay=2e-5)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=3, eta_min=3e-10)
train(model, optimizer, 1, train_loader, val_loader, batch_size, scheduler, 100, True)

VBox(children=(Label(value='0.001 MB of 0.181 MB uploaded\r'), FloatProgress(value=0.0073064500857416385, max=…

Init loss:
 val loss: 3.7874760709959885, val acc: 0.03169258004926108



Training 0/1: 100%|██████████| 1614/1614 [1:07:15<00:00,  2.50s/it]  


Last batches:
 train loss: 2.491223642189411, train acc: 0.40803571428571433
 val loss: 2.015323805141038, val acc: 0.44731373152709364



VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [30]:
test_loss, test_acc = test(model, test_loader, device, tqdm_desc='Test')
print(f'loss = {np.mean(test_loss)}, accuracy = {np.mean(test_acc)}')

loss = 1.9894375816703362, accuracy = 0.45098253275109174


In [26]:
import gc
torch.cuda.empty_cache()
gc.collect()

0