In [1]:
%%capture
!pip install datasets
!pip install 'numpy < 2.0'

In [1]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import multiprocessing
import torch
from torch import nn
# import torch_xla.core.xla_model as xm
from datasets import Dataset, Features, ClassLabel, Value, load_from_disk

In [2]:
from google.colab import drive

# Mount Google Drive at /content/drive; later cells read and write under
# /content/drive/MyDrive (tokenized datasets and the saved model).
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Columns that are actually loaded from every split.
fields = ['text', 'topic']

# Force object dtype for exactly the columns that are read. The previous dtype
# mapping named 'text_clean', which is not among `fields`, so 'text' had no
# explicit dtype.
csv_dtypes = {field: object for field in fields}

# Same reader settings for the three splits; only the path differs.
train_df, val_df, test_df = (
    pd.read_csv(csv_path, dtype=csv_dtypes, usecols=fields)
    for csv_path in ('../data/train.csv', '../data/val.csv', '../data/test.csv')
)

In [None]:
# from sklearn.utils import resample

# max_count = train_df['topic'].value_counts().max()
# balanced = []
# for label, grp in train_df.groupby('topic'):
#     if len(grp) < max_count:
#         ups = resample(grp, replace=True, n_samples=max_count, random_state=42)
#         balanced.append(ups)
#     else:
#         balanced.append(grp)

In [None]:
# train_df = pd.concat(balanced).sample(frac=1, random_state=42).reset_index(drop=True)

In [4]:
# Hub checkpoint name; it is used here for the tokenizer and again further
# down when the sequence-classification model is created.
model_name = 'DeepPavlov/rubert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [5]:
# Number of CPUs reported by `multiprocessing.cpu_count()`.
# NOTE(review): `num_cores` is not referenced in the visible cells — confirm
# whether it was meant to be passed to a parallel step or can be dropped.
num_cores = multiprocessing.cpu_count()

In [5]:
# Topic names found in the training split, in sorted order; the position of a
# name in this list is the integer id that ClassLabel assigns to it.
unique_labels = list(train_df['topic'].unique())
unique_labels.sort()

# Dataset schema: free text plus a categorical topic label.
text_feature = Value('string')
topic_feature = ClassLabel(names=unique_labels)
features = Features({'text': text_feature, 'topic': topic_feature})

In [None]:
def tokenize_batch(batch):
    """Tokenize a batch of rows and attach the labels.

    Args:
        batch: mapping with a 'text' entry (passed to the notebook-level
            `tokenizer`) and a 'topic' entry (copied to the output as
            'labels').

    Returns:
        The tokenizer output, truncated and padded to 512 tokens, with an
        extra 'labels' entry.
    """
    tokenizer_options = dict(truncation=True, padding='max_length', max_length=512)
    encoded = tokenizer(batch['text'], **tokenizer_options)
    encoded['labels'] = batch['topic']
    return encoded


def _to_torch_dataset(frame):
    """Turn one DataFrame split into a tokenized dataset in 'torch' format.

    The frame is converted with the shared `features` schema, tokenized in
    batches with `tokenize_batch` (dropping the original DataFrame columns),
    and finally switched to the 'torch' output format.
    """
    split = Dataset.from_pandas(frame, features=features)
    split = split.map(
        tokenize_batch,
        batched=True,
        remove_columns=frame.columns.tolist(),
    )
    split.set_format('torch')
    return split


train_ds = _to_torch_dataset(train_df)
val_ds = _to_torch_dataset(val_df)
test_ds = _to_torch_dataset(test_df)

In [None]:
# Persist the tokenized splits on Google Drive so tokenization can be skipped
# on a later run.
for split_ds, target_dir in (
    (train_ds, '/content/drive/MyDrive/data/train_ds.hf'),
    (val_ds, '/content/drive/MyDrive/data/val_ds.hf'),
    (test_ds, '/content/drive/MyDrive/data/test_ds.hf'),
):
    split_ds.save_to_disk(target_dir)

In [None]:
# train_ds = load_from_disk('/content/drive/MyDrive/data/train_ds.hf')
# val_ds = load_from_disk('/content/drive/MyDrive/data/val_ds.hf')
# test_ds = load_from_disk('/content/drive/MyDrive/data/test_ds.hf')

In [None]:
total_classes = train_df['topic'].nunique()  # in case previous cell wasn't launched

# Sorted topic names, recomputed here so this cell does not depend on the
# features cell having been run. Missing values are dropped so the list length
# always equals `total_classes`.
label_names = sorted(train_df['topic'].dropna().unique())

# Store the id <-> name mapping in the model config so a saved checkpoint
# carries the real topic names instead of generic LABEL_i placeholders.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=total_classes,
    id2label=dict(enumerate(label_names)),
    label2id={name: idx for idx, name in enumerate(label_names)},
    problem_type="single_label_classification"
)

In [8]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [14]:
total_samples = train_df.shape[0]
vc = train_df['topic'].value_counts()

# Inverse-frequency weights: total_samples / (total_classes * class_count).
# `sort_index()` orders the counts by topic name, i.e. the same sorted order
# that is used when the topic names are turned into integer class ids.
# The dtype is set explicitly so the weights do not depend on how torch infers
# a dtype from the NumPy scalars produced by pandas.
class_weights = torch.tensor(
    [total_samples / (total_classes * count) for count in vc.sort_index()],
    dtype=torch.float32,
    device=device,
)

loss_fn = nn.CrossEntropyLoss(weight=class_weights)

In [21]:
class WeightedTrainer(Trainer):
    """Trainer that computes the loss with a user-supplied weighted criterion.

    The criterion passed as `loss_fn` provides the class weights (and, when
    present, `ignore_index`, `reduction` and `label_smoothing`) that are
    applied to the model's logits instead of the model's built-in loss.
    """

    def __init__(self, *args, loss_fn=None, **kwargs):
        """Create the trainer.

        Args:
            *args, **kwargs: forwarded unchanged to `Trainer.__init__`.
            loss_fn: criterion whose settings are used in `compute_loss`.

        Raises:
            ValueError: if `loss_fn` is not provided.
        """
        # Fail fast, before the parent constructor does any work.
        if loss_fn is None:
            raise ValueError("Pass your weighted loss via `loss_fn`")
        super().__init__(*args, **kwargs)
        self.loss_fn = loss_fn

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model and return the weighted cross-entropy loss.

        Args:
            model: model to call; its output must expose `.logits`.
            inputs: batch mapping that contains a "labels" entry.
            return_outputs: when true, return `(loss, outputs)`.
            **kwargs: accepted for compatibility with the caller and ignored.

        Returns:
            The loss tensor, or `(loss, outputs)` if `return_outputs` is true.

        Raises:
            KeyError: if `inputs` has no "labels" entry.
        """
        labels = inputs["labels"]
        # Leave the caller's batch intact: build a label-free view instead of
        # popping "labels" from `inputs`.
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)
        logits = outputs.logits

        # Align the weights with the logits (device and dtype); a criterion
        # without weights is allowed.
        weight = getattr(self.loss_fn, "weight", None)
        if weight is not None:
            weight = weight.to(device=logits.device, dtype=logits.dtype)

        # Rebuild the criterion with the aligned weights while keeping the
        # other settings of the supplied criterion instead of dropping them.
        criterion = nn.CrossEntropyLoss(
            weight=weight,
            ignore_index=getattr(self.loss_fn, "ignore_index", -100),
            reduction=getattr(self.loss_fn, "reduction", "mean"),
            label_smoothing=getattr(self.loss_fn, "label_smoothing", 0.0),
        )
        loss = criterion(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [None]:
# Settings for the fine-tuning run, grouped by purpose.
training_args = TrainingArguments(
    # where results and logs are written
    output_dir='out',
    logging_dir='./logs',
    logging_steps=1000,
    report_to='none',
    save_total_limit=10,
    # optimisation schedule
    num_train_epochs=3,
    learning_rate=2e-5,
    fp16=True,
    # batch sizes
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)

# The custom trainer receives the class-weighted criterion via `loss_fn`.
trainer = WeightedTrainer(
    loss_fn=loss_fn,
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_ds,
    eval_dataset=val_ds,
)

trainer.train()

In [None]:
# trainer.train(resume_from_checkpoint=True)

In [23]:
# Put the model into evaluation mode before the inference cells below; as the
# last expression of the cell this also displays the model structure.
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [24]:
def predict_one(text: str):
    """Tokenize a single text for inference.

    Despite its name this function does not run the model: it only returns the
    notebook-level tokenizer's output for `text`, truncated and padded to 512
    tokens, as PyTorch tensors.
    """
    encode_options = dict(
        truncation=True,
        padding='max_length',
        max_length=512,
        return_tensors='pt',
    )
    return tokenizer(text, **encode_options)

In [25]:
def classify(text: str):
    """Classify one text with the notebook-level model.

    Args:
        text: the text to classify.

    Returns:
        A tuple `(pred_idx, confidence, probs)`: the index of the most
        probable class, the softmax probability of that class, and the
        softmax probabilities as a list.
    """
    encoded = predict_one(text).to(device)
    # Inference only: no gradients are needed.
    with torch.no_grad():
        class_probs = nn.functional.softmax(model(**encoded).logits, dim=-1)
    best_idx = class_probs.argmax(dim=-1).item()
    best_prob = class_probs[0, best_idx].item()
    return best_idx, best_prob, class_probs.squeeze().tolist()

In [26]:
# Two sample sentences for a quick sanity check of the classifier.
texts = [
    "Экономические итоги первого квартала перевыполнили прогнозы.",
    "Новый фильм режиссёра выйдет в прокат этим летом."
]
# Translate each predicted class index back into its topic name.
[unique_labels[pred_idx] for pred_idx, _confidence, _probs in map(classify, texts)]

['Экономика', 'Культура']

In [27]:
# Save the trained model to Google Drive; the next cell loads the model and
# tokenizer back from this same directory.
trainer.save_model("/content/drive/MyDrive/models/bert")

In [14]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Reload the fine-tuned model and its tokenizer from the saved directory.
saved_model_dir = "/content/drive/MyDrive/models/bert"
reloaded_model = AutoModelForSequenceClassification.from_pretrained(saved_model_dir)
model = reloaded_model.to(device)
tokenizer = AutoTokenizer.from_pretrained(saved_model_dir)

In [15]:
from torch.utils.data import DataLoader

# Batched iterators (16 examples per batch) over the validation and test splits.
val_loader, test_loader = (
    DataLoader(split_ds, batch_size=16) for split_ds in (val_ds, test_ds)
)

In [None]:
from tqdm.auto import tqdm

# Collect predicted and true class ids for every example of the test split.
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for batch in tqdm(test_loader):
        # Move only the tensors the model consumes, plus the labels.
        model_inputs = {
            name: batch[name].to(device)
            for name in ('input_ids', 'attention_mask')
        }
        labels = batch['labels'].to(device)

        outputs = model(**model_inputs)
        logits = outputs.logits
        preds = logits.argmax(dim=-1)

        # Accumulate on the host as NumPy values.
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())


In [17]:
from sklearn.metrics import accuracy_score, classification_report, f1_score

# Pass the full list of class ids explicitly. Without `labels`, the report
# derives the classes from the data and raises a ValueError when a class is
# missing from both the true and the predicted labels, because the number of
# classes then no longer matches `target_names`.
print(
    classification_report(
        all_labels,
        all_preds,
        labels=list(range(len(unique_labels))),
        target_names=unique_labels
    )
)

                   precision    recall  f1-score   support

           Бизнес       0.00      0.00      0.00         4
      Бывший СССР       0.87      0.91      0.89      2609
              Дом       0.82      0.84      0.83       800
         Из жизни       0.80      0.78      0.79      2778
   Интернет и СМИ       0.71      0.81      0.75      2530
         Культура       0.89      0.88      0.88      2298
              Мир       0.89      0.82      0.85      6746
  Наука и техника       0.71      0.83      0.77      3090
      Путешествия       0.86      0.71      0.78      1291
           Россия       0.79      0.78      0.78      6891
Силовые структуры       0.62      0.67      0.64      1729
            Спорт       0.95      0.98      0.97      3359
         Ценности       0.93      0.76      0.84      1405
        Экономика       0.86      0.74      0.80      4065

         accuracy                           0.81     39595
        macro avg       0.76      0.75      0.76     3