In [1]:
%%capture
!pip install datasets

In [1]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import multiprocessing
import torch
from torch import nn
# import torch_xla.core.xla_model as xm
from datasets import Dataset, Features, ClassLabel, Value, load_from_disk

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
fields = ['text', 'topic', 'date']
df = pd.read_csv('/content/drive/MyDrive/data/preprocessed.csv', parse_dates=['date'],
                 dtype={'text': object, 'topic': object},
                 usecols=fields)
df = df.sort_values('date').reset_index(drop=True)

In [4]:
# df.drop(columns=['date', 'url', 'title', 'title_clean', 'text_clean', 'title_tokens', 'text_tokens'], inplace=True)
df.drop(columns=['date'], inplace=True)

In [5]:
train_end = int(df.shape[0] * 0.9)
val_end = int(df.shape[0] * 0.95)

train_df = df[:train_end]
val_df = df[train_end:val_end]
test_df = df[val_end:]

In [None]:
total_classes = df['topic'].unique().shape[0]

for split_name, split_df in [('train', train_df), ('val', val_df), ('test', test_df)]:
    vc = split_df['topic'].value_counts(normalize=True).round(3)
    print(f"\n{split_name.upper()} распределение по классам (доля):\n", vc)
    print(f"Есть все классы: {vc.shape[0] == total_classes}")



TRAIN распределение по классам (доля):
 topic
Россия               0.225
Мир                  0.190
Экономика            0.111
Спорт                0.086
Культура             0.074
Бывший СССР          0.073
Наука и техника      0.073
Интернет и СМИ       0.059
Из жизни             0.036
Дом                  0.028
Силовые структуры    0.023
Бизнес               0.010
Ценности             0.007
Путешествия          0.006
Name: proportion, dtype: float64
Есть все классы: True

VAL распределение по классам (доля):
 topic
Россия               0.164
Мир                  0.146
Спорт                0.100
Силовые структуры    0.075
Экономика            0.069
Культура             0.068
Интернет и СМИ       0.068
Бывший СССР          0.067
Наука и техника      0.054
Ценности             0.053
Дом                  0.040
Из жизни             0.038
Путешествия          0.033
Бизнес               0.025
Name: proportion, dtype: float64
Есть все классы: True

TEST распределение по классам (доля):
 to

In [None]:
from sklearn.utils import resample

max_count = train_df['topic'].value_counts().max()
balanced = []
for label, grp in train_df.groupby('topic'):
    if len(grp) < max_count:
        ups = resample(grp, replace=True, n_samples=max_count, random_state=42)
        balanced.append(ups)
    else:
        balanced.append(grp)

In [None]:
train_df = pd.concat(balanced).sample(frac=1, random_state=42).reset_index(drop=True)

In [17]:
model_name = 'DeepPavlov/rubert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
num_cores = multiprocessing.cpu_count()

In [34]:
unique_labels = sorted(df['topic'].unique())

features = Features({
    'text': Value('string'),
    'topic': ClassLabel(names=unique_labels)
})

In [None]:
def tokenize_batch(batch):
    toks = tokenizer(
        batch['text'],
        truncation=True,
        padding='max_length',
        max_length=512
    )
    toks['labels'] = batch['topic']
    return toks


train_ds = Dataset.from_pandas(train_df, features=features)
val_ds = Dataset.from_pandas(val_df, features=features)
test_ds = Dataset.from_pandas(test_df, features=features)

train_ds = train_ds.map(tokenize_batch,
                         batched=True,
                         remove_columns=train_df.columns.tolist())
val_ds = val_ds.map(tokenize_batch,
                     batched=True,
                     remove_columns=val_df.columns.tolist())
test_ds = test_ds.map(tokenize_batch,
                       batched=True,
                       remove_columns=test_df.columns.tolist())

train_ds.set_format('torch')
val_ds.set_format('torch')
test_ds.set_format('torch')

Map:   0%|          | 0/661693 [00:00<?, ? examples/s]

Map:   0%|          | 0/36761 [00:00<?, ? examples/s]

Map:   0%|          | 0/36761 [00:00<?, ? examples/s]

In [None]:
# train_ds.save_to_disk('/content/drive/MyDrive/data/train_ds.hf')
# val_ds.save_to_disk('/content/drive/MyDrive/data/val_ds.hf')
# test_ds.save_to_disk('/content/drive/MyDrive/data/test_ds.hf')

In [6]:
train_ds = load_from_disk('/content/drive/MyDrive/data/train_ds.hf')
val_ds = load_from_disk('/content/drive/MyDrive/data/val_ds.hf')
test_ds = load_from_disk('/content/drive/MyDrive/data/test_ds.hf')

  table = cls._concat_blocks(blocks, axis=0)


In [7]:
num_classes = df['topic'].nunique()

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_classes,
    problem_type="single_label_classification"
)

NameError: name 'model_name' is not defined

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
total_samples = train_df.shape[0]
vc = train_df['topic'].value_counts()

class_weights = torch.tensor([
    total_samples / (num_classes * count)
    for count in train_df['topic'].value_counts()
], device=device)

loss_fn = nn.CrossEntropyLoss(weight=class_weights)

In [22]:
class WeightedTrainer(Trainer):
    def __init__(self, *args, loss_fn=None, **kwargs):
        super().__init__(*args, **kwargs)
        if loss_fn is None:
            raise ValueError("Pass your weighted loss via `loss_fn`")
        self.loss_fn = loss_fn

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        weight = self.loss_fn.weight.to(logits.dtype)
        loss_fn = nn.CrossEntropyLoss(weight=weight)
        loss = loss_fn(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [None]:
# %%capture
# !pip install deepspeed

Collecting deepspeed
  Downloading deepspeed-0.16.7.tar.gz (1.5 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.5 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m68.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting hjson (from deepspeed)
  Downloading hjson-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting ninja (from deepspeed)
  Downloading ninja-1.11.1.4-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->deepspeed)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->deepspeed)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->deepspeed)
  Dow

In [None]:
# %%capture
# !pip install mpi4py

Collecting mpi4py
  Downloading mpi4py-4.0.3.tar.gz (466 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/466.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m466.3/466.3 kB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: mpi4py
  Building wheel for mpi4py (pyproject.toml) ... [?25l[?25hdone
  Created wheel for mpi4py: filename=mpi4py-4.0.3-cp311-cp311-linux_x86_64.whl size=4438171 sha256=2e1723ad960948fd408081981bee4a25e12a0dbfac42923878d4deb2ab0f2eb2
  Stored in directory: /root/.cache/pip/wheels/5c/56/17/bf6ba37aa971a191a8b9eaa188bf5ec855b8911c1c56fb1f84
Successfully built mpi4py
Installing collected packages: mpi4py
Successfully installed 

In [None]:
# import deepspeed
# deepspeed.init_distributed()

In [8]:
!rm -r /content/content

In [23]:
training_args = TrainingArguments(
    output_dir='out',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=2e-5,
    logging_dir='./logs',
    logging_steps=1000,
    report_to='none',

    fp16=True,
    save_total_limit=10
    # torch_compile=True,
    # torch_compile_backend="inductor",
    # torch_compile_mode="reduce-overhead",
    # gradient_accumulation_steps=4,
    # dataloader_num_workers=4,
    # deepspeed="/content/drive/MyDrive/data/ds_config.json"
)

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    loss_fn=loss_fn,

)

# trainer.train()

  super().__init__(*args, **kwargs)


In [24]:
trainer.train(resume_from_checkpoint=True)

Step,Training Loss
189000,0.1831
190000,0.1447
191000,0.1296
192000,0.1599
193000,0.1573
194000,0.1784
195000,0.1737
196000,0.1493
197000,0.1539
198000,0.1578


TrainOutput(global_step=248136, training_loss=0.03725401405194021, metrics={'train_runtime': 3929.4636, 'train_samples_per_second': 505.178, 'train_steps_per_second': 63.148, 'total_flos': 5.223525045082583e+17, 'train_loss': 0.03725401405194021, 'epoch': 3.0})

In [25]:
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [31]:
def predict_one(text: str):
    inputs = tokenizer(
        text,
        truncation=True,
        padding='max_length',
        max_length=512,
        return_tensors='pt'
    )
    return inputs

In [32]:
def classify(text: str):
    inputs = predict_one(text).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probs = nn.functional.softmax(logits, dim=-1)
        pred_idx = torch.argmax(probs, dim=-1).item()
        confidence = probs[0, pred_idx].item()
    return pred_idx, confidence, probs.squeeze().tolist()

In [39]:
texts = [
    "Экономические итоги первого квартала перевыполнили прогнозы.",
    "Новый фильм режиссёра выйдет в прокат этим летом."
]
[unique_labels[classify(text)[0]] for text in texts]

['Экономика', 'Культура']

In [42]:
trainer.save_model("/content/drive/MyDrive/models/bert")

In [14]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model = AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/models/bert").to(device)
tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/models/bert")

In [10]:
from torch.utils.data import DataLoader

val_loader = DataLoader(val_ds, batch_size=16)
test_loader = DataLoader(test_ds, batch_size=16)

In [15]:
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=-1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())


In [21]:
from sklearn.metrics import accuracy_score, classification_report, f1_score

acc = accuracy_score(all_labels, all_preds)
print(f"Accuracy: {acc:.4f}")

report = classification_report(
    all_labels,
    all_preds,
    target_names=[str(i) for i in model.config.id2label.values()]
)
print(report)

Accuracy: 0.8229
              precision    recall  f1-score   support

     LABEL_0       0.52      0.69      0.59       926
     LABEL_1       0.95      0.89      0.92      2471
     LABEL_2       0.88      0.49      0.63      1453
     LABEL_3       0.59      0.76      0.66      1393
     LABEL_4       0.77      0.86      0.81      2488
     LABEL_5       0.90      0.88      0.89      2502
     LABEL_6       0.89      0.83      0.86      5370
     LABEL_7       0.91      0.77      0.83      1986
     LABEL_8       0.80      0.80      0.80      1216
     LABEL_9       0.76      0.85      0.80      6042
    LABEL_10       0.66      0.79      0.72      2757
    LABEL_11       0.96      0.98      0.97      3656
    LABEL_12       0.96      0.86      0.91      1949
    LABEL_13       0.87      0.69      0.77      2552

    accuracy                           0.82     36761
   macro avg       0.82      0.80      0.80     36761
weighted avg       0.84      0.82      0.82     36761

