In [2]:
!pip install iterative-stratification

Collecting iterative-stratification
  Downloading iterative_stratification-0.1.9-py3-none-any.whl.metadata (1.3 kB)
Downloading iterative_stratification-0.1.9-py3-none-any.whl (8.5 kB)
Installing collected packages: iterative-stratification
Successfully installed iterative-stratification-0.1.9


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict, load_dataset
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
import ast
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

2025-05-02 10:36:59.225091: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746182219.653352      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746182219.778539      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
df = pd.read_csv('/kaggle/input/cleared-new/cleared_new.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5700 entries, 0 to 5699
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5700 non-null   object
 1   class   5700 non-null   object
dtypes: object(2)
memory usage: 89.2+ KB


In [5]:
df['class'] = df['class'].apply(lambda x: ast.literal_eval(str(x)))

In [6]:
all_labels = ['личная жизнь', 'политика', 'реклама', 'соцсети', 'спорт', 'юмор']

# Сортируем метки для воспроизводимости
sorted_labels = sorted(all_labels)

# Инициализируем бинаризатор
mlb = MultiLabelBinarizer(classes=sorted_labels)
mlb.fit(sorted_labels)

binary_matrix = mlb.transform(df['class'])
df['labels'] = list(binary_matrix)
df.head()

Unnamed: 0,text,class,labels
0,твой лучший секс спрятан здесь 🔞 делюсь канал...,"[реклама, личная жизнь]","[1, 0, 1, 0, 0, 0]"
1,⭐️ кнопка: ⭐️start⭐️(https://t.me/major/start...,[соцсети],"[0, 0, 0, 1, 0, 0]"
2,а продолжение где? правильно. в моем сообществ...,[соцсети],"[0, 0, 0, 1, 0, 0]"
3,тем временем моя авторская телега уверенно в т...,[соцсети],"[0, 0, 0, 1, 0, 0]"
4,"у меня есть двоюродная сестра, у нее есть сын ...",[личная жизнь],"[1, 0, 0, 0, 0, 0]"


In [7]:
class_mapping = {label: idx for idx, label in enumerate(mlb.classes_)}
class_mapping

{'личная жизнь': 0,
 'политика': 1,
 'реклама': 2,
 'соцсети': 3,
 'спорт': 4,
 'юмор': 5}

In [8]:
pre_dataset = Dataset.from_pandas(df)

In [9]:

# Бинарные метки (n_samples, n_classes)
labels = np.array(pre_dataset["labels"])

# Стратифицированное разбиение
msss = MultilabelStratifiedShuffleSplit(
    n_splits=1,
    test_size=0.2,
    random_state=42
)

train_idx, val_idx = next(msss.split(np.zeros(len(labels)), labels))

train_dataset = pre_dataset.select(train_idx)
val_dataset = pre_dataset.select(val_idx)

In [10]:
def print_label_distribution(pre_dataset, name):
    labels = np.array(pre_dataset["labels"])
    label_counts = pd.DataFrame(labels).sum(axis=0)
    print(f"{name} распределение:\n{label_counts}")

print_label_distribution(train_dataset, "Train")
print_label_distribution(val_dataset, "Validation")

Train распределение:
0     426
1     311
2     840
3     759
4    1376
5     873
dtype: int64
Validation распределение:
0    107
1     78
2    210
3    190
4    344
5    218
dtype: int64


In [11]:
dataset = DatasetDict({
    "train": train_dataset,
    "test": val_dataset  # или "test" для тестовой выборки
})

In [12]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'class', 'labels'],
        num_rows: 4560
    })
    test: Dataset({
        features: ['text', 'class', 'labels'],
        num_rows: 1140
    })
})


In [13]:
# Токенизация
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Map:   0%|          | 0/4560 [00:00<?, ? examples/s]

Map:   0%|          | 0/1140 [00:00<?, ? examples/s]

In [14]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'class', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4560
    })
    test: Dataset({
        features: ['text', 'class', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1140
    })
})

In [15]:
print("Тип всего датасета:", type(tokenized_datasets))
print("Ключи датасета:", tokenized_datasets.keys())
print("Тип меток в train:", type(tokenized_datasets["train"]["labels"][0]))
print("Пример меток:", tokenized_datasets["train"]["labels"][:3])

Тип всего датасета: <class 'datasets.dataset_dict.DatasetDict'>
Ключи датасета: dict_keys(['train', 'test'])
Тип меток в train: <class 'list'>
Пример меток: [[1, 0, 1, 0, 0, 0], [0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 1, 0]]


In [16]:
def convert_to_float32(batch):
    return {
        'labels': [torch.tensor(x, dtype=torch.float32) for x in batch['labels']]
    }

tokenized_datasets = tokenized_datasets.map(convert_to_float32, batched=True)

Map:   0%|          | 0/4560 [00:00<?, ? examples/s]

Map:   0%|          | 0/1140 [00:00<?, ? examples/s]

In [17]:
tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


In [26]:
for split in tokenized_datasets:
    tokenized_datasets[split] = tokenized_datasets[split].map(
        lambda x: {'labels': torch.tensor(x['labels'], dtype=torch.float32)},
        batched=True
    )

Map:   0%|          | 0/4560 [00:00<?, ? examples/s]

  lambda x: {'labels': torch.tensor(x['labels'], dtype=torch.float32)},


Map:   0%|          | 0/1140 [00:00<?, ? examples/s]

In [27]:
print("Тип меток после преобразования:", type(tokenized_datasets["train"]["labels"][0]))
print("Пример метки:", tokenized_datasets["train"]["labels"][0])
print("Формат train:", tokenized_datasets["train"].format)
print("Формат test:", tokenized_datasets["test"].format)
print(tokenized_datasets["train"]["labels"][0].dtype)

Тип меток после преобразования: <class 'torch.Tensor'>
Пример метки: tensor([1., 0., 1., 0., 0., 0.])
Формат train: {'type': 'torch', 'format_kwargs': {'dtype': torch.float32}, 'columns': ['labels'], 'output_all_columns': False}
Формат test: {'type': 'torch', 'format_kwargs': {'dtype': torch.float32}, 'columns': ['labels'], 'output_all_columns': False}
torch.float32


In [20]:
tokenized_datasets.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "token_type_ids", "labels"],
    dtype=torch.float32,
    output_all_columns=False
)

In [21]:
tokenized_datasets["train"] = tokenized_datasets["train"].with_format(
    "torch",
    columns=["labels"],
    dtype=torch.float32
)
tokenized_datasets["test"] = tokenized_datasets["test"].with_format(
    "torch",
    columns=["labels"],
    dtype=torch.float32
)

In [28]:
print("Тип меток после преобразования:", type(tokenized_datasets["train"]["labels"][0]))
print("Пример метки:", tokenized_datasets["train"]["labels"][0])


Тип меток после преобразования: <class 'torch.Tensor'>
Пример метки: tensor([1., 0., 1., 0., 0., 0.])


In [23]:
print("Train labels type:", tokenized_datasets["train"][0]["labels"].type())
print("Test labels type:", tokenized_datasets["test"][0]["labels"].type())

Train labels type: torch.FloatTensor
Test labels type: torch.FloatTensor


In [24]:
print("Формат train:", tokenized_datasets["train"].format)
print("Формат test:", tokenized_datasets["test"].format)

Формат train: {'type': 'torch', 'format_kwargs': {'dtype': torch.float32}, 'columns': ['labels'], 'output_all_columns': False}
Формат test: {'type': 'torch', 'format_kwargs': {'dtype': torch.float32}, 'columns': ['labels'], 'output_all_columns': False}


In [29]:
tokenized_datasets.with_format('torch')

DatasetDict({
    train: Dataset({
        features: ['text', 'class', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4560
    })
    test: Dataset({
        features: ['text', 'class', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1140
    })
})

In [30]:
# Создание модели
model = BertForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased", 
    num_labels=len(sorted_labels),
    problem_type="multi_label_classification"
    )

# Обучение
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, predictions)}

training_args = TrainingArguments(
    output_dir="/kaggle/working/results",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    num_train_epochs=20,
    eval_strategy="steps",
    log_level='info',
    do_train=True,  # Ключевое изменение!
    do_eval=True,
    logging_steps=500,
    save_steps=500,
    eval_steps=500,
    fp16=True,  # Если GPU поддерживает
    report_to=["tensorboard"]
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using auto half precision backend


In [32]:
tokenized_datasets['train'].format

{'type': 'torch',
 'format_kwargs': {'dtype': torch.float32},
 'columns': ['labels'],
 'output_all_columns': False}

In [37]:
tokenized_datasets.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'labels'],  # Преобразовать в torch.Tensor
    output_all_columns=True  # Оставить остальные столбцы в исходном виде
)

In [38]:
print("Train labels type:", tokenized_datasets["train"][0]["labels"].type())
print("Test labels type:", tokenized_datasets["test"][0]["labels"].type())

Train labels type: torch.LongTensor
Test labels type: torch.LongTensor


In [39]:
tokenized_datasets['train'].format

{'type': 'torch',
 'format_kwargs': {},
 'columns': ['input_ids', 'attention_mask', 'labels'],
 'output_all_columns': True}

In [41]:
tokenized_datasets = tokenized_datasets.map(
    lambda x: {"labels": x["labels"].to(torch.float32)},
    batched=True
)

Map:   0%|          | 0/4560 [00:00<?, ? examples/s]

Map:   0%|          | 0/1140 [00:00<?, ? examples/s]

In [42]:
trainer.train()


The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: class, text. If class, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 4,560
  Num Epochs = 20
  Instantaneous batch size per device = 16
  Training with DataParallel so batch size has been adjusted to: 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 2,860
  Number of trainable parameters = 177,858,054


RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/parallel/parallel_apply.py", line 96, in _worker
    output = module(*input, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/models/bert/modeling_bert.py", line 1713, in forward
    loss = loss_fct(logits, labels)
           ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/loss.py", line 819, in forward
    return F.binary_cross_entropy_with_logits(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/functional.py", line 3628, in binary_cross_entropy_with_logits
    return torch.binary_cross_entropy_with_logits(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: result type Float can't be cast to the desired output type Long


In [None]:
# Сохранение модели
model.save_pretrained("./my_bert_classifier")
tokenizer.save_pretrained("./my_bert_classifier")