In [2]:
!pip install iterative-stratification

Collecting iterative-stratification
  Downloading iterative_stratification-0.1.9-py3-none-any.whl.metadata (1.3 kB)
Downloading iterative_stratification-0.1.9-py3-none-any.whl (8.5 kB)
Installing collected packages: iterative-stratification
Successfully installed iterative-stratification-0.1.9


In [19]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict, load_dataset
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
import ast
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from tqdm.auto import tqdm  # Для прогресс-бара


In [4]:
# Load the annotated corpus from the Kaggle input mount and show its schema
# (column names, non-null counts, dtypes).
df = pd.read_csv(filepath_or_buffer='/kaggle/input/cleared-new/cleared_new.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5700 entries, 0 to 5699
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5700 non-null   object
 1   class   5700 non-null   object
dtypes: object(2)
memory usage: 89.2+ KB


In [5]:
# The CSV stores each row's labels as text (presumably a Python list literal);
# parse it back into a real Python object. str() keeps the cell re-runnable
# after the column has already been parsed once.
df['class'] = df['class'].map(lambda raw: ast.literal_eval(str(raw)))

In [6]:
# Closed set of topic labels used in this notebook.
all_labels = ['личная жизнь', 'политика', 'реклама', 'соцсети', 'спорт', 'юмор']

# Sort the labels so the column order of the multi-hot matrix is reproducible.
sorted_labels = sorted(all_labels)

# `classes=` pins the column order. fit() expects an iterable of label
# *collections*, so the label list is wrapped in an outer list; passing the
# flat list of strings would treat every string as a collection of characters
# and only works because `classes` is given explicitly.
mlb = MultiLabelBinarizer(classes=sorted_labels)
mlb.fit([sorted_labels])

# One multi-hot row per sample, one column per entry of sorted_labels.
binary_matrix = mlb.transform(df['class'])
assert binary_matrix.shape == (len(df), len(sorted_labels)), "unexpected label matrix shape"

# Store each row vector in its own cell of the frame.
df['labels'] = list(binary_matrix)
df.head()

Unnamed: 0,text,class,labels
0,твой лучший секс спрятан здесь 🔞 делюсь канал...,"[реклама, личная жизнь]","[1, 0, 1, 0, 0, 0]"
1,⭐️ кнопка: ⭐️start⭐️(https://t.me/major/start...,[соцсети],"[0, 0, 0, 1, 0, 0]"
2,а продолжение где? правильно. в моем сообществ...,[соцсети],"[0, 0, 0, 1, 0, 0]"
3,тем временем моя авторская телега уверенно в т...,[соцсети],"[0, 0, 0, 1, 0, 0]"
4,"у меня есть двоюродная сестра, у нее есть сын ...",[личная жизнь],"[1, 0, 0, 0, 0, 0]"


In [7]:
# Label name -> column index in the multi-hot vectors, following mlb.classes_ order.
class_mapping = dict(zip(mlb.classes_, range(len(mlb.classes_))))
class_mapping

{'личная жизнь': 0,
 'политика': 1,
 'реклама': 2,
 'соцсети': 3,
 'спорт': 4,
 'юмор': 5}

In [8]:
# Wrap the frame in a Hugging Face Dataset so rows can be selected by index.
pre_dataset = Dataset.from_pandas(df)

# Multi-hot label matrix used as the stratification target.
labels = np.array(pre_dataset["labels"])

# A single multilabel-stratified shuffle split: 20% of the rows go to
# validation, with a fixed seed so the split is repeatable.
msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# The splitter only needs the number of samples from X, so a zero vector of
# the right length stands in for the features.
train_idx, val_idx = next(msss.split(np.zeros(labels.shape[0]), labels))

train_dataset, val_dataset = pre_dataset.select(train_idx), pre_dataset.select(val_idx)

In [9]:
def print_label_distribution(pre_dataset, name):
    """Print how many samples carry each label.

    Args:
        pre_dataset: Any mapping-like object whose ``"labels"`` entry can be
            turned into a 2-D array with one row per sample.
        name: Caption printed in front of the counts.

    Returns:
        None. The per-column sums are written to stdout.
    """
    label_matrix = np.asarray(pre_dataset["labels"])
    # Column-wise sum = number of samples in which each label is set.
    per_label_totals = pd.DataFrame(label_matrix).sum(axis="index")
    print(f"{name} распределение:\n{per_label_totals}")

# Compare per-label counts of the two splits to check the stratification.
print_label_distribution(pre_dataset=train_dataset, name="Train")
print_label_distribution(pre_dataset=val_dataset, name="Validation")

Train распределение:
0     426
1     311
2     840
3     759
4    1376
5     873
dtype: int64
Validation распределение:
0    107
1     78
2    210
3    190
4    344
5    218
dtype: int64


In [10]:
# Convert both splits back to pandas for tokenisation.
df_train, df_val = (part.to_pandas() for part in (train_dataset, val_dataset))

# Shapes of the train and validation frames, one per line.
print(df_train.shape, df_val.shape, sep="\n")

(4560, 3)
(1140, 3)


In [11]:
# Sanity check: the tokenizer needs plain Python strings. .iloc[0] takes the
# first row by position, so the probe does not depend on the frame's index
# containing the label 0.
type(df_train['text'].iloc[0])

str

In [15]:
# Tokenisation: padding=True pads every text to the longest one in this call,
# truncation=True cuts texts that exceed the tokenizer's maximum length.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

inputs = tokenizer(df_train['text'].to_list(), padding=True, truncation=True, return_tensors="pt")

# Build the TensorDataset of (input_ids, attention_mask, labels).
# The label column holds one vector per row (object dtype), so the vectors are
# stacked into a single 2-D array first. Handing the Series straight to
# torch.tensor relies on label-based sequence access and breaks on a
# non-default index.
dataset = TensorDataset(
    inputs['input_ids'],
    inputs['attention_mask'],
    torch.tensor(np.stack(df_train['labels'].to_numpy()), dtype=torch.float32)  # float32 for the multi-label loss
)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [16]:
# 4. DataLoader: training mini-batches drawn in a new random order each pass.
batch_size = 16
# shuffle=True has DataLoader create a RandomSampler over the dataset internally.
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [17]:
# 5. Model initialisation.
# The classification head is freshly (randomly) initialised, so seed torch
# first to make a fresh-kernel run repeatable (same seed as the data split).
torch.manual_seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Take the label set from the fitted binarizer instead of hard-coding its size,
# and store the index <-> name mapping in the model config so a saved
# checkpoint carries the label names with it.
label_names = [str(label) for label in mlb.classes_]
model = BertForSequenceClassification.from_pretrained(
    'bert-base-multilingual-cased',
    num_labels=len(label_names),
    id2label=dict(enumerate(label_names)),
    label2id={label: idx for idx, label in enumerate(label_names)},
    problem_type="multi_label_classification"
).to(device)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# AdamW over every model parameter with a fixed learning rate (no scheduler is attached).
optimizer = torch.optim.AdamW(params=model.parameters(), lr=5e-5)

# Accuracy helper
def compute_accuracy(preds, labels):
    """Turn raw model outputs into hard predictions and score them.

    Args:
        preds: Tensor of raw scores (logits), one row per sample.
        labels: Target tensor. More than one dimension is treated as
            multi-label targets, a single dimension as class indices.

    Returns:
        The value of ``accuracy_score(labels, hard_predictions)``.
    """
    if labels.ndim <= 1:
        # Multi-class: pick the highest-scoring class per row.
        hard_preds = preds.argmax(dim=1)
    else:
        # Multi-label: threshold each sigmoid probability at 0.5.
        hard_preds = preds.sigmoid().gt(0.5).int()

    y_true = labels.cpu().numpy()
    y_pred = hard_preds.cpu().numpy()
    return accuracy_score(y_true, y_pred)

# Training loop
epochs = 20
for epoch in range(epochs):
    model.train()
    total_loss = 0
    # Per-epoch accumulators. They have their own names so the notebook-level
    # `all_labels` (label names) and `labels` are not overwritten.
    epoch_logits, epoch_targets = [], []

    # Progress bar over the training batches
    train_progress = tqdm(dataloader, desc=f'Epoch {epoch + 1}/{epochs} [Train]', leave=False)

    for batch in train_progress:
        batch = tuple(t.to(device) for t in batch)
        input_ids, attention_mask, batch_labels = batch

        optimizer.zero_grad()
        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # Collect statistics. Read the loss once, and keep the per-batch
        # tensors on the CPU so a full epoch of logits does not sit on the GPU.
        batch_loss = loss.item()
        total_loss += batch_loss
        epoch_logits.append(outputs.logits.detach().cpu())
        epoch_targets.append(batch_labels.cpu())

        # Update the progress bar
        train_progress.set_postfix({
            'loss': batch_loss,
            'lr': optimizer.param_groups[0]['lr']
        })

    # Accuracy over everything seen in this epoch (training data only).
    epoch_preds = torch.cat(epoch_logits)
    epoch_labels = torch.cat(epoch_targets)
    acc = compute_accuracy(epoch_preds, epoch_labels)

    print(f"\nEpoch {epoch + 1} | Loss: {total_loss/len(dataloader):.4f} | Accuracy: {acc:.4f}")

Epoch 1/20 [Train]:   0%|          | 0/285 [00:00<?, ?it/s]


Epoch 1 | Loss: 0.3460 | Accuracy: 0.3342


Epoch 2/20 [Train]:   0%|          | 0/285 [00:00<?, ?it/s]


Epoch 2 | Loss: 0.3041 | Accuracy: 0.4669


Epoch 3/20 [Train]:   0%|          | 0/285 [00:00<?, ?it/s]


Epoch 3 | Loss: 0.2720 | Accuracy: 0.5311


Epoch 4/20 [Train]:   0%|          | 0/285 [00:00<?, ?it/s]


Epoch 4 | Loss: 0.2415 | Accuracy: 0.5846


Epoch 5/20 [Train]:   0%|          | 0/285 [00:00<?, ?it/s]


Epoch 5 | Loss: 0.2039 | Accuracy: 0.6658


Epoch 6/20 [Train]:   0%|          | 0/285 [00:00<?, ?it/s]


Epoch 6 | Loss: 0.1748 | Accuracy: 0.7092


Epoch 7/20 [Train]:   0%|          | 0/285 [00:00<?, ?it/s]


Epoch 7 | Loss: 0.1516 | Accuracy: 0.7550


Epoch 8/20 [Train]:   0%|          | 0/285 [00:00<?, ?it/s]


Epoch 8 | Loss: 0.1286 | Accuracy: 0.7822


Epoch 9/20 [Train]:   0%|          | 0/285 [00:00<?, ?it/s]


Epoch 9 | Loss: 0.1129 | Accuracy: 0.8127


Epoch 10/20 [Train]:   0%|          | 0/285 [00:00<?, ?it/s]


Epoch 10 | Loss: 0.0981 | Accuracy: 0.8357


Epoch 11/20 [Train]:   0%|          | 0/285 [00:00<?, ?it/s]


Epoch 11 | Loss: 0.0839 | Accuracy: 0.8662


Epoch 12/20 [Train]:   0%|          | 0/285 [00:00<?, ?it/s]


Epoch 12 | Loss: 0.0793 | Accuracy: 0.8713


Epoch 13/20 [Train]:   0%|          | 0/285 [00:00<?, ?it/s]


Epoch 13 | Loss: 0.0714 | Accuracy: 0.8789


Epoch 14/20 [Train]:   0%|          | 0/285 [00:00<?, ?it/s]


Epoch 14 | Loss: 0.0682 | Accuracy: 0.8855


Epoch 15/20 [Train]:   0%|          | 0/285 [00:00<?, ?it/s]


Epoch 15 | Loss: 0.0589 | Accuracy: 0.9004


Epoch 16/20 [Train]:   0%|          | 0/285 [00:00<?, ?it/s]


Epoch 16 | Loss: 0.0571 | Accuracy: 0.9075


Epoch 17/20 [Train]:   0%|          | 0/285 [00:00<?, ?it/s]


Epoch 17 | Loss: 0.0527 | Accuracy: 0.9112


Epoch 18/20 [Train]:   0%|          | 0/285 [00:00<?, ?it/s]


Epoch 18 | Loss: 0.0566 | Accuracy: 0.9053


Epoch 19/20 [Train]:   0%|          | 0/285 [00:00<?, ?it/s]


Epoch 19 | Loss: 0.0526 | Accuracy: 0.9134


Epoch 20/20 [Train]:   0%|          | 0/285 [00:00<?, ?it/s]


Epoch 20 | Loss: 0.0491 | Accuracy: 0.9195


In [21]:
# Persist the fine-tuned weights and the tokenizer files side by side so the
# directory can be reloaded with from_pretrained().
model.save_pretrained(save_directory="/kaggle/working//my_bert_classifier")
tokenizer.save_pretrained(save_directory="/kaggle/working//my_bert_classifier")

('/kaggle/working//my_bert_classifier/tokenizer_config.json',
 '/kaggle/working//my_bert_classifier/special_tokens_map.json',
 '/kaggle/working//my_bert_classifier/vocab.txt',
 '/kaggle/working//my_bert_classifier/added_tokens.json')

In [22]:
from sklearn.metrics import f1_score, precision_score, recall_score
import numpy as np

def compute_metrics(preds, labels):
    """Compute macro F1 / precision / recall and accuracy from model scores.

    Args:
        preds: Array of per-class scores, one row per sample.
            NOTE(review): the multi-label branch thresholds at 0.5, which
            presumably expects probabilities rather than raw logits; no
            caller is visible in this notebook, so confirm before use.
        labels: Target array. 1-D is treated as class indices (multi-class),
            anything else as a multi-label indicator matrix.

    Returns:
        dict with the keys ``'f1'``, ``'precision'``, ``'recall'`` and
        ``'accuracy'``.
    """
    if len(labels.shape) != 1:
        # Multi-label: every score above 0.5 counts as a positive.
        hard_preds = (preds > 0.5).astype(int)
    else:
        # Multi-class: the highest-scoring class wins.
        hard_preds = np.argmax(preds, axis=1)

    macro_scorers = {
        'f1': f1_score,
        'precision': precision_score,
        'recall': recall_score,
    }
    report = {
        metric: scorer(labels, hard_preds, average='macro')
        for metric, scorer in macro_scorers.items()
    }
    report['accuracy'] = accuracy_score(labels, hard_preds)
    return report

In [26]:
# Tokenise the validation texts with the same tokenizer settings as training.
inputs_val = tokenizer(df_val['text'].to_list(), padding=True, truncation=True, return_tensors="pt")

# Build the TensorDataset of (input_ids, attention_mask, labels).
# Stack the per-row label vectors into one 2-D array before building the
# tensor, rather than relying on label-based sequence access to the Series.
dataset_val = TensorDataset(
    inputs_val['input_ids'],
    inputs_val['attention_mask'],
    torch.tensor(np.stack(df_val['labels'].to_numpy()), dtype=torch.float32)  # float32 for multi-label targets
)

# 4. DataLoader for evaluation: no random sampler, so batches come in dataset
# order and every evaluation run visits the samples identically.
batch_size = 16
test_loader = DataLoader(
    dataset_val,
    batch_size=batch_size,
    shuffle=False
)

In [28]:
from sklearn.metrics import f1_score
import numpy as np

model.eval()  # switch the model to evaluation mode
# Per-sample predictions and targets. The names are kept because the metric
# cells below read `all_preds` / `all_labels`.
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        # Targets are only needed for scoring on the host, so they stay on
        # the CPU instead of being copied to the device and back.
        labels = batch[2]

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Multi-label decision: a label is predicted when its sigmoid
        # probability exceeds 0.5.
        preds = (torch.sigmoid(logits) > 0.5).int()

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.numpy())

# Macro-averaged F1 over the labels
f1 = f1_score(all_labels, all_preds, average='macro')
print(f"F1-score: {f1:.4f}")

F1-score: 0.6278


In [47]:
from sklearn.metrics import f1_score, hamming_loss

# F1 under three averaging schemes, then the Hamming loss, all computed on the
# predictions collected by the evaluation loop.
f1 = f1_score(y_true=all_labels, y_pred=all_preds, average='macro')
print(f"F1-score macro: {f1:.4f}")

f1 = f1_score(y_true=all_labels, y_pred=all_preds, average='micro')
print(f"F1-score micro: {f1:.4f}")

f1 = f1_score(y_true=all_labels, y_pred=all_preds, average='weighted')
print(f"F1-score weighted: {f1:.4f}")

hamming = hamming_loss(y_true=all_labels, y_pred=all_preds)
print("Hamming Loss:", hamming)

F1-score macro: 0.6278
F1-score micro: 0.6683
F1-score weighted: 0.6660
Hamming Loss: 0.11915204678362573


In [48]:
from sklearn.metrics import classification_report

# Per-label precision / recall / F1; rows are named after sorted_labels, which
# is the column order the binarizer was configured with.
report_text = classification_report(
    y_true=all_labels,
    y_pred=all_preds,
    target_names=sorted_labels,
)
print(report_text)

              precision    recall  f1-score   support

личная жизнь       0.42      0.48      0.45       107
    политика       0.59      0.78      0.67        78
     реклама       0.58      0.73      0.65       210
     соцсети       0.51      0.65      0.58       190
       спорт       0.79      0.92      0.85       344
        юмор       0.65      0.52      0.58       218

   micro avg       0.63      0.72      0.67      1147
   macro avg       0.59      0.68      0.63      1147
weighted avg       0.63      0.72      0.67      1147
 samples avg       0.60      0.65      0.61      1147



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Save a second copy of the model and tokenizer under a separate directory name.
model.save_pretrained(save_directory="/kaggle/working//my_bert_classifier1")
tokenizer.save_pretrained(save_directory="/kaggle/working//my_bert_classifier1")

Configuration saved in /kaggle/working//my_bert_classifier/config.json
Model weights saved in /kaggle/working//my_bert_classifier/model.safetensors
tokenizer config file saved in /kaggle/working//my_bert_classifier/tokenizer_config.json
Special tokens file saved in /kaggle/working//my_bert_classifier/special_tokens_map.json


('/kaggle/working//my_bert_classifier/tokenizer_config.json',
 '/kaggle/working//my_bert_classifier/special_tokens_map.json',
 '/kaggle/working//my_bert_classifier/vocab.txt',
 '/kaggle/working//my_bert_classifier/added_tokens.json')