In [52]:
import pickle

import pandas as pd
import torch
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments


In [53]:
# Load the posts table and keep only rows usable for supervised training.
# Both 'Text' (model input) and 'categoryname' (target) are read further down,
# so a row missing either value is dropped before the 10,000-row cap is applied.
# NOTE(review): absolute Colab Drive path — consider a configurable data dir.
# NOTE(review): head(10000) takes the first rows in file order, presumably to
# bound training time — confirm the file is not sorted by category.
df = (
    pd.read_parquet('/content/drive/MyDrive/data/posts.parquet')
    .dropna(subset=['Text', 'categoryname'])
    .head(10000)
)

In [54]:
# Pull the model inputs and their raw category names out of the frame as two
# plain Python lists that stay aligned row by row.
texts, labels = (df[column].tolist() for column in ('Text', 'categoryname'))

In [55]:
# Turn each distinct category name into an integer id. The fitted encoder is
# kept in `le` because later cells use `le.classes_` to map ids back to names.
le = LabelEncoder()
labels_encoded = le.fit_transform(labels)

# One classifier output per distinct category.
num_labels = le.classes_.shape[0]

In [56]:
# Hold out 20% of the samples for validation. Stratifying on the encoded
# labels keeps the category proportions the same in both splits, and the fixed
# random_state makes the split repeatable.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels_encoded,
    test_size=0.2,
    stratify=labels_encoded,
    random_state=42,
)

In [57]:
# NOTE(review): the category names printed by the final report are Russian,
# while 'bert-base-uncased' is an English checkpoint — consider a multilingual
# one. The tokenizer and the model must be switched together.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Shared tokenisation settings: cut every text to at most 128 tokens and pad
# the shorter ones so each split comes back as equal-length sequences.
encode_kwargs = dict(truncation=True, padding=True, max_length=128)

train_encodings = tokenizer(train_texts, **encode_kwargs)
val_encodings = tokenizer(val_texts, **encode_kwargs)

In [58]:
class PostsDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer class labels.

    Args:
        encodings: Mapping with 'input_ids' and 'attention_mask' entries, each
            indexable by sample position.
        labels: Sequence of integer class ids, one per sample; its length
            defines the dataset size.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        The dict has exactly the keys 'input_ids', 'attention_mask' and
        'labels'.
        """
        return {
            'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
            'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
            # Force int64: without an explicit dtype the tensor inherits the
            # width of the incoming label (e.g. numpy int32), and cross-entropy
            # targets must be torch.long.
            'labels': torch.tensor(self.labels[idx], dtype=torch.long),
        }


In [59]:
# Wrap each split's encodings together with its labels so they can be handed
# to the Trainer as datasets.
train_dataset, val_dataset = (
    PostsDataset(train_encodings, train_labels),
    PostsDataset(val_encodings, val_labels),
)

In [60]:
# Seed torch's global RNG before building the model: the classification head
# is newly initialised rather than loaded from the checkpoint (see the warning
# printed below), so without a fixed seed every fresh run starts from
# different head weights. 42 matches the random_state used for the split.
torch.manual_seed(42)

# Pretrained BERT encoder with a classification head sized to the number of
# distinct categories.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_labels,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [64]:
# Fine-tuning configuration, grouped as bookkeeping first and optimisation
# settings second.
training_args = TrainingArguments(
    # --- output / logging ---
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,   # training loss is reported every 10 steps
    save_strategy='no',
    report_to="none",
    # --- optimisation ---
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    # NOTE(review): the recorded run ends at global_step=1500, so a 500-step
    # warmup covers a third of training — confirm this is intended.
    warmup_steps=500,
    weight_decay=0.01,
)

# The validation split is registered as the Trainer's eval_dataset; the final
# cell also passes it explicitly to trainer.predict().
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [65]:
# Run fine-tuning. The returned summary is left as the cell's last expression
# so the notebook still displays it.
train_result = trainer.train()
train_result

Step,Training Loss
10,1.1075
20,1.1145
30,1.1039
40,1.1037
50,1.0914
60,1.1035
70,1.1157
80,1.0798
90,1.0966
100,1.1033


TrainOutput(global_step=1500, training_loss=0.5477815378506978, metrics={'train_runtime': 520.163, 'train_samples_per_second': 46.139, 'train_steps_per_second': 2.884, 'total_flos': 1578680506368000.0, 'train_loss': 0.5477815378506978, 'epoch': 3.0})

In [66]:
# Score the held-out split: the predicted class id for each row is the
# position of its largest value along the last axis of the predictions.
preds = trainer.predict(val_dataset)
y_pred = preds.predictions.argmax(axis=-1)

# Pass the full id range explicitly so the rows of the report always line up
# with le.classes_, even if some class is absent from val_labels / y_pred.
print(classification_report(
    val_labels,
    y_pred,
    labels=list(range(len(le.classes_))),
    target_names=le.classes_,
))

# Persist everything needed to reuse the classifier: model weights and
# tokenizer files in one directory, plus the fitted label encoder for mapping
# predicted ids back to category names.
model.save_pretrained('category_bert_model')
tokenizer.save_pretrained('category_bert_model')
# `pickle` is imported in the notebook's imports cell.
# NOTE: only unpickle this file from a trusted location — pickle.load can
# execute arbitrary code.
with open('category_label_encoder.pkl', 'wb') as f:
    pickle.dump(le, f)

                       precision    recall  f1-score   support

Аксессуары и Запчасти       0.83      0.81      0.82       622
  Снаряжение и защита       0.89      0.92      0.90       735
 Страйкбольное оружие       0.93      0.92      0.93       643

             accuracy                           0.88      2000
            macro avg       0.88      0.88      0.88      2000
         weighted avg       0.88      0.88      0.88      2000

