In [1]:
import pickle

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import classification_report
from datasets import Dataset, ClassLabel, load_dataset
import torch
from torch.utils.data import Dataset

In [3]:
with open('../dataset_raw.pickle', 'rb') as handle:
    data = pickle.load(handle)
df = pd.DataFrame.from_dict(data)
df.head()

Unnamed: 0,example,category
0,"But when we reached Shanghai, I felt so terrif...",№1 ЛСВ 1 осмотр
1,the one event that Pliny is famous for actuall...,№1 ЛСВ 1 осмотр
2,"In an unprecedented legal move, Mr Justice Hod...",№1 ЛСВ 1 осмотр
3,Doctor Kagawa upped his estimate of latency to...,№1 ЛСВ 1 осмотр
4,and suddenly Eva was above the park and Zozobr...,№1 ЛСВ 1 осмотр


In [4]:
def replace_categories(df):
    unique_labels = df['category'].unique()
    label_mapping = {label: i for i, label in enumerate(unique_labels)}
    df['category'] = df['category'].map(label_mapping)
    return df
df = replace_categories(df)
distinct = df.groupby(['category'])['category'].count()
print(distinct)

category
0     100
1      13
2      58
3    1927
4      82
5    1652
6     273
7     116
8    2990
9      30
Name: category, dtype: int64


In [5]:
# Split the dataset into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [6]:
# Initialize the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
num_labels = 10
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
train_encodings = tokenizer(list(train_df['example']), truncation=True, padding=True, return_tensors='pt', max_length=512)
test_encodings = tokenizer(list(test_df['example']), truncation=True, padding=True, return_tensors='pt', max_length=512)

In [8]:
# Prepare your labels
train_labels = torch.tensor(list(train_df['category'].values))
test_labels = torch.tensor(list(test_df['category'].values))

In [9]:
# Define the training arguments
training_args = TrainingArguments(
    output_dir="./bert_output",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=1000,
    save_total_limit=2,
)

In [10]:
# Create a Trainer for training and evaluation
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=torch.utils.data.TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_encodings['token_type_ids'], train_labels),
    eval_dataset=torch.utils.data.TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_encodings['token_type_ids'], test_labels),
)

In [11]:
# Train the model
trainer.train()

TypeError: vars() argument must have __dict__ attribute

In [None]:
# Evaluate the model
results = trainer.evaluate(test_data)

# Generate predictions
predictions = trainer.predict(test_data)

# Get predicted labels
pred_labels = predictions.predictions.argmax(-1)

# Get true labels from the test set
true_labels = test_data['category']

# Print the classification report
report = classification_report(true_labels, pred_labels)
print(report)