In [55]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder

os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [24]:
# pip install transformers sentencepiece
import torch
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny")
model = AutoModel.from_pretrained("cointegrated/rubert-tiny")


# model.cuda()  # uncomment it if you have a GPU

def embed_bert_cls(text, model, tokenizer):
    t = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**{k: v.to(model.device) for k, v in t.items()})
    embeddings = model_output.last_hidden_state[:, 0, :]
    embeddings = torch.nn.functional.normalize(embeddings)
    return embeddings[0].cpu().numpy()


print(embed_bert_cls('привет друзья', model, tokenizer).shape)
# (312,)


(312,)


In [56]:
label_encoder = LabelEncoder()

In [57]:
train_df = pd.read_csv('train_data.csv')
test_df = pd.read_csv('test_data.csv')
val_df = pd.read_csv('val_data.csv')

In [58]:
train_df = train_df.rename({'writer': 'label'}, axis=1).drop('book', axis=1)
test_df = test_df.rename({'writer': 'label'}, axis=1).drop('book', axis=1)
val_df = val_df.rename({'writer': 'label'}, axis=1).drop('book', axis=1)

In [59]:
train_df['label'] = label_encoder.fit_transform(train_df['label'])
test_df['label'] = label_encoder.fit_transform(test_df['label'])
val_df['label'] = label_encoder.fit_transform(val_df['label'])

In [60]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46915 entries, 0 to 46914
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   46915 non-null  int64 
 1   text    46915 non-null  object
dtypes: int64(1), object(1)
memory usage: 733.2+ KB


In [61]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import DatasetDict, Dataset, load_dataset

In [62]:
# Загрузка токенайзера и модели
model_name = "cointegrated/rubert-tiny"
# model_name = "google-bert/bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [73]:
# Создание датасета из DataFrame
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

In [40]:
train_dataset.

['writer', 'book', 'text']

In [78]:
# Создание объекта DatasetDict
dataset = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

In [65]:
# Функция предобработки данных
def preprocess_function(examples):
    tokenized_data = tokenizer(examples['text'], truncation=True, padding=True, max_length=128)
    return tokenized_data

In [79]:
# Токенизация данных
tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/46915 [00:00<?, ? examples/s]

Map:   0%|          | 0/9119 [00:00<?, ? examples/s]

Map:   0%|          | 0/9118 [00:00<?, ? examples/s]

In [67]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 46915
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9119
    })
})

In [34]:
train_df['writer'].value_counts()

writer
Dostoevsky           3157
Fray                 2677
Sergeev-Thsenskiy    2553
Solzhenitsin         2422
Kazantsev            2156
Prishvin             2042
Paustovskiy          1956
Tolstoy              1926
Leskov               1906
Chekhov              1829
Kataev               1705
Turgenev             1591
Ostrovsky            1568
Gorky                1504
Belyaev              1286
Kuprin               1111
Grin                 1104
Averchenko           1068
Pikul                1019
Saltykov-schedrin     986
Bunin                 927
Fadeev                908
Pelevin               888
Ilf_petrov            858
Serafimovich          855
Zoschenko             837
Gogol                 780
Akunin                734
Struhgatskie          692
Gaydar                556
Goncharov             457
Dovlatov              442
Pasternak             439
Bulgakov              428
Lukyanenko            412
Shukshin              410
Furmanov              402
Pushkin               324
Name:

In [68]:
# Загрузка модели
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=38)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [69]:
# Параметры обучения
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    # label_names=['labels'],
)


In [70]:
# Определение Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
)


In [71]:
# Запуск обучения
trainer.train()

Epoch,Training Loss,Validation Loss
1,2.5873,2.909965
2,2.2094,2.721918
3,2.0404,2.671815


TrainOutput(global_step=17595, training_loss=2.4475792770190616, metrics={'train_runtime': 754.921, 'train_samples_per_second': 186.437, 'train_steps_per_second': 23.307, 'total_flos': 260688672668160.0, 'train_loss': 2.4475792770190616, 'epoch': 3.0})

In [80]:

test_dataset = tokenized_datasets['test']

# Используем метод predict для предсказания на тестовых данных
predictions = trainer.predict(test_dataset)



In [89]:
test_dataset

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 9118
})

In [82]:
# Получение предсказанных классов (если это задача классификации)
predicted_labels = predictions.predictions.argmax(axis=1)
predicted_labels


array([ 9, 18, 35, ..., 24, 30, 14])

In [87]:
from sklearn.metrics import accuracy_score, f1_score


In [90]:
accuracy = accuracy_score(test_dataset['label'], predicted_labels)

print(f'Точность модели: {accuracy}')

Точность модели: 0.2779118227681509


In [91]:
f1 = f1_score(test_dataset['label'], predicted_labels, average='macro')
print(f'F1 модели: {f1}')

F1 модели: 0.173322047362297


In [None]:
# # Если вам нужно также сохранить оценки вероятностей
# predicted_probabilities = predictions.predictions

In [76]:
predictions = trainer.predict(test_dataset['text'])

TypeError: vars() argument must have __dict__ attribute

In [None]:
# Сохранение модели
trainer.save_model("./rubert-tiny-finetuned")