In [1]:
!pip install transformers torch scikit-learn accelerate -U
!pip install datasets 

Collecting transformers
  Downloading transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
Collecting torch
  Downloading torch-2.9.1-cp312-cp312-win_amd64.whl.metadata (30 kB)
Collecting accelerate
  Downloading accelerate-1.11.0-py3-none-any.whl.metadata (19 kB)
Downloading transformers-4.57.1-py3-none-any.whl (12.0 MB)
   ---------------------------------------- 0.0/12.0 MB ? eta -:--:--
   - -------------------------------------- 0.5/12.0 MB 5.6 MB/s eta 0:00:03
   -- ------------------------------------- 0.8/12.0 MB 6.7 MB/s eta 0:00:02
   -- ------------------------------------- 0.8/12.0 MB 6.7 MB/s eta 0:00:02
   -- ------------------------------------- 0.8/12.0 MB 6.7 MB/s eta 0:00:02
   -- ------------------------------------- 0.8/12.0 MB 6.7 MB/s eta 0:00:02
   ----- ---------------------------------- 1.6/12.0 MB 1.2 MB/s eta 0:00:09
   ------ --------------------------------- 1.8/12.0 MB 1.4 MB/s eta 0:00:08
   ------ --------------------------------- 1.8/12.0 MB 1.4 MB/s e

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd
import numpy as np

# --- Notebook-wide configuration -------------------------------------------
MODEL_NAME = 'bert-base-uncased'  # checkpoint id handed to every from_pretrained() call
NUM_LABELS = 2                    # number of output classes of the classification head
SEED = 42                         # single seed for all RNGs seeded below

# Seed the RNGs before any model is instantiated: the sequence-classification
# head is freshly (randomly) initialised on load, so without a fixed seed a
# Restart & Run All would not reproduce the same metrics.
np.random.seed(SEED)
torch.manual_seed(SEED)

# Shared tokenizer; used for tokenization and saved next to each fine-tuned model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
def compute_metrics(eval_pred):
    """Turn raw evaluation output into accuracy and F1.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class for each
            row is the index of the largest logit along the last axis.

    Returns:
        A dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    raw_scores, gold_labels = eval_pred
    predicted_labels = np.argmax(raw_scores, axis=-1)

    return {
        "accuracy": accuracy_score(gold_labels, predicted_labels),
        # 'binary' averaging for the P/N (presumably positive/negative) task.
        "f1": f1_score(gold_labels, predicted_labels, average='binary'),
    }

In [None]:
def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch with the module-level tokenizer.

    The tokenizer is called with ``padding="max_length"`` and
    ``truncation=True``; its output is returned unchanged.
    """
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded

In [None]:
def _prepare_split(df: pd.DataFrame, desc: str) -> Dataset:
    """Convert a DataFrame into a tokenized, torch-formatted ``Dataset``."""
    dataset = Dataset.from_pandas(df.reset_index(drop=True))

    # Dataset.map() has no `disable_pbar` argument (passing it raises
    # TypeError); the progress bar is shown by default, labelled with `desc`.
    tokenized = dataset.map(tokenize_function, batched=True, desc=desc)

    # Drop only the columns that are really there. After reset_index(drop=True)
    # the frame has a plain RangeIndex, so from_pandas() normally creates no
    # "__index_level_0__" column, and remove_columns() raises on unknown names.
    removable = [
        column
        for column in ("text", "__index_level_0__")
        if column in tokenized.column_names
    ]
    tokenized = tokenized.remove_columns(removable)
    tokenized.set_format("torch")
    return tokenized


def fine_tune_on_dataset(train_df: pd.DataFrame, test_df: pd.DataFrame, dataset_name: str, epochs: int = 3):
    """Fine-tune ``MODEL_NAME`` on one dataset, evaluate it and save the result.

    Args:
        train_df: Training examples. Must contain a ``text`` column (read by
            ``tokenize_function``). NOTE(review): the label column is not
            visible in this notebook; presumably ``label``/``labels`` - confirm.
        test_df: Examples used for the per-epoch evaluation and for the final
            evaluation; same columns as ``train_df``.
        dataset_name: Used in log messages and in the output paths
            ``./results/<name>_ft`` and ``./saved_models/<name>_sentiment``.
        epochs: Number of training epochs.

    Returns:
        The metrics dict returned by ``trainer.evaluate()`` after training.
    """
    print(f"Запуск дообучения на датасете: {dataset_name}")

    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)

    print("Токенизация обучающего набора...")
    tokenized_train_dataset = _prepare_split(train_df, "Токенизация Train")
    print("Токенизация тестового набора...")
    tokenized_test_dataset = _prepare_split(test_df, "Токенизация Test")

    output_path = f"./results/{dataset_name}_ft"
    training_args = TrainingArguments(
        output_dir=output_path,
        num_train_epochs=epochs,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=0.01,
        # `evaluation_strategy` was renamed to `eval_strategy`; the transformers
        # release this notebook installs no longer accepts the old name.
        eval_strategy="epoch",
        # Must match the evaluation strategy for load_best_model_at_end.
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model='f1',  # pick the best checkpoint by F1 score
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_test_dataset,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    final_results = trainer.evaluate()
    print(f"Финальные результаты для {dataset_name}: {final_results}")

    # Save model and tokenizer side by side so the directory is self-contained.
    save_dir = f"./saved_models/{dataset_name}_sentiment"
    trainer.save_model(save_dir)
    tokenizer.save_pretrained(save_dir)

    print(f"Модель {dataset_name} сохранена.")
    return final_results

In [None]:
# NOTE(review): `data1` and `data2` are not defined in any visible cell of this
# notebook; a cell that loads them must run before this one (Restart & Run All
# fails otherwise). Each frame needs at least a `text` column.
datasets_to_process = {
    "Reviews_Dataset": pd.DataFrame(data1),
    "Comments_Dataset": pd.DataFrame(data2)
}

TEST_FRACTION = 0.2  # share of every dataset held out for evaluation
SPLIT_SEED = 42      # fixed seed so the train/test split is reproducible

# Collects the results (or the error message) of every dataset
all_results = {}

for name, df in datasets_to_process.items():
    try:
        # fine_tune_on_dataset expects (train_df, test_df, dataset_name); the
        # frame is therefore shuffled and split here before the call.
        # NOTE(review): this is a plain random split; stratify by the label
        # column if the classes are imbalanced.
        shuffled = df.sample(frac=1.0, random_state=SPLIT_SEED).reset_index(drop=True)
        n_test = max(1, int(len(shuffled) * TEST_FRACTION))
        test_df = shuffled.iloc[:n_test]
        train_df = shuffled.iloc[n_test:]

        results = fine_tune_on_dataset(train_df, test_df, name, epochs=3)
        all_results[name] = results
    except Exception as e:
        # Best effort on purpose: one failing dataset must not stop the others.
        print(f"!!! Ошибка при обработке датасета {name}: {e}")
        all_results[name] = {"error": str(e)}

print("\n--- Сводная таблица результатов ---")
print(pd.DataFrame(all_results).T)