In [3]:
!pip install -q peft evaluate datasets transformers scikit-learn torch fuzzywuzzy

In [23]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import evaluate
from torch.utils.data import DataLoader
import json
import time
import re
from fuzzywuzzy import fuzz
import warnings
warnings.filterwarnings('ignore')

In [25]:
import os
os.environ["WANDB_MODE"] = "disabled"
os.environ["WANDB_DISABLED"] = "true"  # Дополнительная защита

# Явное завершение любых существующих wandb-сессий
try:
    import wandb
    wandb.finish()
except ImportError:
    pass  # wandb не импортирован, ничего не делаем

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [6]:
model_name = "sberbank-ai/rugpt3large_based_on_gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/574 [00:00<?, ?B/s]

In [7]:
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)

config.json:   0%|          | 0.00/622 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


pytorch_model.bin:   0%|          | 0.00/3.14G [00:00<?, ?B/s]

## Категории и предобработка

In [16]:
categories = ['бытовая техника', 'обувь', 'одежда', 'посуда', 'текстиль', 'товары для детей', 'украшения и аксессуары', 'электроника', 'нет товара']
label2id = {cat: i for i, cat in enumerate(categories)}
id2label = {i: cat for cat, i in label2id.items()}

def preprocess_text(text):
    if pd.isna(text):
        return ""
    text = text.lower()
    text = re.sub(r'\s+', ' ', text.strip())
    text = re.sub(r'[^\w\s.,!?—–-]', ' ', text)
    if len(text) < 10:
        return ""
    return text

# Функция подготовки датасета для SFT
def prepare_dataset(df, tokenizer, is_train_val=True):
    def format_example(ex):
        prompt = f"Ты классификатор отзывов. Выбери категорию из: {', '.join(categories)}.\nОтзыв: {ex['text_clean']}\nКатегория: {ex['label_final']}\n"
        return {"prompt": prompt, "label": ex['label_final']}

    dataset = Dataset.from_pandas(df[['text_clean', 'label_final']])
    dataset = dataset.map(format_example)
    tokenized = dataset.map(lambda x: tokenizer(x["prompt"], truncation=True, padding="max_length", max_length=256), batched=True)

    # Удаляем ненужные колонки, сохраняем input_ids, attention_mask
    tokenized = tokenized.remove_columns(["text_clean", "label_final", "prompt"])
    if '__index_level_0__' in tokenized.column_names:
        tokenized = tokenized.remove_columns(["__index_level_0__"])

    tokenized.set_format("torch")

    # Добавляем labels для train/val
    if is_train_val:
        tokenized = tokenized.add_column("labels", [label2id[label] for label in df['label_final'].values])

    return tokenized

In [9]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    metric = evaluate.load("f1")
    return metric.compute(predictions=predictions, references=labels, average="weighted")

In [10]:
def predict_test(model, tokenizer, test_df, device, batch_size=4):
    model.eval()
    predictions = []
    times = []
    test_prompts = [f"Ты классификатор. Категории: {', '.join(categories)}.\nОтзыв: {row['text_clean']}\nКатегория: " for _, row in test_df.iterrows()]

    dataloader = DataLoader(test_prompts, batch_size=batch_size, collate_fn=lambda x: tokenizer(x, return_tensors="pt", padding=True, truncation=True, max_length=256).to(device))

    for batch in dataloader:
        start = time.time()
        with torch.no_grad():
            outputs = model.generate(**batch, max_new_tokens=10, temperature=0.1, do_sample=True, pad_token_id=tokenizer.eos_token_id)
        end = time.time()
        times.append(end - start)

        batch_preds = [tokenizer.decode(out, skip_special_tokens=True).split("Категория:")[-1].strip().lower() for out in outputs]
        batch_labels = [max(categories, key=lambda cat: fuzz.ratio(pred, cat.lower())) for pred in batch_preds]
        predictions.extend(batch_labels)

    avg_time = np.mean(times) / (len(test_prompts) / batch_size)
    print(f"Среднее время на пример: {avg_time:.2f}s (макс 5s)")

    test_df['predicted_category'] = predictions
    return test_df, avg_time

In [26]:
def predict_test(model, tokenizer, test_df, device, batch_size=4):
    model.eval()
    predictions = []
    times = []
    test_prompts = [f"Ты классификатор. Категории: {', '.join(categories)}.\nОтзыв: {row['text_clean']}\nКатегория: " for _, row in test_df.iterrows()]

    dataloader = DataLoader(test_prompts, batch_size=batch_size, collate_fn=lambda x: tokenizer(x, return_tensors="pt", padding=True, truncation=True, max_length=256).to(device))

    for batch in dataloader:
        start = time.time()
        with torch.no_grad():
            outputs = model.generate(**batch, max_new_tokens=10, temperature=0.1, do_sample=True, pad_token_id=tokenizer.eos_token_id)
        end = time.time()
        times.append(end - start)

        batch_preds = [tokenizer.decode(out, skip_special_tokens=True).split("Категория:")[-1].strip().lower() for out in outputs]
        batch_labels = [max(categories, key=lambda cat: fuzz.ratio(pred, cat.lower())) for pred in batch_preds]
        predictions.extend(batch_labels)

    avg_time = np.mean(times) / (len(test_prompts) / batch_size)
    print(f"Среднее время на пример: {avg_time:.2f}s (макс 5s)")

    test_df['predicted_category'] = predictions
    return test_df, avg_time

# 9. Основная функция
def train_and_eval(labeled_file, run_name="base", test_file="data/test.csv"):
    print(f"\n=== Full FT Ран: {run_name} на {labeled_file} ===")

    # Загрузка данных
    train_df = pd.read_csv(labeled_file)
    test_df = pd.read_csv(test_file)
    test_df['text_clean'] = test_df['text'].apply(preprocess_text)
    test_df = test_df[test_df['text_clean'] != ''].reset_index(drop=True)

    # Фильтрация классов с <2 примерами
    class_counts = train_df['label_final'].value_counts()
    valid_classes = class_counts[class_counts >= 2].index
    if len(valid_classes) < len(class_counts):
        print(f"Предупреждение: Удалены классы с <2 примерами: {set(class_counts.index) - set(valid_classes)}")
        train_df = train_df[train_df['label_final'].isin(valid_classes)].reset_index(drop=True)

    # Split
    if len(train_df) < 10:
        raise ValueError("Слишком мало данных после фильтрации для обучения")

    train_split, val_split = train_test_split(
        train_df, test_size=0.2, stratify=train_df['label_final'], random_state=42
    )
    print(f"Train: {len(train_split)}, Val: {len(val_split)}")
    print("Распределение train:\n", train_split['label_final'].value_counts())

    # Модель
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto"
    )
    model.resize_token_embeddings(len(tokenizer))
    model.train()

    # Датасеты
    train_ds = prepare_dataset(train_split, tokenizer, is_train_val=True)
    val_ds = prepare_dataset(val_split, tokenizer, is_train_val=True)

    # Collator
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    # Training args
    training_args = TrainingArguments(
        output_dir=f"./models_full_ft/{run_name}",
        num_train_epochs=3,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=8,
        warmup_steps=50,
        logging_steps=50,
        eval_steps=200,
        save_steps=400,  # Сделано кратным eval_steps (2x200=400)
        eval_strategy="steps",  # Исправлено
        save_strategy="steps",
        load_best_model_at_end=True,
        metric_for_best_model="eval_f1",
        greater_is_better=True,  # Добавлено: для F1 (выше лучше)
        fp16=True,
        dataloader_pin_memory=False,
        report_to=None
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    # Обучение
    trainer.train()

    # Валидация
    val_results = trainer.evaluate()
    print(f"Val Weighted F1: {val_results.get('eval_f1', 'N/A'):.4f}")

    # Сохранение
    trainer.save_model(f"./models_full_ft/{run_name}")
    tokenizer.save_pretrained(f"./models_full_ft/{run_name}")
    with open(f"./models_full_ft/{run_name}/metrics.json", "w") as f:
        json.dump(val_results, f)

    # Тест
    test_preds, avg_time = predict_test(model, tokenizer, test_df, device)
    test_preds.to_csv(f"predictions_full_ft_{run_name}.csv", index=False)

    print(f"Сохранено: модель в ./models_full_ft/{run_name}, предсказания в predictions_full_ft_{run_name}.csv")
    return val_results.get('eval_f1', 0.0), avg_time

In [12]:
!mkdir -p ./models_full_ft

In [27]:
f1_base, time_base = train_and_eval("train_labeled-2.csv", "base")
f1_aug, time_aug = train_and_eval("train_labeled_augmented.csv", "augmented")

print(f"\nСравнение Full FT:\nBase F1: {f1_base:.4f}, Time: {time_base:.2f}s\nAug F1: {f1_aug:.4f}, Time: {time_aug:.2f}s")


=== Full FT Ран: base на train_labeled-2.csv ===
Предупреждение: Удалены классы с <2 примерами: {'посуда'}
Train: 1441, Val: 361
Распределение train:
 label_final
одежда                    985
нет товара                157
украшения и аксессуары    101
товары для детей           97
электроника                60
бытовая техника            33
обувь                       6
текстиль                    2
Name: count, dtype: int64


Map:   0%|          | 0/1441 [00:00<?, ? examples/s]

Map:   0%|          | 0/1441 [00:00<?, ? examples/s]

Map:   0%|          | 0/361 [00:00<?, ? examples/s]

Map:   0%|          | 0/361 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`label` in this case) have excessive nesting (inputs type `list` where type `int` is expected).