# Ноутбук с BERT и RoBERTa + их fine-tuning

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup
)
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the raw pair tables from the Colab filesystem.
train = pd.read_csv('/content/train.csv')
test = pd.read_csv('/content/test.csv')

# Keep only training rows whose target is one of the four known classes.
valid_classes = ['relevant_minus', 'relevant', 'no_relevant', 'relevant_plus']
has_valid_target = train['target'].isin(valid_classes)
train = train.loc[has_valid_target].copy()

In [None]:
# Both tables share the item-id renames; only the training table has a target.
pair_id_columns = {
    'leftItemId': 'id1_item',
    'rightItemId': 'id2_item',
}

df_train = train.rename(columns={**pair_id_columns, 'target': 'relevant'})
df_test = test.rename(columns=pair_id_columns)

In [None]:
# Item catalogue; merging_data() below reads its 'itemId', 'title' and 'content' columns.
df_items = pd.read_parquet('/content/items (3).parquet')

def merging_data(df, df_items):
    """Attach title/content of both items of every pair.

    ``df`` must have ``id1_item`` and ``id2_item`` columns; ``df_items`` must
    have ``itemId``, ``title`` and ``content``. Both joins are left joins, so
    pairs whose item is missing from ``df_items`` keep NaN in the text columns.
    The result gains ``title_1``, ``content_1``, ``title_2``, ``content_2``.
    """
    left_items = df_items.rename(columns={'itemId': 'id1_item'})[['id1_item', 'title', 'content']]
    right_items = df_items.rename(columns={'itemId': 'id2_item'})[['id2_item', 'title', 'content']]

    with_left = df.merge(left_items, on='id1_item', how='left')
    # The second merge collides on 'title'/'content'; the suffixes turn the
    # columns from the first merge into *_1 and the new ones into *_2.
    return with_left.merge(right_items, on='id2_item', how='left', suffixes=('_1', '_2'))

# Enrich both pair tables with the text of their left and right items.
df_train = merging_data(df_train, df_items)
df_test = merging_data(df_test, df_items)

In [None]:
def preprocess_text(text, max_length=400):
    """Normalise one text cell for tokenisation.

    Missing values (None / NaN) become an empty string. Any other value is
    converted to ``str``, every run of whitespace (including newlines, carriage
    returns and tabs) is collapsed to a single space, and the result is cut to
    at most ``max_length`` characters.

    Args:
        text: Raw cell value; may be None/NaN or a non-string scalar.
        max_length: Maximum number of characters kept (default 400, the value
            previously hard-coded).

    Returns:
        The cleaned, truncated string.
    """
    if pd.isna(text):
        return ""
    # str.split() with no argument splits on any whitespace, so it already
    # covers '\n', '\r' and '\t' and drops leading/trailing blanks.
    collapsed = ' '.join(str(text).split())
    return collapsed[:max_length]

In [None]:
# For each table and each side of the pair, join the cleaned title and content
# with a literal " [SEP] " marker into text1 / text2.
for frame in (df_train, df_test):
    for side in ('1', '2'):
        clean_title = frame['title_' + side].apply(preprocess_text)
        clean_content = frame['content_' + side].apply(preprocess_text)
        frame['text' + side] = clean_title + " [SEP] " + clean_content


In [None]:
# Checkpoint used for both the tokenizer and the classifier below.
# NOTE(review): the captured output further down mentions
# 'cointegrated/rubert-tiny2', so it was produced with a different model_name.
model_name = 'bert-base-multilingual-cased'  
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Encode the text targets as class indices and derive 'balanced' class weights
# from their frequencies.
label_to_index = {'no_relevant': 0, 'relevant_minus': 1, 'relevant': 2, 'relevant_plus': 3}
labels = df_train['relevant'].map(label_to_index)

balanced_weights = compute_class_weight('balanced', classes=np.unique(labels), y=labels.values)
class_weights = torch.tensor(balanced_weights, dtype=torch.float)

In [None]:
class ImprovedPostPairsDataset(Dataset):
    """Torch dataset that tokenises (text1, text2) pairs row by row.

    Each item is a dict with ``input_ids`` and ``attention_mask`` (1-D tensors
    of length ``max_length``). When the tokenizer produces ``token_type_ids``
    they are included as well, so the model can tell the two segments of the
    pair apart; consumers that do not need them can simply ignore the key.
    With ``is_train=True`` the item also carries ``labels``, the class index of
    the row's ``relevant`` value.
    """

    def __init__(self, dataframe, tokenizer, max_length=160, is_train=True):
        """Store the frame and tokenisation settings.

        Args:
            dataframe: Frame with ``text1``/``text2`` columns, plus ``relevant``
                when ``is_train`` is true.
            tokenizer: Callable accepting ``(text1, text2, **kwargs)`` and
                returning a mapping of tensors shaped ``(1, max_length)``.
            max_length: Padded/truncated sequence length for the pair.
            is_train: Whether items should include ``labels``.
        """
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.is_train = is_train
        self.label_map = {'no_relevant': 0, 'relevant_minus': 1, 'relevant': 2, 'relevant_plus': 3}

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        row = self.data.iloc[index]

        encoding = self.tokenizer(
            str(row['text1']),
            str(row['text2']),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        # return_tensors='pt' yields a leading batch dimension of 1; flatten it.
        item = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }
        # Not every tokenizer emits segment ids, so they are optional here.
        if 'token_type_ids' in encoding:
            item['token_type_ids'] = encoding['token_type_ids'].flatten()

        if self.is_train:
            item['labels'] = torch.tensor(self.label_map[row['relevant']], dtype=torch.long)
        return item


In [None]:
# Hold out 15% of the labelled pairs for validation, stratified by target class.
split_options = {
    'test_size': 0.15,
    'random_state': 42,
    'stratify': df_train['relevant'],
}
train_df, val_df = train_test_split(df_train, **split_options)

# Both splits carry labels, so both datasets are built in training mode.
train_dataset, val_dataset = (
    ImprovedPostPairsDataset(part, tokenizer, is_train=True)
    for part in (train_df, val_df)
)

In [None]:
# Same batch size and worker count for both loaders; only training is shuffled.
shared_loader_options = {'batch_size': 32, 'num_workers': 2}
train_loader = DataLoader(train_dataset, shuffle=True, **shared_loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **shared_loader_options)


In [None]:
# Class index -> label name; the inverse mapping is derived from it so the two
# cannot drift apart.
id_to_label = {0: 'no_relevant', 1: 'relevant_minus', 2: 'relevant', 3: 'relevant_plus'}
label_to_id = {name: idx for idx, name in id_to_label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(id_to_label),
    id2label=id_to_label,
    label2id=label_to_id,
    # Dropout overrides passed through to the model config as regularisation.
    hidden_dropout_prob=0.2,
    attention_probs_dropout_prob=0.2,
)

In [None]:
# Prefer the GPU when one is visible; the model and the loss weights must live
# on the same device as the batches.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
model.to(device)
class_weights = class_weights.to(device)

In [None]:
# Schedule length: one scheduler step per batch over 3 epochs, with the first
# 10% of the steps used as linear warm-up.
total_steps = len(train_loader) * 3
warmup_steps = int(total_steps * 0.1)

optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

# Cross-entropy weighted by the class weights computed earlier.
loss_fn = nn.CrossEntropyLoss(weight=class_weights)


In [6]:

def _batch_logits(batch):
    """Move one collated batch to ``device`` and return the model's logits.

    ``token_type_ids`` are forwarded only when the dataset supplies them, so
    training, validation and test inference always feed the model the same set
    of inputs. No ``labels`` are passed: the loss is computed by ``loss_fn``,
    so the model's own built-in loss would be wasted work.
    """
    model_inputs = {
        'input_ids': batch['input_ids'].to(device),
        'attention_mask': batch['attention_mask'].to(device),
    }
    if 'token_type_ids' in batch:
        model_inputs['token_type_ids'] = batch['token_type_ids'].to(device)
    return model(**model_inputs).logits


# Start below any achievable score so the first epoch always writes
# 'best_model.pt'; otherwise the load after the loop could hit a missing file.
best_val_f1 = float('-inf')
patience = 2
patience_counter = 0

# The epoch count must match the one used to size the LR schedule.
for epoch in range(3):
    model.train()
    total_loss = 0

    for batch_idx, batch in enumerate(train_loader):
        labels = batch['labels'].to(device)

        logits = _batch_logits(batch)
        loss = loss_fn(logits, labels)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        total_loss += loss.item()

        if batch_idx % 20 == 0:
            print(f'Epoch {epoch+1}, Batch {batch_idx}, Loss: {loss.item():.4f}')

    # Validation pass: collect predictions, true labels and probabilities.
    model.eval()
    val_predictions = []
    val_true_labels = []
    val_probs = []

    with torch.no_grad():
        for batch in val_loader:
            logits = _batch_logits(batch)
            probs = torch.softmax(logits, dim=1)
            predictions = torch.argmax(logits, dim=1)

            val_predictions.extend(predictions.cpu().numpy())
            val_true_labels.extend(batch['labels'].numpy())
            val_probs.extend(probs.cpu().numpy())

    val_f1 = f1_score(val_true_labels, val_predictions, average='weighted')
    print(f'Epoch {epoch+1}, Average Loss: {total_loss/len(train_loader):.4f}, Val Weighted F1: {val_f1:.6f}')

    # Early stopping: checkpoint on improvement, stop after `patience`
    # consecutive epochs without one.
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        torch.save(model.state_dict(), 'best_model.pt')
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break

# Restore the best checkpoint; map_location keeps the load working even when
# the weights were saved from a different device.
model.load_state_dict(torch.load('best_model.pt', map_location=device))
model.eval()

test_dataset = ImprovedPostPairsDataset(df_test, tokenizer, is_train=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

test_predictions = []
test_probs = []
reverse_label_map = {0: 'no_relevant', 1: 'relevant_minus', 2: 'relevant', 3: 'relevant_plus'}

with torch.no_grad():
    for batch in test_loader:
        logits = _batch_logits(batch)
        probs = torch.softmax(logits, dim=1)
        predictions = torch.argmax(logits, dim=1)

        test_predictions.extend(predictions.cpu().numpy())
        test_probs.extend(probs.cpu().numpy())

test_probs = np.array(test_probs)

# Per-class minimum probability required to accept a prediction as confident.
# NOTE(review): every threshold is >= 0.5 and the probabilities of a row sum
# to 1, so a non-argmax class can only pass its threshold on an exact 0.5/0.5
# tie; in practice the result equals the plain argmax label.
confidence_thresholds = {
    'no_relevant': 0.55,
    'relevant_minus': 0.5,
    'relevant': 0.6,
    'relevant_plus': 0.65
}

final_test_predictions = []
for row_idx, class_probs in enumerate(test_probs):
    top_class = test_predictions[row_idx]
    chosen_label = reverse_label_map[top_class]

    if class_probs[top_class] < confidence_thresholds[chosen_label]:
        # The argmax class is not confident enough: look for classes that do
        # clear their own threshold and take the most probable of them.
        passing = [
            (class_idx, class_prob)
            for class_idx, class_prob in enumerate(class_probs)
            if class_prob >= confidence_thresholds[reverse_label_map[class_idx]]
        ]
        if passing:
            # max() keeps the first of equal probabilities, like a stable
            # descending sort followed by taking element 0.
            fallback_class = max(passing, key=lambda pair: pair[1])[0]
        else:
            # Nothing is confident: fall back to the most probable class.
            fallback_class = np.argmax(class_probs)
        chosen_label = reverse_label_map[fallback_class]

    final_test_predictions.append(chosen_label)

# Build the submission table: a running row index plus the predicted label.
submission_columns = {
    'Unnamed: 0': range(len(df_test)),
    'target': final_test_predictions,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission.csv', index=False)

print(
    "Submission file created successfully!",
    f"Best validation F1: {best_val_f1:.6f}",
    "Sample predictions:",
    sep="\n",
)
print(submission.head(10))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda
Epoch 1, Batch 0, Loss: 1.4018
Epoch 1, Batch 20, Loss: 1.3842
Epoch 1, Batch 40, Loss: 1.3879
Epoch 1, Batch 60, Loss: 1.4091
Epoch 1, Batch 80, Loss: 1.3840
Epoch 1, Batch 100, Loss: 1.3803
Epoch 1, Batch 120, Loss: 1.3945
Epoch 1, Batch 140, Loss: 1.3858
Epoch 1, Batch 160, Loss: 1.3879
Epoch 1, Batch 180, Loss: 1.3789
Epoch 1, Batch 200, Loss: 1.3867
Epoch 1, Batch 220, Loss: 1.3934
Epoch 1, Batch 240, Loss: 1.4046
Epoch 1, Batch 260, Loss: 1.3905
Epoch 1, Batch 280, Loss: 1.3758
Epoch 1, Batch 300, Loss: 1.3960
Epoch 1, Batch 320, Loss: 1.3818
Epoch 1, Batch 340, Loss: 1.3755
Epoch 1, Batch 360, Loss: 1.3805
Epoch 1, Batch 380, Loss: 1.3940
Epoch 1, Batch 400, Loss: 1.3898
Epoch 1, Batch 420, Loss: 1.3891
Epoch 1, Batch 440, Loss: 1.3913
Epoch 1, Batch 460, Loss: 1.3682
Epoch 1, Batch 480, Loss: 1.3698
Epoch 1, Batch 500, Loss: 1.3812
Epoch 1, Batch 520, Loss: 1.3883
Epoch 1, Batch 540, Loss: 1.3853
Epoch 1, Batch 560, Loss: 1.3548
Epoch 1, Batch 580, Loss: 1.39

# BERT и RoBERTa («Berta»): эмбеддинги + CatBoost

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
from catboost import CatBoostClassifier
import numpy as np


# ------------------------------
# 1. Load the plain BERT encoder
# ------------------------------
MODEL_NAME = "bert-base-uncased"

# Both objects are module-level globals that bert_embed() reads.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
bert_model = AutoModel.from_pretrained(MODEL_NAME)


# ------------------------------
# 2. BERT embedding helper
# ------------------------------
def bert_embed(texts, batch_size=16):
    """Return one embedding row per text, taken from the first token.

    Uses the module-level ``tokenizer`` and ``bert_model``. Texts are encoded
    in chunks of ``batch_size`` (padded per chunk, truncated to 256 tokens) and
    the hidden state at position 0 of the last layer is used as the embedding.

    Args:
        texts: Sequence of strings.
        batch_size: Number of texts encoded per forward pass.

    Returns:
        ``numpy`` array with one row per text; an empty input yields an array
        with zero rows instead of an error from ``np.vstack``.
    """
    bert_model.eval()

    if len(texts) == 0:
        return np.zeros((0, bert_model.config.hidden_size), dtype=np.float32)

    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]

        tokens = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=256,
            return_tensors='pt'
        )

        with torch.no_grad():
            outputs = bert_model(**tokens)

        # Position 0 holds the CLS token; .cpu() keeps the conversion valid if
        # the model is ever moved to an accelerator.
        cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.append(cls_embeddings)

    return np.vstack(embeddings)


# ------------------------------
# 3. Toy training data
# ------------------------------
texts = [
    "This movie is amazing!",
    "I hated every second of it.",
    "The service was excellent.",
    "I will never come back.",
]
labels = [1, 0, 1, 0]


# ------------------------------
# 4. BERT embeddings for the training texts
# ------------------------------
X = bert_embed(texts)
print("Embedding shape:", X.shape)


# ------------------------------
# 5. Fit a CatBoostClassifier on the embeddings
# ------------------------------
catboost_options = {
    'iterations': 300,
    'depth': 6,
    'learning_rate': 0.05,
    'loss_function': "Logloss",
    'verbose': 50,
}
model = CatBoostClassifier(**catboost_options)
model.fit(X, labels)


# ------------------------------
# 6. Predict class probabilities for unseen texts
# ------------------------------
test_texts = ["Fantastic product!", "Terrible experience."]
X_test = bert_embed(test_texts)
preds = model.predict_proba(X_test)
print(preds)


In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
from catboost import CatBoostClassifier
import numpy as np


# ============================================================
# 1. Load RoBERTa (called "berta" in this notebook)
# ============================================================
MODEL_NAME = "roberta-base"

# Both objects are module-level globals that berta_embed() reads.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
roberta_model = AutoModel.from_pretrained(MODEL_NAME)


# ============================================================
# 2. RoBERTa embedding helper
# ============================================================
def berta_embed(texts, batch_size=16):
    """Return one embedding row per text, taken from the first token.

    Uses the module-level ``tokenizer`` and ``roberta_model``. Texts are
    encoded in chunks of ``batch_size`` (padded per chunk, truncated to 256
    tokens) and the hidden state at position 0 of the last layer is used as
    the embedding.

    Args:
        texts: Sequence of strings.
        batch_size: Number of texts encoded per forward pass.

    Returns:
        ``numpy`` array with one row per text; an empty input yields an array
        with zero rows instead of an error from ``np.vstack``.
    """
    roberta_model.eval()

    if len(texts) == 0:
        return np.zeros((0, roberta_model.config.hidden_size), dtype=np.float32)

    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]

        tokens = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=256,
            return_tensors='pt'
        )

        with torch.no_grad():
            outputs = roberta_model(**tokens)

        # For RoBERTa the <s> token at position 0 plays the role of CLS.
        cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.append(cls_embeddings)

    return np.vstack(embeddings)


# ============================================================
# 3. Toy training data
# ============================================================
texts = [
    "This movie is amazing!",
    "I hated every minute of it.",
    "The service was excellent.",
    "I will never return here.",
]
labels = [1, 0, 1, 0]   # 1 = positive, 0 = negative


# ============================================================
# 4. RoBERTa embeddings for the training texts
# ============================================================
X = berta_embed(texts)
print("Размер эмбеддингов:", X.shape)   # (N, 768)


# ============================================================
# 5. Fit a CatBoostClassifier on the embeddings
# ============================================================
catboost_options = {
    'iterations': 300,
    'learning_rate': 0.05,
    'depth': 6,
    'loss_function': "Logloss",
    'verbose': 50,
}
model = CatBoostClassifier(**catboost_options)
model.fit(X, labels)


# ============================================================
# 6. Apply the model to unseen texts
# ============================================================
test_texts = ["Fantastic product!", "Terrible experience."]
X_test = berta_embed(test_texts)
preds = model.predict_proba(X_test)

print("Предсказания:")
print(preds)


# Fine-tuning и объединение с TF-IDF признаками

In [None]:
"""
Полный скрипт: Fine-tune BERT и RoBERTa, извлечение эмбеддингов,
объединение с TF-IDF признаками (feature stacking) и обучение CatBoost.

Зависимости:
    pip install transformers datasets torch scikit-learn catboost tqdm

Использование:
    - Подготовь CSV с колонками: 'text' и 'label' или передай списки texts/labels
    - Настрой параметры в разделе CONFIG

Скрипт выполняет для каждой модели (BERT и RoBERTa):
    1) Fine-tune модели на train/val
    2) Извлекает эмбеддинги (mean-pooling по токенам)
    3) Строит TF-IDF и уменьшает размерность (TruncatedSVD)
    4) Конкатенирует эмбеддинги + TF-IDF
    5) Обучает CatBoost на полученных признаках

Файл сохраняет: fine-tuned модели и CatBoost-классификаторы.
"""

import os
import random
import numpy as np
import pandas as pd
from typing import List, Tuple

import torch
from transformers import (
    AutoTokenizer,
    AutoModel,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from tqdm import tqdm


# ========== CONFIG ==========
RANDOM_SEED = 42
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
# Short key -> Hugging Face checkpoint name; the pipeline runs once per entry.
MODELS = {
    'bert': 'bert-base-uncased',
    'berta': 'roberta-base'  # "BERTA" means RoBERTa in this script
}
MAX_LENGTH = 256
BATCH_SIZE = 16
EPOCHS = 3
TFIDF_MAX_FEATURES = 20000  # vocabulary cap for the TF-IDF vectorizer
SVD_COMPONENTS = 200  # TF-IDF is reduced to this many SVD components
CATBOOST_ITERS = 500
OUTPUT_DIR = './outputs_bert_roberta'
# Created eagerly so later save calls can assume the directory exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)
# ============================


def set_seed(seed: int = 42):
    """Seed the Python, NumPy and PyTorch random generators with ``seed``.

    When CUDA is available, all CUDA devices are seeded as well.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


# Seed every RNG once, before any split or model is created.
set_seed(RANDOM_SEED)


# ------------------ Data helpers ------------------

def load_csv(path: str, text_col: str = 'text', label_col: str = 'label') -> Tuple[List[str], List[int]]:
    """Read a CSV and return its texts and labels as plain Python lists.

    Values in ``text_col`` are cast to ``str`` and values in ``label_col`` to
    ``int``; the two lists are returned as ``(texts, labels)``.
    """
    frame = pd.read_csv(path)
    text_values = frame[text_col].astype(str)
    label_values = frame[label_col].astype(int)
    return text_values.tolist(), label_values.tolist()


# ------------------ Fine-tune (Trainer) ------------------
class HFDataset(torch.utils.data.Dataset):
    """Dataset view over pre-tokenised encodings, optionally with labels.

    ``encodings`` is a mapping from field name to an indexable batch (it must
    contain ``input_ids``). Item ``idx`` is a dict with the ``idx``-th entry of
    every field, plus a ``labels`` long tensor when labels were supplied.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is the length of the input_ids batch.
        input_ids = self.encodings['input_ids']
        return len(input_ids)

    def __getitem__(self, idx):
        sample = {}
        for field_name, field_values in self.encodings.items():
            sample[field_name] = field_values[idx]
        if self.labels is None:
            return sample
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample


def fine_tune_model(model_name: str,
                    train_texts: List[str],
                    train_labels: List[int],
                    val_texts: List[str],
                    val_labels: List[int],
                    output_dir: str,
                    epochs: int = EPOCHS,
                    batch_size: int = BATCH_SIZE) -> str:
    """
    Fine-tune ``AutoModelForSequenceClassification`` and save the best model.

    The number of output classes is the number of distinct values in
    ``train_labels``. Evaluation, checkpointing and logging happen once per
    epoch; the checkpoint with the best validation accuracy is reloaded at the
    end and saved to ``<output_dir>/<model_name with '/' replaced by '_'>``.
    Intermediate checkpoints go to a ``checkpoints`` sub-directory of that
    path, so several models can share one ``output_dir`` without overwriting
    each other's checkpoints.

    Args:
        model_name: Hugging Face checkpoint name to start from.
        train_texts, train_labels: Training examples and integer labels.
        val_texts, val_labels: Validation examples and integer labels.
        output_dir: Root directory for checkpoints and the saved model.
        epochs: Number of training epochs.
        batch_size: Per-device batch size for training and evaluation.

    Returns:
        Path of the directory holding the saved fine-tuned model.
    """
    # Local import: only needed to probe which keyword names the installed
    # transformers version accepts.
    import inspect

    print(f"Fine-tuning {model_name} on device {DEVICE}...")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(set(train_labels)))

    # Tokenise each split in one call; padding=True pads to the longest
    # sequence of the split.
    train_enc = tokenizer(train_texts, truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='pt')
    val_enc = tokenizer(val_texts, truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='pt')

    train_dataset = HFDataset(train_enc, train_labels)
    val_dataset = HFDataset(val_enc, val_labels)

    save_path = os.path.join(output_dir, model_name.replace('/', '_'))

    training_kwargs = dict(
        output_dir=os.path.join(save_path, 'checkpoints'),
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        save_strategy='epoch',
        logging_strategy='epoch',
        learning_rate=2e-5,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model='accuracy',
        push_to_hub=False,
        fp16=torch.cuda.is_available(),
        seed=RANDOM_SEED,
    )
    # Recent transformers releases renamed `evaluation_strategy` to
    # `eval_strategy`; use whichever name the installed version accepts.
    accepted_args = inspect.signature(TrainingArguments.__init__).parameters
    eval_key = 'eval_strategy' if 'eval_strategy' in accepted_args else 'evaluation_strategy'
    training_kwargs[eval_key] = 'epoch'
    training_args = TrainingArguments(**training_kwargs)

    def compute_metrics(p):
        """Plain accuracy of the argmax predictions."""
        preds = np.argmax(p.predictions, axis=1)
        labels = p.label_ids
        acc = (preds == labels).mean()
        return {'accuracy': acc}

    trainer_kwargs = dict(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )
    # Likewise, `tokenizer=` was superseded by `processing_class=`.
    accepted_trainer_args = inspect.signature(Trainer.__init__).parameters
    tokenizer_key = 'processing_class' if 'processing_class' in accepted_trainer_args else 'tokenizer'
    trainer_kwargs[tokenizer_key] = tokenizer
    trainer = Trainer(**trainer_kwargs)

    trainer.train()

    trainer.save_model(save_path)
    print(f"Saved fine-tuned model to {save_path}")
    return save_path


# ------------------ Embedding extraction ------------------

def mean_pooling(last_hidden_state: torch.Tensor, attention_mask: torch.Tensor) -> np.ndarray:
    """Average token vectors over the positions the attention mask keeps.

    last_hidden_state: (batch, seq_len, hidden)
    attention_mask: (batch, seq_len)

    Returns a numpy array of shape (batch, hidden). The denominator is clamped
    to at least 1e-9, so a row whose mask is all zeros yields zeros rather
    than a division by zero.
    """
    weights = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
    weighted_sum = (last_hidden_state * weights).sum(1)
    token_counts = weights.sum(1).clamp(min=1e-9)
    pooled = weighted_sum / token_counts
    return pooled.cpu().numpy()


def extract_embeddings(model_dir: str, model_name: str, texts: List[str], batch_size: int = BATCH_SIZE) -> np.ndarray:
    """
    Load the encoder saved in ``model_dir`` and return mean-pooled embeddings.

    Args:
        model_dir: Directory of the saved fine-tuned model; it is loaded with
            ``AutoModel.from_pretrained``, so only the encoder is used.
        model_name: Checkpoint name used to load the tokenizer.
        texts: Texts to embed.
        batch_size: Number of texts encoded per forward pass.

    Returns:
        Array with one mean-pooled row per text. An empty ``texts`` yields an
        array with zero rows instead of an error from ``np.vstack``.
    """
    print(f"Extracting embeddings from {model_dir}... device={DEVICE}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_dir)
    model.to(DEVICE)
    model.eval()

    if len(texts) == 0:
        embeddings = np.zeros((0, model.config.hidden_size), dtype=np.float32)
        print(f"Embeddings shape: {embeddings.shape}")
        return embeddings

    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        enc = tokenizer(batch, truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='pt')
        input_ids = enc['input_ids'].to(DEVICE)
        attention_mask = enc['attention_mask'].to(DEVICE)

        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            last_hidden = outputs.last_hidden_state  # (batch, seq_len, hidden)
            # Padding positions are excluded from the average via the mask.
            emb = mean_pooling(last_hidden, attention_mask)
            embeddings.append(emb)

    embeddings = np.vstack(embeddings)
    print(f"Embeddings shape: {embeddings.shape}")
    return embeddings


# ------------------ TF-IDF + SVD ------------------

def build_tfidf_svd(train_texts: List[str], all_texts: List[str], max_features: int = TFIDF_MAX_FEATURES,
                    n_components: int = SVD_COMPONENTS):
    """
    Fit TF-IDF on ``train_texts`` and reduce the TF-IDF matrix of ``all_texts`` with SVD.

    The vectorizer (uni- and bi-grams) learns its vocabulary from the training
    texts only. The SVD is then fitted on the TF-IDF matrix of ``all_texts``.
    NOTE(review): when ``all_texts`` contains validation/test texts, the SVD
    projection has seen them; confirm this is intended.

    ``n_components`` is capped at the number of TF-IDF features, because the
    SVD cannot extract more components than there are features (small corpora
    would otherwise fail or silently return fewer columns).

    Returns:
        ``(tfidf, svd, svd_all)``: the fitted vectorizer, the fitted SVD, and
        the reduced matrix with one row per entry of ``all_texts``.
    """
    print("Fitting TF-IDF and SVD...")
    tfidf = TfidfVectorizer(max_features=max_features, ngram_range=(1, 2))
    tfidf.fit(train_texts)

    # Apply the train-fitted vocabulary to every text.
    tfidf_all = tfidf.transform(all_texts)

    n_features = tfidf_all.shape[1]
    effective_components = min(n_components, n_features)
    if effective_components != n_components:
        print(f"Reducing SVD components from {n_components} to {effective_components} "
              f"(only {n_features} TF-IDF features available)")

    svd = TruncatedSVD(n_components=effective_components, random_state=RANDOM_SEED)
    svd_all = svd.fit_transform(tfidf_all)

    print(f"TF-IDF -> SVD transformed shape: {svd_all.shape}")
    return tfidf, svd, svd_all


# ------------------ Обучение CatBoost на фичах ------------------

def train_catboost(X_train: np.ndarray, y_train: List[int], X_val: np.ndarray, y_val: List[int], save_path: str):
    """Train a CatBoost classifier on stacked features and save it.

    The validation set is used as the eval set and the best iteration on it is
    kept (``use_best_model=True``). The loss is ``Logloss`` for up to two
    classes in ``y_train`` and ``MultiClass`` otherwise, so the function also
    works when the labels have more than two classes.

    Args:
        X_train, y_train: Training features and integer labels.
        X_val, y_val: Validation features and labels.
        save_path: File the trained model is written to.

    Returns:
        The fitted ``CatBoostClassifier``.
    """
    n_classes = len(np.unique(y_train))
    loss_function = 'Logloss' if n_classes <= 2 else 'MultiClass'

    model = CatBoostClassifier(
        iterations=CATBOOST_ITERS,
        learning_rate=0.05,
        depth=6,
        loss_function=loss_function,
        eval_metric='Accuracy',
        verbose=50,
        random_seed=RANDOM_SEED,
    )

    model.fit(X_train, y_train, eval_set=(X_val, y_val), use_best_model=True)
    model.save_model(save_path)
    print(f"Saved CatBoost model to {save_path}")
    return model


# ------------------ Полный пайплайн для одной модели ------------------

def _split_prefer_stratified(items, labels, test_size):
    """Split with stratification when possible, else fall back to a plain split.

    A stratified split raises ``ValueError`` when some class has too few
    samples for the requested split (common with tiny datasets); in that case
    the same split is retried without stratification.
    """
    try:
        return train_test_split(items, labels, test_size=test_size,
                                random_state=RANDOM_SEED, stratify=labels)
    except ValueError as err:
        print(f"Stratified split not possible ({err}); using a non-stratified split.")
        return train_test_split(items, labels, test_size=test_size,
                                random_state=RANDOM_SEED)


def run_pipeline_for_model(model_key: str,
                           model_name: str,
                           texts: List[str],
                           labels: List[int],
                           output_dir: str = OUTPUT_DIR):
    """
    Run split -> fine-tune -> embeddings -> TF-IDF+SVD -> concat -> CatBoost.

    The data is split 70/15/15 into train/val/test. The fine-tuned model, the
    CatBoost model and the three embedding matrices are written under
    ``output_dir``.

    Args:
        model_key: Short name used in artefact file names.
        model_name: Hugging Face checkpoint to fine-tune.
        texts, labels: Full dataset.
        output_dir: Directory for all artefacts.

    Returns:
        Dict with ``model_key``, ``fine_tuned_dir``, ``catboost_path`` and
        ``test_accuracy``.
    """
    print('='*40)
    print(f"RUN pipeline for {model_key} ({model_name})")

    # 70% train, then the remaining 30% is halved into val and test.
    X_train_texts, X_temp_texts, y_train, y_temp = _split_prefer_stratified(texts, labels, test_size=0.3)
    X_val_texts, X_test_texts, y_val, y_test = _split_prefer_stratified(X_temp_texts, y_temp, test_size=0.5)

    # 1) Fine-tune
    model_save_dir = fine_tune_model(model_name, X_train_texts, y_train, X_val_texts, y_val, output_dir=os.path.join(output_dir, 'fine_tuned'), epochs=EPOCHS, batch_size=BATCH_SIZE)

    # 2) Embeddings for train/val/test from the fine-tuned encoder
    emb_train = extract_embeddings(model_save_dir, model_name, X_train_texts)
    emb_val = extract_embeddings(model_save_dir, model_name, X_val_texts)
    emb_test = extract_embeddings(model_save_dir, model_name, X_test_texts)

    # 3) TF-IDF + SVD features for all texts in one matrix (train, val, test order)
    all_texts = X_train_texts + X_val_texts + X_test_texts
    _, _, svd_all = build_tfidf_svd(X_train_texts, all_texts, max_features=TFIDF_MAX_FEATURES, n_components=SVD_COMPONENTS)

    # Slice the combined matrix back into the three splits.
    n_train = len(X_train_texts)
    n_val = len(X_val_texts)
    svd_train = svd_all[:n_train]
    svd_val = svd_all[n_train:n_train + n_val]
    svd_test = svd_all[n_train + n_val:]

    # 4) Concatenate embeddings with the SVD features
    X_train_stack = np.hstack([emb_train, svd_train])
    X_val_stack = np.hstack([emb_val, svd_val])
    X_test_stack = np.hstack([emb_test, svd_test])

    print(f"Stacked feature shapes: train={X_train_stack.shape}, val={X_val_stack.shape}, test={X_test_stack.shape}")

    # 5) Train CatBoost on the stacked features
    cb_save_path = os.path.join(output_dir, f'catboost_{model_key}.cbm')
    cb_model = train_catboost(X_train_stack, y_train, X_val_stack, y_val, cb_save_path)

    # Evaluate on the held-out test split. ravel() guards against predict()
    # returning a column vector, which would broadcast in the comparison.
    test_preds = np.asarray(cb_model.predict(X_test_stack)).ravel()
    test_acc = (test_preds == np.array(y_test)).mean()
    print(f"Test accuracy for {model_key}: {test_acc:.4f}")

    # Keep the raw embeddings as extra artefacts.
    np.save(os.path.join(output_dir, f'emb_train_{model_key}.npy'), emb_train)
    np.save(os.path.join(output_dir, f'emb_val_{model_key}.npy'), emb_val)
    np.save(os.path.join(output_dir, f'emb_test_{model_key}.npy'), emb_test)

    return {
        'model_key': model_key,
        'fine_tuned_dir': model_save_dir,
        'catboost_path': cb_save_path,
        'test_accuracy': test_acc,
    }


# ------------------ Пример вызова ------------------
if __name__ == '__main__':
    # Example entry point: read the dataset from a CSV (replace the path with
    # your own) or fall back to a tiny built-in toy dataset.
    DATA_CSV = 'data.csv'

    if os.path.exists(DATA_CSV):
        texts, labels = load_csv(DATA_CSV, text_col='text', label_col='label')
    else:
        print(f"Файл {DATA_CSV} не найден. Создайте CSV с колонками 'text' и 'label' или измените путь.")
        # Demo data: positive (1) and negative (0) reviews alternate.
        texts = [
            "This movie is amazing.",
            "I hated the film, it was terrible.",
            "Fantastic acting and story.",
            "Worst experience ever.",
            "I would recommend it to my friends.",
            "Not worth the time.",
            "Absolutely loved it!",
            "I will never watch it again.",
        ]
        labels = [1, 0, 1, 0, 1, 0, 1, 0]

    # Run the full pipeline once per configured model, keyed by its short name.
    results = {
        key: run_pipeline_for_model(key, name, texts, labels, output_dir=OUTPUT_DIR)
        for key, name in MODELS.items()
    }

    print('\nALL RESULTS:')
    print(results)
