# Прогноз дружбы с помощью CatBoost


In [45]:
import warnings

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from collections import defaultdict
from pathlib import Path

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import (
    confusion_matrix,
    roc_auc_score,
    roc_curve,
    accuracy_score,
)
from sklearn.utils import resample
from catboost import CatBoostClassifier, Pool

warnings.filterwarnings('ignore')


## 1. Загрузка и первичный анализ данных


In [46]:
def load_and_explore_data(features_file, users_file=None, friends_file=None, drop_unrealistic_age=True):
    """Load the pair-feature table, clean it, attach the target and print a summary.

    Processing order:
      1. Read ``features_file`` and drop rows without ``user_A``/``user_B``;
         cast both ids to int64 and drop duplicate ``(user_A, user_B)`` rows.
      2. Replace +/-inf with NaN; fill NaN with 0 in the count columns and in
         every ``same_*`` column; coerce the ratio columns to numeric.
      3. If ``friends_file`` is given, rebuild ``label``: 1 when the pair is
         listed in the friends table in either direction, otherwise 0.
      4. If ``users_file`` is given and ``drop_unrealistic_age`` is true, drop
         every pair in which either user has a known age below 14 or above 90.
      5. Print ``info()``, ``describe()``, missing-value counts and, when a
         ``label`` column exists, its distribution with bar and pie charts.

    Args:
        features_file: Path to the CSV with one row per user pair.
        users_file: Optional path to the users CSV (reads ``id`` and ``age``).
        friends_file: Optional path to the friends CSV (reads ``user_A`` and
            ``user_B``).
        drop_unrealistic_age: Whether to apply the age filter from step 4.

    Returns:
        The cleaned DataFrame with a fresh 0..n-1 index.

    Raises:
        FileNotFoundError: If ``features_file`` does not exist.
        ValueError: If the table has no ``label`` column and no
            ``friends_file`` was supplied to build one.
    """
    print("Запускаю чтение исходных таблиц, не пугайтесь стенки текста.")

    features_path = Path(features_file)
    if not features_path.exists():
        raise FileNotFoundError(f"Не найден файл с признаками: {features_file}")

    df = pd.read_csv(features_path)
    print(f"Сырые размеры: {df.shape}")
    print(f"Колонки, что нашлись: {df.columns.tolist()}")

    # A pair without both ids is unusable; the int64 cast is only safe after the dropna.
    df = df.dropna(subset=['user_A', 'user_B']).copy()
    df[['user_A', 'user_B']] = df[['user_A', 'user_B']].astype('int64')
    # Duplicates are detected on the ordered pair only, so (a, b) and (b, a) both survive.
    df = df.drop_duplicates(subset=['user_A', 'user_B']).reset_index(drop=True)

    # Turn infinities into NaN so the fill/coerce steps below treat them as missing.
    df = df.replace([np.inf, -np.inf], np.nan)

    count_cols = ['common_friends', 'common_groups']
    for col in count_cols:
        if col in df.columns:
            df[col] = df[col].fillna(0).astype('int32')

    # Every column whose name starts with "same_" is handled as a flag; missing becomes 0.
    binary_cols = [col for col in df.columns if col.startswith('same_')]
    for col in binary_cols:
        df[col] = df[col].fillna(0).astype('int8')

    # Ratio columns keep their NaNs; unparsable values are coerced to NaN.
    ratio_cols = ['jaccard_friends', 'adamic_adar', 'jaccard_groups']
    for col in ratio_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    if friends_file is not None:
        # NOTE(review): unlike features_file there is no explicit existence check here;
        # presumably pd.read_csv raises on a missing file - confirm this is intended.
        friends_path = Path(friends_file)
        friends_df = pd.read_csv(friends_path)
        friends_df = friends_df.dropna(subset=['user_A', 'user_B']).copy()
        friends_df[['user_A', 'user_B']] = friends_df[['user_A', 'user_B']].astype('int64')

        # Stack the edge list with its mirrored copy so a pair matches in either orientation.
        undirected = pd.concat([
            friends_df[['user_A', 'user_B']],
            friends_df[['user_B', 'user_A']].rename(columns={'user_B': 'user_A', 'user_A': 'user_B'}),
        ], ignore_index=True).drop_duplicates()
        undirected['label'] = 1

        # Any label already present in the feature table is discarded and rebuilt.
        df = df.drop(columns=['label'], errors='ignore')
        df = df.merge(undirected, on=['user_A', 'user_B'], how='left')
        # Pairs that found no match in the friends table get label 0.
        df['label'] = df['label'].fillna(0).astype('int8')
        print(f"Таргет собран: {int(df['label'].sum())} дружб из {len(df)} пар (доля {df['label'].mean():.4%})")
    elif 'label' not in df.columns:
        raise ValueError("В таблице признаков нет колонки label и не передан friends_file для построения таргета")

    if users_file is not None and drop_unrealistic_age:
        users_path = Path(users_file)
        users_df = pd.read_csv(users_path)
        # Users with a known age outside [14, 90]; users with a missing age are kept.
        invalid_ids = users_df[(users_df['age'].notna()) & ((users_df['age'] < 14) | (users_df['age'] > 90))]['id']
        invalid_ids = invalid_ids.astype('int64') if not invalid_ids.empty else pd.Series([], dtype='int64')
        if not invalid_ids.empty:
            before = len(df)
            # Drop the pair when either side is one of the flagged users.
            df = df[~df['user_A'].isin(invalid_ids) & ~df['user_B'].isin(invalid_ids)].copy()
            removed = before - len(df)
            print(f"Срезал странные возрасты, минус {removed} строк")

    df.reset_index(drop=True, inplace=True)
    print(f"После чистки осталось: {df.shape}")

    print("\nИнформация о фрейме для спокойствия:")
    df.info()

    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    print("\nЧисловая статистика без прикрас:")
    print(df[numeric_cols].describe(percentiles=[0.01, 0.05, 0.5, 0.95, 0.99]))

    print("\nГде дырки в данных:")
    missing_data = df.isnull().sum()
    missing_percent = (missing_data / len(df)) * 100
    missing_df = pd.DataFrame({'Количество': missing_data, 'Процент': missing_percent})
    # Only columns that actually contain missing values are listed.
    print(missing_df[missing_df['Количество'] > 0].sort_values('Процент', ascending=False))

    if 'label' in df.columns:
        print("\nКак распределён таргет:")
        label_counts = df['label'].value_counts()
        print(label_counts)
        print(label_counts / len(df))

        # Left panel: absolute class counts; right panel: class shares.
        plt.figure(figsize=(10, 4))
        plt.subplot(1, 2, 1)
        label_counts.plot(kind='bar', color=['salmon', 'steelblue'])
        plt.title('Распределение целевой переменной')
        plt.xlabel('label')
        plt.ylabel('Количество пар')

        plt.subplot(1, 2, 2)
        label_counts.plot(kind='pie', autopct='%1.2f%%', colors=['salmon', 'steelblue'])
        plt.ylabel('')
        plt.title('Соотношение классов')
        plt.tight_layout()
        plt.show()

    return df


## 2. Помощники для генерации отрицательных примеров


In [47]:
def _normalize_text(value):
    """Return ``value`` as a stripped, lower-cased string.

    Missing values (as judged by ``pd.isna``) and strings that are empty after
    stripping both yield ``None``.
    """
    if not pd.isna(value):
        cleaned = str(value).strip()
        if cleaned:
            return cleaned.lower()
    return None


def _parse_multi_value_field(value):
    """Split a ``;``-separated field into a set of stripped, lower-cased tokens.

    A missing value (per ``pd.isna``) gives an empty set; blank tokens are skipped.
    """
    if pd.isna(value):
        return set()

    parsed = set()
    for raw_piece in str(value).split(';'):
        piece = raw_piece.strip()
        if piece:
            parsed.add(piece.lower())
    return parsed


def _prepare_user_profiles(users_df):
    """Build a ``{user_id: profile}`` lookup from the users table.

    Each profile is a dict with the keys ``city`` (normalised text or None),
    ``universities``, ``faculties``, ``schools`` (sets of normalised tokens)
    and ``age`` (float, or None when missing or not convertible). Columns other
    than ``id`` may be absent from ``users_df``; they then fall back to the
    "missing" value of the corresponding field.
    """
    def _age_or_none(raw_age):
        # Missing or non-numeric ages are represented as None.
        if pd.isna(raw_age):
            return None
        try:
            return float(raw_age)
        except (TypeError, ValueError):
            return None

    lookup = {}
    for record in users_df.itertuples(index=False):
        uid = int(record.id)
        parsed_age = _age_or_none(getattr(record, 'age', np.nan))

        entry = {'city': _normalize_text(getattr(record, 'city', None))}
        for field in ('universities', 'faculties', 'schools'):
            entry[field] = _parse_multi_value_field(getattr(record, field, None))
        entry['age'] = parsed_age

        lookup[uid] = entry
    return lookup


def _build_friend_graph(friends_df, valid_user_ids=None):
    """Build an undirected adjacency map ``{user_id: set(friend_ids)}``.

    Args:
        friends_df: Table with ``user_A`` and ``user_B`` columns, one
            friendship per row.
        valid_user_ids: Optional container of allowed ids. When given, an edge
            is kept only if both of its endpoints are in it.

    Returns:
        A ``defaultdict(set)`` in which every kept edge is stored in both
        directions.

    Self-loops (``user_A == user_B``) are skipped: a user listed as their own
    friend would inflate their degree and could be counted as a "common
    friend" of a pair they belong to.
    """
    graph = defaultdict(set)
    for row in friends_df.itertuples(index=False):
        user_a = int(row.user_A)
        user_b = int(row.user_B)
        if user_a == user_b:
            continue
        if valid_user_ids is not None and (user_a not in valid_user_ids or user_b not in valid_user_ids):
            continue
        graph[user_a].add(user_b)
        graph[user_b].add(user_a)
    return graph


def _build_group_membership(groups_df, valid_user_ids=None):
    """Collect group memberships as ``{user_id: set(group_ids)}``.

    Rows whose ``user_id`` is not in ``valid_user_ids`` are skipped when that
    filter is provided. Returns a ``defaultdict(set)``.
    """
    restrict_users = valid_user_ids is not None
    groups_by_user = defaultdict(set)
    for record in groups_df.itertuples(index=False):
        member = int(record.user_id)
        if restrict_users and member not in valid_user_ids:
            continue
        # The group id is converted only for rows that pass the user filter.
        groups_by_user[member].add(int(record.group_id))
    return groups_by_user


def compute_pair_features(user_a, user_b, friend_graph, user_profiles, group_membership):
    """Compute the feature dict for one user pair.

    Args:
        user_a, user_b: Ids of the two users.
        friend_graph: Mapping ``user_id -> set of friend ids``.
        user_profiles: Mapping ``user_id -> profile dict`` with the keys
            ``city``, ``universities``, ``faculties``, ``schools`` and ``age``.
        group_membership: Mapping ``user_id -> set of group ids``.

    Returns:
        A dict with ``common_friends``, ``jaccard_friends``, ``adamic_adar``,
        ``same_city``, ``same_university``, ``same_faculty``, ``same_school``,
        ``age_diff``, ``common_groups`` and ``jaccard_groups``. Users missing
        from a mapping contribute empty sets / empty profiles; ``age_diff`` is
        NaN unless both ages are known.
    """
    def _overlap_and_jaccard(left, right):
        # Returns (|intersection|, Jaccard index); Jaccard is 0.0 for two empty sets.
        shared = left & right
        combined = left | right
        score = len(shared) / len(combined) if combined else 0.0
        return shared, score

    shared_friends, friends_jaccard = _overlap_and_jaccard(
        friend_graph.get(user_a, set()),
        friend_graph.get(user_b, set()),
    )

    # Adamic-Adar: each common friend contributes 1 / log(degree); friends with
    # degree <= 1 are skipped to avoid dividing by log(1) == 0.
    degrees = (len(friend_graph.get(friend, set())) for friend in shared_friends)
    aa_score = sum((1.0 / np.log(degree) for degree in degrees if degree > 1), 0.0)

    first_profile = user_profiles.get(user_a, {})
    second_profile = user_profiles.get(user_b, {})

    def _shares_any(field):
        # 1 when the two profiles have at least one common token in ``field``.
        return int(bool(first_profile.get(field, set()) & second_profile.get(field, set())))

    first_city = first_profile.get('city')
    city_match = int(bool(first_city and first_city == second_profile.get('city')))

    first_age = first_profile.get('age')
    second_age = second_profile.get('age')
    if first_age is not None and second_age is not None:
        age_gap = float(abs(first_age - second_age))
    else:
        age_gap = np.nan

    shared_groups, groups_jaccard = _overlap_and_jaccard(
        group_membership.get(user_a, set()),
        group_membership.get(user_b, set()),
    )

    return {
        'common_friends': int(len(shared_friends)),
        'jaccard_friends': float(friends_jaccard),
        'adamic_adar': float(aa_score),
        'same_city': int(city_match),
        'same_university': int(_shares_any('universities')),
        'same_faculty': int(_shares_any('faculties')),
        'same_school': int(_shares_any('schools')),
        'age_diff': age_gap,
        'common_groups': int(len(shared_groups)),
        'jaccard_groups': float(groups_jaccard),
    }


## 3. Добавление отрицательных примеров и ресемплинг


In [48]:
def generate_additional_negative_samples(existing_df, users_df, friends_df, groups_df, num_samples, random_state=42, max_attempts_multiplier=20):
    """Draw random non-friend user pairs and compute their features.

    Pairs are sampled uniformly from the ids in ``users_df``. A candidate is
    rejected when it is already present in ``existing_df`` (in either
    orientation), was already drawn in this call, or is an edge of the friend
    graph. Sampling stops after ``num_samples`` accepted pairs or after
    ``max(num_samples * max_attempts_multiplier, num_samples * 2)`` attempts.

    Args:
        existing_df: Current dataset; its columns define the output layout.
        users_df: Users table (``id`` plus optional profile columns).
        friends_df: Friends table with ``user_A``/``user_B``.
        groups_df: Memberships table with ``user_id``/``group_id``.
        num_samples: Number of negative pairs requested.
        random_state: Seed for ``np.random.default_rng``.
        max_attempts_multiplier: Attempt budget per requested sample.

    Returns:
        DataFrame with the columns of ``existing_df`` and ``label == 0`` rows;
        empty when ``num_samples <= 0`` or nothing could be generated. Columns
        of ``existing_df`` that are not computed here are filled with NaN.
    """
    if num_samples <= 0:
        return pd.DataFrame(columns=existing_df.columns)

    pair_cols = ['user_A', 'user_B']
    membership_cols = ['user_id', 'group_id']

    # Work on cleaned copies: rows with missing ids are dropped before the int cast.
    users_df = users_df.dropna(subset=['id']).copy()
    users_df['id'] = users_df['id'].astype('int64')

    friends_df = friends_df.dropna(subset=pair_cols).copy()
    friends_df[pair_cols] = friends_df[pair_cols].astype('int64')

    groups_df = groups_df.dropna(subset=membership_cols).copy()
    groups_df[membership_cols] = groups_df[membership_cols].astype('int64')

    valid_user_ids = set(users_df['id'].tolist())
    friend_graph = _build_friend_graph(friends_df, valid_user_ids=valid_user_ids)
    group_membership = _build_group_membership(groups_df, valid_user_ids=valid_user_ids)
    user_profiles = _prepare_user_profiles(users_df)

    # Known pairs are stored as (smaller id, larger id) so orientation does not matter.
    seen_pairs = set()
    for row in existing_df[pair_cols].itertuples(index=False):
        first, second = int(row.user_A), int(row.user_B)
        seen_pairs.add((min(first, second), max(first, second)))

    rng = np.random.default_rng(random_state)
    user_pool = np.array(list(valid_user_ids))
    attempt_budget = max(num_samples * max_attempts_multiplier, num_samples * 2)
    # With fewer than two users no pair can ever be drawn.
    pool_is_usable = len(user_pool) >= 2

    collected = []
    tries = 0
    while pool_is_usable and len(collected) < num_samples and tries < attempt_budget:
        tries += 1

        drawn_a, drawn_b = rng.choice(user_pool, size=2, replace=False)
        user_a = int(min(drawn_a, drawn_b))
        user_b = int(max(drawn_a, drawn_b))
        candidate = (user_a, user_b)

        rejected = (
            user_a == user_b
            or candidate in seen_pairs
            or user_b in friend_graph.get(user_a, set())
        )
        if rejected:
            continue

        record = compute_pair_features(user_a, user_b, friend_graph, user_profiles, group_membership)
        record.update({'user_A': user_a, 'user_B': user_b, 'label': 0})
        collected.append(record)
        seen_pairs.add(candidate)

    if len(collected) < num_samples:
        print(f"Сгенерил только {len(collected)} из {num_samples} отрицательных пар, дальше не вытянул.")

    if not collected:
        return pd.DataFrame(columns=existing_df.columns)

    new_samples_df = pd.DataFrame(collected)
    absent_columns = [col for col in existing_df.columns if col not in new_samples_df.columns]
    for col in absent_columns:
        new_samples_df[col] = np.nan
    # Align column set and order with the existing dataset.
    new_samples_df = new_samples_df[existing_df.columns]

    # Mirror the integer dtypes of the existing dataset; columns that cannot be
    # cast are left unchanged (errors='ignore').
    integer_columns = existing_df.select_dtypes(include=['int8', 'int16', 'int32', 'int64']).columns
    for col in integer_columns:
        wanted_dtype = existing_df[col].dtype
        new_samples_df[col] = new_samples_df[col].round().astype(wanted_dtype, errors='ignore')

    return new_samples_df


def augment_dataset_with_negatives(df, users_file, friends_file, groups_file, negative_config=None, drop_unrealistic_age=True):
    """Append freshly generated negative (non-friend) pairs to ``df``.

    Args:
        df: Current dataset with a ``label`` column.
        users_file, friends_file, groups_file: Paths to the source CSVs; all
            three are required once ``negative_config`` is non-empty.
        negative_config: Dict with optional keys ``num_samples`` (absolute
            count), ``multiplier`` (count relative to the number of
            positives; the larger of the two counts wins), ``random_state``
            and ``max_attempts_multiplier``. ``None`` for ``num_samples`` or
            ``max_attempts_multiplier`` means "use the default". A falsy
            config disables the step.
        drop_unrealistic_age: When true, users with a known age below 14 or
            above 90 are excluded from the sampling pool.

    Returns:
        ``(dataset, added)`` where ``added`` is the number of appended rows.
        When nothing is added the original ``df`` object is returned.

    Raises:
        ValueError: If the config is non-empty but one of the files is None.
    """
    if not negative_config:
        return df, 0

    if users_file is None or friends_file is None or groups_file is None:
        raise ValueError("Для генерации отрицательных примеров нужны файлы users, friends и groups.")

    def _setting(key, default):
        # An explicit None in the config is treated the same as a missing key,
        # so {'num_samples': None, 'multiplier': 2} does not crash on max()/<=.
        value = negative_config.get(key)
        return default if value is None else value

    num_samples = _setting('num_samples', 0)
    multiplier = negative_config.get('multiplier')
    if multiplier is not None:
        positives = int((df['label'] == 1).sum())
        num_samples = max(num_samples, int(positives * multiplier))

    if num_samples <= 0:
        print("Конфиг с отрицательными примерами пустой, пропускаю шаг.")
        return df, 0

    # random_state is passed through untouched: None is a meaningful value for the RNG.
    random_state = negative_config.get('random_state', 42)
    max_attempts_multiplier = _setting('max_attempts_multiplier', 20)

    users_df = pd.read_csv(users_file)
    if drop_unrealistic_age:
        # Same age rule as in data loading: known ages outside [14, 90] are excluded.
        invalid_ids = users_df[(users_df['age'].notna()) & ((users_df['age'] < 14) | (users_df['age'] > 90))]['id']
        if not invalid_ids.empty:
            users_df = users_df[~users_df['id'].isin(invalid_ids)].copy()

    friends_df = pd.read_csv(friends_file)
    groups_df = pd.read_csv(groups_file)

    added_df = generate_additional_negative_samples(
        existing_df=df,
        users_df=users_df,
        friends_df=friends_df,
        groups_df=groups_df,
        num_samples=num_samples,
        random_state=random_state,
        max_attempts_multiplier=max_attempts_multiplier,
    )

    if added_df.empty:
        return df, 0

    combined_df = pd.concat([df, added_df], ignore_index=True)
    print(f"Докинул {len(added_df)} отрицательных пар, теперь данных {combined_df.shape}")
    return combined_df, len(added_df)


def resample_dataset(df, target_col='label', strategy=None, ratio=1.0, random_state=42):
    """Rebalance the binary classes of ``df`` and shuffle the result.

    Args:
        df: Dataset containing ``target_col`` with values 0 and 1.
        target_col: Name of the binary target column.
        strategy: ``None`` (return ``df`` untouched), ``'undersample'``,
            ``'oversample'`` or ``'balanced'``.
        ratio: Desired negatives-to-positives ratio for ``'undersample'`` and
            ``'oversample'``; ignored by ``'balanced'``. Must be > 0.
        random_state: Seed used for every sampling step and the final shuffle.

    Strategies:
        * ``'undersample'``: keep all positives and at most
          ``int(len(positives) * ratio)`` negatives, drawn without replacement.
        * ``'oversample'``: keep all negatives and *all original positives*,
          then add positives drawn with replacement until there are
          ``ceil(len(negatives) / ratio)`` of them. If that many already
          exist, nothing is added.
        * ``'balanced'``: draw ``min(len(positives), len(negatives))`` rows
          from each class without replacement.

    Returns:
        The resampled, shuffled DataFrame with a fresh index, or ``df`` itself
        when ``strategy`` is None, a class is empty, or undersampling would
        leave no negatives.

    Raises:
        ValueError: If ``target_col`` is missing, ``ratio <= 0`` or the
            strategy name is unknown.
    """
    if strategy is None:
        return df

    if target_col not in df.columns:
        raise ValueError(f"Не найдена целевая колонка '{target_col}' для ресемплинга.")

    if ratio <= 0:
        raise ValueError("Параметр ratio должен быть больше 0.")

    positives = df[df[target_col] == 1]
    negatives = df[df[target_col] == 0]

    if positives.empty or negatives.empty:
        print("Классов не хватает, балансировать нечего.")
        return df

    if strategy == 'undersample':
        target_negatives = min(len(negatives), int(len(positives) * ratio))
        if target_negatives == 0:
            print("Не хватило строк для undersample, оставляю как есть.")
            return df
        parts = [
            positives,
            negatives.sample(n=target_negatives, replace=False, random_state=random_state),
        ]
    elif strategy == 'oversample':
        target_positives = max(len(positives), int(np.ceil(len(negatives) / ratio)))
        extra_needed = target_positives - len(positives)
        # Keep every original positive and bootstrap only the surplus, so no
        # real example is lost to sampling with replacement.
        parts = [positives]
        if extra_needed > 0:
            parts.append(positives.sample(n=extra_needed, replace=True, random_state=random_state))
        parts.append(negatives)
    elif strategy == 'balanced':
        # The target never exceeds either class size, so no replacement is needed.
        target_size = min(len(positives), len(negatives))
        parts = [
            positives.sample(n=target_size, replace=False, random_state=random_state),
            negatives.sample(n=target_size, replace=False, random_state=random_state),
        ]
    else:
        raise ValueError(f"Неизвестная стратегия ресемплинга: {strategy}")

    resampled_df = pd.concat(parts, ignore_index=True)
    # Shuffle so the classes are not stored as contiguous blocks.
    resampled_df = resampled_df.sample(frac=1.0, random_state=random_state).reset_index(drop=True)
    print(f"Ресемплинг {strategy}: {resampled_df[target_col].value_counts().to_dict()}")
    return resampled_df


## 4. Аналитика признаков и подготовка выборок


In [49]:
def analyze_features(df, target_col='label'):
    """Plot a correlation heatmap and per-feature distributions.

    Every column except ``user_A``, ``user_B`` and ``target_col`` is treated
    as a feature. When the target column is present, a correlation matrix of
    features plus target is drawn and the correlations with the target are
    printed in descending order. Then up to eight numeric features are
    plotted: as box plots split by class when the target exists, otherwise as
    histograms.

    Args:
        df: Dataset to inspect.
        target_col: Name of the target column; it may be absent from ``df``.

    Returns:
        The list of feature column names (not limited to numeric ones).
    """
    print("Смотрю на признаки, вдруг что-то выбивается.")

    feature_cols = [col for col in df.columns if col not in ['user_A', 'user_B', target_col]]

    # Stays empty when the target column is absent; used below to rank features.
    target_correlations = pd.Series(dtype=float)

    if target_col in df.columns:
        plt.figure(figsize=(12, 10))
        # NOTE(review): presumably every feature column is numeric; recent pandas
        # versions raise in .corr() on non-numeric columns - confirm.
        correlation_matrix = df[feature_cols + [target_col]].corr()
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
                    fmt='.2f', linewidths=0.5)
        plt.title('Матрица корреляций признаков')
        plt.tight_layout()
        plt.show()

        print("\nКорреляции с таргетом:")
        target_correlations = correlation_matrix[target_col].drop(target_col).sort_values(ascending=False)
        for feature, corr in target_correlations.items():
            print(f"{feature}: {corr:.3f}")

    numeric_features = df[feature_cols].select_dtypes(include=[np.number]).columns

    # With more than 8 numeric features and known correlations, plot the 8 with the
    # largest absolute correlation; otherwise plot the first 8 (or all, if fewer).
    if not target_correlations.empty and len(numeric_features) > 8:
        top_features = target_correlations.abs().nlargest(8).index
    else:
        top_features = numeric_features[:8] if len(numeric_features) > 8 else numeric_features

    if len(top_features) > 0:
        # Grid of at most 4 columns; rows grow as needed.
        rows = int(np.ceil(len(top_features) / 4))
        cols = min(4, len(top_features))
        fig, axes = plt.subplots(rows, cols, figsize=(5 * cols, 4 * rows))
        # plt.subplots returns a bare Axes for a 1x1 grid; flatten to a 1-D array either way.
        axes = np.array(axes).reshape(-1)

        for i, feature in enumerate(top_features):
            if target_col in df.columns:
                sns.boxplot(x=df[target_col], y=df[feature], ax=axes[i])
                axes[i].set_title(f'{feature} по классам')
            else:
                df[feature].hist(bins=30, ax=axes[i])
                axes[i].set_title(f'Распределение {feature}')
            axes[i].grid(True, alpha=0.2)

        # Hide the unused cells of the grid.
        for j in range(len(top_features), len(axes)):
            axes[j].axis('off')

        plt.tight_layout()
        plt.show()

    return feature_cols

def prepare_data(df, target_col='label', test_size=0.2, val_size=0.2, random_state=42):
    """Split the dataset into stratified train/validation/test parts.

    The id columns ``user_A``/``user_B`` are dropped (if present) and every
    remaining column except ``target_col`` is used as a feature. The test part
    is split off first; the validation part is then taken from the remainder
    with a relative size chosen so that it equals ``val_size`` of the whole
    dataset.

    Args:
        df: Dataset with features and the target column.
        target_col: Name of the target column.
        test_size: Fraction of all rows that goes to the test part.
        val_size: Fraction of all rows that goes to the validation part.
        random_state: Seed passed to both splits.

    Returns:
        ``(X_train, X_val, X_test, y_train, y_val, y_test, feature_names,
        class_weights)`` where ``class_weights`` is ``{0: 1.0, 1: w}`` and
        ``w`` is the negatives-to-positives count ratio in the training part.

    Raises:
        ValueError: If ``target_col`` is missing, or if ``test_size`` and
            ``val_size`` are not fractions in (0, 1) whose sum is below 1.
    """
    print("Готовлю выборки для обучения.")

    if target_col not in df.columns:
        raise ValueError(f"Не найдена целевая колонка '{target_col}' в датасете")

    # Validate up front: otherwise val_size / (1 - test_size) can divide by zero
    # or produce a fraction outside (0, 1) that only fails deep inside the split.
    if not (0 < test_size < 1 and 0 < val_size < 1 and test_size + val_size < 1):
        raise ValueError(
            "Параметры test_size и val_size должны быть долями из (0, 1), "
            f"а их сумма меньше 1 (получено test_size={test_size}, val_size={val_size})"
        )

    df_clean = df.drop(['user_A', 'user_B'], axis=1, errors='ignore').copy()

    X = df_clean.drop(target_col, axis=1)
    y = df_clean[target_col]

    feature_names = X.columns.tolist()
    print(f"В игру идут {len(feature_names)} признаков: {feature_names}")

    X_temp, X_test, y_temp, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # Rescale so the validation part is val_size of the full dataset, not of the remainder.
    val_relative_size = val_size / (1 - test_size)
    X_train, X_val, y_train, y_val = train_test_split(
        X_temp, y_temp, test_size=val_relative_size, random_state=random_state, stratify=y_temp
    )

    print("Размеры выборок получились такие:")
    print(f"train -> {X_train.shape}")
    print(f"val -> {X_val.shape}")
    print(f"test -> {X_test.shape}")

    # Weight of the positive class = negatives / positives in train; a missing
    # class is counted as 1 to avoid a zero division.
    class_counts = y_train.value_counts()
    pos_weight = class_counts.get(0, 1) / class_counts.get(1, 1)
    class_weights = {0: 1.0, 1: pos_weight}
    print(f"Классы в train: {class_counts.to_dict()}, веса {class_weights}")

    return X_train, X_val, X_test, y_train, y_val, y_test, feature_names, class_weights


## 5. Модель CatBoost и оценка качества


In [50]:
def train_catboost_model(X_train, X_val, y_train, y_val, feature_names, class_weights=None):
    """Fit a CatBoost classifier on the training part, validating on ``X_val``.

    Args:
        X_train, y_train: Training features and target.
        X_val, y_val: Validation features and target, passed as ``eval_set``.
        feature_names: Feature names handed to both pools.
        class_weights: Optional ``{class: weight}`` dict. It is converted to a
            list ordered by class key before being passed to CatBoost.

    Returns:
        The fitted ``CatBoostClassifier``.
    """
    print("Запускаю CatBoost, наберитесь терпения.")

    fit_pool = Pool(X_train, y_train, feature_names=feature_names)
    eval_pool = Pool(X_val, y_val, feature_names=feature_names)

    params = {
        'iterations': 1500,
        'learning_rate': 0.05,
        'depth': 8,
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'random_seed': 42,
        'verbose': 100,
        'early_stopping_rounds': 100,
        'thread_count': -1,
        'use_best_model': True,
        'l2_leaf_reg': 3.0,
    }

    if class_weights is not None:
        # Weights are listed in ascending class order.
        weights_in_order = [class_weights[cls] for cls in sorted(class_weights)]
        params['class_weights'] = weights_in_order
        print(f"Весам классов дал: {params['class_weights']}")

    classifier = CatBoostClassifier(**params)
    classifier.fit(fit_pool, eval_set=eval_pool, plot=False)
    return classifier

def evaluate_model(model, X_train, X_val, X_test, y_train, y_val, y_test):
    """Report ROC-AUC and accuracy on all three splits and plot diagnostics.

    Prints ROC-AUC (from ``predict_proba``) and accuracy (from ``predict``)
    for train, validation and test, then draws one figure with the three ROC
    curves and one figure with the three confusion matrices.

    Returns:
        Dict with the keys ``train_auc``, ``val_auc``, ``test_auc`` and
        ``test_acc``.
    """
    print("Считаю метрики по train/val/test.")

    # (short key used in printouts, plot title, features, target)
    splits = (
        ('train', 'Train', X_train, y_train),
        ('val', 'Validation', X_val, y_val),
        ('test', 'Test', X_test, y_test),
    )

    # Probability of the positive class for every split.
    scores = {key: model.predict_proba(features)[:, 1] for key, _, features, _ in splits}
    auc_by_split = {key: roc_auc_score(target, scores[key]) for key, _, _, target in splits}

    print("\nROC-AUC по выборкам:")
    for key, value in auc_by_split.items():
        print(f"{key}: {value:.4f}")

    predicted = {key: model.predict(features) for key, _, features, _ in splits}
    acc_by_split = {key: accuracy_score(target, predicted[key]) for key, _, _, target in splits}

    print("\nAccuracy по выборкам:")
    for key, value in acc_by_split.items():
        print(f"{key}: {value:.4f}")

    # Figure 1: ROC curve per split, with the chance diagonal for reference.
    plt.figure(figsize=(15, 5))
    for position, (key, title, _, target) in enumerate(splits, start=1):
        fpr, tpr, _thresholds = roc_curve(target, scores[key])

        plt.subplot(1, 3, position)
        plt.plot(fpr, tpr, label=f'{title} (AUC = {auc_by_split[key]:.3f})', linewidth=2)
        plt.plot([0, 1], [0, 1], 'k--', alpha=0.5)
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f'ROC Curve - {title}')
        plt.legend()
        plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Figure 2: confusion matrix per split.
    plt.figure(figsize=(15, 4))
    for position, (key, title, _, target) in enumerate(splits, start=1):
        plt.subplot(1, 3, position)
        matrix = confusion_matrix(target, predicted[key])
        sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues')
        plt.title(f'Confusion Matrix - {title}')
        plt.xlabel('Predicted')
        plt.ylabel('Actual')

    plt.tight_layout()
    plt.show()

    return {
        'train_auc': auc_by_split['train'],
        'val_auc': auc_by_split['val'],
        'test_auc': auc_by_split['test'],
        'test_acc': acc_by_split['test'],
    }

def analyze_feature_importance(model, feature_names):
    """Print and plot the model's feature importances.

    Shows the 15 most important features as a horizontal bar chart (most
    important on top) next to a histogram of all importance values.

    Returns:
        DataFrame with the columns ``feature`` and ``importance``, sorted by
        importance in descending order.
    """
    print("Смотрю, какие признаки тянут модель.")

    raw_importance = model.get_feature_importance()
    importance_df = pd.DataFrame({'feature': feature_names, 'importance': raw_importance})
    importance_df = importance_df.sort_values('importance', ascending=False)

    leaders = importance_df.head(15)
    print("\nТоп-15 по важности:")
    print(leaders)

    _fig, (bar_ax, hist_ax) = plt.subplots(1, 2, figsize=(12, 8))

    bar_ax.barh(leaders['feature'], leaders['importance'])
    bar_ax.set_xlabel('Важность')
    bar_ax.set_title('Топ-15 самых важных признаков')
    # barh draws the first row at the bottom; flip so the top feature is on top.
    bar_ax.invert_yaxis()

    hist_ax.hist(raw_importance, bins=30, alpha=0.7, color='skyblue')
    hist_ax.set_xlabel('Важность признака')
    hist_ax.set_ylabel('Количество')
    hist_ax.set_title('Распределение важности признаков')

    plt.tight_layout()
    plt.show()

    return importance_df


## 6. Основной пайплайн и эксперименты


In [51]:
def main(
    negative_sampling_config=None,
    resampling_config=None,
    drop_unrealistic_age=True,
    random_state=42,
):
    """Run the whole pipeline: load, augment, split, rebalance, fit, evaluate, save.

    Args:
        negative_sampling_config: Optional config for adding generated
            negative pairs (see ``augment_dataset_with_negatives``).
        resampling_config: Optional dict with ``strategy`` and, optionally,
            ``target_col``, ``ratio`` and ``random_state`` (see
            ``resample_dataset``).
        drop_unrealistic_age: Forwarded to loading and negative sampling.
        random_state: Seed for the data split and the default resampling seed.

    Returns:
        ``(model, importance_df, metrics)``.

    Resampling is applied to the training split only, after the split. The
    validation and test parts therefore keep the original class balance, and
    oversampled duplicates of a training row cannot end up in the parts used
    for evaluation. Class weights are recomputed from the resampled training
    target.

    Side effects: reads ``Features.csv``, ``Users.csv``, ``Friends.csv`` and
    ``Groups.csv`` from the current directory and writes
    ``catboost_friendship_model.cbm`` there.
    """
    print("Гоняю весь пайплайн от начала до конца.")
    print("-" * 40)

    base_dir = Path('.')
    features_path = base_dir / 'Features.csv'
    users_path = base_dir / 'Users.csv'
    friends_path = base_dir / 'Friends.csv'
    groups_path = base_dir / 'Groups.csv'

    df = load_and_explore_data(
        features_path,
        users_file=users_path,
        friends_file=friends_path,
        drop_unrealistic_age=drop_unrealistic_age,
    )

    if negative_sampling_config:
        df, added_negatives = augment_dataset_with_negatives(
            df,
            users_file=users_path,
            friends_file=friends_path,
            groups_file=groups_path,
            negative_config=negative_sampling_config,
            drop_unrealistic_age=drop_unrealistic_age,
        )
        if added_negatives:
            class_breakdown = df['label'].value_counts().to_dict()
            print(f"После подмешивания отрицательных пар баланс такой: {class_breakdown}")

    # Called for its printouts and plots; the returned column list is not needed here.
    analyze_features(df)

    X_train, X_val, X_test, y_train, y_val, y_test, feature_names, class_weights = prepare_data(
        df, random_state=random_state
    )

    if resampling_config and resampling_config.get('strategy'):
        # Rebalance the training rows only: glue the target back on, resample,
        # then separate features and target again.
        label_col = y_train.name
        train_df = X_train.copy()
        train_df[label_col] = y_train
        train_df = resample_dataset(
            train_df,
            target_col=resampling_config.get('target_col', 'label'),
            strategy=resampling_config.get('strategy'),
            ratio=resampling_config.get('ratio', 1.0),
            random_state=resampling_config.get('random_state', random_state),
        )
        X_train = train_df[feature_names]
        y_train = train_df[label_col]

        # The class balance of train has changed, so the weights must follow it.
        class_counts = y_train.value_counts()
        class_weights = {0: 1.0, 1: class_counts.get(0, 1) / class_counts.get(1, 1)}
        print(f"Классы в train после ресемплинга: {class_counts.to_dict()}, веса {class_weights}")

    model = train_catboost_model(X_train, X_val, y_train, y_val, feature_names, class_weights)

    metrics = evaluate_model(model, X_train, X_val, X_test, y_train, y_val, y_test)

    importance_df = analyze_feature_importance(model, feature_names)

    print("\nГотово, вот финальные цифры:")
    print(f"ROC-AUC на тесте: {metrics['test_auc']:.4f}")
    print(f"Accuracy: {metrics['test_acc']:.4f}")

    model.save_model('catboost_friendship_model.cbm')
    print("Сохранил модель в catboost_friendship_model.cbm")

    return model, importance_df, metrics

def feature_ablation_study(df, target_col='label'):
    """Estimate how well each feature group predicts the target on its own.

    For every predefined group, a small CatBoost model is cross-validated
    (3 folds, ROC-AUC) using only the group's columns that exist in ``df``.
    Groups with no available columns are skipped.

    Returns:
        ``{group_name: {'mean_auc': float, 'features': [column names]}}``.
    """
    print("\nСмотрю, сколько пользы от разных групп признаков.")

    feature_groups = {
        'Графовые': ['common_friends', 'jaccard_friends', 'adamic_adar'],
        'Гео': ['same_city'],
        'Образование': ['same_university', 'same_faculty', 'same_school'],
        'Группы': ['common_groups', 'jaccard_groups'],
        'Демография': ['age_diff'],
    }

    results = {}
    for group_name, candidates in feature_groups.items():
        present = [name for name in candidates if name in df.columns]
        if not present:
            continue

        y = df[target_col]
        negatives_count = (y == 0).sum()
        positives_count = (y == 1).sum()
        # Positive-class weight: negatives / positives, never below 1.
        pos_weight = max(1.0, negatives_count / max(positives_count, 1))

        estimator = CatBoostClassifier(
            iterations=300, verbose=False, random_seed=42, class_weights=[1.0, pos_weight]
        )
        fold_scores = cross_val_score(estimator, df[present], y, cv=3, scoring='roc_auc')

        results[group_name] = {
            'mean_auc': fold_scores.mean(),
            'features': present,
        }

        print(f"{group_name}: AUC = {fold_scores.mean():.4f} ({', '.join(present)})")

    return results


## 7. Запуск пайплайна (пример конфигурации)


In [52]:
if __name__ == '__main__':
    # Example configuration. Both dicts are empty, so neither negative
    # sampling nor resampling is enabled in this run.
    NEGATIVE_SAMPLING_CONFIG = {}

    RESAMPLING_CONFIG = {}

    # A config is forwarded only when it actually switches something on:
    # at least one truthy value for negative sampling, a strategy for resampling.
    neg_config = None
    if any(NEGATIVE_SAMPLING_CONFIG.values()):
        neg_config = NEGATIVE_SAMPLING_CONFIG

    res_config = None
    if RESAMPLING_CONFIG.get('strategy'):
        res_config = RESAMPLING_CONFIG

    model, importance_df, metrics = main(
        negative_sampling_config=neg_config,
        resampling_config=res_config,
        drop_unrealistic_age=True,
        random_state=42,
    )


Гоняю весь пайплайн от начала до конца.
----------------------------------------
Запускаю чтение исходных таблиц, не пугайтесь стенки текста.
Сырые размеры: (371909, 12)
Колонки, что нашлись: ['user_A', 'user_B', 'common_friends', 'jaccard_friends', 'adamic_adar', 'same_city', 'same_university', 'same_faculty', 'same_school', 'age_diff', 'common_groups', 'jaccard_groups']
Таргет собран: 1304 дружб из 370803 пар (доля 0.3517%)
После чистки осталось: (370803, 13)

Информация о фрейме для спокойствия:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 370803 entries, 0 to 370802
Data columns (total 13 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   user_A           370803 non-null  int64  
 1   user_B           370803 non-null  int64  
 2   common_friends   370803 non-null  int32  
 3   jaccard_friends  370803 non-null  float64
 4   adamic_adar      370803 non-null  float64
 5   same_city        370803 non-null  int8   
 6   same