In [None]:
# Install CatBoost into the environment of the running kernel. The explicit
# %pip magic does not depend on IPython's automagic setting being enabled.
%pip install catboost

In [None]:
import numpy as np
import pandas as pd
import joblib
import os
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from catboost import CatBoostClassifier, Pool
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, cross_val_predict

In [None]:
# Record the versions of the key libraries for reproducibility; catboost is
# included because it is the library the base models are trained with.
!pip freeze | grep "numpy\|pandas\|lightgbm\|scikit-learn\|catboost"

In [None]:
# Read the train and test tables (in that order) from Parquet files located
# in the current working directory.
train_df, test_df = (
    pd.read_parquet(path) for path in ("train_data.pqt", "test_data.pqt")
)

In [None]:
# Columns that are handled as categorical features further down the notebook.
cat_cols = [
    "channel_code",
    "city",
    "city_type",
    "okved",
    "segment",
    "start_cluster",
    "index_city_code",
    "ogrn_month",
    "ogrn_year",
]

In [None]:
# Convert every categorical column to strings and label missing values
# explicitly as "missing".
#
# The missing-value mask is taken BEFORE the string cast: in pandas 1.x/2.x
# astype(str) turns NaN into the literal string "nan", so calling
# fillna("missing") after the cast (as this cell used to do) never matched
# anything. The intermediate astype("category") was dropped because its result
# was immediately overwritten by the string cast.
for frame in (train_df, test_df):
    for col in cat_cols:
        is_missing = frame[col].isna()
        frame[col] = frame[col].astype(str).mask(is_missing, "missing")

In [None]:
# The target is "end_cluster"; the feature matrix is everything except the
# target and the "id" / "date" columns.
non_feature_cols = ["id", "date", "end_cluster"]
y = train_df["end_cluster"]
X = train_df.drop(columns=non_feature_cols)

In [None]:
# Hold out 20% of the rows as a local test set, stratified by the target and
# with a fixed seed so the split is repeatable.
split_kwargs = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

In [None]:
# --- Two-level stacking: two CatBoost base models + a logistic-regression
# --- meta-model trained on their out-of-fold (OOF) probabilities.
n_splits = 5
# Stratified folds preserve the class proportions in every split, so (as long
# as each class has at least n_splits samples) every fold model is trained on
# all classes and returns one probability column per class.
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Number of target classes, derived from the data instead of being hard-coded.
n_classes = len(np.unique(y_train))


def _make_base_models():
    """Return fresh, unfitted ``(model1, model2)`` CatBoost classifiers.

    Both models share the loss, class weighting, seed and device settings and
    differ only in iterations / learning rate / depth. The categorical
    features are the columns listed in ``cat_cols``.
    """
    shared = dict(
        loss_function="MultiClass",
        eval_metric="MultiClass",
        auto_class_weights='Balanced',
        verbose=100,
        cat_features=cat_cols,
        random_seed=42,
        task_type="GPU",
    )
    # First base model: shallower trees, higher learning rate.
    first = CatBoostClassifier(iterations=3500, learning_rate=0.1, depth=6, **shared)
    # Second base model: deeper trees, lower learning rate, more iterations.
    second = CatBoostClassifier(iterations=5000, learning_rate=0.02, depth=8, **shared)
    return first, second


# OOF predictions: one row per training sample, one column per class.
oof_preds1 = np.zeros((len(X_train), n_classes))
oof_preds2 = np.zeros((len(X_train), n_classes))

for train_idx, val_idx in kf.split(X_train, y_train):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    pool_tr = Pool(X_tr, y_tr, cat_features=cat_cols)
    pool_val = Pool(X_val, y_val, cat_features=cat_cols)

    fold_model1, fold_model2 = _make_base_models()

    fold_model1.fit(pool_tr)
    oof_preds1[val_idx] = fold_model1.predict_proba(pool_val)

    fold_model2.fit(pool_tr)
    oof_preds2[val_idx] = fold_model2.predict_proba(pool_val)

# Refit both base models on the whole training split. These are the models
# used for hold-out predictions (and available for saving afterwards), rather
# than whichever models the last CV fold happened to leave behind.
pool_full = Pool(X_train, y_train, cat_features=cat_cols)
model1, model2 = _make_base_models()
model1.fit(pool_full)
model2.fit(pool_full)

# Stack the OOF probabilities of both base models as meta-features.
X_stack = np.hstack([oof_preds1, oof_preds2])

# `multi_class` is intentionally not passed: it is deprecated in recent
# scikit-learn, and the default solver already fits a multinomial model when
# the target has more than two classes.
stacker = LogisticRegression(
    class_weight='balanced',
    random_state=42,
    max_iter=3500,
    n_jobs=-1
)

# Cross-validated predictions of the meta-model on the OOF meta-features.
stacker_oof_preds = cross_val_predict(
    stacker, X_stack, y_train,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    method='predict_proba'
)

print("OOF ROC-AUC стекинга (OvO):", roc_auc_score(y_train, stacker_oof_preds, multi_class='ovo'))
print("OOF ROC-AUC стекинга (OvR):", roc_auc_score(y_train, stacker_oof_preds, multi_class='ovr'))

# Final meta-model: fit on all OOF meta-features.
stacker.fit(X_stack, y_train)

# Hold-out evaluation: base-model probabilities -> meta-model probabilities.
test_preds1 = model1.predict_proba(X_test)
test_preds2 = model2.predict_proba(X_test)
X_test_stack = np.hstack([test_preds1, test_preds2])

final_preds = stacker.predict_proba(X_test_stack)
print("Test ROC-AUC (OvO):", roc_auc_score(y_test, final_preds, multi_class='ovo'))
print("Test ROC-AUC (OvR):", roc_auc_score(y_test, final_preds, multi_class='ovr'))

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelBinarizer

def weighted_roc_auc(y_true, y_pred, labels, weights_dict):
    """Weighted average of per-class one-vs-rest ROC AUC scores.

    Column ``i`` of ``y_pred`` is scored against the indicator
    ``y_true == labels[i]``, so the columns of ``y_pred`` must follow the
    order of ``labels``. Classes for which AUC is undefined (no positive
    samples, or only positive samples, in ``y_true``) are skipped.

    Args:
        y_true: 1-D array-like of true class labels.
        y_pred: 2-D array-like of shape (n_samples, len(labels)) with
            per-class scores or probabilities.
        labels: sequence of class labels, one per column of ``y_pred``.
        weights_dict: mapping label -> weight; labels missing from the
            mapping get weight 1.0.

    Returns:
        ``sum(w_i * auc_i) / sum(w_i)`` over the scored classes, or NaN when
        no class could be scored. With equal weights this is the plain mean
        of the per-class AUCs.

    Raises:
        ValueError: if the number of samples or the number of classes in
            ``y_pred`` does not match ``y_true`` / ``labels``.
        ZeroDivisionError: raised by ``np.average`` if the weights of the
            scored classes sum to zero.
    """
    y_true_arr = np.asarray(y_true)
    y_pred_arr = np.asarray(y_pred)

    if len(y_true_arr) != y_pred_arr.shape[0]:  # sample-count sanity check
        raise ValueError(f"Несоответствие размеров: y_true имеет {len(y_true_arr)} образцов, "
                       f"а y_pred имеет {y_pred_arr.shape[0]} образцов")

    if y_pred_arr.shape[1] != len(labels):
        raise ValueError(f"Количество классов в y_pred ({y_pred_arr.shape[1]}) "
                       f"не совпадает с количеством labels ({len(labels)})")

    n_samples = len(y_true_arr)
    scores, weights = [], []
    for col, label in enumerate(labels):
        # Build the indicator directly from `labels` so that column `col` of
        # y_pred is always compared with its own class, regardless of which
        # classes happen to be present in y_true or how they sort.
        is_positive = y_true_arr == label
        n_positive = int(np.sum(is_positive))
        if n_positive == 0 or n_positive == n_samples:
            continue  # AUC needs both positive and negative samples
        scores.append(roc_auc_score(is_positive, y_pred_arr[:, col]))
        weights.append(weights_dict.get(label, 1.0))

    if not scores:
        return np.nan

    # Normalise by the total weight so the result is a true weighted mean.
    return np.average(scores, weights=weights)

In [None]:
# Distinct class labels of the training target (np.unique returns them sorted).
labels = np.unique(y_train)
# Every class gets the same weight of 1.0.
weights_dict = dict.fromkeys(labels, 1.0)

In [None]:
# Report the weighted ROC AUC on the hold-out split, but only when there is
# exactly one prediction row per hold-out sample.
if len(y_test) != len(final_preds):
    print("Ошибка: размеры y_test и final_preds не совпадают")
else:
    score = weighted_roc_auc(y_test, final_preds, labels, weights_dict)
    print(f"\nWeighted ROC AUC: {score:.4f}")

In [None]:
# Persist the fitted models (optional step).
import os

# Create the output directory if it does not exist yet.
os.makedirs('models', exist_ok=True)

# Both CatBoost base models are written in CatBoost's own .cbm format.
for booster, target_path in (
    (model1, 'models/catboost_model1.cbm'),
    (model2, 'models/catboost_model2.cbm'),
):
    booster.save_model(target_path)

# The logistic-regression meta-model is serialised with joblib.
joblib.dump(stacker, 'models/stacker_lr.pkl')
