# ============Необходимые библиотеки и фреймворки============

In [14]:
import pandas as pd
import numpy as np
import joblib
import mlflow
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    balanced_accuracy_score,
    classification_report,
    confusion_matrix
)
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# ================0. КОНФИГУРАЦИЯ MLFLOW================

In [15]:
# Keep the experiment name in a single constant so the name registered with
# MLflow and the name printed below cannot drift apart (previously the print
# statement showed "Customer_Class_Tree_Models" while the experiment actually
# created was "Customer_Class_Tree_Model").
MLFLOW_EXPERIMENT_NAME = "Customer_Class_Tree_Model"

# Runs are written to a local ./mlruns directory next to the notebook.
mlflow.set_tracking_uri("file:./mlruns")
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)
print(f"MLflow Tracking URI: {mlflow.get_tracking_uri()}")
print(f"MLflow Experiment: {MLFLOW_EXPERIMENT_NAME}")

2025/08/29 16:31:55 INFO mlflow.tracking.fluent: Experiment with name 'Customer_Class_Tree_Model' does not exist. Creating a new experiment.


MLflow Tracking URI: file:./mlruns
MLflow Experiment: Customer_Class_Tree_Models


# ================1. ЗАГРУЗКА И ПОДГОТОВКА ДАННЫХ================

In [16]:
# Read the two raw inputs: the macroeconomic context table (CSV) and the
# contracts table (Parquet).
print("\nЗагрузка данных...")
macro_df = pd.read_csv('ML test task v3/context_df.csv')
contracts_df = pd.read_parquet('ML test task v3/test_task.parquet')

print("Предобработка данных и создание признаков...")

# --- Macroeconomic data: parse both period-boundary columns as datetimes ---
for period_bound_col in ('context_data_from', 'context_data_to'):
    macro_df[period_bound_col] = pd.to_datetime(macro_df[period_bound_col])

def clean_and_convert_numeric(series):
    """Convert a Series of number-like text to numeric values.

    Every value is first rendered as a string; percent signs are removed and
    decimal commas are replaced with decimal points. Anything that still
    cannot be parsed as a number becomes NaN.

    Args:
        series: pandas Series holding the raw values.

    Returns:
        A numeric pandas Series aligned with ``series``.
    """
    as_text = series.astype(str)
    without_percent = as_text.str.replace('%', '', regex=False)
    with_decimal_point = without_percent.str.replace(',', '.', regex=False)
    return pd.to_numeric(with_decimal_point, errors='coerce')

# Macro columns that arrive as text and must be converted to numbers.
numeric_macro_features = [
    'inflation', 'key_rate', 'deposit_1', 'deposit_3', 'deposit_6', 'deposit_12',
    'fa_delta', 'usd_delta', 'IMOEX_delta', 'RGBI_delta'
]

# Convert each macro column in place, then drop rows where any of them
# failed to parse (or was missing).
for macro_col in numeric_macro_features:
    macro_df[macro_col] = clean_and_convert_numeric(macro_df[macro_col])
macro_df.dropna(subset=numeric_macro_features, inplace=True)

# --- Lag and delta features for the macro indicators ---
# The period start is renamed to the merge key and rows are ordered by it,
# so shift(k) refers to the row k positions earlier in that ordering.
macro_df_for_merge = (
    macro_df
    .rename(columns={'context_data_from': 'quarter_start_date'})
    .sort_values('quarter_start_date')
)
engineered_macro_features = list(numeric_macro_features)

for feature in numeric_macro_features:
    base_values = macro_df_for_merge[feature]

    # Values from one and two rows back.
    lag_names = [f'{feature}_lag{lag}' for lag in (1, 2)]
    for lag, lag_name in zip((1, 2), lag_names):
        macro_df_for_merge[lag_name] = base_values.shift(lag)

    # Change relative to the previous row.
    delta_name = f'{feature}_delta1'
    macro_df_for_merge[delta_name] = base_values - macro_df_for_merge[f'{feature}_lag1']

    # Register names in the same order as the columns were created:
    # lag1, lag2, delta1.
    engineered_macro_features.extend([*lag_names, delta_name])

# --- Contracts: parse the signing date and derive the merge key ---
date_col = 'Договор Дата Заключения'
contracts_df[date_col] = pd.to_datetime(contracts_df[date_col])
contract_dates = contracts_df[date_col].dt
# First day of the calendar quarter containing the signing date.
contracts_df['quarter_start_date'] = contract_dates.to_period('Q').dt.start_time

# --- Calendar features derived from the signing date ---
date_part_columns = {
    'contract_year': contract_dates.year,
    'contract_quarter': contract_dates.quarter,
    'contract_month': contract_dates.month,
    'contract_dayofweek': contract_dates.dayofweek,
}
for part_name, part_values in date_part_columns.items():
    contracts_df[part_name] = part_values
categorical_date_features = list(date_part_columns)

# --- Attach macro features to each contract by quarter start date ---
merged_df = contracts_df.merge(macro_df_for_merge, on='quarter_start_date', how='left')
all_features_for_model = [*engineered_macro_features, *categorical_date_features]
# Keep only rows that have every model feature and a target value.
merged_df.dropna(subset=[*all_features_for_model, 'cus_class'], inplace=True)

# --- Encode the target as consecutive integer labels ---
label_encoder = LabelEncoder()
merged_df['cus_class_encoded'] = label_encoder.fit_transform(merged_df['cus_class'])
# String form of the original class values, in encoded-label order.
class_names = label_encoder.classes_.astype(str)
num_classes = len(label_encoder.classes_)


Загрузка данных...
Предобработка данных и создание признаков...


# ===========2. ПОДГОТОВКА ВЫБОРКИ ДЛЯ ОБУЧЕНИЯ=============

In [17]:
print("Подготовка выборок и балансировка...")
y = merged_df['cus_class_encoded']
X = merged_df[all_features_for_model]

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# --- Balance the training set to a fixed number of rows per class ---
target_samples_per_class = 500

# Step 1: oversample every class that has fewer rows than the target;
# classes already at or above the target keep their current count.
train_class_counts = Counter(y_train)
oversampling_targets = {
    cls: max(cnt, target_samples_per_class)
    for cls, cnt in train_class_counts.items()
}
ros = RandomOverSampler(sampling_strategy=oversampling_targets, random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# Step 2: undersample every class down to exactly the target.
undersampling_targets = dict.fromkeys(Counter(y_resampled), target_samples_per_class)
rus = RandomUnderSampler(sampling_strategy=undersampling_targets, random_state=42)
X_train_final, y_train_final = rus.fit_resample(X_resampled, y_resampled)

print(f"Размер обучающей выборки после балансировки: {X_train_final.shape}")
print(f"Распределение классов в обучающей выборке: {sorted(Counter(y_train_final).items())}")

# Positional indices of the calendar features within the training frame.
cat_features_indices = list(map(X_train_final.columns.get_loc, categorical_date_features))

Подготовка выборок и балансировка...
Размер обучающей выборки после балансировки: (9000, 44)
Распределение классов в обучающей выборке: [(0, 500), (1, 500), (2, 500), (3, 500), (4, 500), (5, 500), (6, 500), (7, 500), (8, 500), (9, 500), (10, 500), (11, 500), (12, 500), (13, 500), (14, 500), (15, 500), (16, 500), (17, 500)]


# =========3. ВСПОМОГАТЕЛЬНАЯ ФУНКЦИЯ ДЛЯ ОЦЕНКИ================

In [18]:
# ==============================================================================
# 3. ВСПОМОГАТЕЛЬНАЯ ФУНКЦИЯ ДЛЯ ОЦЕНКИ
# ==============================================================================

def evaluate_and_log(model, X_test, y_test, model_name, class_names):
    """Evaluate a fitted classifier, print its metrics and log them to MLflow.

    Prints accuracy, macro F1, balanced accuracy and a per-class
    classification report, logs the three scalar metrics to the active
    MLflow run, and saves a confusion-matrix heatmap to
    ``confusion_matrix_<model_name lower-cased>.png`` in the working
    directory before logging that file as an artifact.

    Args:
        model: Fitted estimator exposing ``predict(X)``.
        X_test: Feature matrix passed to ``model.predict``.
        y_test: True labels. Assumed to be integer-encoded as
            ``0 .. len(class_names) - 1`` (as the LabelEncoder in this
            file produces) so that position ``i`` of ``class_names``
            names label ``i``.
        model_name: Display name used in the printout, the plot title and
            the artifact file name.
        class_names: Sequence of class display names in encoded-label order.

    Returns:
        None.
    """
    # Some estimators return predictions as an (n, 1) column rather than a
    # flat vector; flatten so every metric sees a 1-D array.
    y_pred = np.asarray(model.predict(X_test)).ravel()

    # Pin the label set explicitly. Without it, the report and the confusion
    # matrix only cover labels that occur in y_test/y_pred, so a class that is
    # absent from both would make class_names misaligned (or raise).
    labels = list(range(len(class_names)))

    # Scalar metrics
    accuracy = accuracy_score(y_test, y_pred)
    f1_macro = f1_score(y_test, y_pred, average='macro')
    balanced_acc = balanced_accuracy_score(y_test, y_pred)

    print(f"\n--- Результаты для {model_name} ---")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1-score (Macro): {f1_macro:.4f}")
    print(f"Balanced Accuracy: {balanced_acc:.4f}")
    print("\nClassification Report:")
    print(classification_report(
        y_test, y_pred, labels=labels, target_names=class_names, zero_division=0
    ))

    # Log metrics to the active MLflow run
    mlflow.log_metric("test_accuracy", accuracy)
    mlflow.log_metric("test_f1_macro", f1_macro)
    mlflow.log_metric("test_balanced_accuracy", balanced_acc)

    # Confusion matrix: rows are actual classes, columns are predicted ones.
    cm = confusion_matrix(y_test, y_pred, labels=labels)
    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        sns.heatmap(
            cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, ax=ax
        )
        ax.set_title(f'Confusion Matrix - {model_name}')
        ax.set_xlabel('Predicted')
        ax.set_ylabel('Actual')

        cm_path = f"confusion_matrix_{model_name.lower()}.png"
        fig.savefig(cm_path)
        mlflow.log_artifact(cm_path)
    finally:
        # Always release the figure, even if saving or logging fails.
        plt.close(fig)

# ==================4. ОБУЧЕНИЕ МОДЕЛЕЙ======================

# ----------------- Блок LightGBM -----------------

In [19]:
print("\n--- Обучение LightGBM ---")
with mlflow.start_run(run_name="LightGBM"):
    mlflow.log_param("model_type", "LightGBM")

    lgbm_params = {
        'objective': 'multiclass',
        'num_class': num_classes,
        'metric': 'multi_logloss',
        'n_estimators': 1000,  # upper bound; early stopping picks the actual count
        'learning_rate': 0.03,
        'num_leaves': 31,
        'max_depth': -1,
        'random_state': 42,
        'n_jobs': -1,
        'colsample_bytree': 0.8,
        'subsample': 0.8,
        # LightGBM only applies row subsampling when the bagging frequency is
        # non-zero; without this the 'subsample' value above has no effect.
        'subsample_freq': 1,
    }
    mlflow.log_params(lgbm_params)

    model_lgb = lgb.LGBMClassifier(**lgbm_params)
    # NOTE(review): early stopping is driven by the same (X_test, y_test) that
    # is used for the final evaluation below, so the reported test metrics are
    # optimistically biased. A separate validation split, taken before the
    # training data is resampled, should be used here instead.
    model_lgb.fit(
        X_train_final, y_train_final,
        eval_set=[(X_test, y_test)],
        eval_metric='multi_logloss',
        callbacks=[lgb.early_stopping(50, verbose=False)],
        categorical_feature=cat_features_indices
    )

    # Record how many boosting rounds were actually kept by early stopping,
    # since the logged 'n_estimators' is only the upper bound.
    best_iteration = model_lgb.best_iteration_
    if best_iteration:
        mlflow.log_metric("best_iteration", best_iteration)

    evaluate_and_log(model_lgb, X_test, y_test, "LightGBM", class_names)
    mlflow.lightgbm.log_model(model_lgb, "model")
    # Store the label encoder with the run so predictions can be mapped back
    # to the original class values.
    joblib.dump(label_encoder, "label_encoder.joblib")
    mlflow.log_artifact("label_encoder.joblib")


--- Обучение LightGBM ---
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000848 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1196
[LightGBM] [Info] Number of data points in the train set: 9000, number of used features: 44
[LightGBM] [Info] Start training from score -2.890372
[LightGBM] [Info] Start training from score -2.890372
[LightGBM] [Info] Start training from score -2.890372
[LightGBM] [Info] Start training from score -2.890372
[LightGBM] [Info] Start training from score -2.890372
[LightGBM] [Info] Start training from score -2.890372
[LightGBM] [Info] Start training from score -2.890372
[LightGBM] [Info] Start training from score -2.890372
[LightGBM] [Info] Start training from score -2.890372
[LightGBM] [Info] Start training from score -2.890372
[LightGBM] [Info] Start training from score -2.890372
[LightGBM] [Info] Start training from score -2.890372
[LightGBM] [Info] Start training from



# ----------------- Блок XGBoost -----------------

In [20]:
print("\n--- Обучение XGBoost ---")
with mlflow.start_run(run_name="XGBoost"):
    mlflow.log_param("model_type", "XGBoost")

    # XGBoost needs categorical features as pandas 'category' dtype.
    # Train and test must share ONE category list per column: casting each
    # frame independently derives the categories (and therefore the integer
    # codes) from the values present in that frame only, so the same raw value
    # could get different codes in train and test.
    X_train_xgb = X_train_final.copy()
    X_test_xgb = X_test.copy()
    for col in categorical_date_features:
        shared_dtype = pd.CategoricalDtype(
            categories=np.union1d(X_train_xgb[col], X_test_xgb[col])
        )
        X_train_xgb[col] = X_train_xgb[col].astype(shared_dtype)
        X_test_xgb[col] = X_test_xgb[col].astype(shared_dtype)

    xgb_params = {
        'objective': 'multi:softmax',
        'num_class': num_classes,
        'n_estimators': 300,
        'learning_rate': 0.05,
        'max_depth': 7,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'eval_metric': 'mlogloss',
        'seed': 42,
        'enable_categorical': True,  # required for 'category' dtype columns
    }
    mlflow.log_params(xgb_params)

    model_xgb = xgb.XGBClassifier(**xgb_params)
    model_xgb.fit(X_train_xgb, y_train_final)

    evaluate_and_log(model_xgb, X_test_xgb, y_test, "XGBoost", class_names)
    # NOTE(review): the category lists built above are not stored with the
    # saved model; inference code must rebuild the same categorical dtypes.
    model_path = "xgb_model.json"
    model_xgb.save_model(model_path)
    mlflow.log_artifact(model_path, artifact_path="model")
    # Store the label encoder with the run so predictions can be mapped back
    # to the original class values.
    joblib.dump(label_encoder, "label_encoder.joblib")
    mlflow.log_artifact("label_encoder.joblib")



--- Обучение XGBoost ---

--- Результаты для XGBoost ---
Accuracy: 0.2443
F1-score (Macro): 0.1614
Balanced Accuracy: 0.2643

Classification Report:
              precision    recall  f1-score   support

         1.0       0.43      0.28      0.34       478
         2.0       0.03      0.33      0.05        12
         4.0       0.50      0.11      0.18       708
         5.0       0.59      0.11      0.18      1093
         6.0       0.10      0.74      0.18        31
         7.0       0.05      0.45      0.09        29
         8.0       0.05      0.09      0.06        69
        10.0       0.66      0.72      0.69       533
       100.0       0.01      0.25      0.02         8
       101.0       0.04      0.08      0.05        76
       102.0       0.05      0.10      0.07       105
       103.0       0.32      0.30      0.31       197
       104.0       0.02      0.13      0.03        23
       105.0       0.18      0.25      0.21       110
       106.0       0.26      0.18      

# ----------------- Блок CatBoost -----------------

In [21]:
print("\n--- Обучение CatBoost ---")
with mlflow.start_run(run_name="CatBoost"):
    mlflow.log_param("model_type", "CatBoost")

    catboost_params = dict(
        iterations=500,
        learning_rate=0.05,
        depth=8,
        loss_function='MultiClass',
        verbose=0,  # silence per-iteration training output
        random_seed=42,
    )
    mlflow.log_params(catboost_params)

    model_cb = cb.CatBoostClassifier(**catboost_params)
    # NOTE(review): the early-stopping set is the same (X_test, y_test) that
    # is scored below, so the reported test metrics are optimistically biased;
    # consider a separate validation split.
    model_cb.fit(
        X_train_final,
        y_train_final,
        eval_set=(X_test, y_test),
        cat_features=cat_features_indices,
        early_stopping_rounds=50,
    )

    evaluate_and_log(model_cb, X_test, y_test, "CatBoost", class_names)

    # Persist the model in CatBoost's own format and attach it to the run.
    model_path = "catboost_model.cbm"
    model_cb.save_model(model_path)
    mlflow.log_artifact(model_path, artifact_path="model")

    # Store the label encoder with the run so predictions can be mapped back
    # to the original class values.
    joblib.dump(label_encoder, "label_encoder.joblib")
    mlflow.log_artifact("label_encoder.joblib")


--- Обучение CatBoost ---

--- Результаты для CatBoost ---
Accuracy: 0.2661
F1-score (Macro): 0.1684
Balanced Accuracy: 0.2743

Classification Report:
              precision    recall  f1-score   support

         1.0       0.47      0.22      0.30       478
         2.0       0.03      0.33      0.06        12
         4.0       0.53      0.08      0.15       708
         5.0       0.58      0.17      0.26      1093
         6.0       0.08      0.65      0.14        31
         7.0       0.05      0.45      0.09        29
         8.0       0.06      0.06      0.06        69
        10.0       0.62      0.82      0.71       533
       100.0       0.01      0.25      0.02         8
       101.0       0.03      0.04      0.03        76
       102.0       0.06      0.12      0.08       105
       103.0       0.38      0.40      0.39       197
       104.0       0.02      0.13      0.04        23
       105.0       0.19      0.28      0.23       110
       106.0       0.26      0.18    