# Основная модель

In [5]:
!pip install catboost



In [6]:
!pip freeze | grep "numpy\|pandas\|scikit-learn"

geopandas==0.14.4
numpy==1.26.4
pandas==2.2.3
pandas-datareader==0.10.0
pandas-gbq==0.25.0
pandas-profiling==3.6.6
pandas-stubs==2.2.2.240909
pandasql==0.7.3
scikit-learn==1.2.2
scikit-learn-intelex==2025.2.0
sklearn-pandas==2.2.0


In [2]:
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

## Загрузка данных

In [4]:
train_df = pd.read_parquet("data/train_data.pqt")
test_df = pd.read_parquet("data/test_data.pqt")

In [7]:
train_df.head(3)

Unnamed: 0,id,date,balance_amt_avg,balance_amt_max,balance_amt_min,balance_amt_day_avg,channel_code,city,city_type,index_city_code,...,cnt_cred_g_oper_3m,cnt_days_cred_g_oper_3m,sum_deb_h_oper_3m,cnt_deb_h_oper_3m,cnt_days_deb_h_oper_3m,sum_cred_h_oper_3m,cnt_cred_h_oper_3m,cnt_days_cred_h_oper_3m,start_cluster,end_cluster
0,0,month_1,0.744845,0.705492,1.287207,0.748101,channel_code_5,city_23,city_type_0,index_city_code_39,...,0.951166,0.568681,0.897565,0.553624,0.774354,0.936506,0.295984,0.967947,"{α, γ}",{other}
1,0,month_2,1.049605,0.831916,2.458609,1.053805,channel_code_5,city_23,city_type_0,index_city_code_39,...,0.948812,0.499716,0.785029,0.551904,0.696576,0.990157,0.298873,0.945969,"{α, γ}",{other}
2,0,month_3,0.692653,0.740253,0.430042,0.695747,channel_code_5,city_23,city_type_0,index_city_code_39,...,0.946458,0.442244,0.87705,0.551044,0.663243,0.810065,0.294829,0.956958,"{α, γ}",{other}


In [8]:
test_df.head(3)

Unnamed: 0,id,date,balance_amt_avg,balance_amt_max,balance_amt_min,balance_amt_day_avg,channel_code,city,city_type,index_city_code,...,sum_cred_g_oper_3m,cnt_cred_g_oper_3m,cnt_days_cred_g_oper_3m,sum_deb_h_oper_3m,cnt_deb_h_oper_3m,cnt_days_deb_h_oper_3m,sum_cred_h_oper_3m,cnt_cred_h_oper_3m,cnt_days_cred_h_oper_3m,start_cluster
0,200000,month_4,-0.096224,0.335496,-0.125995,-0.095578,channel_code_12,city_14,city_type_0,,...,0.010952,0.946066,0.407762,-0.15395,0.548895,0.54102,0.031742,0.257278,0.561353,{α}
1,200000,month_5,-0.024255,-0.059806,-0.124295,-0.023381,channel_code_12,city_14,city_type_0,,...,0.006812,0.945281,0.396267,-0.150505,0.549468,0.552131,0.237817,0.264211,0.715199,{α}
2,200000,month_6,0.045988,0.049418,-0.125995,0.047079,channel_code_12,city_14,city_type_0,,...,0.006812,0.945281,0.396267,-0.1528,0.549468,0.54102,0.387566,0.268543,0.836079,


Категориальные признаки:

In [9]:
cat_cols = [
    "channel_code", "city", "city_type",
    "okved", "segment", "start_cluster",
    "index_city_code", "ogrn_month", "ogrn_year",
]

Заполняем все пропуски во всех категориальных признаках, кроме start_cluster:

In [10]:
train_df[cat_cols] = train_df[cat_cols].astype("category")
test_df[cat_cols] = test_df[cat_cols].astype("category")

for col in cat_cols:
  if col != 'start_cluster':
    train_df[col] = train_df[col].astype(str).fillna("missing")
    test_df[col] = test_df[col].astype(str).fillna("missing")

In [11]:
from catboost import CatBoostClassifier

**Создаем выборки для валидации и обучения**

In [23]:
X = train_df.drop(["id", "date", "end_cluster"], axis=1)
y = train_df["end_cluster"]

x_train, x_val, y_train, y_val = train_test_split(X, y,
                                                  test_size=0.2,
                                                  random_state=42)

In [24]:
print(train_df.shape)


(600000, 93)


## Обучение модели

В качестве базовой модели возьмем Catboost, так как он хорошо работает с пропусками, обучим на всех признаках

In [25]:
cat_features = [X.columns.get_loc(col) for col in cat_cols]


In [26]:
from catboost import CatBoostClassifier


# так как у нас довольно большой датасет, мы решили взять 5000 итераций, и, соответственно, уменьшили для этого learning_rate
model = CatBoostClassifier(
    iterations=5000,
    learning_rate=0.02,
    depth=8,
    loss_function="MultiClass",
    eval_metric="MultiClass",
    verbose=100,
    cat_features=cat_features,
    task_type="GPU",
    random_seed=42,
)

model.fit(x_train, y_train, eval_set=(x_val, y_val), use_best_model=True, early_stopping_rounds=200)


0:	learn: 2.6647286	test: 2.6648750	best: 2.6648750 (0)	total: 243ms	remaining: 20m 13s
100:	learn: 0.9624750	test: 0.9667216	best: 0.9667216 (100)	total: 8.24s	remaining: 6m 39s
200:	learn: 0.8673932	test: 0.8733365	best: 0.8733365 (200)	total: 16.7s	remaining: 6m 38s
300:	learn: 0.8453949	test: 0.8525042	best: 0.8525042 (300)	total: 24.7s	remaining: 6m 26s
400:	learn: 0.8326307	test: 0.8420357	best: 0.8420357 (400)	total: 33s	remaining: 6m 18s
500:	learn: 0.8233340	test: 0.8353527	best: 0.8353527 (500)	total: 41.5s	remaining: 6m 12s
600:	learn: 0.8152072	test: 0.8300725	best: 0.8300725 (600)	total: 49.9s	remaining: 6m 5s
700:	learn: 0.8080719	test: 0.8258117	best: 0.8258117 (700)	total: 58.4s	remaining: 5m 58s
800:	learn: 0.8020495	test: 0.8223319	best: 0.8223319 (800)	total: 1m 6s	remaining: 5m 49s
900:	learn: 0.7959096	test: 0.8189488	best: 0.8189488 (900)	total: 1m 15s	remaining: 5m 42s
1000:	learn: 0.7905302	test: 0.8160536	best: 0.8160536 (1000)	total: 1m 23s	remaining: 5m 33s
1

<catboost.core.CatBoostClassifier at 0x7bdce3731900>

Зададим функцию для взвешенной метрики roc auc

In [27]:
def weighted_roc_auc(y_true, y_pred, labels, weights_dict):
    unnorm_weights = np.array([weights_dict[label] for label in labels])
    weights = unnorm_weights / unnorm_weights.sum()
    classes_roc_auc = roc_auc_score(y_true, y_pred, labels=labels,
                                    multi_class="ovr", average=None)
    return sum(weights * classes_roc_auc)

In [29]:
cluster_weights = pd.read_excel("data/cluster_weights.xlsx").set_index("cluster")
weights_dict = cluster_weights["unnorm_weight"].to_dict()

# Проверка работы модели - получаем предсказания на валидационной выборке

In [30]:
y_pred_proba = model.predict_proba(x_val)
y_pred_proba.shape

(120000, 17)

In [31]:
weighted_roc_auc(y_val, y_pred_proba, model.classes_, weights_dict)

0.947400150913914

## Прогноз на тестовой выборке

**Так как в тестовом датасете довольно много пропусков, а это самый значимый признак для дальнейшего предсказания, мы решили обучить отдельный катбуст на определение стартого кластера пользователя**

Загрузка модели (код обучения этой модели находится в файле под названием prediction_start_cluster.ipynb)

In [33]:
start_cluster_model = CatBoostClassifier()
start_cluster_model.load_model("models/catboost_for_start_cluster_model.cbm")

<catboost.core.CatBoostClassifier at 0x7bdce3fe5cc0>

**Для дальнейшей работы копируем датасет test_df в датасет test_test_df, из которого удаляем ненужные признаки и start_cluster, чтобы самим его потом предсказать**

In [34]:
test_test_df = test_df.drop(["id", "date", "start_cluster"], axis=1)

In [36]:
start_clusters_for_test = start_cluster_model.predict(test_test_df)

In [37]:
test_df['start_cluster'] = test_df['start_cluster'].where(test_df['start_cluster'].notna(), start_clusters_for_test)

In [38]:
test_df.pivot(index="id", columns="date", values="start_cluster").head(3)

date,month_4,month_5,month_6
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
200000,{α},{α},{α}
200001,{α},{α},{α}
200002,{other},{other},{other}


In [41]:
sample_submission_df = pd.read_csv("data/sample_submission.csv")

In [42]:
sample_submission_df.shape

(100000, 18)

In [43]:
sample_submission_df.head()

Unnamed: 0,id,{other},{},"{α, β}","{α, γ}","{α, δ}","{α, ε, η}","{α, ε, θ}","{α, ε, ψ}","{α, ε}","{α, η}","{α, θ}","{α, λ}","{α, μ}","{α, π}","{α, ψ}",{α},{λ}
0,200000,0.2,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
1,200001,0.2,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
2,200002,0.2,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
3,200003,0.2,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
4,200004,0.2,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05


Для тестовой выборки будем использовать только последний месяц

In [44]:
last_m_test_df = test_df[test_df["date"] == "month_6"]
last_m_test_df = last_m_test_df.drop(["id", "date"], axis=1)

In [45]:
test_pred_proba = model.predict_proba(last_m_test_df)
test_pred_proba_df = pd.DataFrame(test_pred_proba, columns=model.classes_)
sorted_classes = sorted(test_pred_proba_df.columns.to_list())
test_pred_proba_df = test_pred_proba_df[sorted_classes]

In [46]:
test_pred_proba_df.shape

(100000, 17)

In [47]:
test_pred_proba_df.head(2)

Unnamed: 0,{other},{},"{α, β}","{α, γ}","{α, δ}","{α, ε, η}","{α, ε, θ}","{α, ε, ψ}","{α, ε}","{α, η}","{α, θ}","{α, λ}","{α, μ}","{α, π}","{α, ψ}",{α},{λ}
0,0.009111,0.017351,0.03655,0.029289,0.004844,0.000129,0.000947,0.000584,0.019457,0.004049,0.014565,0.000232,0.000771,3.170902e-06,0.010034,0.85208,5e-06
1,0.00635,0.572106,0.000265,0.001086,0.00041,7.7e-05,0.000123,6e-06,0.001019,0.009679,0.000623,0.000157,0.000689,8.038626e-08,0.000723,0.406585,0.000101


In [48]:
from datetime import datetime

In [49]:
sample_submission_df[sorted_classes] = test_pred_proba_df
sample_submission_df.to_csv(f"{datetime.now()}_submission.csv", index=False)