# Baseline

In [53]:
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, cross_validate, StratifiedGroupKFold, KFold

In [54]:
!pip freeze | grep "numpy\|pandas\|lightgbm\|scikit-learn"

lightgbm==4.3.0
numpy @ file:///C:/b/abs_c1ywpu18ar/croot/numpy_and_numpy_base_1708638681471/work/dist/numpy-1.26.4-cp311-cp311-win_amd64.whl#sha256=5dfd3e04dc1c2826d3f404fdc7f93c097901f5da9b91f4f394f79d4e038ed81d
numpydoc @ file:///C:/ci_311/numpydoc_1676453412027/work
pandas @ file:///C:/b/abs_fej9bi0gew/croot/pandas_1702318041921/work/dist/pandas-2.1.4-cp311-cp311-win_amd64.whl#sha256=d3609b7cc3e3c4d99ad640a4b8e710ba93ccf967ab8e5245b91033e0200f9286
scikit-learn @ file:///C:/b/abs_38k7ridbgr/croot/scikit-learn_1684954723009/work


## Загрузка данных

In [55]:
train_df = pd.read_parquet("train_data.pqt")
test_df = pd.read_parquet("test_data.pqt")

In [56]:
train_df.head(30)

Unnamed: 0,id,date,balance_amt_avg,balance_amt_max,balance_amt_min,balance_amt_day_avg,channel_code,city,city_type,index_city_code,...,cnt_cred_g_oper_3m,cnt_days_cred_g_oper_3m,sum_deb_h_oper_3m,cnt_deb_h_oper_3m,cnt_days_deb_h_oper_3m,sum_cred_h_oper_3m,cnt_cred_h_oper_3m,cnt_days_cred_h_oper_3m,start_cluster,end_cluster
0,0,month_1,0.744845,0.705492,1.287207,0.748101,channel_code_5,city_23,city_type_0,index_city_code_39,...,0.951166,0.568681,0.897565,0.553624,0.774354,0.936506,0.295984,0.967947,"{α, γ}",{other}
1,0,month_2,1.049605,0.831916,2.458609,1.053805,channel_code_5,city_23,city_type_0,index_city_code_39,...,0.948812,0.499716,0.785029,0.551904,0.696576,0.990157,0.298873,0.945969,"{α, γ}",{other}
2,0,month_3,0.692653,0.740253,0.430042,0.695747,channel_code_5,city_23,city_type_0,index_city_code_39,...,0.946458,0.442244,0.87705,0.551044,0.663243,0.810065,0.294829,0.956958,"{α, γ}",{other}
3,1,month_1,-0.081586,-0.09186,-0.11404,-0.08089,channel_code_2,city_14,city_type_0,,...,0.945281,0.407762,0.369318,0.567093,0.785465,-0.184002,0.253523,0.462452,{other},{other}
4,1,month_2,-0.094962,-0.100504,-0.119302,-0.094307,channel_code_2,city_14,city_type_0,,...,0.946066,0.43075,0.067275,0.559928,0.696576,-0.183854,0.255545,0.495419,{other},{other}
5,1,month_3,-0.090605,-0.114275,-0.114119,-0.089937,channel_code_2,city_14,city_type_0,,...,0.948027,0.488221,0.043221,0.560788,0.707687,-0.167905,0.259011,0.605309,{other},{other}
6,2,month_1,-0.154685,-0.186795,-0.122805,-0.154215,channel_code_12,city_613,city_type_306,,...,0.944497,0.384773,-0.165588,0.546889,0.407687,-0.178674,0.252657,0.440474,{α},{α}
7,2,month_2,-0.152784,-0.193686,-0.122805,-0.152308,channel_code_12,city_613,city_type_306,,...,0.944497,0.384773,-0.165588,0.546889,0.407687,-0.177854,0.252657,0.440474,{α},{α}
8,2,month_3,-0.148737,-0.187003,-0.112416,-0.148249,channel_code_12,city_613,city_type_306,,...,0.944497,0.384773,-0.165588,0.546889,0.407687,-0.176302,0.252368,0.429485,{α},{α}
9,3,month_1,-0.156643,-0.204861,-0.12566,-0.156179,channel_code_14,city_21,city_type_0,index_city_code_46,...,0.944497,0.384773,-0.165588,0.546889,0.407687,-0.201123,0.250924,0.37454,{α},{α}


In [57]:
test_df.head(3)

Unnamed: 0,id,date,balance_amt_avg,balance_amt_max,balance_amt_min,balance_amt_day_avg,channel_code,city,city_type,index_city_code,...,sum_cred_g_oper_3m,cnt_cred_g_oper_3m,cnt_days_cred_g_oper_3m,sum_deb_h_oper_3m,cnt_deb_h_oper_3m,cnt_days_deb_h_oper_3m,sum_cred_h_oper_3m,cnt_cred_h_oper_3m,cnt_days_cred_h_oper_3m,start_cluster
0,200000,month_4,-0.096224,0.335496,-0.125995,-0.095578,channel_code_12,city_14,city_type_0,,...,0.010952,0.946066,0.407762,-0.15395,0.548895,0.54102,0.031742,0.257278,0.561353,{α}
1,200000,month_5,-0.024255,-0.059806,-0.124295,-0.023381,channel_code_12,city_14,city_type_0,,...,0.006812,0.945281,0.396267,-0.150505,0.549468,0.552131,0.237817,0.264211,0.715199,{α}
2,200000,month_6,0.045988,0.049418,-0.125995,0.047079,channel_code_12,city_14,city_type_0,,...,0.006812,0.945281,0.396267,-0.1528,0.549468,0.54102,0.387566,0.268543,0.836079,


In [58]:
cat_cols = [
    "channel_code", "city", "city_type",
    "okved", "segment", "start_cluster",
    "index_city_code", "ogrn_month", "ogrn_year",
]

Обозначение категориальных признаков

In [59]:
train_df[cat_cols] = train_df[cat_cols].astype("category")
test_df[cat_cols] = test_df[cat_cols].astype("category")

Создаем выборки для валидации и обучения

In [60]:
X = train_df.drop(["id", "date", "end_cluster"], axis=1)
y = train_df["end_cluster"]

x_train, x_val, y_train, y_val = train_test_split(X, y,
                                                  test_size=0.2,
                                                  random_state=42)

## Обучение модели

В качестве базовой модели возьмем LGBM обучим на всех признаках

In [61]:
params = {
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "metric": "auc_mu",
    "max_depth": 10,
    "num_leaves": 20,
    "min_data_in_leaf": 10,
    "learning_rate": 0.01,
    "feature_fraction": 0.7,
    "bagging_fraction": 0.7,
    "bagging_freq": 5,
    "n_estimators": 100,
    "min_data_in_bin": 1,
    "max_bin": 64,
    "verbose": -1,
    "random_state": 42,
    "n_jobs": -1
}



In [62]:
from sklearn.model_selection import StratifiedKFold, LeavePOut, LeaveOneOut, ShuffleSplit

model = LGBMClassifier(**params)


# cv = KFold(n_splits=3)
cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=42)
auc_scores = []  # Список для хранения показателей AUC для каждого разбиения


In [63]:
def weighted_roc_auc(y_true, y_pred, labels, weights_dict):
    unnorm_weights = np.array([weights_dict[label] for label in labels])
    weights = unnorm_weights / unnorm_weights.sum()
    classes_roc_auc = roc_auc_score(y_true, y_pred, labels=labels,
                                    multi_class="ovr", average=None)
    return sum(weights * classes_roc_auc)

In [64]:
cluster_weights = pd.read_excel("cluster_weights.xlsx").set_index("cluster")
weights_dict = cluster_weights["unnorm_weight"].to_dict()

In [77]:

from sklearn.model_selection import RepeatedStratifiedKFold

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=4, random_state=12)
for train_idx, val_idx in cv.split(X, y):
    print(train_idx)

[     0      1      2 ... 599996 599997 599998]
[     0      1      2 ... 599992 599997 599999]
[     0      1      2 ... 599996 599998 599999]
[     0      3      4 ... 599997 599998 599999]
[     1      2      4 ... 599997 599998 599999]
[     0      1      2 ... 599997 599998 599999]
[     2      3      4 ... 599996 599997 599998]
[     0      1      2 ... 599997 599998 599999]
[     0      1      2 ... 599994 599996 599999]
[     0      1      3 ... 599997 599998 599999]
[     0      1      3 ... 599997 599998 599999]
[     0      1      2 ... 599996 599997 599999]
[     0      1      2 ... 599994 599997 599998]
[     0      2      3 ... 599996 599998 599999]
[     1      2      3 ... 599997 599998 599999]
[     0      1      2 ... 599997 599998 599999]
[     2      3      4 ... 599997 599998 599999]
[     0      1      3 ... 599996 599997 599998]
[     0      1      2 ... 599997 599998 599999]
[     0      1      2 ... 599994 599996 599999]


In [65]:

for train_idx, val_idx in cv.split(X, y):
    x_train, x_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model.fit(x_train, y_train)

    y_pred_proba = model.predict_proba(x_val)

    auc_score = weighted_roc_auc(y_val, y_pred_proba, model.classes_, weights_dict)
    auc_scores.append(auc_score)

mean_auc = np.mean(auc_scores)
std_auc = np.std(auc_scores)
print(f"Средний взвешенный ROC AUC: {mean_auc:.3f} ± {std_auc:.3f}")

Средний взвешенный ROC AUC: 0.907 ± 0.008
