In [1]:
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, cross_validate, StratifiedGroupKFold, KFold
from catboost import CatBoostClassifier


In [2]:
train_df = pd.read_parquet("train_data.pqt")
train_df.__len__

<bound method DataFrame.__len__ of             id     date  balance_amt_avg  balance_amt_max  balance_amt_min  \
0            0  month_1         0.744845         0.705492         1.287207   
1            0  month_2         1.049605         0.831916         2.458609   
2            0  month_3         0.692653         0.740253         0.430042   
3            1  month_1        -0.081586        -0.091860        -0.114040   
4            1  month_2        -0.094962        -0.100504        -0.119302   
...        ...      ...              ...              ...              ...   
599995  199998  month_2        -0.156775        -0.204960        -0.125987   
599996  199998  month_3        -0.156776        -0.204960        -0.125995   
599997  199999  month_1        -0.156712        -0.204913        -0.125831   
599998  199999  month_2        -0.156712        -0.204913        -0.125831   
599999  199999  month_3        -0.156712        -0.204913        -0.125831   

        balance_amt_day_avg 

In [3]:
train_df.head(12)

Unnamed: 0,id,date,balance_amt_avg,balance_amt_max,balance_amt_min,balance_amt_day_avg,channel_code,city,city_type,index_city_code,...,cnt_cred_g_oper_3m,cnt_days_cred_g_oper_3m,sum_deb_h_oper_3m,cnt_deb_h_oper_3m,cnt_days_deb_h_oper_3m,sum_cred_h_oper_3m,cnt_cred_h_oper_3m,cnt_days_cred_h_oper_3m,start_cluster,end_cluster
0,0,month_1,0.744845,0.705492,1.287207,0.748101,channel_code_5,city_23,city_type_0,index_city_code_39,...,0.951166,0.568681,0.897565,0.553624,0.774354,0.936506,0.295984,0.967947,"{α, γ}",{other}
1,0,month_2,1.049605,0.831916,2.458609,1.053805,channel_code_5,city_23,city_type_0,index_city_code_39,...,0.948812,0.499716,0.785029,0.551904,0.696576,0.990157,0.298873,0.945969,"{α, γ}",{other}
2,0,month_3,0.692653,0.740253,0.430042,0.695747,channel_code_5,city_23,city_type_0,index_city_code_39,...,0.946458,0.442244,0.87705,0.551044,0.663243,0.810065,0.294829,0.956958,"{α, γ}",{other}
3,1,month_1,-0.081586,-0.09186,-0.11404,-0.08089,channel_code_2,city_14,city_type_0,,...,0.945281,0.407762,0.369318,0.567093,0.785465,-0.184002,0.253523,0.462452,{other},{other}
4,1,month_2,-0.094962,-0.100504,-0.119302,-0.094307,channel_code_2,city_14,city_type_0,,...,0.946066,0.43075,0.067275,0.559928,0.696576,-0.183854,0.255545,0.495419,{other},{other}
5,1,month_3,-0.090605,-0.114275,-0.114119,-0.089937,channel_code_2,city_14,city_type_0,,...,0.948027,0.488221,0.043221,0.560788,0.707687,-0.167905,0.259011,0.605309,{other},{other}
6,2,month_1,-0.154685,-0.186795,-0.122805,-0.154215,channel_code_12,city_613,city_type_306,,...,0.944497,0.384773,-0.165588,0.546889,0.407687,-0.178674,0.252657,0.440474,{α},{α}
7,2,month_2,-0.152784,-0.193686,-0.122805,-0.152308,channel_code_12,city_613,city_type_306,,...,0.944497,0.384773,-0.165588,0.546889,0.407687,-0.177854,0.252657,0.440474,{α},{α}
8,2,month_3,-0.148737,-0.187003,-0.112416,-0.148249,channel_code_12,city_613,city_type_306,,...,0.944497,0.384773,-0.165588,0.546889,0.407687,-0.176302,0.252368,0.429485,{α},{α}
9,3,month_1,-0.156643,-0.204861,-0.12566,-0.156179,channel_code_14,city_21,city_type_0,index_city_code_46,...,0.944497,0.384773,-0.165588,0.546889,0.407687,-0.201123,0.250924,0.37454,{α},{α}


In [272]:
cat_cols = [
    "channel_code", "city", "city_type",
    "okved", "segment", "start_cluster",
    "index_city_code", "ogrn_month", "ogrn_year",
]

In [273]:

train_df[cat_cols] = train_df[cat_cols].astype("category")

In [274]:
train_df = train_df.drop(["id", "date"], axis=1)


In [278]:
cluster_weights = pd.read_excel("cluster_weights.xlsx").set_index("cluster")
weights_dict = cluster_weights["unnorm_weight"].to_dict()
cat_features = [col for col in test_df_missing.columns if test_df_missing[col].dtype.name == 'category' or test_df_missing[col].dtype.name == 'object']
for col in cat_features:
   test_df_missing[col] = test_df_missing[col].astype('category')

cat_features = [col for col in test_df_missing.columns if test_df_missing[col].dtype.name == 'category' or test_df_missing[col].dtype.name == 'object']
for col in cat_features:
    test_df_missing[col] = test_df_missing[col].cat.add_categories('Missing').fillna('Missing')

cat_features

['channel_code',
 'city',
 'city_type',
 'index_city_code',
 'ogrn_month',
 'ogrn_year',
 'okved',
 'segment']

In [279]:
models = [CatBoostClassifier().load_model(f'model_14d_611i_8split\model_{i}.cbm') for i in range(8)]
#models = [CatBoostClassifier().load_model(f'model_{i}.cbm') for i in range(8)]
predictions = np.array([model.predict(test_df_missing) for model in models])

predictions_transposed = predictions.T


In [280]:
predictions_transposed

array([[['{α}', '{α}', '{α}', ..., '{α}', '{α}', '{α}'],
        ['{α}', '{α}', '{α}', ..., '{α}', '{α}', '{α}'],
        ['{other}', '{other}', '{other}', ..., '{other}', '{other}',
         '{other}'],
        ...,
        ['{α}', '{α}', '{α}', ..., '{α}', '{α}', '{α}'],
        ['{α}', '{α}', '{α}', ..., '{α}', '{α}', '{α}'],
        ['{α}', '{α}', '{α, γ}', ..., '{α}', '{α}', '{α}']]], dtype=object)

In [281]:
import numpy as np

consensus_predictions = []
# print(predictions_transposed)
for row in predictions_transposed[0]:
    print(row)
    vals, counts = np.unique(row, return_counts=True)
    print(vals)
    print(counts)
    index = np.argmax(counts)
    consensus_predictions.append(vals[index])


consensus_predictions = np.array(consensus_predictions)



['{α}' '{α}' '{α}' '{α}' '{α}' '{α}' '{α}' '{α}']
['{α}']
[8]
['{α}' '{α}' '{α}' '{α}' '{α}' '{α}' '{α}' '{α}']
['{α}']
[8]
['{other}' '{other}' '{other}' '{other}' '{other}' '{other}' '{other}'
 '{other}']
['{other}']
[8]
['{α}' '{α}' '{α}' '{α}' '{α}' '{α}' '{α}' '{α}']
['{α}']
[8]
['{α}' '{α}' '{α}' '{α}' '{α}' '{α}' '{α}' '{α}']
['{α}']
[8]
['{α, δ}' '{α, δ}' '{α}' '{other}' '{α}' '{α, δ}' '{α}' '{other}']
['{other}' '{α, δ}' '{α}']
[2 3 3]
['{α}' '{α}' '{α}' '{α}' '{α}' '{α}' '{α}' '{α}']
['{α}']
[8]
['{α}' '{α}' '{α}' '{α}' '{α}' '{α}' '{α}' '{α}']
['{α}']
[8]
['{α}' '{α}' '{α}' '{α}' '{α}' '{α}' '{α}' '{α}']
['{α}']
[8]
['{α}' '{α}' '{α}' '{α}' '{α}' '{α}' '{α}' '{α}']
['{α}']
[8]
['{α, μ}' '{α}' '{α}' '{α, η}' '{α}' '{α}' '{α}' '{α, η}']
['{α, η}' '{α, μ}' '{α}']
[2 1 5]
['{α}' '{α}' '{α}' '{α}' '{α}' '{α}' '{α}' '{α}']
['{α}']
[8]
['{α}' '{α}' '{α}' '{α}' '{α}' '{α}' '{}' '{α}']
['{}' '{α}']
[1 7]
['{α}' '{α}' '{α}' '{α}' '{α}' '{α}' '{α}' '{α}']
['{α}']
[8]
['{α}' '{α, η}' '{

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [282]:
consensus_predictions

array(['{α}', '{α}', '{other}', ..., '{α}', '{α}', '{α}'], dtype='<U9')

In [283]:

# Теперь можно безопасно заполнять пропуски в 'start_cluster'
for i, idx in enumerate(missing_indices):
    test_df.at[idx, 'start_cluster'] = consensus_predictions[i]




In [284]:
test_df.head(30)

Unnamed: 0,balance_amt_avg,balance_amt_max,balance_amt_min,balance_amt_day_avg,channel_code,city,city_type,index_city_code,ogrn_days_end_month,ogrn_days_end_quarter,...,sum_cred_g_oper_3m,cnt_cred_g_oper_3m,cnt_days_cred_g_oper_3m,sum_deb_h_oper_3m,cnt_deb_h_oper_3m,cnt_days_deb_h_oper_3m,sum_cred_h_oper_3m,cnt_cred_h_oper_3m,cnt_days_cred_h_oper_3m,start_cluster
0,-0.096224,0.335496,-0.125995,-0.095578,channel_code_12,city_14,city_type_0,,-1.533705,-1.683741,...,0.010952,0.946066,0.407762,-0.15395,0.548895,0.54102,0.031742,0.257278,0.561353,{α}
1,-0.024255,-0.059806,-0.124295,-0.023381,channel_code_12,city_14,city_type_0,,-1.533705,-1.683741,...,0.006812,0.945281,0.396267,-0.150505,0.549468,0.552131,0.237817,0.264211,0.715199,{α}
2,0.045988,0.049418,-0.125995,0.047079,channel_code_12,city_14,city_type_0,,-1.533705,-1.683741,...,0.006812,0.945281,0.396267,-0.1528,0.549468,0.54102,0.387566,0.268543,0.836079,{α}
3,-0.156722,-0.20492,-0.125856,-0.156258,channel_code_9,city_76,city_type_0,,0.092087,1.22003,...,-0.028584,,,-0.165588,,,-0.201123,,,{α}
4,-0.156722,-0.20492,-0.125856,-0.156258,channel_code_9,city_76,city_type_0,,0.092087,1.22003,...,-0.028584,,,-0.165588,,,-0.201123,,,{α}
5,-0.156722,-0.20492,-0.125856,-0.156258,channel_code_9,city_76,city_type_0,,0.092087,1.22003,...,-0.028584,,,-0.165588,,,-0.201123,,,{α}
6,-0.096506,0.185905,-0.125995,-0.095856,channel_code_12,city_14,city_type_0,index_city_code_78,-1.069193,-1.528873,...,0.51073,0.950774,0.545693,0.715525,0.554913,0.718798,0.445811,0.254968,0.495419,{other}
7,0.572242,1.502779,-0.125995,0.574963,channel_code_12,city_14,city_type_0,index_city_code_78,-1.069193,-1.528873,...,0.499912,0.949989,0.522704,2.442243,0.56394,0.84102,3.313686,0.256701,0.550364,{other}
8,-0.048015,0.448252,-0.125995,-0.047215,channel_code_12,city_14,city_type_0,index_city_code_78,-1.069193,-1.528873,...,0.123154,0.94685,0.453739,2.61487,0.565087,0.818798,4.449125,0.258723,0.627287,{other}
9,-0.156623,-0.204813,-0.125665,-0.156159,channel_code_7,city_31,city_type_0,,-0.256297,-1.257854,...,-0.028584,,,-0.165588,,,-0.201123,,,{α}


In [285]:

# test_df.to_csv('test_df_filled.csv', index=False)


In [286]:
test_df.__len__()

290120

In [287]:
# Загрузка оригинального датафрейма
original_test_df = pd.read_parquet("test_data.pqt")

# Обновление столбца 'start_cluster' в оригинальном датафрейме значениями из обработанного test_df
original_test_df['start_cluster'] = test_df['start_cluster']

# Сохранение обновлённого датафрейма в CSV
original_test_df.to_csv('test_df_filled_night_timur.csv', index=False)

