In [1]:
selected_features = ['segment', 'avg_debet_turn_rur', 'worksalary_rur_amt',
       'avg_by_category__amount__sum__cashflowcategory_name__vydacha_nalichnyh_v_bankomate',
       'hdb_bki_total_pil_max_limit', 'hdb_bki_total_max_limit', 'month',
       'max_pil_largest_max_limit_actoff_90d',
       'max_cc_largest_max_limit_actoff_30d', 'curr_rur_amt_cm_avg',
       'hdb_bki_total_cc_max_limit', 'min_max_limit',
       'avg_6m_money_transactions', 'incomeValue', 'hdb_outstand_sum',
       'addrref', 'oldest_campaignsegment_ccode_for_nss',
       'amount_of_money', 'turn_other_db_max_v2', 'first_salary_income',
       'hdb_bki_total_max_overdue_sum',
       'total_rur_amt_cm_avg_period_days_ago_v2',
       'oldest_campaignsegment_ccode_for_pil',
       'transaction_category_supermarket_sum_amt_m3_4', 'usd_price',
       'mob_cnt_days', 'avg_6m_transportation',
       'channel_mobilnoe_prilozhenie_am_voc_features_12m_voc_with_expert_cnt',
       'avg_percents_inc', 'hdb_bki_active_ip_max_outstand',
       'turn_other_cr_avg_v2', 'avg_cnt_daily_transactions_90d',
       'avg_by_category__amount__sum__cashflowcategory_name__zdorove',
       'profit_income_out_rur_amt_9m', 'min_pil_max_score_actoff_180d',
       'hdb_bki_active_pil_max_overdue', 'total_rur_amt_cm_avg_div_v2',
       'by_category__amount__sum__eoperation_type_name__vneshnij_perevod_rur',
       'avg_6m_building_services', 'main_last_position_ccode',
       'accum_rur_amt_cm_avg_div_v2', 'uniV5',
       'transaction_category_supermarket_sum_cnt_d15',
       'avg_6m_personal_services', 'prof_cc_prof',
       'hdb_bki_active_cc_cnt', 'turn_cc_cr_max_v2', 'avg_3m_hotels',
       'min_cc_max_el_actoff_90d', 'commission_outcome_rur_amt',
       'percent_outcome_rur_amt', 'infl', 'mob_cover_days',
       'hdb_bki_other_active_auto_month_payments_sum', 'smsInWavg6m',
       'avg_by_category__amount__sum__cashflowcategory_name__detskie_igrushki',
       'summarur_1m_no_cat', 'atravel', 'turn_cc_db_min_v2', 'staff_flag',
       'puupg_offer_exists_pass_180d',
       'transaction_category_hotels_sum_amt_m2', 'mean_addrref_income', 'date_mean_target', 'above_200000.0',
        'above_180000.0', 'above_160000.0', 'above_140000.0', 'above_120000.0', 'above_100000.0', 'above_80000.0',
        'above_60000.0', 'above_40000.0', 'above_20000.0']

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

import catboost
from catboost import CatBoostRegressor

import optuna
from optuna.samplers import TPESampler

#from faiss_imputer import FaissImputer

from tqdm import tqdm

In [3]:
def weighted_mean_absolute_error(y_true, y_pred, weights):
    return (weights * np.abs(y_true - y_pred)).mean()

In [4]:
train_df = pd.read_csv('data3/train_df.csv')
val_df = pd.read_csv('data3/val_df.csv')

In [5]:
train_df['year'] = train_df['feature_date'].apply(lambda x: int(x[:4]))
train_df['month'] = train_df['feature_date'].apply(lambda x: int(x[6:7]))

val_df['year'] = val_df['feature_date'].apply(lambda x: int(x[:4]))
val_df['month'] = val_df['feature_date'].apply(lambda x: int(x[6:7]))

In [6]:
d_train = train_df.groupby('feature_date')['target'].agg('mean')
d_val = val_df.groupby('feature_date')['target'].agg('mean')

In [7]:
train_df['date_mean_target'] = train_df['feature_date'].apply(lambda x: d_train[x])
val_df['date_mean_target'] = val_df['feature_date'].apply(lambda x: d_val[x])

In [8]:
train_df = train_df[selected_features + ['target', 'w', 'feature_date']]
val_df = val_df[selected_features + ['target', 'w', 'feature_date']]

In [9]:
cat_feat = train_df.select_dtypes(include=['object']).columns.to_numpy()
for feature in cat_feat:
    train_df[feature] = train_df[feature].fillna('no_data')
    val_df[feature] = val_df[feature].fillna('no_data')

In [10]:
X_train = train_df.drop(['target', 'w'], axis=1)
y_train = train_df['target']
X_val = val_df.drop(['target', 'w'], axis=1)
y_val = val_df['target']

In [11]:
text_feat = ['main_last_position_ccode']

cat_feat_no_text = []
for el in cat_feat:
    if el not in text_feat:
        cat_feat_no_text.append(el)

In [12]:
train_df = pd.concat([train_df, val_df])

In [13]:
text_feat = ['main_last_position_ccode']

cat_feat_no_text = []
for el in cat_feat:
    if el not in text_feat:
        cat_feat_no_text.append(el)

In [14]:
i_date = -1
date_list = sorted(train_df['feature_date'].unique())
def get_next_date():
    global i_date
    i_date += 1
    if i_date >= len(date_list):
        i_date = 0
    return date_list[i_date]

In [15]:
def objective(trial):
    split_date = get_next_date()
    print(f'Next split date: {split_date}')
    
    val = train_df[train_df['feature_date'] == split_date]
    train = train_df[train_df['feature_date'] != split_date]
    
    
    X_train = train.drop(['target', 'w', 'feature_date'], axis=1)
    y_train = train['target']
    X_val = val.drop(['target', 'w', 'feature_date'], axis=1)
    y_val = val['target']
    
    val_weights = val['w']
    
    model = catboost.CatBoostRegressor(
        iterations=trial.suggest_int("iterations", 2000, 3000),
        learning_rate=trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True),
        depth=trial.suggest_int("depth", 6, 10),
        l2_leaf_reg=trial.suggest_float("l2_leaf_reg", 1e-8, 1.0, log=True),
        bootstrap_type=trial.suggest_categorical("bootstrap_type", ["MVS"]),
        random_strength=trial.suggest_float("random_strength", 1e-8, 10.0, log=True),
        od_type=trial.suggest_categorical("od_type", ["IncToDec"]),
        od_wait=trial.suggest_int("od_wait", 20, 40),
        verbose=400,
        #task_type='GPU'
    )
    model.fit(X_train, y_train, cat_features=cat_feat_no_text[:-1], text_features=text_feat)
    y_pred = model.predict(X_val)
    return weighted_mean_absolute_error(y_val, y_pred, val_weights)

In [16]:
optuna.logging.set_verbosity(optuna.logging.WARNING)

sampler = TPESampler(seed=1)
study = optuna.create_study(study_name="catboost", direction="minimize", sampler=sampler)
study.optimize(objective, n_trials=100, timeout=29000)

Next split date: 2022-09-30
0:	learn: 86132.0015123	total: 278ms	remaining: 11m 10s
400:	learn: 48230.4272524	total: 56.7s	remaining: 4m 45s
800:	learn: 46480.2207973	total: 1m 49s	remaining: 3m 40s
1200:	learn: 45243.3391173	total: 2m 41s	remaining: 2m 43s
1600:	learn: 44280.1697638	total: 3m 32s	remaining: 1m 48s
2000:	learn: 43400.6025222	total: 4m 23s	remaining: 54.8s
2400:	learn: 42660.6638593	total: 5m 14s	remaining: 2.1s
2416:	learn: 42632.4688325	total: 5m 16s	remaining: 0us
Next split date: 2022-10-31
0:	learn: 87170.2501553	total: 196ms	remaining: 7m 8s
400:	learn: 52332.1411763	total: 1m 19s	remaining: 5m 52s
800:	learn: 49910.6352802	total: 2m 37s	remaining: 4m 32s
1200:	learn: 48924.8046723	total: 3m 53s	remaining: 3m 11s
1600:	learn: 48195.2198753	total: 5m 8s	remaining: 1m 52s
2000:	learn: 47598.4918065	total: 6m 28s	remaining: 35.9s
2185:	learn: 47354.4561467	total: 7m 5s	remaining: 0us
Next split date: 2022-11-30
0:	learn: 84081.1473580	total: 152ms	remaining: 5m 33s
4

2400:	learn: 37099.1170316	total: 40m 3s	remaining: 7m 26s
2800:	learn: 36113.7571658	total: 46m 31s	remaining: 45.8s
2846:	learn: 35989.1556668	total: 47m 15s	remaining: 0us
Next split date: 2023-03-31
0:	learn: 88345.6712280	total: 457ms	remaining: 22m 41s
400:	learn: 50788.5109014	total: 3m 5s	remaining: 19m 57s
800:	learn: 48468.5067645	total: 6m 9s	remaining: 16m 46s
1200:	learn: 46941.1674413	total: 9m 10s	remaining: 13m 36s
1600:	learn: 45801.9762425	total: 12m 9s	remaining: 10m 29s
2000:	learn: 44837.7573557	total: 15m 11s	remaining: 7m 26s
2400:	learn: 44022.6191273	total: 18m 3s	remaining: 4m 22s
2800:	learn: 43306.3477864	total: 20m 57s	remaining: 1m 21s
2981:	learn: 43000.5125773	total: 22m 15s	remaining: 0us
Next split date: 2023-04-30
0:	learn: 87170.5476376	total: 961ms	remaining: 41m
400:	learn: 52798.1924786	total: 6m 50s	remaining: 36m 50s
800:	learn: 49104.3334122	total: 13m 41s	remaining: 30m 3s
1200:	learn: 47474.0632408	total: 20m 32s	remaining: 23m 14s
1600:	lear

In [17]:
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

Number of finished trials:  27
Best trial:
  Value:  14546.99779037388
  Params: 
    iterations: 2773
    learning_rate: 0.09457082863778554
    depth: 8
    l2_leaf_reg: 0.01268524076966806
    bootstrap_type: MVS
    random_strength: 0.0006630796555209294
    od_type: IncToDec
    od_wait: 32
