In [1]:
import numpy as np
import pandas as pd
from math import sin,cos,pi
from catboost import CatBoostRegressor,Pool,cv
from sklearn.model_selection import KFold
import optuna
import json

ModuleNotFoundError: No module named 'pytorch_tabnet'

In [19]:
data_train = pd.read_csv('/kaggle/input/vseros-ai-otb/train-2/train.csv',parse_dates=['agreement_date'])
data_test = pd.read_csv('/kaggle/input/vseros-ai-otb/test-3/test.csv',parse_dates=['agreement_date'])
sample_sub = pd.read_csv('/kaggle/input/vseros-ai-otb/sample_submission-7.csv')

In [10]:
data_train.head(5)

Unnamed: 0,region_name_cat,district_cat,corpus_cat,developer_cat,agreement_date,floor,square,rooms_4,location_logs_count_mean,location_depth,...,location_public_transport_platform_w_mean_distance,location_water_w_mean_distance,location_university_w_mean_distance,location_leisure_w_mean_distance,location_pop_shop_cnt,price_target,hc_name_cat,interior_cat,class_cat,stage_cat
0,Город,58,1331,91,2012-04-13,10.0,78.44,3,23.131066,13.0,...,0.894947,0.772872,1.309514,0.853183,7.0,25024.299281,36,0.0,97865,27728
1,Город,75,1677,91,2013-09-16,2.0,34.15,1,14.090185,13.0,...,1.063211,0.84013,-999.0,1.147596,0.0,18477.300271,372,32413.0,97865,70661
2,Пригород,48,316,10,2014-07-31,17.0,59.85,2,19.453795,13.0,...,0.832622,-999.0,-999.0,0.905435,1.0,17441.013879,336,8977.0,97865,12638
3,Пригород,48,1409,91,2012-12-30,12.0,67.53,2,13.178136,13.0,...,-999.0,1.322121,-999.0,1.263878,0.0,17019.139763,154,32413.0,97865,70661
4,Пригород,48,1590,91,2014-06-20,5.0,58.13,2,20.69177,13.0,...,0.825295,0.996629,-999.0,1.024595,2.0,17132.394908,154,32413.0,97865,12638


In [11]:
data_train['rooms_4'].value_counts()

rooms_4
1         45117
2         31175
3         13907
>=4        2108
студия     1148
Name: count, dtype: int64

In [14]:
def make_base_time_features(df):
    df['month'] = df['agreement_date'].apply(lambda x:x.month)
    df['day'] = df['agreement_date'].apply(lambda x:x.day)
    df['week'] = df['agreement_date'].apply(lambda x:x.week)
    df['quarter'] = df['agreement_date'].apply(lambda x:x.quarter)
    df['year'] = df['agreement_date'].apply(lambda x: x.year - 2012)
    
    df['weekofyear'] = df['agreement_date'].apply(lambda x:x.weekofyear)
    df['dayofweek'] = df['agreement_date'].apply(lambda x:x.dayofweek)
    df['dayofyear'] = df['agreement_date'].apply(lambda x:x.dayofyear)
    df['week'] = df['agreement_date'].apply(lambda x:x.week)
    
    df['all_time'] = df['year'] * 365 + df['dayofyear']
    df['all_week'] = df['year'] * 52 + df['weekofyear']
    return df

def get_polynoms_from_column(df,col):
    min_v = df[col].min()
    max_v = df[col].max()
    
    df[f'sin_{col}'] = df[col].apply(sin)
    df[f'cos_{col}'] = df[col].apply(cos)
    df[f'sin_{col}^2'] = df[col].apply(sin) * df[col].apply(sin)
    df[f'cos_{col}^2'] = df[col].apply(cos) * df[col].apply(cos)
    
    df[f'{col}_sin'] = df[col].apply(lambda x: sin((x - min_v) / max_v * 2 * pi))
    df[f'{col}_cos'] = df[col].apply(lambda x: cos((x - min_v) / max_v * 2 * pi))
    df[f'{col}_sin^2'] = df[f'{col}_sin'] * df[f'{col}_sin']
    df[f'{col}_cos^2'] = df[f'{col}_cos'] * df[f'{col}_cos']
    return df

def get_dop_features(df):
    day = 24
    year = 365.2425*day
    df['vday_sin'] = (df.hour * 2 * np.pi / day)
    df['vday_sin'] = df['vday_sin'].apply(sin)
    df['vday_cos'] = (df.hour * 2 * np.pi / day)
    df['vday_cos'] = df['vday_cos'].apply(cos)
    df['vyear_sin'] = (df.hour * 2 * np.pi / year)
    df['vyear_sin'] = df['vyear_sin'].apply(sin)
    df['vyear_cos'] = (df.hour * 2 * np.pi / year)
    df['vyear_cos'] = df['vyear_cos'].apply(cos)
    
    return df

In [15]:
def process(df):
    df = make_base_time_features(df)
    for col in ['month','day','dayofyear']:
        df = get_polynoms_from_column(df,col)
    df['interior_cat'] = df['interior_cat'].astype(int)
    df['rooms_4'] = df['rooms_4'].map(lambda x: 5 if x == '>=4' else x)
    df['rooms_4'] = df['rooms_4'].map(lambda x: 0 if x == 'студия' else x)
    return df

In [20]:
train_df = process(data_train)
test_df = process(data_test)

In [61]:
class CatBoostKFoldWraper():
    def __init__(self,parameters,n_folds,seed,cat_cols):
        self.parameters = parameters
        self.kfold = KFold(n_splits=n_folds,shuffle=True,random_state=seed)
        self.cat_cols = cat_cols
        
    def train_cv(self,X,y):
        self.scores = []
        self.models = []
        for train_idx,val_idx in self.kfold.split(X,y):
            
            train_pool = Pool(X.iloc[train_idx],
                              label=y.iloc[train_idx],
                              cat_features=self.cat_cols)
            val_pool = Pool(X.iloc[val_idx],
                              label=y.iloc[val_idx],
                              cat_features=self.cat_cols)
            model_cb = CatBoostRegressor(**self.parameters)
            model_cb.fit(train_pool,eval_set=val_pool,verbose=False)
            self.models += [model_cb]
            self.scores += [model_cb.get_best_score()['validation']['MAPE']]
        return sum(self.scores) / len(self.scores)

In [49]:
cat_cols = ['region_name_cat','district_cat','corpus_cat','developer_cat',
            'hc_name_cat','interior_cat','class_cat','stage_cat']
label_col = 'price_target'
drop_cols = ['agreement_date','price_target']

In [66]:
def objective(trial):
    params = {'iterations':1000,
              'loss_function':trial.suggest_categorical('objection',['MAE','RMSE','MAPE','LogCosh']),
              'learning_rate':trial.suggest_float('lr',1e-3,0.95),
              'random_seed':56,
              'eval_metric':'MAPE',
              'l2_leaf_reg':trial.suggest_float('l2_reg',0,5.0),
              'max_depth': trial.suggest_int('depth', 1, 12),
              'boosting_type': trial.suggest_categorical('boosting_type', ['Ordered', 'Plain']),
              'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS']),
              'border_count':trial.suggest_int('bc',1,256),
              'rsm':trial.suggest_float('rsm',0.1,0.9),
              'leaf_estimation_method':trial.suggest_categorical('lfs_type', ['Newton', 'Gradient','Exact']),
              'grow_policy':'SymmetricTree'
              }
    if params["bootstrap_type"] == "Bayesian":
        params["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif params["bootstrap_type"] == "Bernoulli":
        params["subsample"] = trial.suggest_float("subsample", 0.1, 1)
    
    wraper = CatBoostKFoldWraper(parameters=params,
                                 n_folds=5,
                                 seed=56,
                                 cat_cols=cat_cols)
    score = wraper.train_cv(train_df.drop(drop_cols,axis=1),train_df[label_col])
    params['score'] = score
    with open(f'chekpoint_{score}.json','w') as f :
        json.dump(params,f)
    return score

In [59]:
wandb_kwargs = {"project": "catboost_optuna"}
wandbc = optuna.integration.WeightsAndBiasesCallback(wandb_kwargs=wandb_kwargs)

  wandbc = optuna.integration.WeightsAndBiasesCallback(wandb_kwargs=wandb_kwargs)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [67]:
study = optuna.create_study(
        direction="minimize",
        study_name='expirement',
        load_if_exists=True,
        pruner=optuna.pruners.HyperbandPruner(),
        sampler=optuna.samplers.TPESampler()
    )

[I 2023-10-04 20:02:06,400] A new study created in memory with name: expirement


In [68]:
study.optimize(objective,
               n_trials=100,
               callbacks=[wandbc])

[I 2023-10-04 20:20:11,858] Trial 0 finished with value: 0.04592605217543971 and parameters: {'objection': 'RMSE', 'lr': 0.2905670731589108, 'l2_reg': 2.675902211144103, 'depth': 8, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bc': 142, 'rsm': 0.46084937917294755, 'lfs_type': 'Newton', 'bagging_temperature': 7.170519343878774}. Best is trial 0 with value: 0.04592605217543971.
[I 2023-10-04 20:22:36,629] Trial 1 finished with value: 0.9721840229830697 and parameters: {'objection': 'LogCosh', 'lr': 0.6157959594086743, 'l2_reg': 3.8610015651048246, 'depth': 10, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS', 'bc': 25, 'rsm': 0.24882659455379808, 'lfs_type': 'Gradient'}. Best is trial 0 with value: 0.04592605217543971.
[I 2023-10-04 20:29:48,142] Trial 2 finished with value: 0.04600454123266151 and parameters: {'objection': 'RMSE', 'lr': 0.0572084847889471, 'l2_reg': 1.502984534898939, 'depth': 6, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'bc': 169, 'rsm': 

CatBoostError: /src/catboost/catboost/private/libs/options/catboost_options.cpp:595: Newton leaves estimation method is not supoprted for MAPE loss function