In [1]:
# main 
import numpy as np
import pandas as pd

import joblib
from copy import deepcopy

# deal with warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Kaggle input directory holding the competition CSV files.
path = '/kaggle/input/playground-series-s4e5'

# Read both splits, using the first CSV column as the row index.
train = pd.read_csv(
    f'{path}/train.csv',
    index_col=0,
)
test = pd.read_csv(
    f'{path}/test.csv',
    index_col=0,
)

In [3]:
# Predictor columns only: the training frame without the target column.
train_features = train.drop('FloodProbability', axis=1).copy()

# PipeLine Maker

In [4]:
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score
from sklearn.base import clone 

# Numeric branch: median-impute missing values, then standardise.
default_num_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)

# Baseline preprocessing: send every numeric-dtype column through the
# numeric branch defined above.
preprocessing_baseline = make_column_transformer(
    (
        default_num_pipeline,
        make_column_selector(dtype_include=np.number),
    ),
)

In [5]:
from catboost import CatBoostRegressor
# Baseline estimator: shared preprocessing followed by a CatBoost regressor
# (verbose=1, otherwise library defaults).
model = make_pipeline(
    preprocessing_baseline,
    CatBoostRegressor(verbose=1),
)

# New features

In [6]:
def stat_features(data, only_stat=False):
    """Append row-wise summary statistics of all columns in ``data``.

    Every statistic is computed across the columns of ``data`` (axis=1), so
    ``data`` is expected to hold only numeric feature columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Input features. The frame is copied and never modified.
    only_stat : bool, default False
        When True, the original input columns are dropped and only the
        derived statistic columns are returned.

    Returns
    -------
    pandas.DataFrame
        The input columns (unless ``only_stat``) followed by: ``mean``,
        ``std``, ``median``, ``min_all``, ``max_all``, ``ptp``, ``q25``,
        ``q75``, ``q75_25_range``, ``powsum_1/2``, ``powsum_2``,
        ``powsum_3/2``, ``powsum_-1``, ``max_min_ratio``, ``q75_25_ratio``.
    """
    df = data.copy()
    cols = df.columns

    # Snapshot the original columns once, so every statistic is computed from
    # the same inputs and the frame is not re-sliced for each new feature.
    base = df[cols]

    # 'sum' is intentionally omitted: it is perfectly correlated with 'mean'.
    df['mean'] = base.mean(axis=1)
    df['std'] = base.std(axis=1)
    df['median'] = base.median(axis=1)
    df['min_all'] = base.min(axis=1)
    df['max_all'] = base.max(axis=1)
    # Use the np.ptp function: the ndarray.ptp *method* was removed in
    # NumPy 2.0, while the function form works on old and new versions.
    df['ptp'] = np.ptp(base.values, axis=1)
    df['q25'] = base.quantile(0.25, axis=1)
    df['q75'] = base.quantile(0.75, axis=1)

    # An explicit max-min range would duplicate 'ptp', so only the
    # inter-quartile range is added.
    df['q75_25_range'] = df['q75'] - df['q25']

    # Row-wise power sums.
    df['powsum_1/2'] = np.power(base, 1/2).sum(axis=1)
    df['powsum_2'] = np.power(base, 2).sum(axis=1)
    df['powsum_3/2'] = np.power(base, 1.5).sum(axis=1)
    # Shift by a small epsilon so zero entries do not divide by zero.
    df['powsum_-1'] = np.power(base + 1e-5, -1).sum(axis=1)

    # Ratios, with the denominator shifted to avoid division by zero.
    df['max_min_ratio'] = df['max_all'] / (df['min_all'] + 1e-5)
    df['q75_25_ratio'] = df['q75'] / (df['q25'] + 1e-5)

    if only_stat:
        # Drops every original input column (including a label column if the
        # caller left one in ``data``).
        df = df.drop(columns=cols)

    return df

In [7]:
# Model inputs are the row-wise statistics only (raw columns are dropped).
X_train, X_test = (
    stat_features(frame, only_stat=True)
    for frame in (train_features, test)
)

# Target kept as a one-column DataFrame.
y_train = train[['FloodProbability']]

In [8]:
# Summary statistics, one row per engineered feature.
X_train.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
mean,1117957.0,4.939411,0.415364,3.15,4.7,4.9,5.2,7.2
std,1117957.0,2.052674,0.369329,0.786398,1.785173,2.013115,2.282081,4.183615
median,1117957.0,4.777141,0.5644,2.0,4.5,5.0,5.0,8.0
min_all,1117957.0,1.604034,0.835383,0.0,1.0,2.0,2.0,5.0
max_all,1117957.0,9.262722,1.30484,4.0,8.0,9.0,10.0,19.0
ptp,1117957.0,7.658688,1.604804,2.0,7.0,7.0,9.0,18.0
q25,1117957.0,3.565401,0.595724,1.0,3.0,3.75,4.0,6.75
q75,1117957.0,6.1287,0.661493,3.25,6.0,6.0,6.25,10.25
q75_25_range,1117957.0,2.563299,0.795309,0.0,2.0,2.25,3.0,7.0
powsum_1/2,1117957.0,43.333922,2.045674,32.292121,42.069525,43.24654,44.714519,53.49086


In [9]:
# model.get_params().keys()

RS_CV

In [10]:
# from sklearn.model_selection import RandomizedSearchCV
# from scipy.stats import uniform, randint

# """
# scipy.stats.randint(a, b+1): for hyperparameters with discrete values that range from a to b, and all values in that range seem equally likely.
# scipy.stats.uniform(a, b): this is very similar, but for continuous hyperparameters.
# scipy.stats.geom(1 / scale): for discrete values, when you want to sample roughly in a given scale. E.g., 
#     with scale=1000 most samples will be in this ballpark, but ~10% of all samples will be <100 and ~10% will be >2300.
# scipy.stats.expon(scale): this is the continuous equivalent of geom. Just set scale to the most likely value.
# scipy.stats.loguniform(a, b): when you have almost no idea what the optimal hyperparameter value's scale is. 
#     If you set a=0.01 and b=100, then you're just as likely to sample a value between 0.01 and 0.1 as a value between 10 and 100.
# """

# param_distrib = {
#     'catboostregressor__learning_rate': [0.05, 0.07, 0.09, 0.3],
#     'catboostregressor__depth': randint(low=4, high=8),
#     'catboostregressor__l2_leaf_reg': randint(low=1, high=9),
#     'catboostregressor__grow_policy': ['SymmetricTree', 'Depthwise', 'Lossguide']
# }

# best_params = {
#     'catboostregressor__depth': 7,
#     'catboostregressor__grow_policy': 'SymmetricTree',
#     'catboostregressor__l2_leaf_reg': 5,
#     'catboostregressor__learning_rate': 0.09
# }

In [11]:
# rnd_search_cv = RandomizedSearchCV(model, 
#                                    param_distrib, 
#                                    n_iter=5, 
#                                    cv=3,
#                                    random_state=42, 
#                                    n_jobs=-1
#                                   )

In [12]:
# rnd_search_cv.fit(X_train, y_train)

In [13]:
# cv_res = pd.DataFrame(rnd_search_cv.cv_results_)
# cv_res

In [14]:
# rnd_search_cv.best_params_

# {'catboostregressor__depth': 7,
#  'catboostregressor__grow_policy': 'SymmetricTree',
#  'catboostregressor__l2_leaf_reg': 5,
#  'catboostregressor__learning_rate': 0.09}

In [15]:
# final_model = rnd_search_cv.best_estimator_
# joblib.dump(final_model, "CatBoost_stats_only.pkl")

In [16]:
# pd.DataFrame(final_model_reloaded[-1].get_feature_importance(),
#              index=final_model_reloaded.feature_names_in_).sort_values(by=0, ascending=False)

# PipeLine fit

> Random search sol

In [17]:
# A previously saved pipeline can be restored instead of retraining:
# final_model_reloaded = joblib.load("/kaggle/input/s4e5-catboost/scikitlearn/s4e5/1/CatBoost_stats_only.pkl")

# Train a fresh copy of the baseline pipeline so `model` itself stays unfitted.
final_model_reloaded = deepcopy(model)
final_model_reloaded.fit(X_train, y_train)

# Predict on the test features and wrap the result in a frame that carries
# the test ids as index and the target name as its single column.
test_pred = final_model_reloaded.predict(X_test)
sub = pd.Series(
    test_pred,
    index=test.index,
    name=y_train.columns[0],
).to_frame()
sub

Learning rate set to 0.124117
0:	learn: 0.0458651	total: 200ms	remaining: 3m 19s
1:	learn: 0.0414189	total: 314ms	remaining: 2m 36s
2:	learn: 0.0375730	total: 435ms	remaining: 2m 24s
3:	learn: 0.0343117	total: 554ms	remaining: 2m 18s
4:	learn: 0.0315326	total: 673ms	remaining: 2m 13s
5:	learn: 0.0292081	total: 783ms	remaining: 2m 9s
6:	learn: 0.0272635	total: 892ms	remaining: 2m 6s
7:	learn: 0.0256320	total: 1s	remaining: 2m 4s
8:	learn: 0.0243094	total: 1.11s	remaining: 2m 2s
9:	learn: 0.0232379	total: 1.22s	remaining: 2m
10:	learn: 0.0223479	total: 1.33s	remaining: 1m 59s
11:	learn: 0.0216285	total: 1.44s	remaining: 1m 58s
12:	learn: 0.0210577	total: 1.55s	remaining: 1m 57s
13:	learn: 0.0205962	total: 1.66s	remaining: 1m 56s
14:	learn: 0.0202388	total: 1.77s	remaining: 1m 56s
15:	learn: 0.0199485	total: 1.87s	remaining: 1m 55s
16:	learn: 0.0197150	total: 1.98s	remaining: 1m 54s
17:	learn: 0.0195315	total: 2.08s	remaining: 1m 53s
18:	learn: 0.0193840	total: 2.19s	remaining: 1m 53s
19:

Unnamed: 0_level_0,FloodProbability
id,Unnamed: 1_level_1
1117957,0.578949
1117958,0.457436
1117959,0.449562
1117960,0.466236
1117961,0.466377
...,...
1863257,0.474525
1863258,0.446139
1863259,0.621029
1863260,0.549138


> Optuna sol

In [18]:
import optuna

def objective(trial, val_size=0.2, random_state=0):
    """Optuna objective: validation R^2 of a CatBoost pipeline for one trial.

    A hold-out split is carved from ``X_train`` / ``y_train`` inside the
    function, so the score is measured on rows the model was not fitted on.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.
    val_size : float, default 0.2
        Fraction of the training rows held out for validation.
    random_state : int, default 0
        Seed for the train/validation split and for CatBoost, so repeated
        trials are evaluated on the same hold-out rows.

    Returns
    -------
    float
        R^2 on the hold-out rows (higher is better, so the study should use
        ``direction="maximize"``).
    """
    # Local import: train_test_split is not imported in the notebook's
    # import cells.
    from sklearn.model_selection import train_test_split

    # CatBoost treats `n_estimators`/`iterations` and `max_depth`/`depth` as
    # aliases and refuses to accept both members of a pair, so only one name
    # per pair is sampled here.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 200, 600),
        'max_depth': trial.suggest_int('max_depth', 7, 15),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.05, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
    }

    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train,
        y_train,
        test_size=val_size,
        random_state=random_state,
    )

    # Clone the preprocessing so each trial fits its own transformer instead
    # of refitting the shared `preprocessing_baseline` object in place.
    trial_model = make_pipeline(
        clone(preprocessing_baseline),
        CatBoostRegressor(
            **params,
            silent=True,
            eval_metric='RMSE',
            grow_policy='Depthwise',
            random_seed=random_state,
        ),
    )

    trial_model.fit(X_fit, y_fit)
    predictions = trial_model.predict(X_val)
    return r2_score(y_val, predictions)

In [None]:
# def objective(trial):
#     params = {
#         'num_leaves': trial.suggest_int('num_leaves', 100, 500),
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
#         'n_estimators': trial.suggest_int('n_estimators', 200, 600),
#         'subsample_for_bin': trial.suggest_int('subsample_for_bin', 20000, 300000),
#         'min_child_samples': trial.suggest_int('min_child_samples', 20, 500),
#         'reg_alpha': trial.suggest_float('reg_alpha', 1e-9, 10.0, log=True),
#         'reg_lambda': trial.suggest_float('reg_lambda', 1e-9, 10.0, log=True),
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
#         'subsample': trial.suggest_float('subsample', 0.25, 1.0),
#         'max_depth': trial.suggest_int('max_depth', 1, 15)
#     }
#     cv = KFold(5, shuffle=True, random_state=0)
#     cv_splits = cv.split(X_train, y_train)
#     scores = list()
#     model = LGBMRegressor(**params, objective='regression', random_state=0, device='gpu', verbosity = -1,)
#     for train_idx, val_idx in cv_splits:
#         X_train_fold, X_val_fold = X_train[train_idx], X_train[val_idx]
#         y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]
#
#         model.fit(X_train_fold, y_train_fold)
#
#         y_pred = model.predict(X_val_fold)
#         r2 = r2_score(y_val_fold, y_pred)
#         scores.append(r2)
#
#     return np.mean(scores)

# sqlite_db = "sqlite:///lgbm.db"
# study_name = "lgbm"
#
# if optimize:
#     study = optuna.create_study(storage=sqlite_db, study_name=study_name,
#                                 sampler=TPESampler(n_startup_trials=75, multivariate=True, seed=0),
#                                 direction="maximize", load_if_exists=True)
#
#     study.optimize(objective, n_trials=200)
#     print(f"best optimized R2: {study.best_value:0.5f}") # 0.86924
#     print(f"best hyperparameters: {study.best_params}")
#
#     lgbm_params = study.best_params

In [None]:
# CatBoost regressor configured for GPU training.
# NOTE: this rebinds `model`, which an earlier cell bound to the baseline
# preprocessing + CatBoost pipeline.
# Fixes relative to the previous version of this cell:
#   * removed a stray leading comma that made the cell a SyntaxError;
#   * replaced the LightGBM-style `objective='regression'` with CatBoost's
#     `loss_function='RMSE'`;
#   * dropped the LightGBM-style `device='gpu'`; `task_type="GPU"` together
#     with `devices` already selects the GPU.
model = CatBoostRegressor(
    verbose=False,
    loss_function='RMSE',
    eval_metric='RMSE',
    grow_policy='Depthwise',
    max_depth=9,
    min_child_samples=80,
    n_estimators=300,
    reg_lambda=6,
    random_state=0,
    task_type="GPU",
    devices='0',
)

In [19]:
# Set to True to write the predictions held in `sub` to 'submission.csv'.
MAKE_SUBMISSION = False

if MAKE_SUBMISSION:
    sub.to_csv(
        'submission.csv',
        index=True,
    )