In [8]:
import pandas as pd
import numpy as np
import seaborn as sns
import os

from scipy.stats import uniform, randint, loguniform
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from catboost import CatBoostRegressor

from fit_util import *

In [9]:
# X, y = load_data()
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=98987)

train_df, target = load_data()
X_train = train_df.iloc[np.random.choice(np.arange(len(train_df)), 300, False)]
y_train = target[X_train.index]

cat_features = np.where(X_train.loc[:, X_train.columns.values].dtypes == "object")[0]
to_categorical(X_train)

In [10]:
parameters = {
    "n_estimators": [1, 10, 20, 100, 1000, 5000, 7000, 9000, 10000],
    "learning_rate": uniform(1e-3,  1e-1),
    "depth": [1, 2, 3, 4, 5],
    "l2_leaf_reg": loguniform(10, 40)
    }

model = CatBoostRegressor( task_type="CPU", 
                           logging_level='Silent', 
                           random_seed=0,
                           cat_features=cat_features)

grid_model = RandomizedSearchCV(model, parameters, cv=5, n_jobs=-1)
grid_model.fit(X_train, y_train);

In [11]:
print(grid_model.best_params_)

{'depth': 4, 'l2_leaf_reg': 12.04252104127671, 'learning_rate': 0.05412027556607346, 'n_estimators': 7000}


In [12]:
X, y = load_data()
cat_features = np.where(X.loc[:, X.columns.values].dtypes == "object")[0]
to_categorical(X)

gs_model = CatBoostRegressor(**grid_model.best_params_,
                               task_type="CPU", 
                               logging_level='Silent', 
                               random_seed=0,
                               cat_features=cat_features)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=98987)
gs_model.fit(X_train, y_train);

In [13]:
evaluate(gs_model, X_train, y_train)
evaluate(gs_model, X_test, y_test)

RMSE: 0.001046892388252714
RMSE: 0.008941165645505358
