In [1]:
import sys
sys.path.insert(0, '../')

import os 
from os.path import join as pjoin

import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from catboost import CatBoostClassifier, Pool
from optuna.samplers import TPESampler, RandomSampler

from src.hyperopt_wrap import ModelHyperOpt
from src.common_utils import prc_auc
from src.common_utils import preprocess_and_train, preprocess

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
DATA_PATH = pjoin(os.getcwd(), 'data')

### Load data

In [4]:
super_train = pd.read_csv(pjoin(DATA_PATH, 'train.csv'))
super_test = pd.read_csv(pjoin(DATA_PATH, 'test.csv'))

In [5]:
super_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
features_names = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']

cat_features = ['Sex', 'Cabin', 'Embarked']

num_features = list(set(features_names) - set(cat_features))

In [7]:
X_train_1, X_test, _, y_test = train_test_split(super_train, super_train['Survived'], random_state=4, test_size=0.1)
X_train, X_eval, y_train, y_eval = train_test_split(X_train_1, X_train_1['Survived'], random_state=5, test_size=0.1)

In [8]:
X_train.shape, X_test.shape, X_eval.shape

((720, 12), (90, 12), (81, 12))

#### baseline

In [9]:
baseline_dict = X_train[['Survived', 'Sex']].groupby('Sex').mean().to_dict()['Survived']

In [10]:
X_test.loc[X_test['Sex'] == 'male', 'baseline_prediction'] = 0
X_test.loc[X_test['Sex'] == 'female', 'baseline_prediction'] = 1

In [11]:
X_test['baseline_prediction'] = X_test['Sex'].replace(baseline_dict)

In [12]:
prc_auc(
    y_true = X_test['Survived'].values,
    y_score = X_test['baseline_prediction'].values
)

0.762

#### boosting default params

In [13]:
X_train[cat_features] = X_train[cat_features].fillna('no_data')
X_eval[cat_features] = X_eval[cat_features].fillna('no_data')
X_test[cat_features] = X_test[cat_features].fillna('no_data')

In [14]:
catboost_default = CatBoostClassifier(
    random_seed = 30,
    verbose = False,
    cat_features = cat_features
)

In [15]:
catboost_default.fit(
    X = X_train[features_names], 
    y = y_train,
    eval_set = (X_eval[features_names], y_eval),
    early_stopping_rounds = 10
)

<catboost.core.CatBoostClassifier at 0x7f43a2b2f160>

In [16]:
X_test['catboost_default'] = catboost_default.predict_proba(X = Pool(X_test[features_names], cat_features = cat_features))[:, 1]

In [17]:
prc_auc(
    y_true = X_test['Survived'].values,
    y_score = X_test['catboost_default'].values
)

0.864

### hyperopt for catboost

In [28]:
my_config_catboost = {
    "model": CatBoostClassifier,
    
    "hyperparameters": lambda trial: {
        'learning_rate': trial.suggest_float('learning_rate', 0.0001, 0.2),
        'iterations': trial.suggest_discrete_uniform('iterations', 100, 500, 10),
        'min_data_in_leaf' : trial.suggest_int('min_data_in_leaf', 1, 30),
        'depth' : trial.suggest_int('depth', 2, 5)
    },

    "fixed_hyperparameters": {
        "random_seed": 30,
        "verbose" : False
    },

    "metric": prc_auc,
    "direction": "maximize",
    "n_trials": 20,
    "target_name" : "target_lal",
    "cat_features" : cat_features
}

In [29]:
optimizer = ModelHyperOpt(train_set = (X_train[features_names], y_train), eval_set = (X_eval[features_names], y_eval), config = my_config_catboost)
best_params = optimizer.process()

[I 2023-09-07 15:58:28,226] A new study created in memory with name: no-name-80ed20a0-682c-4e44-b6ad-44d90914e4db
  'iterations': trial.suggest_discrete_uniform('iterations', 100, 500, 10),
[I 2023-09-07 15:58:28,778] Trial 0 finished with value: 0.805 and parameters: {'learning_rate': 0.19239529409532957, 'iterations': 420.0, 'min_data_in_leaf': 16, 'depth': 2}. Best is trial 0 with value: 0.805.
  'iterations': trial.suggest_discrete_uniform('iterations', 100, 500, 10),
[I 2023-09-07 15:58:29,239] Trial 1 finished with value: 0.791 and parameters: {'learning_rate': 0.15087750727879862, 'iterations': 280.0, 'min_data_in_leaf': 6, 'depth': 4}. Best is trial 0 with value: 0.805.
  'iterations': trial.suggest_discrete_uniform('iterations', 100, 500, 10),
[I 2023-09-07 15:58:30,116] Trial 2 finished with value: 0.842 and parameters: {'learning_rate': 0.043374244884538275, 'iterations': 500.0, 'min_data_in_leaf': 26, 'depth': 5}. Best is trial 2 with value: 0.842.
  'iterations': trial.sug

[I 2023-09-07 15:58:36,369] Trial 15 finished with value: 0.833 and parameters: {'learning_rate': 0.0740264381193268, 'iterations': 330.0, 'min_data_in_leaf': 13, 'depth': 2}. Best is trial 5 with value: 0.875.
  'iterations': trial.suggest_discrete_uniform('iterations', 100, 500, 10),
[I 2023-09-07 15:58:36,859] Trial 16 finished with value: 0.843 and parameters: {'learning_rate': 0.12316513618161247, 'iterations': 250.0, 'min_data_in_leaf': 1, 'depth': 5}. Best is trial 5 with value: 0.875.
  'iterations': trial.suggest_discrete_uniform('iterations', 100, 500, 10),
[I 2023-09-07 15:58:37,369] Trial 17 finished with value: 0.814 and parameters: {'learning_rate': 0.07314085187675474, 'iterations': 370.0, 'min_data_in_leaf': 17, 'depth': 2}. Best is trial 5 with value: 0.875.
  'iterations': trial.suggest_discrete_uniform('iterations', 100, 500, 10),
[I 2023-09-07 15:58:37,526] Trial 18 finished with value: 0.827 and parameters: {'learning_rate': 0.06202071033530124, 'iterations': 180.0

In [30]:
best_params.update(my_config_catboost['fixed_hyperparameters'])

In [31]:
catboost_hyperopt = CatBoostClassifier(**best_params)

In [32]:
catboost_hyperopt.fit(
    X = X_train[features_names], 
    y = y_train,
    cat_features = cat_features
)

<catboost.core.CatBoostClassifier at 0x7f4385962910>

In [33]:
X_test['catboost_hyperopt'] = catboost_hyperopt.predict_proba(X = Pool(X_test[features_names], cat_features = cat_features))[:, 1]

In [34]:
prc_auc(
    y_true = X_test['Survived'].values,
    y_score = X_test['catboost_hyperopt'].values
)

0.875

### hyperopt logreg

In [35]:
logreg_combo = preprocess_and_train(
    X_train = X_train[features_names].copy(),
    y_train = y_train.copy(),
    features_names = features_names,
    cat_features = cat_features,
    num_features = num_features,
    params = {'penalty' : 'l1', 'max_iter' : 20, 'solver': 'liblinear'}
)



In [36]:
X_test_preprocessed = preprocess(
    X = X_test.copy(),
    features_names = features_names,
    cat_features = cat_features,
    num_features = num_features,
    return_obj = False,
    num_features_means = logreg_combo['num_means'],
    encoder = logreg_combo['encoder'],
    scaler = logreg_combo['scaler']
)



In [37]:
X_test['logreg_default'] = logreg_combo['model'].predict_proba(X_test_preprocessed)[:, 1]

In [38]:
prc_auc(
    y_true = X_test['Survived'].values,
    y_score = X_test['logreg_default'].values
)

0.812

In [39]:
#TODO: need a model class with preprocessing inside to make my own model object and pass it to config['model']
#now preprocessing relies on current split, that's not good

X_train_preprocessed = preprocess(
    X = X_train.copy(),
    features_names = features_names,
    cat_features = cat_features,
    num_features = num_features,
    return_obj = False,
    num_features_means = logreg_combo['num_means'],
    encoder = logreg_combo['encoder'],
    scaler = logreg_combo['scaler']
)

X_eval_preprocessed = preprocess(
    X = X_eval.copy(),
    features_names = features_names,
    cat_features = cat_features,
    num_features = num_features,
    return_obj = False,
    num_features_means = logreg_combo['num_means'],
    encoder = logreg_combo['encoder'],
    scaler = logreg_combo['scaler']
)



In [42]:
my_config_logreg = {
    "model": LogisticRegression,
    
    "hyperparameters": lambda trial: {
        'C': trial.suggest_float('C', 0.5, 1.5),
        'max_iter': trial.suggest_int('max_iter', 7, 100),
        'solver': trial.suggest_categorical('solver',  ['lbfgs', "liblinear", 'newton-cholesky'])
    },

    "fixed_hyperparameters": {
        "random_state": 30
    },

    "metric": prc_auc,
    "direction": "maximize",
    "n_trials": 30,
    "sampler" : TPESampler
}

In [43]:
optimizer = ModelHyperOpt(train_set = (X_train_preprocessed, y_train), eval_set = (X_eval_preprocessed, y_eval), config = my_config_logreg)
best_params_logreg = optimizer.process()

[I 2023-09-07 15:59:12,564] A new study created in memory with name: no-name-c2ecd5e6-0b15-4329-83e9-1eee223af30a
[I 2023-09-07 15:59:12,578] Trial 0 finished with value: 0.807 and parameters: {'C': 0.7370971676590244, 'max_iter': 60, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.807.
[I 2023-09-07 15:59:12,587] Trial 1 finished with value: 0.785 and parameters: {'C': 0.6495833832998459, 'max_iter': 32, 'solver': 'liblinear'}. Best is trial 0 with value: 0.807.
[I 2023-09-07 15:59:12,596] Trial 2 finished with value: 0.807 and parameters: {'C': 0.5869151569660007, 'max_iter': 32, 'solver': 'liblinear'}. Best is trial 0 with value: 0.807.
[I 2023-09-07 15:59:12,606] Trial 3 finished with value: 0.754 and parameters: {'C': 0.5260004036309117, 'max_iter': 26, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.807.
[I 2023-09-07 15:59:12,615] Trial 4 finished with value: 0.817 and parameters: {'C': 1.1679510769199788, 'max_iter': 65, 'solver': 'liblinear'}. Best is trial 4 with value: 0

In [44]:
best_params_logreg.update(my_config_logreg['fixed_hyperparameters'])

In [45]:
best_params_logreg

{'C': 0.9406371394965491,
 'max_iter': 46,
 'solver': 'newton-cholesky',
 'random_state': 30}

In [46]:
logreg_combo_new = preprocess_and_train(
    X_train = X_train[features_names].copy(),
    y_train = y_train.copy(),
    features_names = features_names,
    cat_features = cat_features,
    num_features = num_features,
    params = best_params_logreg
)



In [47]:
X_test['logreg_hyperopt'] = logreg_combo_new['model'].predict_proba(X_test_preprocessed)[:, 1]

In [48]:
prc_auc(
    y_true = X_test['Survived'].values,
    y_score = X_test['logreg_hyperopt'].values
)

0.824