# Searching for the best hyperparameter set

# AdaBoost optimization

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import cross_val_score
import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances, plot_slice, plot_parallel_coordinate
import os
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from IPython.display import display, HTML

# Inject a CSS override so the notebook container spans the full browser width.
full_width_css = "<style>.container { width:100% !important; }</style>"
display(HTML(full_width_css))

In [3]:
# Load the pre-processed train / validation splits from their feather files.
train_df, val_df = map(
    pd.read_feather,
    ('./data/train_processed.ftr', './data/val_processed.ftr'),
)
# Column names of the training frame; later cells use the last one as the target.
cols = train_df.columns.tolist()

In [4]:
# Display the column list (features first, target column last).
cols

['is_g734s',
 'CryoSleep',
 'VIP',
 'Europa',
 'Mars',
 'PSO J318.5-22',
 'TRAPPIST-1e',
 'Age',
 'RoomService',
 'FoodCourt',
 'ShoppingMall',
 'Spa',
 'VRDeck',
 'segment',
 'Transported']

## Prepare AdaBoost model

In [5]:
# Every column except the last is a feature; the last column is the target.
feature_cols, target_col = cols[:-1], cols[-1]

train_x = train_df[feature_cols].to_numpy()
train_y = train_df[target_col].to_numpy()
val_x = val_df[feature_cols].to_numpy()
val_y = val_df[target_col].to_numpy()

In [9]:
class ModelTester:
    """Train an AdaBoost classifier over decision trees and score it on a validation set.

    The model is fitted and the validation predictions are computed once, in
    the constructor; the ``get_*`` methods only report metrics on those cached
    predictions.

    Args:
        params: Hyperparameters. Must contain the keys ``min_samples_split``,
            ``max_depth``, ``min_impurity_decrease``, ``criterion`` (tree) and
            ``n_estimators``, ``learning_rate``, ``algorithm`` (AdaBoost).
        train_x: Training feature matrix.
        train_y: Training labels.
        val_x: Validation feature matrix.
        val_y: Validation labels.
    """

    def __init__(
        self,
        params: dict,
        train_x: np.ndarray,
        train_y: np.ndarray,
        val_x: np.ndarray,
        val_y: np.ndarray
    ):
        self.params = params
        self.train_x = train_x
        self.train_y = train_y
        self.val_x = val_x
        self.val_y = val_y

        # Fit first: the cached predictions below depend on the trained model.
        self.model = self.train_model()
        self.y_pred_bin = self.get_bin_predictions()

    def train_model(self):
        """Build the tree-based AdaBoost model from ``self.params`` and fit it.

        Returns:
            The fitted ``AdaBoostClassifier``.
        """
        tree = DecisionTreeClassifier(
            min_samples_split=self.params['min_samples_split'],
            max_depth=self.params['max_depth'],
            min_impurity_decrease=self.params['min_impurity_decrease'],
            criterion=self.params['criterion']
        )
        # NOTE(review): newer scikit-learn releases rename `base_estimator`
        # to `estimator` -- confirm against the installed version.
        model = AdaBoostClassifier(
            base_estimator=tree,
            n_estimators=self.params['n_estimators'],
            learning_rate=self.params['learning_rate'],
            algorithm=self.params['algorithm'],
        )
        model.fit(self.train_x, self.train_y)
        return model

    def get_bin_predictions(self):
        """Predict on the instance's validation features.

        Returns:
            A list of ints, 1 for each truthy prediction and 0 otherwise.
        """
        # Use the validation set given to this instance, not a module-level
        # `val_x` that may hold different data (or not exist at all).
        y_pred = self.model.predict(self.val_x)
        return [1 if y else 0 for y in y_pred]

    def get_accuracy(self):
        """Return the validation accuracy rounded to three decimals."""
        return round(accuracy_score(self.val_y, self.y_pred_bin), 3)

    def get_class_report(self):
        """Return scikit-learn's text classification report for the validation set."""
        return classification_report(self.val_y, self.y_pred_bin)

## Optuna hyperparameter search

In [18]:
def objective(trial):
    """Optuna objective: validation error rate of one AdaBoost configuration.

    Samples the AdaBoost and base-tree hyperparameters from ``trial``, trains
    a ``ModelTester`` on the notebook-level train/validation arrays and
    returns ``1 - accuracy`` so that lower is better.
    """
    # Dict values are evaluated top to bottom, so the suggest calls run in
    # this fixed order: n_estimators, algorithm, learning_rate, then the tree
    # parameters.
    params = {
        'n_estimators': trial.suggest_int("n_estimators", 10, 200),
        'algorithm': trial.suggest_categorical('algorithm', ['SAMME', 'SAMME.R']),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.5),
        'max_depth': trial.suggest_int('max_depth', 1, 100),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 100),
        'min_impurity_decrease': trial.suggest_float('min_impurity_decrease', 0, 0.0001),
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
    }

    # Train and evaluate; accuracy is already rounded to 3 decimals by ModelTester.
    tester = ModelTester(params, train_x, train_y, val_x, val_y)
    return 1 - tester.get_accuracy()

In [None]:
# `objective` returns an error rate, so the study minimises it
# ('minimize' is also Optuna's default direction).
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=1000)

### Show results

In [None]:
# Plot the objective value (validation error) across the study's trials.
plot_optimization_history(study)

In [None]:
# Plot Optuna's estimate of each hyperparameter's importance for the objective.
plot_param_importances(study)

In [None]:
# Slice plots restricted to the AdaBoost-level hyperparameters.
key_params = ['algorithm', 'learning_rate', 'n_estimators']
slice_fig = plot_slice(study, params=key_params)
slice_fig.show()

In [None]:
# Display the hyperparameters of the best (lowest-error) trial.
study.best_params

{'n_estimators': 106,
 'algorithm': 'SAMME.R',
 'learning_rate': 0.4827054897488229,
 'max_depth': 11,
 'min_samples_split': 1,
 'min_impurity_decrease': 7.200517833654635e-05,
 'criterion': 'gini'}

In [20]:
# Hard-coded copy of the best hyperparameters, so the cells below can run
# without repeating the 1000-trial search.
best_params = {
    'n_estimators': 106,
    'algorithm': 'SAMME.R',
    'learning_rate': 0.4827054897488229,
    'max_depth': 11,
    # NOTE(review): 1 lies outside the 2..100 range that `objective` samples
    # for min_samples_split, so this value cannot have come from that search
    # space as written -- confirm. The model-building cells leave this
    # parameter commented out.
    'min_samples_split': 1,
    'min_impurity_decrease': 7.200517833654635e-05,
    'criterion': 'gini'
}

## Create model with best params

In [21]:
# Rebuild the full-feature matrices: all columns but the last are features,
# the last column is the target.
feature_cols, target_col = cols[:-1], cols[-1]

train_x = train_df[feature_cols].to_numpy()
train_y = train_df[target_col].to_numpy()
val_x = val_df[feature_cols].to_numpy()
val_y = val_df[target_col].to_numpy()

In [22]:
# Build the (unfitted) AdaBoost model from the tuned hyperparameters.
# Values are read from `best_params` rather than repeated as literals, so the
# model cannot drift from the recorded search result.
params = best_params
tree = DecisionTreeClassifier(
    # min_samples_split is deliberately left at the estimator default: the
    # recorded value (1) is outside the 2..100 range that was searched.
    # NOTE(review): scikit-learn presumably rejects an integer 1 here -- confirm.
    max_depth=params['max_depth'],
    min_impurity_decrease=params['min_impurity_decrease'],
    criterion=params['criterion']
)
model = AdaBoostClassifier(
    base_estimator=tree,
    n_estimators=params['n_estimators'],
    learning_rate=params['learning_rate'],
    algorithm=params['algorithm']
)

In [23]:
# Stack train and validation rows (train first) so cross-validation can use
# all labelled data. The original row indices are kept, not reset.
X = pd.concat([frame[cols[:-1]] for frame in (train_df, val_df)])
y = pd.concat([frame[cols[-1]] for frame in (train_df, val_df)])

In [24]:
# 10-fold cross-validation of the tuned model on the combined data.
cv_scores = cross_val_score(model, X, y, cv=10)
mean_cv_accuracy = round(np.mean(cv_scores), 3)
print(f'Mean cv (k=10) accuracy score = {mean_cv_accuracy}')

Mean cv (k=10) accuracy score = 0.748


## Feature selection

In [25]:
# Fit on the training split only; the next cell reads feature_importances_
# from this fitted model.
model.fit(train_x, train_y)

In [26]:
# Pair every feature name with its importance from the fitted model and rank
# the features from most to least important.
importance_by_feature = {
    'feature': cols[:-1],
    'imp': model.feature_importances_
}
imp_df = pd.DataFrame(importance_by_feature).sort_values(by='imp', ascending=False)
# Feature names in descending-importance order, consumed by the selection loop.
best_features = imp_df['feature'].to_list()
imp_df

Unnamed: 0,feature,imp
7,Age,0.03153
9,FoodCourt,0.028827
8,RoomService,0.024136
12,VRDeck,0.023459
10,ShoppingMall,0.023196
11,Spa,0.016956
1,CryoSleep,0.007763
6,TRAPPIST-1e,0.003949
13,segment,0.00241
4,Mars,0.002278


In [27]:
# Forward pass over the importance ranking: cross-validate the model on the
# top-1, top-2, ..., top-k features and keep the fold scores of each prefix.
temp_cols = []
cv_list = []

for n_selected in range(1, len(best_features) + 1):
    temp_cols = best_features[:n_selected]
    # Note: this rebinds the notebook-level X to the current feature subset.
    X = pd.concat([train_df[temp_cols], val_df[temp_cols]]).to_numpy()
    cv_list.append(cross_val_score(model, X, y, cv=10))

In [28]:
# Mean CV accuracy (rounded to 3 decimals) for each feature-prefix length.
means = [round(np.mean(scores), 3) for scores in cv_list]

# Index of the first prefix that reaches the best rounded mean; on ties this
# is the smallest feature set. Stays 0 (and prints nothing) if there are no
# scores.
i_features = 0
if means:
    i_features = means.index(max(means))
    print(i_features)

9


In [29]:
# Display the selected features: the ranking up to and including index i_features.
best_features[:i_features+1]

['Age',
 'FoodCourt',
 'RoomService',
 'VRDeck',
 'ShoppingMall',
 'Spa',
 'CryoSleep',
 'TRAPPIST-1e',
 'segment',
 'Mars']

## Make final model

In [30]:
# Restrict both splits to the selected prefix of the importance ranking.
selected_features = best_features[:i_features + 1]
train_x = train_df[selected_features].to_numpy()
val_x = val_df[selected_features].to_numpy()

In [31]:
# Fresh, unfitted AdaBoost model for the final fit on the selected features.
# Values are read from `best_params` rather than repeated as literals, so the
# model cannot drift from the recorded search result.
params = best_params
tree = DecisionTreeClassifier(
    # min_samples_split is deliberately left at the estimator default: the
    # recorded value (1) is outside the 2..100 range that was searched.
    # NOTE(review): scikit-learn presumably rejects an integer 1 here -- confirm.
    max_depth=params['max_depth'],
    min_impurity_decrease=params['min_impurity_decrease'],
    criterion=params['criterion']
)
model = AdaBoostClassifier(
    base_estimator=tree,
    n_estimators=params['n_estimators'],
    learning_rate=params['learning_rate'],
    algorithm=params['algorithm']
)

In [32]:
# Fit on the training split, then collect class probabilities for both splits.
model.fit(train_x, train_y)
pred_train = model.predict_proba(train_x)
pred_val = model.predict_proba(val_x)

In [33]:
# Keep only the second probability column as the `ada_boost` score, next to
# the integer-encoded label. Fails if predict_proba did not return exactly
# two columns.
# NOTE(review): presumably column 1 is P(Transported=True) -- confirm via model.classes_.
train_proba_df = (
    pd.DataFrame(pred_train, columns=['0', 'ada_boost'])
    .assign(y=train_y.astype(int))
    .drop(columns='0')
)
train_proba_df

Unnamed: 0,ada_boost,y
0,0.506409,0
1,0.503845,0
2,0.364225,0
3,0.501657,1
4,0.307844,0
...,...,...
6996,0.423004,0
6997,0.259809,0
6998,0.540369,1
6999,0.453338,0


In [34]:
# Keep only the second probability column as the `ada_boost` score, next to
# the integer-encoded label. Fails if predict_proba did not return exactly
# two columns.
# NOTE(review): presumably column 1 is P(Transported=True) -- confirm via model.classes_.
val_proba_df = (
    pd.DataFrame(pred_val, columns=['0', 'ada_boost'])
    .assign(y=val_y.astype(int))
    .drop(columns='0')
)
val_proba_df

Unnamed: 0,ada_boost,y
0,0.376808,1
1,0.503726,0
2,0.591132,1
3,0.440961,0
4,0.505682,1
...,...,...
1687,0.448637,0
1688,0.505167,1
1689,0.510012,1
1690,0.498603,1


In [239]:
# Write both probability tables to CSV; the row index is written as the
# first column.
for proba_frame, csv_path in (
    (train_proba_df, './data/ensemble_train_df.csv'),
    (val_proba_df, './data/ensemble_val_df.csv'),
):
    proba_frame.to_csv(csv_path)

In [35]:
# Persist the fitted model. Create the target directory if needed and close
# the file deterministically instead of leaking the handle.
os.makedirs('models', exist_ok=True)
filepath = os.path.join('models', 'adaboost.pickle')
with open(filepath, 'wb') as model_file:
    pickle.dump(model, model_file)