# Searching for best hyperparameters set

In [5]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import mutual_info_classif, f_classif
from sklearn.preprocessing import StandardScaler
import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances, plot_slice
import plotly.express as px
import pickle
import os

In [6]:
train_df = pd.read_feather('./data/train_processed.ftr')
val_df = pd.read_feather('./data/val_processed.ftr')
cols = list(train_df.columns)

In [7]:
cols

['is_g734s',
 'CryoSleep',
 'VIP',
 'Europa',
 'Mars',
 'PSO J318.5-22',
 'TRAPPIST-1e',
 'Age',
 'RoomService',
 'FoodCourt',
 'ShoppingMall',
 'Spa',
 'VRDeck',
 'segment',
 'Transported']

## Prepare LGB Model

In [8]:
train_x = train_df[cols[:-1]].to_numpy()
train_y = train_df[cols[-1]].to_numpy()

In [9]:
val_x = val_df[cols[:-1]].to_numpy()
val_y = val_df[cols[-1]].to_numpy()

In [15]:
class ModelTester:
    
    def __init__(self, params, train_x, train_y, val_x, val_y):
        self.params = params
        self.train_x = train_x
        self.train_y = train_y
        self.val_x = val_x
        self.val_y = val_y
        
        self.model = self.train_model()
        self.y_pred_bin = self.get_bin_predictions()
            
    def train_model(self):
        # 1 - Run model
        model = lgb.LGBMClassifier(
            boosting=self.params['boosting'],
            n_estimators=self.params['num_rounds'],
            max_depth=self.params['max_depth'],
            learnig_rate=self.params['learning_rate'],
            num_leaves = self.params['num_leaves'],
            min_child_samples = self.params['min_data_in_leaf'],
            subsample=self.params['bagging_fraction'],
            reg_alpha=self.params['lambda_l1'],
            reg_lambda=self.params['lambda_l2']
        )
        model.fit(train_x, train_y)
        return model
    
    def get_bin_predictions(self):
        # 2 - Get predictions
        y_pred = self.model.predict(val_x)
        y_pred_bin = [1 if y >= 0.5 else 0 for y in y_pred]
        
        return y_pred_bin
    
    def get_accuracy(self):
        return round(accuracy_score(self.val_y, self.y_pred_bin), 3)
    
    def get_class_report(self):
        return classification_report(self.val_y, self.y_pred_bin)    

## Optuna research

In [16]:
def objective(trial):
    # 0 - Prepare params
    num_rounds = trial.suggest_int("num_rounds", 10, 100)
    boosting = trial.suggest_categorical('boosting', ['gbdt', 'dart'])
    learning_rate = trial.suggest_float('learning_rate', 0.001, 0.5)
    max_depth = trial.suggest_int('max_depth', 5, 100)
    num_leaves = trial.suggest_int("num_leaves", 2, 100)
    min_data_in_leaf = trial.suggest_int('min_data_in_leaf', 10, 500)
    bagging_fraction = trial.suggest_float('bagging_fraction', 0.9, 1.)
    extra_trees = trial.suggest_categorical('extra_trees', [True, False])
    lambda_l1 = trial.suggest_float('lambda_l1', 0, 0.1)
    lambda_l2 = trial.suggest_float('lambda_l2', 0, 0.1)
    
    params = {
        'num_rounds': num_rounds,
        'objectives':'binary',
        'verbosity':0,
        'boosting':boosting,
        'learning_rate':learning_rate,
        'max_depth':max_depth,
        'num_leaves':num_leaves,
        'min_data_in_leaf':min_data_in_leaf,
        'bagging_fraction':bagging_fraction,
        'extra_trees':extra_trees,
        'lambda_l1':lambda_l1,
        'lambda_l2':lambda_l2
    }
    
    # 1 - Model
    model_tester = ModelTester(params, train_x, train_y, val_x, val_y)
    acc = model_tester.get_accuracy()
    error = 1 - acc
    
    return error

In [None]:
study = optuna.create_study()
study.optimize(objective, n_trials=1000)

### Show results

In [18]:
plot_optimization_history(study)

In [19]:
plot_param_importances(study)

In [20]:
key_params = ['num_leaves', 'min_data_in_leaf', 'num_rounds', 'learning_rate', 'max_depth']
plot_slice(study, params = key_params).show()

In [21]:
study.best_params

{'num_rounds': 87,
 'boosting': 'gbdt',
 'learning_rate': 0.24559942360632506,
 'max_depth': 10,
 'num_leaves': 9,
 'min_data_in_leaf': 20,
 'bagging_fraction': 0.9652531375116534,
 'extra_trees': True,
 'lambda_l1': 0.007416739301259171,
 'lambda_l2': 0.034711374104249}

In [22]:
best_params = {
'num_rounds': 89,
'boosting': 'gbdt',
'learning_rate': 0.19673057125087745,
'max_depth': 88,
'num_leaves': 8,
'min_data_in_leaf': 37,
'bagging_fraction': 0.9514661346499478,
'extra_trees': False,
'lambda_l1': 0.08212421594574758,
'lambda_l2': 0.0021191892176790073
}

## Create model with best params

In [23]:
model = lgb.LGBMClassifier(
    boosting=best_params['boosting'],
    n_estimators=best_params['num_rounds'],
    max_depth=best_params['max_depth'],
    learnig_rate=best_params['learning_rate'],
    num_leaves=best_params['num_leaves'],
    min_child_samples=best_params['min_data_in_leaf'],
    subsample=best_params['bagging_fraction'],
    reg_alpha=best_params['lambda_l1'],
    reg_lambda=best_params['lambda_l2']
)

In [24]:
X = np.concatenate((train_x, val_x))
y = np.concatenate((train_y, val_y))

In [25]:
cv_scores = cross_val_score(model, X, y, cv=10)
print(f'Mean cv (k=10) accuracy score = {round(np.mean(cv_scores), 3)}')

Mean cv (k=10) accuracy score = 0.792


## Feature selection

In [26]:
is_cat_col = {
    'is_g734s': True,
    'CryoSleep': True,
    'VIP': True,
    'Europa': True,
    'Mars': True,
    'PSO J318.5-22': True,
    'TRAPPIST-1e': True,
    'Age': False,
    'RoomService': False,
    'FoodCourt': False,
    'ShoppingMall': False,
    'Spa': False,
    'VRDeck': False,
    'segment': True
 }

feature_mi = mutual_info_classif(X, y, discrete_features=list(is_cat_col.values()))
feature_f = f_classif(X, y)

feature_df = pd.DataFrame({
    'feature': cols[:-1],
    'mi': feature_mi,
    'f': feature_f[0]
})

scaler = StandardScaler()
feature_df[['mi_z', 'f_z']] = scaler.fit_transform(feature_df[['mi', 'f']])
feature_df['feature_quality'] = feature_df[['mi_z', 'f_z']].mean(axis=1)

feature_df.sort_values('feature_quality', ascending=False, inplace=True)
best_features = feature_df['feature'].to_list()
feature_df

Unnamed: 0,feature,mi,f,mi_z,f_z,feature_quality
1,CryoSleep,0.1072555,2228.334929,1.777779,2.196114,1.986947
13,segment,0.1066814,1800.139509,1.76285,1.61608,1.689465
11,Spa,0.07587675,1285.480254,0.961739,0.918922,0.940331
8,RoomService,0.07407989,1239.221137,0.91501,0.85626,0.885635
12,VRDeck,0.05922664,1101.496913,0.528733,0.669698,0.599216
10,ShoppingMall,0.04706093,267.552878,0.212349,-0.459963,-0.123807
9,FoodCourt,0.03975172,151.085174,0.022265,-0.61773,-0.297733
3,Europa,0.01587072,280.812746,-0.598789,-0.442001,-0.520395
7,Age,0.01318313,48.491713,-0.668683,-0.756703,-0.712693
6,TRAPPIST-1e,0.004653416,81.384065,-0.890508,-0.712147,-0.801328


In [27]:
fig = px.bar(feature_df, x='feature_quality', y='feature')
fig.show()

In [28]:
temp_cols = []
cv_list = []

model = lgb.LGBMClassifier(
    boosting=best_params['boosting'],
    n_estimators=best_params['num_rounds'],
    max_depth=best_params['max_depth'],
    learnig_rate=best_params['learning_rate'],
    num_leaves=best_params['num_leaves'],
    min_child_samples=best_params['min_data_in_leaf'],
    subsample=best_params['bagging_fraction'],
    reg_alpha=best_params['lambda_l1'],
    reg_lambda=best_params['lambda_l2']
)

for feature in best_features:
    temp_cols.append(feature)
    X = pd.concat([train_df[temp_cols], val_df[temp_cols]]).to_numpy()
    cv_score = cross_val_score(model, X, y, cv=10)
    cv_list.append(cv_score)



In [29]:
means = []
for scores in cv_list:
    means.append(round(np.mean(scores), 3))

i_features = 0
for i in range(len(means)):
    if means[i] == max(means):
        i_features = i
        break

In [30]:
train_x = train_df[best_features[:i_features+1]].to_numpy()
val_x = val_df[best_features[:i_features+1]].to_numpy()

In [31]:
model = lgb.LGBMClassifier(
    boosting=best_params['boosting'],
    n_estimators=best_params['num_rounds'],
    max_depth=best_params['max_depth'],
    learnig_rate=best_params['learning_rate'],
    num_leaves=best_params['num_leaves'],
    min_child_samples=best_params['min_data_in_leaf'],
    subsample=best_params['bagging_fraction'],
    reg_alpha=best_params['lambda_l1'],
    reg_lambda=best_params['lambda_l2']
)

model.fit(train_x, train_y)
pred_val = model.predict_proba(val_x)
pred_train = model.predict_proba(train_x)



In [32]:
train_proba_df = pd.DataFrame(pred_train)
train_proba_df.columns = ['0', 'lgbm']
train_proba_df['y'] = train_y.astype(int)
train_proba_df.drop(columns='0', inplace=True)
train_proba_df

Unnamed: 0,lgbm,y
0,0.730929,0
1,0.728635,0
2,0.040106,0
3,0.583155,1
4,0.033270,0
...,...,...
6996,0.566189,0
6997,0.044433,0
6998,0.755337,1
6999,0.353885,0


In [33]:
val_proba_df = pd.DataFrame(pred_val)
val_proba_df.columns = ['0', 'lgbm']
val_proba_df['y'] = val_y.astype(int)
val_proba_df.drop(columns='0', inplace=True)
val_proba_df

Unnamed: 0,lgbm,y
0,0.361728,1
1,0.969889,0
2,0.795015,1
3,0.129603,0
4,0.977549,1
...,...,...
1687,0.124004,0
1688,0.970353,1
1689,0.730929,1
1690,0.723095,1


In [34]:
train_proba_final = pd.read_csv('./data/ensemble_train_df.csv')
train_proba_final['lgbm'] = train_proba_df['lgbm']
val_proba_final = pd.read_csv('./data/ensemble_val_df.csv')
val_proba_final['lgbm'] = val_proba_df['lgbm']

In [35]:
train_proba_final = train_proba_final[['ada_boost', 'svc', 'lgbm', 'y']]
train_proba_final.to_feather('./data/ensemble_train_df.ftr')
val_proba_final = val_proba_final[['ada_boost', 'svc', 'lgbm', 'y']]
val_proba_final.to_feather('./data/ensemble_val_df.ftr')

In [38]:
filepath = os.path.join('models', 'lgbm.pickle')
pickle.dump(model, open(filepath, 'wb'))