# Imports

In [59]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import math

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PowerTransformer, QuantileTransformer, Normalizer

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from xgboost import XGBClassifier


!pip install --quiet optuna
import optuna


In [3]:
np.random.seed(42)

# Utils

In [29]:
def one_hot_encode(df , col):
  one_hot_encoded = pd.get_dummies(df[col] , dtype=int)
  df = pd.concat([df, one_hot_encoded], axis=1)
  df.drop(col, axis = 1 , inplace = True)



In [4]:
# return variables that are correlated beyond thresh - drop one of the two variables
def highly_corr(df, thresh= 0.8):
  corrmat = df.corr()
  upper_tri = corrmat.where(np.triu(np.ones(corrmat.shape),k=1).astype(bool))
  to_drop = [column for column in upper_tri.columns if any(upper_tri[column] >thresh)]
  return to_drop

In [5]:
# set extreme outliers equal to a specified percentile of the data
def winsorize_data( columns, train, valid, test, lower_percentile=5, upper_percentile=95):

    for col in columns:
        lower_limit = np.percentile(train[col], lower_percentile)
        upper_limit = np.percentile(train[col], upper_percentile)
        train[col] = np.where(train[col] < lower_limit, lower_limit,
                                      np.where(train[col] > upper_limit, upper_limit, train[col]))
        valid[col] = np.where(valid[col] < lower_limit, lower_limit,
                                      np.where(valid[col] > upper_limit, upper_limit, valid[col]))
        test[col] = np.where(test[col] < lower_limit, lower_limit,
                                      np.where(test[col] > upper_limit, upper_limit, test[col]))
    return train, valid, test

In [6]:
def handle_null_values(df , mean_impute = False):
  if mean_impute:
    return df.fillna(df.mean()) #Fill missing values with column mean
  else:
    return df.dropna().reset_index() # Remove rows with null values


In [7]:
# Method - Interquartile Range (IQR)
def get_outliers(df , col , alpha = 1.5):
  Q1 = df[col].quantile(0.25)
  Q3 = df[col].quantile(0.75)
  IQR = Q3 - Q1
  lower_bound = Q1 - alpha * IQR
  upper_bound = Q3 + alpha * IQR
  outliers_iqr = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
  return len(outliers_iqr) , outliers_iqr.index

In [8]:
def info(df):
  print('-----------------------------------------------------------')
  print('Column wise null counts : ')
  null_counts = df.isnull().sum()
  print(null_counts)

  print('-----------------------------------------------------------')
  rows_with_null = df.isnull().any(axis=1).sum()
  print("Number of rows with at least one null value: ", rows_with_null)

  print('-----------------------------------------------------------')
  print('Unique Values per column : ')
  print(df.nunique())


In [9]:
def outliers_info(df, numerical):
  out = {}
  for col in numerical:
    l , _ = get_outliers(df , col )
    out[col] = l
  return out

In [10]:
def evaluation_metrics(y_true, y_pred):
  print('-----------------------------------------------------------')
  cm = confusion_matrix(y_true, y_pred)
  accuracy = accuracy_score(y_true, y_pred)
  precision = precision_score(y_true, y_pred)
  recall = recall_score(y_true, y_pred)
  f1 = f1_score(y_true, y_pred)
  fpr, tpr, thresholds = roc_curve(y_true, y_pred)
  roc_auc = auc(fpr, tpr)

  print("Confusion Matrix:")
  print(cm)
  print("Accuracy:", accuracy)
  print("Precision:", precision)
  print("Recall:", recall)
  print("F1-Score:", f1)
  print("ROC AUC:", roc_auc)

  print('-----------------------------------------------------------')


# Classification

In [37]:
df = pd.read_csv('/content/Titanic Dataset.csv')

In [38]:
# no info -
df.drop('name', axis=1, inplace = True)
df.drop('ticket', axis=1, inplace = True)
df.drop('cabin', axis=1, inplace = True) # todo check this

# data leak
df.drop('body', axis=1, inplace = True)
df.drop('boat', axis=1, inplace = True)

#
df.drop('home.dest' , axis=1, inplace = True)

In [39]:
numerical = ['age', 'sibsp', 'parch', 'fare']
ordinal = ['pclass']
categorical = [ 'sex','embarked']
target = 'survived'

In [40]:
info(df[numerical + categorical + [target]])

-----------------------------------------------------------
Column wise null counts : 
age         263
sibsp         0
parch         0
fare          1
sex           0
embarked      2
survived      0
dtype: int64
-----------------------------------------------------------
Number of rows with at least one null value:  266
-----------------------------------------------------------
Unique Values per column : 
age          98
sibsp         7
parch         8
fare        281
sex           2
embarked      3
survived      2
dtype: int64


In [41]:
# one hot encode less freq cat variables
df = one_hot_encode(df, 'sex')
df = one_hot_encode(df, 'embarked')


# if ordianal then label encode
# if too many values in cat label then design features

In [50]:
numerical = ['pclass',

 'age',
 'sibsp',
 'parch',
 'fare',
 'female',
 'male',
 'C',
 'Q',
 'S']

In [51]:
outliers_info(df, numerical + [target])

{'pclass': 0,
 'age': 9,
 'sibsp': 57,
 'parch': 307,
 'fare': 171,
 'female': 0,
 'male': 0,
 'C': 270,
 'Q': 123,
 'S': 0,
 'survived': 0}

In [52]:
train_temp, test = train_test_split(df,  test_size=0.2, random_state=42)
train, valid = train_test_split(train_temp, test_size=0.25, random_state=42)

del train_temp

In [53]:
train, valid, test = handle_null_values(train, mean_impute= True), handle_null_values(valid,  mean_impute= True), handle_null_values(test,  mean_impute= True)

In [54]:
X_train , y_train = train[numerical], train[target]
X_valid , y_valid = valid[numerical], valid[target]
X_test , y_test = test[numerical], test[target]

In [55]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PowerTransformer, QuantileTransformer, Normalizer

scaler = MinMaxScaler()

X_train = scaler.fit_transform(X_train)
X_valid = scaler.fit_transform(X_valid)
X_test = scaler.fit_transform(X_test)

# Classification Models train

In [75]:
def objective(trial):

  classifier = trial.suggest_categorical("classifier", [
                                                      "GradientBoostingClassifier",
                                                      "RandomForestClassifier",
                                                      "XGBClassifier"])

  clf = None
  if classifier == "GradientBoostingClassifier":
     params = {
              'n_estimators': trial.suggest_int("n_estimators", 10, 100),
            'max_depth': trial.suggest_int("max_depth", 3, 15),
             'learning_rate': trial.suggest_float("learning_rate", 1e-7, 0.3, log=True),
            'min_samples_split': trial.suggest_int("min_samples_split", 2, 20),
            'min_samples_leaf': trial.suggest_int("min_samples_leaf", 1, 10),
     "random_state": 42,
              # 'min_samples_split' : trial.suggest_int("min_samples_split", 10, 32)
              }
     clf = GradientBoostingClassifier(**params)

  elif classifier == "RandomForestClassifier":
     params = {
              "n_estimators": trial.suggest_int("n_estimators", 10, 100),

            "max_depth": trial.suggest_int("max_depth", 3, 15),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
            # "max_features": trial.suggest_categorical("max_features", ["auto", "sqrt", "log2"]),
           "random_state": 42

            }
     clf = RandomForestClassifier(**params)

  elif classifier == "XGBClassifier":
     params = {
              'n_estimators': trial.suggest_int("n_estimators", 10, 100),
            'max_depth': trial.suggest_int("max_depth", 3, 15),
               'learning_rate': trial.suggest_float("learning_rate", 1e-7, 0.3, log=True),
            'subsample': trial.suggest_float("subsample", 0.5, 1.0),
            'colsample_bytree': trial.suggest_float("colsample_bytree", 0.5, 1.0),

              'random_state' : 42
              }

     clf = XGBClassifier(**params)


  clf.fit(X_train, y_train)

  predict = clf.predict(X_valid)
  f1 = f1_score(y_valid, predict)
  return f1

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100,  timeout=600)

print("Number of finished trials: {}".format(len(study.trials)))
print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[I 2024-04-10 22:24:57,348] A new study created in memory with name: no-name-c0e0d064-7711-4b7c-94ee-3b0251118b89
[I 2024-04-10 22:24:58,108] Trial 0 finished with value: 0.0 and parameters: {'classifier': 'GradientBoostingClassifier', 'n_estimators': 95, 'max_depth': 13, 'learning_rate': 0.00023685553886011908, 'min_samples_split': 9, 'min_samples_leaf': 5}. Best is trial 0 with value: 0.0.
[I 2024-04-10 22:24:58,633] Trial 1 finished with value: 0.6628571428571429 and parameters: {'classifier': 'GradientBoostingClassifier', 'n_estimators': 67, 'max_depth': 15, 'learning_rate': 0.007145252994123352, 'min_samples_split': 9, 'min_samples_leaf': 9}. Best is trial 1 with value: 0.6628571428571429.
[I 2024-04-10 22:24:59,028] Trial 2 finished with value: 0.6984126984126984 and parameters: {'classifier': 'RandomForestClassifier', 'n_estimators': 58, 'max_depth': 12, 'min_samples_split': 12, 'min_samples_leaf': 8}. Best is trial 2 with value: 0.6984126984126984.
[I 2024-04-10 22:24:59,518] T

Number of finished trials: 100
Best trial:
  Value: 0.7487179487179487
  Params: 
    classifier: RandomForestClassifier
    n_estimators: 18
    max_depth: 7
    min_samples_split: 10
    min_samples_leaf: 7


In [76]:
name  = trial.params['classifier']
trial.params.pop('classifier')
params = trial.params

if name == "GradientBoostingClassifier":
    model = GradientBoostingClassifier(**params)
elif name == "RandomForestClassifier":
    model = RandomForestClassifier(**params)
elif name == "XGBClassifier":
    model = XGBClassifier(**params)

In [77]:

model.fit(X_train , y_train)

print('Train')
predict = model.predict(X_train)
evaluation_metrics(y_train, predict )

print('Valid')
predict = model.predict(X_valid)
evaluation_metrics(y_valid, predict )

print('Test')
predict = model.predict(X_test)
evaluation_metrics(y_test, predict )

Train
-----------------------------------------------------------
Confusion Matrix:
[[490  22]
 [ 97 176]]
Accuracy: 0.8484076433121019
Precision: 0.8888888888888888
Recall: 0.6446886446886447
F1-Score: 0.7473460721868366
ROC AUC: 0.8008599473443223
-----------------------------------------------------------
Valid
-----------------------------------------------------------
Confusion Matrix:
[[138  15]
 [ 45  64]]
Accuracy: 0.7709923664122137
Precision: 0.810126582278481
Recall: 0.5871559633027523
F1-Score: 0.6808510638297873
ROC AUC: 0.7445583738082389
-----------------------------------------------------------
Test
-----------------------------------------------------------
Confusion Matrix:
[[139   5]
 [ 57  61]]
Accuracy: 0.7633587786259542
Precision: 0.9242424242424242
Recall: 0.5169491525423728
F1-Score: 0.6630434782608695
ROC AUC: 0.7411134651600753
-----------------------------------------------------------


In [78]:
from optuna.visualization import plot_optimization_history

plotly_config = {"staticPlot": True}
fig = plot_optimization_history(study)
# fig.show()
fig.show(config=plotly_config)

In [67]:
optuna.visualization.plot_slice(study)

In [68]:
optuna.visualization.plot_contour(study, params=["n_estimators", "max_depth"])

In [None]:
# https://www.kaggle.com/code/bextuychiev/no-bs-guide-to-hyperparameter-tuning-with-optuna#Defining-the-search-space

In [None]:
# todo write cross val for smaller dataset

In [None]:
# Todo write for class imbalance