In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [6]:
titanic = sns.load_dataset('titanic')

In [7]:
titanic.__class__

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


### Preprocessing

In [8]:
# Drop unnecessary columns
titanic = titanic.drop(['deck', 'embark_town', 'alive'], axis=1)

# Replace missing values with the median or mode
titanic['age'] = titanic['age'].fillna(titanic['age'].median())
titanic['fare'] = titanic['fare'].fillna(titanic['fare'].mode()[0])
titanic['embarked'] = titanic['embarked'].fillna(titanic['embarked'].mode()[0])

In [9]:
# Convert categorical variables to numerical variables
titanic['sex'] = pd.Categorical(titanic['sex']).codes
titanic['embarked'] = pd.Categorical(titanic['embarked']).codes

# Split the dataset into input features and the target variable
X = titanic.drop('survived', axis=1)
y = titanic['survived']

In [33]:
class_dict = {
"Third": 3,
"First": 1,
"Second": 2
}
who_dict = {
"child": 0,
"woman": 1,
"man": 2
}

X['class'] = X['class'].apply(lambda x: class_dict[x])
X['who'] = X['who'].apply(lambda x: who_dict[x])

# X_train['class'] = X_train['class'].apply(lambda x: class_dict[x])
# X_train['who'] = X_train['who'].apply(lambda x: who_dict[x])
# X_test['class'] = X_test['class'].apply(lambda x: class_dict[x])
# X_test['who'] = X_test['who'].apply(lambda x: who_dict[x])

In [34]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


### Training the Model

In [41]:
#Setting Training Parameters
params = {
'objective': 'binary',
'boosting_type': 'gbdt',
'num_leaves': 31, #8-256
'learning_rate': 0.05,
'feature_fraction': 0.9,
'verbosity': 2,
}

# 
# 'max_depth':5, #3-16
# 'metric':'l1',
# 'seed':1,
# 'verbosity':2,
# 'min_data_in_leaf': 20, #5-300

In [42]:
#Load the Classifier Model
clasifier = lgb.LGBMClassifier(**params)
clasifier.get_params(True)

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.05,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'num_leaves': 31,
 'objective': 'binary',
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0,
 'feature_fraction': 0.9,
 'verbosity': 2}

In [43]:
clasifier.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(5)], eval_metric='logloss')

[LightGBM] [Info] Number of positive: 212, number of negative: 357
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.748682
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.264675
[LightGBM] [Debug] init for col-wise cost 0.000386 seconds, init for row-wise cost 0.000400 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000874 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 188
[LightGBM] [Info] Number of data points in the train set: 569, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.372583 -> initscore=-0.521150
[LightGBM] [Info] Start training from score -0.521150
[LightGBM] [Debug] Trained a tree with leaves = 19 and depth = 7
Training until validation scores don't improve for 5 rounds
[LightGBM] [Debug] Tr

In [44]:
#Prediction

predictions = clasifier.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.79      0.89      0.83       105
           1       0.80      0.66      0.73        74

    accuracy                           0.79       179
   macro avg       0.80      0.77      0.78       179
weighted avg       0.79      0.79      0.79       179



In [19]:
#Hyperparameter Tuning
model = lgb.LGBMClassifier(num_leaves=31, min_data_in_leaf=20, max_depth=5)
model.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 268, number of negative: 444
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000750 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 205
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376404 -> initscore=-0.504838
[LightGBM] [Info] Start training from score -0.504838


In [20]:
#Re-run Prediction
predictions = model.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.84      0.87      0.85       105
           1       0.80      0.77      0.79        74

    accuracy                           0.83       179
   macro avg       0.82      0.82      0.82       179
weighted avg       0.83      0.83      0.83       179

