# Importing Libraries

In [73]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV

# Loading Data 

In [74]:
os.chdir('C:\\Users\\Avita\\OneDrive\\Desktop\\Capstone Project\\titanic')

In [75]:
df=pd.read_csv('train.csv')

In [76]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


# Checking Data 

In [77]:
df.shape

(891, 12)

In [78]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [79]:
## columns Age, Cabin and Embarked have some null values

In [80]:
df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [81]:
## Age should be in int datatype 

In [82]:
df.describe().T.drop(columns=['count','mean','std'])

Unnamed: 0,min,25%,50%,75%,max
PassengerId,1.0,223.5,446.0,668.5,891.0
Survived,0.0,0.0,0.0,1.0,1.0
Pclass,1.0,2.0,3.0,3.0,3.0
Age,0.42,20.125,28.0,38.0,80.0
SibSp,0.0,0.0,0.0,1.0,8.0
Parch,0.0,0.0,0.0,0.0,6.0
Fare,0.0,7.9104,14.4542,31.0,512.3292


# Data Cleaning 

In [83]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [84]:
df.isnull().sum()/len(df)*100

PassengerId     0.000000
Survived        0.000000
Pclass          0.000000
Name            0.000000
Sex             0.000000
Age            19.865320
SibSp           0.000000
Parch           0.000000
Ticket          0.000000
Fare            0.000000
Cabin          77.104377
Embarked        0.224467
dtype: float64

In [85]:
## ## Age and Embarked columns have less than 20 % null values so we can impute it
## Wheras Cabin column has 77 % null values so we can drop those

In [86]:
df['Age'].median()

28.0

In [87]:
df['Age'].fillna(df['Age'].median(),inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(),inplace=True)


In [88]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [89]:
df['Embarked'].mode()

0    S
Name: Embarked, dtype: object

In [90]:
df['Embarked'].fillna(df['Embarked'].mode()[0],inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(df['Embarked'].mode()[0],inplace=True)


In [91]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

In [92]:
df.drop('Cabin',axis=1,inplace=True)

In [93]:
df.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [94]:
df.duplicated().sum()

0

In [95]:
df.drop(['PassengerId','Ticket','Name'],axis=1,inplace=True)

In [96]:
## Also dropped 'PassengerId','Ticket' and 'Name' beacuse it has too many unique values in each rows and does not help in predicting survial

In [97]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


# Machine Learning 

## Without any Optimization

In [98]:
# Survival Prediction - Supervised approch / Binary Classification 

In [99]:
df['Survived'].value_counts()/len(df)*100

Survived
0    61.616162
1    38.383838
Name: count, dtype: float64

## Label Encoding

In [100]:
le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])    

In [101]:
le1 = LabelEncoder()
df['Embarked'] = le1.fit_transform(df['Embarked'])

In [102]:
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.2500,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.9250,2
3,1,1,0,35.0,1,0,53.1000,2
4,0,3,1,35.0,0,0,8.0500,2
...,...,...,...,...,...,...,...,...
886,0,2,1,27.0,0,0,13.0000,2
887,1,1,0,19.0,0,0,30.0000,2
888,0,3,0,28.0,1,2,23.4500,2
889,1,1,1,26.0,0,0,30.0000,0


## Split Feature and Target 

In [103]:
X=df.drop('Survived',axis=1)

In [104]:
y=df['Survived']

## Train Test Split 

In [105]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=33)

## Standardisation

In [106]:
ss=StandardScaler()
X_train=ss.fit_transform(X_train)
X_test = ss.transform(X_test)

## 1. Logistic Regression 

In [107]:
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train, y_train)

In [108]:
log_pred = log_model.predict(X_test)

In [109]:
y_prob = log_model.predict_proba(X_test)[:, 1]
auc_for_lr = roc_auc_score(y_test, y_prob)

In [110]:
print('Accuracy score:\n',accuracy_score(y_test, log_pred))
print('Classification Report:\n', classification_report(y_test, log_pred))
print('ROC-AUC Score:\n',auc_for_lr)

Accuracy score:
 0.8044692737430168
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.87      0.84       107
           1       0.78      0.71      0.74        72

    accuracy                           0.80       179
   macro avg       0.80      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179

ROC-AUC Score:
 0.8707814122533749


## 2. Random Forest 

In [111]:
rf_model=RandomForestClassifier(random_state=23)
rf_model.fit(X_train,y_train)

In [112]:
rf_pred = rf_model.predict(X_test)

In [113]:
y_prob = rf_model.predict_proba(X_test)[:,1]
auc_for_rf = roc_auc_score(y_test, y_prob)

In [114]:
print('Accuracy score:\n',accuracy_score(y_test, rf_pred))
print('Classification Report:\n', classification_report(y_test, rf_pred))
print('ROC-AUC Score:\n',auc_for_rf)

Accuracy score:
 0.8268156424581006
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.89      0.86       107
           1       0.82      0.74      0.77        72

    accuracy                           0.83       179
   macro avg       0.82      0.81      0.82       179
weighted avg       0.83      0.83      0.83       179

ROC-AUC Score:
 0.892263759086189


## 3. Gradient Boosting 

In [115]:
gb_model=GradientBoostingClassifier(random_state=66)
gb_model.fit(X_train,y_train)

In [116]:
gb_pred=gb_model.predict(X_test)

In [117]:
y_prob = gb_model.predict_proba(X_test)[:,1]
auc_for_gb = roc_auc_score(y_test, y_prob)

In [118]:
print('Accuracy score:\n',accuracy_score(y_test, gb_pred))
print('Classification Report:\n', classification_report(y_test, gb_pred))
print('ROC-AUC Score:\n',auc_for_gb)

Accuracy score:
 0.8324022346368715
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.92      0.87       107
           1       0.85      0.71      0.77        72

    accuracy                           0.83       179
   macro avg       0.84      0.81      0.82       179
weighted avg       0.83      0.83      0.83       179

ROC-AUC Score:
 0.885708722741433


## 4. XGBoost 

In [119]:
xgb_model=XGBClassifier()
xgb_model.fit(X_train,y_train)

In [120]:
xgb_pred=xgb_model.predict(X_test)

In [121]:
y_prob = xgb_model.predict_proba(X_test)[:,1]
auc_for_xgb = roc_auc_score(y_test, y_prob)

In [122]:
print('Accuracy score:\n',accuracy_score(y_test, xgb_pred))
print('Classification Report:\n', classification_report(y_test, xgb_pred))
print('ROC-AUC Score:\n',auc_for_xgb)

Accuracy score:
 0.8324022346368715
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.90      0.86       107
           1       0.83      0.74      0.78        72

    accuracy                           0.83       179
   macro avg       0.83      0.82      0.82       179
weighted avg       0.83      0.83      0.83       179

ROC-AUC Score:
 0.8831126687435099


## 5. Light GBM

In [123]:
pip install lightgbm

Note: you may need to restart the kernel to use updated packages.


In [124]:
lgbm_model=LGBMClassifier()
lgbm_model.fit(X_train,y_train)

[LightGBM] [Info] Number of positive: 270, number of negative: 442
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000084 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 207
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.379213 -> initscore=-0.492888
[LightGBM] [Info] Start training from score -0.492888


In [125]:
lgbm_pred=lgbm_model.predict(X_test)

In [126]:
y_prob = lgbm_model.predict_proba(X_test)[:,1]
auc_for_lgbm = roc_auc_score(y_test, y_prob)

In [127]:
print('Accuracy score:\n',accuracy_score(y_test, lgbm_pred))
print('Classification Report:\n', classification_report(y_test, lgbm_pred))
print('ROC-AUC Score:\n',auc_for_lgbm)

Accuracy score:
 0.8491620111731844
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.91      0.88       107
           1       0.85      0.76      0.80        72

    accuracy                           0.85       179
   macro avg       0.85      0.84      0.84       179
weighted avg       0.85      0.85      0.85       179

ROC-AUC Score:
 0.8959631360332294


## 6. KNN

In [128]:
knn_model=KNeighborsClassifier()
knn_model.fit(X_train,y_train)

In [129]:
knn_pred=knn_model.predict(X_test)

In [130]:
y_prob = knn_model.predict_proba(X_test)[:,1]
auc_for_knn = roc_auc_score(y_test, y_prob)

In [131]:
print('Accuracy score:\n',accuracy_score(y_test, knn_pred))
print('Classification Report:\n', classification_report(y_test, knn_pred))
print('ROC-AUC Score:\n',auc_for_knn)

Accuracy score:
 0.8156424581005587
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.86      0.85       107
           1       0.78      0.75      0.77        72

    accuracy                           0.82       179
   macro avg       0.81      0.80      0.81       179
weighted avg       0.81      0.82      0.81       179

ROC-AUC Score:
 0.8498831775700935


## 7. SVM

In [132]:
svm_model = SVC(probability=True, random_state=42)
svm_model.fit(X_train, y_train)

In [133]:
svm_pred = svm_model.predict(X_test)

In [134]:
y_prob = svm_model.predict_proba(X_test)[:, 1]
auc_for_svm = roc_auc_score(y_test, y_prob)

In [135]:
print('Accuracy score:\n',accuracy_score(y_test, svm_pred))
print('Classification Report:\n', classification_report(y_test, svm_pred))
print('ROC-AUC Score:\n',auc_for_svm)

Accuracy score:
 0.8212290502793296
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.92      0.86       107
           1       0.84      0.68      0.75        72

    accuracy                           0.82       179
   macro avg       0.83      0.80      0.81       179
weighted avg       0.82      0.82      0.82       179

ROC-AUC Score:
 0.8533878504672897


## 8. Naive Bayes Algorithm

In [136]:
nb_model=GaussianNB()
nb_model.fit(X_train,y_train)

In [137]:
nb_pred=nb_model.predict(X_test)

In [138]:
y_prob = nb_model.predict_proba(X_test)[:,1]
auc_for_nb = roc_auc_score(y_test, y_prob)

In [139]:
print('Accuracy score:\n',accuracy_score(y_test, nb_pred))
print('Classification Report:\n', classification_report(y_test, nb_pred))
print('ROC-AUC Score:\n',auc_for_nb)

Accuracy score:
 0.7932960893854749
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.83      0.83       107
           1       0.75      0.74      0.74        72

    accuracy                           0.79       179
   macro avg       0.79      0.78      0.78       179
weighted avg       0.79      0.79      0.79       179

ROC-AUC Score:
 0.843782450674974


## With HyperParameter

## 1 Logistic Regression

In [173]:
parameter_lr={'C': [0.01, 0.1, 1, 10],'penalty': ['l1', 'l2']}

In [174]:
grid_search_log = GridSearchCV(estimator=LogisticRegression(max_iter=1000), param_grid=parameter_lr, scoring='accuracy', cv=5)

In [175]:
grid_search_log.fit(X_train,y_train)

20 fits failed out of a total of 40.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Avita\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Avita\anaconda3\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Avita\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1194, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

In [176]:
best_log_model = grid_search_log.best_estimator_
log_pred = best_log_model.predict(X_test)
y_prob = best_log_model.predict_proba(X_test)[:, 1]

print("Best Hyperparameters:", grid_search_log.best_params_)
print("Accuracy Score:", accuracy_score(y_test, log_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_prob))
print("Classification Report:\n", classification_report(y_test, log_pred))


Best Hyperparameters: {'C': 0.01, 'penalty': 'l2'}
Accuracy Score: 0.8156424581005587
ROC AUC Score: 0.8632528556593978
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.91      0.85       107
           1       0.83      0.68      0.75        72

    accuracy                           0.82       179
   macro avg       0.82      0.79      0.80       179
weighted avg       0.82      0.82      0.81       179



## 2. Random Forest 

In [177]:
parameter_rf = {'n_estimators': [100, 200, 300],'max_depth': [None, 5, 10],'min_samples_split': [2, 5, 10],'min_samples_leaf': [1, 2, 4],
                'bootstrap': [True, False]
}

In [178]:
grid_search_rf = GridSearchCV(estimator=RandomForestClassifier(random_state=67), param_grid=parameter_rf, scoring='accuracy', cv=5)

In [179]:
grid_search_rf.fit(X_train,y_train)

In [180]:
best_rf_model = grid_search_rf.best_estimator_
rf_pred = best_rf_model.predict(X_test)
y_prob_rf = best_log_model.predict_proba(X_test)[:, 1]

print("Best Hyperparameters:", grid_search_rf.best_params_)
print("Accuracy Score:", accuracy_score(y_test, rf_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_prob_rf))
print("Classification Report:\n", classification_report(y_test, rf_pred))

Best Hyperparameters: {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Accuracy Score: 0.8547486033519553
ROC AUC Score: 0.8632528556593978
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.93      0.88       107
           1       0.87      0.75      0.81        72

    accuracy                           0.85       179
   macro avg       0.86      0.84      0.84       179
weighted avg       0.86      0.85      0.85       179



## 3. Gradient Boosting

In [181]:
parameter_gb = {
    'n_estimators': [50, 100, 150],        
    'learning_rate': [0.01, 0.1, 0.2],     
    'max_depth': [3, 4, 5],                
    'subsample': [0.8, 1.0],               
    'min_samples_split': [2, 5],           
}

In [182]:
grid_search_gb = GridSearchCV(estimator=GradientBoostingClassifier(random_state=31), param_grid=parameter_gb, scoring='roc_auc', cv=5)

In [183]:
grid_search_gb.fit(X_train,y_train)

In [184]:
best_gb_model = grid_search_gb.best_estimator_
gb_pred = best_gb_model.predict(X_test)
y_prob_gb = best_gb_model.predict_proba(X_test)[:, 1]

print("Best Hyperparameters:", grid_search_gb.best_params_)
print("Accuracy Score:", accuracy_score(y_test, gb_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_prob_gb))
print("Classification Report:\n", classification_report(y_test, gb_pred))

Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 3, 'min_samples_split': 2, 'n_estimators': 100, 'subsample': 0.8}
Accuracy Score: 0.8491620111731844
ROC AUC Score: 0.8916796469366562
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.93      0.88       107
           1       0.87      0.74      0.80        72

    accuracy                           0.85       179
   macro avg       0.85      0.83      0.84       179
weighted avg       0.85      0.85      0.85       179



## 4. XG Boost

In [195]:
parameter_xgb={
    'n_estimators': [50, 100, 150],        
    'learning_rate': [0.01, 0.1, 0.2],     
    'max_depth': [3, 4, 5],                
    'subsample': [0.8, 1.0],             
}

In [196]:
grid_search_xgb = GridSearchCV(estimator=XGBClassifier(random_state=37), param_grid=parameter_xgb, scoring='roc_auc', cv=5)

In [197]:
grid_search_xgb.fit(X_train,y_train)

In [198]:
best_xgb_model = grid_search_xgb.best_estimator_
xgb_pred = best_xgb_model.predict(X_test)
y_prob_xgb = best_xgb_model.predict_proba(X_test)[:, 1]

print("Best Hyperparameters:", grid_search_xgb.best_params_)
print("Accuracy Score:", accuracy_score(y_test, xgb_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_prob_xgb))
print("Classification Report:\n", classification_report(y_test, xgb_pred))

Best Hyperparameters: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}
Accuracy Score: 0.8100558659217877
ROC AUC Score: 0.8719496365524403
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.89      0.85       107
           1       0.81      0.69      0.75        72

    accuracy                           0.81       179
   macro avg       0.81      0.79      0.80       179
weighted avg       0.81      0.81      0.81       179



## 5. Light GBM

In [190]:
parameter_lgbm={
    'n_estimators': [50, 100, 150],        
    'learning_rate': [0.01, 0.1, 0.2],     
    'max_depth': [3, 4, 5],                
    'subsample': [0.8, 1.0],               
}

In [191]:
grid_search_lgbm = GridSearchCV(estimator=LGBMClassifier(), param_grid=parameter_lgbm, scoring='accuracy', cv=5)

In [193]:
grid_search_lgbm.fit(X_train,y_train)

[LightGBM] [Info] Number of positive: 216, number of negative: 353
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000452 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 187
[LightGBM] [Info] Number of data points in the train set: 569, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.379613 -> initscore=-0.491190
[LightGBM] [Info] Start training from score -0.491190
[LightGBM] [Info] Number of positive: 216, number of negative: 353
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000212 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 189
[LightGBM] [Info] Number of data points in the train set: 569, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.379613 -> initscore=-0.491190
[LightGBM] [Info] 

In [194]:
best_lgbm_model = grid_search_lgbm.best_estimator_
lgbm_pred = best_lgbm_model.predict(X_test)
y_prob_lgbm = best_lgbm_model.predict_proba(X_test)[:, 1]

print("Best Hyperparameters:", grid_search_lgbm.best_params_)
print("Accuracy Score:", accuracy_score(y_test, lgbm_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_prob_lgbm))
print("Classification Report:\n", classification_report(y_test, lgbm_pred))

Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Accuracy Score: 0.8379888268156425
ROC AUC Score: 0.878309968847352
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.91      0.87       107
           1       0.84      0.74      0.79        72

    accuracy                           0.84       179
   macro avg       0.84      0.82      0.83       179
weighted avg       0.84      0.84      0.84       179



## 6. KNN

In [199]:
parameter_knn={
    'n_neighbors': [3, 5, 7, 9, 11],        
    'weights': ['uniform','distance'],     
    'metric': ['euclidean','manhattan','minkowski'],                               
}

In [200]:
grid_search_knn = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=parameter_knn, scoring='accuracy', cv=5)

In [201]:
grid_search_knn.fit(X_train,y_train)

In [202]:
best_knn_model = grid_search_knn.best_estimator_
knn_pred = best_knn_model.predict(X_test)
y_prob_knn = best_knn_model.predict_proba(X_test)[:, 1]

print("Best Hyperparameters:", grid_search_knn.best_params_)
print("Accuracy Score:", accuracy_score(y_test, knn_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_prob_knn))
print("Classification Report:\n", classification_report(y_test, knn_pred))

Best Hyperparameters: {'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'uniform'}
Accuracy Score: 0.8100558659217877
ROC AUC Score: 0.8781152647975079
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.89      0.85       107
           1       0.81      0.69      0.75        72

    accuracy                           0.81       179
   macro avg       0.81      0.79      0.80       179
weighted avg       0.81      0.81      0.81       179



## 7. SVM

In [203]:
parameter_svm={
    'C': [0.1, 1, 10],        
    'kernel': ['linear', 'rbf', 'poly'], 
    'gamma': ['scale', 'auto'],
}

In [204]:
grid_search_svm = GridSearchCV(estimator=SVC(probability=True), param_grid=parameter_svm, scoring='roc_auc', cv=5)

In [205]:
grid_search_svm.fit(X_train,y_train)

In [206]:
best_svm_model = grid_search_svm.best_estimator_
svm_pred = best_knn_model.predict(X_test)
y_prob_svm = best_knn_model.predict_proba(X_test)[:, 1]

print("Best Hyperparameters:", grid_search_svm.best_params_)
print("Accuracy Score:", accuracy_score(y_test, svm_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_prob_svm))
print("Classification Report:\n", classification_report(y_test, svm_pred))

Best Hyperparameters: {'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}
Accuracy Score: 0.8100558659217877
ROC AUC Score: 0.8781152647975079
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.89      0.85       107
           1       0.81      0.69      0.75        72

    accuracy                           0.81       179
   macro avg       0.81      0.79      0.80       179
weighted avg       0.81      0.81      0.81       179

