In [1]:
#Import the essential function
import sqlite3
import pandas as pd
import numpy as np

# Import the train-test, standard scaler function
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Import the classification measurement function
from sklearn.metrics import confusion_matrix,classification_report,plot_confusion_matrix

# Import the GridSearch function
from sklearn.model_selection import GridSearchCV


# Import the Classification Machine Learning Models function
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier,AdaBoostClassifier





In [2]:
# Import the supporting function  

# - Data cleaning and standardisation based on the EDA finding
from support import DataStandard

# - Consolidated measurment function - Classification report with Confusion Matrix
from support import ClassMeasure 

# - Search for the defined parameters based on the given param grid and the machine learning and the train with the best available parameters
from support import MLGridSearch

In [3]:
#Connect to the Survive Database

sql_connect = sqlite3.connect('..\data\survive.db')

# Save the SQL query string

query = "SELECT * FROM survive"

#Create a dataframe 

source = pd.read_sql_query(query,sql_connect)
#source

In [4]:
# Load the clean data with the support function

df = DataStandard(source)


#Split the dataframe to Feature(X) and Label(y)

#-Features
X = df.drop('Survive',axis=1)

#-Label
y = df['Survive']

#Split the Feature and Label data from Training and Testing

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101,stratify=y)

# Standard Scaling

scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)


____________________

#### Logistic Regression

In [5]:
# Define the parameters for the param grid

# - Penalty Type
penalty = ['l1', 'l2', 'elasticnet', 'none']

# - Use logarithmically spaced C values
C = np.logspace(0, 4, 10)

param_grid = {'C':C,'penalty':penalty}

# Instantiate the selected ML Model with defined parameters and select the best parameters via GridSearchCV

LRGrid = MLGridSearch(LogisticRegression(solver='saga',multi_class="ovr",max_iter=5000),param_grid,scaled_X_train,y_train)


In [6]:
# Check the Best Parameters

LRGrid.best_params_

{'C': 1.0, 'penalty': 'none'}

In [7]:
# Run the prediction

y_pred_LRGrid = LRGrid.predict(scaled_X_test)
ClassMeasure(y_test,y_pred_LRGrid)


Classification report
              precision    recall  f1-score   support

           0       0.84      0.90      0.87      2872
           1       0.78      0.66      0.71      1479

    accuracy                           0.82      4351
   macro avg       0.81      0.78      0.79      4351
weighted avg       0.82      0.82      0.82      4351


Confusion Matrix
[[2591  281]
 [ 501  978]]


#### K Nearest Neighbour 

In [8]:
# Define the parameters for the param grid

k_values = list(range(1,100))

param_grid = {'n_neighbors': k_values}

# Instantiate the selected ML Model with defined parameters and select the best parameters via GridSearchCV

KnnGrid = MLGridSearch(KNeighborsClassifier(),param_grid,scaled_X_train, y_train)

In [9]:
# Check the Best Parameters

KnnGrid.best_params_

{'n_neighbors': 1}

In [10]:
y_pred_knn = KnnGrid.predict(scaled_X_test)
ClassMeasure(y_test,y_pred_knn)


Classification report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2872
           1       0.99      0.99      0.99      1479

    accuracy                           1.00      4351
   macro avg       1.00      1.00      1.00      4351
weighted avg       1.00      1.00      1.00      4351


Confusion Matrix
[[2862   10]
 [   8 1471]]


#### Support Vector Classification

In [24]:
# Define the parameters for the param grid

param_grid = {'C':[0.001,0.01,0.1,0.5,1],'gamma':['scale','auto'],'decision_function_shape':['ovo', 'ovr']}

# Instantiate the selected ML Model with defined parameters and select the best parameters via GridSearchCV

SVCGrid = MLGridSearch(SVC(),param_grid,scaled_X_train, y_train)

In [25]:
# Check the Best Parameters

SVCGrid.best_params_

{'C': 1, 'decision_function_shape': 'ovo', 'gamma': 'scale'}

In [26]:
y_pred_SVC = SVCGrid.predict(scaled_X_test)
ClassMeasure(y_test,y_pred_SVC)


Classification report
              precision    recall  f1-score   support

           0       0.92      0.98      0.95      2872
           1       0.95      0.84      0.90      1479

    accuracy                           0.93      4351
   macro avg       0.94      0.91      0.92      4351
weighted avg       0.93      0.93      0.93      4351


Confusion Matrix
[[2813   59]
 [ 231 1248]]


#### Random Forest

In [23]:
# Define the parameters for the param grid

estimators = list(range(1,100))

param_grid = {'n_estimators': estimators}

# Instantiate the selected ML Model with defined parameters and select the best parameters via GridSearchCV

RFGrid = MLGridSearch(RandomForestClassifier(),param_grid,scaled_X_train, y_train)

In [15]:
# Check the Best Parameters

RFGrid.best_params_

{'n_estimators': 24}

In [16]:
y_pred_RF = RFGrid.predict(scaled_X_test)
ClassMeasure(y_test,y_pred_RF)


Classification report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2872
           1       1.00      1.00      1.00      1479

    accuracy                           1.00      4351
   macro avg       1.00      1.00      1.00      4351
weighted avg       1.00      1.00      1.00      4351


Confusion Matrix
[[2872    0]
 [   0 1479]]


#### Gradient Boosting

In [17]:
# Define the parameters for the param grid

estimators = list(range(1,100))

param_grid = {"n_estimators":estimators}

# Instantiate the selected ML Model with defined parameters and select the best parameters via GridSearchCV

GBGrid = MLGridSearch(GradientBoostingClassifier(),param_grid,scaled_X_train,y_train)

In [18]:
# Check the Best Parameters

GBGrid.best_params_

{'n_estimators': 99}

In [19]:
y_pred_GB = GBGrid.predict(scaled_X_test)
ClassMeasure( y_test, y_pred_GB)


Classification report
              precision    recall  f1-score   support

           0       0.94      0.98      0.96      2872
           1       0.97      0.88      0.92      1479

    accuracy                           0.95      4351
   macro avg       0.95      0.93      0.94      4351
weighted avg       0.95      0.95      0.95      4351


Confusion Matrix
[[2825   47]
 [ 171 1308]]


#### Ada Boosting

In [20]:
# Define the parameters for the param grid

estimators = list(range(1,100))

param_grid = {"n_estimators":estimators}

# Instantiate the selected ML Model with defined parameters and select the best parameters via GridSearchCV

ABGrid = MLGridSearch(AdaBoostClassifier(),param_grid,scaled_X_train,y_train)

In [21]:
# Check the Best Parameters

ABGrid.best_params_

{'n_estimators': 99}

In [22]:
y_pred_AB = ABGrid.predict(scaled_X_test)
ClassMeasure(y_test,y_pred_AB)


Classification report
              precision    recall  f1-score   support

           0       0.90      0.96      0.93      2872
           1       0.90      0.79      0.84      1479

    accuracy                           0.90      4351
   macro avg       0.90      0.87      0.88      4351
weighted avg       0.90      0.90      0.90      4351


Confusion Matrix
[[2747  125]
 [ 312 1167]]
