In [1]:
#Import the essential function
import sqlite3
import pandas as pd
import numpy as np

# Import the train-test, standard scaler function
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Import the classification measurement function
from sklearn.metrics import confusion_matrix,classification_report,plot_confusion_matrix

# Import the GridSearch function
from sklearn.model_selection import GridSearchCV


# Import the Classification Machine Learning Models function
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier,AdaBoostClassifier

#Cross Validation
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import recall_score,make_scorer



In [2]:
# Import the supporting function  

# - Data cleaning and standardisation based on the EDA finding
from support import DataStandard

# - Consolidated measurment function - Classification report with Confusion Matrix
from support import ClassMeasure 

# - Search for the defined parameters based on the given param grid and the machine learning and the train with the best available parameters
from support import MLGridSearch

In [3]:
#Connect to the Survive Database

sql_connect = sqlite3.connect('..\data\survive.db')

# Save the SQL query string

query = "SELECT * FROM survive"

#Create a dataframe 

source = pd.read_sql_query(query,sql_connect)
#source

In [4]:
# Load the clean data with the support function

df = DataStandard(source)


#Split the dataframe to Feature(X) and Label(y)

#-Features
X = df.drop('Survive',axis=1)

#-Label
y = df['Survive']

#Split the Feature and Label data from Training and Testing

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101,stratify=y)

# Standard Scaling

scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)


In [35]:
#Cross Validation Evaluation on the ML model

#Define the scoring strategy for cross validation
recallscore = make_scorer(recall_score)

# Standard Scaling the X data (In line with GridSearch scaled training set)
scaled_X= scaler.fit_transform(X)


def cross_val(ML):

    # prepare the cross-validation procedure
    cv = KFold(n_splits=5, random_state=1, shuffle=True)


    # evaluate model
    scores = cross_val_score(ML, scaled_X, y, scoring=recallscore, cv=cv, n_jobs=-1)
    
    # report performance
    print('ML model :',ML.best_estimator_)
    print('\nScore',scores )
    print('\nRecall (average & Standard deviation): %.3f (%.3f)' % (scores.mean(), scores.std()))




____________________

#### Logistic Regression

In [6]:
# Define the parameters for the param grid

# - Penalty Type
penalty = ['l1', 'l2', 'elasticnet', 'none']

# - Use logarithmically spaced C values
C = np.logspace(0, 4, 10)

param_grid = {'C':C,'penalty':penalty}

# Instantiate the selected ML Model with defined parameters and select the best parameters via GridSearchCV

LRGrid = MLGridSearch(LogisticRegression(solver='saga',multi_class="ovr",max_iter=5000),param_grid,scaled_X_train,y_train)


In [36]:
#Cross Validation on the model with hyperparameter tuning

cross_val(LRGrid)

ML model : LogisticRegression(max_iter=5000, multi_class='ovr', penalty='none',
                   solver='saga')

Score [0.68162839 0.6612411  0.68987342 0.66561514 0.65230769]

Recall (average & Standard deviation): 0.670 (0.014)


In [19]:
# Run the prediction

y_pred_LRGrid = LRGrid.predict(scaled_X_test)
ClassMeasure(y_test,y_pred_LRGrid)


Classification report
              precision    recall  f1-score   support

           0       0.84      0.91      0.87      2906
           1       0.78      0.65      0.71      1445

    accuracy                           0.82      4351
   macro avg       0.81      0.78      0.79      4351
weighted avg       0.82      0.82      0.82      4351


Confusion Matrix
[[2644  262]
 [ 511  934]]


#### K Nearest Neighbour 

In [8]:
# Define the parameters for the param grid

k_values = list(range(1,100))

param_grid = {'n_neighbors': k_values}

# Instantiate the selected ML Model with defined parameters and select the best parameters via GridSearchCV

KnnGrid = MLGridSearch(KNeighborsClassifier(),param_grid,scaled_X_train, y_train)

In [21]:
# Check the Best Parameters

KnnGrid.best_params_

{'n_neighbors': 1}

In [37]:
#Cross Validation on the model with hyperparameter tuning

cross_val(KnnGrid)

ML model : KNeighborsClassifier(n_neighbors=1)

Score [0.99478079 0.99694812 0.99683544 0.99684543 0.99897436]

Recall (average & Standard deviation): 0.997 (0.001)


In [22]:
y_pred_knn = KnnGrid.predict(scaled_X_test)
ClassMeasure(y_test,y_pred_knn)


Classification report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2906
           1       0.99      0.99      0.99      1445

    accuracy                           0.99      4351
   macro avg       0.99      0.99      0.99      4351
weighted avg       0.99      0.99      0.99      4351


Confusion Matrix
[[2895   11]
 [  13 1432]]


#### Support Vector Classification

In [10]:
# Define the parameters for the param grid

param_grid = {'C':[0.001,0.01,0.1,0.5,1],'gamma':['scale','auto'],'decision_function_shape':['ovo', 'ovr']}

# Instantiate the selected ML Model with defined parameters and select the best parameters via GridSearchCV

SVCGrid = MLGridSearch(SVC(),param_grid,scaled_X_train, y_train)

In [23]:
# Check the Best Parameters

SVCGrid.best_params_

{'C': 1, 'decision_function_shape': 'ovo', 'gamma': 'auto'}

In [38]:
#Cross Validation on the model with hyperparameter tuning

cross_val(SVCGrid)

ML model : SVC(C=1, decision_function_shape='ovo', gamma='auto')

Score [0.90292276 0.88911495 0.8871308  0.87066246 0.89230769]

Recall (average & Standard deviation): 0.888 (0.010)


In [24]:
y_pred_SVC = SVCGrid.predict(scaled_X_test)
ClassMeasure(y_test,y_pred_SVC)


Classification report
              precision    recall  f1-score   support

           0       0.93      0.98      0.96      2906
           1       0.97      0.86      0.91      1445

    accuracy                           0.94      4351
   macro avg       0.95      0.92      0.93      4351
weighted avg       0.94      0.94      0.94      4351


Confusion Matrix
[[2861   45]
 [ 200 1245]]


#### Random Forest

In [12]:
# Define the parameters for the param grid

estimators = list(range(1,100))

param_grid = {'n_estimators': estimators}

# Instantiate the selected ML Model with defined parameters and select the best parameters via GridSearchCV

RFGrid = MLGridSearch(RandomForestClassifier(),param_grid,scaled_X_train, y_train)

In [13]:
# Check the Best Parameters

RFGrid.best_params_

{'n_estimators': 83}

In [39]:
#Cross Validation on the model with hyperparameter tuning

cross_val(RFGrid)

ML model : RandomForestClassifier(n_estimators=83)

Score [1.         0.99898271 1.         1.         1.        ]

Recall (average & Standard deviation): 1.000 (0.000)


In [18]:
y_pred_RF = RFGrid.predict(scaled_X_test)
ClassMeasure(y_test,y_pred_RF)


Classification report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2906
           1       1.00      1.00      1.00      1445

    accuracy                           1.00      4351
   macro avg       1.00      1.00      1.00      4351
weighted avg       1.00      1.00      1.00      4351


Confusion Matrix
[[2906    0]
 [   0 1445]]
