## Final Project

## Predicting the probability of a stroke

Importing libraries

In [16]:
import pandas as pd
import numpy as np
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn import svm
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_auc_score, auc
from xgboost import XGBClassifier 

Load data - preprocessed in ETL file

In [17]:
def load_file(file_path):
    file_to_read = file_path
    df = pd.read_csv(file_to_read)
    return df
  
df = load_file(r'df_ohencoded.csv')

In [18]:
def load_file(file_path):
    file_to_read = file_path
    df = pd.read_csv(file_to_read)
    return df
  
df_ohencoded = load_file(r'df_ohencoded.csv')

In [19]:
df_ohencoded

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,1,67.0,0,1,1,1,228.69,36.600000,1,0,0,1,0,0,0,1,0,0
1,0,61.0,0,0,1,0,202.21,28.893237,1,0,0,0,1,0,0,0,1,0
2,1,80.0,0,1,1,0,105.92,32.500000,1,0,0,1,0,0,0,0,1,0
3,0,49.0,0,0,1,1,171.23,34.400000,1,0,0,1,0,0,0,0,0,1
4,0,79.0,1,0,1,0,174.12,24.000000,1,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5104,0,80.0,1,0,1,1,83.75,28.893237,0,0,0,1,0,0,0,0,1,0
5105,0,81.0,0,0,1,1,125.20,40.000000,0,0,0,0,1,0,0,0,1,0
5106,0,35.0,0,0,1,0,82.99,30.600000,0,0,0,0,1,0,0,0,1,0
5107,1,51.0,0,0,1,0,166.29,25.600000,0,0,0,1,0,0,0,1,0,0


Setting up the data - separating, train-test split, scaling, and instituting SMOTE

In [20]:
#separate the data to X and y
X = df_ohencoded.drop('stroke', axis=1)
y = df_ohencoded['stroke']
# SMOTEless train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.30, random_state= 27, stratify = y)
# standardization
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)
#xtrain and y train with smote
sm = SMOTE(random_state=27)
X_train_smote, y_train_smote = sm.fit_resample(X_train, y_train)



Function to train the models.  The function will loop through each of the models in the list

In [21]:

def evaluate(X_train,y_train,X_test,y_test, smote):
    """Function that loops through several models and produces various scores"""
    models= [['Logistic Regression ',LogisticRegression(random_state = 50, max_iter = 10000)],
            ['KNearest Neighbor ',KNeighborsClassifier(n_neighbors = 285)],
            ['Decision Tree Classifier ',DecisionTreeClassifier(random_state =27, max_depth =2, min_samples_split =3)],
            ['Ada Boost ',AdaBoostClassifier(random_state = 27, n_estimators = 137)],
            ['SVM ',SVC(random_state = 27, kernel = 'linear')]]

    for name,model in models:

        model = model
        model.fit(X_train,y_train)
        y_pred = model.predict(X_test)
        roc = roc_auc_score(y_test, y_pred) 
        precision = precision_score(y_test, y_pred) 
        recall = recall_score(y_test, y_pred) 
        f1 = f1_score(y_test, y_pred) 
        cm_model = confusion_matrix(y_test, y_pred)
        
        if smote == 1:
            print ("Smote Version")

        print(name)
        print('CM Model: ')
        print(cm_model)
        print('Test Accuracy: ',accuracy_score(y_test,model.predict(X_test)))
        print('Training Accuracy: ',accuracy_score(y_train,model.predict(X_train)))
        print('ROC AUC Score: ', roc)
        print('Precision: ', precision)
        print('Recall: ', recall)
        print('f1: ', f1)

        print('-------------------------------------------------')
        
    
        


Scores without using SMOTE

In [22]:

evaluate(X_train,y_train,X_test,y_test, 0)


  _warn_prf(average, modifier, msg_start, len(result))


Logistic Regression 
CM Model: 
[[1458    0]
 [  75    0]]
Test Accuracy:  0.9510763209393346
Training Accuracy:  0.9513422818791947
ROC AUC Score:  0.5
Precision:  0.0
Recall:  0.0
f1:  0.0
-------------------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))


KNearest Neighbor 
CM Model: 
[[1458    0]
 [  75    0]]
Test Accuracy:  0.9510763209393346
Training Accuracy:  0.9513422818791947
ROC AUC Score:  0.5
Precision:  0.0
Recall:  0.0
f1:  0.0
-------------------------------------------------
Decision Tree Classifier 
CM Model: 
[[1458    0]
 [  75    0]]
Test Accuracy:  0.9510763209393346
Training Accuracy:  0.9513422818791947
ROC AUC Score:  0.5
Precision:  0.0
Recall:  0.0
f1:  0.0
-------------------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))


Ada Boost 
CM Model: 
[[1458    0]
 [  74    1]]
Test Accuracy:  0.9517286366601435
Training Accuracy:  0.9532997762863534
ROC AUC Score:  0.5066666666666667
Precision:  1.0
Recall:  0.013333333333333334
f1:  0.02631578947368421
-------------------------------------------------
SVM 
CM Model: 
[[1458    0]
 [  75    0]]
Test Accuracy:  0.9510763209393346
Training Accuracy:  0.9513422818791947
ROC AUC Score:  0.5
Precision:  0.0
Recall:  0.0
f1:  0.0
-------------------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))


Models trained with SMOTE sampling

In [23]:
# Scores using SMOTE
evaluate(X_train_smote,y_train_smote,X_test,y_test, 1)


Smote Version
Logistic Regression 
CM Model: 
[[1084  374]
 [  17   58]]
Test Accuracy:  0.7449445531637312
Training Accuracy:  0.7833627278071722
ROC AUC Score:  0.75840877914952
Precision:  0.13425925925925927
Recall:  0.7733333333333333
f1:  0.2287968441814596
-------------------------------------------------
Smote Version
KNearest Neighbor 
CM Model: 
[[894 564]
 [  9  66]]
Test Accuracy:  0.6262230919765166
Training Accuracy:  0.7361845972957084
ROC AUC Score:  0.7465843621399177
Precision:  0.10476190476190476
Recall:  0.88
f1:  0.1872340425531915
-------------------------------------------------
Smote Version
Decision Tree Classifier 
CM Model: 
[[812 646]
 [  5  70]]
Test Accuracy:  0.5753424657534246
Training Accuracy:  0.7748383303938859
ROC AUC Score:  0.7451303155006859
Precision:  0.09776536312849161
Recall:  0.9333333333333333
f1:  0.17699115044247787
-------------------------------------------------
Smote Version
Ada Boost 
CM Model: 
[[493 965]
 [  3  72]]
Test Accuracy

In [24]:
#option 1 - the parameters together

def evaluate_2( n_splits, n_repeats, n_jobs):
    models= [['AdaBoost ',AdaBoostClassifier()],
            ['DecisionTree ',DecisionTreeClassifier()]]


    for name,model in models:
        resample=SMOTEENN(enn=EditedNearestNeighbours(sampling_strategy='all'))
        pipeline=Pipeline(steps=[('r', resample), ('m', model)])
        cv=RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=1)
        scoring=['accuracy','precision_macro','recall_macro']
        scores = cross_validate(pipeline, X, y, scoring=scoring, cv=cv, n_jobs=n_jobs)
        print(name)
        print('Mean Accuracy: %.4f' % np.mean(scores['test_accuracy']))
        print('Mean Precision: %.4f' % np.mean(scores['test_precision_macro']))
        print('Mean Recall: %.4f' % np.mean(scores['test_recall_macro']))

        print('----------------------------------------')
        
    
        


In [25]:
evaluate_2(10, 3, -1)

AdaBoost 
Mean Accuracy: 0.8349
Mean Precision: 0.5615
Mean Recall: 0.6828
----------------------------------------
DecisionTree 
Mean Accuracy: 0.8555
Mean Precision: 0.5465
Mean Recall: 0.6128
----------------------------------------


In [26]:
#option 2, run the adaboost and decisiontree with kfold individually to adjust the parameters. It is set up with default ones. 
def evaluate_3(model, n_splits = 10, n_repeats = 3, n_jobs = -1):
    resample=SMOTEENN(enn=EditedNearestNeighbours(sampling_strategy='all'))
    pipeline=Pipeline(steps=[('r', resample), ('m', model)])
    cv=RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=1)
    scoring=['accuracy','precision_macro','recall_macro']
    scores = cross_validate(pipeline, X, y, scoring=scoring, cv=cv, n_jobs=n_jobs)

    print('Mean Accuracy: %.4f' % np.mean(scores['test_accuracy']))
    print('Mean Precision: %.4f' % np.mean(scores['test_precision_macro']))
    print('Mean Recall: %.4f' % np.mean(scores['test_recall_macro']))

In [27]:
model=AdaBoostClassifier(random_state =27)
evaluate_3(model)

Mean Accuracy: 0.8362
Mean Precision: 0.5621
Mean Recall: 0.6829


In [28]:
model_2 = DecisionTreeClassifier()
evaluate_3(model)

Mean Accuracy: 0.8334
Mean Precision: 0.5609
Mean Recall: 0.6833


Functions to find the best optimization of each model

In [29]:
# finding the right number of neighbors
def optimize(upper):
    k = 0
    n = 0 
    for number in range(1, upper):
        knn = KNeighborsClassifier(n_neighbors = number)
        knn.fit(X_train_smote, y_train_smote)
        y_pred = knn.predict(X_test)       
        b = recall_score(y_test, y_pred)
        if b>k:
            n=number
            k=b

    print('The recall score is ' + str(k))
    print('The optimal n_neighbor number is ' + str(n))

optimize(300)
    

The recall score is 0.88
The optimal n_neighbor number is 285


In [30]:
def optimizeAda(upper):
    k = 0
    n = 0 
    for number in range(1, upper):
        abc = AdaBoostClassifier(n_estimators = number)
        abc.fit(X_train_smote, y_train_smote)
        y_pred = abc.predict(X_test)       
        b = recall_score(y_test, y_pred)
        if b>k:
            n=number
            k=b
    print('The recall score is ' + str(k))
    print('The optimal n_estimators number is ' + str(n))

optimizeAda(300)
    

The recall score is 0.96
The optimal n_estimators number is 137


In [31]:
# finding the best learning rate. Looks like the optimal one is the default. 

def optimizeAdaLR():
    k = 0
    n = [.00001, .0001, .001, .01, .1, 1, 1.1, 1.2, 1.3, 1.4 , 1.5, 1,6, 1.7, 1.8, 1.9, 2.0] 
    for number in n:
        abc = AdaBoostClassifier(n_estimators = 137, learning_rate = number)
        abc.fit(X_train_smote, y_train_smote)
        y_pred = abc.predict(X_test)       
        b = recall_score(y_test, y_pred)
        if b>k:
            n=number
            k=b
    
        
    print('The recall score is ' + str(k))
    print('The optimal learning rate is ' + str(n))

    
    
    
optimizeAdaLR()

The recall score is 0.96
The optimal learning rate is 1


In [32]:
# Fine tuning DecisionTreeClassifier - didn't keep every iteration 
dtc = DecisionTreeClassifier(random_state = 27, max_depth = 2, min_samples_leaf = 5)
dtc.fit(X_train_smote, y_train_smote)
y_pred = dtc.predict(X_test)  
c = accuracy_score(y_test,dtc.predict(X_test))
b = recall_score(y_test, y_pred)

print('The recall score is ' + str(b))
print("accuracy = " + str(c))
    
    
    


The recall score is 0.9333333333333333
accuracy = 0.5753424657534246


In [33]:
#fine tuning SVC - didn't keep every iteration
k = 0
n = ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'] 

sv = SVC(random_state = 27, kernel = 'linear')
sv.fit(X_train_smote, y_train_smote)
y_pred = sv.predict(X_test)     
c = accuracy_score(y_test,sv.predict(X_test))

b = recall_score(y_test, y_pred)

print('The recall score is ' + str(b))
print("accuracy = " + str(c))
    
    
    


The recall score is 0.8133333333333334
accuracy = 0.7247227658186562


In [34]:
def optimizeC(upper):
    k = 0
    n = 0 
    for number in range(1, upper):
        sv = SVC(random_state = 27, kernel = 'linear', C= number)
        sv.fit(X_train_smote, y_train_smote)
        y_pred = sv.predict(X_test)       
        b = recall_score(y_test, y_pred)
        if b>k:
            n=number
            k=b
    
        
    print('The recall score is ' + str(k))
    print('The optimal C is ' + str(n))

    
    
    
optimizeC(20)
    

The recall score is 0.8133333333333334
The optimal C is 1
