In [78]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.metrics import classification_report
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_auc_score
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_val_score
# from pandas_profiling import ProfileReport

#model
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier #boosted decision tree
from sklearn.ensemble import RandomForestClassifier
# from sklearn.ensemble import VotingClassifier

#scoring
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

from sklearn.model_selection import GridSearchCV
import missingno as msno

import warnings
warnings.filterwarnings("ignore")

In [89]:
df = pd.read_csv("df_train.csv")
df_norm = pd.read_csv("df_train_normalized.csv")
df_stand = pd.read_csv("df_train_standarize.csv")
df_val = pd.read_csv("df_val.csv")

In [90]:
X = df.drop(['stroke'], axis = 1)
y = df['stroke']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,random_state = 42, stratify = y)

X_norm = df_norm.drop(['stroke'], axis = 1)
y_norm = df_norm['stroke']
X_norm_train, X_norm_test, y_norm_train, y_norm_test = train_test_split(X_norm, y_norm, test_size = 0.2,random_state = 42, stratify = y_norm)

X_stand = df_stand.drop(['stroke'], axis = 1)
y_stand = df_stand['stroke']
X_stand_train, X_stand_test, y_stand_train, y_stand_test = train_test_split(X_stand, y_stand, test_size = 0.2,random_state = 42, stratify = y_stand)

X_val = df_val.drop(['stroke'], axis = 1)
y_val = df_val['stroke']

In [None]:
y_train.value_counts()

0    2915
1     150
Name: stroke, dtype: int64

In [None]:
y_test.value_counts()

0    730
1     37
Name: stroke, dtype: int64

In [None]:
model = LogisticRegression().fit(X_train, y_train)

#Baseline Model

In [24]:
model = DummyClassifier(strategy='constant', constant = 1, random_state= 42)
scores = cross_val_score(model, X, y, scoring='roc_auc', n_jobs=-1)
print('Mean roc_auc: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

Mean roc_auc: 0.500 (0.000)


In [None]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      1.00      0.98       730
           1       0.00      0.00      0.00        37

    accuracy                           0.95       767
   macro avg       0.48      0.50      0.49       767
weighted avg       0.91      0.95      0.93       767



In [20]:
roc_auc_score(y_test, y_pred)

0.5

#Model Selection

In [36]:
models = []
names = []

models.append(LogisticRegression(solver='liblinear'))
names.append('LogisticRegression')

models.append(GaussianNB())
names.append('Naive Bayes')

models.append(KNeighborsClassifier())
names.append('KNN')

models.append(SVC(gamma='scale'))
names.append('SVM')

models.append(DecisionTreeClassifier())
names.append('Decision Tree')

models.append(BaggingClassifier())
names.append('Bagging Decision Tree')

models.append(AdaBoostClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=25, learning_rate = 0.1))
names.append('Boosted Decision tree')

models.append(RandomForestClassifier())
names.append('Random Forest')

In [47]:
def model_score(model, name):
  scores = cross_val_score(model, X, y, scoring='roc_auc', n_jobs=-1)
  print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))
  # print(f'{name} {np.mean(scores)} {np.std(scores)}')

In [48]:
for i in range(len(models)):
  model_score(models[i],names[i])

>LogisticRegression 0.835 (0.018)
>Naive Bayes 0.796 (0.014)
>KNN 0.662 (0.050)
>SVM 0.616 (0.074)
>Decision Tree 0.567 (0.027)
>Bagging Decision Tree 0.744 (0.033)
>Boosted Decision tree 0.552 (0.028)
>Random Forest 0.794 (0.018)


##normalize

In [49]:
def model_score_norm(model, name):
  scores = cross_val_score(model, X_norm, y_norm, scoring='roc_auc', n_jobs=-1)
  print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))

In [50]:
for i in range(len(models)):
  model_score_norm(models[i],names[i])

>LogisticRegression 0.834 (0.018)
>Naive Bayes 0.796 (0.014)
>KNN 0.626 (0.027)
>SVM 0.655 (0.047)
>Decision Tree 0.567 (0.034)
>Bagging Decision Tree 0.717 (0.051)
>Boosted Decision tree 0.545 (0.030)
>Random Forest 0.788 (0.021)


##standarization

In [51]:
def model_score_stand(model, name):
  scores = cross_val_score(model, X_stand, y_stand, scoring='roc_auc', n_jobs=-1)
  print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))

In [52]:
for i in range(len(models)):
  model_score_stand(models[i],names[i])

>LogisticRegression 0.835 (0.019)
>Naive Bayes 0.796 (0.014)
>KNN 0.624 (0.040)
>SVM 0.683 (0.042)
>Decision Tree 0.564 (0.035)
>Bagging Decision Tree 0.706 (0.041)
>Boosted Decision tree 0.569 (0.025)
>Random Forest 0.783 (0.019)


In [73]:
def model_score2(model, name):
  y_pred = model.fit(X_train, y_train).predict(X_test)
  try:
    score  = model.predict_proba(X_test)[:, 1]
    roc = roc_auc_score(y_test, score, average ='weighted')
  except:
    roc = 0
  print(f'{name}:')
  print(f'ROC score {roc}')
  print(f'Accuracy {accuracy_score(y_test,y_pred)}')
  print(f'f1 {f1_score(y_test,y_pred, average = "weighted")}')
  print(f'Recall {recall_score(y_test,y_pred, average = "weighted")}')
  print(f'Precision {precision_score(y_test,y_pred, average = "weighted")}')
  print()

def model_score2_norm(model, name):
  y_pred = model.fit(X_norm_train, y_norm_train).predict(X_norm_test)
  try:
    score  = model.predict_proba(X_norm_test)[:, 1]
    roc = roc_auc_score(y_norm_test, score, average ='weighted')
  except:
    roc = 0
  print(f'{name}:')
  print(f'ROC score {roc}')
  print(f'Accuracy {accuracy_score(y_norm_test,y_pred)}')
  print(f'f1 {f1_score(y_norm_test,y_pred, average = "weighted")}')
  print(f'Recall {recall_score(y_norm_test,y_pred, average = "weighted")}')
  print(f'Precision {precision_score(y_norm_test,y_pred, average = "weighted")}')
  print()

def model_score2_stand(model, name):
  y_pred = model.fit(X_stand_train, y_stand_train).predict(X_stand_test)
  try:
    score  = model.predict_proba(X_stand_test)[:, 1]
    roc = roc_auc_score(y_stand_test, score, average ='weighted')
  except:
    roc = 0
  print(f'{name}:')
  print(f'ROC score {roc}')
  print(f'Accuracy {accuracy_score(y_stand_test,y_pred)}')
  print(f'f1 {f1_score(y_stand_test,y_pred, average = "weighted")}')
  print(f'Recall {recall_score(y_stand_test,y_pred, average = "weighted")}')
  print(f'Precision {precision_score(y_stand_test,y_pred, average = "weighted")}')
  print()

In [71]:
for i in range(len(models)):
  model_score2(models[i],names[i])

LogisticRegression:
ROC score 0.845353572750833
Accuracy 0.9517601043024772
f1 0.9282363074693499
Recall 0.9517601043024772
Precision 0.9058472961418623

Naive Bayes:
ROC score 0.8417252869307664
Accuracy 0.5280312907431551
f1 0.6461614913593164
Recall 0.5280312907431551
Precision 0.9562334780889642

KNN:
ROC score 0.6301740096260644
Accuracy 0.9413298565840938
f1 0.922996367100589
Recall 0.9413298565840938
Precision 0.9053633666750837

SVM:
ROC score 0
Accuracy 0.9517601043024772
f1 0.9282363074693499
Recall 0.9517601043024772
Precision 0.9058472961418623

Decision Tree:
ROC score 0.5523139577934099
Accuracy 0.9048239895697523
f1 0.9106211757041874
Recall 0.9048239895697523
Precision 0.9167545373934897

Bagging Decision Tree:
ROC score 0.7437615697889671
Accuracy 0.9478487614080835
f1 0.9305949425389762
Recall 0.9478487614080835
Precision 0.9217118742096441

Boosted Decision tree:
ROC score 0.565142539800074
Accuracy 0.9048239895697523
f1 0.9115073052860053
Recall 0.9048239895697523
P

In [74]:
for i in range(len(models)):
  model_score2_norm(models[i],names[i])

LogisticRegression:
ROC score 0.8494631617919289
Accuracy 0.9517601043024772
f1 0.9282363074693499
Recall 0.9517601043024772
Precision 0.9058472961418623

Naive Bayes:
ROC score 0.8417252869307664
Accuracy 0.36897001303780963
f1 0.4861781138390232
Recall 0.36897001303780963
Precision 0.9551859702157369

KNN:
ROC score 0.6320066641984451
Accuracy 0.9504563233376793
f1 0.9321138412336666
Recall 0.9504563233376793
Precision 0.9273400472920024

SVM:
ROC score 0
Accuracy 0.9517601043024772
f1 0.9282363074693499
Recall 0.9517601043024772
Precision 0.9058472961418623

Decision Tree:
ROC score 0.550259163272862
Accuracy 0.9009126466753585
f1 0.9083149192509906
Recall 0.9009126466753585
Precision 0.9162278955998356

Bagging Decision Tree:
ROC score 0.6905960755275824
Accuracy 0.940026075619296
f1 0.9243078087062621
Recall 0.940026075619296
Precision 0.9108236396371989

Boosted Decision tree:
ROC score 0.5637726767863754
Accuracy 0.9022164276401564
f1 0.9099490891299867
Recall 0.9022164276401564

In [75]:
for i in range(len(models)):
  model_score2_stand(models[i],names[i])

LogisticRegression:
ROC score 0.8453165494261385
Accuracy 0.9517601043024772
f1 0.9282363074693499
Recall 0.9517601043024772
Precision 0.9058472961418623

Naive Bayes:
ROC score 0.8417252869307664
Accuracy 0.38852672750977835
f1 0.5079048806388288
Recall 0.38852672750977835
Precision 0.9552875275056558

KNN:
ROC score 0.6261940022213995
Accuracy 0.9478487614080835
f1 0.9285408855091231
Recall 0.9478487614080835
Precision 0.9164430391442269

SVM:
ROC score 0
Accuracy 0.9517601043024772
f1 0.9282363074693499
Recall 0.9517601043024772
Precision 0.9058472961418623

Decision Tree:
ROC score 0.5637726767863754
Accuracy 0.9022164276401564
f1 0.9099490891299867
Recall 0.9022164276401564
Precision 0.9183199460252915

Bagging Decision Tree:
ROC score 0.708237689744539
Accuracy 0.9426336375488917
f1 0.9257140471275065
Recall 0.9426336375488917
Precision 0.9119177658598973

Boosted Decision tree:
ROC score 0.5779711218067383
Accuracy 0.9048239895697523
f1 0.9123504467531869
Recall 0.90482398956975

**Logistic Regression get the highest score so we will use logistic regression**

In [79]:
param_grid = {'penalty' : ['l1', 'l2'],
    'C' : np.logspace(-4, 4, 20),
    'solver' : ['liblinear']}

In [80]:
model = LogisticRegression()
gscv = GridSearchCV(model, param_grid = param_grid, cv = 5, verbose=True, n_jobs=-1)

In [83]:
best_model = gscv.fit(X,y)
best_model.best_params_

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    5.3s finished


{'C': 0.0001, 'penalty': 'l1', 'solver': 'liblinear'}

In [84]:
best_model = gscv.fit(X_norm,y_norm)
best_model.best_params_

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   12.8s finished


{'C': 0.0001, 'penalty': 'l1', 'solver': 'liblinear'}

In [85]:
best_model = gscv.fit(X_stand,y_stand)
best_model.best_params_

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   10.4s finished


{'C': 0.0001, 'penalty': 'l1', 'solver': 'liblinear'}

##Preprocess unseen data

In [86]:
X_train.shape

(3065, 20)

#final score

###With hyperparameter Tuning

In [101]:
model = LogisticRegression(C = 0.0001, penalty = 'l1', solver= 'liblinear')
def final_score(model,X,y, X_val, y_val):
  y_pred = model.fit(X, y).predict(X_val)
  try:
    score  = model.predict_proba(X_val)[:, 1]
    roc = roc_auc_score(y_val, score, average ='weighted')
  except:
    roc = 0
  print(f'ROC score {roc}')
  print(f'Accuracy {accuracy_score(y_val,y_pred)}')
  print(f'f1 {f1_score(y_val,y_pred, average = "weighted")}')
  print(f'Recall {recall_score(y_val,y_pred, average = "weighted")}')
  print(f'Precision {precision_score(y_val,y_pred, average = "weighted")}')
  print()

In [102]:
final_score(model, X, y, X_val, y_val)

ROC score 0.3669288518675721
Accuracy 0.9514866979655712
f1 0.9278330591228023
Recall 0.9514866979655712
Precision 0.905326936405426



###Without hyperparameter tuning

In [118]:
model = LogisticRegression()

In [124]:
y_pred = model.fit(X, y).predict(X_val)
try:
  score  = model.predict_proba(X_val)[:, 1]
  roc = roc_auc_score(y_val, score, average ='weighted')
except:
  roc = 0
print('ROC score {:.3f}'.format(roc))
print('Accuracy {:.3f}'.format(accuracy_score(y_val,y_pred)))
print('f1 {:.3f}'.format(f1_score(y_val,y_pred, average = "weighted")))
print('Recall {:.3f}'.format(recall_score(y_val,y_pred, average = "weighted")))
print('Precision {:.3f}'.format(precision_score(y_val,y_pred, average = "weighted")))
print()

ROC score 0.841
Accuracy 0.952
f1 0.930
Recall 0.952
Precision 0.955



In [120]:
import pickle
filename = 'finalized_model.sav'
pickle.dump(model, open(filename, 'wb'))

In [125]:
loaded_model = pickle.load(open(filename, 'rb'))

In [126]:
def model_score(model):
  y_pred = model.predict(X_val)
  try:
    score  = model.predict_proba(X_val)[:, 1]
    roc = roc_auc_score(y_val, score, average ='weighted')
  except:
    roc = 0
  print('ROC score {:.3f}'.format(roc))
  print('Accuracy {:.3f}'.format(accuracy_score(y_val,y_pred)))
  print('f1 {:.3f}'.format(f1_score(y_val,y_pred, average = "weighted")))
  print('Recall {:.3f}'.format(recall_score(y_val,y_pred, average = "weighted")))
  print('Precision {:.3f}'.format(precision_score(y_val,y_pred, average = "weighted")))
  print()

In [127]:
model_score(loaded_model)

ROC score 0.841
Accuracy 0.952
f1 0.930
Recall 0.952
Precision 0.955



Catboost Classifier Testing

In [129]:
df

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,39.0,0,0,79.44,22.7,0,1,0,1,0,1,0,0,0,0,0,1,0,0,1,0
1,55.0,0,0,68.79,27.0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,1,0
2,59.0,0,0,240.71,43.9,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0
3,79.0,0,0,82.07,30.4,0,1,0,1,0,0,0,0,1,0,1,0,1,0,0,0
4,75.0,0,0,68.38,33.8,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3827,45.0,0,0,92.86,35.1,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0
3828,16.0,0,0,113.47,19.5,0,1,0,1,0,0,0,0,0,1,1,0,1,0,0,0
3829,61.0,0,0,78.65,36.2,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0
3830,31.0,0,0,74.05,26.0,0,0,1,0,1,0,0,1,0,0,0,1,1,0,0,0


In [130]:
df.columns

Index(['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi',
       'stroke', 'gender_Female', 'gender_Male', 'ever_married_No',
       'ever_married_Yes', 'work_type_Govt_job', 'work_type_Never_worked',
       'work_type_Private', 'work_type_Self-employed', 'work_type_children',
       'Residence_type_Rural', 'Residence_type_Urban',
       'smoking_status_Unknown', 'smoking_status_formerly smoked',
       'smoking_status_never smoked', 'smoking_status_smokes'],
      dtype='object')

In [131]:
cat_features = ['hypertension', 'heart_disease', 'gender_Female', 'gender_Male', 'ever_married_No',
       'ever_married_Yes', 'work_type_Govt_job', 'work_type_Never_worked',
       'work_type_Private', 'work_type_Self-employed', 'work_type_children',
       'Residence_type_Rural', 'Residence_type_Urban',
       'smoking_status_Unknown', 'smoking_status_formerly smoked',
       'smoking_status_never smoked', 'smoking_status_smokes']

In [132]:
! pip install catboost

Collecting catboost
  Downloading catboost-0.26.1-cp37-none-manylinux1_x86_64.whl (67.4 MB)
[K     |████████████████████████████████| 67.4 MB 27 kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.26.1


In [136]:
from catboost import CatBoostClassifier
clf = CatBoostClassifier(
    iterations=100,
)


clf.fit(
    X_train, y_train,
    cat_features=cat_features,
    eval_set=(X_test, y_test),
)

Learning rate set to 0.113778
0:	learn: 0.5163550	test: 0.5177935	best: 0.5177935 (0)	total: 2.85ms	remaining: 283ms
1:	learn: 0.4193709	test: 0.4210284	best: 0.4210284 (1)	total: 5.72ms	remaining: 280ms
2:	learn: 0.3425561	test: 0.3422662	best: 0.3422662 (2)	total: 8.88ms	remaining: 287ms
3:	learn: 0.2947274	test: 0.2964628	best: 0.2964628 (3)	total: 11.7ms	remaining: 280ms
4:	learn: 0.2646433	test: 0.2662796	best: 0.2662796 (4)	total: 16.2ms	remaining: 308ms
5:	learn: 0.2326303	test: 0.2352189	best: 0.2352189 (5)	total: 20.4ms	remaining: 319ms
6:	learn: 0.2151903	test: 0.2182179	best: 0.2182179 (6)	total: 24.9ms	remaining: 330ms
7:	learn: 0.2046411	test: 0.2076103	best: 0.2076103 (7)	total: 26.6ms	remaining: 306ms
8:	learn: 0.1890220	test: 0.1934256	best: 0.1934256 (8)	total: 30.1ms	remaining: 305ms
9:	learn: 0.1785712	test: 0.1839145	best: 0.1839145 (9)	total: 33.2ms	remaining: 299ms
10:	learn: 0.1709101	test: 0.1788489	best: 0.1788489 (10)	total: 36ms	remaining: 292ms
11:	learn: 0.

<catboost.core.CatBoostClassifier at 0x7fdf19484890>

In [None]:
def model_score(model):
  y_pred = model.predict(X_val)
  try:
    score  = model.predict_proba(X_val)[:, 1]
    roc = roc_auc_score(y_val, score, average ='weighted')
  except:
    roc = 0
  print('ROC score {:.3f}'.format(roc))
  print('Accuracy {:.3f}'.format(accuracy_score(y_val,y_pred)))
  print('f1 {:.3f}'.format(f1_score(y_val,y_pred, average = "weighted")))
  print('Recall {:.3f}'.format(recall_score(y_val,y_pred, average = "weighted")))
  print('Precision {:.3f}'.format(precision_score(y_val,y_pred, average = "weighted")))
  print()

In [134]:
def model_score2(model, name):
  y_pred = model.predict(X_test)
  try:
    score  = model.predict_proba(X_test)[:, 1]
    roc = roc_auc_score(y_test, score, average ='weighted')
  except:
    roc = 0
  print(f'{name}:')
  print(f'ROC score {roc}')
  print(f'Accuracy {accuracy_score(y_test,y_pred)}')
  print(f'f1 {f1_score(y_test,y_pred, average = "weighted")}')
  print(f'Recall {recall_score(y_test,y_pred, average = "weighted")}')
  print(f'Precision {precision_score(y_test,y_pred, average = "weighted")}')
  print()

In [137]:
model_score2(clf, 'Catboost')

Catboost:
ROC score 0.8307293594964827
Accuracy 0.9517601043024772
f1 0.9282363074693499
Recall 0.9517601043024772
Precision 0.9058472961418623



In [None]:
# LogisticRegression:
# ROC score 0.845353572750833
# Accuracy 0.9517601043024772
# f1 0.9282363074693499
# Recall 0.9517601043024772
# Precision 0.9058472961418623