In [84]:
# IMPORT Dependencies
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as st
from tensorflow.keras.utils import to_categorical
import numpy as np

In [85]:
# Import the Australian Road Death Dataset
stroke_data = pd.read_csv('../Resources/healthcare-dataset-stroke-data.csv')
stroke_data

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [86]:
stroke_data.shape

(5110, 12)

In [87]:
#drop duplicates
stroke_data = stroke_data.drop_duplicates(subset='id', keep="first")

In [88]:
stroke_data = stroke_data.dropna()

In [89]:
stroke_data.shape

(4909, 12)

In [90]:
stroke_data = stroke_data.drop('id', 1)


  stroke_data = stroke_data.drop('id', 1)


In [91]:
stroke_data

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
5104,Female,13.0,0,0,No,children,Rural,103.08,18.6,Unknown,0
5106,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [92]:
stroke_data['gender'].value_counts()

Female    2897
Male      2011
Other        1
Name: gender, dtype: int64

In [93]:
stroke_data['age'].value_counts()

78.00    93
57.00    93
52.00    85
54.00    84
55.00    83
         ..
1.40      3
0.16      3
0.40      2
0.08      2
0.48      2
Name: age, Length: 104, dtype: int64

In [94]:
stroke_data['hypertension'].value_counts()

0    4458
1     451
Name: hypertension, dtype: int64

In [95]:
stroke_data['heart_disease'].value_counts()

0    4666
1     243
Name: heart_disease, dtype: int64

In [96]:
stroke_data['ever_married'].value_counts()

Yes    3204
No     1705
Name: ever_married, dtype: int64

In [97]:
stroke_data['work_type'].value_counts()

Private          2811
Self-employed     775
children          671
Govt_job          630
Never_worked       22
Name: work_type, dtype: int64

In [98]:
stroke_data['Residence_type'].value_counts()

Urban    2490
Rural    2419
Name: Residence_type, dtype: int64

In [99]:
stroke_data['avg_glucose_level'].value_counts()

93.88     6
72.49     5
83.16     5
73.00     5
91.68     5
         ..
72.06     1
97.90     1
230.59    1
224.63    1
85.28     1
Name: avg_glucose_level, Length: 3852, dtype: int64

In [100]:
stroke_data['bmi'].value_counts()

28.7    41
28.4    38
26.7    37
27.6    37
26.1    37
        ..
48.7     1
49.2     1
51.0     1
49.4     1
14.9     1
Name: bmi, Length: 418, dtype: int64

In [101]:
stroke_data['smoking_status'].value_counts()

never smoked       1852
Unknown            1483
formerly smoked     837
smokes              737
Name: smoking_status, dtype: int64

In [102]:
stroke_data['stroke'].value_counts()

0    4700
1     209
Name: stroke, dtype: int64

In [103]:
stroke_data = stroke_data.replace("Unknown", np.nan)

In [104]:
#drop rows containing null values
stroke_data = stroke_data.dropna()

In [105]:
#check shape 
stroke_data.shape

(3426, 11)

In [106]:
# Set features. 
X = stroke_data.drop("stroke", axis=1)
y = stroke_data["stroke"]
print(X.shape, y.shape)

(3426, 10) (3426,)


In [107]:
X.rename(columns = {'Residence_type':'residence_type'}, inplace = True)

In [108]:
# Establish the spending bins and group names. - ref ; https://www.statcan.gc.ca/en/concepts/definitions/age2
age_bins = [0, 15, 25, 65, 120]
group_names = ["child", "youth", "adult", "senior"]

# Categorize spending based on the bins.
X["age_category"] = pd.cut(stroke_data['age'], age_bins, labels=group_names)
X

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status,age_category
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,senior
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,senior
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,adult
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,senior
5,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,senior
...,...,...,...,...,...,...,...,...,...,...,...
5100,Male,82.0,1,0,Yes,Self-employed,Rural,71.97,28.3,never smoked,senior
5102,Female,57.0,0,0,Yes,Private,Rural,77.93,21.7,never smoked,adult
5106,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,senior
5107,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,adult


In [109]:
# Establish the spending bins and group names. https://www.health.nsw.gov.au/heal/Pages/bmi.aspx
bmi_bins = [0, 18.5, 25, 30, 10000000]
groupbmi_names = ["underweight", "healthy_weight", "overweight", "obese"]

# Categorize spending based on the bins.
X["bmi_category"] = pd.cut(stroke_data['bmi'], bmi_bins, labels=groupbmi_names)
X

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status,age_category,bmi_category
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,senior,obese
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,senior,obese
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,adult,obese
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,senior,healthy_weight
5,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,senior,overweight
...,...,...,...,...,...,...,...,...,...,...,...,...
5100,Male,82.0,1,0,Yes,Self-employed,Rural,71.97,28.3,never smoked,senior,overweight
5102,Female,57.0,0,0,Yes,Private,Rural,77.93,21.7,never smoked,adult,healthy_weight
5106,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,senior,obese
5107,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,adult,obese


In [110]:
# Establish the spending bins and group names. https://www.mayoclinic.org/diseases-conditions/diabetes/diagnosis-treatment/drc-20371451
bg_bins = [0, 140, 200, 10000000]
groupbg_names = ["normal", "prediabetic", "diabetic"]

# Categorize spending based on the bins.
X["bloodglucose_cat"] = pd.cut(stroke_data['avg_glucose_level'], bg_bins, labels=groupbg_names)
X

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status,age_category,bmi_category,bloodglucose_cat
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,senior,obese,diabetic
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,senior,obese,normal
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,adult,obese,prediabetic
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,senior,healthy_weight,prediabetic
5,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,senior,overweight,prediabetic
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5100,Male,82.0,1,0,Yes,Self-employed,Rural,71.97,28.3,never smoked,senior,overweight,normal
5102,Female,57.0,0,0,Yes,Private,Rural,77.93,21.7,never smoked,adult,healthy_weight,normal
5106,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,senior,obese,normal
5107,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,adult,obese,normal


In [111]:
X = X.drop(["age","avg_glucose_level","bmi"], axis=1)
X

Unnamed: 0,gender,hypertension,heart_disease,ever_married,work_type,residence_type,smoking_status,age_category,bmi_category,bloodglucose_cat
0,Male,0,1,Yes,Private,Urban,formerly smoked,senior,obese,diabetic
2,Male,0,1,Yes,Private,Rural,never smoked,senior,obese,normal
3,Female,0,0,Yes,Private,Urban,smokes,adult,obese,prediabetic
4,Female,1,0,Yes,Self-employed,Rural,never smoked,senior,healthy_weight,prediabetic
5,Male,0,0,Yes,Private,Urban,formerly smoked,senior,overweight,prediabetic
...,...,...,...,...,...,...,...,...,...,...
5100,Male,1,0,Yes,Self-employed,Rural,never smoked,senior,overweight,normal
5102,Female,0,0,Yes,Private,Rural,never smoked,adult,healthy_weight,normal
5106,Female,0,0,Yes,Self-employed,Urban,never smoked,senior,obese,normal
5107,Female,0,0,Yes,Self-employed,Rural,never smoked,adult,obese,normal


In [112]:
X['gender'] = X['gender'].replace({'Male':0,'Female':1,'Other':-1}).astype(np.uint8)
X['residence_type'] = X['residence_type'].replace({'Rural':0,'Urban':1}).astype(np.uint8)
X['work_type'] = X['work_type'].replace({'Private':0,'Self-employed':1,'Govt_job':2,'children':3,'Never_worked':4}).astype(np.uint8)
X['ever_married'] = X['ever_married'].replace({'Yes':1,'No':0}).astype(np.uint8)
X['smoking_status'] = X['smoking_status'].replace({'never smoked':0,'smokes':1,'formerly smoked':2}).astype(np.uint8)
X['age_category'] = X['age_category'].replace({'child':0,'youth':1,'adult':2,'senior':3}).astype(np.uint8)
X['bmi_category'] = X['bmi_category'].replace({'underweight':0,'healthy_weight':1,'overweight':2,'obese':3}).astype(np.uint8)
X['bloodglucose_cat'] = X['bloodglucose_cat'].replace({'normal':0,'prediabetic':1,'diabetic':2}).astype(np.uint8)
X

Unnamed: 0,gender,hypertension,heart_disease,ever_married,work_type,residence_type,smoking_status,age_category,bmi_category,bloodglucose_cat
0,0,0,1,1,0,1,2,3,3,2
2,0,0,1,1,0,0,0,3,3,0
3,1,0,0,1,0,1,1,2,3,1
4,1,1,0,1,1,0,0,3,1,1
5,0,0,0,1,0,1,2,3,2,1
...,...,...,...,...,...,...,...,...,...,...
5100,0,1,0,1,1,0,0,3,2,0
5102,1,0,0,1,0,0,0,2,1,0
5106,1,0,0,1,1,1,0,3,3,0
5107,1,0,0,1,1,0,0,2,3,0


In [113]:
#from sklearn.preprocessing import LabelEncoder
#label_encoder = LabelEncoder()
#label_encoder.fit(y)
#encoded_y = label_encoder.transform(y)

In [114]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=42)

In [151]:
X_train.shape

(1027, 10)

In [152]:
X_test.shape

(2399, 10)

In [115]:
from sklearn.preprocessing import StandardScaler 

In [116]:

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [117]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=2)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train.ravel())

In [118]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs', max_iter=1000)
classifier

In [125]:
classifier.fit(X_train_res, y_train_res)

In [129]:
print(f"Training Data Score: {classifier.score(X_train_res, y_train_res)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.7505165289256198
Testing Data Score: 0.7348895373072113


In [144]:
input_variables = pd.DataFrame([[0, 0, 0, 0, 0, 0,0, 0, 2, 1]],
                                       columns=["gender", "hypertension", "heart_disease", "ever_married", "work_type", "residence_type", "smoking_status", "age_category", "bmi_category", "bloodglucose_cat"])

prediction = classifier.predict(input_variables)
print(prediction)

[0]




In [146]:
prediction = classifier.predict(X_test)

In [147]:
prediction_df = pd.DataFrame({"actual":y_test, "prediction":prediction})
prediction_df

Unnamed: 0,actual,prediction
1133,0,0
2086,0,1
3236,0,0
1533,0,0
636,0,1
...,...,...
2794,0,0
871,0,1
1937,0,0
4906,0,0


In [149]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, prediction)

array([[1686,  592],
       [  44,   77]])

In [150]:
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, prediction))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.97      0.74      0.64      0.84      0.69      0.48      2278
          1       0.12      0.64      0.74      0.19      0.69      0.47       121

avg / total       0.93      0.73      0.64      0.81      0.69      0.48      2399



In [132]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
numeric_features = ["hypertension", "heart_disease"]
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)

categorical_features = ["gender","ever_married", "work_type"
                        ,"Residence_type","smoking_status","age_category","bmi_category","bloodglucose_cat"]
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

NameError: name 'OneHotEncoder' is not defined

In [133]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression())]
)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

clf.fit(X_train, y_train)
print(f"Training Data Score: {clf.score(X_train, y_train)}")
print("model score: %.3f" % clf.score(X_test, y_test))

NameError: name 'preprocessor' is not defined

In [94]:
X_test

Unnamed: 0,gender,hypertension,heart_disease,ever_married,work_type,residence_type,smoking_status,age_category,bmi_category,bloodglucose_cat
4100,Female,0,0,No,Private,Rural,never smoked,youth,overweight,normal
255,Female,0,0,Yes,Private,Urban,formerly smoked,adults,underweight,normal
44,Male,1,0,Yes,Govt_job,Urban,smokes,adults,healthy_weight,diabetes
2180,Female,0,0,Yes,Private,Rural,never smoked,adults,obese,prediabetic
2369,Male,0,0,Yes,Self-employed,Rural,formerly smoked,seniors,obese,normal
...,...,...,...,...,...,...,...,...,...,...
2229,Male,1,0,Yes,Self-employed,Rural,smokes,adults,obese,normal
3398,Female,0,0,No,Private,Rural,never smoked,adults,overweight,normal
1722,Female,0,0,Yes,Self-employed,Urban,never smoked,adults,underweight,normal
1397,Female,0,0,Yes,Private,Rural,formerly smoked,adults,obese,normal


In [89]:
#scaler = sgd_randomized_pipe.best_estimator_.named_steps['scl']
#classifier = sgd_randomized_pipe.best_estimator_.named_steps['clf']

In [90]:
param_grid = {
    "preprocessor__num__imputer__strategy": ["mean", "median"],
    "classifier__C": [0.1, 1.0, 10, 100],
}

grid_search = GridSearchCV(clf, param_grid, cv=10)
grid_search

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(strategy='median')),
                                                                                         ('scaler',
                                                                                          StandardScaler())]),
                                                                         ['hypertension',
                                                                          'heart_disease']),
                                                                        ('cat',
                                                                         OneHotEncoder(handle_unknown='ignore'),
                               

In [91]:
grid_search.fit(X_train, y_train)

print("Best params:")
print(grid_search.best_params_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best params:
{'classifier__C': 0.1, 'preprocessor__num__imputer__strategy': 'mean'}


In [51]:
print(f"Internal CV score: {grid_search.best_score_:.3f}")

Internal CV score: nan


In [52]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [0.001,0.01,0.1,1,10,100,1000],
                'max_iter':[100,500,1000]}
grid = GridSearchCV(clf, param_grid, verbose=10)

In [92]:
#print(grid.best_params_)
#print(grid.best_score_)

In [93]:
import pickle
with open('stroke2_logisticregression.pkl', 'wb') as file:
    pickle.dump(clf, file)

In [97]:
loaded_model = pickle.load(open('stroke2_logisticregression.pkl', 'rb'))
loaded_model.predict()

<function sklearn.pipeline.Pipeline.predict(self, X, **predict_params)>

In [124]:
input_variables = pd.DataFrame([["Female", 1, 1, "Yes", "Private", "Urban","smokes", "seniors", "obese", "diabetes"]],
                                       columns=["gender", "hypertension", "heart_disease", "ever_married", "work_type", "residence_type", "smoking_status", "age_category", "bmi_category", "bloodglucose_cat"])

prediction = clf.predict(input_variables)

In [125]:
print(prediction)

[0]


In [115]:
X_train

Unnamed: 0,gender,hypertension,heart_disease,ever_married,work_type,residence_type,smoking_status,age_category,bmi_category,bloodglucose_cat
4498,Female,0,0,Yes,Self-employed,Rural,never smoked,adults,healthy_weight,normal
1793,Female,0,0,Yes,Private,Rural,formerly smoked,seniors,obese,normal
2597,Female,0,0,Yes,Private,Urban,never smoked,adults,overweight,normal
4180,Male,0,0,Yes,Private,Urban,smokes,adults,obese,normal
4898,Female,0,0,No,children,Rural,never smoked,children,underweight,normal
...,...,...,...,...,...,...,...,...,...,...
1198,Female,0,1,Yes,Private,Urban,formerly smoked,seniors,healthy_weight,normal
4867,Male,0,0,Yes,Self-employed,Urban,formerly smoked,adults,overweight,normal
2446,Female,0,0,Yes,Self-employed,Urban,never smoked,seniors,obese,normal
3893,Female,0,0,Yes,Private,Rural,smokes,adults,overweight,normal
