In [1]:
import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='white', context='notebook', palette='deep')
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score
import math
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from imblearn.under_sampling import TomekLinks
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')
from sklearn import preprocessing

In [2]:
df = pd.read_csv("healthcare-dataset-stroke-data.csv")
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [3]:
# We should delete Customer ID because we don't want to get visualizations at customer level. That would lead to a lot of categories in the visualization
df.drop('id', inplace=True, axis=1)

In [4]:
# Replacing nAns with mean
df['bmi']=df['bmi'].replace(np.NaN,df['bmi'].mean())


In [5]:
df.isna().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [6]:
# encoding ordinal categorical variable "gender"
gender_mapping = {"Male": 0, "Female": 1, "Other": 2}
df.gender = df.gender.map(gender_mapping)

# encoding the ordinal categorical variable "ever_married"
ever_married_mapping = {"Yes": 0, "No":1}
df.ever_married = df.ever_married.map(ever_married_mapping)

# encoding the ordinal categorical variable "work_type"
work_type_mapping = {"children": 0, "Govt_jov":1, "Never_worked": 2, "Private": 3, "Self-employed": 4}
df.work_type = df.work_type.map(work_type_mapping)

# encoding the ordinal categorical variable "Residence_type"
Residence_type_mapping = {"Rural": 0, "Urban":1}
df.Residence_type = df.Residence_type.map(Residence_type_mapping)

# encoding the ordinal categorical variable "smoking_status"
smoking_status_mapping = {"formerly smoked": 0, "never smoked": 1, "smokes": 2, "Unknown": 3}
df.smoking_status = df.smoking_status.map(smoking_status_mapping)

In [7]:
def encoder_categorical_variables(dataframe):
    # encoding ordinal categorical variable "gender"
    gender_mapping = {"Male": 0, "Female": 1, "Other": 2}
    dataframe.gender = dataframe.gender.map(gender_mapping)

    # encoding the ordinal categorical variable "ever_married"
    ever_married_mapping = {"Yes": 0, "No":1}
    dataframe.ever_married = dataframe.ever_married.map(ever_married_mapping)

    # encoding the ordinal categorical variable "work_type"
    work_type_mapping = {"children": 0, "Govt_jov": 1, "Never_worked": 2, "Private": 3, "Self-employed": 4}
    dataframe.work_type = dataframe.work_type.map(work_type_mapping)

    # encoding the ordinal categorical variable "Residence_type"
    Residence_type_mapping = {"Rural": 0, "Urban":1}
    dataframe.Residence_type = dataframe.Residence_type.map(Residence_type_mapping)
    
    # encoding the ordinal categorical variable "smoking_status"
    smoking_status_mapping = {"formerly smoked": 0, "never smoked": 1, "smokes": 2, "Unknown": 3}
    dataframe.smoking_status = dataframe.smoking_status.map(smoking_status_mapping)
    
    return dataframe


In [8]:
df1 = pd.read_csv("healthcare-dataset-stroke-data.csv", index_col =0)
df1 = encoder_categorical_variables(df1)

In [9]:
df1['bmi']=df1['bmi'].replace(np.NaN,df1['bmi'].mean())
df1['work_type']=df1['work_type'].replace(np.NaN,df1['work_type'].mean())

In [10]:
df1.isna().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [11]:
df1

Unnamed: 0_level_0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
9046,0,67.0,0,1,0,3.000000,1,228.69,36.600000,0,1
51676,1,61.0,0,0,0,4.000000,0,202.21,28.893237,1,1
31112,0,80.0,0,1,0,3.000000,0,105.92,32.500000,1,1
60182,1,49.0,0,0,0,3.000000,1,171.23,34.400000,2,1
1665,1,79.0,1,0,0,4.000000,0,174.12,24.000000,1,1
...,...,...,...,...,...,...,...,...,...,...,...
18234,1,80.0,1,0,0,3.000000,1,83.75,28.893237,1,0
44873,1,81.0,0,0,0,4.000000,1,125.20,40.000000,1,0
19723,1,35.0,0,0,0,4.000000,0,82.99,30.600000,1,0
37544,0,51.0,0,0,0,3.000000,0,166.29,25.600000,0,0


In [12]:
df['work_type']=df['work_type'].replace(np.NaN,df['work_type'].mean())

In [13]:
model_df = df.copy()

In [14]:
model_df.isna().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [15]:
model_df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,67.0,0,1,0,3.000000,1,228.69,36.600000,0,1
1,1,61.0,0,0,0,4.000000,0,202.21,28.893237,1,1
2,0,80.0,0,1,0,3.000000,0,105.92,32.500000,1,1
3,1,49.0,0,0,0,3.000000,1,171.23,34.400000,2,1
4,1,79.0,1,0,0,4.000000,0,174.12,24.000000,1,1
...,...,...,...,...,...,...,...,...,...,...,...
5105,1,80.0,1,0,0,3.000000,1,83.75,28.893237,1,0
5106,1,81.0,0,0,0,4.000000,1,125.20,40.000000,1,0
5107,1,35.0,0,0,0,4.000000,0,82.99,30.600000,1,0
5108,0,51.0,0,0,0,3.000000,0,166.29,25.600000,0,0


In [16]:
def preprocessing(dataframe):
    X = dataframe.drop(["stroke"], axis=1)
    y = dataframe["stroke"]
    

    X_train_val, X_test, y_train_val, y_test = train_test_split(X,y, test_size=0.2, random_state =0)
    X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size = 0.25, random_state=0)
    return X_train, X_test, X_val, y_train, y_test, y_val

In [17]:
X_train, X_test, X_val, y_train, y_test, y_val = preprocessing(model_df)

In [18]:
X_train['work_type']=X_train['work_type'].replace(np.NaN,X_train['work_type'].mean())

In [19]:
X_train.isna().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
dtype: int64

In [20]:
X_test

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
42,0,82.0,0,1,0,3.0,1,144.90,26.400000,2
380,0,4.0,0,0,1,0.0,0,106.22,16.700000,3
3524,0,58.0,0,0,0,3.0,1,79.95,25.900000,1
352,1,20.0,0,0,1,3.0,0,96.57,34.100000,1
4927,1,10.0,0,0,1,0.0,0,69.84,13.700000,3
...,...,...,...,...,...,...,...,...,...,...
472,1,43.0,0,0,0,3.0,0,75.05,22.900000,2
4446,1,42.0,0,0,0,3.0,1,191.94,27.900000,1
660,0,52.0,0,0,1,3.0,0,69.37,36.200000,3
57,0,78.0,0,0,0,3.0,1,237.75,28.893237,0


In [21]:
X_val

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
2087,1,42.0,0,0,0,4.0,1,98.76,26.4,2
5083,1,19.0,0,0,1,3.0,1,90.57,24.2,3
119,1,77.0,0,0,0,3.0,1,105.22,31.0,1
2032,1,54.0,0,0,0,3.0,1,100.29,30.2,1
3137,0,10.0,0,0,1,0.0,1,70.70,25.4,3
...,...,...,...,...,...,...,...,...,...,...
1006,1,6.0,0,0,1,0.0,0,72.07,19.5,3
4982,0,20.0,0,0,1,3.0,0,75.90,32.2,1
433,1,69.0,0,0,0,4.0,0,225.47,36.9,1
1579,0,23.0,0,0,1,3.0,0,83.86,19.5,1


In [22]:
scaler1 = StandardScaler()
scaler2 = MinMaxScaler()
scaler3 = PolynomialFeatures(degree=2, interaction_only=True) 

In [23]:

#X_train = scaler2.fit_transform(X_train)
#X_test = scaler2.fit_transform(X_test)

In [24]:
weights={0:1, 1:20}

In [25]:
from sklearn import model_selection, datasets
from sklearn.tree import DecisionTreeClassifier
import joblib
import pickle

In [26]:
model = LogisticRegression(class_weight=weights, random_state=42)


In [27]:
# fit the model with data
model.fit(X_train, y_train)
y_val = model.predict(X_test)

In [28]:
target_names = ['0', '1']
print(classification_report(y_test, y_val, target_names=target_names))

              precision    recall  f1-score   support

           0       0.98      0.74      0.85       968
           1       0.14      0.74      0.23        54

    accuracy                           0.74      1022
   macro avg       0.56      0.74      0.54      1022
weighted avg       0.94      0.74      0.81      1022



In [29]:


filename = "Completed_model.joblib"
joblib.dump(model, filename)

['Completed_model.joblib']

In [30]:
loaded_model = joblib.load(filename)
result = loaded_model.score(X_test, y_test)


In [31]:
print(X_test)

      gender   age  hypertension  heart_disease  ever_married  work_type  \
42         0  82.0             0              1             0        3.0   
380        0   4.0             0              0             1        0.0   
3524       0  58.0             0              0             0        3.0   
352        1  20.0             0              0             1        3.0   
4927       1  10.0             0              0             1        0.0   
...      ...   ...           ...            ...           ...        ...   
472        1  43.0             0              0             0        3.0   
4446       1  42.0             0              0             0        3.0   
660        0  52.0             0              0             1        3.0   
57         0  78.0             0              0             0        3.0   
2367       1  35.0             0              0             0        3.0   

      Residence_type  avg_glucose_level        bmi  smoking_status  
42                

# 