In [22]:
import pandas as pd
import numpy as np
from sklearn.metrics import precision_score 
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score,recall_score

In [23]:
# Read the stroke dataset from its CSV file and preview the first rows.
csv_path = "../Dataset/healthcare-dataset-stroke-data.csv"
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [24]:
# Inspect the dtype pandas inferred for each column of the loaded frame.
data.dtypes

id                     int64
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

In [25]:
# Drop the 'id' identifier column before modelling.
# Assign the result instead of mutating in place, and tolerate the column
# already being gone, so re-running this cell does not raise a KeyError.
# The previous `axis=0` was dropped: pandas ignores `axis` when `columns=` is
# given, so it only made the call look like a row drop.
data = data.drop(columns='id', errors='ignore')


In [26]:
# Split the column names by dtype: int64/float64 columns are treated as
# numeric, everything else as categorical.
# Note: 'stroke' is an int64 column, so num_cols includes the target here.
numeric_kinds = ['int64', 'float64']
num_cols = data.select_dtypes(include=numeric_kinds).columns.tolist()
cat_cols = data.select_dtypes(exclude=numeric_kinds).columns.tolist()



In [27]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each category so the dummy columns are not perfectly collinear.
data = pd.get_dummies(
    data=data,
    columns=cat_cols,
    drop_first=True,
)
data.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Male,gender_Other,ever_married_Yes,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,67.0,0,1,228.69,36.6,1,True,False,True,False,True,False,False,True,True,False,False
1,61.0,0,0,202.21,,1,False,False,True,False,False,True,False,False,False,True,False
2,80.0,0,1,105.92,32.5,1,True,False,True,False,True,False,False,False,False,True,False
3,49.0,0,0,171.23,34.4,1,False,False,True,False,True,False,False,True,False,False,True
4,79.0,1,0,174.12,24.0,1,False,False,True,False,False,True,False,False,False,True,False


In [28]:
# Cast every boolean column (the one-hot dummies) to integers so the whole
# frame is numeric.
bool_cols = data.select_dtypes(include='bool').columns
data = data.astype(dict.fromkeys(bool_cols, 'int'))
data.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Male,gender_Other,ever_married_Yes,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,67.0,0,1,228.69,36.6,1,1,0,1,0,1,0,0,1,1,0,0
1,61.0,0,0,202.21,,1,0,0,1,0,0,1,0,0,0,1,0
2,80.0,0,1,105.92,32.5,1,1,0,1,0,1,0,0,0,0,1,0
3,49.0,0,0,171.23,34.4,1,0,0,1,0,1,0,0,1,0,0,1
4,79.0,1,0,174.12,24.0,1,0,0,1,0,0,1,0,0,0,1,0


In [29]:
# Count missing values per column of the encoded frame.
data.isna().sum()

age                                 0
hypertension                        0
heart_disease                       0
avg_glucose_level                   0
bmi                               201
stroke                              0
gender_Male                         0
gender_Other                        0
ever_married_Yes                    0
work_type_Never_worked              0
work_type_Private                   0
work_type_Self-employed             0
work_type_children                  0
Residence_type_Urban                0
smoking_status_formerly smoked      0
smoking_status_never smoked         0
smoking_status_smokes               0
dtype: int64

In [30]:
# Separate the feature matrix from the binary target column 'stroke'.
# `axis=0` was removed from the drop call: with `columns=` pandas ignores
# `axis`, so it had no effect and only suggested rows were being dropped.
X = data.drop(columns='stroke')
y = data['stroke']

In [31]:
# Hold out 20% of the rows for testing. The split is stratified on y and uses
# a fixed random_state so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [32]:
# Numeric preprocessing: fill missing values with the column median, then
# standardize with StandardScaler.
numeric_steps = [
    ('SimpleImputer', SimpleImputer(strategy='median')),
    ('Standarize', StandardScaler()),
]
preprocess = Pipeline(steps=numeric_steps)

In [33]:
# Show the numeric column names collected earlier; the list still contains
# the target column 'stroke'.
num_cols

['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi', 'stroke']

In [34]:
# Send the numeric feature columns through the impute-and-scale pipeline and
# pass every other column through unchanged (remainder='passthrough').
# num_cols also contains the target, so 'stroke' is excluded by name instead
# of slicing off the last element, which silently depended on column order.
# The variable name `column_tranfer` is kept because the model cell uses it.
numeric_features = [col for col in num_cols if col != 'stroke']
column_tranfer = ColumnTransformer(transformers=[
    ('num', preprocess, numeric_features)
], remainder='passthrough')

In [35]:
# Count missing values per feature in the training split.
# The earlier bare `X_train.head()` was removed: only a cell's last expression
# is displayed, so its result was computed and thrown away.
X_train.isna().sum()

age                                 0
hypertension                        0
heart_disease                       0
avg_glucose_level                   0
bmi                               170
gender_Male                         0
gender_Other                        0
ever_married_Yes                    0
work_type_Never_worked              0
work_type_Private                   0
work_type_Self-employed             0
work_type_children                  0
Residence_type_Urban                0
smoking_status_formerly smoked      0
smoking_status_never smoked         0
smoking_status_smokes               0
dtype: int64

In [36]:
# Full pipeline: column-wise preprocessing followed by logistic regression.
# class_weight='balanced' reweights the classes; max_iter=1000 raises the
# solver's iteration cap.
classifier = LogisticRegression(class_weight='balanced', max_iter=1000)
model = Pipeline(steps=[
    ('preprocess', column_tranfer),
    ('classifier', classifier),
])
model.fit(X_train, y_train)

# Predict on the training split and compare the number of predicted positives
# with the number of actual positives.
y_pred = model.predict(X_train)
train_predicted_positives = y_pred.sum()
train_actual_positives = y_train.sum()
print("predicted :", train_predicted_positives)
print("Actual :", train_actual_positives)


predicted : 1189
Actual : 199


In [37]:
# Predict on the held-out test split and compare the number of predicted
# positives with the number of actual positives.
y_pred_test = model.predict(X_test)
test_predicted_positives = y_pred_test.sum()
test_actual_positives = y_test.sum()
print("predicted :", test_predicted_positives)
print("Actual :", test_actual_positives)

predicted : 290
Actual : 50


In [38]:
# Precision on the training split (y_pred holds the predictions for X_train).
precision_score(y_train,y_pred)

0.13624894869638352

In [39]:
# Precision on the held-out test split.
precision_score(y_test,y_pred_test)


0.13793103448275862

In [40]:
# F1 score on the training split (y_pred holds the predictions for X_train).
f1_score(y_train,y_pred)

0.2334293948126801

In [41]:
# F1 score on the held-out test split.
f1_score(y_test,y_pred_test)


0.23529411764705882