# Importing Necessary Libraries

In [18]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [7]:
df = pd.read_csv('healthcare-dataset-stroke-data.csv')
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [8]:
df.isnull().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

## Handling Null Values

In [10]:
# Filling up null values using median
df['bmi'] = df['bmi'].fillna(df['bmi'].median())


In [11]:
df.isnull().sum()

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [12]:
df.dtypes

id                     int64
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

## Encoding Of Data

### Appraoch 1 : Using Label Encoder


In [17]:
le = LabelEncoder()
df_label_encoded = pd.DataFrame()

columns_to_copy =['id','age','hypertension','heart_disease','avg_glucose_level','bmi','stroke']
columns_to_encode = ['gender','ever_married','work_type','Residence_type','smoking_status']

for cols in columns_to_copy:
    df_label_encoded[cols] = df[cols]


for cols in columns_to_encode:
    df_label_encoded[cols] = le.fit_transform(df[cols])

df_label_encoded
                     

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender,ever_married,work_type,Residence_type,smoking_status
0,9046,67.0,0,1,228.69,36.6,1,1,1,2,1,1
1,51676,61.0,0,0,202.21,28.1,1,0,1,3,0,2
2,31112,80.0,0,1,105.92,32.5,1,1,1,2,0,2
3,60182,49.0,0,0,171.23,34.4,1,0,1,2,1,3
4,1665,79.0,1,0,174.12,24.0,1,0,1,3,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,80.0,1,0,83.75,28.1,0,0,1,2,1,2
5106,44873,81.0,0,0,125.20,40.0,0,0,1,3,1,2
5107,19723,35.0,0,0,82.99,30.6,0,0,1,3,0,2
5108,37544,51.0,0,0,166.29,25.6,0,1,1,2,0,1


In [20]:
X = df_label_encoded.drop(['stroke'],axis = 1)
y = df_label_encoded['stroke']

In [23]:
lr1 = LogisticRegression(class_weight='balanced')
lr1.fit(X,y)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
lr1.score(X,y)

0.7135029354207436

In [28]:
y_pred =lr1.predict(X)
print(accuracy_score(y,y_pred))
print(confusion_matrix(y,y_pred))
print(classification_report(y,y_pred))

0.7135029354207436
[[3460 1401]
 [  63  186]]
              precision    recall  f1-score   support

           0       0.98      0.71      0.83      4861
           1       0.12      0.75      0.20       249

    accuracy                           0.71      5110
   macro avg       0.55      0.73      0.51      5110
weighted avg       0.94      0.71      0.80      5110



### Appraoch 3 : Using One Hot Encoder


In [36]:
lr2.score(X,y)

0.47084148727984343

In [33]:
# This is because in newer versions of scikit-learn (v1.2.0 and above), the sparse parameter has been deprecated and replaced by:
#sparse_output=False
enc = OneHotEncoder(sparse_output=False)
columns_to_encode = ['gender','ever_married','work_type','Residence_type','smoking_status']
one_hot_encoded = enc.fit_transform(df[columns_to_encode])
df_one_hot_encoded = pd.DataFrame(one_hot_encoded, columns = enc.get_feature_names_out(columns_to_encode))
df_one_hot_encoded

Unnamed: 0,gender_Female,gender_Male,gender_Other,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5105,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
5106,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
5107,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
5108,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [34]:
X = df_one_hot_encoded
y = df['stroke']
lr2 = LogisticRegression(class_weight='balanced')
lr2.fit(X,y)


In [37]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)  # after one-hot encoding


In [39]:
lr2.score(X,y)

0.47084148727984343

In [41]:
y_pred =lr2.predict(X)

In [42]:
print(confusion_matrix(y,y_pred))
print(accuracy_score(y,y_pred))
print(classification_report(y,y_pred))

[[2209 2652]
 [  52  197]]
0.47084148727984343
              precision    recall  f1-score   support

           0       0.98      0.45      0.62      4861
           1       0.07      0.79      0.13       249

    accuracy                           0.47      5110
   macro avg       0.52      0.62      0.37      5110
weighted avg       0.93      0.47      0.60      5110

