In [10]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split


from sklearn.preprocessing import MinMaxScaler, StandardScaler
import yaml

# Load the project configuration; later cells read the data paths from `config`.
# Only the expected failures are caught, and each is re-raised after the hint
# is printed: continuing with a missing (or stale) `config` would only surface
# later as a confusing error in the cell that reads the CSV.
try:
    with open("../config.yaml", "r") as file:
        config = yaml.safe_load(file)
except FileNotFoundError:
    print("Yaml configuration file not found!")
    raise
except yaml.YAMLError:
    # A malformed file is a different problem from a missing one; say so.
    print("Yaml configuration file could not be parsed!")
    raise

In [2]:
# Display the loaded configuration to confirm the data paths it provides.
config

{'data': {'raw': {'file_raw': '../data/raw/healthcare-dataset-stroke-data.csv'},
  'clean': {'file_clean': '../data/clean/clean.csv'}}}

In [11]:
# Read the raw dataset from the location given in the configuration and
# preview its first rows.
raw_csv_path = config["data"]["raw"]["file_raw"]
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [12]:
# Count the missing values in each column.
df.isna().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [13]:
# Replace missing BMI values with the column mean.
# NOTE(review): the mean is taken over the full dataset, before the train/test
# split performed further down, so held-out rows contribute to the imputed
# value — consider computing it from the training rows only.
df["bmi"] = df["bmi"].fillna(df["bmi"].mean())

In [14]:
# Separate the label column from the predictors.
target = df["stroke"]
features = df.drop(columns=["stroke"])

In [15]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the split is not stratified on the target — if the classes are
# imbalanced, consider stratify=target (check the encoder copes with any
# category that then appears only in the test split).
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Show both predictor splits as a tuple for a quick visual check.
X_train,X_test

In [20]:
# Keep only the numeric predictors and drop `id`
# (presumably a row identifier rather than a feature — confirm).
X_train_num = X_train.select_dtypes(include="number").drop(columns="id")
X_test_num = X_test.select_dtypes(include="number").drop(columns="id")

In [23]:
# Min-max scale the numeric features. The scaler is fitted on the training
# split only and then applied to both splits.
normalizer = MinMaxScaler().fit(X_train_num)
X_train_num_norm, X_test_num_norm = (
    normalizer.transform(split) for split in (X_train_num, X_test_num)
)

In [27]:
# Wrap the scaled training values in a DataFrame that carries the column names
# and row index of the unscaled training frame, then preview it.
X_train_num_norm = pd.DataFrame(
    data=X_train_num_norm,
    index=X_train_num.index,
    columns=X_train_num.columns,
)
X_train_num_norm.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi
3029,0.438477,0.0,0.0,0.056181,0.302405
2277,0.414062,0.0,0.0,0.131151,0.437572
3002,0.731445,0.0,0.0,0.046348,0.234822
246,0.914551,0.0,0.0,0.109316,0.21764
2825,0.926758,0.0,0.0,0.016296,0.175258


In [28]:
# Wrap the scaled test values in a DataFrame that carries the column names and
# row index of the unscaled test frame, then preview it.
X_test_num_norm = pd.DataFrame(
    data=X_test_num_norm,
    index=X_test_num.index,
    columns=X_test_num.columns,
)
X_test_num_norm.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi
42,1.0,0.0,1.0,0.414458,0.184422
380,0.047852,0.0,0.0,0.235897,0.07331
3524,0.707031,0.0,0.0,0.114625,0.178694
352,0.243164,0.0,0.0,0.191349,0.272623
4927,0.121094,0.0,0.0,0.067953,0.038946


In [None]:
# Disabled exploratory cell: selects the object-typed columns of the full frame
# and of each split for inspection.
# NOTE(review): the output saved beneath this cell appears to come from an
# earlier run with these lines active — confirm, then re-enable or remove it.
# X_obj = df.select_dtypes(include = "object")
# X_train_obj = X_train.select_dtypes(include="object")
# X_test_obj = X_test.select_dtypes(include="object")
# X_obj, X_train_obj,X_test_obj

(      gender ever_married      work_type Residence_type   smoking_status
 0       Male          Yes        Private          Urban  formerly smoked
 1     Female          Yes  Self-employed          Rural     never smoked
 2       Male          Yes        Private          Rural     never smoked
 3     Female          Yes        Private          Urban           smokes
 4     Female          Yes  Self-employed          Rural     never smoked
 ...      ...          ...            ...            ...              ...
 5105  Female          Yes        Private          Urban     never smoked
 5106  Female          Yes  Self-employed          Urban     never smoked
 5107  Female          Yes  Self-employed          Rural     never smoked
 5108    Male          Yes        Private          Rural  formerly smoked
 5109  Female          Yes       Govt_job          Urban          Unknown
 
 [5110 rows x 5 columns],
       gender ever_married      work_type Residence_type   smoking_status
 3029  Fem

In [63]:
from sklearn.preprocessing import OneHotEncoder

# Categorical predictors to one-hot encode, listed once so that fitting and
# both transforms are guaranteed to use the same columns in the same order.
categorical_cols = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

# handle_unknown="ignore" encodes a category that was never seen during
# fitting as all zeros instead of raising in transform(); without it, a rare
# level that lands only in the test split would abort the cell.
ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

# Learn the categories from the training split only, then encode both splits.
ohe.fit(X_train[categorical_cols])
X_train_obj_trans = ohe.transform(X_train[categorical_cols])
X_test_obj_trans = ohe.transform(X_test[categorical_cols])

In [64]:
# Show the encoded training and test values as a tuple for a quick check.
X_train_obj_trans,X_test_obj_trans

(array([[1., 0., 0., ..., 1., 0., 0.],
        [1., 0., 0., ..., 1., 0., 0.],
        [0., 1., 0., ..., 0., 1., 0.],
        ...,
        [0., 1., 0., ..., 0., 0., 1.],
        [0., 1., 0., ..., 1., 0., 0.],
        [1., 0., 0., ..., 0., 1., 0.]]),
 array([[0., 1., 0., ..., 0., 0., 1.],
        [0., 1., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 1., 0.],
        ...,
        [0., 1., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 1., 0., 0.],
        [1., 0., 0., ..., 1., 0., 0.]]))

In [65]:
# Turn the encoded values back into DataFrames, labelled with the encoder's
# generated feature names and aligned to the rows of the source splits.
encoded_columns = ohe.get_feature_names_out()
X_train_obj_trans = pd.DataFrame(X_train_obj_trans, index=X_train.index, columns=encoded_columns)
X_test_obj_trans = pd.DataFrame(X_test_obj_trans, index=X_test.index, columns=encoded_columns)

X_train_obj_trans.head()


Unnamed: 0,gender_Female,gender_Male,gender_Other,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
3029,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2277,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3002,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
246,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2825,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [66]:
# Assemble the final design matrices: scaled numeric columns followed by the
# one-hot encoded categorical columns, placed side by side on the row index.
X_train_trans = pd.concat(
    [X_train_num_norm, X_train_obj_trans],
    axis="columns",
)
X_test_trans = pd.concat(
    [X_test_num_norm, X_test_obj_trans],
    axis="columns",
)

In [68]:
from sklearn.neighbors import KNeighborsClassifier,KNeighborsRegressor

1. KNN model

We use KNeighborsClassifier because the target column, "stroke", is categorical.

In [70]:
# Fit a k-nearest-neighbours classifier with default hyperparameters on the
# training design matrix.
knn = KNeighborsClassifier().fit(X_train_trans, y_train)

# Training split: predicted labels and the score() value.
y_train_knn_pred = knn.predict(X_train_trans)
accuracy_train_knn = knn.score(X_train_trans, y_train)

# Test split: predicted labels and the score() value.
y_test_knn_pred = knn.predict(X_test_trans)
accuracy_test_knn = knn.score(X_test_trans, y_test)

In [77]:
# Inspect the predictions made on the training split.
y_train_knn_pred

array([0., 0., 0., ..., 0., 0., 0.])

In [79]:
# Inspect the predictions made on the test split.
# NOTE(review): the saved output for this cell contains fractional values such
# as 0.8 and 0.2, which does not look like classifier output — it is probably
# stale (possibly from a KNeighborsRegressor run); re-run the cell to refresh it.
y_test_knn_pred

array([0.8, 0. , 0. , ..., 0.2, 0. , 0. ])

In [76]:
# Report accuracy on the training split. The message names the split so it can
# be told apart from the test-split report, and the format spec no longer
# carries the space flag that printed a double space before the number.
print(f"The train accuracy of the model with Classification is {accuracy_train_knn:.2f}")

The accuracy of the model with Classification is  0.95


In [74]:
# Report accuracy on the held-out test split. The message names the split so
# it can be told apart from the training-split report, and the format spec no
# longer carries the space flag that printed a double space before the number.
print(f"The test accuracy of the model with Classification is {accuracy_test_knn:.2f}")

The accuracy of the model with Classification is  0.95
