In [1]:
# Import libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import IsolationForest
from sklearn.impute import KNNImputer
from impyute import mice
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [2]:
# Read csv file healthcare-dataset-stroke-data.csv and shufle it

df = pd.read_csv('healthcare.csv')
df = df.sample(frac=1,random_state=69)
df.head(20)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
898,62716,Female,59.0,0,0,Yes,Self-employed,Urban,81.64,32.8,Unknown,0
2034,64393,Male,56.0,0,0,No,Self-employed,Rural,87.95,25.2,never smoked,0
4444,38617,Male,28.0,0,0,Yes,Self-employed,Urban,73.98,29.9,never smoked,0
1425,1577,Female,17.0,0,0,No,Private,Urban,70.01,43.0,Unknown,0
1744,66592,Male,16.0,0,0,No,Private,Rural,122.46,18.7,never smoked,0
249,30669,Male,3.0,0,0,No,children,Rural,95.12,18.0,Unknown,0
5059,64420,Female,61.0,0,0,Yes,Govt_job,Rural,120.23,22.7,Unknown,0
4320,30002,Male,44.0,1,0,Yes,Self-employed,Rural,83.59,24.1,never smoked,0
3775,4707,Female,63.0,0,0,Yes,Private,Urban,83.74,21.4,Unknown,0
3884,44642,Male,52.0,0,0,Yes,Govt_job,Urban,93.28,36.3,never smoked,0


## Features data types

- Seven features are integer or floats.
- Five features are strings (object).

### Features with null values

- bmi feature contain 201 of null values


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5110 entries, 898 to 4041
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 519.0+ KB


### Distribution of categorical feature

- gender feature have 3 possible values with 58% Female.
- ever_married feature have 2 possible values with 66% Yes.
- work_type feature have 5 possible values with 57% Private.
- Residence_type feature are almost equal with 2 possible values with 51% Urban.
- smoking_status feature have 4 possible values with 37% Urban.

In [4]:
# categorical feature

df.describe(include='O')

Unnamed: 0,gender,ever_married,work_type,Residence_type,smoking_status
count,5110,5110,5110,5110,5110
unique,3,2,5,2,4
top,Female,Yes,Private,Urban,never smoked
freq,2994,3353,2925,2596,1892


### Distribution of numerical feature

- Total samples are 5110.
- hypertension, heart_disease and stroke are categorical feature with 0 or 1 values.
- Around 0.4% samples had stroke events
- we have inbalanced data

In [5]:
# numerical feature

df.describe(percentiles=[.1, .2, .3, .4, .5, .6, .7, .8, .95, .96,.97])

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
10%,6972.5,11.0,0.0,0.0,65.789,19.7,0.0
20%,14160.6,20.0,0.0,0.0,73.76,22.5,0.0
30%,21968.4,30.0,0.0,0.0,80.038,24.5,0.0
40%,29365.8,38.0,0.0,0.0,85.6,26.4,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.1,0.0
60%,44330.2,51.0,0.0,0.0,98.914,29.8,0.0


In [6]:
# How we check before sum of nan values

df.isna().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

### Function that get accuracy score of KNN

In [7]:
def get_knn_acc(df):
    X = df.iloc[:, :-1]
    y = df['stroke']
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=69)
    knn = KNeighborsClassifier()
    knn.fit(X_train, y_train)
    return accuracy_score(knn.predict(X_test), y_test)

### Score with unchanged data

In [8]:
get_knn_acc(df[['age','hypertension','heart_disease','avg_glucose_level','stroke']])

0.9420970266040689

### Convert the categorical  to ordinal.

In [9]:
df = pd.get_dummies(df, columns=['gender','ever_married','work_type',\
                                'Residence_type','smoking_status'], drop_first=True)
df.head()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Male,gender_Other,ever_married_Yes,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
898,62716,59.0,0,0,81.64,32.8,0,0,0,1,0,0,1,0,1,0,0,0
2034,64393,56.0,0,0,87.95,25.2,0,1,0,0,0,0,1,0,0,0,1,0
4444,38617,28.0,0,0,73.98,29.9,0,1,0,1,0,0,1,0,1,0,1,0
1425,1577,17.0,0,0,70.01,43.0,0,0,0,0,0,1,0,0,1,0,0,0
1744,66592,16.0,0,0,122.46,18.7,0,1,0,0,0,1,0,0,0,0,1,0


### Score with converted values

In [10]:
get_knn_acc(df.drop(['bmi'],axis=1))

0.9507042253521126

# Methods for handle NAN values
##  1. Drop nan

In [11]:
drop_nan_df = df.copy()
drop_nan_df = drop_nan_df.dropna()

### drop_nan_df score

In [12]:
get_knn_acc(drop_nan_df)

0.9600977198697068

## 2. Fill with some values

In [13]:
fill_nan_df = df.copy()
fill_nan_df = fill_nan_df.fillna(0)

###   fill_nan_df score

In [14]:
get_knn_acc(fill_nan_df)

0.9507042253521126

## 3. fill with mean values

In [15]:
mean_nan_df = df.copy()
mean_nan_df = mean_nan_df.fillna(mean_nan_df.mean())
mean_nan_df = mean_nan_df.fillna(mean_nan_df.mean())

### mean_nan_df score

In [16]:
get_knn_acc(mean_nan_df)

0.9507042253521126

## 4. Fill with median values

In [17]:
median_nan_df = df.copy()
median_nan_df = median_nan_df.fillna(median_nan_df.median())

### median_nan_df score

In [18]:
get_knn_acc(median_nan_df)

0.9507042253521126

## 5. Fill using KNNInputer

In [19]:
KNNInputer_nan_df = df.copy()


X = KNNInputer_nan_df.values
imputer = KNNImputer(n_neighbors=5, weights='uniform')
X_imputed = imputer.fit_transform(X)
KNNInputer_nan_df = pd.DataFrame(X_imputed, columns=KNNInputer_nan_df.columns) 

### KNNInputer_nan_df score

In [20]:
get_knn_acc(KNNInputer_nan_df)

0.9507042253521126

## 6. Fill using MICE

In [21]:
MICE_nan_df = df.copy()
X = MICE_nan_df.values
X_imputed = mice(X)
MICE_nan_df = pd.DataFrame(X_imputed, columns=MICE_nan_df.columns)

### MICE_nan_df score

In [22]:
get_knn_acc(MICE_nan_df)

0.9507042253521126

## As we noticed, our best score was with droping nan value

In [24]:
drop_nan_df.head()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Male,gender_Other,ever_married_Yes,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
898,62716,59.0,0,0,81.64,32.8,0,0,0,1,0,0,1,0,1,0,0,0
2034,64393,56.0,0,0,87.95,25.2,0,1,0,0,0,0,1,0,0,0,1,0
4444,38617,28.0,0,0,73.98,29.9,0,1,0,1,0,0,1,0,1,0,1,0
1425,1577,17.0,0,0,70.01,43.0,0,0,0,0,0,1,0,0,1,0,0,0
1744,66592,16.0,0,0,122.46,18.7,0,1,0,0,0,1,0,0,0,0,1,0


## Now i will use IsolationForest for identifying outliers

In [25]:
# Define X and y

X = drop_nan_df.iloc[:, :-1].values
y = drop_nan_df['stroke'].values

In [26]:
# Predict outliers

forest = IsolationForest(random_state=69)
forest.fit(X)
outlayer = forest.predict(X)

In [27]:
# Add column with outliers

drop_nan_df['OUT'] = outlayer
drop_nan_df.head()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Male,gender_Other,ever_married_Yes,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,OUT
898,62716,59.0,0,0,81.64,32.8,0,0,0,1,0,0,1,0,1,0,0,0,1
2034,64393,56.0,0,0,87.95,25.2,0,1,0,0,0,0,1,0,0,0,1,0,-1
4444,38617,28.0,0,0,73.98,29.9,0,1,0,1,0,0,1,0,1,0,1,0,-1
1425,1577,17.0,0,0,70.01,43.0,0,0,0,0,0,1,0,0,1,0,0,0,1
1744,66592,16.0,0,0,122.46,18.7,0,1,0,0,0,1,0,0,0,0,1,0,-1


### Check outiers in raport with stroke feature
- 201 values with stroke 1
- 1599 values with stroke 0

In [28]:
print(f"1 - {len(drop_nan_df[(drop_nan_df['stroke']==1) & (drop_nan_df['OUT']==-1)])}")
print(f"0 - {len(drop_nan_df[(drop_nan_df['stroke']==0) & (drop_nan_df['OUT']==-1)])}")

1 - 201
0 - 1599


In [29]:
# Values before deleting
drop_nan_df['stroke'].value_counts()

0    4700
1     209
Name: stroke, dtype: int64

In [30]:
# Deleting 1599 outlier values that have 
drop = drop_nan_df[(drop_nan_df['stroke']==0) & (drop_nan_df['OUT']==1) | (drop_nan_df['stroke']==1)]

In [31]:
# Values after deleting
drop['stroke'].value_counts()

0    3101
1     209
Name: stroke, dtype: int64

In [32]:
# Drop OUT feature

drop = drop.drop(['OUT'], axis=1)

In [33]:
# prepare data for modeling 

X = drop.iloc[:, :-1].values
y = drop['stroke'].values

In [34]:
# Split Data in test and train

X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=42)

In [35]:
# Apply StandardScaler

std = StandardScaler()
X_scaled = std.fit_transform(X_train)
X_test_scaled = std.transform(X_test)

In [36]:
# Train and predict with standartized data

knn2 = KNeighborsClassifier()
knn2.fit(X_scaled, y_train)
accuracy_score(knn2.predict(X_test_scaled), y_test)

1.0

# Final Result

## So we  got  1.0