In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

from sklearn.metrics import accuracy_score
from pandas import Series, DataFrame


In [2]:
dataset = pd.read_csv('healthcare-dataset-stroke-data.csv')

In [3]:
dataset.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [4]:
NAs = pd.concat([dataset.isnull().sum()], axis = 1, keys=["Dataset"])
NAs[NAs.sum(axis=1)>0]
#checks if there is any missing data in the dataset

Unnamed: 0,Dataset
bmi,201


In [5]:
dataset["bmi"] = dataset["bmi"].fillna(dataset["bmi"].mean())
#fills the missing data with the mean of the column 

In [6]:
subset = dataset.groupby('stroke').sample(n=249, random_state=0)
subset.stroke.value_counts()
#taking a subset of the data to have equal number of stoke patients, if this step is not done you'll get
#95% accuracy because the there is an imbalance between people that have stokes and people that don't
#In the 5000 person dataset, only 249 people actually have a stoke.

0    249
1    249
Name: stroke, dtype: int64

In [7]:
subset = subset.replace(to_replace = ['Male','Female'],value = [1 , 0])
subset['Residence_type'] = subset['Residence_type'].map({'Urban': 1, 'Rural': 0})
#Changing protected variables to binary to compare them in the bias tool(Avoiding one hot encoding)

In [8]:
for col in subset.dtypes[subset.dtypes == "object"].index:
    for_dummy = subset.pop(col)
    subset = pd.concat([subset, pd.get_dummies(for_dummy, prefix=col)], axis=1)
subset.head()
#creates dummy nodes for categorical variable, "0 and 1" dummy nodes

Unnamed: 0,id,gender,age,hypertension,heart_disease,Residence_type,avg_glucose_level,bmi,stroke,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
560,44912,1,12.0,0,0,1,67.06,16.1,0,1,0,0,0,0,0,1,1,0,0,0
403,66972,0,52.0,0,0,1,80.88,23.8,0,0,1,1,0,0,0,0,0,0,0,1
2276,1451,0,17.0,0,0,1,78.46,23.5,0,1,0,0,0,1,0,0,1,0,0,0
1936,49797,0,28.0,0,0,0,75.53,34.9,0,1,0,0,0,1,0,0,0,0,1,0
1768,70241,0,22.0,0,0,1,66.29,20.5,0,1,0,0,0,1,0,0,0,0,0,1


In [9]:
labels = subset.pop("stroke")
#removes the sroke column, so that we can test the data without the stoke column.
#removes the stroke column for the subset only, not all of dataset.

In [10]:
x_train, x_test, y_train, y_test = train_test_split(subset, labels, random_state=42, test_size=0.2)
#split the data, 80% train and 20% test
#print(x_train.dtypes)

In [11]:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [12]:
classifier = KNeighborsClassifier(n_neighbors=15, p=2,metric='euclidean')
classifier.fit(x_train, y_train)
#Using 15 nearest neighbors because it's most accurate.

KNeighborsClassifier(metric='euclidean', n_neighbors=15)

In [13]:
y_pred = classifier.predict(x_test)
#Predicting

In [14]:
cm = confusion_matrix(y_test, y_pred)
print (cm)

[[32 19]
 [11 38]]


In [15]:
print(accuracy_score(y_test, y_pred))
#accuracy

0.7


In [16]:
len(y_pred)

100

In [17]:
subset['score']=pd.Series(y_pred)
subset['label_value']=pd.Series(labels)
subset
#putting the stroke data back into the subset, to measure how bias the predictions are

Unnamed: 0,id,gender,age,hypertension,heart_disease,Residence_type,avg_glucose_level,bmi,ever_married_No,ever_married_Yes,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,score,label_value
560,44912,1,12.0,0,0,1,67.06,16.100000,1,0,...,0,0,0,1,1,0,0,0,,0
403,66972,0,52.0,0,0,1,80.88,23.800000,0,1,...,0,0,0,0,0,0,0,1,,0
2276,1451,0,17.0,0,0,1,78.46,23.500000,1,0,...,0,1,0,0,1,0,0,0,,0
1936,49797,0,28.0,0,0,0,75.53,34.900000,1,0,...,0,1,0,0,0,0,1,0,,0
1768,70241,0,22.0,0,0,1,66.29,20.500000,1,0,...,0,1,0,0,0,0,0,1,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
218,25904,0,76.0,1,1,1,199.86,28.893237,0,1,...,0,0,1,0,0,0,0,1,,1
124,14164,0,72.0,0,0,1,219.91,28.893237,0,1,...,0,1,0,0,1,0,0,0,,1
72,27169,0,66.0,1,0,0,116.55,31.100000,0,1,...,0,0,0,0,0,1,0,0,0.0,1
12,12175,0,54.0,0,0,1,104.51,27.300000,0,1,...,0,1,0,0,0,0,0,1,1.0,1


In [18]:
#compression_opts = dict(method='zip',
#                       archive_name='KNNsubset.csv')  
#subset.to_csv('KNNsubset.zip', index=False,
#         compression=compression_opts)  
#creating a csv file for the subset
#Uncomment this block of code to generate csv file to put into Aequitas tool