In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read the raw stroke records from the CSV that sits next to the notebook,
# then echo the frame so the loaded columns can be inspected.
dataset = pd.read_csv(filepath_or_buffer='healthcare-dataset-stroke-data.csv')
dataset

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [3]:
from collections import Counter

In [4]:
# Separate the label from the features. The label is taken to be whichever
# column comes last in `dataset`.
columns = dataset.columns
columns_lower = [name.lower() for name in columns]
last_column = columns[-1]

# pop() returns the column and removes it from `dataset` in the same step, so
# `dataset` is mutated in place and holds only the remaining columns afterwards.
# NOTE(review): because the last column is looked up afresh on every run,
# re-running this cell would strip a further column and treat it as the label.
y = dataset.pop(last_column)

# Tally how many rows fall under each label value.
count = Counter(y)
count

Counter({1: 249, 0: 4861})

In [5]:
# Snapshot the dtype of every column currently in `dataset`; the result is a
# Series keyed by column name. It is not refreshed if `dataset` changes later.
datatypes = dataset.dtypes
datatypes

id                     int64
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
dtype: object

In [6]:
# Sort the columns of `dataset` into categorical and numeric feature lists,
# deleting columns that are unusable as features:
#   * object columns with MAX_CATEGORIES or more distinct values
#   * non-object columns whose name marks them as an identifier
cat_cols = []
num_cols = []
id1 = "id"
MAX_CATEGORIES = 30


def _is_id_column(name):
    """Return True when `name` looks like an identifier column.

    The name is split on ``_``, ``-`` and spaces and compared token by token,
    so ``id``, ``patient_id`` and ``id_number`` match while names that merely
    contain the letters (``humidity``, ``width``, ``valid``) do not.
    A camelCase ``...Id`` / ``...ID`` suffix (``patientId``) also matches.
    """
    tokens = name.lower().replace("-", "_").replace(" ", "_").split("_")
    if id1 in tokens:
        return True
    return len(name) > 2 and name[-2:] in ("Id", "ID") and name[-3].islower()


# Iterate over (column name, dtype) pairs instead of integer positions:
# integer keys on a label-indexed Series are deprecated positional fallback.
for col_name, col_dtype in datatypes.items():
    if col_name not in dataset.columns:
        # `datatypes` is a snapshot taken before any deletion, so on a re-run
        # it still lists columns this cell has already removed; skip them.
        continue
    if col_dtype == 'object':
        unqval = dataset[col_name].nunique()
        if unqval < MAX_CATEGORIES:
            cat_cols.append(col_name)
        else:
            del dataset[col_name]
            print('Deleted: ',col_name)
    else:
        if _is_id_column(col_name):
            del dataset[col_name]
            print('Deleted: ',col_name)
        else:
            num_cols.append(col_name)
num_cols

Deleted:  id


['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi']

In [7]:
# Show the column names that were collected as categorical features.
cat_cols

['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

In [8]:
# Split the feature frame into its categorical and numeric parts.
# Both names are always (re)bound: selecting with an empty list yields a
# zero-column frame that keeps the row index, so later cells neither hit a
# NameError nor silently reuse a stale frame left over from an earlier run.
cat_data = dataset[cat_cols]
num_data = dataset[num_cols]

In [9]:
#num,cat

In [10]:
from sklearn.impute import SimpleImputer

# Replace missing numeric values with the mean of their column.
# `.empty` is True when the frame has no rows OR no columns, so the imputer is
# skipped in both cases instead of only when there are no rows.
if not num_data.empty:
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    imputed_values = imputer.fit_transform(num_data.to_numpy())
    # Rebuild the frame with the ORIGINAL row index; with a fresh 0..n-1 index
    # a later column-wise concat with `cat_data` would misalign the rows
    # whenever the source frame does not use the default index.
    num_data = pd.DataFrame(imputed_values, columns=num_cols, index=num_data.index)
num_data

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi
0,67.0,0.0,1.0,228.69,36.600000
1,61.0,0.0,0.0,202.21,28.893237
2,80.0,0.0,1.0,105.92,32.500000
3,49.0,0.0,0.0,171.23,34.400000
4,79.0,1.0,0.0,174.12,24.000000
...,...,...,...,...,...
5105,80.0,1.0,0.0,83.75,28.893237
5106,81.0,0.0,0.0,125.20,40.000000
5107,35.0,0.0,0.0,82.99,30.600000
5108,51.0,0.0,0.0,166.29,25.600000


In [11]:
# Re-assemble the feature matrix (imputed numeric columns + categorical
# columns) and drop every row that still contains a missing value.
df = pd.concat([num_data, cat_data], axis=1).dropna()

# Keep the target aligned with the rows that survived dropna(); otherwise the
# features and `y` would differ in length as soon as a single row is dropped.
y = y.loc[df.index]

df

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,gender,ever_married,work_type,Residence_type,smoking_status
0,67.0,0.0,1.0,228.69,36.600000,Male,Yes,Private,Urban,formerly smoked
1,61.0,0.0,0.0,202.21,28.893237,Female,Yes,Self-employed,Rural,never smoked
2,80.0,0.0,1.0,105.92,32.500000,Male,Yes,Private,Rural,never smoked
3,49.0,0.0,0.0,171.23,34.400000,Female,Yes,Private,Urban,smokes
4,79.0,1.0,0.0,174.12,24.000000,Female,Yes,Self-employed,Rural,never smoked
...,...,...,...,...,...,...,...,...,...,...
5105,80.0,1.0,0.0,83.75,28.893237,Female,Yes,Private,Urban,never smoked
5106,81.0,0.0,0.0,125.20,40.000000,Female,Yes,Self-employed,Urban,never smoked
5107,35.0,0.0,0.0,82.99,30.600000,Female,Yes,Self-employed,Rural,never smoked
5108,51.0,0.0,0.0,166.29,25.600000,Male,Yes,Private,Rural,formerly smoked


In [12]:
# Re-derive the categorical and numeric views from `df`, so that both reflect
# exactly the rows `df` currently holds.
cat_data = df.loc[:, cat_cols]
num_data = df.loc[:, num_cols]

In [13]:
from sklearn.preprocessing import OneHotEncoder

if cat_cols:
    # Apply one-hot encoder to each column with categorical data.
    # The dense-output switch was renamed from `sparse` to `sparse_output`
    # (old name deprecated in scikit-learn 1.2, removed in 1.4), so try the
    # new keyword first and fall back for older installations.
    try:
        OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

    # The encoded values are wrapped in a new frame, which would otherwise get
    # a fresh 0..n-1 index; reuse the row index of `df` so rows stay aligned.
    X_encoded = pd.DataFrame(OH_encoder.fit_transform(df[cat_cols]), index=df.index)
else:
    # Nothing to encode: contribute zero columns but keep the row index.
    X_encoded = pd.DataFrame(index=df.index)

# Remove categorical columns (will replace with one-hot encoding)
num_X = df.drop(cat_cols, axis=1)

# Add one-hot encoded columns to numerical features.
# NOTE(review): the encoded columns are labelled 0..k-1 (integers) next to the
# string-named numeric columns; mixed label types may need normalising before
# the frame is passed to an estimator - confirm against the downstream cells.
OH_X = pd.concat([num_X, X_encoded], axis=1)

In [14]:
# Show the combined frame: numeric columns followed by the one-hot columns.
OH_X

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,0,1,2,3,4,...,6,7,8,9,10,11,12,13,14,15
0,67.0,0.0,1.0,228.69,36.600000,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,61.0,0.0,0.0,202.21,28.893237,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,80.0,0.0,1.0,105.92,32.500000,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,49.0,0.0,0.0,171.23,34.400000,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,79.0,1.0,0.0,174.12,24.000000,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5105,80.0,1.0,0.0,83.75,28.893237,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
5106,81.0,0.0,0.0,125.20,40.000000,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
5107,35.0,0.0,0.0,82.99,30.600000,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
5108,51.0,0.0,0.0,166.29,25.600000,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
