# EDA on Stroke Data Set

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('healthcare-dataset-stroke-data.csv')

In [3]:
df.dtypes

id                     int64
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

In [4]:
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [5]:

for column in df: 
    counted_nulls = df[column].isna().sum()
    print('{} Nas   {}'.format(counted_nulls, column))

0 Nas   id
0 Nas   gender
0 Nas   age
0 Nas   hypertension
0 Nas   heart_disease
0 Nas   ever_married
0 Nas   work_type
0 Nas   Residence_type
0 Nas   avg_glucose_level
201 Nas   bmi
0 Nas   smoking_status
0 Nas   stroke


There is 201 Nulls within bmi. 

In [6]:
total_rows = len(df)
print('Number of rows = {} \nRatio of nulls to total = {}%'.format(total_rows, round(df.bmi.isna().sum()/total_rows*100)))

Number of rows = 5110 
Ratio of nulls to total = 4%


The missing BMI is assumed MNAR. There is a possibility the patient didn't want to disclose that information to the record. Or they didn't wanted to be weighed. 

In [7]:
df_clean = df.dropna()
df_clean

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5104,14180,Female,13.0,0,0,No,children,Rural,103.08,18.6,Unknown,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [9]:
df_clean.gender = df_clean.gender.replace({'Male':0,'Female':1})
# df_clean.work_type.uniques()
df_clean.work_type = df_clean.work_type.replace({'Private':0,'Self-employed':1,'Govt_job':2,'children':3,'Never_worked':4})
df_clean.dtypes
df_clean.corr().round(2)
# sns.pairplot(df)

# fig = plt.figure()
# ax = fig.add_subplot(111)
# cax = ax.matshow(corr,cmap='coolwarm', vmin=-1, vmax=1)
# fig.colorbar(cax)
# ticks = np.arange(0,len(df_clean.columns),1)
# ax.set_xticks(ticks)
# plt.xticks(rotation=90)
# ax.set_yticks(ticks)
# ax.set_xticklabels(df_clean.columns)
# ax.set_yticklabels(df_clean.columns)
# plt.show()

Unnamed: 0,id,age,hypertension,heart_disease,work_type,avg_glucose_level,bmi,stroke
id,1.0,0.01,0.0,0.0,-0.02,0.01,0.0,0.0
age,0.01,1.0,0.27,0.26,-0.42,0.24,0.33,0.23
hypertension,0.0,0.27,1.0,0.12,-0.07,0.18,0.17,0.14
heart_disease,0.0,0.26,0.12,1.0,-0.05,0.15,0.04,0.14
work_type,-0.02,-0.42,-0.07,-0.05,1.0,-0.06,-0.35,-0.06
avg_glucose_level,0.01,0.24,0.18,0.15,-0.06,1.0,0.18,0.14
bmi,0.0,0.33,0.17,0.04,-0.35,0.18,1.0,0.04
stroke,0.0,0.23,0.14,0.14,-0.06,0.14,0.04,1.0


The data is being prepared to be placed inside the PIMA dataset.

In [11]:
df_out = pd.DataFrame(columns=['Glucose','BMI','Age'])

In [13]:
df_out.Glucose = df_clean.avg_glucose_level
df_out.BMI = df_clean.bmi
df_out.Age = df_clean.age
df_out

Unnamed: 0,Glucose,BMI,Age
0,228.69,36.6,67.0
2,105.92,32.5,80.0
3,171.23,34.4,49.0
4,174.12,24.0,79.0
5,186.21,29.0,81.0
...,...,...,...
5104,103.08,18.6,13.0
5106,125.20,40.0,81.0
5107,82.99,30.6,35.0
5108,166.29,25.6,51.0


In [14]:
df_out.to_csv('stroke_data_prepared.csv')