# Stroke data Exploration

## Loading


In [1]:
import pandas as pd


In [2]:
# Render floats with two decimals in every table displayed below.
pd.options.display.precision = 2

# Read the raw stroke records into the working frame.
df = pd.read_csv("data/stroke.csv")

## First approach to data

In [3]:
# Quick sanity check: container type, row count and (rows, columns) shape.
overview = f" Type = {type(df)}\n Len = {len(df)}\n Shape = {df.shape} "
print(overview)

 Type = <class 'pandas.core.frame.DataFrame'>
 Len = 5110
 Shape = (5110, 12) 


In [4]:
# List every column with its non-null count and dtype; a non-null count
# below the number of rows reveals a column with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [5]:
# Peek at the first five records to see what the raw values look like.
df.head(n=5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()


Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.83,43.23,0.1,0.05,106.15,28.89,0.05
std,21161.72,22.61,0.3,0.23,45.28,7.85,0.22
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.25,23.5,0.0
50%,36932.0,45.0,0.0,0.0,91.88,28.1,0.0
75%,54682.0,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


In [7]:
# Summarise the text (object-dtype) columns: count, distinct values,
# most frequent value and its frequency.
df.describe(include=["object"])

Unnamed: 0,gender,ever_married,work_type,Residence_type,smoking_status
count,5110,5110,5110,5110,5110
unique,3,2,5,2,4
top,Female,Yes,Private,Urban,never smoked
freq,2994,3353,2925,2596,1892


In [31]:
# Store the stroke column as a categorical instead of a plain integer column.
stroke_as_category = df["stroke"].astype("category")
df["stroke"] = stroke_as_category

## Exploring Dataset

In [8]:
# Flag each row whose id already appeared in an earlier row, then tally the flags.
repeated_id = df.duplicated(subset="id")
repeated_id.value_counts()

False    5110
dtype: int64

Since there are no duplicated `id` values, every row is a distinct record and the `id` column can be dropped.

In [9]:
# Remove the id column from the working frame (modifies df in place).
df.drop(columns="id", inplace=True)

In [10]:
# Frequency of each gender label.
df.loc[:, "gender"].value_counts()

Female    2994
Male      2115
Other        1
Name: gender, dtype: int64

Drop the single row with gender "Other", since one observation is not relevant for the analysis.

In [11]:
# Find the index labels of the rows recorded as "Other" and remove them in place.
other_gender_rows = df.index[df["gender"] == "Other"]
df.drop(index=other_gender_rows, inplace=True)

In [12]:
# Frequency of each marital-history answer.
df.loc[:, "ever_married"].value_counts()

Yes    3353
No     1756
Name: ever_married, dtype: int64

In [13]:
# Frequency of each residence type.
df.loc[:, "Residence_type"].value_counts()

Urban    2596
Rural    2513
Name: Residence_type, dtype: int64

Substitute the values of the binary columns with 0 and 1.

In [34]:
# Per-column 0/1 encodings. Scoping each mapping to its own column ensures a
# matching string in any other column (e.g. a "Yes"/"No" value) is never
# rewritten by accident, which a frame-wide replace would do.
BINARY_ENCODINGS = {
    "ever_married": {"Yes": 1, "No": 0},
    "Residence_type": {"Urban": 1, "Rural": 0},
    "gender": {"Male": 0, "Female": 1},
}
df.replace(BINARY_ENCODINGS, inplace=True)

# Store every two-valued column as a categorical rather than as a number.
binary_columns = [
    "heart_disease",
    "hypertension",
    "gender",
    "ever_married",
    "Residence_type",
]
for column in binary_columns:
    df[column] = df[column].astype("category")


In [15]:
# Per-column flag: True when the column holds at least one missing value.
df.isnull().any(axis=0)

gender               False
age                  False
hypertension         False
heart_disease        False
ever_married         False
work_type            False
Residence_type       False
avg_glucose_level    False
bmi                   True
smoking_status       False
stroke               False
dtype: bool

In [16]:
# Shape of the subset of rows that have a missing value in any column.
row_has_missing = df.isna().any(axis=1)
df[row_has_missing].shape

(201, 11)

In [17]:
# Shape of the subset of stroke rows that also have a missing value.
row_has_missing = df.isna().any(axis=1)
had_stroke = df["stroke"] == 1
df[row_has_missing & had_stroke].shape

(40, 11)

In [18]:
# Shape of the subset of rows labelled as stroke, for comparison with the cell above.
had_stroke = df["stroke"] == 1
df.loc[had_stroke].shape

(249, 11)

In [21]:
# Keep only the rows that are complete in every column, then compare sizes.
df_noNulls = df.dropna(axis=0, how="any")
shape_report = f'df shape = {df.shape}\ndf_noNulls shape = {df_noNulls.shape} '
print(shape_report)

df shape = (5109, 11)
df_noNulls shape = (4908, 11) 


In [27]:
# Fixed seed so a Restart & Run All draws the same non-stroke subset every time.
RANDOM_STATE = 42

# Undersample the majority class: keep every complete stroke row and draw an
# equally sized random set of complete non-stroke rows. The sample size is
# taken from the stroke subset itself instead of a hard-coded count, so it
# stays correct if the upstream cleaning changes.
df_noNulls_one = df_noNulls[df_noNulls["stroke"] == 1].copy()
df_noNulls_zero = df_noNulls[df_noNulls["stroke"] == 0].sample(
    n=len(df_noNulls_one), random_state=RANDOM_STATE
)
df_noNulls_reduced = pd.concat([df_noNulls_zero, df_noNulls_one])

In [29]:
# Row count here is the sampled non-stroke rows plus all stroke rows concatenated above.
df_noNulls_reduced.shape

(418, 11)

In [30]:
# Persist both frames for later notebooks: the balanced, null-free subset
# and the full cleaned frame (which still contains rows with missing bmi).
df_noNulls_reduced.to_pickle(path='./data/clean_reduced.pkl')
df.to_pickle(path='./data/clean_stroke.pkl')