# Cleaning "Stroke Prediction Dataset" (Parquet) with Pandas

## Import Dependencies

In [21]:
import os
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
import time

## Load and Read Parquet Dataset into Pandas DataFrame

In [22]:
%%time
#? Load File
Path = "1_parquet_conversion/stroke.parquet.gzip"

#? Read the CSVs into a dataframe
stroke_df = pd.read_parquet(Path)

Wall time: 12.5 ms


## Show DataFrame

In [23]:
# Preview 10 random rows. The fixed seed makes the preview show the same rows
# on every Restart & Run All, so the saved output stays reproducible.
stroke_df.sample(10, random_state=42)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
1010,84,Male,55.0,0,0,Yes,Private,Urban,89.17,31.5,never smoked,0
3103,23171,Male,66.0,0,0,Yes,Private,Rural,88.83,29.1,Unknown,0
4159,9189,Female,20.0,0,0,No,Private,Urban,80.27,27.9,never smoked,0
129,48796,Female,75.0,0,0,Yes,Govt_job,Urban,62.48,,Unknown,1
2719,49229,Male,52.0,0,0,No,Govt_job,Rural,72.71,36.9,formerly smoked,0
1378,7806,Female,42.0,0,0,Yes,Private,Urban,158.89,37.6,smokes,0
144,17308,Female,72.0,1,0,Yes,Private,Urban,221.79,30.0,never smoked,1
2749,29789,Female,46.0,0,0,Yes,Private,Rural,116.84,28.2,never smoked,0
1008,58567,Female,42.0,0,0,Yes,Private,Rural,84.86,22.8,Unknown,0
1265,45209,Female,14.0,0,0,No,Private,Rural,118.81,24.7,Unknown,0


## Convert to the smallest datatype possible for each numeric column

In [24]:
%%time 
float_cols = stroke_df.select_dtypes(include=['float'])
int_cols = stroke_df.select_dtypes(include=['int'])

for cols in float_cols.columns:
    stroke_df[cols] = pd.to_numeric(stroke_df[cols], downcast = 'float')
    
for cols in int_cols.columns:
    stroke_df[cols] = pd.to_numeric(stroke_df[cols], downcast = 'integer')
    
print(stroke_df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int32  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float32
 3   hypertension       5110 non-null   int8   
 4   heart_disease      5110 non-null   int8   
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float32
 9   bmi                4909 non-null   float32
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int8   
dtypes: float32(3), int32(1), int8(3), object(5)
memory usage: 294.5+ KB
None
Wall time: 35.9 ms


## Convert 'age' column from "float32" to "int8" datatype

In [25]:
%%time
stroke_df['age'] = stroke_df['age'].astype('int8')

Wall time: 2 ms


## What is the Shape of the DataFrame?

In [28]:
%%time
# Dimensions of the frame as a (rows, columns) tuple.
stroke_df.shape

Wall time: 0 ns


(5110, 12)

## Find Missing Values

### Count Method

In [30]:
%%time
# count() gives the number of non-null entries per column; a column whose
# count is lower than the frame's row total contains missing values.
stroke_df.count()

Wall time: 7.98 ms


id                   5110
gender               5110
age                  5110
hypertension         5110
heart_disease        5110
ever_married         5110
work_type            5110
Residence_type       5110
avg_glucose_level    5110
bmi                  4909
smoking_status       5110
stroke               5110
dtype: int64

### isnull Method for 'bmi' column

In [37]:
%%time
stroke_isnull_df = stroke_df['bmi'].isnull().sum()
stroke_isnull_df

Wall time: 998 µs


201

## If I dropped all these rows, what percentage of the data would be lost?