In [1]:
# dependencies
import pandas as pd

In [3]:
# Load the raw stroke dataset

# Relative location of the source CSV
file_path = 'resources/healthcare-dataset-stroke-data.csv'

# Parse the CSV into a DataFrame
stroke_df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows of the loaded frame
stroke_df.head(n=5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [6]:
# Report how many rows the DataFrame currently holds
num_rows = len(stroke_df.index)
print("Number of rows:", num_rows)


Number of rows: 5110


In [4]:
# Column names for reference: print the DataFrame's column Index so the
# exact labels (including capitalisation, e.g. 'Residence_type') are on hand
# when columns are selected by name.
print(stroke_df.columns)


Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')


# Data Clean Up: Find and Remove Missing Values

In [5]:
# Tally the missing (NaN) entries in each column

null_counts = stroke_df.isna().sum()
print(null_counts)

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64


In [7]:
# Remove every row that has a missing value in at least one column
# (axis=0 -> rows, how='any' -> a single NaN is enough to drop the row)

stroke_df.dropna(axis=0, how='any', inplace=True)


In [8]:
# Re-count the rows now that incomplete records have been dropped
# expected: 5110 - 201 = 4909

num_rows = len(stroke_df.index)
print("Number of rows:", num_rows)

Number of rows: 4909


In [10]:
# Dropping rows leaves gaps in the index labels, so rebuild the index as a
# consecutive range; drop=True discards the old labels instead of keeping
# them as a new column

stroke_df.reset_index(inplace=True, drop=True)
print(stroke_df.index)


RangeIndex(start=0, stop=4909, step=1)


# Data Clean Up: Duplicates

In [12]:
# Look for rows that exactly repeat an earlier row.
# No subset is given, so every column (including 'id') takes part in the
# comparison.

duplicate_rows = stroke_df.duplicated(keep='first')

# True counts as 1, so summing the boolean mask gives the number of repeats
num_duplicates = duplicate_rows.sum()
print("Number of duplicate rows:", num_duplicates)


Number of duplicate rows: 0


# Data Clean Up: Update the Data Types

In [13]:
# Check the data types: list the dtype pandas assigned to each column so
# any column that needs conversion can be spotted.

data_types = stroke_df.dtypes
print(data_types)


id                     int64
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object


In [14]:
# Convert the age column from float to integer.
# An explicit 'int64' is used instead of the builtin `int`, whose width is
# platform-dependent (32-bit on Windows with NumPy < 2.0), so the resulting
# dtype is the same on every machine.
# NOTE: a float -> int cast truncates toward zero; if the data contains
# fractional ages (none appear in the preview -- verify), the fraction is lost.

stroke_df['age'] = stroke_df['age'].astype('int64')


In [16]:
# Confirm the data types after the conversion
data_types = stroke_df.dtypes
print(data_types)

# Fail loudly if the age conversion did not take effect, rather than relying
# on someone reading the printed listing.
assert pd.api.types.is_integer_dtype(stroke_df['age']), "age column is not an integer dtype"

id                     int64
gender                object
age                    int64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object


# Data Clean Up: Export and save the cleaned data

In [18]:
# Verify the cleaned DataFrame before exporting it.
# info() prints its summary itself, whereas head() is only rendered when it
# is the last expression of the cell -- placed earlier, its result would be
# silently discarded -- so head() must come last.
stroke_df.info()
stroke_df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4909 entries, 0 to 4908
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 4909 non-null   int64  
 1   gender             4909 non-null   object 
 2   age                4909 non-null   int64  
 3   hypertension       4909 non-null   int64  
 4   heart_disease      4909 non-null   int64  
 5   ever_married       4909 non-null   object 
 6   work_type          4909 non-null   object 
 7   Residence_type     4909 non-null   object 
 8   avg_glucose_level  4909 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     4909 non-null   object 
 11  stroke             4909 non-null   int64  
dtypes: float64(2), int64(5), object(5)
memory usage: 460.3+ KB


In [19]:
# Write the cleaned frame to disk; index=False leaves the row index out of
# the file

stroke_df.to_csv(path_or_buf='resources/Cleaned_Stroke_Data.csv', index=False)
