In [8]:
# Import libraries and dependencies
import pandas as pd
import numpy as np
from pathlib import Path
import hvplot.pandas
import matplotlib.pyplot as plt

In [9]:
# Load the stroke dataset from the local Resources folder into a DataFrame
stroke_csv_path = Path("./Resources/healthcare-dataset-stroke-data.csv")
healthcare_df = pd.read_csv(stroke_csv_path)

# Preview the first rows to confirm the file was parsed as expected
healthcare_df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [10]:
# Drop the 'id' column
healthcare_df = healthcare_df.drop(columns=['id'])

In [11]:
# Count the missing values in each column
healthcare_df.isna().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [12]:
# Compute the mean BMI (missing entries are skipped by default)
bmi_mean = healthcare_df['bmi'].mean()
bmi_mean

28.893236911794666

In [13]:
# Fill the NaN values in the 'bmi' column with that column's mean.
# The {column: value} mapping restricts the fill to 'bmi'; passing the bare
# scalar to DataFrame.fillna would also replace NaNs in every other column
# with the BMI mean.
healthcare_df = healthcare_df.fillna({'bmi': healthcare_df['bmi'].mean()})
healthcare_df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893237,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [14]:
# Add a feature for the 'number of risks': how many of the five indicators
# below are true for each row.

# Cut-offs used by the glucose and BMI indicators
GLUCOSE_RISK_THRESHOLD = 140  # avg_glucose_level at or above this counts as a risk
BMI_RISK_THRESHOLD = 25       # bmi strictly above this counts as a risk

# One boolean Series per indicator, evaluated for all rows at once rather than
# row by row with iterrows()
risk_indicators = [
    healthcare_df['avg_glucose_level'] >= GLUCOSE_RISK_THRESHOLD,
    healthcare_df['heart_disease'] == 1,
    healthcare_df['hypertension'] == 1,
    healthcare_df['smoking_status'].isin(['smokes', 'formerly smoked']),
    healthcare_df['bmi'] > BMI_RISK_THRESHOLD,
]

# Sum the flags per row; kept as a plain list of ints so `number_of_risks`
# holds the same kind of value as the original loop produced
number_of_risks = sum(flag.astype(int) for flag in risk_indicators).tolist()

healthcare_df["number_of_risks"] = number_of_risks

healthcare_df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,number_of_risks
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1,4
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893237,never smoked,1,2
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1,2
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1,3
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1,2


In [15]:
# Set up the encoder used to convert the categorical data.
# sparse_output=False makes the encoder return a dense array rather than a
# sparse matrix.
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder(
    sparse_output=False,
)

In [18]:
# Print each column's dtype and non-null count for the current DataFrame
healthcare_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             5110 non-null   object 
 1   age                5110 non-null   float64
 2   hypertension       5110 non-null   int64  
 3   heart_disease      5110 non-null   int64  
 4   ever_married       5110 non-null   object 
 5   work_type          5110 non-null   object 
 6   Residence_type     5110 non-null   object 
 7   avg_glucose_level  5110 non-null   float64
 8   bmi                5110 non-null   float64
 9   smoking_status     5110 non-null   object 
 10  stroke             5110 non-null   int64  
 11  number_of_risks    5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [19]:
# List the names of the columns whose dtype is "object"
[column for column, dtype in healthcare_df.dtypes.items() if dtype == "object"]

['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

In [20]:
# Columns to one-hot encode: the object-dtype columns plus the
# 'number_of_risks' count
categorical_variables = [
    'gender',
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
    'number_of_risks',
]

# Fit the OneHotEncoder on those columns and encode them in a single step
categorical_subset = healthcare_df.loc[:, categorical_variables]
encoded_data = enc.fit_transform(categorical_subset)

In [21]:
 # Create a DataFrame with the encoded variables
encoded_df = pd.DataFrame(
    encoded_data,
    columns=enc.get_feature_names_out(categorical_variables),
    # Reuse the source frame's row labels. A column-wise pd.concat aligns on
    # the index, so a default RangeIndex here would misalign rows (and
    # introduce NaNs) if healthcare_df were ever filtered or re-indexed
    # before encoding.
    index=healthcare_df.index,
)

# Display sample data
encoded_df.head()

Unnamed: 0,gender_Female,gender_Male,gender_Other,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,...,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,number_of_risks_0,number_of_risks_1,number_of_risks_2,number_of_risks_3,number_of_risks_4,number_of_risks_5
0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [22]:
# Keep only the columns that were not one-hot encoded by removing the
# categorical columns from the original dataset
numerical_variables_df = healthcare_df.drop(categorical_variables, axis=1)

# Preview the remaining columns
numerical_variables_df.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
0,67.0,0,1,228.69,36.6,1
1,61.0,0,0,202.21,28.893237,1
2,80.0,0,1,105.92,32.5,1
3,49.0,0,0,171.23,34.4,1
4,79.0,1,0,174.12,24.0,1


In [23]:
# Combine the DataFrame that contains the numerical data with the one that
# contains the encoded categorical data, placing them side by side
frames_to_join = [numerical_variables_df, encoded_df]
healthcare_encoded = pd.concat(frames_to_join, axis="columns")

healthcare_encoded.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,gender_Other,ever_married_No,...,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,number_of_risks_0,number_of_risks_1,number_of_risks_2,number_of_risks_3,number_of_risks_4,number_of_risks_5
0,67.0,0,1,228.69,36.6,1,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,61.0,0,0,202.21,28.893237,1,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,80.0,0,1,105.92,32.5,1,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,49.0,0,0,171.23,34.4,1,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,79.0,1,0,174.12,24.0,1,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
