In [188]:
# Title: StrokeGuard — Machine Learning-Based Stroke Risk Prediction Analysis

In [4]:
#Project Overview
#Problem Statement
#Dataset Description
#Project Goals

In [6]:
# 2. Data Analysis Pipeline

In [8]:
# Import necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.impute import SimpleImputer

In [9]:
# Lift the pandas column-count and line-width limits so wide frames print in full
for option_name in ('display.max_columns', 'display.width'):
    pd.set_option(option_name, None)

In [12]:
# Read the raw stroke dataset from the CSV file in the working directory into `df`
df = pd.read_csv(filepath_or_buffer='healthcare-dataset-stroke-data.csv')


In [250]:
# Quick look at the data: (rows, columns) followed by the leading rows
head_rows = df.head()
print("Dataset Shape:", df.shape)
print("\nFirst few rows of the dataset:", head_rows, sep="\n")

Dataset Shape: (5110, 12)

First few rows of the dataset:
      id  gender   age  hypertension  heart_disease ever_married  \
0   9046    Male  67.0             0              1          Yes   
1  51676  Female  61.0             0              0          Yes   
2  31112    Male  80.0             0              1          Yes   
3  60182  Female  49.0             0              0          Yes   
4   1665  Female  79.0             1              0          Yes   

       work_type Residence_type  avg_glucose_level   bmi   smoking_status  \
0        Private          Urban             228.69  36.6  formerly smoked   
1  Self-employed          Rural             202.21   NaN     never smoked   
2        Private          Rural             105.92  32.5     never smoked   
3        Private          Urban             171.23  34.4           smokes   
4  Self-employed          Rural             174.12  24.0     never smoked   

   stroke  
0       1  
1       1  
2       1  
3       1  
4       1 

In [252]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
print("\nDataset Information:")
df.info()


Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB
None


In [254]:
# Descriptive statistics (count, mean, std, quartiles, ...) of the numeric columns
numeric_summary = df.describe()
print("\nBasic statistical summary:", numeric_summary, sep="\n")



Basic statistical summary:
                 id          age  hypertension  heart_disease  \
count   5110.000000  5110.000000   5110.000000    5110.000000   
mean   36517.829354    43.226614      0.097456       0.054012   
std    21161.721625    22.612647      0.296607       0.226063   
min       67.000000     0.080000      0.000000       0.000000   
25%    17741.250000    25.000000      0.000000       0.000000   
50%    36932.000000    45.000000      0.000000       0.000000   
75%    54682.000000    61.000000      0.000000       0.000000   
max    72940.000000    82.000000      1.000000       1.000000   

       avg_glucose_level          bmi       stroke  
count        5110.000000  4909.000000  5110.000000  
mean          106.147677    28.893237     0.048728  
std            45.283560     7.854067     0.215320  
min            55.120000    10.300000     0.000000  
25%            77.245000    23.500000     0.000000  
50%            91.885000    28.100000     0.000000  
75%           1

In [18]:
# Per-column null counts, then per-column memory footprint in bytes
# (deep=True also measures the contents of object columns)
null_counts = df.isna().sum()
print("\nMissing values in each column:", null_counts, sep="\n")

bytes_per_column = df.memory_usage(deep=True)
print("\nChecking for memory usage:", bytes_per_column, sep="\n")


Missing values in each column:
id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

Checking for memory usage:
Index                   132
id                    40880
gender               276819
age                   40880
hypertension          40880
heart_disease         40880
ever_married          40880
work_type            292528
Residence_type       275940
avg_glucose_level     40880
bmi                   40880
smoking_status       301911
stroke                40880
dtype: int64


In [20]:
def verify_imputation(df):
    """
    Print a report that lets the reader check the BMI imputation.

    The report contains the descriptive statistics of the 'bmi' column and
    the number of values in it that are still missing. A non-zero count means
    the column has not been (fully) imputed yet.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. If it has no 'bmi' column a notice is printed and
        nothing else happens.

    Returns
    -------
    None
        The report is written to stdout only.
    """
    # Same column guard the other preprocessing cells use, instead of a KeyError
    if 'bmi' not in df.columns:
        print("\nNo 'bmi' column found; nothing to verify.")
        return

    bmi = df['bmi']
    print("\nBMI Statistics after imputation:")
    print(bmi.describe())

    # describe() only counts non-null entries, so the number of remaining gaps
    # is reported explicitly; this is the actual verification.
    remaining_missing = int(bmi.isnull().sum())
    print(f"\nMissing BMI values remaining: {remaining_missing}")

# `df` must already have been loaded by the read_csv cell above.
# NOTE: this call sits before the BMI imputation cell further down, so the
# statistics printed here describe the raw, not-yet-imputed column.
verify_imputation(df)



BMI Statistics after imputation:
count    4909.000000
mean       28.893237
std         7.854067
min        10.300000
25%        23.500000
50%        28.100000
75%        33.100000
max        97.600000
Name: bmi, dtype: float64


In [None]:
# Perform the preprocessing steps
# Convert binary categorical variables to numeric

binary_map = {'Yes': 1, 'No': 0}

# Encode the column in a single pass, and only when it exists.
# A second lookup of the already-encoded 0/1 values in binary_map would find
# no matching key and turn the whole column into missing values.
if 'ever_married' in df.columns:
    # Already-encoded 0/1 values map to themselves, so re-running this cell
    # leaves the column unchanged.
    ever_married_encoding = {**binary_map, 0: 0, 1: 1}
    # Series.map() yields NaN for any value that is not a key of the mapping,
    # which is how unexpected labels are flagged.
    df['ever_married'] = df['ever_married'].map(ever_married_encoding)


In [22]:
# Columns holding a small set of text labels; stored as pandas 'category' dtype
categorical_columns = ['gender', 'work_type', 'Residence_type', 'smoking_status']

# Convert one column at a time, skipping any name the frame does not contain
for col in categorical_columns:
    if col not in df.columns:
        continue
    df[col] = df[col].astype('category')


In [32]:
# Handle missing values in BMI
# (SimpleImputer is imported with the other libraries in the import cell at the
# top of the notebook, so every dependency is declared in one place.)
if 'bmi' in df.columns:
    # Replace each missing BMI with the median of the observed BMI values
    bmi_imputer = SimpleImputer(strategy='median')
    # Double brackets select a one-column DataFrame (2-D input for the imputer)
    # and assign the result back in the same shape
    df[['bmi']] = bmi_imputer.fit_transform(df[['bmi']])  # Keep DataFrame structure


In [34]:
# Count the BMI entries that are still null after imputation and report the result
missing_bmi_count = df['bmi'].isna().sum()

print(
    "All missing BMI values have been successfully imputed."
    if missing_bmi_count == 0
    else f"There are still {missing_bmi_count} missing BMI values."
)


All missing BMI values have been successfully imputed.


In [38]:
# Last look at the processed frame before it is written to disk:
# its shape, then one "name: dtype" line per column
print("Final verification before saving:")
print("\nDataset shape:", df.shape)
print("\nColumns and their types:")
for col, col_dtype in df.dtypes.items():
    print(f"{col}: {col_dtype}")

Final verification before saving:

Dataset shape: (5110, 12)

Columns and their types:
id: int64
gender: category
age: float64
hypertension: int64
heart_disease: int64
ever_married: int64
work_type: category
Residence_type: category
avg_glucose_level: float64
bmi: float64
smoking_status: category
stroke: int64


In [40]:
# Write the preprocessed frame to CSV; index=False leaves the row index out of the file
output_filename = 'stroke_data_preprocessed.csv'
df.to_csv(path_or_buf=output_filename, index=False)

In [42]:
# Verify the saved file
# Load it back to make sure it saved correctly
df_verify = pd.read_csv(output_filename)

print("\nVerification after saving:")
print("Original shape:", df.shape)
print("Saved file shape:", df_verify.shape)

# Verify all columns are preserved.
# Comparing plain lists yields False when the column counts differ; an
# element-wise `==` between two Index objects of different length raises instead.
columns_match = list(df.columns) == list(df_verify.columns)
print("\nAll columns present:", columns_match)

# Only report success when the reloaded file actually matches the frame in memory
if columns_match and df.shape == df_verify.shape:
    print(f"\nDataset successfully saved as '{output_filename}'")
else:
    print(f"\nWARNING: '{output_filename}' does not match the in-memory dataset")


Verification after saving:
Original shape: (5110, 12)
Saved file shape: (5110, 12)

All columns present: True

Dataset successfully saved as 'stroke_data_preprocessed.csv'
