## Introduction to Stroke Predict Project

This is the final project for DVAE26. In this project I have chosen to create a model for predictive analytics using tabular data. The dataset I decided to use for this project is the stroke dataset from Kaggle. The models used for prediction are a decision tree and a random forest.

### Imports For Data Quality

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px


In [2]:
from ydata_profiling import ProfileReport

## Data Exploration and Quality Analysis.

### Load the data

In [2]:
# Load the raw stroke CSV; the path is relative to the notebook's working directory.
df = pd.read_csv("data/healthcare-dataset-stroke-data.csv")

In [None]:
# Preview the first rows to sanity-check the parsed columns.
df.head()

In [None]:
# Column dtypes and non-null counts: a quick way to spot columns with missing values.
df.info() 

In [None]:
# Full profiling report for the frame.
# NOTE(review): presumably this DataFrame method is registered by the ydata_profiling import above; verify.
df.profile_report()

In [None]:
# Split the rows at the median BMI and compare the stroke counts on each side.
# Any row with a missing BMI fails both comparisons and is left out of both halves.
median_bmi = df['bmi'].median()
print(median_bmi)

at_or_below_median = df['bmi'] <= median_bmi
above_median = df['bmi'] > median_bmi
bmi_below_median = df[at_or_below_median]
bmi_above_median = df[above_median]

print(bmi_below_median["stroke"].value_counts())
print("__________________")
print(bmi_above_median["stroke"].value_counts())

## Data Cleaning

In [None]:
# Inspect the target column: which labels occur, and how often each one does
stroke_labels = df['stroke']
print(stroke_labels.unique())
print(stroke_labels.value_counts())

### Remove Unnecessary Columns

In [7]:
# Remove the 'id' column in place (presumably a row identifier; it is not used as a feature)
df.drop(columns=['id'], inplace=True)

### Remove duplicates

In [8]:
# Drop fully identical rows in place. This runs after 'id' was removed, so rows that
# differed only by their id are treated as duplicates.
df.drop_duplicates(inplace=True)

### Outlier handling

In [None]:
%matplotlib inline

# One horizontal box plot per continuous feature, drawn side by side
numeric_columns = ['age', 'avg_glucose_level', 'bmi']

fig, axes = plt.subplots(1, 3, figsize=(20, 5))
for ax, column in zip(axes, numeric_columns):
    sns.boxplot(x=df[column], color="#245D5F", ax=ax)
    ax.set_title(column)
fig.tight_layout()
plt.show()

There are no outliers in the 'age' column, but there are outliers in the 'bmi' and 'avg_glucose_level' columns. I cannot delete them because they are important and removing them would affect the model. I will therefore use scaling and models that are not sensitive to outliers, such as Random Forest.

## Data Visualisation

In [None]:
# Stroke vs. no-stroke counts for every categorical feature, laid out on a 3x3 grid
columns = ['gender', 'hypertension', 'ever_married', 'work_type', 'Residence_type', 'smoking_status', 'heart_disease']
plt.figure(figsize=(20, 13))
for position, feature in enumerate(columns, start=1):
    plt.subplot(3, 3, position)
    sns.countplot(x=df[feature], hue=df['stroke'], palette="ch:start=.5,rot=-.5")

In [None]:
# Density histogram of age with a KDE curve drawn over it
hist_style = dict(
    kde=True,
    stat="density",
    kde_kws=dict(cut=3),
    color="#036272",
    alpha=.6,
    edgecolor='white',
)

plt.figure(figsize=(18, 9))
sns.histplot(df["age"], **hist_style)
plt.title('Age distribution')
plt.show()

In [None]:
# Density histogram of the average glucose level with a KDE curve drawn over it
hist_style = dict(
    kde=True,
    stat="density",
    kde_kws=dict(cut=3),
    color="#036272",
    alpha=.6,
    edgecolor='white',
)

plt.figure(figsize=(18, 9))
sns.histplot(df["avg_glucose_level"], **hist_style)
plt.title('Average blood glucose distribution')
plt.show()

In [None]:
# Share of stroke vs. no-stroke rows in the cleaned data
stroke = df['stroke'].value_counts().to_dict()

fig = px.pie(
    names=list(stroke.keys()),      # concrete lists rather than dict views
    values=list(stroke.values()),
    title='Stroke Occurrence',
    color_discrete_sequence=px.colors.sequential.Aggrnyl,
)
# Kept as the last expression of the cell so the notebook renders the figure
fig.update_traces(textposition='inside', textinfo='percent+label')

## Data Preprocessing

### Resample Unbalanced Data

In [None]:
from sklearn.utils import resample

# NOTE(review): the minority class is oversampled here, before the train/test split
# further down, so copies of the same minority row can end up in both the training
# and the test set.

# Display original class distribution
print("Original class distribution:")
print(df['stroke'].value_counts())

# Split the rows by target label. The column is selected by name rather than by
# position, so this keeps working if the column order changes (for example when
# the cell is re-run after more columns have been appended).
df_0 = df[df['stroke'] == 0]
df_1 = df[df['stroke'] == 1]

# Display counts before resampling
print("\nClass distribution before resampling:")
print(f"Class 0 count: {df_0.shape[0]}")
print(f"Class 1 count: {df_1.shape[0]}")

# Resample minority class (1) with replacement until it matches the majority class (0)
df_1 = resample(df_1, replace=True, n_samples=df_0.shape[0], random_state=42)

# Combine the resampled minority class with the majority class
df_resampled = pd.concat([df_0, df_1])

# Display counts after resampling
print("\nClass distribution after resampling:")
print(df_resampled['stroke'].value_counts())

I chose to use resample instead of SMOTE because it ensures simplicity, avoids generating unrealistic categorical combinations, and works directly with the existing data without additional preprocessing.

In [None]:
# Build the balanced frame: all majority rows followed by the upsampled minority rows.
# pd.concat keeps the column names and dtypes, so nothing has to be re-labelled by hand
# (np.concatenate would return an object array without either).
# ignore_index=True gives a fresh 0..n-1 index, which is needed because resampling
# with replacement repeats the original row labels.
df = pd.concat([df_0, df_1], ignore_index=True)

# visualize balanced data
plt.figure(figsize=(15,9))
sns.countplot(x=df["stroke"],palette="dark:salmon_r")
plt.title("Stroke")
plt.show()

## Feature Engineering

### Creating Age Groups

In [None]:
# Define age groups
def age_group(age):
    """Map an age in years to one of four coarse age-group labels.

    Returns 'Child' below 18, 'Young Adult' from 18 up to (not including) 35,
    'Adult' from 35 up to (not including) 60, and 'Senior' otherwise.
    """
    if age < 18:
        return 'Child'
    if age < 35:
        return 'Young Adult'
    if age < 60:
        return 'Adult'
    # Reached for ages of 60 and above, and for any value that fails every
    # comparison above (e.g. NaN)
    return 'Senior'

# Derive the age-group label for every row
age_labels = df['age'].map(age_group)

# Store the labels as an ordered categorical so the groups keep their natural order
group_order = ['Child', 'Young Adult', 'Adult', 'Senior']
df['age_group'] = pd.Categorical(age_labels, categories=group_order, ordered=True)

print(df['age_group'].value_counts())

### Encoding Data

In [None]:
from sklearn.preprocessing import LabelEncoder

# age_group is an ordered categorical, so use its category codes
# (Child=0 < Young Adult=1 < Adult=2 < Senior=3). A LabelEncoder would number
# the labels alphabetically and throw that order away.
df['age_group'] = df['age_group'].cat.codes

# The remaining columns are nominal and get plain integer labels. One fitted
# encoder is kept per column so codes can be mapped back to the original labels
# later with label_encoders[column].inverse_transform(...).
columns_to_encode = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
label_encoders = {}
for column in columns_to_encode:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])

df.head()

### Null Data Handling

In [None]:
# Count the missing values in each column to see what needs imputing
df.isnull().sum()

In [None]:
from sklearn.impute import KNNImputer

# Hold the target out so the imputer neither uses nor rewrites it
stroke_col = df['stroke']
df = df.drop(columns=['stroke'])

# Fill each missing value from the 5 nearest rows.
# NOTE(review): the imputer is fitted on all rows, i.e. before the train/test split.
imputer = KNNImputer(n_neighbors=5)
df_imputed = imputer.fit_transform(df)

# Rebuild the frame with the original row index so the target column below is
# re-attached row-for-row, whatever that index looks like
df = pd.DataFrame(df_imputed, columns=df.columns, index=df.index)
df['stroke'] = stroke_col

# Check for nulls again
print(df.isnull().sum())

### Target Variable Distribution

In [None]:
# Relative frequency of each class
print("Target Variable Distribution:")
class_share = df['stroke'].value_counts(normalize=True)
print(class_share)

# Bar chart of the absolute class counts
ax = sns.countplot(data=df, x='stroke', palette='Set2')
ax.set_title('Target Variable Distribution')
ax.set_xlabel('Stroke (0 = No, 1 = Yes)')
ax.set_ylabel('Count')
plt.show()

### Feature Correlation Analysis

In [None]:
# Pairwise correlation between all columns, target included
correlation_matrix = df.corr()

# Annotated heatmap of the matrix, two decimals per cell
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Feature Correlation Matrix')
plt.show()

## Model Development

### Imports for ML

In [23]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler 
from sklearn.metrics import confusion_matrix , accuracy_score , precision_score, recall_score, f1_score, classification_report, ConfusionMatrixDisplay
from sklearn.tree import DecisionTreeClassifier
from sklearn .ensemble import RandomForestClassifier
import joblib

### Split Data into Features and Target

In [24]:
# Target: the stroke label, coerced to a numeric dtype
y = pd.to_numeric(df['stroke'])

# Features: every remaining column
x = df.drop(columns='stroke')

# Keep the column names for later use, since x is reassigned by the scaling step
feature_names = x.columns

### Data scaling

In [25]:
# Standardise every feature column.
# NOTE(review): the scaler is fitted on all rows, before the train/test split below,
# so the test rows contribute to the fitted statistics.
scaler = StandardScaler()

x = scaler.fit_transform(x)

### Split Data into train and test

In [26]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the split is repeatable.
# NOTE(review): the minority class was oversampled with replacement before this split,
# so the test set may contain copies of training rows and the reported scores may be optimistic.
x_train , x_test , y_train , y_test = train_test_split(x,y,test_size = .20, random_state=42)

### Decision Trees

In [None]:
# Single decision tree using entropy as the split criterion.
# random_state is fixed, matching the split and the random forest, so that
# repeated runs of the notebook give the same tree and the same score.
tree_model = DecisionTreeClassifier(criterion='entropy', random_state=42)
tree_model.fit(x_train, y_train)

# Score the tree on the held-out test rows
tree_preds = tree_model.predict(x_test)
tree_acc = accuracy_score(y_test, tree_preds)

print(f"Accuracy: {tree_acc:.3f}")

### Random Forest

In [None]:
# Candidate hyperparameters: forest size and maximum tree depth
param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [5, 10, 20, None],
}
rf_model = RandomForestClassifier(random_state=42)

# 5-fold cross-validated grid search over the candidates, using all available cores
grid_search = GridSearchCV(rf_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(x_train, y_train)

# Report the winning combination
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# The estimator refitted by the search with the best parameters
best_model = grid_search.best_estimator_

# Evaluate it on the held-out test set
rf_preds = best_model.predict(x_test)
rf_acc = accuracy_score(y_test, rf_preds)
print(f"Accuracy: {rf_acc:.3f}")

# Persist the tuned forest for deployment.
# NOTE(review): only the forest is saved; the fitted StandardScaler is not stored with it.
joblib.dump(best_model, 'best_rf_model.joblib')

### Random Forest Feature Importances

In [None]:
# Importance scores from the tuned forest, one per input column
feature_importances = best_model.feature_importances_

# Pair each score with its column name and put the most important first
importance_df = pd.DataFrame({"Feature": feature_names, "Importance": feature_importances})
importance_df = importance_df.sort_values(by="Importance", ascending=False)

# Horizontal bar chart of the scores
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(importance_df["Feature"], importance_df["Importance"], color='skyblue')
ax.set_xlabel("Feature Importance")
ax.set_ylabel("Features")
ax.set_title("Random Forest Feature Importances")
ax.invert_yaxis()  # first (largest) bar at the top
plt.show()

# Print the ten highest-ranked features
print(importance_df.head(10))

## Unit Tests

In [30]:
import unittest
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder

### Test for Label Encoding

In [31]:
class TestLabelEncoding(unittest.TestCase):
    """Check that LabelEncoder turns each categorical column into integer codes."""

    def setUp(self):
        # Small mock frame with three rows per categorical column
        self.df = pd.DataFrame({
            "age_group": ["Young", "Middle-aged", "Old"],
            "gender": ["Male", "Female", "Other"],
            "ever_married": ["No", "Yes", "No"],
            "work_type": ["Private", "Self-employed", "Govt_job"],
            "Residence_type": ["Urban", "Rural", "Urban"],
            "smoking_status": ["Never", "Formerly", "smokes"],
        })
        self.columns_to_encode = ['age_group', 'gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

    def test_label_encoding(self):
        # Encode every column with one encoder that is refitted per column
        encoder = LabelEncoder()
        df_encoded = self.df.copy()
        for name in self.columns_to_encode:
            df_encoded[name] = encoder.fit_transform(self.df[name])

        for name in self.columns_to_encode:
            encoded = df_encoded[name]
            original = self.df[name]

            # Codes must be integers
            self.assertTrue(pd.api.types.is_integer_dtype(encoded))

            # Encoding must neither merge nor invent categories
            self.assertEqual(len(encoded.unique()), len(original.unique()))

        # Spot-check one mapping: the expected codes are Female=0, Male=1, Other=2
        genders = self.df["gender"].unique()
        gender_mapping = dict(zip(genders, encoder.fit_transform(genders)))
        for label in ("Male", "Female", "Other"):
            self.assertIn(label, gender_mapping)
        self.assertEqual(gender_mapping["Other"], 2)
        self.assertEqual(gender_mapping["Male"], 1)
        self.assertEqual(gender_mapping["Female"], 0)

### Test for Feature-Target Split

In [32]:
class TestFeatureTargetSplit(unittest.TestCase):
    """Check that splitting off 'stroke' yields clean features and an intact target."""

    def test_split_features_target(self):
        # Mock dataset
        data = {
            "age": [25, 35, 45],
            "gender": ["Male", "Female", "Male"],
            "bmi": [24.5, 30.1, 29.4],
            "stroke": [0, 1, 0]
        }
        df = pd.DataFrame(data)

        # Split features and target
        x = df.drop("stroke", axis=1)
        y = pd.to_numeric(df["stroke"])

        # Both parts keep one entry per original row
        self.assertEqual(x.shape[0], df.shape[0])
        self.assertEqual(len(y), df.shape[0])

        # The target must not remain among the features, and nothing else may be lost
        self.assertNotIn("stroke", x.columns)
        self.assertListEqual(list(x.columns), ["age", "gender", "bmi"])

        # The target values must come through unchanged
        self.assertListEqual(y.tolist(), [0, 1, 0])

### Test for Missing Values

In [33]:
class TestMissingValues(unittest.TestCase):
    """Check that mean-filling leaves no missing values in the frame."""

    def test_missing_values(self):
        # Mock dataset; the last age is deliberately missing
        df = pd.DataFrame({
            "age": [25, 35, np.nan],
            "gender": ["Male", "Female", "Male"],
            "bmi": [24.5, 30.1, 29.4],
            "stroke": [0, 1, 0],
        })

        # The gap must be detectable before filling
        has_gaps = df.isnull().values.any()
        self.assertTrue(has_gaps)

        # Fill gaps with the mean of each numeric column
        column_means = df.mean(numeric_only=True)
        df_filled = df.fillna(column_means)
        still_has_gaps = df_filled.isnull().values.any()
        self.assertFalse(still_has_gaps)


### Test for Scaling

In [34]:
class TestScaling(unittest.TestCase):
    """Check that StandardScaler output has zero mean and unit spread per column."""

    def test_scaling(self):
        # Mock data as it would look after encoding
        x = pd.DataFrame({
            "age": [25, 35, 45],
            "bmi": [24.5, 30.1, 29.4],
            "gender": [0, 1, 2],
        })

        # Fit and apply the scaler in one step
        x_scaled = StandardScaler().fit_transform(x)

        # Each column should come out with mean ~0 and standard deviation ~1
        column_means = x_scaled.mean(axis=0)
        column_stds = x_scaled.std(axis=0)
        self.assertTrue(np.allclose(column_means, 0, atol=1e-7))
        self.assertTrue(np.allclose(column_stds, 1, atol=1e-7))


### Run all tests

In [None]:
# Run the unittest test cases from inside the notebook.
# argv=[''] passes only a dummy program name, so unittest does not try to parse
# the command line the notebook kernel was started with; exit=False stops
# unittest from calling sys.exit() once the run is finished.
if __name__ == '__main__':
    unittest.main(argv=[''], verbosity=2, exit=False)

## Results

### Comparing Models

In [None]:
# Model Comparison
results = {
    "Model": ["Decision Tree", "Random Forest"],
    "Accuracy": [tree_acc, rf_acc],
    "Precision": [
        precision_score(y_test, tree_preds),
        precision_score(y_test, rf_preds),
    ],
    "Recall": [
        recall_score(y_test, tree_preds),
        recall_score(y_test, rf_preds),
    ],
    "F1-Score": [
        f1_score(y_test, tree_preds),
        f1_score(y_test, rf_preds),
    ],
}

import pandas as pd
results_df = pd.DataFrame(results)
print("\nModel Comparison:\n")
print(results_df)

### Results for Decision Tree Model

In [None]:
# Per-class precision, recall and F1 for the decision tree
print("Classification Report:")
tree_report = classification_report(y_test, tree_preds)
print(tree_report)

# Confusion matrix of the decision tree's test predictions
class_names = ['No Stroke', 'Stroke']
cm = confusion_matrix(y_test, tree_preds)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
disp.plot(cmap='Blues')
plt.title("Confusion Matrix")
plt.show()

### Results for Random Forest Model

In [None]:
# Per-class precision, recall and F1 for the random forest
print("Classification Report:")
rf_report = classification_report(y_test, rf_preds)
print(rf_report)

# Confusion matrix of the random forest's test predictions
class_names = ['No Stroke', 'Stroke']
cm = confusion_matrix(y_test, rf_preds)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
disp.plot(cmap='Blues')
plt.title("Confusion Matrix")
plt.show()

## Deployment

### Create Repo for Hugging Face

In [40]:
import os

from huggingface_hub import HfApi

api = HfApi()

# Read the access token from the environment so the secret never lives in the
# notebook or in version control.
# SECURITY: a real token was previously hard-coded on this line; it must be
# revoked in the Hugging Face account settings and replaced with a new one.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise RuntimeError(
        "Set the HF_TOKEN environment variable to a Hugging Face access token "
        "with write permission before running the deployment cells."
    )

repo_name = "Asiya-Mohammed/random-forest-model"


In [None]:
# Create the public model repository; exist_ok=True makes this safe to re-run when it already exists
api.create_repo(repo_id=repo_name, exist_ok=True, private=False, token=hf_token)

### Deploy Best Model(Random Forest Model)

In [None]:
from huggingface_hub import upload_file

# Local model file; it is stored under the same name in the repository
model_path = "best_rf_model.joblib"

# Push the file to the model repository
upload_file(
    path_or_fileobj=model_path,
    path_in_repo=model_path,
    repo_id=repo_name,
    token=hf_token,
)

print(f"Model uploaded successfully to: https://huggingface.co/{repo_name}")

### Deploy ReadMe for Hugging Face

In [None]:
from huggingface_hub import upload_file

# Local readme written for the Hugging Face repository
readme_path = "hfREADME.md"

# Upload it as README.md: the Hub renders the repository's README.md as the
# model card, so a file stored under any other name would not be shown there.
upload_file(
    path_or_fileobj=readme_path,
    path_in_repo="README.md",
    repo_id=repo_name,
    token=hf_token,
)

print(f"Readme uploaded successfully to: https://huggingface.co/{repo_name}")