In [1]:
import pandas as pd  # For data manipulation and analysis
import numpy as np  # For numerical computations
from sklearn.model_selection import train_test_split  # For splitting the dataset
from sklearn.preprocessing import StandardScaler, LabelEncoder  # For scaling and encoding

In [4]:
# Load the dataset
stroke_data = pd.read_csv('./data/healthcare-dataset-stroke-data.csv')

# Display the first few rows of the dataset
print(stroke_data.head())

# Drop rows with missing values
stroke_data.dropna(inplace=True)

# Drop unnecessary columns
stroke_data.drop(columns=['id'], inplace=True)

# Encoding categorical features
categorical_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
label_encoder = LabelEncoder()
for col in categorical_columns:
    stroke_data[col] = label_encoder.fit_transform(stroke_data[col])

# Handling missing values (e.g., 'bmi' column)
stroke_data['bmi'] = stroke_data['bmi'].fillna(stroke_data['bmi'].mean())

# Scaling numerical features
numerical_columns = ['age', 'avg_glucose_level', 'bmi']
scaler = StandardScaler()
stroke_data[numerical_columns] = scaler.fit_transform(stroke_data[numerical_columns])

print(stroke_data.head())

      id  gender   age  hypertension  heart_disease ever_married  \
0   9046    Male  67.0             0              1          Yes   
1  51676  Female  61.0             0              0          Yes   
2  31112    Male  80.0             0              1          Yes   
3  60182  Female  49.0             0              0          Yes   
4   1665  Female  79.0             1              0          Yes   

       work_type Residence_type  avg_glucose_level   bmi   smoking_status  \
0        Private          Urban             228.69  36.6  formerly smoked   
1  Self-employed          Rural             202.21   NaN     never smoked   
2        Private          Rural             105.92  32.5     never smoked   
3        Private          Urban             171.23  34.4           smokes   
4  Self-employed          Rural             174.12  24.0     never smoked   

   stroke  
0       1  
1       1  
2       1  
3       1  
4       1  
   gender       age  hypertension  heart_disease  ever_m

In [5]:
# Splitting the dataset into features and target variable
X = stroke_data.drop(columns=['stroke'])    # Features
y = stroke_data['stroke']                  # Target variable
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Display the shapes of the training and testing sets
print(f"Training set shape: {X_train.shape}, {y_train.shape}")
print(f"Testing set shape: {X_test.shape}, {y_test.shape}")

Training set shape: (3927, 10), (3927,)
Testing set shape: (982, 10), (982,)


Train

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Initialize the Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Train the model
rf_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the Random Forest model: {accuracy:.2f}")

Accuracy of the Random Forest model: 0.95


HPO

In [8]:
import optuna
from sklearn.model_selection import cross_val_score

def objective(trial):
    # Define the hyperparameters to tune
    n_estimators = trial.suggest_int('n_estimators', 10, 200)
    max_depth = trial.suggest_int('max_depth', 2, 32, log=True)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)
    
    # Create the model with the suggested hyperparameters
    rf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42
    )
    
    # Perform cross-validation and return the mean accuracy
    score = cross_val_score(rf, X_train, y_train, cv=3, scoring='accuracy').mean()
    return score

# Create a study and optimize the objective function
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

# Print the best hyperparameters and score
print("Best hyperparameters:", study.best_params)
print("Best accuracy:", study.best_value)

[I 2025-05-06 10:00:27,088] A new study created in memory with name: no-name-e0b0d011-8bc6-403c-a4ce-c887c15fc66b
[I 2025-05-06 10:00:27,405] Trial 0 finished with value: 0.9602750190985484 and parameters: {'n_estimators': 10, 'max_depth': 5, 'min_samples_split': 17, 'min_samples_leaf': 12}. Best is trial 0 with value: 0.9602750190985484.
[I 2025-05-06 10:00:28,272] Trial 1 finished with value: 0.9602750190985484 and parameters: {'n_estimators': 59, 'max_depth': 3, 'min_samples_split': 5, 'min_samples_leaf': 16}. Best is trial 0 with value: 0.9602750190985484.
[I 2025-05-06 10:00:31,852] Trial 2 finished with value: 0.9602750190985484 and parameters: {'n_estimators': 166, 'max_depth': 11, 'min_samples_split': 16, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.9602750190985484.
[I 2025-05-06 10:00:35,446] Trial 3 finished with value: 0.9602750190985484 and parameters: {'n_estimators': 199, 'max_depth': 7, 'min_samples_split': 11, 'min_samples_leaf': 13}. Best is trial 0 with valu

Best hyperparameters: {'n_estimators': 10, 'max_depth': 5, 'min_samples_split': 17, 'min_samples_leaf': 12}
Best accuracy: 0.9602750190985484
