In [1]:
from google.colab import drive
# Mount google drive
drive.mount('/content/drive')

Mounted at /content/drive


In [18]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import validation_curve
from sklearn.model_selection import cross_val_score
import numpy as np
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold

In [3]:
# Locate csv path
file_path = '/content/drive/MyDrive/healthcare-dataset-stroke-data.csv'

# Read csv into dataframe
df = pd.read_csv(file_path)

In [4]:
# Look for null values
df.isna().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [5]:
# Look for duplicate value
df.duplicated().sum()

0

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [7]:
# Data Cleaning

# Remove useless column
df.drop(columns=['id'], inplace=True)

# Remove row with null value in the 'bmi' column
df = df.dropna(subset=['bmi'])

# Remove rows where 'gender' is 'Other'
df = df[df['gender'] != 'Other'].copy()

# Replace value with binary representation
df['gender'] = df['gender'].replace({'Male': 1, 'Female': 0})
df['ever_married'] = df['ever_married'].replace({'Yes': 1, 'No': 0})
df['Residence_type'] = df['Residence_type'].replace({'Urban': 1,'Rural':0})

# Perform one-hot encoding for 'work_type', 'smoking_status'
df = pd.get_dummies(df, columns=['work_type', 'smoking_status'], drop_first=True)


df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,1,67.0,0,1,1,1,228.69,36.6,1,0,1,0,0,1,0,0
2,1,80.0,0,1,1,0,105.92,32.5,1,0,1,0,0,0,1,0
3,0,49.0,0,0,1,1,171.23,34.4,1,0,1,0,0,0,0,1
4,0,79.0,1,0,1,0,174.12,24.0,1,0,0,1,0,0,1,0
5,1,81.0,0,0,1,1,186.21,29.0,1,0,1,0,0,1,0,0


In [8]:
# Extract the columns to be scaled
columns_to_scale = ['avg_glucose_level', 'bmi', 'age']

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit and transform the selected columns
df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])

# Display the first few rows of the updated DataFrame
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,1,1.069938,0,1,1,1,2.777797,0.981145,1,0,1,0,0,1,0,0
2,1,1.646336,0,1,1,0,0.014016,0.459086,1,0,1,0,0,0,1,0
3,0,0.271847,0,0,1,1,1.484266,0.701016,1,0,1,0,0,0,0,1
4,0,1.601998,1,0,1,0,1.549325,-0.623231,1,0,0,1,0,0,1,0
5,1,1.690675,0,0,1,1,1.821493,0.013426,1,0,1,0,0,1,0,0


In [9]:
### SPLITING DATA TO TRAIN & TEST
X = df.drop('stroke', axis=1)  # Features (excluding 'stroke')
y = df['stroke']  # Target variable 'stroke'

# Split the data into training and testing sets (e.g., 70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [10]:
### MODEL WITH IMBALANCE DATA

# Logistic Regression with original imbalanced data
logreg = LogisticRegression(random_state=42)
logreg.fit(X_train, y_train)
y_pred_logreg = logreg.predict(X_test)
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
print(f"Accuracy of Logistic Regression (Imbalanced Data): {accuracy_logreg:.4f}")
print("Logistic Regression (Imbalanced Data) Classification Report:")
print(classification_report(y_test, y_pred_logreg))

# Random Forest with original imbalanced data
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"\nAccuracy of Random Forest (Imbalanced Data): {accuracy_rf:.4f}")
print("Random Forest (Imbalanced Data) Classification Report:")
print(classification_report(y_test, y_pred_rf))

Accuracy of Logistic Regression (Imbalanced Data): 0.9511
Logistic Regression (Imbalanced Data) Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      1401
           1       0.00      0.00      0.00        72

    accuracy                           0.95      1473
   macro avg       0.48      0.50      0.49      1473
weighted avg       0.90      0.95      0.93      1473



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Accuracy of Random Forest (Imbalanced Data): 0.9504
Random Forest (Imbalanced Data) Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      1401
           1       0.33      0.01      0.03        72

    accuracy                           0.95      1473
   macro avg       0.64      0.51      0.50      1473
weighted avg       0.92      0.95      0.93      1473



In [11]:
### OVERSAMPLING
oversampler = RandomOverSampler(random_state=42)
X_train_balanced, y_train_balanced = oversampler.fit_resample(X_train, y_train)


In [12]:
### CHECKING DATA STATS

train_stroke_count = y_train_balanced.value_counts()
print("Balanced Train Data - 'stroke' counts:")
print(train_stroke_count)

# Count of 'stroke' occurrences in test data (y_test)
test_stroke_count = y_test.value_counts()
print("\nTest Data - 'stroke' counts:")
print(test_stroke_count)

Balanced Train Data - 'stroke' counts:
0    3298
1    3298
Name: stroke, dtype: int64

Test Data - 'stroke' counts:
0    1401
1      72
Name: stroke, dtype: int64


In [13]:
### MODEL WITH BALANCED DATA

# Logistic Regression
logreg = LogisticRegression(random_state=42)
logreg.fit(X_train_balanced, y_train_balanced)

# Predict on the original test set
y_pred_logreg = logreg.predict(X_test)

# Calculate accuracy for Logistic Regression
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
print(f"Accuracy of Logistic Regression: {accuracy_logreg:.4f}")

# Random Forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_balanced, y_train_balanced)

# Predict on the original test set
y_pred_rf = rf.predict(X_test)

accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Accuracy of Random Forest: {accuracy_rf:.4f}")

print("Logistic Regression:")
print(classification_report(y_test, y_pred_logreg))

# Random Forest
print("Random Forest:")
print(classification_report(y_test, y_pred_rf))

Accuracy of Logistic Regression: 0.7379
Accuracy of Random Forest: 0.9470
Logistic Regression:
              precision    recall  f1-score   support

           0       0.99      0.73      0.84      1401
           1       0.14      0.88      0.25        72

    accuracy                           0.74      1473
   macro avg       0.57      0.80      0.54      1473
weighted avg       0.95      0.74      0.81      1473

Random Forest:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      1401
           1       0.12      0.01      0.02        72

    accuracy                           0.95      1473
   macro avg       0.54      0.50      0.50      1473
weighted avg       0.91      0.95      0.93      1473



In [14]:
### MODEL TUNING

# Define hyperparameters grid for Logistic Regression with 'l1' penalty and 'liblinear' solver
logreg_params = {'C': [0.1, 1, 10, 100], 'penalty': ['l1'], 'solver': ['liblinear']}

# GridSearchCV for Logistic Regression
logreg_grid = GridSearchCV(LogisticRegression(random_state=42), logreg_params, cv=5, scoring='accuracy')
logreg_grid.fit(X_train_balanced, y_train_balanced)

# Get best parameters and best score for Logistic Regression
print("Best Parameters for Logistic Regression:", logreg_grid.best_params_)
print("Best Accuracy Score for Logistic Regression:", logreg_grid.best_score_)

# Define hyperparameters grid for Random Forest
rf_params = {'n_estimators': [100, 200, 300], 'max_depth': [None, 10, 20, 30]}

# GridSearchCV for Random Forest
rf_grid = GridSearchCV(RandomForestClassifier(random_state=42), rf_params, cv=5, scoring='accuracy')
rf_grid.fit(X_train_balanced, y_train_balanced)

# Get best parameters and best score for Random Forest
print("\nBest Parameters for Random Forest:", rf_grid.best_params_)
print("Best Accuracy Score for Random Forest:", rf_grid.best_score_)

Best Parameters for Logistic Regression: {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}
Best Accuracy Score for Logistic Regression: 0.7650094194408068

Best Parameters for Random Forest: {'max_depth': 20, 'n_estimators': 100}
Best Accuracy Score for Random Forest: 0.996361568681508


In [19]:
# Initialize the RandomForestClassifier with the best hyperparameters
best_rf_estimator = RandomForestClassifier(max_depth=20, n_estimators=200, random_state=42)

# Perform 5-fold cross-validation (you can adjust cv parameter as needed)
cv_scores_rf = cross_val_score(best_rf_estimator, X_train_balanced, y_train_balanced, cv=5, scoring='accuracy')

# Perform cross-validation predictions to compute other evaluation metrics
cv_predictions_rf = cross_val_predict(best_rf_estimator, X_train_balanced, y_train_balanced, cv=5)

# Print the cross-validation scores
print("Random Forest Cross-validation scores:", cv_scores_rf)
print("Random Forest Mean CV accuracy:", cv_scores_rf.mean())

# Print classification report for each fold
for fold, (train_idx, val_idx) in enumerate(StratifiedKFold(n_splits=5, shuffle=True, random_state=42).split(X_train_balanced, y_train_balanced)):
    X_train_fold, X_val_fold = X_train_balanced.iloc[train_idx], X_train_balanced.iloc[val_idx]
    y_train_fold, y_val_fold = y_train_balanced.iloc[train_idx], y_train_balanced.iloc[val_idx]

    best_rf_estimator.fit(X_train_fold, y_train_fold)
    y_pred = best_rf_estimator.predict(X_val_fold)

    print(f"Classification Report for Fold {fold + 1}:")
    print(classification_report(y_val_fold, y_pred))

# Compute and print confusion matrix
conf_matrix_rf = confusion_matrix(y_train_balanced, cv_predictions_rf)
print("Confusion Matrix for Random Forest:")
print(conf_matrix_rf)

Random Forest Cross-validation scores: [0.99393939 0.99772555 0.99620925 0.9954511  0.99620925]
Random Forest Mean CV accuracy: 0.9959069083557331
Classification Report for Fold 1:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       660
           1       1.00      1.00      1.00       660

    accuracy                           1.00      1320
   macro avg       1.00      1.00      1.00      1320
weighted avg       1.00      1.00      1.00      1320

Classification Report for Fold 2:
              precision    recall  f1-score   support

           0       1.00      0.99      1.00       660
           1       0.99      1.00      1.00       659

    accuracy                           1.00      1319
   macro avg       1.00      1.00      1.00      1319
weighted avg       1.00      1.00      1.00      1319

Classification Report for Fold 3:
              precision    recall  f1-score   support

           0       1.00      1.00      1.0