In [1]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
from google.colab import files

In [2]:
# Upload dataset
uploaded = files.upload()

Saving heart_disease.csv to heart_disease.csv


In [3]:
df = pd.read_csv("heart_disease.csv")

print("✅ Dataset Preview:")
print(df.head())
print("\n📊 Dataset Info:")
print(df.info())

# Features (X) and Target (y)
X = df.drop("num", axis=1)
y = (df["num"] > 0).astype(int)   # 0 = no disease, 1 = disease present

✅ Dataset Preview:
        age       sex        cp  trestbps      chol       fbs   restecg  \
0  0.948726  0.686202 -2.251775  0.757525 -0.264900  2.394438  1.016684   
1  1.392002  0.686202  0.877985  1.611220  0.760415 -0.417635  1.016684   
2  1.392002  0.686202  0.877985 -0.665300 -0.342283 -0.417635  1.016684   
3 -1.932564  0.686202 -0.165268 -0.096170  0.063974 -0.417635 -0.996749   
4 -1.489288 -1.457296 -1.208521 -0.096170 -0.825922 -0.417635  1.016684   

    thalach     exang   oldpeak     slope        ca      thal       num  
0  0.017197 -0.696631  1.087338  2.274579 -0.711131  0.660004 -0.764198  
1 -1.821905  1.435481  0.397182  0.649113  2.504881 -0.890238  0.866450  
2 -0.902354  1.435481  1.346147  0.649113  1.432877  1.176752  0.051126  
3  1.637359 -0.696631  2.122573  2.274579 -0.711131 -0.890238 -0.764198  
4  0.980537 -0.696631  0.310912 -0.976352 -0.711131 -0.890238 -0.764198  

📊 Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 3

In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("✅ Train-Test Split Done")
print("Training samples:", X_train.shape[0])
print("Testing samples:", X_test.shape[0])

✅ Train-Test Split Done
Training samples: 242
Testing samples: 61


In [5]:
# Baseline model
rf = RandomForestClassifier(random_state=42)

In [6]:
# Grid Search
param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 5, 10],
    "min_samples_split": [2, 5]
}

grid_search = GridSearchCV(rf, param_grid, cv=5, scoring="accuracy", n_jobs=-1)
grid_search.fit(X_train, y_train)

print("✅ Best params (GridSearch):", grid_search.best_params_)
print("✅ Best CV score (GridSearch):", grid_search.best_score_)

✅ Best params (GridSearch): {'max_depth': 5, 'min_samples_split': 5, 'n_estimators': 100}
✅ Best CV score (GridSearch): 0.8384353741496599


In [7]:
# Randomized Search
param_dist = {
    "n_estimators": np.arange(50, 300, 50),
    "max_depth": [None, 5, 10, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}

random_search = RandomizedSearchCV(
    rf, param_distributions=param_dist,
    n_iter=20, cv=5, scoring="accuracy",
    random_state=42, n_jobs=-1
)
random_search.fit(X_train, y_train)

print("✅ Best params (RandomizedSearch):", random_search.best_params_)
print("✅ Best CV score (RandomizedSearch):", random_search.best_score_)

✅ Best params (RandomizedSearch): {'n_estimators': np.int64(100), 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 5}
✅ Best CV score (RandomizedSearch): 0.8384353741496599


In [8]:
# Best GridSearch model
best_grid = grid_search.best_estimator_
y_pred_grid = best_grid.predict(X_test)



In [9]:
# Best RandomizedSearch model
best_random = random_search.best_estimator_
y_pred_random = best_random.predict(X_test)


In [None]:
# Save evaluation results into a file
with open("evaluation_metrics.txt", "w") as f:
    f.write("FINAL MODEL EVALUATION METRICS\n")
    f.write("="*50 + "\n\n")

    # GridSearch Results
    f.write("📊 GridSearch Best Model:\n")
    f.write(f"Best Params: {grid_search.best_params_}\n")
    f.write(f"Best CV Score: {grid_search.best_score_:.4f}\n")
    f.write(f"Test Accuracy: {accuracy_score(y_test, y_pred_grid):.4f}\n")
    f.write("Classification Report:\n")
    f.write(classification_report(y_test, y_pred_grid))
    f.write("\n" + "-"*50 + "\n\n")

    # RandomizedSearch Results
    f.write("📊 RandomizedSearch Best Model:\n")
    f.write(f"Best Params: {random_search.best_params_}\n")
    f.write(f"Best CV Score: {random_search.best_score_:.4f}\n")
    f.write(f"Test Accuracy: {accuracy_score(y_test, y_pred_random):.4f}\n")
    f.write("Classification Report:\n")
    f.write(classification_report(y_test, y_pred_random))

print("✅ Evaluation metrics saved to evaluation_metrics.txt")

✅ Evaluation metrics saved to evaluation_metrics.txt


In [None]:
files.download("evaluation_metrics.txt")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [10]:
best_params = random_search.best_params_

# Save them to a file
import joblib
joblib.dump(best_params, "best_params.pkl")

print("✅ Saved best_params:", best_params)

✅ Saved best_params: {'n_estimators': np.int64(100), 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 5}


In [11]:
!ls -lh /content/

total 92K
-rw-r--r-- 1 root root  193 Sep 26 18:38 best_params.pkl
-rw-r--r-- 1 root root  81K Sep 26 18:32 heart_disease.csv
drwxr-xr-x 1 root root 4.0K Sep 25 13:37 sample_data


In [None]:
from google.colab import files
files.download("best_params.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [12]:
!pip freeze > requirements.txt

from google.colab import files
files.download("requirements.txt")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>