<a href="https://colab.research.google.com/github/Bpatnaik470/Bpatnaik470/blob/main/Model%20evaluation_Heart_Disease_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import os
import shutil
import kagglehub
arezaei81_heartcsv_path = kagglehub.dataset_download('arezaei81/heartcsv')

# Get the actual CSV file path within the downloaded directory
for filename in os.listdir(arezaei81_heartcsv_path):
    if filename.endswith(".csv"):
        csv_file_path = os.path.join(arezaei81_heartcsv_path, filename)
        break  # Stop searching after finding the first CSV file

print('Data source import complete.')

Data source import complete.


In [7]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from scipy.stats import boxcox
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

%matplotlib inline

In [10]:
# prompt: Build an automated process to test many modeling techniques and ML algorithms with
# your data to see which one yields the best results

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

def evaluate_models(X, y):
  """
  Evaluates multiple machine learning models using GridSearchCV.

  Args:
    X: The feature data (independent variables).
    y: The target variable (dependent variable).
  """

  models = {
      'Logistic Regression': LogisticRegression(),
      'K-Nearest Neighbors': KNeighborsClassifier(),
      'Support Vector Machine': SVC(),
      'Decision Tree': DecisionTreeClassifier(),
      'Random Forest': RandomForestClassifier(),
      'Naive Bayes': GaussianNB(),
      'Gradient Boosting': GradientBoostingClassifier(),
      'Linear Discriminant Analysis': LinearDiscriminantAnalysis(),
  }

  results = {}
  for model_name, model in models.items():
    print(f"Evaluating {model_name}...")

    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', model)
    ])

    # Define parameter grid (example, adjust as needed)
    param_grid = {}
    if model_name == 'K-Nearest Neighbors':
      param_grid = {'model__n_neighbors': [3, 5, 7, 9]}
    elif model_name == 'Support Vector Machine':
      param_grid = {'model__kernel': ['linear', 'rbf'], 'model__C': [0.1, 1, 10]}
    elif model_name == 'Decision Tree':
      param_grid = {'model__max_depth': [3, 5, 7]}
    elif model_name == 'Random Forest':
      param_grid = {'model__n_estimators': [100, 200], 'model__max_depth': [3, 5]}
    elif model_name == 'Logistic Regression':
      param_grid = {'model__C': [0.1, 1, 10]}
    elif model_name == 'Gradient Boosting':
      param_grid = {'model__n_estimators': [100, 200], 'model__learning_rate': [0.1, 0.05]}

    grid_search = GridSearchCV(pipeline, param_grid, cv=StratifiedKFold(n_splits=5), scoring='accuracy')
    grid_search.fit(X, y)

    results[model_name] = {
        'best_score': grid_search.best_score_,
        'best_params': grid_search.best_params_,
        'best_estimator': grid_search.best_estimator_ # Store the best estimator
    }

  return results


# Load your data (replace 'arezaei81_heartcsv_path' with your dataset path)
df = pd.read_csv(csv_file_path)

# Assuming 'target' is your target variable
X = df.drop('target', axis=1)
y = df['target']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Evaluate models
model_results = evaluate_models(X_train, y_train)

# Display results
for model_name, result in model_results.items():
    print(f"\n{model_name}:")
    print(f"  Best Score: {result['best_score']:.4f}")
    print(f"  Best Parameters: {result['best_params']}")


# Choose the best model based on the results and further refine it using the test set.
# Assuming you want to select the Decision Tree as the best model
best_dt = model_results['Decision Tree']['best_estimator'] # Assign best_dt

# Evaluate the optimized model on the train

Evaluating Logistic Regression...
Evaluating K-Nearest Neighbors...
Evaluating Support Vector Machine...
Evaluating Decision Tree...
Evaluating Random Forest...
Evaluating Naive Bayes...
Evaluating Gradient Boosting...
Evaluating Linear Discriminant Analysis...

Logistic Regression:
  Best Score: 0.8180
  Best Parameters: {'model__C': 1}

K-Nearest Neighbors:
  Best Score: 0.8141
  Best Parameters: {'model__n_neighbors': 5}

Support Vector Machine:
  Best Score: 0.8184
  Best Parameters: {'model__C': 1, 'model__kernel': 'rbf'}

Decision Tree:
  Best Score: 0.7561
  Best Parameters: {'model__max_depth': 7}

Random Forest:
  Best Score: 0.8261
  Best Parameters: {'model__max_depth': 3, 'model__n_estimators': 100}

Naive Bayes:
  Best Score: 0.7890
  Best Parameters: {}

Gradient Boosting:
  Best Score: 0.8056
  Best Parameters: {'model__learning_rate': 0.1, 'model__n_estimators': 100}

Linear Discriminant Analysis:
  Best Score: 0.7974
  Best Parameters: {}
