## IMPORTING NECESSARY LIBRARIES

In [1]:
# Standard library
import warnings

# Data manipulation and analysis
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Estimator introspection
from sklearn.base import is_classifier

# Machine learning models
from sklearn.model_selection import train_test_split

# Classification models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# Regression models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

# Hyperparameter tuning and cross-validation
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve, ShuffleSplit

# Evaluation metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.metrics import mean_squared_error, r2_score

# Suppress warnings
warnings.filterwarnings("ignore")

## LOADING DATA

In [2]:
# Load the dataset into `df`.
# Fail fast when the file is missing: every later cell reads `df`, so printing
# a message and continuing would only defer the failure to a confusing
# NameError on the next cell.
DATA_PATH = '../data/heart.csv'
try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError as exc:
    # Re-raise the same exception type with the offending path in the message.
    raise FileNotFoundError(f"File not found: {DATA_PATH}") from exc

In [3]:
# Display the column labels of the loaded DataFrame.
df.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object')

## SPLITTING DATA

In [4]:
# The label is the `target` column; the features are every remaining column.
y = df['target']
X = df.drop('target', axis=1)

In [5]:
# 80/10/10 partition done in two passes with the same seed:
#   pass 1 holds out 20% of the rows as a temporary set,
#   pass 2 halves that temporary set into validation and test.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)

## MODEL TRAINING

In [12]:
# Candidate estimators as (display name, unfitted estimator) pairs.
# The tree-based estimators have internal randomness, so they are seeded to
# make a Restart & Run All reproduce the same scores; 42 matches the seed
# already used for the train/validation/test splits.
# NOTE(review): the two regressors are fit on the same `target` column as the
# classifiers and scored with MSE/R2 downstream — confirm that is intended.
models = [
    ('Logistic Regression', LogisticRegression()),
    ('SVM', SVC()),
    ('KNN', KNeighborsClassifier()),
    ('Random Forest Classifier', RandomForestClassifier(random_state=42)),
    ('Decision Tree Regressor', DecisionTreeRegressor(random_state=42)),  # Example regression model
    ('Random Forest Regressor', RandomForestRegressor(random_state=42)),  # Example regression model
]


In [13]:
# Validation results keyed by model display name: the text classification
# report for classifiers, or a {'MSE': ..., 'R2': ...} dict for regressors.
model_results = {}

# Fit every candidate on the training split and score it on the validation split.
for model_name, model in models:
    print(f"Training {model_name}...")

    # Train the model
    model.fit(X_train, y_train)

    # Make predictions on validation set
    y_pred = model.predict(X_val)

    # Pick the metric family from the estimator itself rather than from a
    # hard-coded list of display names, so entries added to `models` later are
    # routed to the right metrics without editing this cell.
    if is_classifier(model):
        report = classification_report(y_val, y_pred)
        model_results[model_name] = report
        print(f"{model_name} Classification Report:\n{report}")
    else:
        # Anything that is not a classifier gets the regression metrics.
        mse = mean_squared_error(y_val, y_pred)
        r2 = r2_score(y_val, y_pred)
        model_results[model_name] = {'MSE': mse, 'R2': r2}
        print(f"{model_name} - Mean Squared Error: {mse:.4f}, R2 Score: {r2:.4f}")

    print("\n" + "-"*50 + "\n")


Training Logistic Regression...
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.69      0.77        52
           1       0.74      0.90      0.81        50

    accuracy                           0.79       102
   macro avg       0.81      0.80      0.79       102
weighted avg       0.81      0.79      0.79       102


--------------------------------------------------

Training SVM...
SVM Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.50      0.58        52
           1       0.59      0.76      0.67        50

    accuracy                           0.63       102
   macro avg       0.64      0.63      0.62       102
weighted avg       0.64      0.63      0.62       102


--------------------------------------------------

Training KNN...
KNN Classification Report:
              precision    recall  f1-score   support

           0       0.77 

In [14]:
# Score every estimator in `models` on the held-out test split.
# This cell calls `predict` only, so the estimators must already have been
# fitted by an earlier cell.
for model_name, model in models:
    print(f"Evaluating {model_name} on test set...")

    # Make predictions on the test set
    y_test_pred = model.predict(X_test)

    # Pick the metric family from the estimator itself rather than from a
    # hard-coded list of display names that has to be kept in sync with `models`.
    if is_classifier(model):
        print(f"{model_name} Test Set Classification Report:\n{classification_report(y_test, y_test_pred)}")
    else:
        # Anything that is not a classifier gets the regression metrics.
        mse_test = mean_squared_error(y_test, y_test_pred)
        r2_test = r2_score(y_test, y_test_pred)
        print(f"{model_name} Test Set - MSE: {mse_test:.4f}, R2: {r2_test:.4f}")

    print("\n" + "-"*50 + "\n")

Evaluating Logistic Regression on test set...
Logistic Regression Test Set Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.70      0.75        50
           1       0.75      0.85      0.80        53

    accuracy                           0.78       103
   macro avg       0.78      0.77      0.77       103
weighted avg       0.78      0.78      0.78       103


--------------------------------------------------

Evaluating SVM on test set...
SVM Test Set Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.72      0.73        50
           1       0.74      0.75      0.75        53

    accuracy                           0.74       103
   macro avg       0.74      0.74      0.74       103
weighted avg       0.74      0.74      0.74       103


--------------------------------------------------

Evaluating KNN on test set...
KNN Test Set Classification Report:
          