In [1]:
# import all necessary libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, ElasticNet
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    r2_score,
    mean_squared_error,
    accuracy_score,
    f1_score,
    precision_recall_curve,
    roc_auc_score,
    roc_curve,
    auc,
)

# Load the dataset.
# NOTE(review): this is an absolute, machine-specific path; the script only
# runs where the CSV exists at exactly this location.
df = pd.read_csv("C:/Users/demos/OneDrive/Desktop/ESCUELA/BINF5507/Assignment2/Heart_Disease.csv")

# Inspect the dataset: first rows, column dtypes / non-null counts, and
# summary statistics.
print(df.head())
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
df.info()
print(df.describe())

# Handle missing data and duplicates.
# Duplicates are dropped first so the column means used for imputation are
# not skewed by repeated rows. numeric_only=True restricts the means to
# numeric columns: DataFrame.mean() raises TypeError on non-numeric columns in
# pandas >= 2.0. Non-numeric columns therefore keep any missing values they
# have.
df = df.drop_duplicates()
df = df.fillna(df.mean(numeric_only=True))


# Build the feature matrix and the two targets.
# Features: every column except the two target columns "chol" and "num".
x = df.drop(["chol", "num"], axis=1)
# Regression target: cholesterol level.
y_reg = df["chol"]
# Classification target: 1 where "num" is greater than 0, otherwise 0.
y_clf = df["num"].gt(0).astype(int)

# Hold out 20% of the rows as a test set for the classification target; the
# fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y_clf,
    test_size=0.2,
    random_state=42,
)


# Standardize the features. The scaler is fitted on the training rows only and
# the same fitted transformation is then applied to both splits.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)


# Regression: train ElasticNet on the cholesterol target and tune its
# hyperparameters over an alpha x l1_ratio grid.

# The train/test split above was made on the classification target, so the
# matching cholesterol values are recovered through the row labels that the
# split preserved on y_train / y_test. This keeps the regression targets
# aligned row-for-row with the (already scaled) x_train / x_test arrays.
y_reg_train = y_reg.loc[y_train.index]
y_reg_test = y_reg.loc[y_test.index]

alpha_values = np.logspace(-3, 2, 10)
l1_ratios = np.linspace(0.1, 1, 10)
# Start from -inf so the first evaluated model always becomes the current
# best; starting from +inf meant no R2 could ever beat it and best_params
# stayed None.
best_r2, best_params = -np.inf, None
results = []

for alpha in alpha_values:
    for l1_ratio in l1_ratios:
        model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
        # Fit on the training rows and score on the held-out rows, both
        # against the cholesterol target.
        model.fit(x_train, y_reg_train)
        y_pred = model.predict(x_test)
        r2 = r2_score(y_reg_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_reg_test, y_pred))
        results.append((alpha, l1_ratio, r2, rmse))

        # NOTE(review): the best setting is chosen on the test split, so
        # best_r2 is an optimistic estimate of generalization performance.
        if r2 > best_r2:
            best_r2 = r2
            best_params = (alpha, l1_ratio)
            
# Heatmap of the R2 values collected in `results`: one row per l1_ratio, one
# column per alpha, each cell annotated with its R2.
results_df = pd.DataFrame(data=results, columns=['alpha', 'l1_ratio', 'R2', 'RMSE'])
heart_heatmap = results_df.pivot(index='l1_ratio', columns='alpha', values='R2')

plt.figure(figsize=(10, 6))
heatmap_ax = sns.heatmap(heart_heatmap, cmap='coolwarm', annot=True)
heatmap_ax.set_xlabel("Alpha")
heatmap_ax.set_ylabel("L1 Ratio")
heatmap_ax.set_title("ElasticNet R2 Score Heatmap")
plt.show()
        

## Logistic regression hyperparameter tuning
# The search space is a list of sub-grids so that only valid penalty/solver
# combinations are fitted. A single flat grid also crossed liblinear with
# 'elasticnet' and with no penalty (both unsupported by that solver), and
# repeated every non-elasticnet fit once per l1_ratio value, which produced
# failed and redundant fits.
log_reg = LogisticRegression(max_iter=1000)
log_params = [
    # l1 and l2 penalties are supported by both solvers.
    {'penalty': ['l1', 'l2'], 'solver': ['liblinear', 'saga']},
    # elasticnet needs the saga solver and is the only penalty that uses
    # l1_ratio.
    {'penalty': ['elasticnet'], 'solver': ['saga'], 'l1_ratio': [0.1, 0.5, 0.9]},
    # Unpenalized model. scikit-learn >= 1.2 spells this None; the string
    # 'none' is rejected by recent releases.
    {'penalty': [None], 'solver': ['saga']},
]
# 5-fold cross-validated search, selecting the configuration with the best F1.
log_grid = GridSearchCV(log_reg, log_params, scoring='f1', cv=5)
log_grid.fit(x_train, y_train)
best_log_model = log_grid.best_estimator_


## k-NN classifier: evaluate several neighborhood sizes on the test split ##
knn_results = []
for n in [1, 5, 10]:
    knn = KNeighborsClassifier(n_neighbors=n)
    knn.fit(x_train, y_train)
    y_pred = knn.predict(x_test)               # hard class labels
    y_prob = knn.predict_proba(x_test)[:, 1]   # probability of class 1
    acc = accuracy_score(y_test, y_pred)
    # F1 is a label-based metric: it must be computed from the predicted
    # labels. Passing the probabilities raises a ValueError whenever they are
    # not exactly 0/1 (i.e. for any k > 1).
    f1 = f1_score(y_test, y_pred)
    # AUROC is a ranking metric, so it takes the class-1 probabilities.
    auroc = roc_auc_score(y_test, y_prob)
    knn_results.append({'n_neighbors': n, 'accuracy': acc, 'F1': f1, 'AUROC': auroc})

# Keep the row (n_neighbors and its metrics) with the highest AUROC.
knn_df = pd.DataFrame(knn_results)
best_knn = knn_df.loc[knn_df['AUROC'].idxmax()]


# ROC and precision-recall curves for the tuned logistic-regression model,
# evaluated on the test split.
log_prob = best_log_model.predict_proba(x_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, log_prob)
precision, recall, _ = precision_recall_curve(y_test, log_prob)

# Summary areas shown in the legends. The original labels were f-strings with
# no placeholders (and misspelled "Logistic"), so the advertised AUROC/AUPRC
# values were never displayed.
log_auroc = roc_auc_score(y_test, log_prob)
log_auprc = auc(recall, precision)  # trapezoidal area under the PR curve

plt.figure(figsize=(14, 6))

# Left panel: ROC curve with the chance diagonal for reference.
plt.subplot(1, 2, 1)
plt.plot(fpr, tpr, label=f"Logistic (AUROC = {log_auroc:.3f})")
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel("False Positive Rates")
plt.ylabel("True Positive Rates")
plt.title("AUROC Curve")
plt.legend()

# Right panel: precision-recall curve.
plt.subplot(1, 2, 2)
plt.plot(recall, precision, label=f"Logistic (AUPRC = {log_auprc:.3f})")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("AUPRC Curve")
plt.legend()
plt.show()





ModuleNotFoundError: No module named 'seaborn'