In [4]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, auc
from imblearn.over_sampling import SMOTE

# Load dataset
# Assuming 'framingham.csv' is the dataset file provided by the user
# Dataset contains features for predicting heart disease
framingham_df = pd.read_csv('framingham.csv')

# Handle missing values by filling with median values
framingham_df.fillna(framingham_df.median(), inplace=True)

# Prepare features and target
X = framingham_df.drop(columns=['TenYearCHD'])  # Replace 'TenYearCHD' with the actual target column name if different
y = framingham_df['TenYearCHD']

# Feature selection using SelectKBest
selector = SelectKBest(score_func=f_classif, k='all')
X_new_selected = selector.fit_transform(X, y)

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_new_selected)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Apply SMOTE to balance the training dataset
sm = SMOTE(random_state=42)
X_train_smote, y_train_smote = sm.fit_resample(X_train, y_train)

# Train Logistic Regression model on SMOTE data
logreg = LogisticRegression(max_iter=500)
logreg.fit(X_train_smote, y_train_smote)
y_pred_logreg = logreg.predict(X_test)

# Train Random Forest model on SMOTE data
rf = RandomForestClassifier()
rf.fit(X_train_smote, y_train_smote)
y_pred_rf = rf.predict(X_test)

# Evaluate the models using classification report and ROC-AUC score
logreg_report = classification_report(y_test, y_pred_logreg)
rf_report = classification_report(y_test, y_pred_rf)

roc_auc_logreg = roc_auc_score(y_test, y_pred_logreg)
roc_auc_rf = roc_auc_score(y_test, y_pred_rf)

# Display results
print("Logistic Regression Results:")
print("----------------------------")
print("Classification Report:")
print(logreg_report)
print(f"ROC AUC Score: {roc_auc_logreg:.4f}\n")

print("Random Forest Results:")
print("-----------------------")
print("Classification Report:")
print(rf_report)
print(f"ROC AUC Score: {roc_auc_rf:.4f}")


Logistic Regression Results:
----------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.66      0.77       725
           1       0.23      0.60      0.34       123

    accuracy                           0.66       848
   macro avg       0.57      0.63      0.55       848
weighted avg       0.81      0.66      0.70       848

ROC AUC Score: 0.6332

Random Forest Results:
-----------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.92      0.90       725
           1       0.29      0.19      0.23       123

    accuracy                           0.82       848
   macro avg       0.58      0.56      0.56       848
weighted avg       0.79      0.82      0.80       848

ROC AUC Score: 0.5556


In [5]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, auc
from imblearn.over_sampling import SMOTE

# Load dataset
# Assuming 'framingham.csv' is the dataset file provided by the user
# Dataset contains features for predicting heart disease
framingham_df = pd.read_csv('framingham.csv')

# Handle missing values by filling with median values
framingham_df.fillna(framingham_df.median(), inplace=True)

# Prepare features and target
X = framingham_df.drop(columns=['TenYearCHD'])  # Replace 'TenYearCHD' with the actual target column name if different
y = framingham_df['TenYearCHD']

# Feature selection using SelectKBest
selector = SelectKBest(score_func=f_classif, k='all')
X_new_selected = selector.fit_transform(X, y)

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_new_selected)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Apply SMOTE to balance the training dataset
sm = SMOTE(random_state=42)
X_train_smote, y_train_smote = sm.fit_resample(X_train, y_train)

# Hyperparameter tuning using GridSearchCV for Logistic Regression
logreg = LogisticRegression(max_iter=500)
param_grid_logreg = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['lbfgs', 'liblinear']
}
grid_logreg = GridSearchCV(logreg, param_grid_logreg, cv=5, scoring='roc_auc')
grid_logreg.fit(X_train_smote, y_train_smote)
y_pred_logreg = grid_logreg.best_estimator_.predict(X_test)

# Hyperparameter tuning using GridSearchCV for Random Forest
rf = RandomForestClassifier()
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
grid_rf = GridSearchCV(rf, param_grid_rf, cv=5, scoring='roc_auc')
grid_rf.fit(X_train_smote, y_train_smote)
y_pred_rf = grid_rf.best_estimator_.predict(X_test)

# Evaluate the models using classification report and ROC-AUC score
logreg_report = classification_report(y_test, y_pred_logreg)
rf_report = classification_report(y_test, y_pred_rf)

roc_auc_logreg = roc_auc_score(y_test, y_pred_logreg)
roc_auc_rf = roc_auc_score(y_test, y_pred_rf)

# Display results
print("Logistic Regression Results:")
print("----------------------------")
print("Classification Report:")
print(logreg_report)
print(f"ROC AUC Score: {roc_auc_logreg:.4f}\n")

print("Random Forest Results:")
print("-----------------------")
print("Classification Report:")
print(rf_report)
print(f"ROC AUC Score: {roc_auc_rf:.4f}")


Logistic Regression Results:
----------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.66      0.77       725
           1       0.24      0.63      0.35       123

    accuracy                           0.66       848
   macro avg       0.58      0.64      0.56       848
weighted avg       0.81      0.66      0.71       848

ROC AUC Score: 0.6434

Random Forest Results:
-----------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.92      0.89       725
           1       0.29      0.20      0.23       123

    accuracy                           0.81       848
   macro avg       0.58      0.56      0.56       848
weighted avg       0.79      0.81      0.80       848

ROC AUC Score: 0.5576
