# In [None]:
# Integer-encode every object-dtype column of heart_df in place, keeping the
# fitted encoder for each column in `label_encoders` (keyed by column name)
# so the integer codes can be mapped back to their labels later.
label_encoders = {}
object_columns = heart_df.select_dtypes(include=['object']).columns
for column in object_columns:
    le = LabelEncoder()
    # Every value is stringified before encoding.
    # NOTE(review): presumably this also turns missing values into an
    # ordinary string category -- confirm that is intended.
    column_as_text = heart_df[column].astype(str)
    heart_df[column] = le.fit_transform(column_as_text)
    label_encoders[column] = le

# Display the first few rows of the transformed dataset.
# A bare `heart_df.head(10)` expression is discarded unless it is the last
# statement of a notebook cell, so print it explicitly.
print(heart_df.head(10))

# Check the unique values in the 'num' column before collapsing them
print(heart_df['num'].unique())

# Map 'num' to binary (e.g., 0: no disease, 1: disease present):
# every value greater than 0 becomes 1, everything else becomes 0.
# The vectorised comparison replaces a row-by-row apply(lambda ...).
heart_df['num'] = (heart_df['num'] > 0).astype(int)

# Verify the changes
print(heart_df['num'].unique())


# Fit two formula-based logit models of `num` on heart_df and print each
# summary. NOTE(review): `logit` is presumably statsmodels' formula API --
# confirm against the imports.

# First model: all fourteen candidate predictors.
all_predictors = [
    "age", "sex", "dataset", "cp", "trestbps", "chol", "fbs",
    "restecg", "thalch", "exang", "oldpeak", "slope", "ca", "thal",
]
full_formula = "num ~ " + " + ".join(all_predictors)
logistic_model = logit(full_formula, data=heart_df).fit()
print(logistic_model.summary())

# Second model: a seven-predictor subset. This rebinds `logistic_model`,
# so only the reduced fit remains available afterwards.
reduced_predictors = ["sex", "cp", "chol", "exang", "oldpeak", "slope", "ca"]
reduced_formula = "num ~ " + " + ".join(reduced_predictors)
logistic_model = logit(reduced_formula, data=heart_df).fit()
print(logistic_model.summary())


# Score `logistic_model` on heart_df itself: build the predictor frame and
# target, threshold the model output at 0.5, then report the confusion
# matrix and accuracy.
# NOTE(review): the rows scored here come from the same heart_df the model
# is fitted on elsewhere in this file, so this is an in-sample figure.
logistic_features = ['sex', 'cp', 'chol', 'exang', 'oldpeak', 'slope', 'ca']
X_logistic = heart_df[logistic_features]
y_logistic = heart_df["num"]

# Predictions stay boolean: True where the model output exceeds 0.5.
model_output = logistic_model.predict(X_logistic)
predictions = model_output > 0.5

cf_matrix = confusion_matrix(y_true=y_logistic, y_pred=predictions)
accuracy = accuracy_score(y_true=y_logistic, y_pred=predictions)

print(cf_matrix)
print("Accuracy of the new Model: ", accuracy)


# Separate the target (`num`) from the predictors. The `id` and `age_bins`
# columns are dropped from the feature matrix along with the target.
y = heart_df['num']
X = heart_df.drop(columns=['num', 'id', 'age_bins'])

print(X)
print(y)

# Hold out 25% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)


from sklearn.preprocessing import StandardScaler

# Candidate classifiers keyed by display name. The same names are used to
# look up each model's hyperparameter grid. Every estimator that is given a
# random_state uses the seed 42.
models = {}
models['Random Forest'] = RandomForestClassifier(random_state=42)
models['Gradient Boosting'] = GradientBoostingClassifier(random_state=42)
models['Logistic Regression'] = LogisticRegression(random_state=42)
models['K-Nearest Neighbors'] = KNeighborsClassifier()
models['XG Boost'] = XGBClassifier(random_state=42)

# Hyperparameter grids per model, written with plain estimator parameter
# names. The 'model__' prefix that targets the pipeline step named 'model'
# is added once, below, when building `params`.
base_grids = {
    'Random Forest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20],
        'min_samples_split': [2, 5],
    },
    'Gradient Boosting': {
        'n_estimators': [100, 200],
        'learning_rate': [0.1, 0.01],
        'max_depth': [3, 5],
    },
    'Logistic Regression': {
        'C': [1, 10],
        'solver': ['lbfgs', 'liblinear'],
    },
    'K-Nearest Neighbors': {
        'n_neighbors': [3, 5],
        'weights': ['uniform', 'distance'],
    },
    'XG Boost': {
        'n_estimators': [100, 200],
        'learning_rate': [0.1, 0.01],
        'max_depth': [3, 5],
    },
}

# Search grids keyed by model display name, with pipeline-prefixed keys.
params = {
    model_name: {
        f"model__{param_name}": candidates
        for param_name, candidates in grid.items()
    }
    for model_name, grid in base_grids.items()
}

# Grid-search every candidate model inside a scaler + model pipeline, report
# its metrics on the held-out test set, and remember the fitted pipeline with
# the highest test accuracy.
#
# NOTE(review): the winner is chosen by accuracy on X_test, so the reported
# "best" accuracy is optimistically biased. Selecting on
# grid_search.best_score_ (cross-validated) would keep the test set untouched.

# Initialize best model tracking
best_model = None
best_accuracy = 0.0

# Train and evaluate each model
for name, model in models.items():
    print(f"Training {name}...")

    # Scale the features, then fit the model. The step name 'model' is what
    # the 'model__*' keys in the parameter grids refer to.
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', model)
    ])

    # Models without a grid fall back to an empty one
    model_params = params.get(name, {})

    # 5-fold cross-validated search over the grid, using all available cores
    grid_search = GridSearchCV(pipeline, model_params, cv=5, n_jobs=-1, verbose=0)
    grid_search.fit(X_train, y_train)

    # Predict on the test set and compute the accuracy once; it is reused for
    # both the report and the best-model comparison.
    y_pred = grid_search.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)

    # Print evaluation metrics
    print(f"{name} - Best Parameters: {grid_search.best_params_}")
    print(f"{name} - Best Score: {grid_search.best_score_}")
    print(f"{name} - Test Accuracy: {test_accuracy}")
    print(f"{name} - Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")
    print(f"{name} - Classification Report:\n{classification_report(y_test, y_pred)}")
    print('\n')

    # Strict '>' means the earliest model wins when accuracies tie
    if test_accuracy > best_accuracy:
        best_accuracy = test_accuracy
        best_model = grid_search.best_estimator_

# print the best model & accuracy
if best_model is None:
    # Reached when `models` is empty or no model scored above 0.0; without
    # this guard the lookup below would fail on None.
    print("No model was selected.")
else:
    print(f"The Best model is {best_model.named_steps['model']} with an accuracy of {best_accuracy*100}%")


# Training Random Forest...
# Random Forest - Best Parameters: {'model__max_depth': 20, 'model__min_samples_split': 5, 'model__n_estimators': 100}
# Random Forest - Best Score: 0.8484671302149177
# Random Forest - Test Accuracy: 0.8235294117647058
# Random Forest - Confusion Matrix:
# [[83 18]
#  [15 71]]
# Random Forest - Classification Report:
#               precision    recall  f1-score   support

#            0       0.85      0.82      0.83       101
#            1       0.80      0.83      0.81        86

#     accuracy                           0.82       187
#    macro avg       0.82      0.82      0.82       187
# weighted avg       0.82      0.82      0.82       187



# Training Gradient Boosting...
# Gradient Boosting - Best Parameters: {'model__learning_rate': 0.01, 'model__max_depth': 3, 'model__n_estimators': 200}
# Gradient Boosting - Best Score: 0.8378634639696585
# Gradient Boosting - Test Accuracy: 0.8235294117647058
# Gradient Boosting - Confusion Matrix:
# [[83 18]
#  [15 71]]
# Gradient Boosting - Classification Report:
#               precision    recall  f1-score   support

#            0       0.85      0.82      0.83       101
#            1       0.80      0.83      0.81        86

#     accuracy                           0.82       187
#    macro avg       0.82      0.82      0.82       187
# weighted avg       0.82      0.82      0.82       187



# Training Logistic Regression...
# Logistic Regression - Best Parameters: {'model__C': 1, 'model__solver': 'lbfgs'}
# Logistic Regression - Best Score: 0.8378476611883692
# Logistic Regression - Test Accuracy: 0.8235294117647058
# Logistic Regression - Confusion Matrix:
# [[84 17]
#  [16 70]]
# Logistic Regression - Classification Report:
#               precision    recall  f1-score   support

#            0       0.84      0.83      0.84       101
#            1       0.80      0.81      0.81        86

#     accuracy                           0.82       187
#    macro avg       0.82      0.82      0.82       187
# weighted avg       0.82      0.82      0.82       187



# Training K-Nearest Neighbors...
# K-Nearest Neighbors - Best Parameters: {'model__n_neighbors': 5, 'model__weights': 'distance'}
# K-Nearest Neighbors - Best Score: 0.8164348925410871
# K-Nearest Neighbors - Test Accuracy: 0.8074866310160428
# K-Nearest Neighbors - Confusion Matrix:
# [[80 21]
#  [15 71]]
# K-Nearest Neighbors - Classification Report:
#               precision    recall  f1-score   support

#            0       0.84      0.79      0.82       101
#            1       0.77      0.83      0.80        86

#     accuracy                           0.81       187
#    macro avg       0.81      0.81      0.81       187
# weighted avg       0.81      0.81      0.81       187



# Training XG Boost...
# XG Boost - Best Parameters: {'model__learning_rate': 0.01, 'model__max_depth': 3, 'model__n_estimators': 100}
# XG Boost - Best Score: 0.8378950695322376
# XG Boost - Test Accuracy: 0.8235294117647058
# XG Boost - Confusion Matrix:
# [[84 17]
#  [16 70]]
# XG Boost - Classification Report:
#               precision    recall  f1-score   support

#            0       0.84      0.83      0.84       101
#            1       0.80      0.81      0.81        86

#     accuracy                           0.82       187
#    macro avg       0.82      0.82      0.82       187
# weighted avg       0.82      0.82      0.82       187



# The Best model is RandomForestClassifier(max_depth=20, min_samples_split=5, random_state=42) with an accuracy of 82.35294117647058%
# Training Stacking Classifier...
# Stacking Classifier - Test Ac

from sklearn.ensemble import StackingClassifier
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Stacked ensemble: a random forest and a logistic regression are the base
# learners, and an XGBoost classifier is the final estimator. The ensemble is
# fitted on the training split and evaluated on the test split.

# Defining the base models
base_estimators = [
    ('random_forest', RandomForestClassifier(
        random_state=42,
        n_estimators=300,
        max_depth=10,
        min_samples_split=2
    )),
    # The logistic regression is wrapped in the same scaler + model pipeline
    # used in the grid search. Its C/solver values were searched on
    # standardised features, so they should be applied to standardised
    # features here too, rather than relying on the ConvergenceWarning filter.
    ('logistic_regression', Pipeline([
        ('scaler', StandardScaler()),
        ('model', LogisticRegression(
            random_state=42,
            solver='lbfgs',
            C=1
        ))
    ]))
]


# Defining the final estimator (XGBoost)
final_estimator = XGBClassifier(
    random_state=42,
    n_estimators=200,
    learning_rate=0.1,
    max_depth=5
)

# Creating the Stacking Classifier: 5-fold CV for the internal predictions,
# using all available cores.
stacking_clf = StackingClassifier(
    estimators=base_estimators,
    final_estimator=final_estimator,
    cv=5,
    n_jobs=-1
)

# Fit the stacking classifier
print("Training Stacking Classifier...")
stacking_clf.fit(X_train, y_train)

# Evaluate the stacking classifier
y_pred_stack = stacking_clf.predict(X_test)

# Print evaluation metrics
print("Stacking Classifier - Test Accuracy:", accuracy_score(y_test, y_pred_stack))
print("Stacking Classifier - Confusion Matrix:\n", confusion_matrix(y_test, y_pred_stack))
print("Stacking Classifier - Classification Report:\n", classification_report(y_test, y_pred_stack))




