In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
data = pd.read_csv("Heart.csv")

# Drop the 'Unnamed: 0' column if it exists
if 'Unnamed: 0' in data.columns:
    data = data.drop(columns='Unnamed: 0')

# Define features and target
X = data.drop(columns='AHD')
y = data['AHD'].apply(lambda x: 1 if x == 'Yes' else 0)

# Identify numeric and categorical columns
numeric_features = ['Age', 'RestBP', 'Chol', 'RestECG', 'MaxHR', 'Oldpeak', 'Ca']
categorical_features = ['Sex', 'ChestPain', 'Fbs', 'ExAng', 'Slope', 'Thal']

# Create preprocessing pipelines
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler())  # Use MinMaxScaler to ensure non-negative values for chi2
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', drop='first'))  # Drop='first' to avoid dummy variable trap
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Define models
models = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'GradientBoosting': GradientBoostingClassifier(random_state=42),
    'SVC': SVC(random_state=42)
}

# Create a pipeline that includes preprocessing and model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(score_func=chi2)),
    ('model', RandomForestClassifier(random_state=42))  # Default model
])

# Define parameter grid for GridSearchCV
param_grid = [
    {
        'feature_selection__k': [10, 12, 13],  # k should not be more than number of features
        'model': [RandomForestClassifier(random_state=42)],
        'model__n_estimators': [50, 100],  # Applicable for RandomForest and GradientBoosting
        'model__max_depth': [10, 20, None]  # Applicable for RandomForest and GradientBoosting
    },
    {
        'feature_selection__k': [10, 12, 13],
        'model': [GradientBoostingClassifier(random_state=42)],
        'model__n_estimators': [50, 100],
        'model__max_depth': [10, 20, None]
    },
    {
        'feature_selection__k': [10, 12, 13],
        'model': [SVC(random_state=42)],
        'model__C': [0.1, 1, 10],  # Applicable for SVC
        'model__kernel': ['linear', 'rbf']  # Applicable for SVC
    }
]

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create GridSearchCV object
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)

# Fit grid search
try:
    grid_search.fit(X_train, y_train)
    
    # Print best parameters and best score
    print("Best parameters found: ", grid_search.best_params_)
    print("Best score: ", grid_search.best_score_)

    # Predict on the test set
    y_pred = grid_search.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print("Test Accuracy: ", accuracy)
    print("Classification Report:\n", classification_report(y_test, y_pred))

    # Analyze feature importance
    if hasattr(grid_search.best_estimator_.named_steps['model'], 'feature_importances_'):
        best_model = grid_search.best_estimator_.named_steps['model']
        feature_importances = best_model.feature_importances_

        # Get feature names
        encoder = grid_search.best_estimator_.named_steps['preprocessor'].named_transformers_['cat'].named_steps['onehot']
        encoded_features = encoder.get_feature_names_out(categorical_features)
        all_features = np.append(numeric_features, encoded_features)

        # Create a DataFrame for feature importances
        feature_importance_df = pd.DataFrame({
            'Feature': all_features,
            'Importance': feature_importances
        }).sort_values(by='Importance', ascending=False)

        print(feature_importance_df)
    else:
        print("Best model does not support feature importances.")
except ValueError as e:
    print(f"ValueError: {e}")
    # Additional debug info
    print(f"X_train shape: {X_train.shape}")
    print(f"Features in X_train: {X_train.columns}")
except AttributeError as e:
    print(f"AttributeError: {e}")


Fitting 5 folds for each of 54 candidates, totalling 270 fits
Best parameters found:  {'feature_selection__k': 12, 'model': RandomForestClassifier(random_state=42), 'model__max_depth': 20, 'model__n_estimators': 50}
Best score:  0.8394241417497232
Test Accuracy:  0.7582417582417582
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.75      0.77        48
           1       0.73      0.77      0.75        43

    accuracy                           0.76        91
   macro avg       0.76      0.76      0.76        91
weighted avg       0.76      0.76      0.76        91

ValueError: All arrays must be of the same length
X_train shape: (212, 13)
Features in X_train: Index(['Age', 'Sex', 'ChestPain', 'RestBP', 'Chol', 'Fbs', 'RestECG', 'MaxHR',
       'ExAng', 'Oldpeak', 'Slope', 'Ca', 'Thal'],
      dtype='object')
