In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
import xgboost as xgb
from sklearn.base import BaseEstimator, ClassifierMixin

In [2]:
class ELMClassifier(BaseEstimator, ClassifierMixin):
    """Extreme Learning Machine for binary (0/1) classification.

    A single hidden layer with random, untrained input weights and a tanh
    activation is followed by a linear read-out whose weights are the
    least-squares solution obtained through the Moore-Penrose pseudo-inverse.

    Parameters
    ----------
    hidden_neurons : int, default=100
        Number of random hidden units.
    random_state : int, numpy.random.RandomState or None, default=None
        Source of randomness for the hidden layer. ``None`` draws from NumPy's
        global random stream (the historical behaviour); an int makes ``fit``
        reproducible.

    Attributes
    ----------
    input_weights : ndarray of shape (n_features, hidden_neurons)
    biases : ndarray of shape (hidden_neurons,)
    output_weights : ndarray
        Least-squares read-out weights, ``pinv(H) @ y``.
    classes_ : ndarray
        Sorted unique labels seen during ``fit``.

    Notes
    -----
    ``predict`` thresholds the raw read-out at 0.5 and returns 0/1 integers,
    so the training labels are expected to already be encoded as 0 and 1.
    """

    def __init__(self, hidden_neurons=100, random_state=None):
        self.hidden_neurons = hidden_neurons
        self.random_state = random_state

    def _hidden_layer(self, X):
        """Return the tanh activations of the random hidden layer for ``X``."""
        return np.tanh(np.dot(X, self.input_weights) + self.biases)

    def fit(self, X, y):
        """Draw the random hidden layer and solve for the read-out weights.

        Returns
        -------
        self
        """
        X = np.asarray(X)
        y = np.asarray(y)

        # Pick the random source: global stream when unseeded (unchanged
        # legacy behaviour), otherwise a private, seeded generator.
        if self.random_state is None:
            rng = np.random
        elif isinstance(self.random_state, np.random.RandomState):
            rng = self.random_state
        else:
            rng = np.random.RandomState(self.random_state)

        input_size = X.shape[1]
        self.input_weights = rng.normal(size=[input_size, self.hidden_neurons])
        self.biases = rng.normal(size=[self.hidden_neurons])
        self.classes_ = np.unique(y)

        H = self._hidden_layer(X)
        # Minimum-norm least-squares solution of H @ w = y.
        self.output_weights = np.dot(np.linalg.pinv(H), y)
        return self

    def predict(self, X):
        """Return 0/1 predictions by thresholding the linear read-out at 0.5."""
        predictions = np.dot(self._hidden_layer(np.asarray(X)), self.output_weights)
        return (predictions > 0.5).astype(int)

In [3]:
# Registry of the CSV datasets to benchmark. Each entry names the file to load,
# the feature columns, the target column and, optionally, an 'adjust_target'
# flag that tells the evaluation loop to subtract 1 from the labels.
datasets_info = dict(
    heart_failure_clinical_records_dataset=dict(
        filename='heart_failure_clinical_records_dataset.csv',
        features=[
            'age', 'anaemia', 'creatinine_phosphokinase', 'diabetes',
            'ejection_fraction', 'high_blood_pressure', 'platelets',
            'serum_creatinine', 'serum_sodium', 'sex', 'smoking', 'time',
        ],
        target='DEATH_EVENT',
    ),
    statlog=dict(
        filename='statlog.csv',
        features=[
            'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
            'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal',
        ],
        target='presence',
        adjust_target=True,  # Specific to statlog dataset
    ),
    heart=dict(
        filename='heart.csv',
        features=[
            'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
            'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal',
        ],
        target='target',
    ),
)

In [4]:
# Candidate classifiers, keyed by the display name used in the result tables.
# The evaluation loop fits clones or tuned copies of these, so the entries here
# act as the untuned baseline configuration.
models = {
    'ELM': ELMClassifier(hidden_neurons=100),
    'RandomForest': RandomForestClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    # The notebook treats every dataset as a binary problem, so use the binary
    # log-loss metric ('mlogloss' is the multiclass variant).
    'XGBoost': xgb.XGBClassifier(eval_metric='logloss', random_state=42),
    'Bagging': BaggingClassifier(random_state=42)
}

In [5]:
# Hyperparameter search space per model name: only the ensemble size
# ('n_estimators') is tuned. Models without an entry here are not grid-searched.
param_grids = {
    model_name: {'n_estimators': list(ensemble_sizes)}
    for model_name, ensemble_sizes in (
        ('RandomForest', (10, 50, 100, 200, 500)),
        ('AdaBoost', (50, 100, 200, 500)),
        ('XGBoost', (50, 100, 200, 500)),
        ('Bagging', (10, 50, 100, 200, 500)),
    )
}


In [6]:
# Benchmark every model on every dataset:
#   1. load the CSV and split 80/20 into train/test,
#   2. standardize features using statistics from the training split,
#   3. report 10-fold cross-validated accuracy of the untuned models,
#   4. grid-search 'n_estimators' for the models listed in `param_grids`,
#   5. score the tuned (or, without a grid, untuned) models on the test split.
for dataset_name, dataset_info in datasets_info.items():
    print(f"\nProcessing dataset: {dataset_name}")

    # Load dataset
    df = pd.read_csv(dataset_info['filename'])
    features = dataset_info['features']
    target = dataset_info['target']

    X = df[features].values
    y = df[target].values

    # Datasets flagged with 'adjust_target' get their labels shifted down by one
    if dataset_info.get('adjust_target', False):
        y = y - 1

    # Split the dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Standardize the data (fit on the training split only, then reuse for test)
    # NOTE(review): the scaler is fit on the whole training split before
    # cross-validation, so CV folds share scaling statistics; wrap scaler and
    # model in a Pipeline if strict per-fold isolation is required.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Perform cross-validation for accuracy (cross_val_score fits clones, so
    # the estimators stored in `models` are left untouched)
    results = {}
    for model_name, model in models.items():
        scores = cross_val_score(model, X_train, y_train, cv=10, scoring='accuracy')
        results[model_name] = scores.mean()

    print("Cross-validation results:")
    print(pd.DataFrame(results, index=["Accuracy"]).T)

    # Perform GridSearchCV for hyperparameter tuning. Only models that have an
    # entry in `param_grids` are tuned.
    best_params = {}
    best_estimators = {}
    for model_name, param_grid in param_grids.items():
        grid_search = GridSearchCV(models[model_name], param_grid, cv=10, scoring='accuracy')
        grid_search.fit(X_train, y_train)
        best_params[model_name] = grid_search.best_params_
        # With the default refit=True this is a clone already refit on the full
        # training split using the best parameters.
        best_estimators[model_name] = grid_search.best_estimator_

    print("\nBest parameters found:")
    print(pd.DataFrame(best_params))

    # Evaluate models with best parameters.
    # The tuned clones are used instead of calling set_params() on the shared
    # `models` entries: mutating those would leak this dataset's tuned
    # hyperparameters into the baseline runs of every later dataset.
    final_results = {}
    for model_name, model in models.items():
        if model_name in best_estimators:
            fitted_model = best_estimators[model_name]
        else:
            model.fit(X_train, y_train)
            fitted_model = model
        y_pred = fitted_model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred)
        final_results[model_name] = {
            "Accuracy": accuracy,
            "Classification Report": report
        }

    # Print final evaluation results
    for model_name, result in final_results.items():
        print(f"\nAccuracy of {model_name}: {result['Accuracy'] * 100:.2f}%")
        print("Classification Report:")
        print(result['Classification Report'])


Processing dataset: heart_failure_clinical_records_dataset
Cross-validation results:
              Accuracy
ELM           0.740761
RandomForest  0.874638
AdaBoost      0.841123
XGBoost       0.857971
Bagging       0.816304

Best parameters found:
              RandomForest  AdaBoost  XGBoost  Bagging
n_estimators            10        50      500       50

Accuracy of ELM: 68.33%
Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.83      0.75        35
           1       0.67      0.48      0.56        25

    accuracy                           0.68        60
   macro avg       0.68      0.65      0.66        60
weighted avg       0.68      0.68      0.67        60


Accuracy of RandomForest: 68.33%
Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.89      0.77        35
           1       0.71      0.40      0.51        25

    accuracy                           0.68   