In [5]:
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, accuracy_score, f1_score, matthews_corrcoef
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris, load_wine, fetch_openml
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [6]:
# Define datasets
datasets = [
    ("Iris", load_iris),
    ("WBDC (Breast Cancer Wisconsin Diagnostic)", lambda: fetch_openml(data_id=1510)),
    ("Spambase", lambda: fetch_openml(name="spambase", version=1)),
    ("Heart", lambda: fetch_openml(data_id=533)),
    ("Glass", lambda: fetch_openml(name="glass", version=2)),
    ("WBC (Breast Cancer Wisconsin Original)", lambda: fetch_openml(data_id=15)),
    ("Ionosphere", lambda: fetch_openml(name="ionosphere", version=1)),
    ("Arrhythmia", lambda: fetch_openml(name="arrhythmia", version=1)),
    ("Multiple Features", lambda: fetch_openml(name="mfeat-factors", version=1)),
    ("Australian", lambda: fetch_openml(name="australian")),
    ("German Number (Credit Dataset)", lambda: fetch_openml(name="credit-g", version=1)),
    ("DNA", lambda: fetch_openml(name="dna", version=1)),
    ("Wine", load_wine),
    ("Vehicle", lambda: fetch_openml(name="vehicle", version=2)),
    ("Waveform", lambda: fetch_openml(name="waveform-5000", version=2)),
    ("Zoo", lambda: fetch_openml(name="zoo", version=2)),
    ("Hillvalley", lambda: fetch_openml(name="hill-valley", version=2)),
    ("Sonar", lambda: fetch_openml(name="sonar", version=1)),
    ("Musk 1", lambda: fetch_openml(name="musk", version=1))
]


In [7]:


def Baseline(estimator, fold=10, X=None, y=None, scoring='accuracy', param_grid=None, dataset_name="dataset"):
    scoring_functions = {
        'accuracy': make_scorer(accuracy_score),
        'f1_score': make_scorer(f1_score, average='weighted'),
        'matthews_corrcoef': make_scorer(matthews_corrcoef)
    }

    if scoring not in scoring_functions:
        raise ValueError(f"Scoring method '{scoring}' is not valid. Choose 'accuracy', 'matthews_corrcoef', or 'f1_score'.")

    # Create preprocessing steps for all features
    preprocessor = ColumnTransformer(
        transformers=[('onehot', OneHotEncoder(handle_unknown='ignore'), X.columns)]
    )

    # Create a pipeline with the preprocessor and the estimator
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', estimator)])

    # Hyperparameter tuning with GridSearchCV if param_grid is provided
    if param_grid:
        cv = StratifiedKFold(n_splits=fold, shuffle=True, random_state=42)
        grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring=scoring_functions[scoring], error_score='raise')
        grid_search.fit(X, y)
        best_estimator = grid_search.best_estimator_
        best_params = grid_search.best_params_
    else:
        pipeline.fit(X, y)
        best_estimator = pipeline
        best_params = {}

    # Perform cross-validation with the best estimator
    scores = cross_val_score(best_estimator, X, y, cv=fold, scoring=scoring_functions[scoring])

    # Return results in specified format
    result = {
        'Framework': 'KNeighborsClassifier',
        'Dataset': dataset_name,
        'AUC': scores.mean(),
        'AUC_STD': scores.std()
    }

    return result


In [18]:
# Example usage: Load datasets and run Baseline function
param_grid = {
    'classifier__n_neighbors': [3, 5, 7],
    'classifier__weights': ['uniform', 'distance']
}
results = []

for dataset_name, loader in datasets:
    try:
        # Load data
        data = loader()
        X, y = (data.data, data.target) if hasattr(data, 'data') else (data['data'], data['target'])

        # Convert X to DataFrame for compatibility
        X = pd.DataFrame(X)

        # Run baseline evaluation
        result = Baseline(estimator=KNeighborsClassifier(), fold=10, X=X, y=y, scoring='accuracy', param_grid=param_grid, dataset_name=dataset_name)
        results.append(result)

    except Exception as e:
        print(f"Error with dataset {dataset_name}: {e}")
#Print all results
for res in results:
    print(res)



{'Framework': 'KNeighborsClassifier', 'Dataset': 'Iris', 'AUC': 0.8533333333333335, 'AUC_STD': 0.07180219742846006}
{'Framework': 'KNeighborsClassifier', 'Dataset': 'WBDC (Breast Cancer Wisconsin Diagnostic)', 'AUC': 0.6854323308270677, 'AUC_STD': 0.04093269008836167}
{'Framework': 'KNeighborsClassifier', 'Dataset': 'Spambase', 'AUC': 0.7989625577666699, 'AUC_STD': 0.024622408860984428}
{'Framework': 'KNeighborsClassifier', 'Dataset': 'Heart', 'AUC': 0.8191883116883115, 'AUC_STD': 0.045507263028688995}
{'Framework': 'KNeighborsClassifier', 'Dataset': 'Glass', 'AUC': 0.6683982683982684, 'AUC_STD': 0.06734211156108369}
{'Framework': 'KNeighborsClassifier', 'Dataset': 'WBC (Breast Cancer Wisconsin Original)', 'AUC': 0.95, 'AUC_STD': 0.03683941988065038}
{'Framework': 'KNeighborsClassifier', 'Dataset': 'Ionosphere', 'AUC': 0.6955555555555556, 'AUC_STD': 0.1269956343992288}
{'Framework': 'KNeighborsClassifier', 'Dataset': 'Arrhythmia', 'AUC': 0.5774396135265699, 'AUC_STD': 0.014886832236702

In [19]:
from tabulate import tabulate  # Ensure you have installed the library: pip install tabulate

# Generate summary table
df_results = pd.DataFrame(results)

# Pivot table for the desired format
summary_table = df_results.pivot(index='Dataset', columns='Framework', values=['AUC', 'AUC_STD'])

# Flatten MultiIndex for clean column names
summary_table.columns = [f'{framework}_{metric}' for metric, framework in summary_table.columns]

# Replace NaN values with placeholders (e.g., empty or 0.0)
summary_table = summary_table.fillna('')

# Display the summary table in a nicely formatted way
print(tabulate(summary_table, headers='keys', tablefmt='grid'))

# Save the summary table to a CSV file
summary_table.to_csv("baseline_results.csv")


+-------------------------------------------+----------------------------+--------------------------------+
| Dataset                                   |   KNeighborsClassifier_AUC |   KNeighborsClassifier_AUC_STD |
| Arrhythmia                                |                   0.57744  |                     0.0148868  |
+-------------------------------------------+----------------------------+--------------------------------+
| Australian                                |                   0.856522 |                     0.0351729  |
+-------------------------------------------+----------------------------+--------------------------------+
| DNA                                       |                   0.836489 |                     0.0176211  |
+-------------------------------------------+----------------------------+--------------------------------+
| German Number (Credit Dataset)            |                   0.721    |                     0.0314484  |
+---------------------------