In [1]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score, f1_score, matthews_corrcoef
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.datasets import load_wine
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

In [2]:
def Baseline(estimator, fold=10, X=None, y=None, scoring='accuracy', param_grid=None, dataset_name="dataset"):
    """
    Perform k-fold cross-validation on the given estimator and dataset, with optional hyperparameter tuning.

    Parameters:
        estimator: The machine learning model to be used for training.
        fold (int): The number of folds for cross-validation.
        X (ndarray): Feature matrix.
        y (ndarray): Target vector.
        scoring (str): The scoring method to be used. Options: 'accuracy', 'matthews_corrcoef', 'f1_score'.
        param_grid (dict): Dictionary with hyperparameters to tune if hyperparameter tuning is desired.
        dataset_name (str): Name of the dataset (for saving in CSV).

    Returns:
        float: The mean cross-validation score across the folds for the best model found.
    """
    # Map the scoring parameter to the actual scoring function
    scoring_functions = {
        'accuracy': make_scorer(accuracy_score),
        'f1_score': make_scorer(f1_score, average='weighted'),
        'matthews_corrcoef': make_scorer(matthews_corrcoef)
    }

    if scoring not in scoring_functions:
        raise ValueError(f"Scoring method '{scoring}' is not valid. Choose 'accuracy', 'matthews_corrcoef', or 'f1_score'.")

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    if param_grid:
        cv = StratifiedKFold(n_splits=fold, shuffle=True, random_state=42)
        grid_search = GridSearchCV(estimator, param_grid, cv=cv, scoring=scoring_functions[scoring])
        grid_search.fit(X_scaled, y)
        best_estimator = grid_search.best_estimator_
        print(f"Best parameters found: {grid_search.best_params_}")
    else:
        best_estimator = estimator

    # Perform cross-validation with the best estimator
    scores = cross_val_score(best_estimator, X_scaled, y, cv=fold, scoring=scoring_functions[scoring])

    # Calculate mean and standard deviation
    mean_score = scores.mean()
    std_dev_score = scores.std()
    print(f"Baseline {scoring} score over {fold}-fold CV with best parameters: {mean_score:.4f} ± {std_dev_score:.4f}")

    # Prepare results for each fold in separate rows for CSV
    results = {
        'Dataset': [dataset_name] * fold,
        'Fold': [f"Fold {i+1}" for i in range(fold)],
        'Score': scores
    }

    # Add summary rows for mean and standard deviation
    summary = {
        'Dataset': [dataset_name, dataset_name],
        'Fold': ['Mean', 'Standard Deviation'],
        'Score': [mean_score, std_dev_score]
    }

    # Combine fold results with summary results
    results_df = pd.DataFrame(results)
    summary_df = pd.DataFrame(summary)
    final_df = pd.concat([results_df, summary_df], ignore_index=True)

    # Save final DataFrame to CSV
    csv_filename = f"{dataset_name}_cross_validation_results.csv"
    final_df.to_csv(csv_filename, index=False)
    print(f"Results saved to {csv_filename}")

    return mean_score


In [3]:
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier

# Load dataset
data = load_iris()
X, y = data.data, data.target

# Define estimator and parameter grid
estimator = KNeighborsClassifier()
param_grid = {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance']}

# Run Baseline function with dataset name
Baseline(estimator=estimator, fold=10, X=X, y=y, scoring='accuracy', param_grid=param_grid, dataset_name="iris")


Best parameters found: {'n_neighbors': 7, 'weights': 'distance'}
Baseline accuracy score over 10-fold CV with best parameters: 0.9600 ± 0.0442
Results saved to iris_cross_validation_results.csv


0.96

In [4]:
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier

# Load dataset
data = load_wine()
X, y = data.data, data.target

# Define estimator and parameter grid
estimator = KNeighborsClassifier()
param_grid = {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance']}

# Run Baseline function with dataset name
Baseline(estimator=estimator, fold=10, X=X, y=y, scoring='accuracy', param_grid=param_grid, dataset_name="wine")


Best parameters found: {'n_neighbors': 5, 'weights': 'uniform'}
Baseline accuracy score over 10-fold CV with best parameters: 0.9605 ± 0.0259
Results saved to wine_cross_validation_results.csv


0.9604575163398692

In [5]:
# Load a sample dataset
from sklearn.datasets import fetch_openml

# Load Vehicle dataset
vehicle_data = fetch_openml(name='vehicle', version=1, as_frame=True)
X = vehicle_data.data
y = vehicle_data.target

# Define estimator and parameter grid
estimator = KNeighborsClassifier()
param_grid = {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance']}

# Run Baseline function with dataset name
Baseline(estimator=estimator, fold=10, X=X, y=y, scoring='accuracy', param_grid=param_grid, dataset_name="Vehicle_data")


Best parameters found: {'n_neighbors': 5, 'weights': 'distance'}
Baseline accuracy score over 10-fold CV with best parameters: 0.7176 ± 0.0369
Results saved to Vehicle_data_cross_validation_results.csv


0.7175630252100841