In [4]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score, f1_score, matthews_corrcoef
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.datasets import load_wine
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, accuracy_score, f1_score, matthews_corrcoef
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

def Baseline(estimator, fold=10, X=None, y=None, scoring='accuracy'):
    """
    Perform k-fold cross-validation on a dataset with dynamic baseline model selection.

    Parameters:
        estimator: Machine learning model or a choice of baseline model if 'auto'.
        fold (int): Number of folds for cross-validation.
        X (ndarray): Feature matrix.
        y (ndarray): Target vector.
        scoring (str): The scoring metric. Options: 'accuracy', 'matthews_corrcoef', 'f1_score'.

    Returns:
        float: The mean cross-validation score across the folds.
    """
    # Preprocess the dataset: Scale the features to handle diverse ranges
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Define scoring methods
    scoring_functions = {
        'accuracy': make_scorer(accuracy_score),
        'f1_score': make_scorer(f1_score, average='weighted'),
        'matthews_corrcoef': make_scorer(matthews_corrcoef)
    }

    # Ensure a valid scoring method is used
    if scoring not in scoring_functions:
        raise ValueError(f"Invalid scoring method '{scoring}'. Use 'accuracy', 'matthews_corrcoef', or 'f1_score'.")

    # Default baseline model selection based on dataset properties
    if estimator == 'auto':
        if X.shape[1] < 20:  # Low-dimensional data
            estimator = KNeighborsClassifier(n_neighbors=5)
        elif X.shape[1] < 100:  # Medium-dimensional data
            estimator = RandomForestClassifier(n_estimators=100, random_state=42)
        else:  # High-dimensional data
            estimator = LogisticRegression(max_iter=500, random_state=42)

    # Perform stratified cross-validation
    cv = StratifiedKFold(n_splits=fold, shuffle=True, random_state=42)
    scores = cross_val_score(estimator, X_scaled, y, cv=cv, scoring=scoring_functions[scoring])

    # Calculate and print the mean score
    mean_score = scores.mean()
    print(f"Baseline {scoring} score over {fold}-fold CV: {mean_score:.4f}")
    return mean_score


In [14]:
# Load a sample dataset
data = load_iris()
X, y = data.data, data.target

# Evaluate the baseline model using 10-fold cross-validation with accuracy scoring
Baseline(estimator, fold=10, X=X, y=y, scoring='accuracy')

# Evaluate with F1-score
Baseline(estimator, fold=10, X=X, y=y, scoring='f1_score')

# Evaluate with Matthews correlation coefficient
Baseline(estimator, fold=10, X=X, y=y, scoring='matthews_corrcoef')


Baseline accuracy score over 10-fold CV: 0.9533
Baseline f1_score score over 10-fold CV: 0.9520
Baseline matthews_corrcoef score over 10-fold CV: 0.9363


0.9362634927202874

In [13]:
# Load a sample dataset
data = load_wine()
X, y = data.data, data.target

# Evaluate the baseline model using 10-fold cross-validation with accuracy scoring
Baseline(estimator, fold=10, X=X, y=y, scoring='accuracy')

# Evaluate with F1-score
Baseline(estimator, fold=10, X=X, y=y, scoring='f1_score')

# Evaluate with Matthews correlation coefficient
Baseline(estimator, fold=10, X=X, y=y, scoring='matthews_corrcoef')

Baseline accuracy score over 10-fold CV: 0.9722
Baseline f1_score score over 10-fold CV: 0.9717
Baseline matthews_corrcoef score over 10-fold CV: 0.9616


0.9615828975793683

In [12]:
# Load a sample dataset
from sklearn.datasets import fetch_openml

# Load Vehicle dataset
vehicle_data = fetch_openml(name='vehicle', version=1, as_frame=True)
X = vehicle_data.data
y = vehicle_data.target

# Evaluate the baseline model using 10-fold cross-validation with accuracy scoring
Baseline(estimator, fold=10, X=X, y=y, scoring='accuracy')

# Evaluate with F1-score
Baseline(estimator, fold=10, X=X, y=y, scoring='f1_score')

# Evaluate with Matthews correlation coefficient
Baseline(estimator, fold=10, X=X, y=y, scoring='matthews_corrcoef')

Baseline accuracy score over 10-fold CV: 0.7198
Baseline f1_score score over 10-fold CV: nan
Baseline matthews_corrcoef score over 10-fold CV: 0.6289


Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 139, in __call__
    score = scorer._score(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 371, in _score
    y_pred = method_caller(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 89, in _cached_call
    result, _ = _get_response_values(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_response.py", line 204, in _get_response_values
    raise ValueError(
ValueError: pos_label=1 is not a valid label: It should be one of ['bus' 'opel' 'saab' 'van']

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 139, in __call__
    score = scorer._score(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 371, in _score
    y_pred = method_caller(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metr

0.6288526215780331