In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris, fetch_california_housing
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVC, SVR
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
import warnings

# Suppress warnings
warnings.filterwarnings('ignore')

def compare_models(X, y, task='classification', param_grids=None):
    # Check if task is valid
    if task not in ['classification', 'regression']:
        raise ValueError("Invalid task. Choose 'classification' or 'regression'.")

    # Split the data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Standardize the data
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)

    # Define models
    if task == 'classification':
        models = {
            'RandomForest': RandomForestClassifier(),
            'GradientBoosting': GradientBoostingClassifier(),
            'SVM': SVC(),
            'LogisticRegression': LogisticRegression(max_iter=1000),
            'DecisionTree': DecisionTreeClassifier(),
            'KNN': KNeighborsClassifier(),
            'Ridge': Ridge(),
            'Lasso': Lasso()
        }
    else:
        models = {
            'RandomForest': RandomForestRegressor(),
            'GradientBoosting': GradientBoostingRegressor(),
            'SVM': SVR(),
            'LinearRegression': LinearRegression(),
            'DecisionTree': DecisionTreeRegressor(),
            'KNN': KNeighborsRegressor(),
            'Ridge': Ridge(),
            'Lasso': Lasso()
        }

    baseline_results = []

    # Evaluate baseline models
    print("Evaluating baseline models...")
    for model_name in tqdm(models, desc="Baseline Models"):
        model = models[model_name]
        if task == 'classification':
            scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
            baseline_results.append({
                'Model': model_name,
                'Mean Score': scores.mean()
            })
        else:
            scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
            baseline_results.append({
                'Model': model_name,
                'Mean Score': -scores.mean()
            })

    # Create a DataFrame with the baseline results
    baseline_results_df = pd.DataFrame(baseline_results)
    print("\nBaseline Results:\n", baseline_results_df)

    # Select the best model based on the baseline results
    if task == 'classification':
        best_model_name = baseline_results_df.loc[baseline_results_df['Mean Score'].idxmax()]['Model']
    else:
        best_model_name = baseline_results_df.loc[baseline_results_df['Mean Score'].idxmin()]['Model']
    
    best_model = models[best_model_name]

    print(f"\nBest Model: {best_model_name}")

    # Hyperparameter tuning for the best model
    if param_grids and best_model_name in param_grids:
        print(f"\nPerforming hyperparameter tuning for {best_model_name}...")
        grid_search = GridSearchCV(best_model, param_grids[best_model_name], cv=5, scoring='accuracy' if task == 'classification' else 'neg_mean_squared_error', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_

        print(f"Best Parameters for {best_model_name}: {grid_search.best_params_}")

    # Evaluate the best model
    y_pred = best_model.predict(X_val)
    if task == 'classification':
        final_score = accuracy_score(y_val, y_pred)
        print(f"\nFinal Accuracy of the best model ({best_model_name}): {final_score}")
    else:
        final_score = mean_squared_error(y_val, y_pred)
        print(f"\nFinal Mean Squared Error of the best model ({best_model_name}): {final_score}")

    return best_model, baseline_results_df

# Example parameter grids
param_grids = {
    'RandomForest': {
        'n_estimators': [50, 100, 150],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'GradientBoosting': {
        'n_estimators': [50, 100, 150],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'SVM': {
        'C': [0.1, 1, 10, 100],
        'gamma': [1, 0.1, 0.01, 0.001],
        'kernel': ['linear', 'rbf']
    },
    'LogisticRegression': {
        'C': [0.1, 1, 10, 100],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear', 'saga']
    },
    'LinearRegression': {},
    'DecisionTree': {
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'KNN': {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance']
    }
}




In [2]:
# Load dataset for testing
housing = fetch_california_housing()
X_regression, y_regression = housing.data, housing.target

# Run the comparison for regression
print("\n=== Regression Task ===")
best_model_regression, baseline_results_df_regression = compare_models(X_regression, y_regression, task='regression', param_grids=param_grids)



=== Regression Task ===
Evaluating baseline models...


Baseline Models: 100%|██████████| 8/8 [01:55<00:00, 14.49s/it]


Baseline Results:
               Model  Mean Score
0      RandomForest    0.262886
1  GradientBoosting    0.285267
2               SVM    0.351828
3  LinearRegression    0.519265
4      DecisionTree    0.531783
5               KNN    0.422986
6             Ridge    0.519265
7             Lasso    1.336930

Best Model: RandomForest

Performing hyperparameter tuning for RandomForest...





Best Parameters for RandomForest: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 150}

Final Mean Squared Error of the best model (RandomForest): 0.25160074448312497


In [3]:

# Load dataset for testing
iris = load_iris()
X_classification, y_classification = iris.data, iris.target

# Run the comparison for classification
print("\n=== Classification Task ===")
best_model_classification, baseline_results_df_classification = compare_models(X_classification, y_classification, task='classification', param_grids=param_grids)


=== Classification Task ===
Evaluating baseline models...


Baseline Models: 100%|██████████| 8/8 [00:02<00:00,  3.61it/s]



Baseline Results:
                 Model  Mean Score
0        RandomForest    0.950000
1    GradientBoosting    0.950000
2                 SVM    0.950000
3  LogisticRegression    0.958333
4        DecisionTree    0.950000
5                 KNN    0.925000
6               Ridge         NaN
7               Lasso         NaN

Best Model: LogisticRegression

Performing hyperparameter tuning for LogisticRegression...
Best Parameters for LogisticRegression: {'C': 1, 'penalty': 'l2', 'solver': 'saga'}

Final Accuracy of the best model (LogisticRegression): 1.0


