In [None]:
# Prompt for files from the local machine via Colab's upload helper.
# Per the recorded output of this cell, the chosen files are saved under their original names.
# NOTE(review): `uploaded` presumably maps each file name to its contents — confirm against the Colab docs.
from google.colab import files
uploaded = files.upload()

Saving ptbxl_database.csv to ptbxl_database.csv
Saving scp_statements.csv to scp_statements.csv


In [None]:
# Second upload prompt (the recorded output shows .dat/.hea files being saved).
# NOTE(review): this rebinds `uploaded`, so the value returned by the earlier upload cell
# is no longer reachable under that name; the import is also repeated from that cell.
from google.colab import files
uploaded = files.upload()

Saving 01000_lr.dat to 01000_lr.dat
Saving 01000_lr.hea to 01000_lr.hea
Saving 01001_lr.dat to 01001_lr.dat


In [10]:
# a4
"""
modular_regression_with_random_search.py

- Loads a regression dataset (diabetes by default)
- Splits and scales data
- Defines several regressors + hyperparameter search spaces
- Tunes with RandomizedSearchCV (robust to errors)
- Evaluates models on Train & Test with MSE, RMSE, MAE, R2, MAPE
- Returns a pandas DataFrame summarising results
"""

import numpy as np
import pandas as pd
import warnings
from pprint import pprint

from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor

from scipy.stats import randint, uniform

# Ignore every warning category from here on (presumably to keep search/fit output short).
# NOTE(review): this is process-wide, so warnings raised by any later cell are hidden as well.
warnings.filterwarnings("ignore")


# ---------------- Utility functions ----------------
def load_dataset(kind="diabetes"):
    """
    Fetch a regression dataset by name.

    Only "diabetes" (scikit-learn's bundled diabetes data) is wired up;
    extend this function to read your own CSV.

    Returns: (X, y) taken from the loader's ``data`` and ``target`` attributes.
    Raises: ValueError for any other ``kind``.
    """
    # Reject unsupported names up front so the happy path stays flat.
    if kind != "diabetes":
        raise ValueError("Unknown dataset kind. Add loader code here.")

    bunch = load_diabetes()
    return bunch.data, bunch.target


def split_data(X, y, test_size=0.2, random_state=42):
    """
    Split features and targets into train and test partitions.

    Thin wrapper around ``train_test_split``; its result is returned
    unchanged and is unpacked by the caller as
    ``X_train, X_test, y_train, y_test``.
    """
    partitions = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
    )
    return partitions


def scale_data(X_train, X_test):
    """
    Standardise both splits using statistics learned from the training split only.

    Returns: (scaled_train, scaled_test, fitted_scaler)
    """
    scaler = StandardScaler()
    # Fit on train, then reuse the same fitted scaler for test.
    scaled_train = scaler.fit_transform(X_train)
    scaled_test = scaler.transform(X_test)
    return scaled_train, scaled_test, scaler


def safe_mape(y_true, y_pred, eps=1e-8):
    """
    Mean absolute percentage error, guarded against division by zero.

    Targets whose magnitude is below ``eps`` are divided by ``eps`` instead
    of by themselves. The result is expressed as a percentage.
    """
    actual = np.array(y_true, dtype=float)
    predicted = np.array(y_pred, dtype=float)

    # Swap near-zero targets for eps in the denominator only.
    near_zero = np.abs(actual) < eps
    divisor = np.where(near_zero, eps, actual)

    relative_error = np.abs((actual - predicted) / divisor)
    return np.mean(relative_error) * 100.0


def evaluate_regression(model, X_train, y_train, X_test, y_test):
    """
    Score a fitted model on the train and test splits.

    Returns a flat dict keyed ``<METRIC>_<Split>`` where METRIC is one of
    MSE, RMSE, MAE, R2, MAPE and Split is ``Train`` or ``Test``.
    """
    splits = {"Train": (X_train, y_train), "Test": (X_test, y_test)}

    scores = {}
    for split_name, (features, targets) in splits.items():
        predictions = model.predict(features)
        squared_error = mean_squared_error(targets, predictions)
        # RMSE is derived from the MSE rather than recomputed.
        scores.update({
            f"MSE_{split_name}": squared_error,
            f"RMSE_{split_name}": np.sqrt(squared_error),
            f"MAE_{split_name}": mean_absolute_error(targets, predictions),
            f"R2_{split_name}": r2_score(targets, predictions),
            f"MAPE_{split_name}": safe_mape(targets, predictions),
        })
    return scores


def tune_hyperparameters(model, param_distributions, X_train, y_train,
                         cv=3, n_iter=20, scoring="neg_mean_squared_error", random_state=42):
    """
    RandomizedSearchCV wrapper with a plain-fit fallback.

    Parameters:
        model: estimator exposing ``fit(X, y)``.
        param_distributions: search space passed to RandomizedSearchCV; when
            empty/falsy no search is run and ``model`` is fitted directly.
        X_train, y_train: training data.
        cv, n_iter, scoring, random_state: forwarded to RandomizedSearchCV.

    Returns:
        (best_estimator, best_params) when the search succeeds, otherwise
        (model, {}) with ``model`` fitted in place.

    Raises:
        RuntimeError: if the plain ``model.fit`` fails, whether it was the
            only step (empty search space) or the fallback after a failed
            search. The original exception is attached as ``__cause__``.
    """
    if not param_distributions:  # nothing to tune
        # Wrap failures the same way as the fallback path below, so callers
        # that skip a model on RuntimeError treat both paths alike.
        try:
            model.fit(X_train, y_train)
        except Exception as fit_err:
            raise RuntimeError(f"Model fit failed: {fit_err}") from fit_err
        return model, {}

    try:
        search = RandomizedSearchCV(
            estimator=model,
            param_distributions=param_distributions,
            n_iter=n_iter,
            cv=cv,
            scoring=scoring,
            random_state=random_state,
            n_jobs=-1,
            verbose=0,
        )
        search.fit(X_train, y_train)
        return search.best_estimator_, search.best_params_
    except Exception as search_err:
        # Deliberate best-effort: any search failure degrades to an untuned fit.
        print(f"[Warning] RandomizedSearchCV failed for {model.__class__.__name__}: {search_err}")
        print("Falling back to model.fit() with default params.")
        try:
            model.fit(X_train, y_train)
            return model, {}
        except Exception as fit_err:
            raise RuntimeError(f"Model fit also failed: {fit_err}") from fit_err


# ---------------- Models & hyperparameter spaces ----------------
def get_regressors_and_spaces(include_xgboost_if_available=True, include_catboost_if_available=True):
    """
    Build the catalogue of candidate regressors and their random-search spaces.

    Returns dict: {name: (estimator_instance, param_distributions_dict)}.
    An empty space means the estimator has nothing to tune.

    XGBoost and CatBoost entries are attempted only when the matching flag is
    true; if importing or constructing one raises, an info line is printed and
    that entry is left out.
    """
    models = {}

    # Plain least squares: empty search space.
    models["LinearRegression"] = (LinearRegression(), {})

    models["DecisionTree"] = (
        DecisionTreeRegressor(random_state=42),
        dict(
            max_depth=randint(1, 20),
            min_samples_split=randint(2, 20),
            min_samples_leaf=randint(1, 20),
        ),
    )

    models["RandomForest"] = (
        RandomForestRegressor(random_state=42),
        dict(
            n_estimators=randint(50, 200),
            max_depth=randint(3, 25),
            min_samples_split=randint(2, 20),
        ),
    )

    models["SVR"] = (
        SVR(),
        dict(
            C=uniform(loc=0.1, scale=100),
            epsilon=uniform(loc=0.001, scale=1),
            kernel=["rbf", "linear", "poly"],
        ),
    )

    models["KNN"] = (
        KNeighborsRegressor(),
        dict(
            n_neighbors=randint(1, 30),
            weights=["uniform", "distance"],
            p=[1, 2],
        ),
    )

    models["MLP"] = (
        MLPRegressor(max_iter=1000, random_state=42),
        dict(
            hidden_layer_sizes=[(50,), (100,), (100, 50)],
            activation=["relu", "tanh"],
            alpha=uniform(loc=1e-6, scale=1e-2),
            learning_rate_init=uniform(loc=1e-4, scale=1e-1),
        ),
    )

    models["AdaBoost"] = (
        AdaBoostRegressor(random_state=42),
        dict(
            n_estimators=randint(50, 200),
            learning_rate=uniform(loc=0.01, scale=1.0),
        ),
    )

    # Optional gradient-boosting backends: best-effort, never fatal.
    if include_xgboost_if_available:
        try:
            from xgboost import XGBRegressor
            xgb_space = dict(
                n_estimators=randint(50, 200),
                max_depth=randint(2, 10),
                learning_rate=uniform(loc=0.01, scale=0.3),
            )
            models["XGBoost"] = (XGBRegressor(random_state=42, verbosity=0), xgb_space)
            print("[Info] XGBoost found and included.")
        except Exception:
            print("[Info] xgboost not installed — skipping XGBoost.")

    if include_catboost_if_available:
        try:
            from catboost import CatBoostRegressor
            cat_space = dict(
                iterations=randint(50, 200),
                depth=randint(2, 10),
                learning_rate=uniform(loc=0.01, scale=0.3),
            )
            models["CatBoost"] = (CatBoostRegressor(random_state=42, verbose=0), cat_space)
            print("[Info] CatBoost found and included.")
        except Exception:
            print("[Info] catboost not installed — skipping CatBoost.")

    return models


# ---------------- Orchestration: run all models ----------------
def run_all_regressors(X, y,
                       test_size=0.2, random_state=42,
                       cv=3, n_iter=20,
                       include_xgboost=True, include_catboost=True):
    """
    End-to-end runner: split, scale, tune/fit every candidate, then tabulate.

    Returns (results_df, scaler): one row per successfully fitted model with
    its best parameters and train/test metrics, ordered by ascending
    RMSE_Test, plus the scaler fitted on the training split.
    Models whose fit raises RuntimeError are reported and skipped.
    """
    X_train, X_test, y_train, y_test = split_data(
        X, y, test_size=test_size, random_state=random_state
    )

    # Every model is trained on the standardised features.
    X_train_s, X_test_s, scaler = scale_data(X_train, X_test)

    candidates = get_regressors_and_spaces(
        include_xgboost_if_available=include_xgboost,
        include_catboost_if_available=include_catboost,
    )

    rows = []
    for name, (estimator, space) in candidates.items():
        print(f"\n--- Running: {name} ---")
        try:
            best_model, best_params = tune_hyperparameters(
                estimator, space, X_train_s, y_train,
                cv=cv, n_iter=n_iter,
                scoring="neg_mean_squared_error",
                random_state=random_state,
            )
        except RuntimeError as e:
            print(f"[Error] Skipping {name} due to fit failure: {e}")
            continue

        if best_params:
            print("Best params:", best_params)

        scores = evaluate_regression(best_model, X_train_s, y_train, X_test_s, y_test)
        # Keep the chosen parameters next to the metrics for the record.
        rows.append({"Model": name, "BestParams": best_params or {}, **scores})

    results_df = pd.DataFrame(rows)

    # Put identification columns first, then train metrics, then test metrics.
    preferred_order = ["Model", "BestParams"]
    for split_name in ("Train", "Test"):
        preferred_order.extend(
            f"{metric}_{split_name}" for metric in ("MSE", "RMSE", "MAE", "R2", "MAPE")
        )
    present = [col for col in preferred_order if col in results_df.columns]
    results_df = results_df[present]

    # Lower test RMSE first.
    if "RMSE_Test" in results_df.columns:
        results_df = results_df.sort_values("RMSE_Test").reset_index(drop=True)

    return results_df, scaler


# ---------------- Demo main ----------------
if __name__ == "__main__":
    # Demo entry point: fetch the bundled dataset (swap in your own loader here).
    X, y = load_dataset("diabetes")

    # Tune, fit and score every registered regressor.
    results_df, scaler = run_all_regressors(
        X, y,
        test_size=0.2,
        random_state=42,
        cv=3,
        n_iter=25,
    )

    # Raise pandas' column-width display option, then print the whole table.
    pd.set_option("display.max_colwidth", 200)
    print("\n=== Final Results (sorted by RMSE_Test) ===")
    summary_text = results_df.to_string(index=False)
    print(summary_text)

    # Persist the summary to a CSV in the working directory.
    results_df.to_csv("regression_model_results.csv", index=False)
    print("\nSaved results to regression_model_results.csv")


[Info] XGBoost found and included.
[Info] catboost not installed — skipping CatBoost.

--- Running: LinearRegression ---

--- Running: DecisionTree ---
Best params: {'max_depth': 3, 'min_samples_leaf': 2, 'min_samples_split': 13}

--- Running: RandomForest ---
Best params: {'max_depth': 6, 'min_samples_split': 3, 'n_estimators': 183}

--- Running: SVR ---
Best params: {'C': np.float64(5.741157902710025), 'epsilon': np.float64(0.7229987722668247), 'kernel': 'linear'}

--- Running: KNN ---
Best params: {'n_neighbors': 15, 'p': 2, 'weights': 'uniform'}

--- Running: MLP ---
Best params: {'activation': 'relu', 'alpha': np.float64(0.0059695015794648705), 'hidden_layer_sizes': (100,), 'learning_rate_init': np.float64(0.015699452033620265)}

--- Running: AdaBoost ---
Best params: {'learning_rate': np.float64(0.8183973481164611), 'n_estimators': 58}

--- Running: XGBoost ---
Best params: {'learning_rate': np.float64(0.2140922615763339), 'max_depth': 2, 'n_estimators': 67}

=== Final Results (s