In [None]:
!pip install xgboost lightgbm catboost

In [1]:
!pip install --upgrade numpy catboost

Collecting numpy
  Using cached numpy-2.1.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)


In [1]:
import time
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, PassiveAggressiveRegressor, SGDRegressor, BayesianRidge, ARDRegression, HuberRegressor
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor, HistGradientBoostingRegressor, ExtraTreesRegressor, StackingRegressor, VotingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import TheilSenRegressor, RANSACRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# Load dataset
housing = fetch_california_housing()
X = pd.DataFrame(housing.data, columns=housing.feature_names) # Use housing data
y = pd.Series(housing.target)

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# List of regression models with optimized default parameters
models = {
    'LinearRegression': LinearRegression(n_jobs=-1),
    'Ridge': Ridge(random_state=42),
    'Lasso': Lasso(random_state=42),
    'ElasticNet': ElasticNet(random_state=42),
    'PassiveAggressiveRegressor': PassiveAggressiveRegressor(max_iter=1000, random_state=42),
    'SGDRegressor': SGDRegressor(max_iter=1000, random_state=42),
    'BayesianRidge': BayesianRidge(),
    'ARDRegression': ARDRegression(),
    'HuberRegressor': HuberRegressor(max_iter=1000),
    'TheilSenRegressor': TheilSenRegressor(random_state=42),
    'RANSACRegressor': RANSACRegressor(random_state=42),
    'DecisionTreeRegressor': DecisionTreeRegressor(random_state=42),
    'ExtraTreeRegressor': ExtraTreeRegressor(random_state=42),
    'RandomForestRegressor': RandomForestRegressor(random_state=42, n_jobs=-1),
    'ExtraTreesRegressor': ExtraTreesRegressor(random_state=42, n_jobs=-1),
    'GradientBoostingRegressor': GradientBoostingRegressor(random_state=42),
    'HistGradientBoostingRegressor': HistGradientBoostingRegressor(random_state=42),
    'AdaBoostRegressor': AdaBoostRegressor(random_state=42),
    'BaggingRegressor': BaggingRegressor(random_state=42, n_jobs=-1),
    'KNeighborsRegressor': KNeighborsRegressor(n_jobs=-1),
    'SVR': SVR(),
    'MLPRegressor': MLPRegressor(max_iter=1000, random_state=42),
    'XGBRegressor': XGBRegressor(use_label_encoder=False, eval_metric='rmse', random_state=42, n_jobs=-1),
    'LGBMRegressor': LGBMRegressor(random_state=42, n_jobs=-1),
    'CatBoostRegressor': CatBoostRegressor(verbose=0, random_state=42),
    'StackingRegressor': StackingRegressor(estimators=[
        ('lr', LinearRegression(n_jobs=-1)),
        ('ridge', Ridge(random_state=42)),
        ('lasso', Lasso(random_state=42))
    ], final_estimator=RandomForestRegressor(random_state=42, n_jobs=-1)),
    'VotingRegressor': VotingRegressor(estimators=[
        ('lr', LinearRegression(n_jobs=-1)),
        ('ridge', Ridge(random_state=42)),
        ('lasso', Lasso(random_state=42))
    ])
}

# Function to evaluate models
def evaluate_model(model, X_train, X_test, y_train, y_test):
    start_time = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - start_time

    start_time = time.time()
    y_pred = model.predict(X_test)
    predict_time = time.time() - start_time

    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    return train_time, predict_time, mse, rmse, r2

# Evaluate each model
results = {}
for name, model in models.items():
    train_time, predict_time, mse, rmse, r2 = evaluate_model(model, X_train, X_test, y_train, y_test)
    results[name] = {
        'Train Time': train_time,
        'Predict Time': predict_time,
        'MSE': mse,
        'RMSE': rmse,
        'R2 Score': r2
    }

# Print results
for name, metrics in results.items():
    print(f"{name}:")
    print(f"  Train Time: {metrics['Train Time']:.4f} seconds")
    print(f"  Predict Time: {metrics['Predict Time']:.4f} seconds")
    print(f"  MSE: {metrics['MSE']:.4f}")
    print(f"  RMSE: {metrics['RMSE']:.4f}")
    print(f"  R2 Score: {metrics['R2 Score']:.4f}")
    print('\n')

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001125 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1838
[LightGBM] [Info] Number of data points in the train set: 16512, number of used features: 8
[LightGBM] [Info] Start training from score 2.071947
LinearRegression:
  Train Time: 0.0042 seconds
  Predict Time: 0.0021 seconds
  MSE: 0.5559
  RMSE: 0.7456
  R2 Score: 0.5758


Ridge:
  Train Time: 0.0073 seconds
  Predict Time: 0.0009 seconds
  MSE: 0.5558
  RMSE: 0.7455
  R2 Score: 0.5759


Lasso:
  Train Time: 0.0052 seconds
  Predict Time: 0.0009 seconds
  MSE: 0.9380
  RMSE: 0.9685
  R2 Score: 0.2842


ElasticNet:
  Train Time: 0.0070 seconds
  Predict Time: 0.0035 seconds
  MSE: 0.7646
  RMSE: 0.8744
  R2 Score: 0.4166


PassiveAggressiveRegressor:
  Train Time: 0.0301 seconds
  Predict Time: 0.0009 seconds
  MSE: 1.1681
  RMSE: 1.0808
  R2 Score: 0.1086


SGDRegressor:
  Train Time: 0.0878 se

## Regression Algorithm Performance<br><br>



| Metric       | Best Algorithm          | Best Value   | Worst Algorithm         | Worst Value   |
|--------------|-------------------------|--------------|-------------------------|---------------|
| Train Time   | LinearRegression        | 0.0042(s)       | RandomForestRegressor     | 8.5214(s)        |
| Predict Time | PassiveAggressiveRegressor, SGDRegressor, Ridge & Lasso        | 0.0009(s)       | SVR            | 2.0201(s)       |
| MSE          | CatBoostRegressor   | 0.1989     | SGDRegressor  | 31255986387106809012092403712.0
| RMSE         | CatBoostRegressor   | 0.4460       | SGDRegressor  | 176793626545491.750
| r2-Score     | CatBoostRegressor | 0.8482| SGDRegressor  | -23852085040706160132853071872.0      |

