In [30]:
# import packages
import pickle

import numpy as np
from sklearn.datasets import make_regression
# import models
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import RANSACRegressor
from sklearn.linear_model import LinearRegression
# import metrics and scoring modules
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_squared_log_error
# import tuning modules
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import LinearSVR
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split

from xgboost import XGBRegressor

X, Y = make_regression(n_samples=4000, n_features=20, random_state=18, n_informative=8)

In [2]:
score_function = {"mse": mean_squared_error, "mae": mean_absolute_error, "r2": r2_score, 'rmse':
    lambda y_true, y_pred: mean_squared_error(y_true, y_pred, squared=False), "msle": mean_squared_log_error}

name_score_mapper = {"mae": "neg_mean_absolute_error", "mse": "neg_mean_squared_error",
                     "r2": "r2", "msle": "neg_mean_squared_log_error"}

# constant used for cross validation
CV = KFold(n_splits=5, shuffle=True, random_state=11)

In [3]:
def tune_model(model, params_grid, X_train, y_train, cv=None, scoring='neg_mean_absolute_error'):
    # if a user-friendly name is given, map it to the official one used by sklearn
    if scoring in name_score_mapper:
        scoring = name_score_mapper[scoring]

    if cv is None:
        cv = CV

    searcher = GridSearchCV(model, param_grid=params_grid, cv=cv, scoring=scoring, n_jobs=-1)
    searcher.fit(X_train, y_train)
    return searcher.best_estimator_

In [4]:
def evaluate_tuned_model(tuned_model, X_train, X_test, y_train, y_test, train=True, metrics=None):
    # set the default metric
    if metrics is None:
        metrics = ['mse']

    if isinstance(metrics, str):
        metrics = [metrics]

    if 'msle' in metrics and (y_train <= 0).any():
        # msle cannot be used for target variables with non-positive values
        metrics.remove('msle')

    # train the model
    if train:
        tuned_model.fit(X_train, y_train)

    # predict on the test dataset
    y_pred = tuned_model.predict(X_test)
    # evaluate the model
    scores = dict(list(zip(metrics, [score_function[m](y_test, y_pred) for m in metrics])))
    return tuned_model, scores

In [5]:
def save_model(tuned_model, path):
    with open(path, 'wb') as f:
        pickle.dump(tuned_model, f)


def load_model(path):
    with open(path, 'rb') as f:
        return pickle.load(f)

In [6]:
def try_model(model, X, y, params_grid, save=True, save_path=None, test_size=0.2, tune_metric=None,
              test_metrics=None, cv=None):
    # the dataset passed is assumed to be ready to be processed
    # all its features are numerical and all its missing values are imputed/discarded

    if save and save_path is None:
        raise ValueError("Please pass a path to save the model or set the 'save' parameter to False")

    # split the dataset into train and test datasets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=11)

    # tune the model
    tuned_model = tune_model(model, params_grid, X_train, y_train, cv=cv, scoring=tune_metric)

    # evaluate the tuned model
    model, results = evaluate_tuned_model(tuned_model, X_train, X_test, y_train, y_test, metrics=test_metrics)
    # save the model to the passed path
    if save:
        save_model(tuned_model, save_path)

    return model, results

In [8]:
ridge_basic = Ridge(max_iter=5000)

ridge_grid = {"alpha": np.logspace(0.001, 10, 20)}

def try_ridge(X, y, lr_model=ridge_basic, params_grid=None, save=True, save_path=None,
              test_size=0.2, tune_metric=None, test_metrics=None, cv=None):
    if params_grid is None:
        params_grid = ridge_grid

    return try_model(lr_model, X, y, params_grid, save=save, save_path=save_path,
                     test_size=test_size, tune_metric=tune_metric, test_metrics=test_metrics, cv=cv)

X, Y = make_regression(n_samples=4000, n_features=20, random_state=18, n_informative=8)

lr, results = try_ridge(X, Y, save=False, test_metrics=['mse', 'rmse', 'r2', "msle"], tune_metric='mse')

print(results)


{'mse': 0.003949899818051906, 'rmse': 0.0628482284400436, 'r2': 0.9999998998382846}


## Decision Tree Regressor

In [10]:
dtr_regressor = DecisionTreeRegressor(random_state=42)

dtr_grid = {'criterion': ['squared error', 'friedman_mse', 'absolute_error', 'poisson'],
            'splitter':['random', 'best'], 'max_depth':list(range(1, 30)),
            'min_samples_split': list(range(1, 20))}

def try_dtr(X, y, dtr_model=dtr_regressor, params_grid=None, save=True, save_path=None,
              test_size=0.2, tune_metric=None, test_metrics=None, cv=None):
    if params_grid is None:
        params_grid = dtr_grid

    return try_model(dtr_model, X, y, params_grid, save=save, save_path=save_path,
                     test_size=test_size, tune_metric=tune_metric, test_metrics=test_metrics, cv=cv)

lr, results = try_dtr(X, Y, save=False, test_metrics=['mse', 'rmse', 'r2', "msle"], tune_metric='mse')

print(results)


{'mse': 19199.67985795589, 'rmse': 138.5629093875987, 'r2': 0.5131337608205551}


11600 fits failed out of a total of 22040.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
870 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\user\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\user\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\tree\_classes.py", line 1342, in fit
    super().fit(
  File "c:\Users\user\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\tree\_classes.py", line 265, in fit
    check_scalar(
  File "c:\Users\user\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\va

## KN Regression

In [16]:
knn_regressor = KNeighborsRegressor()

knn_grid = {'n_neighbors': list(range(1, 21)), 'weights': ['uniform', 'distance'],
            'p': [1, 2, 3]}

def try_knn(X, y, knn_model=knn_regressor, params_grid=None, save=True, save_path=None,
              test_size=0.2, tune_metric=None, test_metrics=None, cv=None):
    if params_grid is None:
        params_grid = knn_grid

    return try_model(knn_model, X, y, params_grid, save=save, save_path=save_path,
                     test_size=test_size, tune_metric=tune_metric, test_metrics=test_metrics, cv=cv)

lr, results = try_knn(X, Y, save=False, test_metrics=['mse', 'rmse', 'r2', "msle"], tune_metric='mse')

print(results)


{'mse': 11501.592511643237, 'rmse': 107.24547781441993, 'r2': 0.7083421634034271}


## Linear Regression

In [17]:
linear_regressor = LinearRegression()

linear_grid = {'fit_intercept': [True, False], 'normalize': [True, False], 'copy_X': [True ]}

def try_linear(X, y, linear_model=linear_regressor, params_grid=None, save=True, save_path=None,
              test_size=0.2, tune_metric=None, test_metrics=None, cv=None):
    if params_grid is None:
        params_grid = linear_grid

    return try_model(linear_model, X, y, params_grid, save=save, save_path=save_path,
                     test_size=test_size, tune_metric=tune_metric, test_metrics=test_metrics, cv=cv)

lr, results = try_linear(X, Y, save=False, test_metrics=['mse', 'rmse', 'r2', "msle"], tune_metric='mse')

print(results)

{'mse': 5.598142835038566e-26, 'rmse': 2.366039482983867e-13, 'r2': 1.0}


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)




## Polynomial regression

In [18]:
polynomial_regressor = LinearRegression()

polynomial_grid = {'degree': list(range(2, 5)), 'interaction_only': [True, False], 
                   'include_bias': [True, False], 'order': ['C', 'F']}

def try_polynomial(X, y, polynomial_model=polynomial_regressor, params_grid=None, save=True, save_path=None,
              test_size=0.2, tune_metric=None, test_metrics=None, cv=None):
    if params_grid is None:
        params_grid = linear_grid

    return try_model(polynomial_model, X, y, params_grid, save=save, save_path=save_path,
                     test_size=test_size, tune_metric=tune_metric, test_metrics=test_metrics, cv=cv)

lr, results = try_polynomial(X, Y, save=False, test_metrics=['mse', 'rmse', 'r2', "msle"], tune_metric='mse')

print(results)

{'mse': 5.598142835038566e-26, 'rmse': 2.366039482983867e-13, 'r2': 1.0}


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)




## Random forest regressor

In [28]:
rfr_regressor = RandomForestRegressor()

rfr_grid = {'max_leaf_nodes':[2, 4, 6, 7], 'min_samples_split':[5, 10, 20, 50], 'max_depth': [5,10,15,20],
            'max_features':[3, 4, 5], 'n_estimators': [50, 100, 200]}

def try_rfr(X, y, rfr_model=rfr_regressor, params_grid=None, save=True, save_path=None,
              test_size=0.2, tune_metric=None, test_metrics=None, cv=None):
    if params_grid is None:
        params_grid = rfr_grid

    return try_model(rfr_model, X, y, params_grid, save=save, save_path=save_path,
                     test_size=test_size, tune_metric=tune_metric, test_metrics=test_metrics, cv=cv)

lr, results = try_rfr(X, Y, save=False, test_metrics=['mse', 'rmse', 'r2', "msle"], tune_metric='mse', params_grid=rfr_grid)

print(results)

{'mse': 22395.30073803946, 'rmse': 149.65059551515142, 'r2': 0.4320990804904612}


## Linear RANSAC regression

In [34]:
'''
ransac_regressor = RANSACRegressor()

ransac_grid = {'estimator': [LinearRegression], 'min_samples': list(range(0, 0.5, 0.1)),
               'stop_probability': list(range(0.1, 1, 0.1))}

def try_ransac(X, y, ransac_model=ransac_regressor, params_grid=None, save=True, save_path=None,
              test_size=0.2, tune_metric=None, test_metrics=None, cv=None):
    if params_grid is None:
        params_grid = ransac_grid

    return try_model(ransac_model, X, y, params_grid, save=save, save_path=save_path,
                     test_size=test_size, tune_metric=tune_metric, test_metrics=test_metrics, cv=cv)

lr, results = try_ransac(X, Y, save=False, test_metrics=['mse', 'rmse', 'r2', "msle"], tune_metric='mse', params_grid=ransac_grid)

print(results)
'''

TypeError: 'float' object cannot be interpreted as an integer

## Linear SVM regression

In [36]:
svr_regressor = LinearSVR()

svr_grid = {'C': [100, 10, 1.0, 0.1, 0.001], 'loss': ['hinge', 'squared_hinge'],
               'dual':[True, False], 'fit_intercept': [True, False]}

def try_svr(X, y, svr_model=svr_regressor, params_grid=None, save=True, save_path=None,
              test_size=0.2, tune_metric=None, test_metrics=None, cv=None):
    if params_grid is None:
        params_grid = svr_grid

    return try_model(svr_model, X, y, params_grid, save=save, save_path=save_path,
                     test_size=test_size, tune_metric=tune_metric, test_metrics=test_metrics, cv=cv)

lr, results = try_svr(X, Y, save=False, test_metrics=['mse', 'rmse', 'r2', "msle"], tune_metric='mse', params_grid=svr_grid)

print(results)

## XGBoost classifier

In [None]:
xgb_regressor = XGBRegressor()

xgb_grid = {'booster': ['gbtree', 'gblinear', 'dart'], 'eta': [0.01, 0.05, 0.1, 0.15, 0.2]}

def try_xgb(X, y, xgb_model=xgb_regressor, params_grid=None, save=True, save_path=None,
              test_size=0.2, tune_metric=None, test_metrics=None, cv=None):
    if params_grid is None:
        params_grid = xgb_grid

    return try_model(xgb_model, X, y, params_grid, save=save, save_path=save_path,
                     test_size=test_size, tune_metric=tune_metric, test_metrics=test_metrics, cv=cv)

lr, results = try_xgb(X, Y, save=False, test_metrics=['mse', 'rmse', 'r2', "msle"], tune_metric='mse', params_grid=xgb_grid)

print(results)