## functions

In [1]:
import os
import pandas as pd
import numpy as np
import json
import time
import copy
from sklearn.model_selection import KFold
from itertools import product
from joblib import Parallel, delayed


from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, HuberRegressor, TheilSenRegressor, ARDRegression, HuberRegressor, \
LassoLars, Lars, OrthogonalMatchingPursuit, BayesianRidge, LogisticRegression, TweedieRegressor, SGDRegressor,PassiveAggressiveRegressor, RANSACRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, RandomForestRegressor, HistGradientBoostingRegressor, BaggingRegressor
import xgboost as xgb
#import lightgbm as lgb
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, Matern, ConstantKernel as C
#from mord import OrdinalRidge

from sklearn.metrics import mean_absolute_error, mean_squared_error, median_absolute_error, mean_squared_log_error,\
explained_variance_score, r2_score
from scipy.stats import percentileofscore, norm, jarque_bera, ks_2samp
#from statsmodels.stats.stattools import durbin_watson

In [2]:
def scoring_metrics(y_true, y_pred, n_features=None):
    """Compute a dictionary of regression diagnostics for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predictions aligned element-wise with ``y_true``.
    n_features : int, optional
        Number of predictors ``p`` used by the model. When given, adjusted
        R-squared is ``1 - (1 - R2) * (n - 1) / (n - p - 1)`` (NaN when
        ``n - p - 1 <= 0``). When omitted, the historical value is returned
        unchanged: it substitutes ``len(y_pred)`` for ``p``, which makes the
        denominator -1, so it is NOT a real adjusted R-squared. Pass
        ``n_features`` to get a meaningful number.

    Returns
    -------
    dict
        Metric name -> value. Keys and their order are stable because the
        result is serialised to JSON by the callers.
    """
    # Positional arrays avoid index-alignment surprises if a pandas object is passed.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    n_obs = len(y_true)

    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    median_ae = median_absolute_error(y_true, y_pred)

    # Percentage-based errors divide by y_true, so zero targets produce inf/nan.
    mpe = np.mean((y_true - y_pred) / y_true) * 100
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    smape = np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_pred) + np.abs(y_true))) * 100

    # Relative squared error: residual sum of squares over total sum of squares.
    rse = np.sum(np.square(y_true - y_pred)) / np.sum(np.square(y_true - np.mean(y_true)))
    # Theil's U: root of squared error relative to the squared magnitude of the targets.
    theil_u = np.sqrt(np.sum(np.square(y_pred - y_true)) / np.sum(np.square(y_true)))
    # Mean error (signed bias of the predictions).
    me = np.mean(y_pred - y_true)

    # Ordinary coefficient of determination; computed once and reused below.
    r2 = r2_score(y_true, y_pred)

    if n_features is None:
        # Legacy behaviour kept for backward compatibility (denominator is -1).
        adj_r2 = 1 - (1 - r2) * (n_obs - 1) / (n_obs - len(y_pred) - 1)
    else:
        dof = n_obs - n_features - 1
        adj_r2 = 1 - (1 - r2) * (n_obs - 1) / dof if dof > 0 else float('nan')

    explained_var = explained_variance_score(y_true, y_pred)
    # Durbin-Watson Statistic
    #dw_statistic = durbin_watson(y_pred - y_true)
    # Jarque-Bera normality test on the residuals (p-value is not reported).
    jb_statistic, _jb_p_value = jarque_bera(y_pred - y_true)
    # Two-sample Kolmogorov-Smirnov test between targets and predictions (p-value is not reported).
    ks_statistic, _ks_p_value = ks_2samp(y_true, y_pred)

    scores = {
        'Mean_Absolute_Error': mae,
        'Mean_Squared_Error': mse,
        'Root_Mean_Squared_Error': rmse,
        'Median_Absolute_Error': median_ae,
        'Mean_Percentage_Error': mpe,
        'Mean_Absolute_Percentage_Error': mape,
        'Symmetric_Mean_Absolute_Percentage_Error': smape,
        'Relative_Squared_Error': rse,
        'Theils_U': theil_u,
        'Mean_Error': me,
        'Adjusted_R-squared': adj_r2,
        'Explained_Variance_Score': explained_var,
        #'Durbin-Watson_Statistic': dw_statistic,
        'Jarque-Bera_Test_Statistic': jb_statistic,
        'Kolmogorov-Smirnov_Statistic': ks_statistic,
        'R-squared': r2,
    }
    return scores

In [3]:
def grid_search(model, param_grid, X_outter, y_outter, cv=None):
    """Exhaustively tune ``model`` with an inner cross-validation loop.

    Every combination in ``param_grid`` is fitted on each inner training split
    and scored on both the training and the held-out split with
    ``scoring_metrics``. The combination with the LOWEST mean held-out
    ``Mean_Squared_Error`` is selected (ties keep the earliest combination).

    Parameters
    ----------
    model : estimator
        Object exposing ``set_params``, ``fit`` and ``predict``. It is mutated
        in place: its parameters are overwritten for every combination.
    param_grid : dict
        Maps parameter name to a sequence of candidate values.
    X_outter, y_outter : numpy.ndarray
        Outer-fold training data, indexed with the integer index arrays
        yielded by the splitter.
    cv : splitter, optional
        Object with a ``split(X, y)`` method. Defaults to the notebook-level
        ``inner_cv``.

    Returns
    -------
    tuple
        ``(best_model, best_params, param_comb_arr, param_comb_arr_lite)``.
        ``best_model`` is a deep copy of ``model`` taken after the last inner
        fit of the winning combination. The two lists hold one record per
        combination; the "lite" variant omits the per-sample ``y``,
        ``y_pred`` and ``indices`` entries.
    """
    if cv is None:
        cv = inner_cv  # notebook-level splitter

    best_score = None
    best_params = None
    best_model = None
    param_comb_arr = []
    param_comb_arr_lite = []
    param_names = list(param_grid.keys())

    # Cartesian product of all candidate values.
    for params in product(*param_grid.values()):
        param_dict = dict(zip(param_names, params))
        param_comb = {'params': str(param_dict)}
        param_comb_lite = {'params': str(param_dict)}

        model.set_params(**param_dict)
        inner_fold_arr = []
        inner_fold_arr_lite = []
        fold_mse = []
        for fold_num, (inner_train_index, inner_test_index) in enumerate(cv.split(X_outter, y_outter)):
            X_inner_train, X_inner_test = X_outter[inner_train_index], X_outter[inner_test_index]
            y_inner_train, y_inner_test = y_outter[inner_train_index], y_outter[inner_test_index]

            inner_fold = {'fold_num': fold_num + 1}
            inner_fold_lite = {'fold_num': fold_num + 1}
            train_row = {}
            test_row = {}

            start_time = time.time()
            model.fit(X_inner_train, y_inner_train)
            train_row['fit_time'] = time.time() - start_time

            start_time = time.time()
            y_pred = model.predict(X_inner_train)
            train_row['pred_time'] = time.time() - start_time

            train_row.update(scoring_metrics(y_inner_train, y_pred))

            # Snapshot the metrics before the bulky per-sample arrays are attached.
            inner_fold_lite['train'] = copy.deepcopy(train_row)
            train_row['y'] = y_inner_train
            train_row['y_pred'] = y_pred
            train_row['indices'] = inner_train_index
            inner_fold['train'] = train_row

            start_time = time.time()
            y_pred = model.predict(X_inner_test)
            test_row['pred_time'] = time.time() - start_time

            test_metrics = scoring_metrics(y_inner_test, y_pred)
            test_row.update(test_metrics)
            fold_mse.append(test_metrics['Mean_Squared_Error'])

            inner_fold_lite['test'] = copy.deepcopy(test_row)
            test_row['y'] = y_inner_test
            test_row['y_pred'] = y_pred
            test_row['indices'] = inner_test_index
            inner_fold['test'] = test_row

            inner_fold_arr.append(inner_fold)
            inner_fold_arr_lite.append(inner_fold_lite)

        # Average over the folds actually produced, not a hard-coded count.
        score_MSE = sum(fold_mse) / len(fold_mse) if fold_mse else float('inf')

        # MSE is an error measure: lower is better.
        if best_score is None or score_MSE < best_score:
            best_score = score_MSE
            best_params = param_dict
            best_model = copy.deepcopy(model)

        param_comb['inner_fold'] = inner_fold_arr
        param_comb_lite['inner_fold'] = inner_fold_arr_lite
        param_comb_arr.append(param_comb)
        param_comb_arr_lite.append(param_comb_lite)

    return best_model, best_params, param_comb_arr, param_comb_arr_lite

In [4]:
def outer_metric(model, params, X_out_train, y_out_train, idx_train, X_out_test, y_out_test, idx_test, outer_fold_num):
    """Refit ``model`` on one outer training split and score it on both splits.

    Returns ``(outer_fold, outer_fold_lite)``. Both dictionaries carry the fold
    number, the stringified ``params`` and ``train``/``test`` entries with
    timings plus the ``scoring_metrics`` output. Only the full variant also
    stores the per-sample ``y``, ``y_pred`` and ``indices`` values.
    """
    fold_id = int(outer_fold_num)
    params_repr = str(params)

    # Training split: time the fit and the in-sample prediction separately.
    tic = time.time()
    model.fit(X_out_train, y_out_train)
    train_stats = {'fit_time': time.time() - tic}

    tic = time.time()
    train_pred = model.predict(X_out_train)
    train_stats['pred_time'] = time.time() - tic
    train_stats.update(scoring_metrics(y_out_train, train_pred))

    # Copy the scalar summary before attaching the per-sample data.
    train_stats_lite = copy.deepcopy(train_stats)
    train_stats.update(y=y_out_train, y_pred=train_pred, indices=idx_train)

    # Held-out split: prediction only.
    tic = time.time()
    test_pred = model.predict(X_out_test)
    test_stats = {'pred_time': time.time() - tic}
    test_stats.update(scoring_metrics(y_out_test, test_pred))

    test_stats_lite = copy.deepcopy(test_stats)
    test_stats.update(y=y_out_test, y_pred=test_pred, indices=idx_test)

    outer_fold = {
        'fold_num': fold_id,
        'best_params': params_repr,
        'train': train_stats,
        'test': test_stats,
    }
    outer_fold_lite = {
        'fold_num': fold_id,
        'best_params': params_repr,
        'train': train_stats_lite,
        'test': test_stats_lite,
    }
    return outer_fold, outer_fold_lite

In [5]:
def process_fold(model, param_grid, fold_num, outer_train_index, outer_test_index):
    """Run the inner grid search and the outer evaluation for one outer fold.

    Slices the notebook-level ``Xarr``/``yarr`` arrays with the given index
    arrays, tunes ``model`` via ``grid_search``, evaluates the winner via
    ``outer_metric`` and attaches the per-combination records under
    ``'param_comb'``. Returns ``(full_record, lite_record)``; the reported fold
    number is ``fold_num + 1``.
    """
    train_X = Xarr[outer_train_index]
    test_X = Xarr[outer_test_index]
    train_y = yarr[outer_train_index]
    test_y = yarr[outer_test_index]

    tuned_model, tuned_params, combos, combos_lite = grid_search(model, param_grid, train_X, train_y)

    fold_record, fold_record_lite = outer_metric(
        tuned_model, tuned_params,
        train_X, train_y, outer_train_index,
        test_X, test_y, outer_test_index,
        fold_num + 1,
    )
    fold_record['param_comb'] = combos
    fold_record_lite['param_comb'] = combos_lite

    print(f'Finished fold {fold_num + 1}')
    return fold_record, fold_record_lite

In [6]:
class NumpyArrayEncoder(json.JSONEncoder):
    """JSON encoder that understands NumPy arrays and NumPy scalar values."""

    def default(self, obj):
        """Convert NumPy objects to plain Python; defer everything else to the base class."""
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        # NumPy scalars such as np.int64 / np.float32 / np.bool_ are not JSON
        # serialisable on their own; .item() yields the matching Python value.
        if isinstance(obj, np.generic):
            return obj.item()
        return super().default(obj)

In [7]:
# Identifier of the dataset being processed; it is interpolated into the input
# CSV path and into the name of the results directory used by to_json.
momental_dataset = "2"

In [8]:
def to_json(model_lite,model,model_name):
    """Write the lite and the full result records of one model to JSON files.

    Files are written to ``result_dataset_<momental_dataset>/`` as
    ``<model_name>_reg_lite.json`` and ``<model_name>_reg.json``. The directory
    is created when it does not exist yet.
    """
    out_dir = f"result_dataset_{momental_dataset}"
    # Without this, open() raises FileNotFoundError on a fresh checkout.
    os.makedirs(out_dir, exist_ok=True)

    for suffix, payload in (("_reg_lite.json", model_lite), ("_reg.json", model)):
        # Serialise fully before opening so a failure cannot leave a truncated file.
        json_array = json.dumps(payload, cls=NumpyArrayEncoder, indent=2)
        with open(f"{out_dir}/{model_name}{suffix}", 'w') as json_file:
            json_file.write(json_array)

## Opening the dataset, OHE and standardization

In [9]:
# Load the pre-processed table of the selected dataset and preview its first rows.
df = pd.read_csv(f'./dataset{momental_dataset}/procesed_dataset_{momental_dataset}.csv')
df.head()

Unnamed: 0,GPRegPop,Hypertens,Anxiety,Depression,Asthma,Obesity,Diabetes,CHD,Fall,Cancer,CKD,COPD,Target,AF
0,1495,165,163,175,110,102,63,47,51,25,17,36,33,19
1,1457,203,191,173,92,90,62,58,80,38,35,35,33,29
2,1343,190,191,171,104,96,68,53,50,25,24,29,23,21
3,1391,269,131,100,102,125,54,51,33,43,39,31,23,18
4,1459,265,161,133,92,124,96,87,85,41,58,50,51,38


In [10]:
# Summarise column dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 190 entries, 0 to 189
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   GPRegPop    190 non-null    int64
 1   Hypertens   190 non-null    int64
 2   Anxiety     190 non-null    int64
 3   Depression  190 non-null    int64
 4   Asthma      190 non-null    int64
 5   Obesity     190 non-null    int64
 6   Diabetes    190 non-null    int64
 7   CHD         190 non-null    int64
 8   Fall        190 non-null    int64
 9   Cancer      190 non-null    int64
 10  CKD         190 non-null    int64
 11  COPD        190 non-null    int64
 12  Target      190 non-null    int64
 13  AF          190 non-null    int64
dtypes: int64(14)
memory usage: 20.9 KB


In [11]:
# Regression target and predictor matrix.
y = df["Target"]
X = df.drop(columns="Target")

# Plain NumPy copies, used for positional indexing with the fold index arrays.
Xarr = np.array(X)
yarr = np.array(y)

# Nested cross-validation: 10 outer folds for evaluation, 3 inner folds for tuning.
inner_cv = KFold(n_splits=3, shuffle=True, random_state=42)
outer_cv = KFold(n_splits=10, shuffle=True, random_state=42)

In [12]:
# Accumulators for the per-model result records (full and lite variants);
# each model cell below appends to them.
all_models = []
all_models_lite = []

## AdaBoost Regression

In [13]:
# Seeded like the other stochastic estimators in this notebook so that repeated
# runs of the nested cross-validation give the same results.
ada_reg = AdaBoostRegressor(random_state=42)
#ada_reg.get_params()

In [14]:
# Search space explored by the inner cross-validation loop.
param_grid = {
    # Number of weak learners (boosting rounds).
    'n_estimators': [50, 100, 200, 400, 600],
    # Shrinks the contribution of every weak learner; small values converge more gradually.
    'learning_rate': [0.01, 0.1, 1.0],
    # Loss used when the sample weights are updated.
    'loss': ['linear', 'square', 'exponential'],
    # Weak learner; shallower trees are simpler models.
    'estimator': [DecisionTreeRegressor(max_depth=depth) for depth in (1, 3, 7)],
}

# Result containers: the full record keeps per-sample arrays, the lite one does not.
ada_model = {'model': 'AdaBoost_Regression'}
ada_model_lite = {'model': 'AdaBoost_Regression'}

starting_time = time.time()

# One joblib task per outer fold.
outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(ada_reg, param_grid, fold_idx, train_idx, test_idx)
    for fold_idx, (train_idx, test_idx) in enumerate(outer_cv.split(X, y))
)

# process_fold returns (full, lite) pairs; split them into two lists.
outer_loop_arr = [full_rec for full_rec, _lite_rec in outer_loop_results]
outer_loop_arr_lite = [lite_rec for _full_rec, lite_rec in outer_loop_results]

ada_model['outer_loop'] = outer_loop_arr
ada_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(ada_model)
all_models_lite.append(ada_model_lite)

to_json(ada_model_lite, ada_model, 'ada')

ending_time = time.time()
print(ending_time - starting_time)

Finished fold 6
Finished fold 2
Finished fold 3
Finished fold 5
Finished fold 7
Finished fold 8
951.6997966766357


## Automatic Relevance Determination Regression

In [15]:
# ARD regression with default settings; grid_search overrides the tuned
# hyperparameters through set_params for every grid point.
ard_reg = ARDRegression()
#ard_reg.get_params()

In [16]:
# Search space explored by the inner cross-validation loop.
# alpha_1/alpha_2 are the shape/rate of the Gamma prior over the noise precision,
# lambda_1/lambda_2 the shape/rate of the Gamma prior over the weight precisions.
param_grid = {
    # Maximum number of optimisation iterations.
    'max_iter': [50, 100, 200, 400, 600],
    **{
        prior_name: np.logspace(-6, -3, 4)
        for prior_name in ('alpha_1', 'alpha_2', 'lambda_1', 'lambda_2')
    },
}

# Result containers: the full record keeps per-sample arrays, the lite one does not.
ard_model = {'model': 'Automatic_Relevance_Determination_Regression'}
ard_model_lite = {'model': 'Automatic_Relevance_Determination_Regression'}

starting_time = time.time()

# One joblib task per outer fold.
outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(ard_reg, param_grid, fold_idx, train_idx, test_idx)
    for fold_idx, (train_idx, test_idx) in enumerate(outer_cv.split(X, y))
)

# process_fold returns (full, lite) pairs; split them into two lists.
outer_loop_arr = [full_rec for full_rec, _lite_rec in outer_loop_results]
outer_loop_arr_lite = [lite_rec for _full_rec, lite_rec in outer_loop_results]

ard_model['outer_loop'] = outer_loop_arr
ard_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(ard_model)
all_models_lite.append(ard_model_lite)

to_json(ard_model_lite, ard_model, 'ard')

ending_time = time.time()
print(ending_time - starting_time)

283.8685564994812
Finished fold 3
Finished fold 1
Finished fold 9
Finished fold 2
Finished fold 6
Finished fold 4
Finished fold 7
Finished fold 8


## Bagging Regression

In [20]:
# Bagging ensemble with a fixed seed; n_jobs=-1 lets the estimator itself use
# all cores, and the cell below runs the outer folds sequentially.
bag_reg = BaggingRegressor(n_jobs = -1, random_state = 42)
#bag_reg.get_params()

In [21]:
# Search space explored by the inner cross-validation loop.
param_grid = {
    # Number of base estimators in the ensemble.
    'n_estimators': [10, 50, 100, 200, 400],
    # Base estimator fitted on each bootstrap sample (None selects the library default).
    'estimator': [None, LinearRegression(), Ridge(alpha=1.0), Lasso(), DecisionTreeRegressor()],
    # Fraction of the training samples drawn for each base estimator.
    'max_samples': [0.7, 0.85, 1.0],
}

# Result containers: the full record keeps per-sample arrays, the lite one does not.
bag_model = {'model': 'Bagging_Regression'}
bag_model_lite = {'model': 'Bagging_Regression'}
outer_loop_arr = []
outer_loop_arr_lite = []

print('loading bar:\n')
# Outer folds run one after another in this process.
for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y)):
    X_outer_train, y_outer_train = Xarr[outer_train_index], yarr[outer_train_index]
    X_outer_test, y_outer_test = Xarr[outer_test_index], yarr[outer_test_index]

    best_model, best_param, param_arr, param_arr_lite = grid_search(
        bag_reg, param_grid, X_outer_train, y_outer_train
    )
    outer_loop, outer_loop_lite = outer_metric(
        best_model, best_param,
        X_outer_train, y_outer_train, outer_train_index,
        X_outer_test, y_outer_test, outer_test_index,
        fold_num + 1,
    )
    outer_loop['param_comb'] = param_arr
    outer_loop_lite['param_comb'] = param_arr_lite
    outer_loop_arr.append(outer_loop)
    outer_loop_arr_lite.append(outer_loop_lite)
    print(f'{(fold_num+1)*10}%')

bag_model['outer_loop'] = outer_loop_arr
bag_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(bag_model)
all_models_lite.append(bag_model_lite)

to_json(bag_model_lite, bag_model, 'bag')

loading bar:

10%
20%
30%
40%
50%
60%
70%
80%
90%
100%


## Bayesian Ridge Regression

In [22]:
# Bayesian ridge regression with default settings; the grid below is applied
# per combination by grid_search.
bayridge_reg = BayesianRidge()
#bayridge_reg.get_params()

In [23]:
# Search space explored by the inner cross-validation loop.
# alpha_1/alpha_2 are the shape/rate of the Gamma prior over the noise precision,
# lambda_1/lambda_2 the shape/rate of the Gamma prior over the weight precision.
param_grid = {
    # Maximum number of optimisation iterations.
    'max_iter': [50, 100, 200, 400, 600],
    **{
        prior_name: np.logspace(-6, -3, 4)
        for prior_name in ('alpha_1', 'alpha_2', 'lambda_1', 'lambda_2')
    },
}

# Result containers: the full record keeps per-sample arrays, the lite one does not.
bayridge_model = {'model': 'Bayesian_Ridge_Regression'}
bayridge_model_lite = {'model': 'Bayesian_Ridge_Regression'}

starting_time = time.time()

# One joblib task per outer fold.
outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(bayridge_reg, param_grid, fold_idx, train_idx, test_idx)
    for fold_idx, (train_idx, test_idx) in enumerate(outer_cv.split(X, y))
)

# process_fold returns (full, lite) pairs; split them into two lists.
outer_loop_arr = [full_rec for full_rec, _lite_rec in outer_loop_results]
outer_loop_arr_lite = [lite_rec for _full_rec, lite_rec in outer_loop_results]

bayridge_model['outer_loop'] = outer_loop_arr
bayridge_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(bayridge_model)
all_models_lite.append(bayridge_model_lite)

to_json(bayridge_model_lite, bayridge_model, 'bayridge')

ending_time = time.time()
print(ending_time - starting_time)

238.90137553215027


## Decision Tree Regression

In [24]:
# Single decision tree with a fixed seed (relevant because the grid below
# restricts max_features, which makes the split search randomised).
tree_reg = DecisionTreeRegressor(random_state = 42)
#tree_reg.get_params()

In [26]:
# Search space explored by the inner cross-validation loop.
param_grid = {
    # Split-quality criterion.
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error'],
    # Maximum tree depth; None leaves the depth unlimited.
    'max_depth': [1, 2, 3, 5, 7, 10, 15, 20, 25, 30, None],
    # Minimum number of samples needed to split an internal node.
    'min_samples_split': [2, 5, 10, 15, 20],
    # Fraction of the features examined when searching for a split.
    'max_features': [0.1, 0.2, 0.25, 0.33, 0.5],
}

# Result containers: the full record keeps per-sample arrays, the lite one does not.
tree_model = {'model': 'Decision_Tree_Regression'}
tree_model_lite = {'model': 'Decision_Tree_Regression'}

starting_time = time.time()

# One joblib task per outer fold.
outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(tree_reg, param_grid, fold_idx, train_idx, test_idx)
    for fold_idx, (train_idx, test_idx) in enumerate(outer_cv.split(X, y))
)

# process_fold returns (full, lite) pairs; split them into two lists.
outer_loop_arr = [full_rec for full_rec, _lite_rec in outer_loop_results]
outer_loop_arr_lite = [lite_rec for _full_rec, lite_rec in outer_loop_results]

tree_model['outer_loop'] = outer_loop_arr
tree_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(tree_model)
all_models_lite.append(tree_model_lite)

to_json(tree_model_lite, tree_model, 'tree')

ending_time = time.time()
print(ending_time - starting_time)

147.7922990322113


## Elastic Net Regression

In [27]:
# Elastic-net linear model with a fixed seed; alpha, l1_ratio and max_iter are
# set per grid point by grid_search.
elnet_reg = ElasticNet(random_state = 42)
#elnet_reg.get_params()

In [28]:
# Search space explored by the inner cross-validation loop.
param_grid = {
    # Overall strength of the combined L1/L2 penalty.
    'alpha': np.logspace(-3, 1, 5),
    # Mix between the penalties: 0 is pure L2 (ridge), 1 is pure L1 (lasso).
    'l1_ratio': [0, 0.2, 0.5, 0.7, 1],
    # Maximum number of optimisation iterations.
    'max_iter': [50, 100, 300, 500, 1000, 1500],
}

# Result containers: the full record keeps per-sample arrays, the lite one does not.
elnet_model = {'model': 'Elastic_Net_Regression'}
elnet_model_lite = {'model': 'Elastic_Net_Regression'}

starting_time = time.time()

# One joblib task per outer fold.
outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(elnet_reg, param_grid, fold_idx, train_idx, test_idx)
    for fold_idx, (train_idx, test_idx) in enumerate(outer_cv.split(X, y))
)

# process_fold returns (full, lite) pairs; split them into two lists.
outer_loop_arr = [full_rec for full_rec, _lite_rec in outer_loop_results]
outer_loop_arr_lite = [lite_rec for _full_rec, lite_rec in outer_loop_results]

elnet_model['outer_loop'] = outer_loop_arr
elnet_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(elnet_model)
all_models_lite.append(elnet_model_lite)

to_json(elnet_model_lite, elnet_model, 'elnet')

ending_time = time.time()
print(ending_time - starting_time)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

27.479907274246216


## Gaussian Process Regression

In [29]:
# Seeded so the random optimizer restarts requested by the grid below
# (n_restarts_optimizer) are repeatable between runs.
gp_reg = GaussianProcessRegressor(random_state=42)
#gp_reg.get_params()

In [30]:
# Search space explored by the inner cross-validation loop.
param_grid = {
    # Covariance function of the Gaussian process.
    'kernel': [RBF(), Matern()],
    # Number of optimizer restarts used when fitting the kernel hyperparameters.
    'n_restarts_optimizer': [1, 3, 5, 10],
    # Value added to the kernel diagonal; larger values regularise more strongly.
    'alpha': np.logspace(-3, 1, 5),
}

# Result containers: the full record keeps per-sample arrays, the lite one does not.
gp_model = {'model': 'Gaussian_Process_Regression'}
gp_model_lite = {'model': 'Gaussian_Process_Regression'}

starting_time = time.time()

# One joblib task per outer fold.
outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(gp_reg, param_grid, fold_idx, train_idx, test_idx)
    for fold_idx, (train_idx, test_idx) in enumerate(outer_cv.split(X, y))
)

# process_fold returns (full, lite) pairs; split them into two lists.
outer_loop_arr = [full_rec for full_rec, _lite_rec in outer_loop_results]
outer_loop_arr_lite = [lite_rec for _full_rec, lite_rec in outer_loop_results]

gp_model['outer_loop'] = outer_loop_arr
gp_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(gp_model)
all_models_lite.append(gp_model_lite)

to_json(gp_model_lite, gp_model, 'gp')

ending_time = time.time()
print(ending_time - starting_time)

42.67188262939453


## Gradient Boosting Regression

In [31]:
# Gradient boosting with a fixed seed (the grid below uses subsample and
# max_features values that make the fit randomised).
gb_reg = GradientBoostingRegressor(random_state = 42)
#gb_reg.get_params()

In [None]:
# Search space explored by the inner cross-validation loop.
param_grid = {
    # Number of boosting stages.
    'n_estimators': [50, 100, 200, 400, 600],
    # Shrinks the contribution of every tree; small values converge more gradually.
    'learning_rate': [0.01, 0.1, 1.0],
    # Maximum depth of the individual trees.
    'max_depth': [1, 3, 5, 7, 10, 15, 20],
    # Minimum number of samples needed to split an internal node.
    'min_samples_split': [2, 5, 10, 15, 20],
    # Fraction of the samples used to fit each tree.
    'subsample': [0.7, 0.85, 1.0],
    # Fraction of the features examined when searching for a split.
    'max_features': [0.1, 0.2, 0.25, 0.33, 0.5],
}

# Result containers: the full record keeps per-sample arrays, the lite one does not.
gb_model = {'model': 'Gradient_Boosting_Regression'}
gb_model_lite = {'model': 'Gradient_Boosting_Regression'}

starting_time = time.time()

# One joblib task per outer fold.
outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(gb_reg, param_grid, fold_idx, train_idx, test_idx)
    for fold_idx, (train_idx, test_idx) in enumerate(outer_cv.split(X, y))
)

# process_fold returns (full, lite) pairs; split them into two lists.
outer_loop_arr = [full_rec for full_rec, _lite_rec in outer_loop_results]
outer_loop_arr_lite = [lite_rec for _full_rec, lite_rec in outer_loop_results]

gb_model['outer_loop'] = outer_loop_arr
gb_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(gb_model)
all_models_lite.append(gb_model_lite)

to_json(gb_model_lite, gb_model, 'gb')

ending_time = time.time()
print(ending_time - starting_time)

## Hist Gradient Boosting Regression

In [None]:
# Histogram-based gradient boosting with a fixed seed.
hgb_reg = HistGradientBoostingRegressor(random_state = 42)
#hgb_reg.get_params()

In [None]:
# Search space explored by the inner cross-validation loop.
# NOTE: HistGradientBoostingRegressor has no `n_estimators`; the number of
# boosting iterations is `max_iter`. The loss names follow the scikit-learn
# releases that also accept `estimator=` (used elsewhere in this notebook),
# where 'least_squares'/'least_absolute_deviation' no longer exist.
param_grid = {
    'max_iter': [50, 100, 200, 400, 600],  # Number of boosting iterations. Larger values increase the risk of overfitting.
    'max_depth': [1, 2, 3, 5, 7, 10, 15, 20, 25, 30, None],  # Maximum depth of the trees. None: no maximum depth. Smaller values lead to stronger regularization.
    'min_samples_leaf':  [2, 5, 10, 15, 20],  # Minimum samples required to be at a leaf node. Larger values lead to stronger regularization.
    'learning_rate': [0.01, 0.1, 1.0],  # Shrinkage parameter to control the contribution of each estimator. Smaller values lead to stronger regularization.
    'loss': ['squared_error', 'absolute_error'],  # Loss function to be optimized.
    # squared_error: the sum of the squared differences between the predicted values and the actual target values,
    # absolute_error: known as L1 loss (i.e. measures the sum of the absolute differences between the predicted values and the actual target values).
}
hgb_model = {}
hgb_model['model']='Hist_Gradient_Boosting_Regression'
outer_loop_arr = []
hgb_model_lite = {}
hgb_model_lite['model']='Hist_Gradient_Boosting_Regression'
outer_loop_arr_lite = []

starting_time = time.time()

# One joblib task per outer fold.
outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(hgb_reg, param_grid, fold_num, outer_train_index, outer_test_index)
    for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y))
)

# Extract the results for each fold
outer_loop_arr = [result[0] for result in outer_loop_results]
outer_loop_arr_lite = [result[1] for result in outer_loop_results]

# Attach the fold results and register the model in the notebook-wide lists.
hgb_model['outer_loop'] = outer_loop_arr
hgb_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(hgb_model)
all_models_lite.append(hgb_model_lite)

to_json(hgb_model_lite,hgb_model,'hgb')

ending_time = time.time()
print(ending_time - starting_time)

## Huber Regression

In [None]:
# Huber (outlier-robust) linear regression with default settings.
huber_reg = HuberRegressor()
#huber_reg.get_params()

In [None]:
# Search space explored by the inner cross-validation loop.
param_grid = {
    # Threshold between the quadratic and the linear part of the loss;
    # SMALLER values make the fit more robust to outliers.
    'epsilon': [1.0, 1.5, 2.0],
    # Strength of the squared L2 penalty; larger values regularise more strongly.
    'alpha': np.logspace(-3, 1, 5),
    # Maximum number of optimisation iterations.
    'max_iter': [50, 100, 200, 400, 600],
}

# Result containers: the full record keeps per-sample arrays, the lite one does not.
huber_model = {'model': 'Huber_Regression'}
huber_model_lite = {'model': 'Huber_Regression'}

starting_time = time.time()

# One joblib task per outer fold.
outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(huber_reg, param_grid, fold_idx, train_idx, test_idx)
    for fold_idx, (train_idx, test_idx) in enumerate(outer_cv.split(X, y))
)

# process_fold returns (full, lite) pairs; split them into two lists.
outer_loop_arr = [full_rec for full_rec, _lite_rec in outer_loop_results]
outer_loop_arr_lite = [lite_rec for _full_rec, lite_rec in outer_loop_results]

huber_model['outer_loop'] = outer_loop_arr
huber_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(huber_model)
all_models_lite.append(huber_model_lite)

to_json(huber_model_lite, huber_model, 'huber')

ending_time = time.time()
print(ending_time - starting_time)

## K-Nearest Neighbors Regression

In [None]:
# k-nearest-neighbours regressor; n_jobs=-1 lets the neighbour search use all
# cores, and the cell below runs the outer folds sequentially.
knn_reg = KNeighborsRegressor(n_jobs = -1)
#knn_reg.get_params()

In [None]:
# Search space explored by the inner cross-validation loop.
param_grid = {
    # Number of neighbours; larger values give smoother, less noise-sensitive predictions.
    'n_neighbors': [1, 3, 5, 7, 9, 11],
    # 'uniform' weighs all neighbours equally, 'distance' by inverse distance.
    'weights': ['uniform', 'distance'],
    # Algorithm used for the neighbour search.
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    # Minkowski power: 1 is Manhattan distance, 2 is Euclidean distance.
    'p': [1, 2],
}

# Result containers: the full record keeps per-sample arrays, the lite one does not.
knn_model = {'model': 'K-Nearest_Neighbors_Regression'}
knn_model_lite = {'model': 'K-Nearest_Neighbors_Regression'}
outer_loop_arr = []
outer_loop_arr_lite = []

print('loading bar:\n')
# Outer folds run one after another in this process.
for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y)):
    X_outer_train, y_outer_train = Xarr[outer_train_index], yarr[outer_train_index]
    X_outer_test, y_outer_test = Xarr[outer_test_index], yarr[outer_test_index]

    best_model, best_param, param_arr, param_arr_lite = grid_search(
        knn_reg, param_grid, X_outer_train, y_outer_train
    )
    outer_loop, outer_loop_lite = outer_metric(
        best_model, best_param,
        X_outer_train, y_outer_train, outer_train_index,
        X_outer_test, y_outer_test, outer_test_index,
        fold_num + 1,
    )
    outer_loop['param_comb'] = param_arr
    outer_loop_lite['param_comb'] = param_arr_lite
    outer_loop_arr.append(outer_loop)
    outer_loop_arr_lite.append(outer_loop_lite)
    print(f'{(fold_num+1)*10}%')

knn_model['outer_loop'] = outer_loop_arr
knn_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(knn_model)
all_models_lite.append(knn_model_lite)

to_json(knn_model_lite, knn_model, 'knn')

## Lasso Regression

In [None]:
# L1-penalised linear regression with a fixed seed.
lasso_reg = Lasso(random_state = 42)
#lasso_reg.get_params()

In [None]:
# Hyper-parameter candidates explored for Lasso.
param_grid = {
    # L1 regularization strength (1e-3 .. 10). Smaller values lead to weaker regularization.
    'alpha': np.logspace(-3, 1, 5),
    # Maximum number of optimization iterations.
    # NOTE(review): Lasso documents max_iter as an int (default 1000); confirm that the None
    # entry is accepted by the installed scikit-learn / handled by process_fold.
    'max_iter': [None, 50, 100, 300, 500, 1000, 1500],
}

# Full and "lite" result records for this model; both carry the same label.
lasso_model = {'model': 'Lasso_Regression'}
lasso_model_lite = {'model': 'Lasso_Regression'}
outer_loop_arr, outer_loop_arr_lite = [], []

starting_time = time.time()

# One process_fold job (defined elsewhere in this notebook) per outer CV split, run on all cores.
outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(lasso_reg, param_grid, fold_idx, train_idx, test_idx)
    for fold_idx, (train_idx, test_idx) in enumerate(outer_cv.split(X, y))
)

# Each job result carries the full record at index 0 and the lite record at index 1.
for fold_result in outer_loop_results:
    outer_loop_arr.append(fold_result[0])
    outer_loop_arr_lite.append(fold_result[1])

# Register the finished records in the notebook-wide collections.
lasso_model['outer_loop'] = outer_loop_arr
lasso_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(lasso_model)
all_models_lite.append(lasso_model_lite)

to_json(lasso_model_lite, lasso_model, 'lasso')

# Elapsed wall-clock seconds, including the to_json call above.
ending_time = time.time()
print(ending_time - starting_time)

## Least Absolute Deviations Regression

In [None]:
# Base LassoLars regressor (Lasso fitted with the LARS algorithm) with a fixed seed.
lad_reg = LassoLars(
    random_state=42,
)
# lad_reg.get_params()  # uncomment to list the tunable parameters

In [None]:
# Hyper-parameter candidates explored for lad_reg.
# NOTE(review): lad_reg is a LassoLars instance, i.e. an L1-penalised least-squares fit, not a
# least-absolute-deviations loss; the 'Least_Absolute_Deviations_Regression' label is kept as is.
param_grid = {
    # L1 regularization strength (1e-3 .. 10). Larger values lead to stronger regularization.
    'alpha': np.logspace(-3, 1, 5),
    # Maximum number of iterations.
    'max_iter': [50, 100, 200, 400, 600],
    # True constrains the coefficients to be positive; False leaves them unconstrained.
    'positive': [True, False],
}

# Full and "lite" result records for this model; both carry the same label.
lad_model = {'model': 'Least_Absolute_Deviations_Regression'}
lad_model_lite = {'model': 'Least_Absolute_Deviations_Regression'}
outer_loop_arr, outer_loop_arr_lite = [], []

starting_time = time.time()

# One process_fold job (defined elsewhere in this notebook) per outer CV split, run on all cores.
outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(lad_reg, param_grid, fold_idx, train_idx, test_idx)
    for fold_idx, (train_idx, test_idx) in enumerate(outer_cv.split(X, y))
)

# Each job result carries the full record at index 0 and the lite record at index 1.
for fold_result in outer_loop_results:
    outer_loop_arr.append(fold_result[0])
    outer_loop_arr_lite.append(fold_result[1])

# Register the finished records in the notebook-wide collections.
lad_model['outer_loop'] = outer_loop_arr
lad_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(lad_model)
all_models_lite.append(lad_model_lite)

to_json(lad_model_lite, lad_model, 'lad')

# Elapsed wall-clock seconds, including the to_json call above.
ending_time = time.time()
print(ending_time - starting_time)

## Least Angle Regression

In [None]:
# Base least-angle regression (LARS) estimator with a fixed seed.
lars_reg = Lars(
    random_state=42,
)
# lars_reg.get_params()  # uncomment to list the tunable parameters

In [None]:
# Hyper-parameter candidates explored for least-angle regression.
param_grid = {
    # Machine-precision regularization used when computing the Cholesky diagonal factors
    # (a numerical-stability knob rather than an L2 penalty on the coefficients).
    'eps': [1e-5, 1e-4, 1e-3],
}

# Full and "lite" result records for this model; both carry the same label.
lars_model = {'model': 'Least_Angle_Regression'}
lars_model_lite = {'model': 'Least_Angle_Regression'}
outer_loop_arr, outer_loop_arr_lite = [], []

starting_time = time.time()

# One process_fold job (defined elsewhere in this notebook) per outer CV split, run on all cores.
outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(lars_reg, param_grid, fold_idx, train_idx, test_idx)
    for fold_idx, (train_idx, test_idx) in enumerate(outer_cv.split(X, y))
)

# Each job result carries the full record at index 0 and the lite record at index 1.
for fold_result in outer_loop_results:
    outer_loop_arr.append(fold_result[0])
    outer_loop_arr_lite.append(fold_result[1])

# Register the finished records in the notebook-wide collections.
lars_model['outer_loop'] = outer_loop_arr
lars_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(lars_model)
all_models_lite.append(lars_model_lite)

to_json(lars_model_lite, lars_model, 'lars')

# Elapsed wall-clock seconds, including the to_json call above.
ending_time = time.time()
print(ending_time - starting_time)

## Linear Regression

In [None]:
# Base ordinary-least-squares regressor; n_jobs=-1 allows the fit to use every available core.
linreg_reg = LinearRegression(
    n_jobs=-1,
)
# linreg_reg.get_params()  # uncomment to list the tunable parameters

In [None]:
# Hyper-parameter candidates explored for ordinary least squares.
param_grid = {
    # Whether to fit an intercept (bias) term: True includes it, False excludes it.
    'fit_intercept': [True, False],
}

# Full and "lite" result records for this model; both carry the same label.
linreg_model = {'model': 'Linear_Regression'}
linreg_model_lite = {'model': 'Linear_Regression'}
outer_loop_arr, outer_loop_arr_lite = [], []

print('loading bar:\n')
for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y)):
    # Slice the array versions of the data for this outer fold.
    X_outer_train, y_outer_train = Xarr[outer_train_index], yarr[outer_train_index]
    X_outer_test, y_outer_test = Xarr[outer_test_index], yarr[outer_test_index]

    # Search the grid on the outer-training part only, then score the winner on the fold
    # (grid_search and outer_metric are defined elsewhere in this notebook).
    best_model, best_param, param_arr, param_arr_lite = grid_search(
        linreg_reg, param_grid, X_outer_train, y_outer_train
    )
    outer_loop, outer_loop_lite = outer_metric(
        best_model, best_param,
        X_outer_train, y_outer_train, outer_train_index,
        X_outer_test, y_outer_test, outer_test_index,
        fold_num + 1,
    )

    # Attach the per-combination search results to the fold records before storing them.
    outer_loop['param_comb'] = param_arr
    outer_loop_lite['param_comb'] = param_arr_lite
    outer_loop_arr.append(outer_loop)
    outer_loop_arr_lite.append(outer_loop_lite)

    # Progress advances 10% per fold — assumes outer_cv yields 10 splits; TODO confirm.
    print(f'{(fold_num+1)*10}%')

linreg_model['outer_loop'] = outer_loop_arr
linreg_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(linreg_model)
all_models_lite.append(linreg_model_lite)

to_json(linreg_model_lite, linreg_model, 'linreg')

## LightGBM Regression

In [None]:
# Base LightGBM regressor using all cores and column-wise histogram building.
# NOTE(review): `import lightgbm as lgb` is commented out in the notebook's import cell; unless
# it is imported elsewhere, `lgb` is undefined on a fresh kernel and this cell raises NameError.
lgb_reg = lgb.LGBMRegressor(
    n_jobs=-1,
    force_col_wise=True,
)
# lgb_reg.get_params()  # uncomment to list the tunable parameters

In [None]:
# Hyper-parameter candidates explored for the LightGBM regressor.
param_grid = {
    # Number of boosting stages. Larger values may perform better but train longer.
    'n_estimators': [50, 100, 200, 400, 600],
    # Shrinkage applied to each tree. Smaller values shrink each tree's contribution more,
    # which can help prevent overfitting but may require more trees.
    'learning_rate': [0.01, 0.1, 1.0],
    # Maximum depth of individual trees. Deeper trees capture more complex relationships
    # and can overfit if too large.
    'max_depth': [1, 2, 3, 5, 7, 10, 15, 20, 25, 30],
    # Fraction of samples used for fitting each tree.
    # NOTE(review): LightGBM only applies row subsampling when subsample_freq > 0 (its default
    # is 0) — confirm this axis actually has an effect.
    'subsample': [0.7, 0.85, 1.0],
    # Fraction of features sampled per tree. LightGBM requires a float in (0, 1]; the
    # 'log' / 'sqrt' shortcuts belong to scikit-learn's max_features and are rejected here,
    # so only numeric fractions are searched.
    'colsample_bytree': [0.1, 0.2, 0.25, 0.33, 0.5],
}

# Full and "lite" result records for this model; both carry the same label.
lgb_model = {'model': 'LightGBM_Regression'}
lgb_model_lite = {'model': 'LightGBM_Regression'}
outer_loop_arr, outer_loop_arr_lite = [], []

print('loading bar:\n')
for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y)):
    # Slice the array versions of the data for this outer fold.
    X_outer_train, X_outer_test = Xarr[outer_train_index], Xarr[outer_test_index]
    y_outer_train, y_outer_test = yarr[outer_train_index], yarr[outer_test_index]

    # Search the grid on the outer-training part only, then score the winner on the fold
    # (grid_search and outer_metric are defined elsewhere in this notebook).
    best_model, best_param, param_arr, param_arr_lite = grid_search(
        lgb_reg, param_grid, X_outer_train, y_outer_train
    )
    outer_loop, outer_loop_lite = outer_metric(
        best_model, best_param,
        X_outer_train, y_outer_train, outer_train_index,
        X_outer_test, y_outer_test, outer_test_index,
        fold_num + 1,
    )

    # Attach the per-combination search results to the fold records before storing them.
    outer_loop['param_comb'] = param_arr
    outer_loop_arr.append(outer_loop)
    outer_loop_lite['param_comb'] = param_arr_lite
    outer_loop_arr_lite.append(outer_loop_lite)

    # Progress advances 10% per fold — assumes outer_cv yields 10 splits; TODO confirm.
    print(f'{(fold_num+1)*10}%')

lgb_model['outer_loop'] = outer_loop_arr
lgb_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(lgb_model)
all_models_lite.append(lgb_model_lite)

to_json(lgb_model_lite, lgb_model, 'lgb')

## Multi-layer Perceptron Regression

In [None]:
# Base multi-layer perceptron regressor with a fixed seed.
mlp_reg = MLPRegressor(
    random_state=42,
)
# mlp_reg.get_params()  # uncomment to list the tunable parameters

In [None]:
# Hyper-parameter candidates explored for the multi-layer perceptron.
param_grid = {
    # Single hidden layer with the given number of neurons. Larger values give a more complex model.
    'hidden_layer_sizes': [(50,), (100,), (150,), (200,), (250,)],
    # Hidden-layer activation. 'identity' returns its input as-is, 'relu' is the rectified linear unit.
    'activation': ['identity', 'logistic', 'tanh', 'relu'],
    # Optimization algorithm.
    'solver': ['lbfgs', 'sgd', 'adam'],
    # L2 regularization term (1e-3 .. 10). Larger values lead to stronger regularization.
    'alpha': np.logspace(-3, 1, 5),
    # Learning-rate schedule for weight updates (scikit-learn only uses it with solver='sgd').
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
    # Initial learning rate (scikit-learn only uses it with solver='sgd' or 'adam').
    'learning_rate_init': [0.001, 0.01, 0.1],
    # Maximum number of iterations.
    'max_iter': [50, 100, 200, 400, 600],
}

# Full and "lite" result records for this model; both carry the same label.
mlp_model = {'model': 'Multi-layer_Perceptron_Regression'}
mlp_model_lite = {'model': 'Multi-layer_Perceptron_Regression'}
outer_loop_arr, outer_loop_arr_lite = [], []

starting_time = time.time()

# One process_fold job (defined elsewhere in this notebook) per outer CV split, run on all cores.
outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(mlp_reg, param_grid, fold_idx, train_idx, test_idx)
    for fold_idx, (train_idx, test_idx) in enumerate(outer_cv.split(X, y))
)

# Each job result carries the full record at index 0 and the lite record at index 1.
for fold_result in outer_loop_results:
    outer_loop_arr.append(fold_result[0])
    outer_loop_arr_lite.append(fold_result[1])

# Register the finished records in the notebook-wide collections.
mlp_model['outer_loop'] = outer_loop_arr
mlp_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(mlp_model)
all_models_lite.append(mlp_model_lite)

to_json(mlp_model_lite, mlp_model, 'mlp')

# Elapsed wall-clock seconds, including the to_json call above.
ending_time = time.time()
print(ending_time - starting_time)

## Ordinal Regression

In [None]:
# Base ordinal ridge regressor with a fixed seed.
# NOTE(review): `from mord import OrdinalRidge` is commented out in the notebook's import cell;
# unless it is imported elsewhere, this cell raises NameError on a fresh kernel. Also confirm
# that OrdinalRidge accepts a random_state argument.
ordinal_reg = OrdinalRidge(
    random_state=42,
)
# ordinal_reg.get_params()  # uncomment to list the tunable parameters

In [None]:
# Hyper-parameter candidates explored for ordinal ridge regression.
param_grid = {
    # L2 regularization strength (1e-3 .. 10). Larger values lead to stronger regularization.
    'alpha': np.logspace(-3, 1, 5),
    # Maximum number of iterations.
    'max_iter': [50, 100, 200, 400, 600],
    # Solver algorithm. 'lsqr': least squares, 'sparse_cg': conjugate gradient,
    # 'sag': stochastic average gradient, 'saga': its improved variant.
    # NOTE(review): presumably the same solver set as sklearn's Ridge — confirm for OrdinalRidge.
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
}

# Full and "lite" result records for this model; both carry the same label.
ordinal_model = {'model': 'Ordinal_Regression'}
ordinal_model_lite = {'model': 'Ordinal_Regression'}
outer_loop_arr, outer_loop_arr_lite = [], []

starting_time = time.time()

# One process_fold job (defined elsewhere in this notebook) per outer CV split, run on all cores.
outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(ordinal_reg, param_grid, fold_idx, train_idx, test_idx)
    for fold_idx, (train_idx, test_idx) in enumerate(outer_cv.split(X, y))
)

# Each job result carries the full record at index 0 and the lite record at index 1.
for fold_result in outer_loop_results:
    outer_loop_arr.append(fold_result[0])
    outer_loop_arr_lite.append(fold_result[1])

# Register the finished records in the notebook-wide collections.
ordinal_model['outer_loop'] = outer_loop_arr
ordinal_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(ordinal_model)
all_models_lite.append(ordinal_model_lite)

to_json(ordinal_model_lite, ordinal_model, 'ordinal')

# Elapsed wall-clock seconds, including the to_json call above.
ending_time = time.time()
print(ending_time - starting_time)

## Orthogonal Matching Pursuit Regression

In [None]:
# Base orthogonal-matching-pursuit regressor with library defaults.
omp_reg = OrthogonalMatchingPursuit()

# Display the estimator's parameters as the cell output.
omp_reg.get_params()

In [None]:
# No hyper-parameters are tuned for orthogonal matching pursuit: the grid is intentionally empty,
# so process_fold receives the estimator with its default settings only.
param_grid = {}

# Full and "lite" result records for this model; both carry the same label.
omp_model = {'model': 'Orthogonal_Matching_Pursuit_Regression'}
omp_model_lite = {'model': 'Orthogonal_Matching_Pursuit_Regression'}
outer_loop_arr, outer_loop_arr_lite = [], []

starting_time = time.time()

# One process_fold job (defined elsewhere in this notebook) per outer CV split, run on all cores.
outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(omp_reg, param_grid, fold_idx, train_idx, test_idx)
    for fold_idx, (train_idx, test_idx) in enumerate(outer_cv.split(X, y))
)

# Each job result carries the full record at index 0 and the lite record at index 1.
for fold_result in outer_loop_results:
    outer_loop_arr.append(fold_result[0])
    outer_loop_arr_lite.append(fold_result[1])

# Register the finished records in the notebook-wide collections.
omp_model['outer_loop'] = outer_loop_arr
omp_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(omp_model)
all_models_lite.append(omp_model_lite)

to_json(omp_model_lite, omp_model, 'omp')

# Elapsed wall-clock seconds, including the to_json call above.
ending_time = time.time()
print(ending_time - starting_time)

## Passive Aggressive Regression

In [None]:
# Base passive-aggressive regressor with a fixed seed.
pa_reg = PassiveAggressiveRegressor(
    random_state=42,
)
# pa_reg.get_params()  # uncomment to list the tunable parameters

In [None]:
# Hyper-parameter candidates explored for the passive-aggressive regressor.
param_grid = {
    # Regularization parameter (maximum step size). Smaller values lead to stronger regularization.
    'C': [0.1, 0.5, 1, 2, 10, 100],
    # Maximum number of passes over the training data.
    'max_iter': [50, 100, 200, 400, 600],
    # Whether to shuffle the training data after each epoch.
    'shuffle': [True, False],
}

# Full and "lite" result records for this model; both carry the same label.
pa_model = {'model': 'Passive_Aggressive_Regression'}
pa_model_lite = {'model': 'Passive_Aggressive_Regression'}
outer_loop_arr, outer_loop_arr_lite = [], []

starting_time = time.time()

# One process_fold job (defined elsewhere in this notebook) per outer CV split, run on all cores.
outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(pa_reg, param_grid, fold_idx, train_idx, test_idx)
    for fold_idx, (train_idx, test_idx) in enumerate(outer_cv.split(X, y))
)

# Each job result carries the full record at index 0 and the lite record at index 1.
for fold_result in outer_loop_results:
    outer_loop_arr.append(fold_result[0])
    outer_loop_arr_lite.append(fold_result[1])

# Register the finished records in the notebook-wide collections.
pa_model['outer_loop'] = outer_loop_arr
pa_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(pa_model)
all_models_lite.append(pa_model_lite)

to_json(pa_model_lite, pa_model, 'pa')

# Elapsed wall-clock seconds, including the to_json call above.
ending_time = time.time()
print(ending_time - starting_time)

## RANSAC Regression

In [None]:
# Base RANSAC (random sample consensus) regressor with a fixed seed.
ransac_reg = RANSACRegressor(
    random_state=42,
)
# ransac_reg.get_params()  # uncomment to list the tunable parameters

In [None]:
# Hyper-parameter candidates explored for RANSAC.
# The notebook already relies on scikit-learn >= 1.2 (TweedieRegressor's 'newton-cholesky'
# solver), so the grid uses the parameter names and loss spellings valid in those releases.
param_grid = {
    # Estimator fitted on each random subset; None falls back to LinearRegression.
    # (`base_estimator` was renamed to `estimator` in scikit-learn 1.1 and removed in 1.3.)
    'estimator': [None, LinearRegression(), Ridge(alpha=1.0), Lasso()],
    # Minimum number (or fraction) of samples drawn to fit a candidate model.
    # None lets scikit-learn choose its default.
    'min_samples': [None, 0.1, 0.25, 0.5],
    # Maximum number of RANSAC iterations.
    'max_trials': [50, 100, 200, 400, 600],
    # Per-sample loss used to classify inliers. The old 'absolute_loss' / 'squared_loss'
    # names were removed in scikit-learn 1.2 in favour of the '*_error' spellings.
    'loss': ['absolute_error', 'squared_error'],
    # Maximum residual for a data point to count as an inlier. None uses the MAD of y.
    'residual_threshold': [None, 0.5, 1.0],
}

# Full and "lite" result records for this model; both carry the same label.
ransac_model = {'model': 'RANSAC_Regression'}
ransac_model_lite = {'model': 'RANSAC_Regression'}
outer_loop_arr, outer_loop_arr_lite = [], []

starting_time = time.time()

# One process_fold job (defined elsewhere in this notebook) per outer CV split, run on all cores.
outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(ransac_reg, param_grid, fold_num, outer_train_index, outer_test_index)
    for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y))
)

# Each job result carries the full record at index 0 and the lite record at index 1.
outer_loop_arr = [result[0] for result in outer_loop_results]
outer_loop_arr_lite = [result[1] for result in outer_loop_results]

# Register the finished records in the notebook-wide collections.
ransac_model['outer_loop'] = outer_loop_arr
ransac_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(ransac_model)
all_models_lite.append(ransac_model_lite)

to_json(ransac_model_lite, ransac_model, 'ransac')

# Elapsed wall-clock seconds, including the to_json call above.
ending_time = time.time()
print(ending_time - starting_time)

## Random Forest Regression

In [None]:
# Base random-forest regressor: all cores, fixed seed.
rf_reg = RandomForestRegressor(
    n_jobs=-1,
    random_state=42,
)
# rf_reg.get_params()  # uncomment to list the tunable parameters

In [None]:
# Hyper-parameter candidates explored for the random forest.
param_grid = {
    # Number of trees in the forest. More trees average out more variance but train longer.
    'n_estimators': [50, 100, 200, 400, 600],
    # Maximum depth of the trees. Deeper trees capture more complex patterns but may overfit;
    # smaller values lead to stronger regularization.
    'max_depth': [1, 2, 3, 5, 7, 10, 15, 20, 25, 30],
    # Minimum samples required to split an internal node. Larger values lead to stronger
    # regularization.
    'min_samples_split': [2, 5, 10, 15, 20],
    # Number (or fraction) of features considered per split. Smaller values lead to stronger
    # regularization. scikit-learn accepts 'sqrt' and 'log2' as strings; 'log' is not a valid
    # option and is rejected by parameter validation, so 'log2' is used.
    'max_features': ['log2', 'sqrt', 0.1, 0.2, 0.25, 0.33, 0.5],
}

# Full and "lite" result records for this model; both carry the same label.
rf_model = {'model': 'Random_Forest_Regression'}
rf_model_lite = {'model': 'Random_Forest_Regression'}
outer_loop_arr, outer_loop_arr_lite = [], []

print('loading bar:\n')
for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y)):
    # Slice the array versions of the data for this outer fold.
    X_outer_train, X_outer_test = Xarr[outer_train_index], Xarr[outer_test_index]
    y_outer_train, y_outer_test = yarr[outer_train_index], yarr[outer_test_index]

    # Search the grid on the outer-training part only, then score the winner on the fold
    # (grid_search and outer_metric are defined elsewhere in this notebook).
    best_model, best_param, param_arr, param_arr_lite = grid_search(
        rf_reg, param_grid, X_outer_train, y_outer_train
    )
    outer_loop, outer_loop_lite = outer_metric(
        best_model, best_param,
        X_outer_train, y_outer_train, outer_train_index,
        X_outer_test, y_outer_test, outer_test_index,
        fold_num + 1,
    )

    # Attach the per-combination search results to the fold records before storing them.
    outer_loop['param_comb'] = param_arr
    outer_loop_arr.append(outer_loop)
    outer_loop_lite['param_comb'] = param_arr_lite
    outer_loop_arr_lite.append(outer_loop_lite)

    # Progress advances 10% per fold — assumes outer_cv yields 10 splits; TODO confirm.
    print(f'{(fold_num+1)*10}%')

rf_model['outer_loop'] = outer_loop_arr
rf_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(rf_model)
all_models_lite.append(rf_model_lite)

to_json(rf_model_lite, rf_model, 'rf')

## Ridge Regression

In [None]:
# Base ridge (L2-penalised linear) regressor with a fixed seed.
ridge_reg = Ridge(
    random_state=42,
)
# ridge_reg.get_params()  # uncomment to list the tunable parameters

In [None]:
# Hyper-parameter candidates explored for ridge regression.
param_grid = {
    # L2 regularization strength (1e-3 .. 10). Smaller values lead to weaker regularization.
    'alpha': np.logspace(-3, 1, 5),
    # Algorithm used for the optimization. 'lbfgs' is left out on purpose: scikit-learn only
    # allows it together with positive=True, and ridge_reg keeps the default positive=False,
    # so every 'lbfgs' combination would raise a ValueError at fit time.
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
    # Maximum number of optimization iterations (only used by the iterative solvers).
    'max_iter': [50, 100, 200, 400, 600],
}

# Full and "lite" result records for this model; both carry the same label.
ridge_model = {'model': 'Ridge_Regression'}
ridge_model_lite = {'model': 'Ridge_Regression'}
outer_loop_arr, outer_loop_arr_lite = [], []

starting_time = time.time()

# One process_fold job (defined elsewhere in this notebook) per outer CV split, run on all cores.
outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(ridge_reg, param_grid, fold_num, outer_train_index, outer_test_index)
    for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y))
)

# Each job result carries the full record at index 0 and the lite record at index 1.
outer_loop_arr = [result[0] for result in outer_loop_results]
outer_loop_arr_lite = [result[1] for result in outer_loop_results]

# Register the finished records in the notebook-wide collections.
ridge_model['outer_loop'] = outer_loop_arr
ridge_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(ridge_model)
all_models_lite.append(ridge_model_lite)

to_json(ridge_model_lite, ridge_model, 'ridge')

# Elapsed wall-clock seconds, including the to_json call above.
ending_time = time.time()
print(ending_time - starting_time)

## SGD Regression

In [None]:
# Base stochastic-gradient-descent linear regressor with a fixed seed.
sgd_reg = SGDRegressor(
    random_state=42,
)
# sgd_reg.get_params()  # uncomment to list the tunable parameters

In [None]:
# Hyper-parameter candidates explored for the SGD regressor.
# The notebook already relies on scikit-learn >= 1.2 (TweedieRegressor's 'newton-cholesky'
# solver), where the old 'squared_loss' name no longer exists.
param_grid = {
    # Loss function to optimize. 'squared_error' is the current name of the former 'squared_loss'.
    'loss': ['squared_error', 'huber', 'epsilon_insensitive'],
    # Penalty term used for regularization.
    'penalty': ['l1', 'l2', 'elasticnet'],
    # Regularization strength (1e-3 .. 10). Larger values lead to stronger regularization.
    'alpha': np.logspace(-3, 1, 5),
    # Maximum number of passes over the training data.
    'max_iter': [50, 100, 200, 400, 600],
}

# Full and "lite" result records for this model; both carry the same label.
sgd_model = {'model': 'SGD_Regression'}
sgd_model_lite = {'model': 'SGD_Regression'}
outer_loop_arr, outer_loop_arr_lite = [], []

starting_time = time.time()

# One process_fold job (defined elsewhere in this notebook) per outer CV split, run on all cores.
outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(sgd_reg, param_grid, fold_num, outer_train_index, outer_test_index)
    for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y))
)

# Each job result carries the full record at index 0 and the lite record at index 1.
outer_loop_arr = [result[0] for result in outer_loop_results]
outer_loop_arr_lite = [result[1] for result in outer_loop_results]

# Register the finished records in the notebook-wide collections.
sgd_model['outer_loop'] = outer_loop_arr
sgd_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(sgd_model)
all_models_lite.append(sgd_model_lite)

to_json(sgd_model_lite, sgd_model, 'sgd')

# Elapsed wall-clock seconds, including the to_json call above.
ending_time = time.time()
print(ending_time - starting_time)

## Support Vector Regression

In [None]:
# Base epsilon-support-vector regressor with library defaults.
svr_reg = SVR()
# svr_reg.get_params()  # uncomment to list the tunable parameters

In [None]:
# Hyper-parameter candidates explored for support vector regression.
param_grid = {
    # Kernel used to map the data into a higher-dimensional space.
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    # Regularization parameter. Larger values allow a more flexible fit but may overfit.
    'C': [0.1, 0.5, 1, 2, 10, 100],
    # Width of the epsilon-insensitive tube. Larger values give a wider tolerance zone.
    'epsilon': [0.01, 0.1, 0.5],
    # Degree of the polynomial kernel (only used with the 'poly' kernel).
    'degree': [2, 3, 4],
    # Kernel coefficient for the 'rbf', 'poly' and 'sigmoid' kernels. Smaller values give a
    # smoother fit; larger values fit the training data more tightly and can overfit.
    # 'scale' means 1 / (n_features * X.var()), 'auto' means 1 / n_features.
    'gamma': ['scale', 'auto'] + [0.001, 0.01, 0.1, 1, 10],
}

# Full and "lite" result records for this model; both carry the same label.
svr_model = {'model': 'Support_Vector_Regression'}
svr_model_lite = {'model': 'Support_Vector_Regression'}
outer_loop_arr, outer_loop_arr_lite = [], []

starting_time = time.time()

# One process_fold job (defined elsewhere in this notebook) per outer CV split, run on all cores.
outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(svr_reg, param_grid, fold_idx, train_idx, test_idx)
    for fold_idx, (train_idx, test_idx) in enumerate(outer_cv.split(X, y))
)

# Each job result carries the full record at index 0 and the lite record at index 1.
for fold_result in outer_loop_results:
    outer_loop_arr.append(fold_result[0])
    outer_loop_arr_lite.append(fold_result[1])

# Register the finished records in the notebook-wide collections.
svr_model['outer_loop'] = outer_loop_arr
svr_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(svr_model)
all_models_lite.append(svr_model_lite)

to_json(svr_model_lite, svr_model, 'svr')

# Elapsed wall-clock seconds, including the to_json call above.
ending_time = time.time()
print(ending_time - starting_time)

## Theil Sen Regression

In [None]:
# Base Theil-Sen robust regressor: fixed seed, all cores.
theilsen_reg = TheilSenRegressor(
    random_state=42,
    n_jobs=-1,
)
# theilsen_reg.get_params()  # uncomment to list the tunable parameters

In [None]:
# Hyper-parameter candidates explored for the Theil-Sen regressor.
param_grid = {
    # Maximum number of iterations (used for the spatial-median calculation).
    'max_iter': [50, 100, 200, 400, 600],
}

# Full and "lite" result records for this model; both carry the same label.
theilsen_model = {'model': 'TheilSen_Regression'}
theilsen_model_lite = {'model': 'TheilSen_Regression'}
outer_loop_arr, outer_loop_arr_lite = [], []

print('loading bar:\n')
for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y)):
    # Slice the array versions of the data for this outer fold.
    X_outer_train, y_outer_train = Xarr[outer_train_index], yarr[outer_train_index]
    X_outer_test, y_outer_test = Xarr[outer_test_index], yarr[outer_test_index]

    # Search the grid on the outer-training part only, then score the winner on the fold
    # (grid_search and outer_metric are defined elsewhere in this notebook).
    best_model, best_param, param_arr, param_arr_lite = grid_search(
        theilsen_reg, param_grid, X_outer_train, y_outer_train
    )
    outer_loop, outer_loop_lite = outer_metric(
        best_model, best_param,
        X_outer_train, y_outer_train, outer_train_index,
        X_outer_test, y_outer_test, outer_test_index,
        fold_num + 1,
    )

    # Attach the per-combination search results to the fold records before storing them.
    outer_loop['param_comb'] = param_arr
    outer_loop_lite['param_comb'] = param_arr_lite
    outer_loop_arr.append(outer_loop)
    outer_loop_arr_lite.append(outer_loop_lite)

    # Progress advances 10% per fold — assumes outer_cv yields 10 splits; TODO confirm.
    print(f'{(fold_num+1)*10}%')

theilsen_model['outer_loop'] = outer_loop_arr
theilsen_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(theilsen_model)
all_models_lite.append(theilsen_model_lite)

to_json(theilsen_model_lite, theilsen_model, 'theilsen')

## Tweedie Regression

In [None]:
# Base Tweedie generalized linear model with library defaults.
tw_reg = TweedieRegressor()
# tw_reg.get_params()  # uncomment to list the tunable parameters

In [None]:
# Hyper-parameter candidates explored for the Tweedie regressor.
param_grid = {
    # Tweedie power: 0 is the Normal, 1 the Poisson and 2 the Gamma distribution.
    # NOTE(review): Poisson needs non-negative and Gamma strictly positive targets — confirm y.
    'power': [0, 1, 2],
    # L2 regularization strength (1e-3 .. 10). Larger values lead to stronger regularization.
    'alpha': np.logspace(-3, 1, 5),
    # Solver algorithm.
    'solver': ['newton-cholesky', 'lbfgs'],
    # Maximum number of solver iterations.
    'max_iter': [50, 100, 200, 400, 600],
}

# Full and "lite" result records for this model; both carry the same label.
tw_model = {'model': 'Tweedie_Regression'}
tw_model_lite = {'model': 'Tweedie_Regression'}
outer_loop_arr, outer_loop_arr_lite = [], []

starting_time = time.time()

# One process_fold job (defined elsewhere in this notebook) per outer CV split, run on all cores.
outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(tw_reg, param_grid, fold_idx, train_idx, test_idx)
    for fold_idx, (train_idx, test_idx) in enumerate(outer_cv.split(X, y))
)

# Each job result carries the full record at index 0 and the lite record at index 1.
for fold_result in outer_loop_results:
    outer_loop_arr.append(fold_result[0])
    outer_loop_arr_lite.append(fold_result[1])

# Register the finished records in the notebook-wide collections.
tw_model['outer_loop'] = outer_loop_arr
tw_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(tw_model)
all_models_lite.append(tw_model_lite)

to_json(tw_model_lite, tw_model, 'tw')

# Elapsed wall-clock seconds, including the to_json call above.
ending_time = time.time()
print(ending_time - starting_time)

## XGBoost Regression

In [None]:
# Base XGBoost regressor; n_jobs=-1 lets training use every available core.
xgb_reg = xgb.XGBRegressor(
    n_jobs=-1,
)

# Display the estimator's parameters as the cell output.
xgb_reg.get_params()

In [None]:
# Hyper-parameter candidates explored for the XGBoost regressor.
param_grid = {
    # Number of boosting stages. Larger values may perform better but train longer.
    'n_estimators': [50, 100, 200, 400, 600],
    # Shrinkage parameter controlling the learning rate. Smaller values reduce overfitting.
    'learning_rate': [0.01, 0.1, 1.0],
    # Maximum depth of individual trees. Deeper trees capture more complex relationships
    # and can overfit if too large.
    'max_depth': [1, 3, 5, 7, 10, 15, 20],
    # Fraction of samples used for fitting each tree. Smaller values reduce overfitting risk.
    'subsample': [0.7, 0.85, 1.0],
    # Fraction of features sampled per tree. XGBoost requires a float in (0, 1]; the
    # 'log' / 'sqrt' shortcuts belong to scikit-learn's max_features and make XGBoost raise,
    # so only numeric fractions are searched.
    'colsample_bytree': [0.1, 0.2, 0.25, 0.33, 0.5],
}

# Full and "lite" result records for this model; both carry the same label.
xgb_model = {'model': 'XGBoost_Regression'}
xgb_model_lite = {'model': 'XGBoost_Regression'}
outer_loop_arr, outer_loop_arr_lite = [], []

print('loading bar:\n')
for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y)):
    # Slice the array versions of the data for this outer fold.
    X_outer_train, X_outer_test = Xarr[outer_train_index], Xarr[outer_test_index]
    y_outer_train, y_outer_test = yarr[outer_train_index], yarr[outer_test_index]

    # Search the grid on the outer-training part only, then score the winner on the fold
    # (grid_search and outer_metric are defined elsewhere in this notebook).
    best_model, best_param, param_arr, param_arr_lite = grid_search(
        xgb_reg, param_grid, X_outer_train, y_outer_train
    )
    outer_loop, outer_loop_lite = outer_metric(
        best_model, best_param,
        X_outer_train, y_outer_train, outer_train_index,
        X_outer_test, y_outer_test, outer_test_index,
        fold_num + 1,
    )

    # Attach the per-combination search results to the fold records before storing them.
    outer_loop['param_comb'] = param_arr
    outer_loop_arr.append(outer_loop)
    outer_loop_lite['param_comb'] = param_arr_lite
    outer_loop_arr_lite.append(outer_loop_lite)

    # Progress advances 10% per fold — assumes outer_cv yields 10 splits; TODO confirm.
    print(f'{(fold_num+1)*10}%')

xgb_model['outer_loop'] = outer_loop_arr
xgb_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(xgb_model)
all_models_lite.append(xgb_model_lite)

to_json(xgb_model_lite, xgb_model, 'xgb')

## making the json file

In [None]:


# Serialize the complete per-model results with the custom NumPy-aware encoder.
json_array = json.dumps(all_models, cls=NumpyArrayEncoder,indent=2)

# Save the full results to their own file. This dump used to target 'allmodels_lite.json'
# as well, so it was overwritten by the lite dump below and the full results were never kept.
with open(f'result_dataset{momental_dataset}/allmodels.json', 'w') as json_file:
    json_file.write(json_array)


# Serialize the reduced ("lite") results the same way.
json_array = json.dumps(all_models_lite, cls=NumpyArrayEncoder,indent=2)

# Save the lite results next to the full ones.
with open(f'result_dataset{momental_dataset}/allmodels_lite.json', 'w') as json_file:
    json_file.write(json_array)
