In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score,cross_val_predict,cross_validate,KFold
import time
import copy
from itertools import product
from joblib import Parallel, delayed

from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, HuberRegressor, TheilSenRegressor, ARDRegression, HuberRegressor, \
LassoLars, Lars, OrthogonalMatchingPursuit, BayesianRidge, LogisticRegression, TweedieRegressor, PoissonRegressor, GammaRegressor, SGDRegressor, \
PassiveAggressiveRegressor, RANSACRegressor, QuantileRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, RandomForestRegressor, HistGradientBoostingRegressor, BaggingRegressor
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, Matern, ConstantKernel as C
from sklearn.cross_decomposition import PLSRegression
from mord import OrdinalRidge
from sklearn.isotonic import IsotonicRegression

from sklearn.metrics import mean_absolute_error, mean_squared_error, median_absolute_error, mean_squared_log_error, explained_variance_score
from scipy.stats import percentileofscore, norm, jarque_bera, ks_2samp
from sklearn.metrics import cohen_kappa_score, r2_score
from statsmodels.stats.stattools import durbin_watson

In [2]:
def scoring_metrics(y_true, y_pred):
    
    # Mean Absolute Error (MAE)
    mae = mean_absolute_error(y_true, y_pred)
    # Mean Squared Error (MSE)
    mse = mean_squared_error(y_true, y_pred)
    # Root Mean Squared Error (RMSE)
    rmse = np.sqrt(mse)
    # Mean Squared Logarithmic Error (MSLE)
    #msle = mean_squared_log_error(y_true, y_pred)
    # Median Absolute Error
    median_ae = median_absolute_error(y_true, y_pred)
    # Mean Percentage Error (MPE)
    mpe = np.mean((y_true - y_pred) / y_true) * 100
    # Mean Absolute Percentage Error (MAPE)
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    # Symmetric Mean Absolute Percentage Error (SMAPE)
    smape = np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_pred) + np.abs(y_true))) * 100
    # Relative Squared Error (RSE)
    rse = np.sum(np.square(y_true - y_pred)) / np.sum(np.square(y_true - np.mean(y_true)))
    # Theil's U statistic
    theil_u = np.sqrt(np.sum(np.square(y_pred - y_true)) / np.sum(np.square(y_true)))
    # Mean Error (ME)
    me = np.mean(y_pred - y_true)
    # Adjusted R-squared
    adj_r2 = 1 - (1 - r2_score(y_true, y_pred)) * (len(y_true) - 1) / (len(y_true) - len(y_pred) - 1)
    # Explained Variance Score
    explained_var = explained_variance_score(y_true, y_pred)
    # Durbin-Watson Statistic
    dw_statistic = durbin_watson(y_pred - y_true)
    # Jarque-Bera Test
    jb_statistic, jb_p_value = jarque_bera(y_pred - y_true)
    # Cohen's Kappa
    kappa = cohen_kappa_score(np.round(y_true), np.round(y_pred))
    # Kolmogorov-Smirnov Statistic
    ks_statistic, ks_p_value = ks_2samp(y_true, y_pred)
    # Pseudo R-squared (Nagelkerke's R-squared)
    r2 = r2_score(y_true, y_pred)
    
    scores = {
        'Mean_Absolute_Error': mae,
        'Mean_Squared_Error': mse,
        'Root_Mean_Squared_Error': rmse,
        #'Mean_Squared_Logarithmic_Error': msle,
        'Median_Absolute_Error': median_ae,
        'Mean_Percentage_Error': mpe,
        'Mean_Absolute_Percentage_Error': mape,
        'Symmetric_Mean_Absolute_Percentage_Error': smape,
        'Relative_Squared_Error': rse,
        'Theils_U': theil_u,
        'Mean_Error': me,
        'Adjusted_R-squared': adj_r2,
        'Explained_Variance_Score': explained_var,
        'Durbin-Watson_Statistic': dw_statistic,
        'Jarque-Bera_Test_Statistic': jb_statistic,
        'Cohens_Kappa': kappa,
        'Kolmogorov-Smirnov_Statistic': ks_statistic,
        'R-squared': r2,
    }
    return scores

In [3]:
def grid_search(model, param_grid, X_outter, y_outter):
    best_score = None
    best_params = None
    best_model = None
    param_comb_arr = []
    param_comb_arr_lite = []
    # Generate all possible combinations of hyperparameters
    all_params = list(product(*param_grid.values()))

    for params in all_params:
        # Create a dictionary of hyperparameter values
        param_dict = {param_name: param_value for param_name, param_value in zip(param_grid.keys(), params)}
        param_comb = {}
        param_comb_lite = {}
        param_comb['params'] = param_dict
        param_comb_lite['params'] = param_dict
        
        # Set the model hyperparameters
        model.set_params(**param_dict)
        inner_fold_arr = []
        inner_fold_arr_lite = []
        scores = []
        for fold_num, (inner_train_index, inner_test_index) in enumerate(inner_cv.split(X_outter, y_outter)):
            X_inner_train, X_inner_test = X_outter[inner_train_index], X_outter[inner_test_index]
            y_inner_train, y_inner_test = y_outter[inner_train_index], y_outter[inner_test_index]
            
            inner_fold = {}
            inner_fold_lite = {}
            inner_fold['fold_num'] = fold_num+1
            inner_fold_lite['fold_num'] = fold_num+1
            test_row = {}
            train_row = {}
            
            start_time = time.time()
            model.fit(X_inner_train, y_inner_train)
            end_time = time.time()
            train_row['fit_time']=end_time - start_time
            
            start_time = time.time()
            y_pred = model.predict(X_inner_train)
            end_time = time.time()
            train_row['pred_time']=end_time - start_time
            

            train_row.update(scoring_metrics(y_inner_train,y_pred))
            
            inner_fold_lite['train'] =  copy.deepcopy(train_row)
            train_row['y'] = y_inner_train
            train_row['y_pred'] = y_pred
            train_row['indices'] = inner_train_index

            inner_fold['train']=train_row

            start_time = time.time()
            y_pred = model.predict(X_inner_test)
            end_time = time.time()
            test_row['pred_time']=end_time - start_time

            
            test_metrics = scoring_metrics(y_inner_test,y_pred)
            test_row.update(test_metrics)
            scores.append(test_metrics)
            
            inner_fold_lite['test']=copy.deepcopy(test_row)
            
            test_row['y'] = y_inner_test
            test_row['y_pred'] = y_pred
            test_row['indices'] = inner_test_index

            inner_fold['test']=test_row
            inner_fold_arr.append(inner_fold)
            inner_fold_arr_lite.append(inner_fold_lite)
            
        total_MSE = 0.0
        for data in scores:
            total_MSE += data['Mean_Squared_Error']
        score_MSE = total_MSE / 3

        if best_score is None or score_MSE > best_score:
            best_score = score_MSE
            best_params = param_dict
            best_model = copy.deepcopy(model)
        param_comb['inner_fold'] = inner_fold_arr
        param_comb_lite['inner_fold'] = inner_fold_arr_lite
        param_comb_arr.append(param_comb)
        param_comb_arr_lite.append(param_comb_lite)

    return best_model,best_params,param_comb_arr,param_comb_arr_lite

In [4]:

def outer_metric(model, params, X_out_train, y_out_train,idx_train,X_out_test,y_out_test,idx_test,outer_fold_num):
  outer_fold = {}
  outer_fold_lite = {}
  test_row = {}
  train_row = {}
  
  outer_fold['fold_num']= int(outer_fold_num)
  outer_fold['best_params'] = params
  outer_fold_lite['fold_num']= int(outer_fold_num)
  outer_fold_lite['best_params'] = params
  
  start_time = time.time()
  model.fit(X_out_train, y_out_train)
  end_time = time.time()
  train_row['fit_time']=end_time - start_time

  start_time = time.time()
  y_pred = model.predict(X_out_train)
  end_time = time.time()
  train_row['pred_time']=end_time - start_time

  train_row.update(scoring_metrics(y_out_train,y_pred))

  outer_fold_lite['train']=copy.deepcopy(train_row)
  
  train_row['y'] = y_out_train
  train_row['y_pred'] = y_pred
  train_row['indices'] = idx_train

  outer_fold['train']=train_row

  start_time = time.time()
  y_pred = model.predict(X_out_test)
  end_time = time.time()
  test_row['pred_time']=end_time - start_time

  test_row.update(scoring_metrics(y_out_test,y_pred))

  outer_fold_lite['test']=copy.deepcopy(test_row)
  
  test_row['y'] = y_out_test
  test_row['y_pred'] = y_pred
  test_row['indices'] = idx_test

  outer_fold['test']=test_row
  return outer_fold,outer_fold_lite

In [5]:
def process_fold(model,fold_num, outer_train_index, outer_test_index):
    X_outer_train, X_outer_test = Xarr[outer_train_index], Xarr[outer_test_index]
    y_outer_train, y_outer_test = yarr[outer_train_index], yarr[outer_test_index]

    best_model, best_param, param_arr, param_arr_lite = grid_search(model, param_grid, X_outer_train, y_outer_train)
    outer_loop, outer_loop_lite = outer_metric(best_model, best_param, X_outer_train, y_outer_train, outer_train_index,
                                                X_outer_test, y_outer_test, outer_test_index, fold_num + 1)
    outer_loop['param_comb'] = param_arr
    outer_loop_lite['param_comb'] = param_arr_lite
    print(f'Finished fold {fold_num + 1}')
    return outer_loop, outer_loop_lite

In [21]:
df = pd.read_csv('../dataset2/dataset2.csv')
df.head()

Unnamed: 0,ogc_fid,lsoa11cd,lsoa11nm,lsoa11nmw,GPRegPop,Hypertens,Anxiety,Depression,Asthma,Obesity,Diabetes,CHD,Fall,Cancer,CKD,COPD,Stroke_TIA,AF
0,5601,E01005761,Stockport 026A,Stockport 026A,1495,165,163,175,110,102,63,47,51,25,17,36,33,19
1,5710,E01005870,Stockport 015A,Stockport 015A,1457,203,191,173,92,90,62,58,80,38,35,35,33,29
2,5711,E01005871,Stockport 015B,Stockport 015B,1343,190,191,171,104,96,68,53,50,25,24,29,23,21
3,5583,E01005743,Stockport 009A,Stockport 009A,1391,269,131,100,102,125,54,51,33,43,39,31,23,18
4,5584,E01005744,Stockport 009B,Stockport 009B,1459,265,161,133,92,124,96,87,85,41,58,50,51,38


In [22]:
are_different = (df['lsoa11nm'] != df['lsoa11nmw']).any()

# Print the result
if are_different:
    print("lsoa11nm and lsoa11nmw have different values in some instances.")
else:
    print("lsoa11nm and lsoa11nmw have the same values in all instances.")

lsoa11nm and lsoa11nmw have the same values in all instances.


In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 190 entries, 0 to 189
Data columns (total 18 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ogc_fid     190 non-null    int64 
 1   lsoa11cd    190 non-null    object
 2   lsoa11nm    190 non-null    object
 3   lsoa11nmw   190 non-null    object
 4   GPRegPop    190 non-null    int64 
 5   Hypertens   190 non-null    int64 
 6   Anxiety     190 non-null    int64 
 7   Depression  190 non-null    int64 
 8   Asthma      190 non-null    int64 
 9   Obesity     190 non-null    int64 
 10  Diabetes    190 non-null    int64 
 11  CHD         190 non-null    int64 
 12  Fall        190 non-null    int64 
 13  Cancer      190 non-null    int64 
 14  CKD         190 non-null    int64 
 15  COPD        190 non-null    int64 
 16  Stroke_TIA  190 non-null    int64 
 17  AF          190 non-null    int64 
dtypes: int64(15), object(3)
memory usage: 26.8+ KB


In [24]:
df = df.drop(["ogc_fid","lsoa11cd","lsoa11nm","lsoa11nmw"],axis = 1)
columns=df.columns

In [25]:
scaler = StandardScaler()
scaler.fit(df)
scaled_data = scaler.transform(df)
scaled_df = pd.DataFrame(scaled_data, columns=df.columns)
scaled_df.head()

Unnamed: 0,GPRegPop,Hypertens,Anxiety,Depression,Asthma,Obesity,Diabetes,CHD,Fall,Cancer,CKD,COPD,Stroke_TIA,AF
0,-0.238206,-1.442131,0.217429,0.456886,0.306791,0.051091,-0.771539,-0.843554,-0.386809,-1.443102,-1.740483,-0.008236,0.056659,-1.128662
1,-0.452517,-0.518492,0.718173,0.425053,-0.486275,-0.271591,-0.821282,-0.212549,0.693433,-0.402687,-0.284404,-0.057136,0.056659,-0.208689
2,-1.095451,-0.834473,0.718173,0.39322,0.042436,-0.11025,-0.522825,-0.49937,-0.424058,-1.443102,-1.17423,-0.350538,-0.784368,-0.944667
3,-0.824742,1.085724,-0.354851,-0.736848,-0.045682,0.669567,-1.219226,-0.614098,-1.057303,-0.002527,0.039169,-0.252737,-0.784368,-1.22066
4,-0.441238,0.988499,0.181661,-0.211605,-0.486275,0.642677,0.869978,1.45101,0.879681,-0.162591,1.576142,0.676369,1.570506,0.619287


In [27]:
X = scaled_df.drop("Stroke_TIA", axis=1)
y = scaled_df["Stroke_TIA"]

# Convert the features and labels to numpy arrays
Xarr = np.array(X)
yarr = np.array(y)
outer_cv = KFold(n_splits=10, shuffle=True, random_state=42)
inner_cv = KFold(n_splits=3, shuffle=True, random_state=42)

In [28]:
all_models = []
all_models_lite = []

## Linear Regresion

In [29]:
linreg_reg = LinearRegression(n_jobs = -1)
linreg_reg.get_params()

{'copy_X': True, 'fit_intercept': True, 'n_jobs': -1, 'positive': False}

In [30]:
param_grid = {
    'fit_intercept': [True, False],
}
linreg_model = {}
linreg_model['model']='Linear_Regression'
outer_loop_arr = []
linreg_model_lite = {}
linreg_model_lite['model']='Linear_Regression'
outer_loop_arr_lite = []
print('loading bar:\n')
for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y)):
    X_outer_train, X_outer_test = Xarr[outer_train_index], Xarr[outer_test_index]
    y_outer_train, y_outer_test = yarr[outer_train_index], yarr[outer_test_index]
    
    best_model,best_param,param_arr,param_arr_lite = grid_search(linreg_reg, param_grid, X_outer_train, y_outer_train)
    outer_loop,outer_loop_lite=outer_metric(best_model, best_param, X_outer_train, y_outer_train,outer_train_index,\
                 X_outer_test,y_outer_test,outer_test_index,fold_num + 1)
    outer_loop['param_comb']= param_arr
    outer_loop_arr.append(outer_loop)
    outer_loop_lite['param_comb']= param_arr_lite
    outer_loop_arr_lite.append(outer_loop_lite)
    print(f'{(fold_num+1)*10}%')
linreg_model['outer_loop']= outer_loop_arr
linreg_model_lite['outer_loop']= outer_loop_arr_lite
all_models.append(linreg_model)
all_models_lite.append(linreg_model_lite)

loading bar:

10%
20%
30%
40%
50%
60%
70%
80%
90%
100%


## Ridge Regression

In [32]:
ridge_reg = Ridge(random_state = 42)
ridge_reg.get_params()

{'alpha': 1.0,
 'copy_X': True,
 'fit_intercept': True,
 'max_iter': None,
 'positive': False,
 'random_state': 42,
 'solver': 'auto',
 'tol': 0.0001}

In [33]:
param_grid = {
    #'alpha': [0.01, 0.1, 0.3, 0.5, 1.0, 2.0, 10.0],
    #'solver' : ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs'],
    'tol': [1e-4, 1e-3, 1e-2],
    #'max_iter': [None, 150, 500, 1000],
}
ridge_model = {}
ridge_model['model']='Ridge_Regression'
outer_loop_arr = []
ridge_model_lite = {}
ridge_model_lite['model']='Ridge_Regression'
outer_loop_arr_lite = []

starting_time = time.time()

outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(ridge_reg, fold_num, outer_train_index, outer_test_index)
    for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y))
)

# Extract the results for each fold
outer_loop_arr = [result[0] for result in outer_loop_results]
outer_loop_arr_lite = [result[1] for result in outer_loop_results]

# Continue with appending the results to your all_models and all_models_lite lists
ridge_model['outer_loop'] = outer_loop_arr
ridge_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(ridge_model)
all_models_lite.append(ridge_model_lite)

ending_time = time.time()
print(ending_time - starting_time)

3.8659279346466064


## Lasso Regression

In [34]:
lasso_reg = Lasso(random_state = 42)
lasso_reg.get_params()

{'alpha': 1.0,
 'copy_X': True,
 'fit_intercept': True,
 'max_iter': 1000,
 'positive': False,
 'precompute': False,
 'random_state': 42,
 'selection': 'cyclic',
 'tol': 0.0001,
 'warm_start': False}

In [35]:
param_grid = {
    #'alpha': [0.01, 0.1, 0.3, 0.5, 1.0, 2.0, 10.0],
    'tol': [1e-4, 1e-3, 1e-2],
    #'max_iter': [None, 100, 300, 500, 1000, 1500],
    #'selection' : ['cyclic', 'random'],
}
lasso_model = {}
lasso_model['model']='Lasso_Regression'
outer_loop_arr = []
lasso_model_lite = {}
lasso_model_lite['model']='Lasso_Regression'
outer_loop_arr_lite = []

starting_time = time.time()

outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(lasso_reg, fold_num, outer_train_index, outer_test_index)
    for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y))
)

# Extract the results for each fold
outer_loop_arr = [result[0] for result in outer_loop_results]
outer_loop_arr_lite = [result[1] for result in outer_loop_results]

# Continue with appending the results to your all_models and all_models_lite lists
lasso_model['outer_loop'] = outer_loop_arr
lasso_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(lasso_model)
all_models_lite.append(lasso_model_lite)

ending_time = time.time()
print(ending_time - starting_time)

3.3753480911254883


## Elastic Net Regression

In [36]:
elnet_reg = ElasticNet(random_state = 42)
elnet_reg.get_params()

{'alpha': 1.0,
 'copy_X': True,
 'fit_intercept': True,
 'l1_ratio': 0.5,
 'max_iter': 1000,
 'positive': False,
 'precompute': False,
 'random_state': 42,
 'selection': 'cyclic',
 'tol': 0.0001,
 'warm_start': False}

In [37]:
param_grid = {
    #'alpha': [0.01, 0.1, 0.3, 0.5, 1.0, 2.0, 10.0],
    'tol': [1e-4, 1e-3, 1e-2],
    #'l1_ratio' : [0, 0.2, 0.5, 0.7, 1],
    #'max_iter': [None, 100, 300, 500, 1000, 1500],
    'selection' : ['cyclic', 'random'],
}

elnet_model = {}
elnet_model['model']='Elastic_Net_Regression'
outer_loop_arr = []
elnet_model_lite = {}
elnet_model_lite['model']='Elastic_Net_Regression'
outer_loop_arr_lite = []

starting_time = time.time()

outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(elnet_reg, fold_num, outer_train_index, outer_test_index)
    for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y))
)

# Extract the results for each fold
outer_loop_arr = [result[0] for result in outer_loop_results]
outer_loop_arr_lite = [result[1] for result in outer_loop_results]

# Continue with appending the results to your all_models and all_models_lite lists
elnet_model['outer_loop'] = outer_loop_arr
elnet_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(elnet_model)
all_models_lite.append(elnet_model_lite)

ending_time = time.time()
print(ending_time - starting_time)

0.5894033908843994


## Support Vector Regression

In [39]:
svr_reg = SVR()
svr_reg.get_params()

{'C': 1.0,
 'cache_size': 200,
 'coef0': 0.0,
 'degree': 3,
 'epsilon': 0.1,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [40]:
param_grid = {
    'kernel': ['linear', 'rbf', 'poly'],  
    #'C': [0.1, 1, 10, 100],
    #'epsilon': [0.01, 0.1, 0.2, 0.5], 
    #'gamma': ['scale', 'auto'] + [0.001, 0.01, 0.1, 1, 10],
    #'degree': [2, 3, 4],
}
svr_model = {}
svr_model['model']='Support_Vector_Regression'
outer_loop_arr = []
svr_model_lite = {}
svr_model_lite['model']='Support_Vector_Regression'
outer_loop_arr_lite = []

starting_time = time.time()

outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(svr_reg, fold_num, outer_train_index, outer_test_index)
    for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y))
)

# Extract the results for each fold
outer_loop_arr = [result[0] for result in outer_loop_results]
outer_loop_arr_lite = [result[1] for result in outer_loop_results]

# Continue with appending the results to your all_models and all_models_lite lists
svr_model['outer_loop'] = outer_loop_arr
svr_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(svr_model)
all_models_lite.append(svr_model_lite)

ending_time = time.time()
print(ending_time - starting_time)

3.752307653427124


## Decision Tree Regression

In [41]:
tree_reg = DecisionTreeRegressor(random_state = 42)
tree_reg.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 42,
 'splitter': 'best'}

In [45]:
param_grid = {
    'splitter': ['best', 'random'],
    #'max_depth': list(range(1, 11)) + [None],
    #'min_samples_split': list(range(2, 21)),
    #'min_samples_leaf': list(range(1, 21)),
    'max_features': [None, 'sqrt', 'log2'],
}

tree_model = {}
tree_model['model']='Decision_Tree_Regression'
outer_loop_arr = []
tree_model_lite = {}
tree_model_lite['model']='Decision_Tree_Regression'
outer_loop_arr_lite = []

starting_time = time.time()

outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(tree_reg, fold_num, outer_train_index, outer_test_index)
    for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y))
)

# Extract the results for each fold
outer_loop_arr = [result[0] for result in outer_loop_results]
outer_loop_arr_lite = [result[1] for result in outer_loop_results]

# Continue with appending the results to your all_models and all_models_lite lists
tree_model['outer_loop'] = outer_loop_arr
tree_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(tree_model)
all_models_lite.append(tree_model_lite)

ending_time = time.time()
print(ending_time - starting_time)

2.229443311691284


## Gradient Boosting Regression

In [46]:
gb_reg = GradientBoostingRegressor(random_state = 42)
gb_reg.get_params()

{'alpha': 0.9,
 'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'squared_error',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': 42,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [48]:
param_grid = {
    #'n_estimators': [100, 200, 300, 400, 500],
    #'learning_rate': [0.01, 0.05, 0.1, 0.5, 1.0],
    #'max_depth': list(range(1, 11)),
    #'min_samples_split': list(range(2, 21)),
    #'min_samples_leaf': list(range(1, 21)),
    #'subsample': np.arange(0.1, 1.1, 0.1),
    'max_features': ['sqrt', 'log2', None],
}

gb_model = {}
gb_model['model']='Gradient_Boosting_Regression'
outer_loop_arr = []
gb_model_lite = {}
gb_model_lite['model']='Gradient_Boosting_Regression'
outer_loop_arr_lite = []

starting_time = time.time()

outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(gb_reg, fold_num, outer_train_index, outer_test_index)
    for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y))
)

# Extract the results for each fold
outer_loop_arr = [result[0] for result in outer_loop_results]
outer_loop_arr_lite = [result[1] for result in outer_loop_results]

# Continue with appending the results to your all_models and all_models_lite lists
gb_model['outer_loop'] = outer_loop_arr
gb_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(gb_model)
all_models_lite.append(gb_model_lite)

ending_time = time.time()
print(ending_time - starting_time)

3.1018733978271484


## AdaBoost Regression

In [49]:
ada_reg = AdaBoostRegressor()
ada_reg.get_params()

{'base_estimator': 'deprecated',
 'estimator': None,
 'learning_rate': 1.0,
 'loss': 'linear',
 'n_estimators': 50,
 'random_state': None}

In [50]:
param_grid = {
    #'n_estimators': [50, 100, 200, 300, 400, 500, 700],
    #'learning_rate': [0.01, 0.05, 0.1, 0.5, 1.0],
    'loss': ['linear', 'square', 'exponential'],
}

ada_model = {}
ada_model['model']='AdaBoost_Regression'
outer_loop_arr = []
ada_model_lite = {}
ada_model_lite['model']='AdaBoost_Regression'
outer_loop_arr_lite = []

starting_time = time.time()

outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(ada_reg, fold_num, outer_train_index, outer_test_index)
    for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y))
)

# Extract the results for each fold
outer_loop_arr = [result[0] for result in outer_loop_results]
outer_loop_arr_lite = [result[1] for result in outer_loop_results]

# Continue with appending the results to your all_models and all_models_lite lists
ada_model['outer_loop'] = outer_loop_arr
ada_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(ada_model)
all_models_lite.append(ada_model_lite)

ending_time = time.time()
print(ending_time - starting_time)

2.657413959503174


## XGBoost Regression

In [51]:
xgb_reg = xgb.XGBRegressor(n_jobs = -1)
xgb_reg.get_params()

{'objective': 'reg:squarederror',
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': None,
 'gpu_id': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': -1,
 'num_parallel_tree': None,
 'predictor': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [52]:
param_grid = {
    #'n_estimators': [50, 100, 200, 300, 400, 500, 700],
    #'learning_rate': [0.01, 0.05, 0.1, 0.5, 1.0],
    #'max_depth': [3, 5, 7],
    'subsample': [0.8, 0.9, 1.0] + [None],
    'colsample_bytree': [0.8, 0.9, 1.0],
    #'gamma': [0, 0.1, 0.2],
}

xgb_model = {}
xgb_model['model']='XGBoost_Regression'
outer_loop_arr = []
xgb_model_lite = {}
xgb_model_lite['model']='XGBoost_Regression'
outer_loop_arr_lite = []
print('loading bar:\n')
for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y)):
    X_outer_train, X_outer_test = Xarr[outer_train_index], Xarr[outer_test_index]
    y_outer_train, y_outer_test = yarr[outer_train_index], yarr[outer_test_index]
    
    best_model,best_param,param_arr,param_arr_lite = grid_search(xgb_reg, param_grid, X_outer_train, y_outer_train)
    outer_loop,outer_loop_lite=outer_metric(best_model, best_param, X_outer_train, y_outer_train,outer_train_index,\
                 X_outer_test,y_outer_test,outer_test_index,fold_num + 1)
    outer_loop['param_comb']= param_arr
    outer_loop_arr.append(outer_loop)
    outer_loop_lite['param_comb']= param_arr_lite
    outer_loop_arr_lite.append(outer_loop_lite)
    print(f'{(fold_num+1)*10}%')
xgb_model['outer_loop']= outer_loop_arr
xgb_model_lite['outer_loop']= outer_loop_arr_lite
all_models.append(xgb_model)
all_models_lite.append(xgb_model_lite)

loading bar:

10%
20%
30%
40%
50%
60%
70%
80%
90%
100%


## LightGBM Regression

In [54]:
lgb_reg = lgb.LGBMRegressor(n_jobs = -1)
lgb_reg.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'num_leaves': 31,
 'objective': None,
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0}

In [55]:
param_grid = {
   #'n_estimators': [50, 100, 200, 300, 400, 500, 700],
    #'learning_rate': [0.01, 0.05, 0.1, 0.5, 1.0],
    #'max_depth': [3, 5, 7, 9],
    #'num_leaves': [31, 63, 127],
    'subsample': [0.8, 0.9, 1.0],
    #'colsample_bytree': [0.8, 0.9, 1.0],
}

lgb_model = {}
lgb_model['model']='LightGBM_Regression'
outer_loop_arr = []
lgb_model_lite = {}
lgb_model_lite['model']='LightGBM_Regression'
outer_loop_arr_lite = []
print('loading bar:\n')
for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y)):
    X_outer_train, X_outer_test = Xarr[outer_train_index], Xarr[outer_test_index]
    y_outer_train, y_outer_test = yarr[outer_train_index], yarr[outer_test_index]
    
    best_model,best_param,param_arr,param_arr_lite = grid_search(lgb_reg, param_grid, X_outer_train, y_outer_train)
    outer_loop,outer_loop_lite=outer_metric(best_model, best_param, X_outer_train, y_outer_train,outer_train_index,\
                 X_outer_test,y_outer_test,outer_test_index,fold_num + 1)
    outer_loop['param_comb']= param_arr
    outer_loop_arr.append(outer_loop)
    outer_loop_lite['param_comb']= param_arr_lite
    outer_loop_arr_lite.append(outer_loop_lite)
    print(f'{(fold_num+1)*10}%')
lgb_model['outer_loop']= outer_loop_arr
lgb_model_lite['outer_loop']= outer_loop_arr_lite
all_models.append(lgb_model)
all_models_lite.append(lgb_model_lite)

loading bar:

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 415
[LightGBM] [Info] Number of data points in the train set: 114, number of used features: 13
[LightGBM] [Info] Start training from score -0.013427
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 423
[LightGBM] [Info] Number of data points in the train set: 114, number of used features: 13
[LightGBM] [Info] Start training from score -0.100480
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 430
[LightGBM] [Info] Number of data points in the train set: 114, number of used features: 13
[LightGBM] [Info] Start training from score 0.055183
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 415
[LightGBM] [Info] Number of data points in the train set: 114, number of used features: 13
[LightGBM] [Info] Start training from score -0.013427
You can set `force_col_wise=true` to remove

## CatBoost Regression   !!! ne rabote  !!!

In [36]:
catb_reg  = CatBoostRegressor(random_seed = 42)

In [35]:
param_grid = {
    'iterations': [100, 200, 300, 400, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.5, 1.0],
    'depth': [4, 6, 8, 10, 12],
    'l2_leaf_reg': [1, 3, 5],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bylevel': [0.8, 0.9, 1.0],
    'border_count': [32, 64, 128],
}
catb_model = {}  
catb_model['model']='cat_boost_Regression'
outer_loop_arr = []
catb_model_lite = {}
catb_model_lite['model']='cat_boost_Regression'
outer_loop_arr_lite = []
print('loading bar:\n')
for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y)):
    X_outer_train, X_outer_test = Xarr[outer_train_index], Xarr[outer_test_index]
    y_outer_train, y_outer_test = yarr[outer_train_index], yarr[outer_test_index]
    
    best_model,best_param,param_arr,param_arr_lite = grid_search(catb_reg, param_grid, X_outer_train, y_outer_train)
    outer_loop,outer_loop_lite=outer_metric(best_model, best_param, X_outer_train, y_outer_train,outer_train_index,\
                 X_outer_test,y_outer_test,outer_test_index,fold_num + 1)
    outer_loop['param_comb']= param_arr
    outer_loop_arr.append(outer_loop)
    outer_loop_lite['param_comb']= param_arr_lite
    outer_loop_arr_lite.append(outer_loop_lite)
    print(f'{(fold_num+1)*10}%')
catb_model['outer_loop']= outer_loop_arr
catb_model_lite['outer_loop']= outer_loop_arr_lite
all_models.append(catb_model)
all_models_lite.append(catb_model_lite)

loading bar:



CatBoostError: You can't change params of fitted model.

## K-Nearest Neighbors Regression

In [56]:
knn_reg = KNeighborsRegressor(n_jobs = -1)
knn_reg.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': -1,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [57]:
param_grid = {
    #'n_neighbors': list(range(1, 21)),
    'weights': ['uniform', 'distance'],
    #'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    #'leaf_size': list(range(10, 101, 10)),
    #'p': [1, 2],
}

knn_model = {}  
knn_model['model']='K-Nearest_Neighbors_Regression'
outer_loop_arr = []
knn_model_lite = {}
knn_model_lite['model']='K-Nearest_Neighbors_Regression'
outer_loop_arr_lite = []
print('loading bar:\n')
for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y)):
    X_outer_train, X_outer_test = Xarr[outer_train_index], Xarr[outer_test_index]
    y_outer_train, y_outer_test = yarr[outer_train_index], yarr[outer_test_index]
    
    best_model,best_param,param_arr,param_arr_lite = grid_search(knn_reg, param_grid, X_outer_train, y_outer_train)
    outer_loop,outer_loop_lite=outer_metric(best_model, best_param, X_outer_train, y_outer_train,outer_train_index,\
                 X_outer_test,y_outer_test,outer_test_index,fold_num + 1)
    outer_loop['param_comb']= param_arr
    outer_loop_arr.append(outer_loop)
    outer_loop_lite['param_comb']= param_arr_lite
    outer_loop_arr_lite.append(outer_loop_lite)
    print(f'{(fold_num+1)*10}%')
knn_model['outer_loop']= outer_loop_arr
knn_model_lite['outer_loop']= outer_loop_arr_lite
all_models.append(knn_model)
all_models_lite.append(knn_model_lite)

loading bar:



  dw = np.sum(diff_resids**2, axis=axis) / np.sum(resids**2, axis=axis)
  dw = np.sum(diff_resids**2, axis=axis) / np.sum(resids**2, axis=axis)
  dw = np.sum(diff_resids**2, axis=axis) / np.sum(resids**2, axis=axis)
  dw = np.sum(diff_resids**2, axis=axis) / np.sum(resids**2, axis=axis)


10%


  dw = np.sum(diff_resids**2, axis=axis) / np.sum(resids**2, axis=axis)
  dw = np.sum(diff_resids**2, axis=axis) / np.sum(resids**2, axis=axis)
  dw = np.sum(diff_resids**2, axis=axis) / np.sum(resids**2, axis=axis)
  dw = np.sum(diff_resids**2, axis=axis) / np.sum(resids**2, axis=axis)


20%


  dw = np.sum(diff_resids**2, axis=axis) / np.sum(resids**2, axis=axis)
  dw = np.sum(diff_resids**2, axis=axis) / np.sum(resids**2, axis=axis)
  dw = np.sum(diff_resids**2, axis=axis) / np.sum(resids**2, axis=axis)
  dw = np.sum(diff_resids**2, axis=axis) / np.sum(resids**2, axis=axis)


30%


  dw = np.sum(diff_resids**2, axis=axis) / np.sum(resids**2, axis=axis)
  dw = np.sum(diff_resids**2, axis=axis) / np.sum(resids**2, axis=axis)
  dw = np.sum(diff_resids**2, axis=axis) / np.sum(resids**2, axis=axis)
  dw = np.sum(diff_resids**2, axis=axis) / np.sum(resids**2, axis=axis)


40%


  dw = np.sum(diff_resids**2, axis=axis) / np.sum(resids**2, axis=axis)
  dw = np.sum(diff_resids**2, axis=axis) / np.sum(resids**2, axis=axis)
  dw = np.sum(diff_resids**2, axis=axis) / np.sum(resids**2, axis=axis)
  dw = np.sum(diff_resids**2, axis=axis) / np.sum(resids**2, axis=axis)


50%


  dw = np.sum(diff_resids**2, axis=axis) / np.sum(resids**2, axis=axis)
  dw = np.sum(diff_resids**2, axis=axis) / np.sum(resids**2, axis=axis)
  dw = np.sum(diff_resids**2, axis=axis) / np.sum(resids**2, axis=axis)


60%


  dw = np.sum(diff_resids**2, axis=axis) / np.sum(resids**2, axis=axis)
  dw = np.sum(diff_resids**2, axis=axis) / np.sum(resids**2, axis=axis)
  dw = np.sum(diff_resids**2, axis=axis) / np.sum(resids**2, axis=axis)
  dw = np.sum(diff_resids**2, axis=axis) / np.sum(resids**2, axis=axis)


70%


  dw = np.sum(diff_resids**2, axis=axis) / np.sum(resids**2, axis=axis)
  dw = np.sum(diff_resids**2, axis=axis) / np.sum(resids**2, axis=axis)
  dw = np.sum(diff_resids**2, axis=axis) / np.sum(resids**2, axis=axis)


80%


  dw = np.sum(diff_resids**2, axis=axis) / np.sum(resids**2, axis=axis)
  dw = np.sum(diff_resids**2, axis=axis) / np.sum(resids**2, axis=axis)
  dw = np.sum(diff_resids**2, axis=axis) / np.sum(resids**2, axis=axis)


90%
100%


  dw = np.sum(diff_resids**2, axis=axis) / np.sum(resids**2, axis=axis)
  dw = np.sum(diff_resids**2, axis=axis) / np.sum(resids**2, axis=axis)
  dw = np.sum(diff_resids**2, axis=axis) / np.sum(resids**2, axis=axis)


## Neural Network Regression

In [58]:
mlp_reg= MLPRegressor(random_state = 42)
mlp_reg.get_params()

{'activation': 'relu',
 'alpha': 0.0001,
 'batch_size': 'auto',
 'beta_1': 0.9,
 'beta_2': 0.999,
 'early_stopping': False,
 'epsilon': 1e-08,
 'hidden_layer_sizes': (100,),
 'learning_rate': 'constant',
 'learning_rate_init': 0.001,
 'max_fun': 15000,
 'max_iter': 200,
 'momentum': 0.9,
 'n_iter_no_change': 10,
 'nesterovs_momentum': True,
 'power_t': 0.5,
 'random_state': 42,
 'shuffle': True,
 'solver': 'adam',
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': False,
 'warm_start': False}

In [59]:
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (150,), (200,), (250,)],
    'activation': ['identity', 'logistic', 'tanh', 'relu'],
    'solver': ['lbfgs', 'sgd', 'adam'],
    #'alpha': [0.0001, 0.001, 0.01, 0.1],
    #'learning_rate': ['constant', 'invscaling', 'adaptive'],
    #'learning_rate_init': [0.001, 0.01, 0.1],
    #'max_iter': [100, 200, 300, 400, 500],
}

mlp_model = {}
mlp_model['model']='Neural_Network_Regression'
outer_loop_arr = []
mlp_model_lite = {}
mlp_model_lite['model']='Neural_Network_Regression'
outer_loop_arr_lite = []

starting_time = time.time()

outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(mlp_reg, fold_num, outer_train_index, outer_test_index)
    for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y))
)

# Extract the results for each fold
outer_loop_arr = [result[0] for result in outer_loop_results]
outer_loop_arr_lite = [result[1] for result in outer_loop_results]

# Continue with appending the results to your all_models and all_models_lite lists
mlp_model['outer_loop'] = outer_loop_arr
mlp_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(mlp_model)
all_models_lite.append(mlp_model_lite)

ending_time = time.time()
print(ending_time - starting_time)

33.71279215812683


## Gaussian Process Regression

In [60]:
gp_reg = GaussianProcessRegressor()
gp_reg.get_params()

{'alpha': 1e-10,
 'copy_X_train': True,
 'kernel': None,
 'n_restarts_optimizer': 0,
 'n_targets': None,
 'normalize_y': False,
 'optimizer': 'fmin_l_bfgs_b',
 'random_state': None}

In [63]:
param_grid = {
    'kernel': [RBF(), Matern()],
    #'n_restarts_optimizer': list(range(1, 21)),
    #'alpha': np.logspace(-5, 2, 10),
}
gp_model = {}
gp_model['model']='Gaussian_Process_Regression'
outer_loop_arr = []
gp_model_lite = {}
gp_model_lite['model']='Gaussian_Process_Regression'
outer_loop_arr_lite = []

starting_time = time.time()

outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(gp_reg, fold_num, outer_train_index, outer_test_index)
    for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y))
)

# Extract the results for each fold
outer_loop_arr = [result[0] for result in outer_loop_results]
outer_loop_arr_lite = [result[1] for result in outer_loop_results]

# Continue with appending the results to your all_models and all_models_lite lists
gp_model['outer_loop'] = outer_loop_arr
gp_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(gp_model)
all_models_lite.append(gp_model_lite)

ending_time = time.time()
print(ending_time - starting_time)

0.27895116806030273


## Huber Regression

In [66]:
huber_reg = HuberRegressor()
huber_reg.get_params()

{'alpha': 0.0001,
 'epsilon': 1.35,
 'fit_intercept': True,
 'max_iter': 100,
 'tol': 1e-05,
 'warm_start': False}

In [67]:
param_grid = {
    #'epsilon': np.linspace(1.0, 3.0, 10),
    #'alpha': np.logspace(-5, 2, 10),
    #'max_iter': [100, 200, 300, 400, 500],
    'tol': [1e-4, 1e-5, 1e-6],
}
huber_model = {}
huber_model['model']='Huber_Regression'
outer_loop_arr = []
huber_model_lite = {}
huber_model_lite['model']='Huber_Regression'
outer_loop_arr_lite = []

starting_time = time.time()

outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(huber_reg, fold_num, outer_train_index, outer_test_index)
    for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y))
)

# Extract the results for each fold
outer_loop_arr = [result[0] for result in outer_loop_results]
outer_loop_arr_lite = [result[1] for result in outer_loop_results]

# Continue with appending the results to your all_models and all_models_lite lists
huber_model['outer_loop'] = outer_loop_arr
huber_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(huber_model)
all_models_lite.append(huber_model_lite)

ending_time = time.time()
print(ending_time - starting_time)

0.21839070320129395


## Theil Sen Regression

In [71]:
theilsen_reg = TheilSenRegressor(random_state = 42, n_jobs = -1)
theilsen_reg.get_params()

{'copy_X': True,
 'fit_intercept': True,
 'max_iter': 300,
 'max_subpopulation': 10000.0,
 'n_jobs': -1,
 'n_subsamples': None,
 'random_state': 42,
 'tol': 0.001,
 'verbose': False}

In [72]:
param_grid = {
    #'max_subpopulation': list(range(1, 21)),
    #'n_subsamples': list(range(1, 21)),  !!!!
    'max_iter': [100, 200, 300],
    'tol': [1e-4, 1e-5, 1e-6],
}
theilsen_model = {}  
theilsen_model['model']='TheilSen_Regression'
outer_loop_arr = []
theilsen_model_lite = {}
theilsen_model_lite['model']='TheilSen_Regression'
outer_loop_arr_lite = []
print('loading bar:\n')
for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y)):
    X_outer_train, X_outer_test = Xarr[outer_train_index], Xarr[outer_test_index]
    y_outer_train, y_outer_test = yarr[outer_train_index], yarr[outer_test_index]
    
    best_model,best_param,param_arr,param_arr_lite = grid_search(theilsen_reg, param_grid, X_outer_train, y_outer_train)
    outer_loop,outer_loop_lite=outer_metric(best_model, best_param, X_outer_train, y_outer_train,outer_train_index,\
                 X_outer_test,y_outer_test,outer_test_index,fold_num + 1)
    outer_loop['param_comb']= param_arr
    outer_loop_arr.append(outer_loop)
    outer_loop_lite['param_comb']= param_arr_lite
    outer_loop_arr_lite.append(outer_loop_lite)
    print(f'{(fold_num+1)*10}%')
theilsen_model['outer_loop']= outer_loop_arr
theilsen_model_lite['outer_loop']= outer_loop_arr_lite
all_models.append(theilsen_model)
all_models_lite.append(theilsen_model_lite)

loading bar:

10%
20%
30%
40%
50%
60%
70%
80%
90%
100%


## Automatic Relevance Determination Regression

In [73]:
ard_reg = ARDRegression()
ard_reg.get_params()

{'alpha_1': 1e-06,
 'alpha_2': 1e-06,
 'compute_score': False,
 'copy_X': True,
 'fit_intercept': True,
 'lambda_1': 1e-06,
 'lambda_2': 1e-06,
 'max_iter': None,
 'n_iter': 'deprecated',
 'threshold_lambda': 10000.0,
 'tol': 0.001,
 'verbose': False}

In [74]:
param_grid = {
    #'n_iter': [100, 200, 300, 400, 500],
    'tol': [1e-4, 1e-5, 1e-6],
    #'alpha_1': np.logspace(-6, -3, 10),
    #'alpha_2': np.logspace(-6, -3, 10),
    #'lambda_1': np.logspace(-6, -3, 10),
    #'lambda_2': np.logspace(-6, -3, 10),
}
ard_model = {}
ard_model['model']='Automatic_Relevance_Determination_Regression'
outer_loop_arr = []
ard_model_lite = {}
ard_model_lite['model']='Automatic_Relevance_Determination_Regression'
outer_loop_arr_lite = []

starting_time = time.time()

outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(ard_reg, fold_num, outer_train_index, outer_test_index)
    for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y))
)

# Extract the results for each fold
outer_loop_arr = [result[0] for result in outer_loop_results]
outer_loop_arr_lite = [result[1] for result in outer_loop_results]

# Continue with appending the results to your all_models and all_models_lite lists
ard_model['outer_loop'] = outer_loop_arr
ard_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(ard_model)
all_models_lite.append(ard_model_lite)

ending_time = time.time()
print(ending_time - starting_time)

0.37508654594421387


## Partial Least Squares Regression  !!! ne rabote  !!!

In [75]:
pls_reg = PLSRegression()
pls_reg.get_params()

{'copy': True, 'max_iter': 500, 'n_components': 2, 'scale': True, 'tol': 1e-06}

In [76]:
param_grid = {
    #'n_components': list(range(1, 21)),
    #'scale': [True, False],
    #'max_iter': [100, 200, 300, 400, 500],
    'tol': [1e-4, 1e-5, 1e-6],
}
pls_model = {}
pls_model['model']='Partial_Least_Squares_Regression'
outer_loop_arr = []
pls_model_lite = {}
pls_model_lite['model']='Partial_Least_Squares_Regression'
outer_loop_arr_lite = []

starting_time = time.time()

outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(pls_reg, fold_num, outer_train_index, outer_test_index)
    for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y))
)

# Extract the results for each fold
outer_loop_arr = [result[0] for result in outer_loop_results]
outer_loop_arr_lite = [result[1] for result in outer_loop_results]

# Continue with appending the results to your all_models and all_models_lite lists
pls_model['outer_loop'] = outer_loop_arr
pls_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(pls_model)
all_models_lite.append(pls_model_lite)

ending_time = time.time()
print(ending_time - starting_time)

ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 1 dimension(s) and the array at index 1 has 2 dimension(s)

## Ordinal Regression

In [77]:
ordinal_reg = OrdinalRidge(random_state = 42)
ordinal_reg.get_params()

{'alpha': 1.0,
 'copy_X': True,
 'fit_intercept': True,
 'max_iter': None,
 'positive': False,
 'random_state': 42,
 'solver': 'auto',
 'tol': 0.0001}

In [78]:
param_grid = {
   # 'alpha': np.logspace(-5, 2, 10),
    #'max_iter': [100, 200, 300, 400, 500],
    #'tol': [1e-4, 1e-5, 1e-6],
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
}
ordinal_model = {}
ordinal_model['model']='Ordinal_Regression'
outer_loop_arr = []
ordinal_model_lite = {}
ordinal_model_lite['model']='Ordinal_Regression'
outer_loop_arr_lite = []

starting_time = time.time()

outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(ordinal_reg, fold_num, outer_train_index, outer_test_index)
    for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y))
)

# Extract the results for each fold
outer_loop_arr = [result[0] for result in outer_loop_results]
outer_loop_arr_lite = [result[1] for result in outer_loop_results]

# Continue with appending the results to your all_models and all_models_lite lists
ordinal_model['outer_loop'] = outer_loop_arr
ordinal_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(ordinal_model)
all_models_lite.append(ordinal_model_lite)

ending_time = time.time()
print(ending_time - starting_time)

2.1492373943328857


## Least Absolute Deviations Regression

In [79]:
lad_reg = LassoLars(random_state = 42)
lad_reg.get_params()

{'alpha': 1.0,
 'copy_X': True,
 'eps': 2.220446049250313e-16,
 'fit_intercept': True,
 'fit_path': True,
 'jitter': None,
 'max_iter': 500,
 'normalize': 'deprecated',
 'positive': False,
 'precompute': 'auto',
 'random_state': 42,
 'verbose': False}

In [80]:
param_grid = {
    #'alpha': np.logspace(-5, 2, 10),
    #'max_iter': [100, 200, 300, 400, 500],
    #'eps': [1e-4, 1e-5, 1e-6],
    #'positive': [True, False],
    'jitter': [None, 1e-8, 1e-7, 1e-6],
}
lad_model = {}
lad_model['model']='Least_Absolute_Deviations_Regression'
outer_loop_arr = []
lad_model_lite = {}
lad_model_lite['model']='Least_Absolute_Deviations_Regression'
outer_loop_arr_lite = []

starting_time = time.time()

outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(lad_reg, fold_num, outer_train_index, outer_test_index)
    for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y))
)

# Extract the results for each fold
outer_loop_arr = [result[0] for result in outer_loop_results]
outer_loop_arr_lite = [result[1] for result in outer_loop_results]

# Continue with appending the results to your all_models and all_models_lite lists
lad_model['outer_loop'] = outer_loop_arr
lad_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(lad_model)
all_models_lite.append(lad_model_lite)

ending_time = time.time()
print(ending_time - starting_time)

1.4183838367462158


## Least Angle Regression   !!!! ne rabote !!!!

In [36]:
lars_reg = Lars(random_state = 42)
lars_reg.get_params()

{'copy_X': True,
 'eps': 2.220446049250313e-16,
 'fit_intercept': True,
 'fit_path': True,
 'jitter': None,
 'n_nonzero_coefs': 500,
 'normalize': 'deprecated',
 'precompute': 'auto',
 'random_state': 42,
 'verbose': False}

In [37]:
param_grid = {
    'fit_intercept': [True, False],  # Regularization parameter (alpha)
}
lars_model = {}
lars_model['model']='Least_Angle_Regression'
outer_loop_arr = []
lars_model_lite = {}
lars_model_lite['model']='Least_Angle_Regression'
outer_loop_arr_lite = []

starting_time = time.time()

outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(lars_reg, fold_num, outer_train_index, outer_test_index)
    for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y))
)

# Extract the results for each fold
outer_loop_arr = [result[0] for result in outer_loop_results]
outer_loop_arr_lite = [result[1] for result in outer_loop_results]

# Continue with appending the results to your all_models and all_models_lite lists
lars_model['outer_loop'] = outer_loop_arr
lars_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(lars_model)
all_models_lite.append(lars_model_lite)

ending_time = time.time()
print(ending_time - starting_time)

ValueError: Classification metrics can't handle a mix of multiclass and continuous targets

## Orthogonal Matching Pursuit Regression

In [81]:
omp_reg = OrthogonalMatchingPursuit()
omp_reg.get_params()

{'fit_intercept': True,
 'n_nonzero_coefs': None,
 'normalize': 'deprecated',
 'precompute': 'auto',
 'tol': None}

In [82]:
param_grid = {
    'n_nonzero_coefs': list(range(1, 21)),
    'tol': [1e-4, 1e-5, 1e-6],
}  
omp_model = {}
omp_model['model']='Orthogonal_Matching_Pursuit_Regression'
outer_loop_arr = []
omp_model_lite = {}
omp_model_lite['model']='Orthogonal_Matching_Pursuit_Regression'
outer_loop_arr_lite = []

starting_time = time.time()

outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(omp_reg, fold_num, outer_train_index, outer_test_index)
    for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y))
)

# Extract the results for each fold
outer_loop_arr = [result[0] for result in outer_loop_results]
outer_loop_arr_lite = [result[1] for result in outer_loop_results]

# Continue with appending the results to your all_models and all_models_lite lists
omp_model['outer_loop'] = outer_loop_arr
omp_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(omp_model)
all_models_lite.append(omp_model_lite)

ending_time = time.time()
print(ending_time - starting_time)

2.5443127155303955


## Bayesian Ridge Regression

In [83]:
bayridge_reg = BayesianRidge()
bayridge_reg.get_params()

{'alpha_1': 1e-06,
 'alpha_2': 1e-06,
 'alpha_init': None,
 'compute_score': False,
 'copy_X': True,
 'fit_intercept': True,
 'lambda_1': 1e-06,
 'lambda_2': 1e-06,
 'lambda_init': None,
 'max_iter': None,
 'n_iter': 'deprecated',
 'tol': 0.001,
 'verbose': False}

In [84]:
param_grid = {
    #'n_iter': [100, 200, 300, 400, 500],
    'tol': [1e-4, 1e-5, 1e-6],
    #'alpha_1': np.logspace(-6, -3, 10),
    #'alpha_2': np.logspace(-6, -3, 10),
    #'lambda_1': np.logspace(-6, -3, 10),
    #'lambda_2': np.logspace(-6, -3, 10),
}
bayridge_model = {}
bayridge_model['model']='Bayesian_Ridge_Regression'
outer_loop_arr = []
bayridge_model_lite = {}
bayridge_model_lite['model']='Bayesian_Ridge_Regression'
outer_loop_arr_lite = []

starting_time = time.time()

outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(bayridge_reg, fold_num, outer_train_index, outer_test_index)
    for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y))
)

# Extract the results for each fold
outer_loop_arr = [result[0] for result in outer_loop_results]
outer_loop_arr_lite = [result[1] for result in outer_loop_results]

# Continue with appending the results to your all_models and all_models_lite lists
bayridge_model['outer_loop'] = outer_loop_arr
bayridge_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(bayridge_model)
all_models_lite.append(bayridge_model_lite)

ending_time = time.time()
print(ending_time - starting_time)

0.1587696075439453


## Logistic Regression   PROBLEM !!!!!!!  misle deka e clasificator

In [94]:
log_reg = LogisticRegression(n_jobs = -1, random_state = 42)
log_reg.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': -1,
 'penalty': 'l2',
 'random_state': 42,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [95]:
param_grid = {
    #'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    #'C': [0.001, 0.01, 0.1, 0.5, 1.0, 10.0],
    #'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [100, 200, 300, 400, 500],
}
log_model = {}
log_model['model']='Logistic_Regression'
outer_loop_arr = []
log_model_lite = {}
log_model_lite['model']='Logistic_Regression'
outer_loop_arr_lite = []

starting_time = time.time()

outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(log_reg, fold_num, outer_train_index, outer_test_index)
    for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y))
)

# Extract the results for each fold
outer_loop_arr = [result[0] for result in outer_loop_results]
outer_loop_arr_lite = [result[1] for result in outer_loop_results]

# Continue with appending the results to your all_models and all_models_lite lists
log_model['outer_loop'] = outer_loop_arr
log_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(log_model)
all_models_lite.append(log_model_lite)

ending_time = time.time()
print(ending_time - starting_time)

ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

## Tweedie Regression !!! imam NaN

In [96]:
tw_reg = TweedieRegressor()
tw_reg.get_params()

{'alpha': 1.0,
 'fit_intercept': True,
 'link': 'auto',
 'max_iter': 100,
 'power': 0.0,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [97]:
param_grid = {
    'power': [0, 1, 2],
    #'alpha': np.logspace(-5, 2, 10),
    'link': ['auto', 'log', 'identity'],
    #'solver': ['auto', 'lbfgs', 'sparsetron', 'sag', 'saga'],
    #'tol': [1e-4, 1e-5, 1e-6],
    #'max_iter': [100, 200, 300, 400, 500],
}
tw_model = {}
tw_model['model']='Tweedie_Regression'
outer_loop_arr = []
tw_model_lite = {}
tw_model_lite['model']='Tweedie_Regression'
outer_loop_arr_lite = []

starting_time = time.time()

outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(tw_reg, fold_num, outer_train_index, outer_test_index)
    for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y))
)

# Extract the results for each fold
outer_loop_arr = [result[0] for result in outer_loop_results]
outer_loop_arr_lite = [result[1] for result in outer_loop_results]

# Continue with appending the results to your all_models and all_models_lite lists
tw_model['outer_loop'] = outer_loop_arr
tw_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(tw_model)
all_models_lite.append(tw_model_lite)

ending_time = time.time()
print(ending_time - starting_time)

ValueError: Input contains NaN.

## Poisson Regression   !!! problem

In [102]:
poisson_reg = PoissonRegressor()
poisson_reg.get_params()

{'alpha': 1.0,
 'fit_intercept': True,
 'max_iter': 100,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [103]:
param_grid = {
    #'alpha': np.logspace(-5, 2, 10),
    #'max_iter': [100, 200, 300, 400, 500],
    #'tol': [1e-4, 1e-5, 1e-6],
    'solver': ['lbfgs', 'newton-cg'],
}
poisson_model = {}
poisson_model['model']='Poisson_Regression'
outer_loop_arr = []
poisson_model_lite = {}
poisson_model_lite['model']='Poisson_Regression'
outer_loop_arr_lite = []

starting_time = time.time()

outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(poisson_reg, fold_num, outer_train_index, outer_test_index)
    for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y))
)

# Extract the results for each fold
outer_loop_arr = [result[0] for result in outer_loop_results]
outer_loop_arr_lite = [result[1] for result in outer_loop_results]

# Continue with appending the results to your all_models and all_models_lite lists
poisson_model['outer_loop'] = outer_loop_arr
poisson_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(poisson_model)
all_models_lite.append(poisson_model_lite)

ending_time = time.time()
print(ending_time - starting_time)

ValueError: Some value(s) of y are out of the valid range of the loss 'HalfPoissonLoss'.

## Gamma Regression   !!! ne rabote !!!

In [24]:
gamma_reg = GammaRegressor()
gamma_reg.get_params()

{'alpha': 1.0,
 'fit_intercept': True,
 'max_iter': 100,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [25]:
param_grid = {
    #'alpha': [0.1, 1.0, 10.0],       # Regularization strength (alpha)
    #'fit_intercept': [True, False],  # Whether to fit an intercept
    'max_iter': [100,50],     # Maximum number of iterations
}
gamma_model = {}
gamma_model['model']='Gamma_Regression'
outer_loop_arr = []
gamma_model_lite = {}
gamma_model_lite['model']='Gamma_Regression'
outer_loop_arr_lite = []

starting_time = time.time()

outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(gamma_reg, fold_num, outer_train_index, outer_test_index)
    for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y))
)

# Extract the results for each fold
outer_loop_arr = [result[0] for result in outer_loop_results]
outer_loop_arr_lite = [result[1] for result in outer_loop_results]

# Continue with appending the results to your all_models and all_models_lite lists
gamma_model['outer_loop'] = outer_loop_arr
gamma_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(gamma_model)
all_models_lite.append(gamma_model_lite)

ending_time = time.time()
print(ending_time - starting_time)

ValueError: Classification metrics can't handle a mix of multiclass and continuous targets

## SGD Regression   !!! ne rabote !!!

In [20]:
sgd_reg = SGDRegressor(random_state = 42)
sgd_reg.get_params()

{'alpha': 0.0001,
 'average': False,
 'early_stopping': False,
 'epsilon': 0.1,
 'eta0': 0.01,
 'fit_intercept': True,
 'l1_ratio': 0.15,
 'learning_rate': 'invscaling',
 'loss': 'squared_error',
 'max_iter': 1000,
 'n_iter_no_change': 5,
 'penalty': 'l2',
 'power_t': 0.25,
 'random_state': None,
 'shuffle': True,
 'tol': 0.001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [21]:
param_grid = {
    #'loss': ['squared_loss', 'huber', 'epsilon_insensitive'],  # Loss function
    #'penalty': ['l1', 'l2', 'elasticnet'],                    # Regularization type
    #'alpha': [0.0001, 0.001, 0.01],                          # Regularization strength
    'max_iter': [100, 50],                             # Maximum number of iterations
}
sgd_model = {}
sgd_model['model']='SGD_Regression'
outer_loop_arr = []
sgd_model_lite = {}
sgd_model_lite['model']='SGD_Regression'
outer_loop_arr_lite = []

starting_time = time.time()

outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(sgd_reg, fold_num, outer_train_index, outer_test_index)
    for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y))
)

# Extract the results for each fold
outer_loop_arr = [result[0] for result in outer_loop_results]
outer_loop_arr_lite = [result[1] for result in outer_loop_results]

# Continue with appending the results to your all_models and all_models_lite lists
sgd_model['outer_loop'] = outer_loop_arr
sgd_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(sgd_model)
all_models_lite.append(sgd_model_lite)

ending_time = time.time()
print(ending_time - starting_time)

ValueError: Classification metrics can't handle a mix of multiclass and continuous targets

## Passive Aggressive Regression

In [104]:
pa_reg = PassiveAggressiveRegressor(random_state = 42)
pa_reg.get_params()

{'C': 1.0,
 'average': False,
 'early_stopping': False,
 'epsilon': 0.1,
 'fit_intercept': True,
 'loss': 'epsilon_insensitive',
 'max_iter': 1000,
 'n_iter_no_change': 5,
 'random_state': 42,
 'shuffle': True,
 'tol': 0.001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [105]:
param_grid = {
    #'C': np.logspace(-5, 2, 15),
    #'max_iter': [100, 200, 300, 400, 500, 1000],
    'tol': [1e-4, 1e-5, 1e-6],
    'shuffle': [True, False],
}
pa_model = {}
pa_model['model']='Passive_Aggressive_Regression'
outer_loop_arr = []
pa_model_lite = {}
pa_model_lite['model']='Passive_Aggressive_Regression'
outer_loop_arr_lite = []

starting_time = time.time()

outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(pa_reg, fold_num, outer_train_index, outer_test_index)
    for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y))
)

# Extract the results for each fold
outer_loop_arr = [result[0] for result in outer_loop_results]
outer_loop_arr_lite = [result[1] for result in outer_loop_results]

# Continue with appending the results to your all_models and all_models_lite lists
pa_model['outer_loop'] = outer_loop_arr
pa_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(pa_model)
all_models_lite.append(pa_model_lite)

ending_time = time.time()
print(ending_time - starting_time)

2.133101463317871


## RANSAC Regression

In [107]:
ransac_reg = RANSACRegressor(random_state = 42)
ransac_reg.get_params()

{'estimator': None,
 'is_data_valid': None,
 'is_model_valid': None,
 'loss': 'absolute_error',
 'max_skips': inf,
 'max_trials': 100,
 'min_samples': None,
 'random_state': 42,
 'residual_threshold': None,
 'stop_n_inliers': inf,
 'stop_probability': 0.99,
 'stop_score': inf}

In [112]:
param_grid = {
    'estimator': [None, LinearRegression(), Ridge(alpha=1.0)],
    'min_samples': [0.1, 0.25, 0.5],  ## ne moze None ako ne e linregresion !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    #'max_trials': [50, 100, 200, 300, 400, 500],
    'loss': ['absolute_error', 'squared_error'],
    #'residual_threshold': [None, 0.5, 1.0],
    #'stop_n_inliers': [None, 50, 100, 150],
    'stop_score': [0.99],   
}

ransac_model = {}
ransac_model['model']='RANSAC_Regression'
outer_loop_arr = []
ransac_model_lite = {}
ransac_model_lite['model']='RANSAC_Regression'
outer_loop_arr_lite = []

starting_time = time.time()

outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(ransac_reg, fold_num, outer_train_index, outer_test_index)
    for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y))
)

# Extract the results for each fold
outer_loop_arr = [result[0] for result in outer_loop_results]
outer_loop_arr_lite = [result[1] for result in outer_loop_results]

# Continue with appending the results to your all_models and all_models_lite lists
ransac_model['outer_loop'] = outer_loop_arr
ransac_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(ransac_model)
all_models_lite.append(ransac_model_lite)

ending_time = time.time()
print(ending_time - starting_time)

7.448608160018921


## Random Forest Regression

In [113]:
rf_reg = RandomForestRegressor(n_jobs = -1, random_state = 42)
rf_reg.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [115]:
param_grid = {
    #'n_estimators': [100, 200, 300, 400, 500, 1000],
    'max_depth': [None, 5, 10, 20, 30, 40, 50],
    #'min_samples_split': [2, 5, 10, 20, 30],
    #'min_samples_leaf': [1, 2, 4, 8, 16],
    'max_features': ['sqrt', 'log2'],
}
rf_model = {}  
rf_model['model']='Random_Forest_Regression'
outer_loop_arr = []
rf_model_lite = {}
rf_model_lite['model']='Random_Forest_Regression'
outer_loop_arr_lite = []
print('loading bar:\n')
for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y)):
    X_outer_train, X_outer_test = Xarr[outer_train_index], Xarr[outer_test_index]
    y_outer_train, y_outer_test = yarr[outer_train_index], yarr[outer_test_index]
    
    best_model,best_param,param_arr,param_arr_lite = grid_search(rf_reg, param_grid, X_outer_train, y_outer_train)
    outer_loop,outer_loop_lite=outer_metric(best_model, best_param, X_outer_train, y_outer_train,outer_train_index,\
                 X_outer_test,y_outer_test,outer_test_index,fold_num + 1)
    outer_loop['param_comb']= param_arr
    outer_loop_arr.append(outer_loop)
    outer_loop_lite['param_comb']= param_arr_lite
    outer_loop_arr_lite.append(outer_loop_lite)
    print(f'{(fold_num+1)*10}%')
rf_model['outer_loop']= outer_loop_arr
rf_model_lite['outer_loop']= outer_loop_arr_lite
all_models.append(rf_model)
all_models_lite.append(rf_model_lite)

loading bar:

10%
20%
30%
40%
50%
60%
70%
80%
90%
100%


## Hist Gradient Boosting Regression

In [116]:
hgb_reg = HistGradientBoostingRegressor(random_state = 42)
hgb_reg.get_params()

{'categorical_features': None,
 'early_stopping': 'auto',
 'interaction_cst': None,
 'l2_regularization': 0.0,
 'learning_rate': 0.1,
 'loss': 'squared_error',
 'max_bins': 255,
 'max_depth': None,
 'max_iter': 100,
 'max_leaf_nodes': 31,
 'min_samples_leaf': 20,
 'monotonic_cst': None,
 'n_iter_no_change': 10,
 'quantile': None,
 'random_state': 42,
 'scoring': 'loss',
 'tol': 1e-07,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [121]:
param_grid = {
    #'max_iter': [100, 200, 300, 400, 500],
    #'max_depth': [None, 5, 10, 20, 30, 40, 50],
    #'min_samples_leaf': [1, 2, 4, 8],
    #'max_leaf_nodes': [15, 31, 63],
    #'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.5],
    'loss': ['absolute_error', 'squared_error'],   ## poison zaebava zaradi standardizacijata
    ##  loss='poisson' requires non-negative y and sum(y) > 0.
    ## ValueError: loss='gamma' requires strictly positive y.
}
hgb_model = {}
hgb_model['model']='Hist_Gradient_Boosting_Regression'
outer_loop_arr = []
hgb_model_lite = {}
hgb_model_lite['model']='Hist_Gradient_Boosting_Regression'
outer_loop_arr_lite = []

starting_time = time.time()

outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(hgb_reg, fold_num, outer_train_index, outer_test_index)
    for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y))
)

# Extract the results for each fold
outer_loop_arr = [result[0] for result in outer_loop_results]
outer_loop_arr_lite = [result[1] for result in outer_loop_results]

# Continue with appending the results to your all_models and all_models_lite lists
hgb_model['outer_loop'] = outer_loop_arr
hgb_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(hgb_model)
all_models_lite.append(hgb_model_lite)

ending_time = time.time()
print(ending_time - starting_time)

2.5464131832122803


## Bagging Regression

In [126]:
bag_reg = BaggingRegressor(n_jobs = -1, random_state = 42)
bag_reg.get_params()

{'base_estimator': 'deprecated',
 'bootstrap': True,
 'bootstrap_features': False,
 'estimator': None,
 'max_features': 1.0,
 'max_samples': 1.0,
 'n_estimators': 10,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [127]:
param_grid = {
    #'n_estimators': [10, 50, 100, 200, 300, 400, 500],
    'base_estimator': [None, LinearRegression(), Ridge(alpha=1.0)],
    #'max_samples': [0.5, 0.7, 1.0],
    ##'max_features': [ 'log2'],    !!!!!!!!!!!!! moze da bide samo broj
}

bag_model = {}  
bag_model['model']='Bagging_Regression'
outer_loop_arr = []
bag_model_lite = {}
bag_model_lite['model']='Bagging_Regression'
outer_loop_arr_lite = []
print('loading bar:\n')
for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y)):
    X_outer_train, X_outer_test = Xarr[outer_train_index], Xarr[outer_test_index]
    y_outer_train, y_outer_test = yarr[outer_train_index], yarr[outer_test_index]
    
    best_model,best_param,param_arr,param_arr_lite = grid_search(bag_reg, param_grid, X_outer_train, y_outer_train)
    outer_loop,outer_loop_lite=outer_metric(best_model, best_param, X_outer_train, y_outer_train,outer_train_index,\
                 X_outer_test,y_outer_test,outer_test_index,fold_num + 1)
    outer_loop['param_comb']= param_arr
    outer_loop_arr.append(outer_loop)
    outer_loop_lite['param_comb']= param_arr_lite
    outer_loop_arr_lite.append(outer_loop_lite)
    print(f'{(fold_num+1)*10}%')
bag_model['outer_loop']= outer_loop_arr
bag_model_lite['outer_loop']= outer_loop_arr_lite
all_models.append(bag_model)
all_models_lite.append(bag_model_lite)

loading bar:





10%




20%




30%




40%




50%




60%




70%




80%




90%




100%


## making the json file

In [133]:
import json
import numpy as np


# Custom JSON encoder class to handle NumPy arrays
class NumpyArrayEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()  # Convert NumPy array to a list
        return json.JSONEncoder.default(self, obj)

# Convert the array of dictionaries to a JSON array using the custom encoder
json_array = json.dumps(all_models, cls=NumpyArrayEncoder,indent=2)

# Save the JSON array to a file
#with open('final300.json', 'w') as json_file:
    #json_file.write(json_array)


json_array = json.dumps(all_models_lite, cls=NumpyArrayEncoder,indent=2)

# Save the JSON array to a file
with open('final_reg_lite.json', 'w') as json_file:
    json_file.write(json_array)


TypeError: Object of type RBF is not JSON serializable