# Lazy Predict for regression

In [4]:
# One-time environment setup, kept commented out so a normal run does not reinstall.
# The captured output below shows it pulled in lazypredict 0.2.13 together with
# xgboost 2.1.2 and lightgbm 4.5.0.
#!pip install lazypredict

[0mCollecting lazypredict
  Downloading lazypredict-0.2.13-py2.py3-none-any.whl (12 kB)
Collecting xgboost
  Downloading xgboost-2.1.2-py3-none-manylinux_2_28_x86_64.whl (153.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.9/153.9 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting lightgbm
  Downloading lightgbm-4.5.0-py3-none-manylinux_2_28_x86_64.whl (3.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: xgboost, lightgbm, lazypredict
Successfully installed lazypredict-0.2.13 lightgbm-4.5.0 xgboost-2.1.2
[0m

In [1]:
# Imports
import tensorflow as tf
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from lazypredict.Supervised import LazyRegressor
import numpy as np
import lazypredict
import matplotlib.pyplot as plt
import seaborn as sns
import random

#from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score, f1_score

2024-11-10 17:47:47.133001: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


## Set random seed for reproducibility

In [2]:
SEED = 64  # single seed shared by every random number source below

# NOTE: PYTHONHASHSEED is read when a Python interpreter starts, so assigning it
# here does not change hashing inside the already-running kernel; it is only
# inherited by processes launched afterwards.
os.environ['PYTHONHASHSEED'] = str(SEED)

# Seed the stdlib, NumPy and TensorFlow global generators with the same value.
for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    seed_fn(SEED)

#torch.backends.cudnn.deterministic = True
#torch.backends.cudnn.benchmark = False

## Data Loading

In [3]:
# Only modify this
# Base name (no extension) of the dataset CSV. It is interpolated into the input
# path passed to pd.read_csv and into the results path written at the end.
file_name = "DIS_lab_LoS_8"

In [4]:
# Build the path of the selected benchmark CSV (relative to the notebook's
# working directory) and load the whole file into memory.
csv_path = f'../Datasets_benchmark/Regression/{file_name}.csv'
df = pd.read_csv(csv_path)

In [5]:
# (n_rows, n_columns) of the loaded frame.
df.shape

(252004, 1602)

In [6]:
# Display the first few rows of the dataframe
print(df.head())

# Display the dataframe's information (column types, non-null values, etc.).
# DataFrame.info() writes its report straight to stdout and returns None, so it is
# called on its own: wrapping it in print() appends a stray "None" line to the output.
df.info()

# Get the number of rows and columns
print("Number of rows:", df.shape[0])
print("Number of columns:", df.shape[1])

   Antenna1Subcarrier1Module  Antenna1Subcarrier1Angle  \
0                       0.28                      3.14   
1                       0.31                      2.97   
2                       0.32                      2.75   
3                       0.34                      2.54   
4                       0.35                      2.49   

   Antenna1Subcarrier2Module  Antenna1Subcarrier2Angle  \
0                       0.27                      3.02   
1                       0.31                      2.83   
2                       0.32                      2.71   
3                       0.33                      2.55   
4                       0.34                      2.47   

   Antenna1Subcarrier3Module  Antenna1Subcarrier3Angle  \
0                       0.27                      3.05   
1                       0.30                      2.91   
2                       0.33                      2.72   
3                       0.36                      2.55   
4           

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for numerical columns
numeric_summary = df.describe()
print(numeric_summary)

# For categorical columns if any
#print(df.describe(include='object'))

In [None]:
# Count the missing (NaN) entries in every column
missing_per_column = df.isnull().sum()
print(missing_per_column)

In [None]:
# Histograms for numerical data: DataFrame.hist draws one subplot per numeric column.
# NOTE(review): df.shape is reported as (252004, 1602) earlier in this notebook, so
# this asks for a very large subplot grid inside a 15x10 inch figure -- consider
# restricting the call to a subset of columns before running it.
hist_options = {"bins": 15, "figsize": (15, 10)}
df.hist(**hist_options)
plt.tight_layout()  # keep neighbouring subplots from overlapping
plt.show()

## Data Preprocessing

In [7]:
# Feature matrix: every column except the last two.
# NOTE(review): the last column is the target used for y (PositionY); the
# second-to-last is presumably the other position coordinate -- confirm against the CSV.
X = df.iloc[:,:-2]
# Bare expression so the notebook renders a (truncated) preview of the features.
X

Unnamed: 0,Antenna1Subcarrier1Module,Antenna1Subcarrier1Angle,Antenna1Subcarrier2Module,Antenna1Subcarrier2Angle,Antenna1Subcarrier3Module,Antenna1Subcarrier3Angle,Antenna1Subcarrier4Module,Antenna1Subcarrier4Angle,Antenna1Subcarrier5Module,Antenna1Subcarrier5Angle,...,Antenna8Subcarrier96Module,Antenna8Subcarrier96Angle,Antenna8Subcarrier97Module,Antenna8Subcarrier97Angle,Antenna8Subcarrier98Module,Antenna8Subcarrier98Angle,Antenna8Subcarrier99Module,Antenna8Subcarrier99Angle,Antenna8Subcarrier100Module,Antenna8Subcarrier100Angle
0,0.28,3.14,0.27,3.02,0.27,3.05,0.28,3.00,0.28,2.99,...,0.27,0.56,0.27,0.69,0.27,0.64,0.29,0.68,0.28,0.67
1,0.31,2.97,0.31,2.83,0.30,2.91,0.34,2.78,0.31,2.74,...,0.30,0.57,0.28,0.71,0.30,0.68,0.29,0.70,0.29,0.73
2,0.32,2.75,0.32,2.71,0.33,2.72,0.35,2.67,0.33,2.63,...,0.35,0.56,0.35,0.71,0.34,0.67,0.37,0.70,0.35,0.71
3,0.34,2.54,0.33,2.55,0.36,2.55,0.37,2.47,0.35,2.44,...,0.39,0.64,0.38,0.66,0.38,0.66,0.41,0.73,0.38,0.68
4,0.35,2.49,0.34,2.47,0.36,2.40,0.37,2.38,0.37,2.32,...,0.42,0.65,0.45,0.68,0.41,0.67,0.46,0.70,0.43,0.71
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251999,0.39,0.01,0.40,0.00,0.41,-0.06,0.42,-0.17,0.41,-0.20,...,0.55,-1.64,0.55,-1.68,0.56,-1.66,0.55,-1.62,0.55,-1.61
252000,0.33,-0.25,0.37,-0.21,0.39,-0.28,0.39,-0.39,0.41,-0.41,...,0.56,-1.83,0.54,-1.84,0.56,-1.88,0.56,-1.84,0.54,-1.84
252001,0.33,-0.45,0.35,-0.49,0.36,-0.59,0.37,-0.66,0.38,-0.66,...,0.56,-2.06,0.53,-2.10,0.56,-2.13,0.56,-2.09,0.55,-2.11
252002,0.32,-0.80,0.33,-0.86,0.33,-0.86,0.33,-0.98,0.35,-0.97,...,0.55,-2.36,0.54,-2.34,0.58,-2.34,0.54,-2.36,0.53,-2.36


In [8]:
# Regression target: the last column of the frame (shown below as the
# Series named PositionY).
y = df.iloc[:,-1]
# Bare expression so the notebook renders a preview of the target.
y

0         1155
1         1160
2         1165
3         1170
4         1175
          ... 
251999    4009
252000    4014
252001    4019
252002    4024
252003    4029
Name: PositionY, Length: 252004, dtype: int64

In [9]:
# Two-stage random split with a fixed seed:
#   1) hold out 20% of the rows,
#   2) cut that hold-out in half,
# which yields 80% train / 10% validation / 10% test.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.20, random_state=SEED
)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.50, random_state=SEED
)

In [10]:
from sklearn.preprocessing import MinMaxScaler
# Create a MinMaxScaler object
# It is left unfitted here; the fit happens on the training split in the next cell.
scaler = MinMaxScaler()

In [11]:
# Scale numerical data.
# The scaler is fitted on the training split only and then reused, unchanged, for
# the validation and test splits so no statistics leak from held-out data.
#
# scaler.(fit_)transform returns a plain array, so each split is wrapped back into
# a DataFrame. Passing the original index keeps every feature row labelled like
# its target in y_train / y_val / y_test (which still carry df's row labels after
# the shuffled split); without it the features would get a fresh 0..n-1 index and
# any label-based pandas operation (concat, join, arithmetic) would misalign X and y.
#
# NOTE: this cell overwrites X_train / X_val / X_test in place, so re-running it
# without re-running the split cell would refit the scaler on already-scaled data.
X_train = pd.DataFrame(
    scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
X_val = pd.DataFrame(
    scaler.transform(X_val), columns=X_val.columns, index=X_val.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)


## Model Architecture (Lazy Predict)

In [12]:
# Inspect the regressors registered in lazypredict: a list of
# (name, estimator class) tuples, as shown in the output below.
#lazypredict.Supervised.REGRESSORS = lazypredict.supervised.REGRESSORS[33:36]
lazypredict.Supervised.REGRESSORS

[('AdaBoostRegressor', sklearn.ensemble._weight_boosting.AdaBoostRegressor),
 ('BaggingRegressor', sklearn.ensemble._bagging.BaggingRegressor),
 ('BayesianRidge', sklearn.linear_model._bayes.BayesianRidge),
 ('DecisionTreeRegressor', sklearn.tree._classes.DecisionTreeRegressor),
 ('DummyRegressor', sklearn.dummy.DummyRegressor),
 ('ElasticNet', sklearn.linear_model._coordinate_descent.ElasticNet),
 ('ElasticNetCV', sklearn.linear_model._coordinate_descent.ElasticNetCV),
 ('ExtraTreeRegressor', sklearn.tree._classes.ExtraTreeRegressor),
 ('ExtraTreesRegressor', sklearn.ensemble._forest.ExtraTreesRegressor),
 ('GammaRegressor', sklearn.linear_model._glm.glm.GammaRegressor),
 ('GaussianProcessRegressor',
  sklearn.gaussian_process._gpr.GaussianProcessRegressor),
 ('GradientBoostingRegressor', sklearn.ensemble._gb.GradientBoostingRegressor),
 ('HistGradientBoostingRegressor',
  sklearn.ensemble._hist_gradient_boosting.gradient_boosting.HistGradientBoostingRegressor),
 ('HuberRegressor', sk

In [None]:
# Exclude specific regressors for the MIMO dataset due to its large size.
#
# The training log and result table in this notebook cover 35 models, i.e. this
# filter WAS active when the results were produced. Leaving it commented out means
# a fresh "Restart & Run All" would attempt every registered regressor instead,
# so it is enabled here.
#
# Models are excluded by name rather than by list position so the cell is
# idempotent (re-running an index-based filter on the already-filtered list would
# drop further, unintended models) and independent of the ordering of REGRESSORS.
# NOTE(review): these names are the entries at the originally listed indices
# {10, 15, 26, 31, 32, 33, 37}, inferred from which models are absent from the
# training log -- confirm against the unfiltered list of the installed version.
EXCLUDED_REGRESSORS = {
    'GaussianProcessRegressor',
    'KernelRidge',
    'NuSVR',
    'QuantileRegressor',
    'RANSACRegressor',
    'RandomForestRegressor',
    'SVR',
}

# Update the REGRESSORS list by keeping only the models that are not excluded
lazypredict.Supervised.REGRESSORS = [
    (name, model)
    for name, model in lazypredict.Supervised.REGRESSORS
    if name not in EXCLUDED_REGRESSORS
]

In [14]:
# Re-display the registry to check which regressors remain before training.
lazypredict.Supervised.REGRESSORS

[('AdaBoostRegressor', sklearn.ensemble._weight_boosting.AdaBoostRegressor),
 ('BaggingRegressor', sklearn.ensemble._bagging.BaggingRegressor),
 ('BayesianRidge', sklearn.linear_model._bayes.BayesianRidge),
 ('DecisionTreeRegressor', sklearn.tree._classes.DecisionTreeRegressor),
 ('DummyRegressor', sklearn.dummy.DummyRegressor),
 ('ElasticNet', sklearn.linear_model._coordinate_descent.ElasticNet),
 ('ElasticNetCV', sklearn.linear_model._coordinate_descent.ElasticNetCV),
 ('ExtraTreeRegressor', sklearn.tree._classes.ExtraTreeRegressor),
 ('ExtraTreesRegressor', sklearn.ensemble._forest.ExtraTreesRegressor),
 ('GammaRegressor', sklearn.linear_model._glm.glm.GammaRegressor),
 ('GradientBoostingRegressor', sklearn.ensemble._gb.GradientBoostingRegressor),
 ('HistGradientBoostingRegressor',
  sklearn.ensemble._hist_gradient_boosting.gradient_boosting.HistGradientBoostingRegressor),
 ('HuberRegressor', sklearn.linear_model._huber.HuberRegressor),
 ('KNeighborsRegressor', sklearn.neighbors._re

In [15]:
# Benchmark runner. Despite the name `clf`, this is a regressor benchmark.
clf = LazyRegressor(
    verbose=2,             # emit a metrics dict after each fitted model (see training log)
    ignore_warnings=True,
    custom_metric=None,    # no extra metric beyond the built-in ones
)

## Model Training

In [16]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import r2_score

In [17]:
# Fit every registered regressor on the training split and score it on the TEST
# split. X_val / y_val from the split cell are not used by this call.
# Expensive: the progress bar below reports 8h11m for the 35 models.
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

  3%|▎         | 1/35 [51:11<29:00:21, 3071.22s/it]

{'Model': 'AdaBoostRegressor', 'R-Squared': 0.5356563022991652, 'Adjusted R-Squared': 0.5041753736414814, 'RMSE': 603.307111119831, 'Time taken': 3071.2200407981873}


  6%|▌         | 2/35 [2:04:49<35:25:04, 3863.77s/it]

{'Model': 'BaggingRegressor', 'R-Squared': 0.8971241160377775, 'Adjusted R-Squared': 0.8901494798369489, 'RMSE': 283.9719584046848, 'Time taken': 4418.545289516449}


  9%|▊         | 3/35 [2:05:24<18:48:15, 2115.47s/it]

{'Model': 'BayesianRidge', 'R-Squared': 0.760592238634259, 'Adjusted R-Squared': 0.7443612039653951, 'RMSE': 433.19933318058463, 'Time taken': 34.99801325798035}


 11%|█▏        | 4/35 [2:15:55<13:10:06, 1529.23s/it]

{'Model': 'DecisionTreeRegressor', 'R-Squared': 0.755169848239952, 'Adjusted R-Squared': 0.7385711938833386, 'RMSE': 438.0776709125366, 'Time taken': 630.5205833911896}


 14%|█▍        | 5/35 [2:16:06<8:10:58, 981.94s/it]  

{'Model': 'DummyRegressor', 'R-Squared': -3.811717960822136e-05, 'Adjusted R-Squared': -0.06783731156466644, 'RMSE': 885.3743653152842, 'Time taken': 11.534931182861328}


 17%|█▋        | 6/35 [2:17:11<5:23:47, 669.92s/it]

{'Model': 'ElasticNet', 'R-Squared': 0.6227831670428474, 'Adjusted R-Squared': 0.5972091444694811, 'RMSE': 543.7686917385848, 'Time taken': 64.2419126033783}


 20%|██        | 7/35 [2:19:44<3:53:52, 501.15s/it]

{'Model': 'ElasticNetCV', 'R-Squared': 0.6242018678896035, 'Adjusted R-Squared': 0.5987240284244919, 'RMSE': 542.7451798636415, 'Time taken': 153.6794731616974}


 23%|██▎       | 8/35 [2:21:33<2:49:19, 376.29s/it]

{'Model': 'ExtraTreeRegressor', 'R-Squared': 0.775757512185631, 'Adjusted R-Squared': 0.7605546316558434, 'RMSE': 419.25438932293713, 'Time taken': 108.950350522995}


 26%|██▌       | 9/35 [5:03:23<23:47:30, 3294.24s/it]

{'Model': 'ExtraTreesRegressor', 'R-Squared': 0.95510114701224, 'Adjusted R-Squared': 0.9520571569791716, 'RMSE': 187.60149277211465, 'Time taken': 9710.273380279541}


 29%|██▊       | 10/35 [5:03:40<15:50:56, 2282.25s/it]

{'Model': 'GammaRegressor', 'R-Squared': 0.6014961759864589, 'Adjusted R-Squared': 0.5744789675787612, 'RMSE': 558.9010341490982, 'Time taken': 16.23776364326477}


 31%|███▏      | 11/35 [7:14:41<26:35:54, 3989.76s/it]

{'Model': 'GradientBoostingRegressor', 'R-Squared': 0.7286646503579315, 'Adjusted R-Squared': 0.7102690334330455, 'RMSE': 461.1814764201545, 'Time taken': 7861.406818628311}


 34%|███▍      | 12/35 [7:15:35<17:50:24, 2792.39s/it]

{'Model': 'HistGradientBoostingRegressor', 'R-Squared': 0.8358834003225494, 'Adjusted R-Squared': 0.8247568511918748, 'RMSE': 358.6698917654643, 'Time taken': 53.74887561798096}


 37%|███▋      | 13/35 [7:19:45<12:21:29, 2022.25s/it]

{'Model': 'HuberRegressor', 'R-Squared': 0.732360721962398, 'Adjusted R-Squared': 0.7142156861632385, 'RMSE': 458.02964894226045, 'Time taken': 250.12253499031067}


 40%|████      | 14/35 [7:20:10<8:16:36, 1418.88s/it] 

{'Model': 'KNeighborsRegressor', 'R-Squared': 0.9994521326419012, 'Adjusted R-Squared': 0.9994149890921996, 'RMSE': 20.723179060917694, 'Time taken': 24.661972045898438}


 43%|████▎     | 15/35 [7:20:25<5:31:55, 995.79s/it] 

{'Model': 'Lars', 'R-Squared': -6578.082362149426, 'Adjusted R-Squared': -7024.121844329049, 'RMSE': 71812.71201801658, 'Time taken': 15.262139081954956}


 46%|████▌     | 16/35 [7:21:03<3:44:00, 707.37s/it]

{'Model': 'LarsCV', 'R-Squared': 0.3213884536707069, 'Adjusted R-Squared': 0.2753808912077039, 'RMSE': 729.3387509829897, 'Time taken': 37.58641004562378}


 49%|████▊     | 17/35 [7:22:36<2:36:49, 522.72s/it]

{'Model': 'Lasso', 'R-Squared': 0.7118344413200534, 'Adjusted R-Squared': 0.6922977932739552, 'RMSE': 475.2692356015117, 'Time taken': 93.3164975643158}


 51%|█████▏    | 18/35 [7:25:46<1:59:45, 422.66s/it]

{'Model': 'LassoCV', 'R-Squared': 0.7411743643400948, 'Adjusted R-Squared': 0.7236268636173893, 'RMSE': 450.4248174864578, 'Time taken': 189.71834635734558}


 54%|█████▍    | 19/35 [7:26:00<1:20:03, 300.19s/it]

{'Model': 'LassoLars', 'R-Squared': 0.7102933331246786, 'Adjusted R-Squared': 0.6906522031670297, 'RMSE': 476.53840991563527, 'Time taken': 14.902300596237183}


 57%|█████▋    | 20/35 [7:26:39<55:25, 221.71s/it]  

{'Model': 'LassoLarsCV', 'R-Squared': 0.7399866047982888, 'Adjusted R-Squared': 0.7223585780049524, 'RMSE': 451.4571418130005, 'Time taken': 38.78149056434631}


 60%|██████    | 21/35 [7:27:10<38:22, 164.45s/it]

{'Model': 'LassoLarsIC', 'R-Squared': 0.7399866047982888, 'Adjusted R-Squared': 0.7223585780049524, 'RMSE': 451.4571418130005, 'Time taken': 30.95286202430725}


 63%|██████▎   | 22/35 [7:27:36<26:36, 122.82s/it]

{'Model': 'LinearRegression', 'R-Squared': 0.7605230557896436, 'Adjusted R-Squared': 0.744287330758433, 'RMSE': 433.2619205360544, 'Time taken': 25.747642517089844}


 66%|██████▌   | 23/35 [7:28:36<20:49, 104.10s/it]

{'Model': 'LinearSVR', 'R-Squared': 0.7432524327900116, 'Adjusted R-Squared': 0.7258458180639107, 'RMSE': 448.61298004048285, 'Time taken': 60.42904043197632}


 69%|██████▊   | 24/35 [7:55:15<1:41:17, 552.52s/it]

{'Model': 'MLPRegressor', 'R-Squared': 0.9242758565326036, 'Adjusted R-Squared': 0.9191420162975259, 'RMSE': 243.63285326921334, 'Time taken': 1598.5466742515564}


 71%|███████▏  | 25/35 [7:55:30<1:05:11, 391.11s/it]

{'Model': 'OrthogonalMatchingPursuit', 'R-Squared': 0.7318978632459187, 'Adjusted R-Squared': 0.7137214471947945, 'RMSE': 458.4255389550779, 'Time taken': 14.559746742248535}


 74%|███████▍  | 26/35 [7:56:43<44:21, 295.73s/it]  

{'Model': 'OrthogonalMatchingPursuitCV', 'R-Squared': 0.7318978632459187, 'Adjusted R-Squared': 0.7137214471947945, 'RMSE': 458.4255389550779, 'Time taken': 73.21498799324036}


 77%|███████▋  | 27/35 [7:57:25<29:18, 219.81s/it]

{'Model': 'PassiveAggressiveRegressor', 'R-Squared': 0.5793634506999339, 'Adjusted R-Squared': 0.5508457185439972, 'RMSE': 574.2118749020027, 'Time taken': 42.67089223861694}


 80%|████████  | 28/35 [7:57:48<18:44, 160.61s/it]

{'Model': 'PoissonRegressor', 'R-Squared': 0.7167597383211027, 'Adjusted R-Squared': 0.6975570087157538, 'RMSE': 471.19010325210047, 'Time taken': 22.481600761413574}


 83%|████████▎ | 29/35 [7:58:03<11:41, 116.94s/it]

{'Model': 'Ridge', 'R-Squared': 0.7605307210998951, 'Adjusted R-Squared': 0.7442955157507354, 'RMSE': 433.2549864371827, 'Time taken': 15.053633213043213}


 86%|████████▌ | 30/35 [7:58:47<07:55, 95.04s/it] 

{'Model': 'RidgeCV', 'R-Squared': 0.760578533847905, 'Adjusted R-Squared': 0.7443465700409833, 'RMSE': 433.2117321506821, 'Time taken': 43.93590044975281}


 89%|████████▊ | 31/35 [8:09:30<17:17, 259.41s/it]

{'Model': 'SGDRegressor', 'R-Squared': 0.7322062192846215, 'Adjusted R-Squared': 0.7140507087276466, 'RMSE': 458.1618354390835, 'Time taken': 642.9411799907684}


 91%|█████████▏| 32/35 [8:09:56<09:28, 189.36s/it]

{'Model': 'TransformedTargetRegressor', 'R-Squared': 0.7605230557896436, 'Adjusted R-Squared': 0.744287330758433, 'RMSE': 433.2619205360544, 'Time taken': 25.895464658737183}


 94%|█████████▍| 33/35 [8:10:17<04:37, 138.85s/it]

{'Model': 'TweedieRegressor', 'R-Squared': 0.6178249323911105, 'Adjusted R-Squared': 0.5919147583159314, 'RMSE': 547.3307422929265, 'Time taken': 20.98958683013916}


 97%|█████████▋| 34/35 [8:11:02<01:50, 110.75s/it]

{'Model': 'XGBRegressor', 'R-Squared': 0.9033320546150208, 'Adjusted R-Squared': 0.8967782956058696, 'RMSE': 275.2706890475414, 'Time taken': 45.180272579193115}
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.631167 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 408000
[LightGBM] [Info] Number of data points in the train set: 201603, number of used features: 1600
[LightGBM] [Info] Start training from score 2592.445405


100%|██████████| 35/35 [8:11:41<00:00, 842.90s/it]

{'Model': 'LGBMRegressor', 'R-Squared': 0.8357662978216989, 'Adjusted R-Squared': 0.8246318095384242, 'RMSE': 358.79783023045417, 'Time taken': 39.11084794998169}





## Model Evaluation

In [18]:
# Leaderboard returned by clf.fit: one row per model (index "Model") with
# Adjusted R-Squared, R-Squared, RMSE and Time Taken columns.
models

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
KNeighborsRegressor,1.0,1.0,20.72,24.66
ExtraTreesRegressor,0.95,0.96,187.6,9710.27
MLPRegressor,0.92,0.92,243.63,1598.55
XGBRegressor,0.9,0.9,275.27,45.18
BaggingRegressor,0.89,0.9,283.97,4418.55
HistGradientBoostingRegressor,0.82,0.84,358.67,53.75
LGBMRegressor,0.82,0.84,358.8,39.11
ExtraTreeRegressor,0.76,0.78,419.25,108.95
BayesianRidge,0.74,0.76,433.2,35.0
RidgeCV,0.74,0.76,433.21,43.94


In [19]:
# Convert results into a DataFrame
# (`models` already renders as a table above; this binds it to the name used when saving.)
test_results = pd.DataFrame(models)

In [20]:
# Results for each dataset are written to their own sub-folder:
#   model_results/<file_name>/<file_name>_results_test_Y.txt
# The sub-folder itself must exist before to_csv is called; creating only the
# top-level 'model_results' directory is not enough. os.makedirs creates every
# missing level of the path, and exist_ok=True makes the cell safe to re-run.
results_dir = f'model_results/{file_name}'
os.makedirs(results_dir, exist_ok=True)

# Save to a text file (CSV-formatted). The index holds the model names, so keep it.
test_results.to_csv(f'{results_dir}/{file_name}_results_test_Y.txt', index=True)