## LOAD FORECASTING PROJECT 

In [1]:
import warnings
warnings.filterwarnings('ignore')

**Loading** **DATA**

In [2]:
import pandas as pd
Data = pd.read_csv('continuous dataset.csv')

**Test-train Split**

In [3]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(Data, test_size=0.2, random_state =42)

**Generating Lagged Features and Rolling Statistics for Time Series Analysis**

In [4]:
windows = [12, 24, 128]
for column in train_df.columns:
    if column == 'nat_demand':
        for window in windows:
            train_df[f"{column}_lag_{window}"] = train_df[column].shift(window)
            train_df[f"{column}_ma_mean{window}"] = train_df[column].rolling(window).mean()
            train_df[f"{column}_std_std{window}"] = train_df[column].rolling(window).std()
            train_df[f"{column}_ewm_std{window}"] = train_df[column].ewm(window).std()
            train_df[f"{column}_ewm_mean{window}"] = train_df[column].ewm(window).mean()


    if column != 'datetime' and column != 'holiday' and column != 'school' and column != 'Holiday_ID' and column != 'nat_demand':
        for window in windows:
            train_df[f"{column}_lag_{window}"] = train_df[column].shift(window)
            train_df[f"{column}_ma_mean{window}"] = train_df[column].rolling(window).mean()
            train_df[f"{column}_std_std{window}"] = train_df[column].rolling(window).std()
            train_df[f"{column}_ewm_std{window}"] = train_df[column].ewm(window).std()
            train_df[f"{column}_ewm_mean{window}"] = train_df[column].ewm(window).mean()
            train_df[f"{column}_min_max{window}"] = (train_df[column] -train_df[column].rolling(window).min()) / (train_df[column].rolling(window).max() - train_df[column].rolling(window).min())
            train_df[f"{column}_median{window}"] = train_df[column].rolling(window).median()
            train_df[f"{column}_skew{window}"] = train_df[column].rolling(window).skew()
            train_df[f"{column}_kurt{window}"] = train_df[column].rolling(window).kurt()
            train_df[f"{column}_p50{window}"] = train_df[column].rolling(window).quantile(0.5)

In [5]:
windows = [12, 24, 128]
for column in test_df.columns:
    if column == 'nat_demand':
        for window in windows:
            test_df[f"{column}_lag_{window}"] = test_df[column].shift(window)
            test_df[f"{column}_ma_mean{window}"] = test_df[column].rolling(window).mean()
            test_df[f"{column}_std_std{window}"] = test_df[column].rolling(window).std()
            test_df[f"{column}_ewm_std{window}"] = test_df[column].ewm(window).std()
            test_df[f"{column}_ewm_mean{window}"] = test_df[column].ewm(window).mean()


    if column != 'datetime' and column != 'holiday' and column != 'school' and column != 'Holiday_ID' and column != 'nat_demand':
        for window in windows:
            test_df[f"{column}_lag_{window}"] = test_df[column].shift(window)
            test_df[f"{column}_ma_mean{window}"] = test_df[column].rolling(window).mean()
            test_df[f"{column}_std_std{window}"] = test_df[column].rolling(window).std()
            test_df[f"{column}_ewm_std{window}"] = test_df[column].ewm(window).std()
            test_df[f"{column}_ewm_mean{window}"] = test_df[column].ewm(window).mean()
            test_df[f"{column}_min_max{window}"] = (test_df[column] -test_df[column].rolling(window).min()) / (test_df[column].rolling(window).max() - test_df[column].rolling(window).min())
            test_df[f"{column}_median{window}"] = test_df[column].rolling(window).median()
            test_df[f"{column}_skew{window}"] = test_df[column].rolling(window).skew()
            test_df[f"{column}_kurt{window}"] = test_df[column].rolling(window).kurt()
            test_df[f"{column}_p50{window}"] = test_df[column].rolling(window).quantile(0.5)

**Removing Rows with Missing Values from Training and Testing DataFrames**

In [6]:
train_df.dropna(inplace = True)

In [7]:
test_df.dropna(inplace = True)

#### Performing test train split

In [8]:
X_train, X_test = train_df.drop(columns = ['nat_demand','datetime']), test_df.drop(columns = ['nat_demand','datetime'])

In [9]:
y_train, y_test = train_df['nat_demand'], test_df['nat_demand']

#### Standardization {Scaling)

In [10]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#### formation of d-matrix 

In [11]:
import xgboost as xgb
dtrain = xgb.DMatrix(X_train_scaled, label=y_train, feature_names=X_train.columns.tolist())
dtest = xgb.DMatrix(X_test_scaled, label=y_test, feature_names=X_test.columns.tolist())

#### Defining objective function (xgboost)

In [12]:
from sklearn.metrics import mean_squared_error
def objective(trial):
    params = {
        'objective': 'reg:squarederror',
        'tree_method': 'gpu_hist',
        'gpu_id': 0,
        'eta': trial.suggest_float('eta', 0.001, 0.5),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.1, 20),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0.0, 5.0),
        'lambda': trial.suggest_float('lambda', 0.0, 2.0),
        'alpha': trial.suggest_float('alpha', 0.0, 2.0),
        'nthread': -1,
        'seed': 42
    }

    num_round = 100
    model = xgb.train(params, dtrain, num_round)

    predictions = model.predict(dtest)
    rmse = mean_squared_error(y_test, predictions, squared=False)
    return rmse

#### Performing hyperparameter tuning

In [13]:
import optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20)

[I 2024-07-26 12:09:51,240] A new study created in memory with name: no-name-c344ee46-6995-45ee-a19e-1901bc8bfb2d
[I 2024-07-26 12:10:19,277] Trial 0 finished with value: 123.29504309645017 and parameters: {'eta': 0.3756137657135743, 'max_depth': 12, 'min_child_weight': 19.338400208286004, 'subsample': 0.6932082849673241, 'colsample_bytree': 0.8312541473509123, 'gamma': 2.615262557101587, 'lambda': 0.2894711663289764, 'alpha': 0.7621549352149095}. Best is trial 0 with value: 123.29504309645017.
[I 2024-07-26 12:11:05,435] Trial 1 finished with value: 109.8067608782087 and parameters: {'eta': 0.026289933367636575, 'max_depth': 12, 'min_child_weight': 8.321509807728463, 'subsample': 0.7270597153709595, 'colsample_bytree': 0.6922478160608861, 'gamma': 2.1894020796062375, 'lambda': 1.1317558871069473, 'alpha': 1.5279580917047217}. Best is trial 1 with value: 109.8067608782087.
[I 2024-07-26 12:11:44,300] Trial 2 finished with value: 116.70342292268056 and parameters: {'eta': 0.346925872100

In [14]:
best_params = study.best_params
best_params['tree_method'] = 'gpu_hist'
best_params['gpu_id'] = 0
print("Best params:", best_params)
xgb_best_params = best_params

Best params: {'eta': 0.042385612010180464, 'max_depth': 20, 'min_child_weight': 9.067183102166886, 'subsample': 0.74024831902097, 'colsample_bytree': 0.9887975245714852, 'gamma': 3.4140152265325563, 'lambda': 0.41403538927045624, 'alpha': 1.025166769192781, 'tree_method': 'gpu_hist', 'gpu_id': 0}


In [15]:
xgb_model = xgb.train(best_params, dtrain, 100)

In [16]:
predictions = xgb_model.predict(dtest)

#### Evaluation Scores

In [17]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, \
    max_error, mean_poisson_deviance, mean_gamma_deviance, mean_tweedie_deviance, mean_absolute_percentage_error

rmse_test = mean_squared_error(y_test, predictions, squared=False)
mae_test = mean_absolute_error(y_test, predictions)
r2_test = r2_score(y_test, predictions)
explained_variance = explained_variance_score(y_test, predictions)
max_err = max_error(y_test, predictions)
poisson_deviance = mean_poisson_deviance(y_test, predictions)
gamma_deviance = mean_gamma_deviance(y_test, predictions)
tweedie_deviance = mean_tweedie_deviance(y_test, predictions)
mape = mean_absolute_percentage_error(y_test, predictions)
print(f"efficiency: {[1 - mae_test]*100})
print(f"RMSE on test data: {rmse_test}")
print(f"MAE on test data: {mae_test}")
print("Mean Absolute Percentage Error:", mape)
print(f"R-squared on test data: {r2_test}")
print(f"Explained Variance on test data: {explained_variance}")
print(f"Max Error on test data: {max_err}")
print(f"Mean Poisson Deviance on test data: {poisson_deviance}")
print(f"Mean Gamma Deviance on test data: {gamma_deviance}")
print(f"Mean Tweedie Deviance on test data: {tweedie_deviance}")

RMSE on test data: 107.80262871991268
MAE on test data: 84.43909390960286
Mean Absolute Percentage Error: 0.07386719302404203
R-squared on test data: 0.6856678657443636
Explained Variance on test data: 0.68569659900662
Max Error on test data: 1007.59040390625
Mean Poisson Deviance on test data: 9.84393125184069
Mean Gamma Deviance on test data: 0.008559882795658027
Mean Tweedie Deviance on test data: 11621.40675892334


#### Formation of lgb dataset

In [19]:
import lightgbm as lgb
train_data = lgb.Dataset(X_train_scaled, label=y_train)
test_data = lgb.Dataset(X_test_scaled, label=y_test, reference=train_data)

#### Defining Objective function (LightGBM)

In [20]:
def objective(trial):
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 4, 64),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'num_boost_round': 1000,
        'early_stopping_rounds': 50,
        'device': 'cpu',  # Use GPU for training

    }


    model = lgb.train(params, train_data, valid_sets=[test_data])

    predictions = model.predict(X_test_scaled)
    rmse = mean_squared_error(y_test, predictions, squared=False)
    return rmse

#### Hyperparameter tuning

In [21]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20)

[I 2024-07-26 12:32:56,856] A new study created in memory with name: no-name-c6f663e2-1fbc-4aab-b3b8-69c53757cb7a


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.060672 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 98712
[LightGBM] [Info] Number of data points in the train set: 38310, number of used features: 390
[LightGBM] [Info] Start training from score 1183.328632
Training until validation scores don't improve for 50 rounds


[I 2024-07-26 12:33:15,498] Trial 0 finished with value: 103.67270599425187 and parameters: {'num_leaves': 48, 'learning_rate': 0.04692662204134025, 'feature_fraction': 0.7376093034130742, 'bagging_fraction': 0.5946904375729118, 'bagging_freq': 2}. Best is trial 0 with value: 103.67270599425187.


Early stopping, best iteration is:
[813]	valid_0's rmse: 103.673
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.047079 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 98712
[LightGBM] [Info] Number of data points in the train set: 38310, number of used features: 390
[LightGBM] [Info] Start training from score 1183.328632
Training until validation scores don't improve for 50 rounds


[I 2024-07-26 12:33:26,248] Trial 1 finished with value: 102.99844929196445 and parameters: {'num_leaves': 28, 'learning_rate': 0.07931549695224117, 'feature_fraction': 0.7567388981800469, 'bagging_fraction': 0.776058057221476, 'bagging_freq': 2}. Best is trial 1 with value: 102.99844929196445.


Early stopping, best iteration is:
[721]	valid_0's rmse: 102.998
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.043468 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 98712
[LightGBM] [Info] Number of data points in the train set: 38310, number of used features: 390
[LightGBM] [Info] Start training from score 1183.328632
Training until validation scores don't improve for 50 rounds


[I 2024-07-26 12:33:37,937] Trial 2 finished with value: 105.46444382702241 and parameters: {'num_leaves': 27, 'learning_rate': 0.013354767252559279, 'feature_fraction': 0.5265341919832445, 'bagging_fraction': 0.6162445486964399, 'bagging_freq': 1}. Best is trial 1 with value: 102.99844929196445.


Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 105.464
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.046725 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 98712
[LightGBM] [Info] Number of data points in the train set: 38310, number of used features: 390
[LightGBM] [Info] Start training from score 1183.328632
Training until validation scores don't improve for 50 rounds


[I 2024-07-26 12:33:44,417] Trial 3 finished with value: 106.76353266327597 and parameters: {'num_leaves': 44, 'learning_rate': 0.09725443645057019, 'feature_fraction': 0.5953265128975445, 'bagging_fraction': 0.5124731643661857, 'bagging_freq': 9}. Best is trial 1 with value: 102.99844929196445.


Early stopping, best iteration is:
[251]	valid_0's rmse: 106.764
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.070906 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 98712
[LightGBM] [Info] Number of data points in the train set: 38310, number of used features: 390
[LightGBM] [Info] Start training from score 1183.328632
Training until validation scores don't improve for 50 rounds


[I 2024-07-26 12:34:02,275] Trial 4 finished with value: 103.79591780249655 and parameters: {'num_leaves': 52, 'learning_rate': 0.06772693001978898, 'feature_fraction': 0.9394273285493433, 'bagging_fraction': 0.6519182198875291, 'bagging_freq': 8}. Best is trial 1 with value: 102.99844929196445.


Early stopping, best iteration is:
[584]	valid_0's rmse: 103.796
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.060304 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 98712
[LightGBM] [Info] Number of data points in the train set: 38310, number of used features: 390
[LightGBM] [Info] Start training from score 1183.328632
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[997]	valid_0's rmse: 101.848


[I 2024-07-26 12:34:35,875] Trial 5 finished with value: 101.84810745241991 and parameters: {'num_leaves': 57, 'learning_rate': 0.034151254281399444, 'feature_fraction': 0.9219470837278243, 'bagging_fraction': 0.989866891630629, 'bagging_freq': 9}. Best is trial 5 with value: 101.84810745241991.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.080311 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 98712
[LightGBM] [Info] Number of data points in the train set: 38310, number of used features: 390
[LightGBM] [Info] Start training from score 1183.328632
Training until validation scores don't improve for 50 rounds


[I 2024-07-26 12:35:08,883] Trial 6 finished with value: 103.00058131562726 and parameters: {'num_leaves': 40, 'learning_rate': 0.01693409132986115, 'feature_fraction': 0.8521439449560184, 'bagging_fraction': 0.765915835603619, 'bagging_freq': 10}. Best is trial 5 with value: 101.84810745241991.


Did not meet early stopping. Best iteration is:
[997]	valid_0's rmse: 103.001
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.064440 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 98712
[LightGBM] [Info] Number of data points in the train set: 38310, number of used features: 390
[LightGBM] [Info] Start training from score 1183.328632
Training until validation scores don't improve for 50 rounds


[I 2024-07-26 12:35:19,769] Trial 7 finished with value: 102.91589731945521 and parameters: {'num_leaves': 46, 'learning_rate': 0.06953234640795944, 'feature_fraction': 0.7874326525828075, 'bagging_fraction': 0.8031135274036425, 'bagging_freq': 4}. Best is trial 5 with value: 101.84810745241991.


Early stopping, best iteration is:
[413]	valid_0's rmse: 102.916
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.053662 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 98712
[LightGBM] [Info] Number of data points in the train set: 38310, number of used features: 390
[LightGBM] [Info] Start training from score 1183.328632
Training until validation scores don't improve for 50 rounds


[I 2024-07-26 12:35:26,776] Trial 8 finished with value: 104.53155616896595 and parameters: {'num_leaves': 55, 'learning_rate': 0.08345860324935771, 'feature_fraction': 0.8225238228478309, 'bagging_fraction': 0.6107859827057074, 'bagging_freq': 3}. Best is trial 5 with value: 101.84810745241991.


Early stopping, best iteration is:
[217]	valid_0's rmse: 104.532
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.033693 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 98712
[LightGBM] [Info] Number of data points in the train set: 38310, number of used features: 390
[LightGBM] [Info] Start training from score 1183.328632
Training until validation scores don't improve for 50 rounds


[I 2024-07-26 12:35:44,716] Trial 9 finished with value: 105.93339877216056 and parameters: {'num_leaves': 52, 'learning_rate': 0.006797346728814194, 'feature_fraction': 0.5506765250737877, 'bagging_fraction': 0.5657243091511359, 'bagging_freq': 1}. Best is trial 5 with value: 101.84810745241991.


Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 105.933
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.045841 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 98712
[LightGBM] [Info] Number of data points in the train set: 38310, number of used features: 390
[LightGBM] [Info] Start training from score 1183.328632
Training until validation scores don't improve for 50 rounds


[I 2024-07-26 12:35:52,566] Trial 10 finished with value: 106.78425902177307 and parameters: {'num_leaves': 7, 'learning_rate': 0.03617580195770336, 'feature_fraction': 0.9646772196825344, 'bagging_fraction': 0.968484534552954, 'bagging_freq': 7}. Best is trial 5 with value: 101.84810745241991.


Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 106.784
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.051008 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 98712
[LightGBM] [Info] Number of data points in the train set: 38310, number of used features: 390
[LightGBM] [Info] Start training from score 1183.328632
Training until validation scores don't improve for 50 rounds


[I 2024-07-26 12:36:15,233] Trial 11 finished with value: 102.26200396096323 and parameters: {'num_leaves': 64, 'learning_rate': 0.058971160483864735, 'feature_fraction': 0.6813260686732793, 'bagging_fraction': 0.9971551300355685, 'bagging_freq': 5}. Best is trial 5 with value: 101.84810745241991.


Early stopping, best iteration is:
[650]	valid_0's rmse: 102.262
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.087410 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 98712
[LightGBM] [Info] Number of data points in the train set: 38310, number of used features: 390
[LightGBM] [Info] Start training from score 1183.328632
Training until validation scores don't improve for 50 rounds


[I 2024-07-26 12:36:40,976] Trial 12 finished with value: 102.09804474269957 and parameters: {'num_leaves': 62, 'learning_rate': 0.03316480758461893, 'feature_fraction': 0.6431043481364207, 'bagging_fraction': 0.9946158059459578, 'bagging_freq': 6}. Best is trial 5 with value: 101.84810745241991.


Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 102.098
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.045784 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 98712
[LightGBM] [Info] Number of data points in the train set: 38310, number of used features: 390
[LightGBM] [Info] Start training from score 1183.328632
Training until validation scores don't improve for 50 rounds


[I 2024-07-26 12:37:04,069] Trial 13 finished with value: 101.93488603244123 and parameters: {'num_leaves': 61, 'learning_rate': 0.029583087091448138, 'feature_fraction': 0.6452159049051135, 'bagging_fraction': 0.8888907228018502, 'bagging_freq': 6}. Best is trial 5 with value: 101.84810745241991.


Did not meet early stopping. Best iteration is:
[997]	valid_0's rmse: 101.935
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.046432 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 98712
[LightGBM] [Info] Number of data points in the train set: 38310, number of used features: 390
[LightGBM] [Info] Start training from score 1183.328632
Training until validation scores don't improve for 50 rounds


[I 2024-07-26 12:37:30,270] Trial 14 finished with value: 102.55605717056523 and parameters: {'num_leaves': 35, 'learning_rate': 0.027538162570958227, 'feature_fraction': 0.9004046172597813, 'bagging_fraction': 0.8790648638309966, 'bagging_freq': 7}. Best is trial 5 with value: 101.84810745241991.


Did not meet early stopping. Best iteration is:
[995]	valid_0's rmse: 102.556
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.064119 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 98712
[LightGBM] [Info] Number of data points in the train set: 38310, number of used features: 390
[LightGBM] [Info] Start training from score 1183.328632
Training until validation scores don't improve for 50 rounds


[I 2024-07-26 12:37:50,561] Trial 15 finished with value: 102.82733016858171 and parameters: {'num_leaves': 58, 'learning_rate': 0.05032799623984032, 'feature_fraction': 0.6971158299255474, 'bagging_fraction': 0.897869652539974, 'bagging_freq': 10}. Best is trial 5 with value: 101.84810745241991.


Early stopping, best iteration is:
[695]	valid_0's rmse: 102.827
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.047821 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 98712
[LightGBM] [Info] Number of data points in the train set: 38310, number of used features: 390
[LightGBM] [Info] Start training from score 1183.328632
Training until validation scores don't improve for 50 rounds


[I 2024-07-26 12:37:55,634] Trial 16 finished with value: 112.32445628640731 and parameters: {'num_leaves': 4, 'learning_rate': 0.024250706633501106, 'feature_fraction': 0.6087986584339761, 'bagging_fraction': 0.8996671317549724, 'bagging_freq': 8}. Best is trial 5 with value: 101.84810745241991.


Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 112.324
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.049181 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 98712
[LightGBM] [Info] Number of data points in the train set: 38310, number of used features: 390
[LightGBM] [Info] Start training from score 1183.328632
Training until validation scores don't improve for 50 rounds


[I 2024-07-26 12:38:08,614] Trial 17 finished with value: 104.44342660042416 and parameters: {'num_leaves': 13, 'learning_rate': 0.037926464915145064, 'feature_fraction': 0.9990101121158986, 'bagging_fraction': 0.8411775652552076, 'bagging_freq': 5}. Best is trial 5 with value: 101.84810745241991.


Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 104.443
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.079307 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 98712
[LightGBM] [Info] Number of data points in the train set: 38310, number of used features: 390
[LightGBM] [Info] Start training from score 1183.328632
Training until validation scores don't improve for 50 rounds


[I 2024-07-26 12:38:52,843] Trial 18 finished with value: 109.00889970227763 and parameters: {'num_leaves': 59, 'learning_rate': 0.0031669157188069785, 'feature_fraction': 0.8909600546829083, 'bagging_fraction': 0.7059731937879508, 'bagging_freq': 6}. Best is trial 5 with value: 101.84810745241991.


Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 109.009
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.049269 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 98712
[LightGBM] [Info] Number of data points in the train set: 38310, number of used features: 390
[LightGBM] [Info] Start training from score 1183.328632
Training until validation scores don't improve for 50 rounds


[I 2024-07-26 12:39:08,396] Trial 19 finished with value: 103.85596247681093 and parameters: {'num_leaves': 20, 'learning_rate': 0.042564026434113, 'feature_fraction': 0.7078976962387798, 'bagging_fraction': 0.9369495173495703, 'bagging_freq': 9}. Best is trial 5 with value: 101.84810745241991.


Did not meet early stopping. Best iteration is:
[994]	valid_0's rmse: 103.856


In [22]:
best_params = study.best_params
lgb_best_params = best_params
print("Best Hyperparameters:", best_params)

Best Hyperparameters: {'num_leaves': 57, 'learning_rate': 0.034151254281399444, 'feature_fraction': 0.9219470837278243, 'bagging_fraction': 0.989866891630629, 'bagging_freq': 9}


In [23]:
lgb_model = lgb.train(best_params, train_data, valid_sets=[test_data])

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.097756 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 98712
[LightGBM] [Info] Number of data points in the train set: 38310, number of used features: 390
[LightGBM] [Info] Start training from score 1183.328632


#### Evaluation scores

In [26]:
predictions = lgb_model.predict(X_test_scaled)

rmse_test = mean_squared_error(y_test, predictions, squared=False)
mae_test = mean_absolute_error(y_test, predictions)
r2_test = r2_score(y_test, predictions)
explained_variance = explained_variance_score(y_test, predictions)
max_err = max_error(y_test, predictions)
poisson_deviance = mean_poisson_deviance(y_test, predictions)
gamma_deviance = mean_gamma_deviance(y_test, predictions)
tweedie_deviance = mean_tweedie_deviance(y_test, predictions)
mape = mean_absolute_percentage_error(y_test, predictions)
print(f"RMSE on test data: {rmse_test}")
print(f"MAE on test data: {mae_test}")
print("Mean Absolute Percentage Error:", mape)
print(f"R-squared on test data: {r2_test}")
print(f"Explained Variance on test data: {explained_variance}")
print(f"Max Error on test data: {max_err}")
print(f"Mean Poisson Deviance on test data: {poisson_deviance}")
print(f"Mean Gamma Deviance on test data: {gamma_deviance}")
print(f"Mean Tweedie Deviance on test data: {tweedie_deviance}")

RMSE on test data: 108.29433654033951
MAE on test data: 85.5117087475924
Mean Absolute Percentage Error: 0.07493469679355141
R-squared on test data: 0.6827938717069257
Explained Variance on test data: 0.6827957279014445
Max Error on test data: 994.2780689639762
Mean Poisson Deviance on test data: 9.957943504702682
Mean Gamma Deviance on test data: 0.008680603516445335
Mean Tweedie Deviance on test data: 11727.663326712314


#### Linear Regression 

In [28]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train_scaled,y_train)

#### Evalution scores

In [29]:
y_pred = model.predict(X_test_scaled)

rmse_test = mean_squared_error(y_test, y_pred, squared=False)
mae_test = mean_absolute_error(y_test, y_pred)
r2_test = r2_score(y_test, y_pred)
explained_variance = explained_variance_score(y_test, y_pred)
max_err = max_error(y_test, y_pred)
poisson_deviance = mean_poisson_deviance(y_test, y_pred)
gamma_deviance = mean_gamma_deviance(y_test, y_pred)
tweedie_deviance = mean_tweedie_deviance(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f"RMSE on test data: {rmse_test}")
print(f"MAE on test data: {mae_test}")
print("Mean Absolute Percentage Error:", mape)
print(f"R-squared on test data: {r2_test}")
print(f"Explained Variance on test data: {explained_variance}")
print(f"Max Error on test data: {max_err}")
print(f"Mean Poisson Deviance on test data: {poisson_deviance}")
print(f"Mean Gamma Deviance on test data: {gamma_deviance}")
print(f"Mean Tweedie Deviance on test data: {tweedie_deviance}")

RMSE on test data: 111.14056568023987
MAE on test data: 89.03805885894238
Mean Absolute Percentage Error: 0.07788497269705887
R-squared on test data: 0.6659009159230549
Explained Variance on test data: 0.6659027858491346
Max Error on test data: 850.023528072639
Mean Poisson Deviance on test data: 10.559710139055353
Mean Gamma Deviance on test data: 0.009266376597674871
Mean Tweedie Deviance on test data: 12352.225339723713


#### LassoRegression

In [30]:
from sklearn.linear_model import LassoCV
model = LassoCV()
model.fit(X_train_scaled,y_train)

#### Evaluation Scores

In [31]:
y_pred = model.predict(X_test_scaled)

rmse_test = mean_squared_error(y_test, y_pred, squared=False)
mae_test = mean_absolute_error(y_test, y_pred)
r2_test = r2_score(y_test, y_pred)
explained_variance = explained_variance_score(y_test, y_pred)
max_err = max_error(y_test, y_pred)
poisson_deviance = mean_poisson_deviance(y_test, y_pred)
gamma_deviance = mean_gamma_deviance(y_test, y_pred)
tweedie_deviance = mean_tweedie_deviance(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f"RMSE on test data: {rmse_test}")
print(f"MAE on test data: {mae_test}")
print("Mean Absolute Percentage Error:", mape)
print(f"R-squared on test data: {r2_test}")
print(f"Explained Variance on test data: {explained_variance}")
print(f"Max Error on test data: {max_err}")
print(f"Mean Poisson Deviance on test data: {poisson_deviance}")
print(f"Mean Gamma Deviance on test data: {gamma_deviance}")
print(f"Mean Tweedie Deviance on test data: {tweedie_deviance}")

RMSE on test data: 111.69384819783046
MAE on test data: 89.6420296316115
Mean Absolute Percentage Error: 0.07846212135494196
R-squared on test data: 0.6625661965713914
Explained Variance on test data: 0.6625724021916811
Max Error on test data: 895.4612779533461
Mean Poisson Deviance on test data: 10.650020414830395
Mean Gamma Deviance on test data: 0.009330159589739767
Mean Tweedie Deviance on test data: 12475.515725239997
