In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
!unzip /content/drive/MyDrive/Thesis/Data/compressed_data.zip
!unzip /content/drive/MyDrive/Thesis/Data/cosine_similarity.zip

Archive:  /content/drive/MyDrive/Thesis/Data/compressed_data.zip
  inflating: demand_graphs.pkl.npz   
  inflating: final_model_input_partial_scale_4.csv  
Archive:  /content/drive/MyDrive/Thesis/Data/cosine_similarity.zip
  inflating: cosine_similarity.csv   


In [3]:
!unzip /content/drive/MyDrive/Thesis/Data/returns.zip

Archive:  /content/drive/MyDrive/Thesis/Data/returns.zip
  inflating: demand_model_o_xgb_01_11.sav  
  inflating: test_predictions_o_xgb_01_11.csv  


In [4]:

import math
#import torch
import pickle
import numpy as np
import pandas as pd
#import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from datetime import datetime, timedelta
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, mean_absolute_error
import joblib

from sklearn.decomposition import PCA
# check xgboost version
from xgboost import XGBRegressor


#torch stuff
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

#from torch_geometric.utils import dense_to_sparse


import lightgbm as lgb
from lightgbm import LGBMRegressor
import holidays

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

directory = '/content/drive/MyDrive/Thesis'
data_dir = directory + "/Data"
models_dir = directory + "/models"

In [5]:
from hyperopt.pyll.base import scope
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [6]:

execfile('/content/drive/MyDrive/Thesis/models_training/model_training.py')

Using device: cpu


In [7]:
target_scaler = joblib.load(f'{models_dir}/target_scaler.sav')

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [8]:

#################################
## Cosine Similarity
#################################

cosine_similarity = pd.read_csv(f'cosine_similarity.csv').drop(columns=['Unnamed: 0'])
cols = [f"cosine_sim_{i}" for i in range(183)]
cosine_similarity[cols] = cosine_similarity[cols].astype('float32')
cosine_similarity['started_at_hourly'] = pd.to_datetime(cosine_similarity['started_at_hourly'])

#################################
## PCA
#################################

PCA_DIM = 1

pca_test = cosine_similarity[cosine_similarity['started_at_hourly'] >= pd.to_datetime("2024-01-01 00:00:00")]
pca_train = cosine_similarity[cosine_similarity['started_at_hourly'] < pd.to_datetime("2024-01-01 00:00:00")]

del cosine_similarity

pca_train_time_clusters = pca_train[['started_at_hourly', 'start_station_cluster']]
pca_test_time_clusters = pca_test[['started_at_hourly', 'start_station_cluster']]

pca_train.drop(columns=['started_at_hourly', 'start_station_cluster'], inplace=True)
pca_test.drop(columns=['started_at_hourly', 'start_station_cluster'], inplace=True)

pca = PCA(n_components=PCA_DIM, svd_solver='arpack')

m = pca.fit_transform(pca_train[cols])
n = pca.transform(pca_test[cols])
del pca_train

n.shape


df_train_pca = pd.DataFrame(m, columns=[f"cosine_sim_pca_{i}" for i in range(PCA_DIM)])
df_test_pca = pd.DataFrame(n, columns=[f"cosine_sim_pca_{i}" for i in range(PCA_DIM)])

del m
del n

df_train_pca[['started_at_hourly', 'start_station_cluster']] = pca_train_time_clusters[['started_at_hourly', 'start_station_cluster']]
df_test_pca[['started_at_hourly', 'start_station_cluster']] = pca_test_time_clusters[['started_at_hourly', 'start_station_cluster']]

#################################
#################################

df = pd.read_csv(f'final_model_input_partial_scale_4.csv')

suf = "_normal_xgb_01_11"

returns = pd.read_csv(f"{models_dir}/test_returns_predictions{suf}.csv").rename(
    columns={
        "end_station_cluster": "start_station_cluster",
        "pred": "returns"
        }
    )[['started_at_hourly', 'returns', 'start_station_cluster']]

#################################
#################################

df = df.merge(returns, on=['started_at_hourly', 'start_station_cluster'], how='left').fillna(0)

df['started_at_hourly'] = pd.to_datetime(df['started_at_hourly'])
df = df.sort_values(by=['start_station_cluster', 'started_at_hourly'])

df_test = df[df['started_at_hourly'] >= pd.to_datetime("2024-01-01 00:00:00")]
df_train = df[df['started_at_hourly'] < pd.to_datetime("2024-01-01 00:00:00")]

del df
del returns

df_train = df_train.merge(df_train_pca, on=[
    'started_at_hourly', 'start_station_cluster'
    ], how='left').fillna(0)

df_test = df_test.merge(df_test_pca, on=[
    'started_at_hourly', 'start_station_cluster'
    ], how='left').fillna(0)


df_hold_out = df_test[df_test['started_at_hourly'] >= pd.to_datetime("2024-03-25 00:00:00")]
df_test = df_test[df_test['started_at_hourly'] < pd.to_datetime("2024-03-25 00:00:00")]

#del df_train_pca
#del df_test_pca


#################################
#################################

In [9]:
del df_train_pca
del df_test_pca
del pca_test
del pca_test_time_clusters
del pca_train_time_clusters

# GNN Variance Embedding

In [10]:
FEATURES = ['start_station_cluster',# 'started_at_year',
            'started_at_month', 'started_at_day',
            'started_at_hour',
            'started_at_week',
            'started_at_quarter',
            'started_at_dayofweek',
            'is_holiday',
            #'flag_added',
            "temp",
            "dwpt",
            "rhum",
            "prcp",
            "wdir",
            "wspd",
            "pres",
            "coco",
            "demand_lag_1_h",
            "demand_lag_2_h",
            "demand_lag_24_h",
            "temp_lag_1_h",
            "temp_lag_2_h",
            "temp_lag_24_h",
            "prcp_lag_1_h",
            "prcp_lag_2_h",
            "prcp_lag_24_h",
            "rhum_lag_1_h",
            "rhum_lag_2_h",
            "rhum_lag_24_h",
            "wspd_lag_1_h",
            "wspd_lag_2_h",
            "wspd_lag_24_h",
            # "coco_lag_1_h",
            # "coco_lag_2_h",
            # "coco_lag_24_h",
            "mean_gnn_cluster_demand_1h",
            "total_gnn_cluster_demand_1h",
            "total_demand_1h",
            "demand_degrees_1h",
            "returns"
]
#FEATURES.extend([f"dim_mean_{i}" for i in range(50)])
FEATURES.extend([f"cosine_sim_pca_{i}" for i in range(PCA_DIM)])

In [11]:
def train_model(df_train, df_test, FEATURES, model='xgb', epochs=50, batch_size=32, tuning=False):
    TARGET = 'demand'
    TARGET_TRAIN = 'demand_target'
    X_train = df_train[FEATURES]
    y_train = df_train[TARGET_TRAIN]

    X_test = df_test[FEATURES]
    y_test = df_test[TARGET]

    overall_zero_scores = pd.DataFrame()
    non_zero_scores = pd.DataFrame()
    zero_scores = pd.DataFrame()
    preds_list = []
    #10, 20, 30, 42, 50, 60, 70, 80, 90,
    for state in [0]:
        if model=='rf':
            reg = RandomForestRegressor(random_state=state)
            reg.fit(X_train, y_train)

        if model=='xgb':
            if tuning:
               reg = tuning_xgb(X_train, y_train, X_test, y_test)
            else:
               if state==0:
                   state = None
               reg = XGBRegressor(random_state=state)
               reg.fit(X_train, y_train)

        if model=='lgbm':
            # Define the parameter grid
            if tuning:
                param_grid = {
                    'learning_rate': [0.01, 0.3, 0.05, 0.1, 0.2],
                    'num_leaves': [10, 20, 31, 40, 50, 60],
                    'max_depth': [-1, 10, 20, 30, 50],
                    'feature_fraction': [0.6, 0.8, 1.0],
                    'bagging_fraction': [0.6, 0.8, 1.0],
                    'n_estimators': [20, 40, 60, 80, 100, 120, 140]
                }

                # Initialize the LightGBM model
                model = lgb.LGBMRegressor(objective='regression', metric='rmse')

                # Perform grid search
                grid_search = RandomizedSearchCV(
                    estimator=model,
                    param_distributions=param_grid,
                    scoring='neg_mean_squared_error',
                    cv=3,
                    verbose=1,
                    n_iter=10,
                )
                grid_search.fit(X_train, y_train)

                # Best parameters
                #print("Best parameters:", grid_search.best_params_)
                reg = LGBMRegressor(**grid_search.best_params_)
                reg.fit(X_train, y_train)
            else:
                reg = LGBMRegressor(random_state=state)
                reg.fit(X_train, y_train)


        if model=='lstm':

            dataset = TimeSeriesDataset(X_train.to_numpy(), y_train.to_numpy())
            test_dataset = TimeSeriesDataset(X_test.to_numpy(), y_test.to_numpy())

            # Create DataLoader
            dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
            test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
            reg = fit_lstm(dataloader, FEATURES, batch_size, epochs)

            output_df = test_lstm(reg, test_dataloader, FEATURES, batch_size)

            return reg, output_df

        y_pred = target_scaler.inverse_transform([reg.predict(X_test)])
        preds_out = X_test.copy()
        preds_out['actual_demand'] = y_test
        preds_out['pred'] = y_pred[0]

        if model != 'rf':
            preds_list.append(preds_out)

        non_zero = preds_out.query('actual_demand != 0')
        zeros = preds_out.query('actual_demand == 0')

        mse_score = mean_squared_error(y_test, y_pred[0])
        rmse_score = np.sqrt(mse_score)
        mae_score = mean_absolute_error(y_test, y_pred[0])
        mape_score = mean_absolute_percentage_error(y_test+1, y_pred[0]+1)
        overall_zero_scores = pd.concat([overall_zero_scores, pd.DataFrame({'mse': [mse_score], 'rmse': [rmse_score], 'mae': [mae_score], 'mape': [mape_score]})])

        mse_score = mean_squared_error(non_zero['actual_demand'], non_zero['pred'])
        rmse_score = np.sqrt(mse_score)
        mae_score = mean_absolute_error(non_zero['actual_demand'], non_zero['pred'])
        mape_score = mean_absolute_percentage_error(non_zero['actual_demand']+1, non_zero['pred']+1)
        non_zero_scores = pd.concat([non_zero_scores, pd.DataFrame({'mse': [mse_score], 'rmse': [rmse_score], 'mae': [mae_score], 'mape': [mape_score]})])


        mse_score = mean_squared_error(zeros['actual_demand'], zeros['pred'])
        rmse_score = np.sqrt(mse_score)
        mae_score = mean_absolute_error(zeros['actual_demand'], zeros['pred'])
        mape_score = mean_absolute_percentage_error(zeros['actual_demand']+1, zeros['pred']+1)
        zero_scores = pd.concat([zero_scores, pd.DataFrame({'mse': [mse_score], 'rmse': [rmse_score], 'mae': [mae_score], 'mape': [mape_score]})])

    overall_out = {
        "mse": overall_zero_scores['mse'].mean(),
        "rmse": overall_zero_scores['rmse'].mean(),
        "mae": overall_zero_scores['mae'].mean(),
        "mape": overall_zero_scores['mape'].mean()
    }

    non_zero_out = {
        "mse": non_zero_scores['mse'].mean(),
        "rmse": non_zero_scores['rmse'].mean(),
        "mae": non_zero_scores['mae'].mean(),
        "mape": non_zero_scores['mape'].mean()
    }


    zero_out = {
        "mse": zero_scores['mse'].mean(),
        "rmse": zero_scores['rmse'].mean(),
        "mae": zero_scores['mae'].mean(),
        "mape": zero_scores['mape'].mean()
    }

    return reg, preds_out, preds_list, overall_out, non_zero_out, zero_out

#### Single Thresholds


In [12]:
exclude_features = {
    4.5: {
        "clusters": [35, 65, 79, 94],
        "features_1": [
            "started_at_quarter", "prcp_lag_2_h", "is_holiday", "prcp",
            "prcp_lag_1_h", "wspd_lag_1_h", "wspd", "wspd_lag_24_h",
            "temp_lag_2_h", "wspd_lag_2_h", "started_at_month",
            "cosine_sim_pca_0"
        ],

        "features_2": [
            "started_at_quarter", "prcp_lag_2_h", "is_holiday", "wspd",
            "prcp", "started_at_month", 'rhum', "wspd_lag_2_h", "temp",
            "dwpt", "wspd_lag_1_h", "temp_lag_1_h", "temp_lag_2_h"
        ],

        "most_optimal_1": [ #31.359894450024303
          'prcp_lag_2_h', 'is_holiday', 'prcp', 'wspd_lag_1_h', 'temp_lag_2_h',
          'started_at_month'
        ],

        "most_optimal_2": [ #2.610128400831642
          'prcp_lag_2_h', 'rhum', 'wspd_lag_2_h', 'temp', 'dwpt', 'temp_lag_1_h'
        ]
    },

    4.0: {
        "clusters": [35, 65, 69, 71, 79, 86, 91, 94],
        "features_1": [
            "started_at_quarter", "prcp_lag_2_h", "is_holiday", "prcp",
            "prcp_lag_1_h", "wspd", "wspd_lag_1_h", "temp_lag_2_h",
            "wspd_lag_2_h", "started_at_month", "wspd_lag_24_h", "dwpt",
            "rhum", "temp_lag_24_h", "cosine_sim_pca_0"
        ],

        "features_2": [
            "started_at_quarter", "prcp_lag_2_h", "wspd", "is_holiday",
            "started_at_month", "rhum", 'temp',  "wspd_lag_2_h", "dwpt",
            "temp_lag_1_h", "wspd_lag_1_h", "temp_lag_2_h",
            "prcp", "cosine_sim_pca_0"
        ],

        "most_optimal_1": [ #23.595473576304098
            'started_at_quarter', 'is_holiday', 'prcp', 'prcp_lag_1_h', 'wspd',
            'wspd_lag_24_h', 'rhum', 'temp_lag_24_h', 'cosine_sim_pca_0'
        ],

        "most_optimal_2": [#2.2319695831852395
            'started_at_quarter', 'prcp_lag_2_h', 'is_holiday',
            'started_at_month', 'rhum', 'temp', 'wspd_lag_2_h', 'dwpt',
            'temp_lag_1_h', 'wspd_lag_1_h', 'temp_lag_2_h', 'prcp',
            'cosine_sim_pca_0'
        ]
    },

    3.5: {
        "clusters": [35, 65, 69, 71, 79, 86, 91, 94, 105, 122, 154],
        "features_1": [
            "started_at_quarter", "prcp_lag_2_h", "is_holiday", "prcp", "wspd",
            "prcp_lag_1_h", "wspd_lag_2_h", "wspd_lag_1_h", "temp_lag_2_h",
            "started_at_month", "rhum", "dwpt", "wspd_lag_24_h", "cosine_sim_pca_0"
        ],

        "features_2": [
          "started_at_quarter", "prcp_lag_2_h", "wspd", "is_holiday", "started_at_month",
          "rhum", "temp", "wspd_lag_2_h", "dwpt", "temp_lag_1_h", "wspd_lag_1_h",
          "temp_lag_2_h", "prcp", "cosine_sim_pca_0"
        ],

        "most_optimal_1": [#20.80729942047084
          'started_at_quarter', 'wspd', 'wspd_lag_2_h', 'wspd_lag_1_h'
        ],

        "most_optimal_2": [#1.948143117905541
          'prcp_lag_2_h', 'wspd', 'started_at_month', 'rhum', 'wspd_lag_2_h',
           'dwpt', 'temp_lag_1_h', 'temp_lag_2_h', 'prcp', 'cosine_sim_pca_0'
        ]
    },

    3.0: {
        "clusters": [35, 42, 53, 62, 65, 69, 71, 79, 81, 86, 88, 91, 94, 99,
                     102, 105, 122, 145, 154],
        "features_1": [
            "started_at_quarter", "prcp_lag_2_h", "is_holiday", "prcp", "wspd",
            "wspd_lag_2_h", "wspd_lag_1_h", "rhum", "temp_lag_2_h",
            "started_at_month", "prcp_lag_1_h", "dwpt", "cosine_sim_pca_0"
        ],

        "features_2": [
          "started_at_quarter", "prcp_lag_2_h", "wspd", "started_at_month", "is_holiday",
          "rhum", "temp", "wspd_lag_2_h", "dwpt", "temp_lag_1_h", "wspd_lag_1_h",
          "temp_lag_2_h", "prcp", "cosine_sim_pca_0"
        ],

        "most_optimal_1": [
          'started_at_quarter', 'wspd_lag_1_h', 'rhum', 'dwpt'
        ],

        "most_optimal_2": [
          'started_at_quarter', 'prcp_lag_2_h', 'started_at_month', 'dwpt', 'temp_lag_2_h', 'prcp', 'cosine_sim_pca_0'
        ]
    },

    2.5: {
        "clusters": [34, 35, 39, 42, 53, 57, 59, 60, 62, 65, 69, 71, 72, 77, 79,
                     81, 86, 88, 89, 91, 94, 97, 99, 102, 105, 113, 118, 122,
                     145, 154, 163],
        "features_1": [
          "started_at_quarter", "prcp_lag_2_h", "is_holiday", "wspd", "prcp",
          "wspd_lag_2_h", "rhum", "started_at_month", "wspd_lag_1_h",
          "temp_lag_2_h", "dwpt", "temp", "temp_lag_1_h", "prcp_lag_1_h"
        ],

        "features_2": [
            "started_at_quarter", "started_at_month", "wspd",
            "prcp_lag_2_h", "is_holiday", "rhum", "temp", "wspd_lag_2_h",
            "dwpt", "temp_lag_1_h", "wspd_lag_1_h"
        ],

        "most_optimal_1": [
          'wspd_lag_1_h', 'temp_lag_2_h', 'temp_lag_1_h', 'prcp_lag_1_h'
        ],

        "most_optimal_2": [
          'started_at_quarter', 'started_at_month', 'is_holiday', 'rhum',
          'temp', 'dwpt', 'temp_lag_1_h'
        ]
    },

    2.0: {
        "clusters": [34, 35, 39, 42, 53, 57, 59, 60, 62, 64, 65, 69, 71, 72, 77,
                     79, 81, 84, 86, 88, 89, 91, 92, 94, 95, 97, 98, 99, 102,
                     105, 106, 113, 118, 122, 123, 130, 144, 145, 154, 155, 163],
        "features_1": [
            "started_at_quarter", "prcp_lag_2_h", "is_holiday", "wspd", "prcp",
            "wspd_lag_2_h", "rhum", "started_at_month", "wspd_lag_1_h",
            "temp_lag_2_h", "dwpt", "temp", "temp_lag_1_h", "cosine_sim_pca_0"
        ],

        "features_2": [
            "started_at_quarter", "started_at_month", "wspd", "prcp_lag_2_h",
            "is_holiday", "rhum", "temp", "wspd_lag_2_h", "dwpt", "temp_lag_1_h"
        ],

        "most_optimal_1": [
            'wspd_lag_2_h', 'started_at_month', 'temp_lag_1_h',
            'cosine_sim_pca_0'
        ],

        "most_optimal_2": [
            'started_at_quarter', 'wspd_lag_2_h', 'dwpt'
        ]
    },

    1.5: {
        "clusters": [3, 19, 25, 34, 35, 39, 42, 43, 46, 51, 53, 57, 59, 60, 61,
                      62, 64, 65, 69, 71, 72, 73, 75, 76, 77, 79, 81, 82, 84, 86,
                      88, 89, 90, 91, 92, 94, 95, 97, 98, 99, 102, 105, 106,
                      108, 112, 113, 114, 115, 116, 118, 120, 122, 123, 129, 130,
                      136, 140, 141, 144, 145, 148, 149, 152, 154, 155, 160, 163,
                      164],
        "features_1": [
            "started_at_quarter", "prcp_lag_2_h", "is_holiday", "wspd",
            "started_at_month", "rhum", "wspd_lag_2_h", "prcp", "wspd_lag_1_h",
            "temp_lag_2_h", "dwpt", "temp", "temp_lag_1_h", "cosine_sim_pca_0"
        ],

        "features_2": [
            "started_at_quarter", "wspd", "started_at_month", "prcp_lag_2_h", "rhum", "is_holiday",
            "wspd_lag_2_h", "temp", "dwpt", "temp_lag_1_h", "wspd_lag_1_h", "temp_lag_2_h", "prcp",
            "cosine_sim_pca_0"
        ],

        "most_optimal_1": [
          'started_at_month', 'wspd_lag_2_h', 'dwpt', 'cosine_sim_pca_0'
        ],

        "most_optimal_2": [
          'wspd_lag_2_h', 'dwpt', 'wspd_lag_1_h', 'temp_lag_2_h', 'prcp', 'cosine_sim_pca_0'
        ]
    },

}

In [13]:
def print_results(overall_out, non_zero_out, zero_out):
  print("overall")
  print("MSE:", overall_out['mse'])
  print("RMSE:", overall_out['rmse'])
  print("MAE:", overall_out['mae'])
  print("MAPE:", overall_out['mape'])

  print()
  print("Non-zero")
  print("MSE:", non_zero_out['mse'])
  print("RMSE:", non_zero_out['rmse'])
  print("MAE:", non_zero_out['mae'])
  print("MAPE:", non_zero_out['mape'])

  print()
  print("Zeros")
  print("MSE:", zero_out['mse'])
  print("RMSE:", zero_out['rmse'])
  print("MAE:", zero_out['mae'])
  print("MAPE:", zero_out['mape'])
  print("###############")

In [14]:

all_test_clusters = []
for i in exclude_features:
  all_test_clusters.extend(exclude_features[i]['clusters'])
print(len(set(all_test_clusters)))

68


## XGBoost

### Run Regular Model

In [15]:
model, model_preds, preds_list, overall_out, non_zero_out, zero_out = train_model(
    df_train, df_test, FEATURES, model='xgb'
)
print_results(overall_out, non_zero_out, zero_out)

overall
MSE: 3.4385015032398725
RMSE: 1.8543196874433148
MAE: 0.9873230419200121
MAPE: 0.3942077604799864

Non-zero
MSE: 7.019512216780733
RMSE: 2.649436207343127
MAE: 1.6766834387513974
MAPE: 0.35354768977580636

Zeros
MSE: 0.5290595469456003
RMSE: 0.7273647963337243
MAE: 0.4272426003941237
MAPE: 0.4272426003941237
###############


In [16]:
def divide_excludes_test(df_train, df_test, clusters, df_hold_out=None):
  train_1 = df_train[df_train['start_station_cluster'].isin(clusters)]
  test_1 = df_test[df_test['start_station_cluster'].isin(clusters)]
  train_2 = df_train[~df_train['start_station_cluster'].isin(clusters)]
  test_2 = df_test[~df_test['start_station_cluster'].isin(clusters)]
  if df_hold_out is not None:
    hold_out_1 = df_hold_out[df_hold_out['start_station_cluster'].isin(clusters)]
    hold_out_2 = df_hold_out[~df_hold_out['start_station_cluster'].isin(clusters)]
    return train_1, test_1, train_2, test_2, hold_out_1, hold_out_2
  return train_1, test_1, train_2, test_2

In [17]:
def get_rmses(X):


  print("Overall")
  print("MSE: ", (mean_squared_error(X['actual_demand'], X['pred'])))
  print("RMSE: ", np.sqrt(mean_squared_error(X['actual_demand'], X['pred'])))
  print("MAE: ", (mean_absolute_error(X['actual_demand'], X['pred'])))
  print("MAPE: ", (mean_absolute_percentage_error(X['actual_demand']+1, X['pred']+1)))
  print()

  print("Non-Zero")
  non_zeros = X[(X['actual_demand'] != 0)]
  print("MSE:", (mean_squared_error(non_zeros['actual_demand'], non_zeros['pred'])))
  print("RMSE:", np.sqrt(mean_squared_error(non_zeros['actual_demand'], non_zeros['pred'])))
  print("MAE:", (mean_absolute_error(non_zeros['actual_demand'], non_zeros['pred'])))
  print("MAPE:", (mean_absolute_percentage_error(non_zeros['actual_demand']+1, non_zeros['pred']+1)))
  print()

  print("Zeroes")
  zeros = X[(X['actual_demand'] == 0)]
  print("MSE:", (mean_squared_error(zeros['actual_demand'], zeros['pred'])))
  print("RMSE:", np.sqrt(mean_squared_error(zeros['actual_demand'], zeros['pred'])))
  print("MAE:", (mean_absolute_error(zeros['actual_demand'], zeros['pred'])))
  print("MAPE:", (mean_absolute_percentage_error(zeros['actual_demand']+1, zeros['pred']+1)))
  print()

  print("Under-predicting")
  under_predicting = X[(X['actual_demand'] > X['pred'])]
  print("MSE:", (mean_squared_error(under_predicting['actual_demand'], under_predicting['pred'])))
  print("RMSE:", np.sqrt(mean_squared_error(under_predicting['actual_demand'], under_predicting['pred'])))
  print("MAE:", (mean_absolute_error(under_predicting['actual_demand'], under_predicting['pred'])))
  print("MAPE:", (mean_absolute_percentage_error(under_predicting['actual_demand']+1, under_predicting['pred']+1)))
  print()


  print("Bver-predicting")
  over_predicting = X[(X['actual_demand'] < X['pred'])]
  print("MSE:", (mean_squared_error(over_predicting['actual_demand'], over_predicting['pred'])))
  print("RMSE:", np.sqrt(mean_squared_error(over_predicting['actual_demand'], over_predicting['pred'])))
  print("MAE:", (mean_absolute_error(over_predicting['actual_demand'], over_predicting['pred'])))
  print("MAPE:", (mean_absolute_percentage_error(over_predicting['actual_demand']+1, over_predicting['pred']+1)))





### Default Test

In [18]:

import gc

In [19]:
excludes = 3.5
#for excludes in exclude_features:
clusters = exclude_features[excludes]['clusters']
train_1, test_1, train_2, test_2 = divide_excludes_test(df_train, df_test, clusters)
print(f"{excludes}")
overall_preds = pd.DataFrame()

model, model_preds_1, preds_list, overall_out, non_zero_out, zero_out = train_model(
    train_1, test_1, FEATURES, model='xgb'
)

print_results(overall_out, non_zero_out, zero_out)
model, model_preds_2, preds_list, overall_out, non_zero_out, zero_out = train_model(
    train_2, test_2, FEATURES, model='xgb'
)
print_results(overall_out, non_zero_out, zero_out)
overall_preds = pd.concat([model_preds_1, model_preds_2])

get_rmses(overall_preds)


del train_1
del train_2
del test_1
del test_2
del model_preds_1
del model_preds_2
del model
del preds_list
print(gc.collect())


3.5
overall
MSE: 21.509314432814044
RMSE: 4.637813540108533
MAE: 3.190513190265858
MAPE: 0.4309447714768978

Non-zero
MSE: 22.797608580870175
RMSE: 4.774684134146486
MAE: 3.317157561533304
MAPE: 0.35446515583877203

Zeros
MSE: 4.007019902040365
RMSE: 2.001754206200243
MAE: 1.469968941941822
MAPE: 1.469968941941822
###############
overall
MSE: 1.975989427180673
RMSE: 1.4056989105710629
MAE: 0.8022180789620923
MAPE: 0.3825397254931066

Non-zero
MSE: 4.1261264722251605
RMSE: 2.031286900520249
MAE: 1.366476582882753
MAPE: 0.34628217642871606

Zeros
MSE: 0.47333854830499333
RMSE: 0.6879960380009418
MAE: 0.4078787791562452
MAPE: 0.4078787791562452
###############
Overall
MSE:  3.361693250996815
RMSE:  1.8334920918828135
MAE:  0.9716449260135073
MAPE:  0.3859736036112658

Non-Zero
MSE: 6.878408434117952
RMSE: 2.622672002770829
MAE: 1.654017981027118
MAPE: 0.3474883938298729

Zeroes
MSE: 0.5044890925672594
RMSE: 0.7102739559967403
MAE: 0.4172414475223095
MAPE: 0.4172414475223095

Under-predict

### Exclude All

In [20]:
excludes = 3.5
#for excludes in exclude_features:
clusters = exclude_features[excludes]['clusters']
excluded_feats_1 = exclude_features[excludes]['features_1']
excluded_feats_2 = exclude_features[excludes]['features_2']

features_1 = [i for i in FEATURES if i not in excluded_feats_1]
features_2 = [i for i in FEATURES if i not in excluded_feats_2]
train_1, test_1, train_2, test_2 = divide_excludes_test(df_train, df_test, clusters)
print(f"{excludes}")
overall_preds = pd.DataFrame()

model, model_preds_1, preds_list, overall_out, non_zero_out, zero_out = train_model(
    train_1, test_1, features_1, model='xgb'
)

print_results(overall_out, non_zero_out, zero_out)
model, model_preds_2, preds_list, overall_out, non_zero_out, zero_out = train_model(
    train_2, test_2, features_2, model='xgb'
)
print_results(overall_out, non_zero_out, zero_out)
overall_preds = pd.concat([model_preds_1, model_preds_2])

get_rmses(overall_preds)


del train_1
del train_2
del test_1
del test_2
del model_preds_1
del model_preds_2
del model
del preds_list
print(gc.collect())


3.5
overall
MSE: 21.070676113997717
RMSE: 4.5902806138620456
MAE: 3.1620241771819013
MAPE: 0.424993878459144

Non-zero
MSE: 22.336300368716113
RMSE: 4.726129533637024
MAE: 3.2884873391610565
MAPE: 0.3499920204835129

Zeros
MSE: 3.8763664877405923
RMSE: 1.9688490261420737
MAE: 1.443941772172276
MAPE: 1.443941772172276
###############
overall
MSE: 1.9665600527542504
RMSE: 1.402340918876095
MAE: 0.8010663859022225
MAPE: 0.3816742420794933

Non-zero
MSE: 4.109722028033026
RMSE: 2.0272449353822606
MAE: 1.3664367939829347
MAPE: 0.3469381333965858

Zeros
MSE: 0.46878379093510847
RMSE: 0.6846778738466057
MAE: 0.4059500175987287
MAPE: 0.4059500175987287
###############
Overall
MSE:  3.321815579862808
RMSE:  1.8225848621841476
MAE:  0.968553909828704
MAPE:  0.3847473587025104

Non-Zero
MSE: 6.796422660697922
RMSE: 2.6069949483453017
MAE: 1.649757904463534
MAPE: 0.3473882936085121

Zeroes
MSE: 0.4988227347442226
RMSE: 0.7062738383546587
MAE: 0.4151002507006837
MAPE: 0.4151002507006837

Under-pred

### Run Most Optimal

In [21]:
excludes = 3.5
#for excludes in exclude_features:
clusters = exclude_features[excludes]['clusters']
excluded_feats_1 = exclude_features[excludes]['most_optimal_1']
excluded_feats_2 = exclude_features[excludes]['most_optimal_2']

features_1 = [i for i in FEATURES if i not in excluded_feats_1]
features_2 = [i for i in FEATURES if i not in excluded_feats_2]
overall_preds = pd.DataFrame()
train_1, test_1, train_2, test_2 = divide_excludes_test(df_train, df_test, clusters)
print(f"{excludes}")

model, model_preds_1, preds_list, overall_out, non_zero_out, zero_out = train_model(
    train_1, test_1, features_1, model='xgb'
)

print_results(overall_out, non_zero_out, zero_out)
model, model_preds_2, preds_list, overall_out, non_zero_out, zero_out = train_model(
    train_2, test_2, features_2, model='xgb'
)
print_results(overall_out, non_zero_out, zero_out)
overall_preds = pd.concat([model_preds_1, model_preds_2])

get_rmses(overall_preds)


del train_1
del train_2
del test_1
del test_2
del model_preds_1
del model_preds_2
del model
del preds_list
print(gc.collect())


3.5
overall
MSE: 21.072284328023436
RMSE: 4.590455786523102
MAE: 3.1767645028633473
MAPE: 0.43112680564919553

Non-zero
MSE: 22.334370308124345
RMSE: 4.725925338822477
MAE: 3.302285875450939
MAPE: 0.35454959214660414

Zeros
MSE: 3.926044410519975
RMSE: 1.9814248435204336
MAE: 1.471476905112579
MAPE: 1.471476905112579
###############
overall
MSE: 1.948143117905541
RMSE: 1.3957589755776394
MAE: 0.7991562791872434
MAPE: 0.38119540082250164

Non-zero
MSE: 4.0692112274571794
RMSE: 2.017228600693828
MAE: 1.3615596703068469
MAPE: 0.34554026711445246

Zeros
MSE: 0.4658074378061383
RMSE: 0.6825008701870924
MAE: 0.4061134487796362
MAPE: 0.4061134487796362
###############
Overall
MSE:  3.3048192390686073
RMSE:  1.817916180429837
MAE:  0.9678249930971643
MAPE:  0.38473755959869105

Non-Zero
MSE: 6.761598879187666
RMSE: 2.6003074585878623
MAE: 1.6476336785891148
MAPE: 0.3468682925293892

Zeroes
MSE: 0.496310546090792
RMSE: 0.7044931128767633
MAE: 0.4155049722872154
MAPE: 0.4155049722872154

Under-p

### Test All

In [22]:
def get_rmses(X):


  print("Overall")
  print("MSE: ", (mean_squared_error(X['actual_demand'], X['pred'])))
  print("RMSE: ", np.sqrt(mean_squared_error(X['actual_demand'], X['pred'])))
  print("MAE: ", (mean_absolute_error(X['actual_demand'], X['pred'])))
  print("MAPE: ", (mean_absolute_percentage_error(X['actual_demand']+1, X['pred']+1)))
  print()

  print("Non-Zero")
  non_zeros = X[(X['actual_demand'] != 0)]
  print("MSE:", (mean_squared_error(non_zeros['actual_demand'], non_zeros['pred'])))
  print("RMSE:", np.sqrt(mean_squared_error(non_zeros['actual_demand'], non_zeros['pred'])))
  print("MAE:", (mean_absolute_error(non_zeros['actual_demand'], non_zeros['pred'])))
  print("MAPE:", (mean_absolute_percentage_error(non_zeros['actual_demand']+1, non_zeros['pred']+1)))
  print()

  print("Zeroes")
  zeros = X[(X['actual_demand'] == 0)]
  print("MSE:", (mean_squared_error(zeros['actual_demand'], zeros['pred'])))
  print("RMSE:", np.sqrt(mean_squared_error(zeros['actual_demand'], zeros['pred'])))
  print("MAE:", (mean_absolute_error(zeros['actual_demand'], zeros['pred'])))
  print("MAPE:", (mean_absolute_percentage_error(zeros['actual_demand']+1, zeros['pred']+1)))
  print()

  print("Under-predicting")
  under_predicting = X[(X['actual_demand'] > X['pred'])]
  print("MSE:", (mean_squared_error(under_predicting['actual_demand'], under_predicting['pred'])))
  print("RMSE:", np.sqrt(mean_squared_error(under_predicting['actual_demand'], under_predicting['pred'])))
  print("MAE:", (mean_absolute_error(under_predicting['actual_demand'], under_predicting['pred'])))
  print("MAPE:", (mean_absolute_percentage_error(under_predicting['actual_demand']+1, under_predicting['pred']+1)))
  print()


  print("Bver-predicting")
  over_predicting = X[(X['actual_demand'] < X['pred'])]
  print("MSE:", (mean_squared_error(over_predicting['actual_demand'], over_predicting['pred'])))
  print("RMSE:", np.sqrt(mean_squared_error(over_predicting['actual_demand'], over_predicting['pred'])))
  print("MAE:", (mean_absolute_error(over_predicting['actual_demand'], over_predicting['pred'])))
  print("MAPE:", (mean_absolute_percentage_error(over_predicting['actual_demand']+1, over_predicting['pred']+1)))

In [45]:
def test_all_excludes(exclude_features, excludes, mode="opt"):
  all_tests = pd.DataFrame()
  all_hold_out = pd.DataFrame()


  clusters = exclude_features[excludes]['clusters']
  if mode == 'opt':
    excluded_feats_1 = exclude_features[excludes]['most_optimal_1']
    excluded_feats_2 = exclude_features[excludes]['most_optimal_2']
  if mode == 'default':
    excluded_feats_1 = []
    excluded_feats_2 = []
  if mode == 'all':
    excluded_feats_1 = exclude_features[excludes]['features_1']
    excluded_feats_2 = exclude_features[excludes]['features_2']

  train_1, test_1, train_2, test_2, hold_out_1, hold_out_2 = divide_excludes_test(df_train, df_test, clusters, df_hold_out)


  features = [i for i in FEATURES if i not in excluded_feats_1]
  model_1, model_preds_1, preds_list, overall_out, non_zero_out, zero_out = train_model(
      train_1, test_1, features, model='xgb'
  )
  #print_results(overall_out, non_zero_out, zero_out)
  model_2, model_preds_2, preds_list, overall_out, non_zero_out, zero_out = train_model(
      train_2, test_2, features, model='xgb'
  )
  #print_results(overall_out, non_zero_out, zero_out)
  model = 'xgb'
  preds_holdout_1 = test_given_model(hold_out_1, model_1, features)
  preds_holdout_2 = test_given_model(hold_out_2, model_2, features)
  print("*****************")
  print("*****************")
  print("*****************")
  all_hold_out = pd.concat([preds_holdout_1, preds_holdout_2])
  all_tests = pd.concat([model_preds_1, model_preds_2])
  print()
  print("###########")
  del model_1
  del model_2
  del model_preds_1
  del model_preds_2
  del preds_list
  del train_1
  del train_2
  del test_1
  del test_2

  return all_tests, all_hold_out

#### 1.5

In [46]:
all_tests, all_hold_out = test_all_excludes(exclude_features, 1.5, mode='opt')
get_rmses(all_tests)
print()
print()
get_rmses(all_hold_out)
print(gc.collect())

overall
MSE: 7.954410582518505
RMSE: 2.820356463732644
MAE: 1.7658285254816046
MAPE: 0.5133344535615115

Non-zero
MSE: 10.417361551663731
RMSE: 3.227593771165097
MAE: 2.101557532467831
MAPE: 0.36137312431265184

#####
Zeros
MSE: 1.6290207622100015
RMSE: 1.2763309767493702
MAE: 0.9036039543160792
MAPE: 0.9036039543160792
overall
MSE: 0.6508010391285688
RMSE: 0.8067224052476594
MAE: 0.47780829288831156
MAPE: 0.30385267479773714

Non-zero
MSE: 1.7427308774378962
RMSE: 1.3201253264133281
MAE: 0.9568108397896402
MAPE: 0.32094444808110056

#####
Zeros
MSE: 0.23958021297884005
RMSE: 0.4894693177093331
MAE: 0.29741591221157043
MAPE: 0.29741591221157043
*****************
*****************
*****************

###########
Overall
MSE:  3.317332530059917
RMSE:  1.8213545865810745
MAE:  0.9626785317222354
MAPE:  0.3806437119888169

Non-Zero
MSE: 6.7854377147157185
RMSE: 2.6048872748577274
MAE: 1.6476186875618204
MAPE: 0.34920149745318674

Zeroes
MSE: 0.4996222409158337
RMSE: 0.7068396147046611
MAE: 

In [47]:
all_tests, all_hold_out = test_all_excludes(exclude_features, 1.5, mode='default')
get_rmses(all_tests)
print()
print()
get_rmses(all_hold_out)
print(gc.collect())

overall
MSE: 8.047379324222288
RMSE: 2.8367903208066485
MAE: 1.7693106918900954
MAPE: 0.5155957277488639

Non-zero
MSE: 10.543831091257816
RMSE: 3.2471265899650135
MAE: 2.101131720017148
MAPE: 0.35925103428677374

#####
Zeros
MSE: 1.6359522223754162
RMSE: 1.2790434794702705
MAE: 0.9171226541365104
MAPE: 0.9171226541365104
overall
MSE: 0.6460145699490013
RMSE: 0.8037503156758331
MAE: 0.47395459099947723
MAPE: 0.2983735760554389

Non-zero
MSE: 1.7654701979641354
RMSE: 1.328709975112754
MAE: 0.9638260962640379
MAPE: 0.32201833144176284

#####
Zeros
MSE: 0.22442753015978512
RMSE: 0.4737378285083271
MAE: 0.2894689601149796
MAPE: 0.2894689601149796
*****************
*****************
*****************

###########
Overall
MSE:  3.4008116951838008
RMSE:  1.8441289800834975
MAE:  0.9702683788473913
MAPE:  0.38295019788271606

Non-Zero
MSE: 6.958299552550072
RMSE: 2.6378588954965108
MAE: 1.6624550487912397
MAPE: 0.3522516101427689

Zeroes
MSE: 0.510481210527254
RMSE: 0.7144796781765413
MAE: 0.4

In [48]:
all_tests, all_hold_out = test_all_excludes(exclude_features, 1.5, mode='all')
get_rmses(all_tests)
print()
print()
get_rmses(all_hold_out)
print(gc.collect())

overall
MSE: 8.008981253922984
RMSE: 2.8300143557803703
MAE: 1.7657381321389636
MAPE: 0.5083303365098575

Non-zero
MSE: 10.513315187101753
RMSE: 3.2424242762324846
MAE: 2.107303880629175
MAPE: 0.36029246580297963

#####
Zeros
MSE: 1.577311047723239
RMSE: 1.2559104457417491
MAE: 0.8885235486850924
MAPE: 0.8885235486850924
overall
MSE: 0.6531946137381008
RMSE: 0.8082045618147059
MAE: 0.48211432384725106
MAPE: 0.3077352887293206

Non-zero
MSE: 1.734345583154489
RMSE: 1.3169455505655838
MAE: 0.9599020447699741
MAPE: 0.3224879211725301

#####
Zeros
MSE: 0.24603311009757026
RMSE: 0.496017247782343
MAE: 0.30217944668861496
MAPE: 0.30217944668861496
*****************
*****************
*****************

###########
Overall
MSE:  3.3383139408392917
RMSE:  1.8271053447569168
MAE:  0.9639561046000406
MAPE:  0.3803521029129744

Non-Zero
MSE: 6.833347527283661
RMSE: 2.6140672384779355
MAE: 1.651146553103118
MAPE: 0.3492287943790915

Zeroes
MSE: 0.4987252971615668
RMSE: 0.7062048549546844
MAE: 0.405

#### 2.0

In [49]:
all_tests, all_hold_out = test_all_excludes(exclude_features, 2.0, mode='opt')
get_rmses(all_tests)
print()
print()
get_rmses(all_hold_out)
print(gc.collect())

overall
MSE: 11.472321575395634
RMSE: 3.387081572002014
MAE: 2.211207919523691
MAPE: 0.48707104299868004

Non-zero
MSE: 13.590199402850512
RMSE: 3.6864887634238777
MAE: 2.484464214150342
MAPE: 0.3689136217489816

#####
Zeros
MSE: 2.143289111333573
RMSE: 1.4639976473114882
MAE: 1.0075422450759766
MAPE: 1.0075422450759766
overall
MSE: 1.039283217837087
RMSE: 1.0194524107760434
MAE: 0.6125234227005085
MAPE: 0.3565436920082557

Non-zero
MSE: 2.3005056307239458
RMSE: 1.5167417811624844
MAE: 1.0654057067335083
MAPE: 0.322656994666986

#####
Zeros
MSE: 0.3760376415016078
RMSE: 0.613219081162359
MAE: 0.37436386541983674
MAPE: 0.37436386541983674
*****************
*****************
*****************

###########
Overall
MSE:  3.2821162825204375
RMSE:  1.8116611941862744
MAE:  0.9578240558856279
MAPE:  0.37819571060990476

Non-Zero
MSE: 6.729574769004462
RMSE: 2.5941423956684533
MAE: 1.6396997511015787
MAPE: 0.3466509817428579

Zeroes
MSE: 0.4811806904451879
RMSE: 0.6936718896172656
MAE: 0.40382

In [50]:
all_tests, all_hold_out = test_all_excludes(exclude_features, 2.0, mode='default')
get_rmses(all_tests)
print()
print()
get_rmses(all_hold_out)
print(gc.collect())

overall
MSE: 11.494440340347731
RMSE: 3.39034516537
MAE: 2.216697385387969
MAPE: 0.48765078663182587

Non-zero
MSE: 13.63181053483785
RMSE: 3.6921281850496266
MAE: 2.492953965211842
MAPE: 0.3713790449766431

#####
Zeros
MSE: 2.07954601847882
RMSE: 1.442063111822371
MAE: 0.9998157654104933
MAPE: 0.9998157654104933
overall
MSE: 1.0453644158497284
RMSE: 1.0224306410949
MAE: 0.6113962509314022
MAPE: 0.3534928068783027

Non-zero
MSE: 2.3220044287926593
RMSE: 1.5238124650995146
MAE: 1.0731388589558657
MAPE: 0.32480831579166264

#####
Zeros
MSE: 0.37401110615142896
RMSE: 0.6115644742391672
MAE: 0.368577269085965
MAPE: 0.368577269085965
*****************
*****************
*****************

###########
Overall
MSE:  3.3255188451399733
RMSE:  1.8236005168731373
MAE:  0.9622676665730302
MAPE:  0.3798927249007574

Non-Zero
MSE: 6.820911375473242
RMSE: 2.6116874574637072
MAE: 1.6470616018557847
MAPE: 0.34788565959492745

Zeroes
MSE: 0.48563857250778375
RMSE: 0.6968777313903665
MAE: 0.4058973102393

In [51]:
all_tests, all_hold_out = test_all_excludes(exclude_features, 2.0, mode='all')
get_rmses(all_tests)
print()
print()
get_rmses(all_hold_out)
print(gc.collect())

overall
MSE: 11.402994724330613
RMSE: 3.376832054504727
MAE: 2.208051892803521
MAPE: 0.4837435130795204

Non-zero
MSE: 13.524231698854027
RMSE: 3.6775306523337132
MAE: 2.4864433439764624
MAPE: 0.3706823137037468

#####
Zeros
MSE: 2.059165565365886
RMSE: 1.4349792909188221
MAE: 0.9817663856594352
MAPE: 0.9817663856594352
overall
MSE: 1.0327468327026024
RMSE: 1.0162415228195523
MAE: 0.6133084904114504
MAPE: 0.35788768683758226

Non-zero
MSE: 2.273378645189961
RMSE: 1.5077727432176113
MAE: 1.0629427010670585
MAPE: 0.32181576739758566

#####
Zeros
MSE: 0.3803293422697523
RMSE: 0.6167084742970153
MAE: 0.3768570143360888
MAPE: 0.3768570143360888
*****************
*****************
*****************

###########
Overall
MSE:  3.322254583619798
RMSE:  1.822705292585666
MAE:  0.9607338833227725
MAPE:  0.37771643073829

Non-Zero
MSE: 6.82085735833998
RMSE: 2.611677116019509
MAE: 1.6481141862874327
MAPE: 0.3475049153236167

Zeroes
MSE: 0.47976610328972735
RMSE: 0.692651502048272
MAE: 0.4022621965

#### 2.5

In [52]:
all_tests, all_hold_out = test_all_excludes(exclude_features, 2.5, mode='opt')
get_rmses(all_tests)
print()
print()
get_rmses(all_hold_out)
print(gc.collect())

overall
MSE: 13.83065582575469
RMSE: 3.71895897070063
MAE: 2.500694861609815
MAPE: 0.4941606587195384

Non-zero
MSE: 15.788732953870952
RMSE: 3.9735038635782085
MAE: 2.7392408138537827
MAPE: 0.3760499208975276

#####
Zeros
MSE: 2.81460358589448
RMSE: 1.6776780340382598
MAE: 1.1586462600540568
MAPE: 1.1586462600540568
overall
MSE: 1.295199835213254
RMSE: 1.1380684668389922
MAE: 0.6720054997866033
MAPE: 0.3656776584009498

Non-zero
MSE: 2.781477436065963
RMSE: 1.6677761948372938
MAE: 1.1458666977423901
MAPE: 0.3266123573397222

#####
Zeros
MSE: 0.40757127626458506
RMSE: 0.6384130921782424
MAE: 0.3890080758586167
MAPE: 0.3890080758586167
*****************
*****************
*****************

###########
Overall
MSE:  3.2788415760073755
RMSE:  1.8107571830611016
MAE:  0.9608962950571485
MAPE:  0.3802651479227248

Non-Zero
MSE: 6.719152064343083
RMSE: 2.592132725063106
MAE: 1.6423994343476982
MAPE: 0.3471135906293954

Zeroes
MSE: 0.48371347443685964
RMSE: 0.6954951289814039
MAE: 0.407199591

In [53]:
all_tests, all_hold_out = test_all_excludes(exclude_features, 2.5, mode='default')
get_rmses(all_tests)
print()
print()
get_rmses(all_hold_out)
print(gc.collect())

overall
MSE: 13.891443771201722
RMSE: 3.7271227201692354
MAE: 2.4816613816720547
MAPE: 0.49197051309018913

Non-zero
MSE: 15.874223759437216
RMSE: 3.984246950107036
MAE: 2.7140301339823645
MAPE: 0.3706764380134914

#####
Zeros
MSE: 2.736414371739423
RMSE: 1.654211102531785
MAE: 1.1743654240178698
MAPE: 1.1743654240178698
overall
MSE: 1.3062556943215056
RMSE: 1.1429154362075549
MAE: 0.6752502481837611
MAPE: 0.3684048508939992

Non-zero
MSE: 2.8029420880222533
RMSE: 1.6741989392011491
MAE: 1.1478850769769962
MAPE: 0.3272465662066469

#####
Zeros
MSE: 0.4124108391770255
RMSE: 0.6421922135755194
MAE: 0.39298523137263514
MAPE: 0.39298523137263514
*****************
*****************
*****************

###########
Overall
MSE:  3.3106876866399615
RMSE:  1.8195295234318023
MAE:  0.9620191491634054
MAPE:  0.37999728162605656

Non-Zero
MSE: 6.790830703895495
RMSE: 2.6059222367322277
MAE: 1.6459564413885965
MAPE: 0.34756814534859254

Zeroes
MSE: 0.48319709298685176
RMSE: 0.6951237968785501
MAE: 0

In [54]:
all_tests, all_hold_out = test_all_excludes(exclude_features, 2.5, mode='all')
get_rmses(all_tests)
print()
print()
get_rmses(all_hold_out)
print(gc.collect())

overall
MSE: 14.18356263966414
RMSE: 3.7661070934937766
MAE: 2.5190713994239333
MAPE: 0.49350834193461574

Non-zero
MSE: 16.225064973467685
RMSE: 4.028034877389679
MAE: 2.7621791445416384
MAPE: 0.37657706326521

#####
Zeros
MSE: 2.698164013609086
RMSE: 1.642608904641968
MAE: 1.1513583600976085
MAPE: 1.1513583600976085
overall
MSE: 1.2940784994056935
RMSE: 1.1375757115048182
MAE: 0.6727934790518852
MAPE: 0.3662755729077352

Non-zero
MSE: 2.779617634488149
RMSE: 1.6672185323130706
MAE: 1.1479277423126093
MAPE: 0.32816508579459425

#####
Zeros
MSE: 0.40689096392875473
RMSE: 0.6378800544998682
MAE: 0.38903576033009923
MAPE: 0.38903576033009923
*****************
*****************
*****************

###########
Overall
MSE:  3.3356632188562094
RMSE:  1.8263798123216894
MAE:  0.9631623995635881
MAPE:  0.3788943083017841

Non-Zero
MSE: 6.849378665739692
RMSE: 2.6171317631597555
MAE: 1.6512892915662243
MAPE: 0.34789006594300736

Zeroes
MSE: 0.4808962378510775
RMSE: 0.6934668253428404
MAE: 0.404

#### 3.0

In [55]:
all_tests, all_hold_out = test_all_excludes(exclude_features, 3.0, mode='opt')
get_rmses(all_tests)
print()
print()
get_rmses(all_hold_out)
print(gc.collect())

overall
MSE: 18.040101979216185
RMSE: 4.247364121336454
MAE: 2.8844495019966496
MAPE: 0.4442306492705836

Non-zero
MSE: 20.093071526442518
RMSE: 4.482529590135743
MAE: 3.125805132594934
MAPE: 0.3648599398040638

#####
Zeros
MSE: 2.4202592428760816
RMSE: 1.55571824019521
MAE: 1.048115885373458
MAPE: 1.048115885373458
overall
MSE: 1.8335058486237183
RMSE: 1.3540701047670014
MAE: 0.7879847958964533
MAPE: 0.38937536488432467

Non-zero
MSE: 3.7669614457399514
RMSE: 1.9408661586363836
MAE: 1.305531574722159
MAPE: 0.33526902590587626

#####
Zeros
MSE: 0.48532176381697023
RMSE: 0.6966503885141888
MAE: 0.4271033091964163
MAPE: 0.4271033091964163
*****************
*****************
*****************

###########
Overall
MSE:  3.2974280382877486
RMSE:  1.8158821653091228
MAE:  0.9689398143387573
MAPE:  0.3870726343918679

Non-Zero
MSE: 6.740335166047131
RMSE: 2.596215546915766
MAE: 1.6444908210233502
MAPE: 0.3464476057581375

Zeroes
MSE: 0.5001902609053054
RMSE: 0.7072413031669639
MAE: 0.42007900

In [56]:
all_tests, all_hold_out = test_all_excludes(exclude_features, 3.0, mode='default')
get_rmses(all_tests)
print()
print()
get_rmses(all_hold_out)
print(gc.collect())

overall
MSE: 18.86399858215149
RMSE: 4.3432704937813265
MAE: 2.880493138293575
MAPE: 0.44689160120330207

Non-zero
MSE: 21.03360376656695
RMSE: 4.586240700897299
MAE: 3.1185510782473447
MAPE: 0.3650929376007579

#####
Zeros
MSE: 2.3567436029904782
RMSE: 1.535168916761435
MAE: 1.069249718127513
MAPE: 1.069249718127513
overall
MSE: 1.8269539979958422
RMSE: 1.3516486222372448
MAE: 0.7855302990199882
MAPE: 0.38775243169687224

Non-zero
MSE: 3.7535129430568857
RMSE: 1.9373984987753257
MAE: 1.304682188531838
MAPE: 0.3364437641883657

#####
Zeros
MSE: 0.48357889693463546
RMSE: 0.695398372830017
MAE: 0.4235295806786139
MAPE: 0.4235295806786139
*****************
*****************
*****************

###########
Overall
MSE:  3.3354151699809513
RMSE:  1.8263119038053033
MAE:  0.9676966991486339
MAPE:  0.38389755998455233

Non-Zero
MSE: 6.826777233954794
RMSE: 2.612810217745406
MAE: 1.6496769395824424
MAPE: 0.34732386350969224

Zeroes
MSE: 0.4988095057952538
RMSE: 0.7062644729810879
MAE: 0.4136123

In [57]:
all_tests, all_hold_out = test_all_excludes(exclude_features, 3.0, mode='all')
get_rmses(all_tests)
print()
print()
get_rmses(all_hold_out)
print(gc.collect())

overall
MSE: 17.89053513645547
RMSE: 4.229720456065089
MAE: 2.828320223339518
MAPE: 0.43995073506432586

Non-zero
MSE: 19.932950629843496
RMSE: 4.464633314152854
MAE: 3.062509691649323
MAPE: 0.3602286032562937

#####
Zeros
MSE: 2.350992013622939
RMSE: 1.5332944967040543
MAE: 1.0465097379144341
MAPE: 1.0465097379144341
overall
MSE: 1.8573347216629132
RMSE: 1.3628406809539086
MAE: 0.789474381310047
MAPE: 0.38800828641728624

Non-zero
MSE: 3.8243469496256712
RMSE: 1.9555937588429944
MAE: 1.3163882749236973
MAPE: 0.3391722680045903

#####
Zeros
MSE: 0.48575184987885067
RMSE: 0.6969590015767432
MAE: 0.422061275732131
MAPE: 0.422061275732131
*****************
*****************
*****************

###########
Overall
MSE:  3.3207115413575834
RMSE:  1.8222819598946765
MAE:  0.9655345311906196
MAPE:  0.383288870859303

Non-Zero
MSE: 6.799123027694246
RMSE: 2.6075128048955474
MAE: 1.6452982743428346
MAPE: 0.34641073598800964

Zeroes
MSE: 0.4946277540803769
RMSE: 0.7032977705640598
MAE: 0.41325102

#### 3.5

In [58]:
all_tests, all_hold_out = test_all_excludes(exclude_features, 3.5, mode='opt')
get_rmses(all_tests)
print()
print()
get_rmses(all_hold_out)
print(gc.collect())

overall
MSE: 24.950158735112986
RMSE: 4.995013386880258
MAE: 3.437745418254431
MAPE: 0.404575443494004

Non-zero
MSE: 26.18622594306491
RMSE: 5.117247887592012
MAE: 3.55258888861799
MAPE: 0.34442833839061493

#####
Zeros
MSE: 3.52499379727956
RMSE: 1.8774966836933586
MAE: 1.4471252652860822
MAPE: 1.4471252652860822
overall
MSE: 2.3071202553133996
RMSE: 1.518920753467211
MAE: 0.8631153311006166
MAPE: 0.3952895082045715

Non-zero
MSE: 4.652184012785526
RMSE: 2.15689221167529
MAE: 1.4237177108219692
MAPE: 0.3415697502030034

#####
Zeros
MSE: 0.5212797089355838
RMSE: 0.7219970283426268
MAE: 0.4361988104250869
MAPE: 0.4361988104250869
*****************
*****************
*****************

###########
Overall
MSE:  3.338342895473884
RMSE:  1.8271132683755225
MAE:  0.9720335031978915
MAPE:  0.3861912903656771

Non-Zero
MSE: 6.830432975560961
RMSE: 2.6135097045086635
MAE: 1.655045269838195
MAPE: 0.3481344561844237

Zeroes
MSE: 0.5011457444722772
RMSE: 0.7079164812831222
MAE: 0.4171110945628531

In [59]:
all_tests, all_hold_out = test_all_excludes(exclude_features, 3.5, mode='default')
get_rmses(all_tests)
print()
print()
get_rmses(all_hold_out)
print(gc.collect())

overall
MSE: 25.580241471209657
RMSE: 5.057691318300244
MAE: 3.4463586914848974
MAPE: 0.404382646703253

Non-zero
MSE: 26.870248253902027
RMSE: 5.183652018982565
MAE: 3.564698025707033
MAPE: 0.34722336295721673

#####
Zeros
MSE: 3.2201239045419507
RMSE: 1.7944703688113524
MAE: 1.395143564967882
MAPE: 1.395143564967882
overall
MSE: 2.2849988553984883
RMSE: 1.5116212671825202
MAE: 0.8584054289481385
MAPE: 0.39108854766676676

Non-zero
MSE: 4.61152003431957
RMSE: 2.147445001465595
MAE: 1.423478639309747
MAPE: 0.3425079333514651

#####
Zeros
MSE: 0.5132790716248199
RMSE: 0.7164349737588331
MAE: 0.4280842287162784
MAPE: 0.4280842287162784
*****************
*****************
*****************

###########
Overall
MSE:  3.361693250996815
RMSE:  1.8334920918828135
MAE:  0.9716449260135073
MAPE:  0.3859736036112658

Non-Zero
MSE: 6.878408434117952
RMSE: 2.622672002770829
MAE: 1.654017981027118
MAPE: 0.3474883938298729

Zeroes
MSE: 0.5044890925672594
RMSE: 0.7102739559967403
MAE: 0.4172414475223

In [60]:
all_tests, all_hold_out = test_all_excludes(exclude_features, 3.5, mode='all')
get_rmses(all_tests)
print()
print()
get_rmses(all_hold_out)
print(gc.collect())

overall
MSE: 25.736118259947503
RMSE: 5.073077789660583
MAE: 3.4677722762157392
MAPE: 0.41278727606127835

Non-zero
MSE: 26.98196528781622
RMSE: 5.194416741831196
MAE: 3.5789784120170625
MAPE: 0.34774427723830587

#####
Zeros
MSE: 4.141436443556346
RMSE: 2.035051951070622
MAE: 1.5401992556594668
MAPE: 1.5401992556594668
overall
MSE: 2.2943868125066755
RMSE: 1.5147233452042244
MAE: 0.8617421284641908
MAPE: 0.39387554730524504

Non-zero
MSE: 4.626871884697277
RMSE: 2.151016477086421
MAE: 1.4256175765241896
MAPE: 0.34337533622052524

#####
Zeros
MSE: 0.5181253345547377
RMSE: 0.7198092348356874
MAE: 0.4323330622740265
MAPE: 0.4323330622740265
*****************
*****************
*****************

###########
Overall
MSE:  3.330323097900112
RMSE:  1.8249172852214732
MAE:  0.9694516224396663
MAPE:  0.3844428939408845

Non-Zero
MSE: 6.81381869419366
RMSE: 2.6103292309962858
MAE: 1.6532094050237554
MAPE: 0.34815794814645806

Zeroes
MSE: 0.5001086547823521
RMSE: 0.7071836075464081
MAE: 0.413923

#### 4.0

In [61]:
all_tests, all_hold_out = test_all_excludes(exclude_features, 4.0, mode='opt')
get_rmses(all_tests)
print()
print()
get_rmses(all_hold_out)
print(gc.collect())

overall
MSE: 27.41021533053508
RMSE: 5.23547660968274
MAE: 3.581491442450455
MAPE: 0.3800424562359686

Non-zero
MSE: 28.508195692431382
RMSE: 5.3393066677642125
MAE: 3.679182073426625
MAPE: 0.3312615649670313

#####
Zeros
MSE: 3.411501706230227
RMSE: 1.8470250962643218
MAE: 1.4462533653998861
MAPE: 1.4462533653998861
overall
MSE: 2.5766692333889862
RMSE: 1.6052006832134686
MAE: 0.9065984102267148
MAPE: 0.3968464762095323

Non-zero
MSE: 5.159100626697073
RMSE: 2.271365366183317
MAE: 1.4999961169939193
MAPE: 0.34712529523201535

#####
Zeros
MSE: 0.5297702504935966
RMSE: 0.7278531792151468
MAE: 0.4362567137999545
MAPE: 0.4362567137999545
*****************
*****************
*****************

###########
Overall
MSE:  3.3476022748111336
RMSE:  1.8296453959199672
MAE:  0.9736694788948816
MAPE:  0.38691866857844087

Non-Zero
MSE: 6.849982430322154
RMSE: 2.61724710914391
MAE: 1.6588529406639196
MAPE: 0.3499152060212123

Zeroes
MSE: 0.5020448089244973
RMSE: 0.7085512041655827
MAE: 0.4169826463

In [62]:
all_tests, all_hold_out = test_all_excludes(exclude_features, 4.0, mode='default')
get_rmses(all_tests)
print()
print()
get_rmses(all_hold_out)
print(gc.collect())

overall
MSE: 28.74262992496814
RMSE: 5.361215340290682
MAE: 3.6700509071350096
MAPE: 0.38751033598612256

Non-zero
MSE: 29.896393789885117
RMSE: 5.4677594853728815
MAE: 3.769574597269213
MAPE: 0.33685243136188014

#####
Zeros
MSE: 3.5246483060684706
RMSE: 1.8774046729643747
MAE: 1.4947473942017069
MAPE: 1.4947473942017069
overall
MSE: 2.629847400651619
RMSE: 1.621680424945562
MAE: 0.9143372055550628
MAPE: 0.4024064939062125

Non-zero
MSE: 5.259976118301191
RMSE: 2.2934637817722763
MAE: 1.5058479283178317
MAPE: 0.34804951532765693

#####
Zeros
MSE: 0.5451423389831718
RMSE: 0.7383375508418706
MAE: 0.4454911793131632
MAPE: 0.4454911793131632
*****************
*****************
*****************

###########
Overall
MSE:  3.4108593057281373
RMSE:  1.84685118667643
MAE:  0.9814538969549702
MAPE:  0.39077098043539404

Non-Zero
MSE: 6.977070869530229
RMSE: 2.6414145584383815
MAE: 1.6687396963849146
MAPE: 0.35103012522249194

Zeroes
MSE: 0.513441124454882
RMSE: 0.7165480615107978
MAE: 0.423058

In [63]:
all_tests, all_hold_out = test_all_excludes(exclude_features, 4.0, mode='all')
get_rmses(all_tests)
print()
print()
get_rmses(all_hold_out)
print(gc.collect())

overall
MSE: 29.545131441855613
RMSE: 5.4355433437565015
MAE: 3.7000760133777346
MAPE: 0.38717216830476975

Non-zero
MSE: 30.754302589764457
RMSE: 5.545656191089063
MAE: 3.804636529569421
MAPE: 0.34016192034279114

#####
Zeros
MSE: 3.116104923276665
RMSE: 1.765249252450391
MAE: 1.4146818737594449
MAPE: 1.4146818737594449
overall
MSE: 2.5956370260918242
RMSE: 1.6110980808417048
MAE: 0.9073127610149134
MAPE: 0.3935918389025567

Non-zero
MSE: 5.215478600315927
RMSE: 2.2837422359618276
MAE: 1.511027223631927
MAPE: 0.3491800150012442

#####
Zeros
MSE: 0.5190858086916358
RMSE: 0.7204760986262041
MAE: 0.428793748780083
MAPE: 0.428793748780083
*****************
*****************
*****************

###########
Overall
MSE:  3.408254064908384
RMSE:  1.8461457323051136
MAE:  0.9770601105587039
MAPE:  0.3870218448065301

Non-Zero
MSE: 6.980229107133544
RMSE: 2.642012321533256
MAE: 1.666338012711818
MAPE: 0.3500665438924739

Zeroes
MSE: 0.5061532654360629
RMSE: 0.7114444921679153
MAE: 0.41704669298

#### 4.5

In [64]:
all_tests, all_hold_out = test_all_excludes(exclude_features, 4.5, mode='opt')
get_rmses(all_tests)
print()
print()
get_rmses(all_hold_out)
print(gc.collect())

overall
MSE: 41.71586789292314
RMSE: 6.458782229872992
MAE: 4.445798519573041
MAPE: 0.3474734179101035

Non-zero
MSE: 42.59772239549209
RMSE: 6.526693067357472
MAE: 4.523693466817376
MAPE: 0.33562401986254853

#####
Zeros
MSE: 1.4445122756078455
RMSE: 1.2018786442931106
MAE: 0.8885959287484487
MAPE: 0.8885959287484487
overall
MSE: 3.03561398758561
RMSE: 1.7423013480984308
MAE: 0.97000437615111
MAPE: 0.403475864413371

Non-zero
MSE: 6.0108333907830644
RMSE: 2.4517001021297578
MAE: 1.5930253950723532
MAPE: 0.3482816573342271

#####
Zeros
MSE: 0.5503459547123164
RMSE: 0.7418530546626578
MAE: 0.44958083344730737
MAPE: 0.44958083344730737
*****************
*****************
*****************

###########
Overall
MSE:  3.4267827795076826
RMSE:  1.8511571460866532
MAE:  0.9851917092079393
MAPE:  0.3921776176249303

Non-Zero
MSE: 7.001267433934801
RMSE: 2.6459908227230873
MAE: 1.6762517653065272
MAPE: 0.353341753083035

Zeroes
MSE: 0.5226430107000276
RMSE: 0.7229405305417781
MAE: 0.42373035587

In [65]:
all_tests, all_hold_out = test_all_excludes(exclude_features, 4.5, mode='default')
get_rmses(all_tests)
print()
print()
get_rmses(all_hold_out)
print(gc.collect())

overall
MSE: 41.47313733962178
RMSE: 6.439964079063002
MAE: 4.364912396882262
MAPE: 0.34796827178574263

Non-zero
MSE: 42.33194889561048
RMSE: 6.506300707438173
MAE: 4.4355015541515215
MAPE: 0.3305951489433996

#####
Zeros
MSE: 2.254076282804333
RMSE: 1.501358146081185
MAE: 1.1413408815860748
MAPE: 1.1413408815860748
overall
MSE: 3.0170717637229436
RMSE: 1.7369720100574286
MAE: 0.9670091083036292
MAPE: 0.3997962352141489

Non-zero
MSE: 5.979311656040807
RMSE: 2.4452631056883853
MAE: 1.5940261647725456
MAPE: 0.34777878778364846

#####
Zeros
MSE: 0.5426458097352317
RMSE: 0.7366449685806804
MAE: 0.44324758509165935
MAPE: 0.44324758509165935
*****************
*****************
*****************

###########
Overall
MSE:  3.4687038999496935
RMSE:  1.8624456770466336
MAE:  0.989077444507156
MAPE:  0.394040449383334

Non-Zero
MSE: 7.089485181747444
RMSE: 2.662608717357367
MAE: 1.68139289089017
MAPE: 0.3539701370878733

Zeroes
MSE: 0.5269497923751334
RMSE: 0.7259130749443307
MAE: 0.42659613187

In [66]:
all_tests, all_hold_out = test_all_excludes(exclude_features, 4.5, mode='all')
get_rmses(all_tests)
print()
print()
get_rmses(all_hold_out)
print(gc.collect())

overall
MSE: 42.79557383582483
RMSE: 6.541832605304482
MAE: 4.412678966564791
MAPE: 0.350137483238488

Non-zero
MSE: 43.702191689130665
RMSE: 6.610763321215687
MAE: 4.488361160250475
MAPE: 0.3368589145155671

#####
Zeros
MSE: 1.3933585348583872
RMSE: 1.1804060889619248
MAE: 0.9565254549185435
MAPE: 0.9565254549185435
overall
MSE: 2.9863656889925982
RMSE: 1.7281104388876882
MAE: 0.9659147407202986
MAPE: 0.4037613801519012

Non-zero
MSE: 5.897979776092271
RMSE: 2.4285756681833637
MAE: 1.58200053925296
MAPE: 0.346869631066856

#####
Zeros
MSE: 0.5542286146794476
RMSE: 0.7444653213410599
MAE: 0.4512843444023221
MAPE: 0.4512843444023221
*****************
*****************
*****************

###########
Overall
MSE:  3.4376582558299242
RMSE:  1.8540922997062266
MAE:  0.9820817982065929
MAPE:  0.3898295341280824

Non-Zero
MSE: 7.041517524805146
RMSE: 2.65358578621554
MAE: 1.6715498176463695
MAPE: 0.3503393084323216

Zeroes
MSE: 0.509652672912005
RMSE: 0.7138996238351755
MAE: 0.421913917195780

### Optimize

In [None]:
from itertools import combinations
import random
def generate_combinations(lst):
  all_combos = []

  # Generate all possible combinations with at least one missing element
  for r in range(len(lst) - 1, 0, -1):  # Ensure at least one element is missing
      for combo in combinations(lst, r):
          new_list = [x for x in lst if x not in combo]
          all_combos.append(new_list)
  random.shuffle(all_combos)

  # Yield results
  for combo in all_combos:
      yield combo

In [None]:
import gc
excludes = 4.5
#for excludes in exclude_features:
clusters = exclude_features[excludes]['clusters']
excluded_feats_1 = exclude_features[excludes]['features_1']
excluded_feats_2 = exclude_features[excludes]['features_2']
train_1, test_1, train_2, test_2 = divide_excludes_test(df_train, df_test, clusters)


best_1 = 2000000000000
best_comb_1 = []
best_2 = 2000000000000
best_comb_2 = []
m = len(excluded_feats_1)
flag = False
current_size = 200000
counters = 0
size_counter = 0


In [None]:
import multiprocessing

manager = multiprocessing.Manager()
best_mse_1 = manager.Value('d', float('inf'))
best_mse_2 = manager.Value('d', float('inf'))
best_comb_1 = manager.list()
best_comb_2 = manager.list()
lock = manager.Lock()  # Lock to avoid race conditions

In [None]:
print(train_1.shape, train_2.shape)

(29280, 148) (1111020, 148)


In [None]:
import time
def optimize_multithread(combs, train, test, best_mse, best_comb, lock):

  features = [i for i in FEATURES if i not in combs]

  model, model_preds, preds_list, overall_out, non_zero_out, zero_out = train_model(
      train, test, features, model='xgb'
  )

  print(gc.collect())
  with lock:
    if overall_out['mse'] <= best_mse.value:
      best_mse.value = overall_out['mse']
      best_comb[:] = combs

      print("Best comb:", best_comb)
      print_results(overall_out, non_zero_out, zero_out)
      print("###############")
  del model
  del model_preds
  del preds_list
  del overall_out
  del non_zero_out
  del zero_out



### 1

In [None]:
combs = generate_combinations(excluded_feats_1)

In [None]:
# processes = []
# counter = 0


# for comb in combs:
#   p = multiprocessing.Process(target=optimize_multithread,
#                               args=(
#                                   comb, train_1,
#                                   test_1, best_mse_1, best_comb_1, lock
#                                 ))
#   p.start()
#   processes.append(p)
#   counter += 1
#   time.sleep(1)
#   if counter%5==0:
#     time.sleep(5)
#   if counter >= 4200:
#     break
#   print(counter)

#   #mse = mean_squared_error(overall_preds['actual_demand'], overall_preds['pred'])

# for p in processes:
#         p.join()

# print("Best Comb1: ", best_comb_1)

counter = 0
stop_counter = 0
best_mse = 20000000000000
best_comb = []
for comb in combs:
  features = [i for i in FEATURES if i not in comb]

  model, model_preds, preds_list, overall_out, non_zero_out, zero_out = train_model(
      train_1, test_1, features, model='xgb'
  )

  if overall_out['mse'] < best_mse:
    best_mse = overall_out['mse']
    best_comb = comb

    print("Best comb:", best_comb)
    print_results(overall_out, non_zero_out, zero_out)
    print(gc.collect())
    print("###############")
    stop_counter= 0
  else:
    stop_counter+=1
  print(stop_counter)
  del model
  del model_preds
  del preds_list
  del overall_out
  del non_zero_out
  del zero_out
  counter+=1

  if stop_counter==200:
    break
  if counter >= 4200:
    break

print("Best Comb1: ", best_comb)



Best comb: ['started_at_quarter', 'prcp_lag_1_h', 'temp_lag_2_h', 'cosine_sim_pca_0']
overall
MSE: 33.08057191712811
RMSE: 5.751571256372307
MAE: 3.9782013373005958
MAPE: 0.40783117855542567

Non-zero
MSE: 34.67295230653351
RMSE: 5.888374334783202
MAE: 4.108226902556858
MAPE: 0.3363533816061358

Zeros
MSE: 4.865720504850194
RMSE: 2.205837823787187
MAE: 1.674322171297578
MAPE: 1.674322171297578
###############
525
###############
0
1
Best comb: ['started_at_quarter', 'prcp', 'wspd_lag_1_h', 'wspd_lag_24_h', 'wspd_lag_2_h', 'started_at_month', 'cosine_sim_pca_0']
overall
MSE: 32.58394153531375
RMSE: 5.708234537518036
MAE: 3.9477882133353326
MAPE: 0.4081025139220644

Non-zero
MSE: 34.148256506316606
RMSE: 5.843650956920391
MAE: 4.075844770912629
MAPE: 0.33638746859268503

Zeros
MSE: 4.866371812335541
RMSE: 2.2059854515239987
MAE: 1.6787972307470849
MAPE: 1.6787972307470849
###############
894
###############
0
1
Best comb: ['prcp_lag_2_h', 'prcp_lag_1_h', 'wspd_lag_1_h', 'wspd', 'wspd_lag

### 2

In [None]:
combs = generate_combinations(excluded_feats_2)

In [None]:
processes = []
counter = 0
stop_counter = 0
best_mse = 20000000000000
best_comb = []
for comb in combs:
  features = [i for i in FEATURES if i not in comb]

  model, model_preds, preds_list, overall_out, non_zero_out, zero_out = train_model(
      train_2, test_2, features, model='xgb'
  )

  if overall_out['mse'] < best_mse:
    best_mse = overall_out['mse']
    best_comb = comb

    print("Best comb:", best_comb)
    print_results(overall_out, non_zero_out, zero_out)
    print(gc.collect())
    print("###############")
    stop_counter= 0
  else:
    stop_counter+=1
  print(stop_counter)
  del model
  del model_preds
  del preds_list
  del overall_out
  del non_zero_out
  del zero_out
  counter+=1

  if stop_counter==200:
    break
  if counter >= 4200:
    break

print("Best Comb2: ", best_comb)



Best comb: ['started_at_quarter', 'prcp_lag_2_h', 'prcp', 'started_at_month', 'rhum', 'wspd_lag_2_h', 'temp', 'wspd_lag_1_h']
overall
MSE: 2.648823366770875
RMSE: 1.6275206194610485
MAE: 0.9042050389674118
MAPE: 0.3909985528698682

Non-zero
MSE: 5.422984514487599
RMSE: 2.328730236521096
MAE: 1.533388118362684
MAPE: 0.35379222261009763

Zeros
MSE: 0.5123586002839409
RMSE: 0.7157922885054999
MAE: 0.41965226455338067
MAPE: 0.41965226455338067
###############
517
###############
0
1
2
3
4
5
6
7
8
9
Best comb: ['prcp_lag_2_h', 'is_holiday', 'rhum', 'wspd_lag_2_h', 'temp', 'dwpt', 'temp_lag_2_h']
overall
MSE: 2.6459492382361787
RMSE: 1.6266374022000658
MAE: 0.903531481412175
MAPE: 0.39087450716160016

Non-zero
MSE: 5.420765395191249
RMSE: 2.328253722254353
MAE: 1.5312834126179156
MAPE: 0.3529505599813285

Zeros
MSE: 0.5089800294746462
RMSE: 0.713428363239538
MAE: 0.42008087717490994
MAPE: 0.42008087717490994
###############
29
###############
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19

In [None]:
#aggregated_importance = []
#
#for i in range(df_test.shape[0]):
#    row = df_test[FEATURES].loc[i].values#.reshape(1, -1)
#    explanation = explainer.explain_instance(
#        data_row=row,
#        predict_fn=model.predict  # Prediction function of the model
#    )
#    # Extract feature importance
#    feature_importance = dict(explanation.as_list())
#    aggregated_importance.append(feature_importance)
#
## Aggregate feature importance
#importance_df = pd.DataFrame(aggregated_importance).fillna(0)  # Handle missing features
#mean_importance = importance_df.mean().sort_values(ascending=False)

In [None]:
#11-15
3.519954687632939
1.8761542281041126
0.39561652953215365


#11-28 (20 dim)
3.524785445848623
1.877441196375701
0.39610593754659995

#11-29 (no dim)
3.563436345796625
1.887706636582238
0.4006454580255268

#11-29 (5 dim)
3.504096872035053
1.8719233082674762
0.39420467949842114

#11-29 (6 dim)
3.524516281760969
1.8773695112473114
0.39833392477077356

#11-29 (1 dim)
3.4851078325990033
1.8668443514655964
0.39848029011836106

0.39848029011836106

In [None]:
suf = "_o_xgb_01_27"


model_preds.to_csv(f"{models_dir}/test_predictions{suf}.csv")
del model_preds

filename = f'{models_dir}/demand_model{suf}.sav'
joblib.dump(model, filename)
del model

## LightGBM

In [None]:
#FEATURES.extend([f"dim_mean_{i}" for i in range(50)])
#FEATURES.extend([f"cosine_sim_pca_{i}" for i in range(PCA_DIM)])

model, model_preds, preds_list = train_model(df_train, df_test, FEATURES, model='lgbm')

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.750807 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3954
[LightGBM] [Info] Number of data points in the train set: 1140300, number of used features: 37
[LightGBM] [Info] Start training from score 1.005143
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.169623 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3944
[LightGBM] [Info] Number of data points in the train set: 1140300, number of used features: 37
[LightGBM] [Info] Start training from score 1.005143
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.186575 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Tota

In [None]:
#11-28
3.622206095204831
1.9032094196921239
0.3979671041602479

#11-29
3.5923026180976714
1.895337072422125
0.3977005786689495

0.3977005786689495

In [None]:
suf = "_o_lgbm_01_27"


model_preds.to_csv(f"{models_dir}/test_predictions{suf}.csv")
del model_preds

filename = f'{models_dir}/demand_model{suf}.sav'
joblib.dump(model, filename)
del model

## Random Forest

In [None]:
#FEATURES.extend([f"dim_mean_{i}" for i in range(50)])
#FEATURES.extend([f"cosine_sim_pca_{i}" for i in range(PCA_DIM)])

model, model_preds, preds_list = train_model(df_train, df_test, FEATURES, model='rf')

In [None]:
#11_01
3.6042873699256828
1.8984960810930538
0.4084309933647605

#11_28
3.7858322010061154
1.9457215116778956
0.44009657232024235

In [None]:
suf = "_o_rf_01_27"


model_preds.to_csv(f"{models_dir}/test_predictions{suf}.csv")
del model_preds

filename = f'{models_dir}/demand_model{suf}.sav'
joblib.dump(model, filename)
del model