### Importing required libraries

In [1]:
import os
import pickle
import pandas as pd
import numpy as np

from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"
pd.options.display.max_columns = None

from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, GridSearchCV

import warnings
warnings.filterwarnings("ignore")

---

In [2]:
# Helper for reading back an object that was previously saved with pickle
def load_from_pickle(filename):
    """Deserialize and return the object stored in the pickle file `filename`.

    The file is opened in binary mode and closed again before returning.

    NOTE: unpickling can execute arbitrary code, so only call this on files
    that come from a trusted source.
    """
    with open(filename, 'rb') as pickle_file:
        return pickle.load(pickle_file)

# Read the resampled dataset that every later cell works from
resampled_data = load_from_pickle(filename='resampled_data.pkl')

### Deriving the best hyperparameters for each region using the selected most influential features

In [3]:
# Seeded RandomState that is handed to the splitter below
random_state = np.random.RandomState(seed=42)

# Order-preserving split: shuffle=False keeps the rows in their original order.
# NOTE(review): with shuffle=False the random_state presumably has no effect — confirm.
# NOTE(review): the split is positional over all regions at once; verify that every
# region ends up in both parts.
train_data, test_data = train_test_split(
    resampled_data,
    shuffle=False,
    random_state=random_state,
)

# Feature columns fed to the models, and the column being predicted
input_columns = [
    'Precipitation', 'RelativeHumidity%', 'AirTemperature', 'WetBulbTemperature',
    'DewTemperature', 'SeaPressure', 'StationPressure',
    'Month', 'Day', 'Hour', 'DayOfYear',
    'Season_Autumn', 'Season_Spring', 'Season_Summer', 'Season_Winter',
    'TimeOfDay_Afternoon', 'TimeOfDay_Evening', 'TimeOfDay_Morning', 'TimeOfDay_Night',
    'IsWeekend_False', 'IsWeekend_True',
]
output_columns = ["TotalDemand"]

# Training frames: features and target, each carrying the "Region" column as well
train_input_data = train_data[[*input_columns, "Region"]]
train_output_data = train_data[[*output_columns, "Region"]]

# Testing frames, built the same way
test_input_data = test_data[[*input_columns, "Region"]]
test_output_data = test_data[[*output_columns, "Region"]]

In [4]:
def get_best_model(test_input: pd.DataFrame, test_output: pd.DataFrame):
    """Tune a LightGBM regressor with a random search followed by a grid search.

    A RandomizedSearchCV first samples 10 candidates from a broad parameter
    space. A GridSearchCV then explores the neighbourhood of the winning
    candidate: `max_depth` +/- 1 and `n_estimators` +/- 50, with every other
    parameter pinned to the value the random search selected.

    Parameters
    ----------
    test_input : pd.DataFrame
        Feature matrix both searches are fitted on. Despite the name, the
        caller in this notebook passes a region's *training* rows.
    test_output : pd.DataFrame
        Target frame for the same rows; it is flattened with
        `.values.ravel()` before fitting.

    Returns
    -------
    tuple
        `(grid, best_score, best_params)`: the fitted GridSearchCV object,
        its `best_score_` and its `best_params_`.
    """
    # Candidate values sampled by the random search.
    # NOTE(review): LightGBM documents `subsample` as an alias of
    # `bagging_fraction`, and bagging as inactive unless a bagging frequency
    # > 0 is set — confirm these two entries really influence the fit.
    random_param_grid = {
        'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2, 0.25],
        'max_depth': [5, 6, 7, 8],
        'n_estimators': [200, 300, 400, 500],
        'subsample': [0.7, 0.8, 0.9, 1.0],
        'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
        'bagging_fraction': [0.7, 0.8, 0.9, 1.0],
        'min_child_samples': [5, 10, 20, 50, 100],
        'boosting_type': ['gbdt', 'dart', 'goss']
    }

    # Base estimator shared by both searches (each search clones it per candidate)
    regressor = LGBMRegressor(random_state=42, verbose=-1, n_jobs=-1)

    # The target frame is flattened to the 1-D array the estimators expect
    target = test_output.values.ravel()

    # Stage 1: coarse random search over the broad space
    random_search = RandomizedSearchCV(regressor, param_distributions=random_param_grid, n_iter=10, cv=3, random_state=42, n_jobs=-1)
    random_search.fit(test_input, target)

    best_params_ran = random_search.best_params_

    # Stage 2: refine only tree depth and tree count around the stage-1 winner
    grid_param_grid = {
        'learning_rate': [best_params_ran['learning_rate']],
        'max_depth': [best_params_ran['max_depth'] - 1, best_params_ran['max_depth'], best_params_ran['max_depth'] + 1],
        'n_estimators': [best_params_ran['n_estimators'] - 50, best_params_ran['n_estimators'], best_params_ran['n_estimators'] + 50],
        'subsample': [best_params_ran['subsample']],
        'colsample_bytree': [best_params_ran['colsample_bytree']],
        'bagging_fraction': [best_params_ran['bagging_fraction']],
        'min_child_samples': [best_params_ran['min_child_samples']],
        'boosting_type': [best_params_ran['boosting_type']]
    }

    # NOTE(review): no `cv` is passed here, so GridSearchCV uses its default
    # fold count rather than the cv=3 used by the random search above.
    grid = GridSearchCV(regressor, param_grid=grid_param_grid, n_jobs=-1)
    grid.fit(test_input, target)

    best_score = grid.best_score_
    best_params = grid.best_params_

    return grid, best_score, best_params

In [5]:

def get_predictions(regressor, test_input, test_output, train_input=None, train_output=None):
    """Predict with `regressor` and return the inputs with the predictions appended.

    The predictions are placed in a frame whose columns are the module-level
    `output_columns` and whose index matches the input frame, then joined
    onto that input frame.

    Parameters
    ----------
    regressor : object exposing `predict(features)`.
    test_input : pd.DataFrame
        Features to predict on.
    test_output :
        Not read by this function; kept in the signature for callers.
    train_input, train_output : optional
        When both are given, predictions are also produced for `train_input`
        (`train_output` itself is only checked for being present).

    Returns
    -------
    pd.DataFrame or tuple of pd.DataFrame
        `test_results` alone, or `(test_results, train_results)` when both
        training arguments were supplied.
    """
    def _attach_predictions(features):
        # Build the prediction frame on the features' index so the join lines up row by row
        predicted = pd.DataFrame(
            regressor.predict(features),
            columns=output_columns,
            index=features.index,
        )
        return features.join(predicted)

    test_results = _attach_predictions(test_input)

    # Without a complete training pair only the test frame is returned
    if train_input is None or train_output is None:
        return test_results

    train_results = _attach_predictions(train_input)
    return test_results, train_results

In [6]:

# Per-region results, appended in the order the regions are iterated
models, regressors = [], []

test_predictions, train_predictions = [], []

# Group the test rows once instead of re-grouping them on every iteration
test_groups = test_data.groupby("Region")

for region, dataframe in train_data.groupby("Region"):

    # Drop incomplete rows once, then split into features and target
    complete_rows = dataframe.dropna()
    model_input, model_output = complete_rows[input_columns], complete_rows[output_columns]

    # Cross validate to find the best model; the returned search object is already fitted
    grid, score, params = get_best_model(model_input, model_output)
    regressors.append(grid)
    # Previously this called .fit() on the search again, which repeated the whole
    # grid search only to get the same object back (fit returns the estimator itself).
    models.append(grid)

    print(f"Best {region} model has a score of {score} and best params {params}")

    # Get the test data for this specific region
    region_test = test_groups.get_group(region)
    test_input = region_test[input_columns].dropna()
    test_output = region_test[output_columns].dropna()

    # Generate predictions for both the test rows and the rows the model was tuned on
    test_results, train_results = get_predictions(grid, test_input, test_output, model_input, model_output)
    test_predictions.append(test_results)
    train_predictions.append(train_results)

Best NSW model has a score of 0.8142628048304628 and best params {'bagging_fraction': 0.7, 'boosting_type': 'goss', 'colsample_bytree': 0.7, 'learning_rate': 0.05, 'max_depth': 7, 'min_child_samples': 20, 'n_estimators': 250, 'subsample': 1.0}
Best QLD model has a score of 0.8337583180520138 and best params {'bagging_fraction': 0.7, 'boosting_type': 'goss', 'colsample_bytree': 0.7, 'learning_rate': 0.05, 'max_depth': 6, 'min_child_samples': 20, 'n_estimators': 250, 'subsample': 1.0}
Best SA model has a score of 0.6620201561704514 and best params {'bagging_fraction': 0.7, 'boosting_type': 'goss', 'colsample_bytree': 0.7, 'learning_rate': 0.05, 'max_depth': 7, 'min_child_samples': 20, 'n_estimators': 200, 'subsample': 1.0}
Best TAS model has a score of 0.7728150057772355 and best params {'bagging_fraction': 0.7, 'boosting_type': 'goss', 'colsample_bytree': 0.7, 'learning_rate': 0.05, 'max_depth': 7, 'min_child_samples': 20, 'n_estimators': 250, 'subsample': 1.0}
Best VIC model has a scor

In [7]:
# Persist the fitted search objects together with the best parameters each one found
with open("trained_models.pickle", "wb") as out_file:
    payload = (models, [search.best_params_ for search in regressors])
    pickle.dump(payload, out_file)

In [8]:
# Read back the (models, best parameters) pair stored in trained_models.pickle
with open('trained_models.pickle', 'rb') as in_file:
    models, best_params = pickle.load(in_file)

# Region labels; they are paired with the saved lists purely by position.
# NOTE(review): this presumes the lists were saved in exactly this region order — confirm.
regions = ['NSW', 'QLD', 'SA', 'TAS', 'VIC']
trained_models_dict = dict(zip(regions, models))

# One model per region, looked up by region label
nsw_model, qld_model, sa_model, tas_model, vic_model = (
    trained_models_dict[name] for name in regions
)

# One best-parameter dict per region, looked up by the region's position in `regions`
nsw_best_params, qld_best_params, sa_best_params, tas_best_params, vic_best_params = (
    best_params[regions.index(name)] for name in regions
)

In [9]:
# Show the best parameters found for each region.
# Every bare expression below produces its own output because the imports cell
# sets InteractiveShell.ast_node_interactivity = "all".
nsw_best_params
qld_best_params
sa_best_params
tas_best_params
vic_best_params

{'bagging_fraction': 0.7,
 'boosting_type': 'goss',
 'colsample_bytree': 0.7,
 'learning_rate': 0.05,
 'max_depth': 7,
 'min_child_samples': 20,
 'n_estimators': 250,
 'subsample': 1.0}

{'bagging_fraction': 0.7,
 'boosting_type': 'goss',
 'colsample_bytree': 0.7,
 'learning_rate': 0.05,
 'max_depth': 6,
 'min_child_samples': 20,
 'n_estimators': 250,
 'subsample': 1.0}

{'bagging_fraction': 0.7,
 'boosting_type': 'goss',
 'colsample_bytree': 0.7,
 'learning_rate': 0.05,
 'max_depth': 7,
 'min_child_samples': 20,
 'n_estimators': 200,
 'subsample': 1.0}

{'bagging_fraction': 0.7,
 'boosting_type': 'goss',
 'colsample_bytree': 0.7,
 'learning_rate': 0.05,
 'max_depth': 7,
 'min_child_samples': 20,
 'n_estimators': 250,
 'subsample': 1.0}

{'bagging_fraction': 0.7,
 'boosting_type': 'gbdt',
 'colsample_bytree': 0.7,
 'learning_rate': 0.1,
 'max_depth': 8,
 'min_child_samples': 50,
 'n_estimators': 450,
 'subsample': 0.7}

### Training the model for each region with best parameters derived

In [12]:
# Keep only the region label, the model inputs and the target. The column list is
# built from input_columns / output_columns (it previously repeated the same names
# by hand) so it cannot drift from the columns used during tuning.
data = resampled_data[['Region', *input_columns, *output_columns]]

# Save data to a pickle file
data.to_pickle('data.pkl')

In [13]:
# Factory for a LightGBM regressor configured from a tuned parameter dict
def create_lgb_model(best_params):
    """Return an unfitted LGBMRegressor built from `best_params`.

    `best_params` is expanded into keyword arguments alongside the fixed
    settings n_jobs=-1, random_state=42 and verbose=-1, so it must not
    contain any of those three keys.
    """
    return LGBMRegressor(n_jobs=-1, random_state=42, verbose=-1, **best_params)

# Function to train LightGBM model for a specific region
def train_lgb_model(data, region, input_columns, output_columns, best_params):
    """Fit a LightGBM model on the leading part of one region's rows.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing a 'Region' column plus the input and output columns.
    region :
        Value of the 'Region' column whose rows are used.
    input_columns, output_columns :
        Column selections for the features and the target.
    best_params : dict
        Hyperparameters forwarded to `create_lgb_model`.

    Returns
    -------
    tuple
        `(model, X_test, y_test)`: the fitted model and the held-out rows
        that were not used for fitting.
    """
    # Filter data for the specified region
    region_data = data[data['Region'] == region]

    # Prepare the input features and target variable
    data_X = region_data[input_columns]
    data_y = region_data[output_columns]

    # Order-preserving split. No random_state is passed: with shuffle=False
    # the rows are never permuted, so a seed would have no effect.
    X_train, X_test, y_train, y_test = train_test_split(data_X, data_y, shuffle=False)

    # Create and fit the LightGBM model with best parameters. The target is
    # flattened to 1-D, matching how the tuning code passes it to fit().
    # NOTE(review): unlike the tuning loop, rows with missing values are not
    # dropped here — confirm that is intended.
    model = create_lgb_model(best_params)
    model.fit(X_train, y_train.values.ravel())

    return model, X_test, y_test

In [14]:
# Train the NSW model with the NSW best parameters
# (this rebinds nsw_model, replacing the object loaded from trained_models.pickle)
nsw_model, nsw_X_test, nsw_y_test = train_lgb_model(
    data=data,
    region='NSW',
    input_columns=input_columns,
    output_columns=output_columns,
    best_params=nsw_best_params,
)

In [15]:
# Train the SA model with the SA best parameters
# (this rebinds sa_model, replacing the object loaded from trained_models.pickle)
sa_model, sa_X_test, sa_y_test = train_lgb_model(
    data=data,
    region='SA',
    input_columns=input_columns,
    output_columns=output_columns,
    best_params=sa_best_params,
)

In [16]:
# Train the QLD model with the QLD best parameters
# (this rebinds qld_model, replacing the object loaded from trained_models.pickle)
qld_model, qld_X_test, qld_y_test = train_lgb_model(
    data=data,
    region='QLD',
    input_columns=input_columns,
    output_columns=output_columns,
    best_params=qld_best_params,
)

In [17]:
# Train the VIC model with the VIC best parameters
# (this rebinds vic_model, replacing the object loaded from trained_models.pickle)
vic_model, vic_X_test, vic_y_test = train_lgb_model(
    data=data,
    region='VIC',
    input_columns=input_columns,
    output_columns=output_columns,
    best_params=vic_best_params,
)

In [18]:
# Train the TAS model with the TAS best parameters
# (this rebinds tas_model, replacing the object loaded from trained_models.pickle)
tas_model, tas_X_test, tas_y_test = train_lgb_model(
    data=data,
    region='TAS',
    input_columns=input_columns,
    output_columns=output_columns,
    best_params=tas_best_params,
)

### Saving the individual regional models in pickle files

In [19]:
# Serialize the NSW model to nsw_model.pickle
with open('nsw_model.pickle', mode='wb') as nsw_file:
    pickle.dump(nsw_model, nsw_file)

In [20]:
# Serialize the SA model to sa_model.pickle
with open('sa_model.pickle', mode='wb') as sa_file:
    pickle.dump(sa_model, sa_file)

In [21]:
# Serialize the QLD model to qld_model.pickle
with open('qld_model.pickle', mode='wb') as qld_file:
    pickle.dump(qld_model, qld_file)

In [22]:
# Serialize the VIC model to vic_model.pickle
with open('vic_model.pickle', mode='wb') as vic_file:
    pickle.dump(vic_model, vic_file)

In [23]:
# Serialize the TAS model to tas_model.pickle
with open('tas_model.pickle', mode='wb') as tas_file:
    pickle.dump(tas_model, tas_file)