In [None]:
import os
import random
import json
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.dates as mdates
from IPython.display import display

import numpy as np
import pandas as pd
from scipy.stats import uniform, randint

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import (train_test_split, TimeSeriesSplit,
                                     RandomizedSearchCV, GridSearchCV)
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer
from sklearn.linear_model import Ridge
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor

import xgboost as xgb
from xgboost import XGBRegressor

import lightgbm as lgb
from lightgbm import LGBMRegressor, early_stopping, log_evaluation

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Model, Input
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import (Dense, LSTM, Dropout, Embedding, Reshape, Concatenate,
                                     RepeatVector, Bidirectional, Layer, Add, LayerNormalization,
                                     Multiply, Lambda, MultiHeadAttention, GlobalAveragePooling1D,
                                     BatchNormalization, Conv1D, Activation, ReLU)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import register_keras_serializable
import keras_tuner as kt
from keras_tuner import HyperModel, RandomSearch, Objective
from keras_tuner.tuners import BayesianOptimization

from tcn import TCN
import pickle
import optuna
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Add, TimeDistributed

In [None]:
# Load the raw transaction data. Later cells read the BRANCHID, TXNDATE,
# TOTALCREDITAMOUNT, TOTALDEBITAMOUNT and CUSTOMER columns from this frame.
df = pd.read_csv("data.csv")

In [None]:
# Quick look at the first rows of the raw data.
df.head()

# Feature Engineering

In [None]:
# Convert TXNDATE to datetime format.
# dayfirst=True makes ambiguous strings resolve as day/month/year.
df['TXNDATE'] = pd.to_datetime(df['TXNDATE'], dayfirst=True)  

In [None]:
# Order rows by branch, then chronologically within each branch, and
# renumber the index so it matches the new row order.
df = df.sort_values(['BRANCHID', 'TXNDATE'], ignore_index=True)

In [None]:
# Calendar features derived from the transaction date.
txn_dt = df['TXNDATE'].dt

# 0 = Monday ... 6 = Sunday
df['DayOfWeek'] = txn_dt.dayofweek

# Saturday (5) and Sunday (6) count as weekend
df['IsWeekend'] = (df['DayOfWeek'] >= 5).astype(int)

# Calendar month and year
df['Month'] = txn_dt.month
df['Year'] = txn_dt.year

In [None]:
# View the new calendar features next to the date they were derived from.
df[['TXNDATE', 'DayOfWeek', 'IsWeekend', 'Month', 'Year']].head(500)

In [None]:
# Load the holiday list from CSV; the next cells use its 'DATE' column.
holidays_df = pd.read_csv("holidays.csv")

In [None]:
# Convert 'DATE' column to datetime format.
# NOTE(review): TXNDATE is parsed with dayfirst=True but this column is not;
# confirm holidays.csv uses an unambiguous date format (e.g. ISO) so the
# two columns compare equal on the same calendar day.
holidays_df['DATE'] = pd.to_datetime(holidays_df['DATE'])

In [None]:
# Holiday calendar as a set, used for the membership test below.
holiday_dates = set(holidays_df['DATE'])

# IsHoliday: 1 when the transaction date is in the holiday calendar, else 0.
on_holiday = df['TXNDATE'].isin(holiday_dates)
df['IsHoliday'] = on_holiday.astype(int)

In [None]:
# Check the holiday flag against the transaction dates.
df[['TXNDATE', 'IsHoliday']].head(1000)

In [None]:
# IsNonWorkingDay: 1 when the day is a weekend or a holiday (or both), else 0.
closed_day = df[['IsWeekend', 'IsHoliday']].eq(1).any(axis=1)
df['IsNonWorkingDay'] = closed_day.astype(int)

In [None]:
# Inspect the frame after the calendar and holiday flags were added.
df.head(500)

### Lag Features (Past Values)

In [None]:
# Previous-row value of each series, computed per branch so that values
# never spill from one branch into the next.
for suffix, source_col in (
    ('Credit', 'TOTALCREDITAMOUNT'),
    ('Debit', 'TOTALDEBITAMOUNT'),
    ('Customers', 'CUSTOMER'),
):
    df[f'Lag1_{suffix}'] = df.groupby('BRANCHID')[source_col].shift(1)

In [None]:
# Per-branch lags of credit, debit and customer count.
# A "lag" here is a number of rows within the branch, not calendar days.
lag_sources = (
    ('Credit', 'TOTALCREDITAMOUNT'),
    ('Debit', 'TOTALDEBITAMOUNT'),
    ('Customers', 'CUSTOMER'),
)
for lag in (1, 2, 3, 7, 14):
    for suffix, source_col in lag_sources:
        df[f'Lag{lag}_{suffix}'] = df.groupby('BRANCHID')[source_col].shift(lag)

In [None]:
# Show the raw series side by side with every lag column built above.
lag_preview_cols = ['TOTALCREDITAMOUNT', 'TOTALDEBITAMOUNT', 'CUSTOMER'] + [
    f'Lag{lag}_{suffix}'
    for lag in (1, 2, 3, 7, 14)
    for suffix in ('Credit', 'Debit', 'Customers')
]
df[lag_preview_cols].head(20)

### Rolling Statistics (Short-term & Long-term Trends)

In [None]:
# 7-row rolling mean and standard deviation per branch.
# shift(1) runs before rolling(), so each window covers only earlier rows and
# never the current row's own value.
for label, source_col in (
    ('Debit', 'TOTALDEBITAMOUNT'),
    ('Customers', 'CUSTOMER'),
    ('Credit', 'TOTALCREDITAMOUNT'),
):
    df[f'Rolling7_{label}Mean'] = df.groupby('BRANCHID')[source_col].transform(
        lambda s: s.shift(1).rolling(window=7).mean()
    )
    df[f'Rolling7_{label}Std'] = df.groupby('BRANCHID')[source_col].transform(
        lambda s: s.shift(1).rolling(window=7).std()
    )

In [None]:
# The first rows of each branch are NaN in the rolling columns, because the
# 7-row window is not yet full.
df.head(10)

In [None]:
# 3-row rolling statistics per branch, shifted so the current row is excluded.
# 3-row rolling averages
for label, source_col in (
    ('Credit', 'TOTALCREDITAMOUNT'),
    ('Debit', 'TOTALDEBITAMOUNT'),
    ('Customers', 'CUSTOMER'),
):
    df[f'Rolling3_{label}Mean'] = df.groupby('BRANCHID')[source_col].transform(
        lambda s: s.shift(1).rolling(3).mean()
    )

# 3-row rolling standard deviations
for label, source_col in (
    ('Customers', 'CUSTOMER'),
    ('Credit', 'TOTALCREDITAMOUNT'),
    ('Debit', 'TOTALDEBITAMOUNT'),
):
    df[f'Rolling3_{label}Std'] = df.groupby('BRANCHID')[source_col].transform(
        lambda s: s.shift(1).rolling(3).std()
    )

In [None]:
# 30-row rolling statistics per branch, shifted so the current row is excluded.
# 30-row rolling averages
for label, source_col in (
    ('Credit', 'TOTALCREDITAMOUNT'),
    ('Debit', 'TOTALDEBITAMOUNT'),
    ('Customers', 'CUSTOMER'),
):
    df[f'Rolling30_{label}Mean'] = df.groupby('BRANCHID')[source_col].transform(
        lambda s: s.shift(1).rolling(30).mean()
    )

# 30-row rolling standard deviations
for label, source_col in (
    ('Customers', 'CUSTOMER'),
    ('Credit', 'TOTALCREDITAMOUNT'),
    ('Debit', 'TOTALDEBITAMOUNT'),
):
    df[f'Rolling30_{label}Std'] = df.groupby('BRANCHID')[source_col].transform(
        lambda s: s.shift(1).rolling(30).std()
    )

### Combined Features (Lag & Rolling)

In [None]:
# Compare yesterday's value (Lag1) with the short-term rolling statistics.
target_cols = ['Credit', 'Debit', 'Customers']
rolling_windows = [3, 7]

for target in target_cols:
    lag_col = f'Lag1_{target}'
    previous_value = df[lag_col]

    for window in rolling_windows:
        window_mean = df[f'Rolling{window}_{target}Mean']
        window_std = df[f'Rolling{window}_{target}Std']
        deviation = previous_value - window_mean

        # Absolute gap between the last value and the rolling mean
        df[f'{lag_col}_vs_Mean{window}'] = deviation

        # Relative level; the small epsilon avoids division by zero
        df[f'{lag_col}_Ratio_Mean{window}'] = previous_value / (window_mean + 1e-6)

        # Standardised gap (z-score); same epsilon guard on the std
        df[f'{lag_col}_ZScore{window}'] = deviation / (window_std + 1e-6)

In [None]:
# Inspect the frame after the lag, rolling and combined features were added.
df.head(100)

In [None]:
# Keep rows ordered by branch and then by date (both ascending) before the
# per-branch diff features are computed.
df = df.sort_values(by=['BRANCHID', 'TXNDATE'])

### Ratios & Deltas

In [None]:
# How much of the credit was withdrawn.
# The +1 in the denominator guards against division by zero.
# NOTE(review): this uses the same day's debit and credit totals, which are
# also what NetCashFlow is built from - confirm it should be a model input.
df['Debit_to_Credit_Ratio'] = df['TOTALDEBITAMOUNT'] / (df['TOTALCREDITAMOUNT'] + 1)

In [None]:
# Cash paid out / paid in per customer who visited that day.
# The +1 keeps the denominator non-zero on days with no customers.
customers_plus_one = df['CUSTOMER'] + 1
df['CashOut_Per_Customer'] = df['TOTALDEBITAMOUNT'] / customers_plus_one
df['CashIn_Per_Customer'] = df['TOTALCREDITAMOUNT'] / customers_plus_one

In [None]:
# Net Cash Flow: overall surplus (positive) or deficit (negative) of cash
# that day. This is the response variable the models below predict.
df['NetCashFlow'] = df['TOTALCREDITAMOUNT'] - df['TOTALDEBITAMOUNT']

In [None]:
# Net Cash Flow per Customer (+1 avoids division by zero).
# Derived directly from the target, so it is excluded from the features later.
df['NetCashFlow_Per_Customer'] = df['NetCashFlow'] / (df['CUSTOMER'] + 1)

In [None]:
# Row-over-row change of each series within a branch (current minus previous).
for label, source_col in (
    ('Debit', 'TOTALDEBITAMOUNT'),
    ('Credit', 'TOTALCREDITAMOUNT'),
    ('Customers', 'CUSTOMER'),
):
    df[f'Delta_{label}'] = df.groupby('BRANCHID')[source_col].diff(1)

In [None]:
# Rolling Delta vs Mean (Z-score type feature) : helps to detect spikes or drops.
# The current day's amount is compared with the 7-row rolling mean of the
# preceding rows; +1 in the denominator avoids division by zero.
# NOTE(review): unlike the Lag1_*_ZScore features, these use the same day's
# debit/credit totals, which also define NetCashFlow - confirm they are meant
# to be model inputs.
df['Debit_ZScore'] = (df['TOTALDEBITAMOUNT'] - df['Rolling7_DebitMean']) / (df['Rolling7_DebitStd'] + 1)
df['Credit_ZScore'] = (df['TOTALCREDITAMOUNT'] - df['Rolling7_CreditMean']) / (df['Rolling7_CreditStd'] + 1)

### Time Positioning Flags

In [None]:
# Flags for the first and last calendar day of a month.
txn_dt = df['TXNDATE'].dt
df['IsMonthStart'] = txn_dt.is_month_start.astype(int)
df['IsMonthEnd'] = txn_dt.is_month_end.astype(int)

In [None]:
# Flags for the first and last 5 days of the month.

# Make sure TXNDATE is a datetime column before using the .dt accessor
df['TXNDATE'] = pd.to_datetime(df['TXNDATE'])
txn_dt = df['TXNDATE'].dt

# Position of the day inside its month, and the length of that month
df['DayOfMonth'] = txn_dt.day
df['DaysInMonth'] = txn_dt.days_in_month

# Days 1..5 of the month
df['IsFirst5Days'] = df['DayOfMonth'].le(5).astype(int)

# The final five days, whatever the month length
df['IsLast5Days'] = df['DayOfMonth'].gt(df['DaysInMonth'] - 5).astype(int)

In [None]:
# Load the inflation dataset; it is joined to the transactions on Year and
# Month below and is expected to provide an 'NCPI_Index' column.
inflation = pd.read_csv("inflation.csv")

In [None]:
# Cast the merge keys to integers so they match the integer Year/Month
# columns derived from TXNDATE.
inflation['Month'] = inflation['Month'].astype(int)
inflation['Year'] = inflation['Year'].astype(int)

In [None]:
# Merge the inflation data into the main dataframe.
# how='left' keeps every transaction row even when no inflation record exists.
# validate='many_to_one' raises a MergeError if inflation.csv contains more
# than one row for a (Year, Month) pair, which would otherwise silently
# duplicate transaction rows.
df = df.merge(inflation, on=['Year', 'Month'], how='left', validate='many_to_one')

In [None]:
# Check for missing inflation values: rows whose (Year, Month) had no match
# in the inflation table have NaN in NCPI_Index after the left merge.
missing_inflation = df[df['NCPI_Index'].isna()]
print(f"Rows with missing inflation data: {len(missing_inflation)}")

In [None]:
# Inspect the frame after the inflation columns were merged in.
df.head(500)

# Preprocessing

### 1. Date Parsing & Sorting

In [None]:
# Make sure TXNDATE is datetime, then order rows by branch and date.
# NOTE(review): TXNDATE was already converted to datetime in the feature
# engineering section, so this conversion looks redundant - confirm.
df["TXNDATE"] = pd.to_datetime(df["TXNDATE"], format="%d/%m/%Y")
df = df.sort_values(by=["BRANCHID", "TXNDATE"])

### 2. Categorical Encoding

In [None]:
# Integer-encode the categorical columns on a copy; `df` keeps the original
# labels. Each column gets its own freshly fitted encoder.
df_ = df.copy()
categorical_cols = ["BRANCH", "DISTRICT", "PROVINCE", "CODE"]
for col in categorical_cols:
    encoder = LabelEncoder()
    df_[col] = encoder.fit_transform(df[col])

In [None]:
# Original frame: categorical columns still hold their raw labels.
df.head(500)

In [None]:
# Encoded copy: BRANCH, DISTRICT, PROVINCE and CODE are now integer codes.
df_.head(500)

### 3. Handle Missing Values 

In [None]:
# (rows, columns) of the original frame.
df.shape

In [None]:
# (rows, columns) of the encoded copy, before rows with missing values are dropped.
df_.shape

In [None]:
# Columns that are NaN at the start of each branch's history because they
# need earlier rows (lags, rolling windows, diffs). The names are generated in
# the same order the feature-engineering cells created them.
_targets = ("Credit", "Debit", "Customers")

_lag_cols = [f"Lag{lag}_{t}" for lag in (1, 2, 3, 7, 14) for t in _targets]

_rolling7_cols = [
    f"Rolling7_{t}{stat}"
    for t in ("Debit", "Customers", "Credit")
    for stat in ("Mean", "Std")
]

_rolling3_30_cols = [
    f"Rolling{window}_{name}"
    for window in (3, 30)
    for name in ("CreditMean", "DebitMean", "CustomersMean",
                 "CustomersStd", "CreditStd", "DebitStd")
]

_combined_cols = [
    f"Lag1_{t}_{kind}{window}"
    for t in _targets
    for window in (3, 7)
    for kind in ("vs_Mean", "Ratio_Mean", "ZScore")
]

_delta_cols = ["Delta_Debit", "Delta_Credit", "Delta_Customers", "Debit_ZScore", "Credit_ZScore"]

rolling_lag_cols = _lag_cols + _rolling7_cols + _rolling3_30_cols + _combined_cols + _delta_cols

In [None]:
# Drop every row that lacks any lag / rolling / delta feature, i.e. the
# warm-up rows at the start of each branch's history.
df_cleaned = df_.dropna(subset=rolling_lag_cols)

In [None]:
# (rows, columns) after dropping incomplete rows; compare with df_.shape above.
df_cleaned.shape

In [None]:
# First rows of the cleaned, fully populated frame.
df_cleaned.head()

# Data Split

In [None]:
# Ensure TXNDATE is datetime.
# assign() returns a new frame, which avoids pandas' SettingWithCopyWarning
# that in-place column assignment triggers on the frame returned by dropna().
df_cleaned = df_cleaned.assign(TXNDATE=pd.to_datetime(df_cleaned["TXNDATE"]))

# Everything before this date is training data; the rest is the hold-out set.
split_date = pd.to_datetime("2025-01-01")

# Training rows are ordered chronologically (ties broken by branch).
# TimeSeriesSplit, used for every search below, cuts folds by row position,
# so with the previous branch-major ordering each fold validated on *other
# branches* rather than on *later dates*.
train_df = (
    df_cleaned[df_cleaned["TXNDATE"] < split_date]
    .sort_values(["TXNDATE", "BRANCHID"], kind="stable")
)
# The test set keeps its original branch/date order; the index is preserved
# so later cells can still look rows up in df_cleaned.
test_df = df_cleaned[df_cleaned["TXNDATE"] >= split_date].copy()

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

In [None]:
# Define target: daily net cash flow (credit minus debit) per branch.
target_col = "NetCashFlow"

In [None]:
# Distribution of the "NetCashFlow" target (count, mean, std, quartiles);
# useful for judging the scale of the RMSE/MAE values reported later.
print(df_cleaned["NetCashFlow"].describe())

In [None]:
# Identifier and target-derived columns that must never be predictors.
_id_and_target_cols = ["TXNDATE", "BRANCHID", "NetCashFlow", "NetCashFlow_Per_Customer", "TOTALTXNAMOUNT"]

# Same-day quantities. NetCashFlow is defined as
# TOTALCREDITAMOUNT - TOTALDEBITAMOUNT, so keeping these two columns (or
# anything computed from the current day's values) lets the model rebuild the
# target exactly - target leakage. None of them is known before the day ends.
# Only the lagged / rolling versions, which use earlier rows, stay as features.
# NOTE(review): CUSTOMER is excluded on the assumption that the day's customer
# count is not known at forecast time - confirm.
_same_day_cols = [
    "TOTALCREDITAMOUNT", "TOTALDEBITAMOUNT", "CUSTOMER",
    "Debit_to_Credit_Ratio", "CashOut_Per_Customer", "CashIn_Per_Customer",
    "Delta_Debit", "Delta_Credit", "Delta_Customers",
    "Debit_ZScore", "Credit_ZScore",
]

drop_cols = _id_and_target_cols + _same_day_cols
_excluded = set(drop_cols)
feature_cols = [col for col in train_df.columns if col not in _excluded]

In [None]:
# Feature matrices and target vectors for the two date-based partitions.
X_train1, y_train = train_df[feature_cols], train_df[target_col]
X_test1, y_test = test_df[feature_cols], test_df[target_col]

## Correlation Check

In [None]:
# Absolute pairwise correlations between the training features.
cor_matrix = X_train1.corr().abs()

# Collect every pair below the diagonal whose |correlation| exceeds 0.8.
# np.where walks the mask row by row, so pairs come out in the same order as
# a nested "for i / for j < i" loop would produce them.
corr_names = cor_matrix.columns
corr_values = cor_matrix.to_numpy()
row_idx, col_idx = np.where(np.tril(corr_values > 0.8, k=-1))
high_corr_pairs = [
    (corr_names[r], corr_names[c], corr_values[r, c])
    for r, c in zip(row_idx, col_idx)
]

# Strongest correlations first
high_corr_df = pd.DataFrame(
    high_corr_pairs, columns=['Feature_1', 'Feature_2', 'Correlation']
).sort_values(by='Correlation', ascending=False)

print("Highly Correlated Feature Pairs (|correlation| > 0.8):")
print(high_corr_df)


In [None]:
# Absolute correlation of every feature with the target (training data only).
feature_target_corr = X_train1.corrwith(y_train).abs()

# For each highly correlated pair keep the feature that tracks the target
# more closely and drop the other. Pairs already resolved by an earlier drop
# are skipped.
to_drop = []
for first, second, _ in high_corr_pairs:
    if not (first in to_drop or second in to_drop):
        weaker = first if feature_target_corr[first] < feature_target_corr[second] else second
        to_drop.append(weaker)

print("\nFeatures to drop due to high multicollinearity:")
print(to_drop)

# Drop from training and test sets.
# The same columns are removed from both so the two matrices stay aligned.
X_train = X_train1.drop(columns=to_drop)
X_test = X_test1.drop(columns=to_drop)


## Fitting and Testing the Basic Model

In [None]:
# Baseline: XGBoost with 100 trees, a fixed seed and otherwise default settings.
model = xgb.XGBRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
model

In [None]:
# Score the baseline on the hold-out set.
y_pred = model.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))  # penalises large errors
mae = mean_absolute_error(y_test, y_pred)           # typical absolute error
r2 = r2_score(y_test, y_pred)                       # share of variance explained

print(
    "Basic Model Evaluation on Test Set:",
    f"  RMSE: {rmse:,.2f}",
    f"  MAE : {mae:,.2f}",
    f"  R²  : {r2:.4f}",
    sep="\n",
)

# Randomized search using all features

In [None]:
# Discrete candidate values; the randomized search samples combinations of these.
param_grid = dict(
    n_estimators=[100, 950, 1200],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 5, 7],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    reg_alpha=[0, 0.1, 1],
    reg_lambda=[1, 5, 10],
)

# Expanding-window cross-validation, applied to the training set only
tscv = TimeSeriesSplit(n_splits=5)

# Estimator whose hyperparameters are tuned
xgb_model = XGBRegressor(objective='reg:squarederror', random_state=42)

# 50 sampled combinations, ranked by cross-validated RMSE
search_options = dict(
    n_iter=50,
    scoring='neg_root_mean_squared_error',
    cv=tscv,
    verbose=1,
    n_jobs=-1,
    random_state=42,
)
random_search = RandomizedSearchCV(xgb_model, param_grid, **search_options)
random_search.fit(X_train, y_train)

# The search refits the best configuration on the full training set
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(
    "\n Final Test Set Results:",
    f" RMSE: {rmse:,.2f}",
    f" MAE : {mae:,.2f}",
    f" R²  : {r2:.4f}",
    sep="\n",
)

print("\n Best Parameters from CV:")
print(random_search.best_params_)

### Improved randomised search - All features

In [None]:
# Continuous / integer sampling distributions for the randomized search.
# scipy's uniform(loc, scale) samples from [loc, loc + scale];
# randint(low, high) samples integers from [low, high).
param_distributions = {
    'n_estimators': randint(800, 1500),
    'learning_rate': uniform(0.01, 0.1),      # 0.01 .. 0.11
    'max_depth': randint(3, 8),               # 3 .. 7
    'subsample': uniform(0.6, 0.4),           # 0.6 .. 1.0
    'colsample_bytree': uniform(0.6, 0.4),    # 0.6 .. 1.0
    'reg_alpha': uniform(0, 1),               # 0 .. 1
    'reg_lambda': uniform(1, 9)               # 1 .. 10
}

# TimeSeriesSplit (on training set only)
tscv = TimeSeriesSplit(n_splits=5)

# Model
xgb_model = XGBRegressor(objective='reg:squarederror', random_state=42)

# Randomized Search.
# Fix: the search now samples from `param_distributions` defined above. It
# previously received the stale `param_grid` from the earlier cell, so this
# "improved" search simply repeated the previous one.
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_distributions,
    n_iter=50,
    scoring='neg_root_mean_squared_error',
    cv=tscv,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

random_search.fit(X_train, y_train)

# Best configuration, refitted on the full training set by the search
best_model = random_search.best_estimator_

y_pred = best_model.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"\n Final Test Set Results:")
print(f" RMSE: {rmse:,.2f}")
print(f" MAE : {mae:,.2f}")
print(f" R²  : {r2:.4f}")

print("\n Best Parameters from CV:")
print(random_search.best_params_)

## Hyperparameter Tuning with GridSearchCV (Considering all features)

In [None]:
# Narrow grid around the best region found by the randomized searches.
param_grid = dict(
    n_estimators=[1150, 1200, 1250],
    max_depth=[4, 5, 6],
    learning_rate=[0.05, 0.07],
    subsample=[0.8],
    colsample_bytree=[1.0],
    reg_alpha=[0.1, 1],
    reg_lambda=[1, 5],
)

# Expanding-window cross-validation on the training set
tscv = TimeSeriesSplit(n_splits=5)

model = xgb.XGBRegressor(random_state=42)

# Exhaustive search: every combination is scored by cross-validated RMSE
grid_options = dict(
    scoring='neg_root_mean_squared_error',
    cv=tscv,
    verbose=1,
    n_jobs=-1,
)
grid_search = GridSearchCV(model, param_grid, **grid_options)
grid_search.fit(X_train, y_train)

# Winner, refitted on the full training set
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)

rmse_best = np.sqrt(mean_squared_error(y_test, y_pred_best))
mae_best = mean_absolute_error(y_test, y_pred_best)
r2_best = r2_score(y_test, y_pred_best)

print(
    "\n Final Evaluation on Test Set:",
    f"   RMSE: {rmse_best:,.2f}",
    f"   MAE : {mae_best:,.2f}",
    f"   R²  : {r2_best:.4f}",
    sep="\n",
)

print("Best Parameters:", grid_search.best_params_)

### Optimal Model - All Features

In [None]:
# Final configuration for the all-features model. The grid holds a single
# combination, so the search only cross-validates and refits that one setting.
chosen_values = dict(
    n_estimators=1250,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=1.0,
    reg_alpha=1,
    reg_lambda=1,
)
param_grid = {name: [value] for name, value in chosen_values.items()}

# Expanding-window cross-validation on the training set
tscv = TimeSeriesSplit(n_splits=5)

model = xgb.XGBRegressor(random_state=42)

grid_options = dict(
    scoring='neg_root_mean_squared_error',
    cv=tscv,
    verbose=1,
    n_jobs=-1,
)
grid_search = GridSearchCV(model, param_grid, **grid_options)
grid_search.fit(X_train, y_train)

# Model refitted on the full training set
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)

rmse_best = np.sqrt(mean_squared_error(y_test, y_pred_best))
mae_best = mean_absolute_error(y_test, y_pred_best)
r2_best = r2_score(y_test, y_pred_best)

print(
    "\n Final Evaluation on Test Set:",
    f"   RMSE: {rmse_best:,.2f}",
    f"   MAE : {mae_best:,.2f}",
    f"   R²  : {r2_best:.4f}",
    sep="\n",
)

print("Best Parameters:", grid_search.best_params_)

# Feature Selection

In [None]:
# Refit the 100-tree baseline; its gain-based importances drive the feature
# selection in the following cells.
model = xgb.XGBRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Hold-out performance of the baseline, for reference
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(
    "Basic Model Evaluation on Test Set:",
    f"  RMSE: {rmse:,.2f}",
    f"  MAE : {mae:,.2f}",
    f"  R²  : {r2:.4f}",
    sep="\n",
)

In [None]:
# Plot feature importance (average gain per split) for the top features.
# The number of features shown and the title come from one constant, so they
# can no longer disagree (the title used to say "Top 20" while 30 were drawn).
top_n_plotted = 30
xgb.plot_importance(model, max_num_features=top_n_plotted, importance_type='gain', height=0.5)
plt.title(f"Top {top_n_plotted} Feature Importances")
plt.tight_layout()
plt.show()

In [None]:
# Gain per feature, as reported by the fitted booster.
importance_dict = model.get_booster().get_score(importance_type='gain')

# (feature, gain) pairs, highest gain first
important_features = sorted(
    importance_dict.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# Feature names only, in the same order
top_features = [name for name, _ in important_features]

In [None]:
# Gain per feature, ranked from most to least important.
importance_dict = model.get_booster().get_score(importance_type='gain')
important_features = sorted(
    importance_dict.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# Print each feature with its raw gain
print("Feature Importances (by Gain):")
for name, gain in important_features:
    print(f"{name}: {gain:.4f}")

In [None]:
# Express each feature's gain as a share of the total gain (shares sum to 1).
total_gain = sum(importance_dict.values())
normalized_importance = [
    (name, gain / total_gain) for name, gain in important_features
]

print("Normalized Feature Importances (by Gain):")
for name, share in normalized_importance:
    print(f"{name}: {share:.4%}")

In [None]:
# Feature selection at a 0.7% share-of-gain threshold.
normalized_scores = dict(normalized_importance)

# Keep features whose share of total gain is at least 0.7%
selected_features = [
    name for name, share in normalized_scores.items() if share >= 0.007
]

X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

# Refit the 100-tree baseline on the reduced feature set
model_selected = xgb.XGBRegressor(n_estimators=100, random_state=42)
model_selected.fit(X_train_selected, y_train)

y_pred_selected = model_selected.predict(X_test_selected)
rmse_sel = np.sqrt(mean_squared_error(y_test, y_pred_selected))
mae_sel = mean_absolute_error(y_test, y_pred_selected)
r2_sel = r2_score(y_test, y_pred_selected)

print(
    "\n Evaluation after Feature Selection:",
    f"   RMSE: {rmse_sel:,.2f}",
    f"   MAE : {mae_sel:,.2f}",
    f"   R²  : {r2_sel:.4f}",
    sep="\n",
)

In [None]:
# Feature selection at a 0.5% share-of-gain threshold. This overwrites the
# selection from the previous cell and is the one used by later sections.
normalized_scores = dict(normalized_importance)

# Keep features whose share of total gain is at least 0.5%
selected_features = [
    name for name, share in normalized_scores.items() if share >= 0.005
]

X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

# Refit the 100-tree baseline on the reduced feature set
model_selected = xgb.XGBRegressor(n_estimators=100, random_state=42)
model_selected.fit(X_train_selected, y_train)

y_pred_selected = model_selected.predict(X_test_selected)
rmse_sel = np.sqrt(mean_squared_error(y_test, y_pred_selected))
mae_sel = mean_absolute_error(y_test, y_pred_selected)
r2_sel = r2_score(y_test, y_pred_selected)

print(
    "\n Evaluation after Feature Selection:",
    f"   RMSE: {rmse_sel:,.2f}",
    f"   MAE : {mae_sel:,.2f}",
    f"   R²  : {r2_sel:.4f}",
    sep="\n",
)

In [None]:
# List the features kept by the most recent selection cell, one per line.
print("\nSelected Features:")
for f in selected_features:
    print(f)


## Hyperparameter Tuning with RandomizedSearchCV (Only considering selected features)

In [None]:
# Candidate values for the randomized search on the selected features.
param_grid = dict(
    n_estimators=[300, 750, 1200],
    learning_rate=[0.075, 0.1],
    max_depth=[5, 7],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
    reg_alpha=[0, 0.1, 1],
    reg_lambda=[1, 5, 10],
)

# Time-aware cross-validation on the training set
tscv = TimeSeriesSplit(n_splits=5)

# XGBoost regressor with fixed objective and random seed
xgb_model = XGBRegressor(objective='reg:squarederror', random_state=42)

# 20 sampled combinations, ranked by cross-validated RMSE
search_options = dict(
    n_iter=20,
    scoring='neg_root_mean_squared_error',
    cv=tscv,
    verbose=1,
    n_jobs=-1,
    random_state=42,
)
random_search = RandomizedSearchCV(xgb_model, param_grid, **search_options)
random_search.fit(X_train_selected, y_train)

# Best configuration, refitted on the full training set
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test_selected)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(
    "\n Final Test Set Results:",
    f" RMSE: {rmse:,.2f}",
    f" MAE : {mae:,.2f}",
    f" R²  : {r2:.4f}",
    sep="\n",
)

print("\n Best Parameters from CV:")
print(random_search.best_params_)

In [None]:
# Expanded hyperparameter grid: more trees than the previous search.
# Fix: learning_rate 0.85 replaced by 0.085. With 1500-2000 boosting rounds a
# step size of 0.85 is far outside the range explored anywhere else in this
# notebook (0.01-0.15) and wasted half of the sampled candidates; the
# neighbouring grids (0.075 / 0.1) suggest a dropped zero.
param_grid = {
    'n_estimators': [1500, 1750, 2000],
    'learning_rate': [0.085, 0.1],
    'max_depth': [5, 7],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'reg_alpha': [0, 0.1, 1],
    'reg_lambda': [1, 5, 10]
}

# Time-aware cross-validation
tscv = TimeSeriesSplit(n_splits=5)

# XGBoost regressor with fixed objective and random seed
xgb_model = XGBRegressor(objective='reg:squarederror', random_state=42)

# Randomized search for hyperparameter tuning (50 sampled combinations)
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_grid,
    n_iter=50,
    scoring='neg_root_mean_squared_error',
    cv=tscv,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

random_search.fit(X_train_selected, y_train)

# Best configuration, refitted on the full training set by the search
best_model = random_search.best_estimator_

y_pred = best_model.predict(X_test_selected)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"\n Final Test Set Results:")
print(f" RMSE: {rmse:,.2f}")
print(f" MAE : {mae:,.2f}")
print(f" R²  : {r2:.4f}")

print("\n Best Parameters from CV:")
print(random_search.best_params_)

### Hyperparameter Tuning with GridSearchCV (Only considering selected features)

In [None]:
# Narrow grid around the best region found by the randomized searches
# (selected features only).
param_grid = dict(
    n_estimators=[1450, 1500, 1550],
    max_depth=[5, 7],
    learning_rate=[0.05, 0.1, 0.15],
    subsample=[1.0],
    colsample_bytree=[1.0],
    reg_alpha=[0.1, 1],
    reg_lambda=[5, 10],
)

# Time-aware cross-validation on the training set
tscv = TimeSeriesSplit(n_splits=5)

# Exhaustive search: every combination is scored by cross-validated RMSE
model = xgb.XGBRegressor(random_state=42)
grid_options = dict(
    scoring='neg_root_mean_squared_error',
    cv=tscv,
    verbose=1,
    n_jobs=-1,
)
grid_search = GridSearchCV(model, param_grid, **grid_options)
grid_search.fit(X_train_selected, y_train)

print("Best Parameters:", grid_search.best_params_)

# Winner, refitted on the full training set
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test_selected)

rmse_best = np.sqrt(mean_squared_error(y_test, y_pred_best))
mae_best = mean_absolute_error(y_test, y_pred_best)
r2_best = r2_score(y_test, y_pred_best)

print(
    "\n Final Evaluation on Test Set:",
    f"   RMSE: {rmse_best:,.2f}",
    f"   MAE : {mae_best:,.2f}",
    f"   R²  : {r2_best:.4f}",
    sep="\n",
)

### Optimal Model - Selected features

In [None]:
# Final hyperparameters for the selected-features model.
params = dict(
    n_estimators=1550,
    max_depth=5,
    learning_rate=0.1,
    subsample=1.0,
    colsample_bytree=1.0,
    reg_alpha=0.1,
    reg_lambda=10,
    random_state=42,
    n_jobs=-1,
)

# Build the model from those parameters and train it on the selected features
best_model = XGBRegressor(**params)
best_model.fit(X_train_selected, y_train)

# Hold-out predictions and metrics
y_pred_best = best_model.predict(X_test_selected)
rmse = np.sqrt(mean_squared_error(y_test, y_pred_best))
mae = mean_absolute_error(y_test, y_pred_best)
r2 = r2_score(y_test, y_pred_best)

print(
    "Final XGBoost Model Results:",
    f" RMSE: {rmse:,.2f}",
    f" MAE : {mae:,.2f}",
    f" R²  : {r2:.4f}",
    sep="\n",
)

### Saving the optimal XGB model

In [None]:
# Save the XGBoost model
# NOTE(review): pickle stores the Python object as-is; consider XGBoost's
# native save_model() if the file must be loaded by another environment.
with open("xgb_best_model.pkl", "wb") as f:
    pickle.dump(best_model, f)

# Save selected feature names, in training column order, so inference code can
# rebuild the input matrix with the same columns.
with open("xgb_selected_features.pkl", "wb") as f:
    pickle.dump(X_train_selected.columns.tolist(), f)

print("XGBoost model and selected features saved successfully.")


In [None]:
# Load model
# SECURITY: pickle.load can execute arbitrary code; only load files that this
# notebook (or another trusted source) produced.
with open("xgb_best_model.pkl", "rb") as f:
    best_model = pickle.load(f)

# Load selected features
with open("xgb_selected_features.pkl", "rb") as f:
    selected_features = pickle.load(f)

# Prepare test data with the saved column list
X_test_selected = X_test[selected_features]
y_pred = best_model.predict(X_test_selected)

# Evaluate the reloaded model on the hold-out set
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Final XGBoost Model Results:")
print(f" RMSE: {rmse:,.2f}")
print(f" MAE : {mae:,.2f}")
print(f" R²  : {r2:.4f}")


### Branchwise MAE

In [None]:
# Original BRANCHID values for the test set, looked up by row index
# (BRANCHID is not a model feature, so it is not in X_test_selected).
branch_ids_test = df_cleaned.loc[X_test_selected.index, "BRANCHID"].values

# DataFrame with predictions and actuals
results_df = X_test_selected.copy()
results_df["Actual"] = y_test.values
results_df["Predicted"] = y_pred_best
results_df["BRANCHID"] = branch_ids_test

# Absolute error per row
results_df["AbsError"] = (results_df["Actual"] - results_df["Predicted"]).abs()

# Per-branch metrics: MAE, average actual, and % MAE.
# Fix: NetCashFlow is signed, so its branch average can be negative or zero.
# Dividing by the raw mean produced negative percentages (which sort to the
# bottom of the "worst branches" ranking) or +/-inf. The magnitude of the mean
# is used instead, and a zero mean yields NaN rather than inf.
relative_mae = results_df.groupby("BRANCHID").agg(
    MAE=("AbsError", "mean"),
    AvgCashFlow=("Actual", "mean")
).assign(
    MAE_Percent=lambda x: 100 * x["MAE"] / x["AvgCashFlow"].abs().replace(0, np.nan)
).reset_index()

# Display Top 10 branches with highest % MAE
print("\n Branch-wise MAE (% of Avg Daily Cash Flow) – Top 10:")
print(relative_mae.sort_values("MAE_Percent", ascending=False).head(10))

In [None]:
# Ten branches with the largest mean absolute error.
# Fix: the per-branch table built in the previous cell is `relative_mae`;
# `branch_mae` was never defined and raised a NameError on a fresh run.
top_mae_branches = relative_mae.sort_values("MAE", ascending=False).head(10)

plt.figure(figsize=(10, 5))
# Branch IDs are cast to strings so the x axis is categorical, not numeric
plt.bar(top_mae_branches["BRANCHID"].astype(str), top_mae_branches["MAE"], color="steelblue")
plt.xlabel("Branch ID")
plt.ylabel("MAE")
plt.title("Top 10 Branches by MAE")
plt.xticks(rotation=45)
plt.grid(axis='y')
plt.tight_layout()
plt.show()



## Prediction Plots

In [None]:
# Attach actual and predicted values to the test rows, then plot branch 1.
# Make sure TXNDATE is datetime and give the frame a fresh 0..n-1 index so it
# lines up positionally with the prediction array.
test_df = test_df.copy()
test_df['TXNDATE'] = pd.to_datetime(test_df['TXNDATE'])
test_df = test_df.reset_index(drop=True)

# Same positional index for the true and predicted values
y_test_series = pd.Series(y_test).reset_index(drop=True)
y_pred_series = pd.Series(y_pred_best).reset_index(drop=True)
test_df['Actual'] = y_test_series
test_df['Predicted'] = y_pred_series

# Rows of BranchID = 1, in date order
branch_df = test_df[test_df['BRANCHID'] == 1].sort_values('TXNDATE')

# Actual (solid black) against predicted (dashed blue)
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(branch_df["TXNDATE"], branch_df["Actual"], label="Actual", color="black")
ax.plot(branch_df["TXNDATE"], branch_df["Predicted"], label="Predicted", color="dodgerblue", linestyle='--')
ax.set_title("Actual vs Predicted NetCashFlow (Test Set) — BranchID = 1")
ax.set_xlabel("Date")
ax.set_ylabel("NetCashFlow")
ax.legend()
ax.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Attach actual and predicted values to the test rows, then plot branch 2.
# Make sure TXNDATE is datetime and give the frame a fresh 0..n-1 index so it
# lines up positionally with the prediction array.
test_df = test_df.copy()
test_df['TXNDATE'] = pd.to_datetime(test_df['TXNDATE'])
test_df = test_df.reset_index(drop=True)

# Same positional index for the true and predicted values
y_test_series = pd.Series(y_test).reset_index(drop=True)
y_pred_series = pd.Series(y_pred_best).reset_index(drop=True)
test_df['Actual'] = y_test_series
test_df['Predicted'] = y_pred_series

# Rows of BranchID = 2, in date order
branch_df = test_df[test_df['BRANCHID'] == 2].sort_values('TXNDATE')

# Actual (solid black) against predicted (dashed blue)
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(branch_df["TXNDATE"], branch_df["Actual"], label="Actual", color="black")
ax.plot(branch_df["TXNDATE"], branch_df["Predicted"], label="Predicted", color="dodgerblue", linestyle='--')
ax.set_title("Actual vs Predicted NetCashFlow (Test Set) — BranchID = 2")
ax.set_xlabel("Date")
ax.set_ylabel("NetCashFlow")
ax.legend()
ax.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Attach actual and predicted values to the test rows, then plot branch 56.
# Make sure TXNDATE is datetime and give the frame a fresh 0..n-1 index so it
# lines up positionally with the prediction array.
test_df = test_df.copy()
test_df['TXNDATE'] = pd.to_datetime(test_df['TXNDATE'])
test_df = test_df.reset_index(drop=True)

# Same positional index for the true and predicted values
y_test_series = pd.Series(y_test).reset_index(drop=True)
y_pred_series = pd.Series(y_pred_best).reset_index(drop=True)
test_df['Actual'] = y_test_series
test_df['Predicted'] = y_pred_series

# Rows of BranchID = 56, in date order
branch_df = test_df[test_df['BRANCHID'] == 56].sort_values('TXNDATE')

# Actual (solid black) against predicted (dashed blue)
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(branch_df["TXNDATE"], branch_df["Actual"], label="Actual", color="black")
ax.plot(branch_df["TXNDATE"], branch_df["Predicted"], label="Predicted", color="dodgerblue", linestyle='--')
ax.set_title("Actual vs Predicted NetCashFlow (Test Set) — BranchID = 56")
ax.set_xlabel("Date")
ax.set_ylabel("NetCashFlow")
ax.legend()
ax.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Attach actual and predicted values to the test rows, then plot branch 100.
# Make sure TXNDATE is datetime and give the frame a fresh 0..n-1 index so it
# lines up positionally with the prediction array.
test_df = test_df.copy()
test_df['TXNDATE'] = pd.to_datetime(test_df['TXNDATE'])
test_df = test_df.reset_index(drop=True)

# Same positional index for the true and predicted values
y_test_series = pd.Series(y_test).reset_index(drop=True)
y_pred_series = pd.Series(y_pred_best).reset_index(drop=True)
test_df['Actual'] = y_test_series
test_df['Predicted'] = y_pred_series

# Rows of BranchID = 100, in date order
branch_df = test_df[test_df['BRANCHID'] == 100].sort_values('TXNDATE')

# Actual (solid black) against predicted (dashed blue)
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(branch_df["TXNDATE"], branch_df["Actual"], label="Actual", color="black")
ax.plot(branch_df["TXNDATE"], branch_df["Predicted"], label="Predicted", color="dodgerblue", linestyle='--')
ax.set_title("Actual vs Predicted NetCashFlow (Test Set) — BranchID = 100")
ax.set_xlabel("Date")
ax.set_ylabel("NetCashFlow")
ax.legend()
ax.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# LightGBM

### Basic LightGBM (All Features)

In [None]:
# Baseline: LightGBM regressor with library defaults (only the seed is fixed),
# fitted on the full feature set and scored on the held-out test split.
lgb_model = LGBMRegressor(random_state=42).fit(X_train, y_train)
y_pred_lgb = lgb_model.predict(X_test)

# Test-set error metrics.
rmse = np.sqrt(mean_squared_error(y_test, y_pred_lgb))
mae = mean_absolute_error(y_test, y_pred_lgb)
r2 = r2_score(y_test, y_pred_lgb)

print(
    "\n LightGBM Basic Model Results:",
    f" RMSE: {rmse:,.2f}",
    f" MAE : {mae:,.2f}",
    f" R²  : {r2:.4f}",
    sep="\n",
)

### RandomizedSearch using all features

In [None]:
# Search space for RandomizedSearchCV. scipy's uniform(loc, scale) samples
# from [loc, loc + scale]; randint(low, high) samples integers from [low, high).
# NOTE(review): LightGBM only applies row subsampling when subsample_freq > 0;
# the estimator below leaves it at its default, so 'subsample' may have no
# effect in this search — confirm.
param_dist = {
    'num_leaves': randint(20, 150),
    'max_depth': randint(3, 15),
    'learning_rate': uniform(0.005, 0.05),
    'n_estimators': randint(300, 1000),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
    'reg_alpha': uniform(0, 1.0),
    'reg_lambda': uniform(0, 1.0),
    'min_child_samples': randint(10, 100),
    'min_split_gain': uniform(0.0, 0.5)
}

# Base estimator. Deliberately NOT named `lgb`: that name is the
# `import lightgbm as lgb` module alias, and rebinding it to an estimator
# breaks later cells that call lgb.LGBMRegressor / lgb.Dataset / lgb.train.
lgb_estimator = LGBMRegressor(random_state=42)

# 50 sampled configurations, scored by negated RMSE.
# `tscv` is the cross-validation splitter defined in an earlier cell.
random_search = RandomizedSearchCV(
    estimator=lgb_estimator,
    param_distributions=param_dist,
    n_iter=50,
    scoring='neg_root_mean_squared_error',
    cv=tscv,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train, y_train)

best_lgb_final = random_search.best_estimator_

# Score the best configuration on the held-out test split.
y_pred_lgb_final = best_lgb_final.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred_lgb_final))
mae = mean_absolute_error(y_test, y_pred_lgb_final)
r2 = r2_score(y_test, y_pred_lgb_final)

print(f"\n Final Optimized LightGBM Results:")
print(f" RMSE: {rmse:,.2f}")
print(f" MAE : {mae:,.2f}")
print(f" R²  : {r2:.4f}")
print("\n Best Parameters Found:")
print(random_search.best_params_)

### Tune num_leaves and max_depth

In [None]:
# Step 1: tune tree shape only (num_leaves x max_depth); -1 means no depth cap.
param_grid = {
    'num_leaves': [50, 60, 70],
    'max_depth': [5, 7, 10, -1]
}

# Expanding-window CV so each validation fold comes after its training rows.
tscv = TimeSeriesSplit(n_splits=5)

# Named `lgb_estimator` rather than `lgb` so the `import lightgbm as lgb`
# module alias stays intact for later cells (lgb.Dataset, lgb.train, ...).
lgb_estimator = LGBMRegressor(random_state=42)

grid_search = GridSearchCV(
    estimator=lgb_estimator,
    param_grid=param_grid,
    cv=tscv,
    scoring='neg_root_mean_squared_error',
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

best_lgb_1 = grid_search.best_estimator_

# Score the best configuration on the held-out test split.
y_pred_lgb_1 = best_lgb_1.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred_lgb_1))
mae = mean_absolute_error(y_test, y_pred_lgb_1)
r2 = r2_score(y_test, y_pred_lgb_1)

print(f"\n Step 1 Optimized LightGBM:")
print(f" RMSE: {rmse:,.2f}")
print(f" MAE : {mae:,.2f}")
print(f" R²  : {r2:.4f}")
print("\n Best Params:", grid_search.best_params_)

### Tune learning_rate & n_estimators

In [None]:
# Step 2: tune learning_rate x n_estimators with the tree shape held fixed at
# the hard-coded values below (taken from the previous step).
# Named `lgb_estimator` rather than `lgb` so the `import lightgbm as lgb`
# module alias stays intact for later cells (lgb.Dataset, lgb.train, ...).
lgb_estimator = LGBMRegressor(
    random_state=42,
    max_depth=-1,
    num_leaves=70
)

param_grid = {
    'learning_rate': [0.04, 0.05, 0.06],
    'n_estimators': [800, 900, 1000]
}

grid_search = GridSearchCV(
    estimator=lgb_estimator,
    param_grid=param_grid,
    cv=tscv,
    scoring='neg_root_mean_squared_error',
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

best_lgb_2 = grid_search.best_estimator_

# Score the best configuration on the held-out test split.
y_pred_lgb_2 = best_lgb_2.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred_lgb_2))
mae = mean_absolute_error(y_test, y_pred_lgb_2)
r2 = r2_score(y_test, y_pred_lgb_2)

print(f"\n Step 2 Optimized LightGBM:")
print(f" RMSE: {rmse:,.2f}")
print(f" MAE : {mae:,.2f}")
print(f" R²  : {r2:.4f}")
print("\n Best Params:", grid_search.best_params_)

### Tune subsample and colsample_bytree

In [None]:
# Step 3: tune row / column subsampling with the other settings held fixed.
# NOTE(review): learning_rate=0.01 and n_estimators=700 are not among the
# values searched in the step-2 grid ([0.04, 0.05, 0.06] / [800, 900, 1000]);
# confirm these are the intended "previous best" settings.
#
# subsample_freq=1 is required for the 'subsample' grid to mean anything:
# LightGBM performs bagging only when the bagging frequency is non-zero, and
# the sklearn wrapper defaults it to 0, which makes every 'subsample' value
# train the same model.
#
# Named `lgb_estimator` rather than `lgb` so the `import lightgbm as lgb`
# module alias stays intact for later cells (lgb.Dataset, lgb.train, ...).
lgb_estimator = LGBMRegressor(
    random_state=42,
    max_depth=-1,
    num_leaves=70,
    learning_rate=0.01,
    n_estimators=700,
    subsample_freq=1
)

param_grid = {
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [ 0.8, 0.9, 1.0]
}

grid_search = GridSearchCV(
    estimator=lgb_estimator,
    param_grid=param_grid,
    cv=tscv,
    scoring='neg_root_mean_squared_error',
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

best_lgb_3 = grid_search.best_estimator_

# Score the best configuration on the held-out test split.
y_pred_lgb_3 = best_lgb_3.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred_lgb_3))
mae = mean_absolute_error(y_test, y_pred_lgb_3)
r2 = r2_score(y_test, y_pred_lgb_3)

print(f"\n Step 3 Optimized LightGBM:")
print(f" RMSE: {rmse:,.2f}")
print(f" MAE : {mae:,.2f}")
print(f" R²  : {r2:.4f}")
print("\n Best Params:", grid_search.best_params_)

### With found variables

In [None]:
# Single-candidate "grid": GridSearchCV is used here only to cross-validate
# and refit one fixed configuration.
param_grid = {
    'num_leaves': [70],
    'learning_rate': [0.04],
    'n_estimators': [800],
    'max_depth': [-1]
}

# Named `lgb_estimator` rather than `lgb` so the `import lightgbm as lgb`
# module alias stays intact for later cells (lgb.Dataset, lgb.train, ...).
# NOTE(review): subsample=0.6 is set without subsample_freq; LightGBM only
# bags rows when the bagging frequency is non-zero, so it may be inert — confirm.
lgb_estimator = LGBMRegressor(
    subsample=0.6,
    colsample_bytree=0.9,
    reg_alpha=0.24,
    reg_lambda=0.09,
    min_child_samples=10,
    min_split_gain=0.087,
    random_state=42
)

grid_search = GridSearchCV(
    estimator=lgb_estimator,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=tscv,
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

# Score the refit model on the held-out test split.
y_pred_grid = grid_search.best_estimator_.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred_grid))
mae = mean_absolute_error(y_test, y_pred_grid)
r2 = r2_score(y_test, y_pred_grid)

print(f"\n Final GridSearch LightGBM Results:")
print(f" RMSE: {rmse:,.2f}")
print(f" MAE : {mae:,.2f}")
print(f" R²  : {r2:.4f}")
print("\n Best Parameters:")
print(grid_search.best_params_)

### Optuna Tuning for LightGBM - All features

In [None]:
def objective(trial):
    """Optuna objective: mean validation RMSE over a 5-split time-series CV.

    Samples tree-shape, regularisation and subsampling hyperparameters from
    ``trial`` (learning rate, number of estimators and depth are fixed), fits
    one LightGBM regressor per fold of the module-level ``X_train`` /
    ``y_train`` with early stopping on that fold's validation part, and
    returns the mean of the per-fold RMSE values (lower is better).
    """
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'random_state': 42,
        'learning_rate': 0.01,
        'n_estimators': 700,
        'max_depth': -1,
        'num_leaves': trial.suggest_int('num_leaves', 31, 100),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
        'min_split_gain': trial.suggest_float('min_split_gain', 0.0, 0.5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10)
    }

    splitter = TimeSeriesSplit(n_splits=5)
    rmse_scores = []

    for train_idx, val_idx in splitter.split(X_train):
        X_train_cv, X_val_cv = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_train_cv, y_val_cv = y_train.iloc[train_idx], y_train.iloc[val_idx]

        # Use the imported class directly. Earlier tuning cells rebind the
        # name `lgb` to an estimator instance, after which
        # `lgb.LGBMRegressor` raises AttributeError.
        model = LGBMRegressor(**params)
        model.fit(
            X_train_cv, y_train_cv,
            eval_set=[(X_val_cv, y_val_cv)],
            eval_metric='rmse',
            # Stop after 50 rounds without improvement; log_evaluation(0)
            # silences per-iteration logging.
            callbacks=[early_stopping(50), log_evaluation(0)]
        )

        preds = model.predict(X_val_cv)
        rmse_scores.append(np.sqrt(mean_squared_error(y_val_cv, preds)))

    return np.mean(rmse_scores)


# Start the study. The sampler is seeded so repeated runs of this cell explore
# the same sequence of trials, matching the fixed random_state=42 used for
# every other stochastic step in this notebook.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42)
)
study.optimize(objective, n_trials=50, show_progress_bar=True)

# Best parameters
print(" Best Params Found by Optuna:")
print(study.best_params)

In [None]:
# Lowest mean CV RMSE reached by any trial of the study.
print("Best RMSE:", study.best_value)

# One line per hyperparameter sampled in the best trial.
print("Best hyperparameters:")
best_trial_params = study.best_trial.params
for param_name in best_trial_params:
    print(f"  {param_name}: {best_trial_params[param_name]}")

In [None]:
# Train the final model on the full training set with the best trial's
# hyperparameters plus the settings that objective() keeps fixed.
# NOTE(review): the CV folds used early stopping, while this refit always
# trains all 700 rounds — confirm that is intended.
best_params = study.best_params
best_params.update({
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.01,
    'n_estimators': 700,
    'random_state': 42
})

# Use the imported class directly. Earlier tuning cells rebind the name `lgb`
# to an estimator instance, after which `lgb.LGBMRegressor` raises
# AttributeError.
best_model = LGBMRegressor(**best_params)
best_model.fit(X_train, y_train)

y_pred = best_model.predict(X_test)

# Test-set error metrics (the metric functions come from the notebook's
# top-level import cell).
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"\n Optuna-Tuned LightGBM:")
print(f" RMSE: {rmse:,.2f}")
print(f" MAE : {mae:,.2f}")
print(f" R²  : {r2:.4f}")

### Basic LightGBM (All Features)

In [None]:
# Baseline again: default-parameter LightGBM (seed fixed) on all features,
# evaluated on the held-out test split.
lgb_model = LGBMRegressor(random_state=42).fit(X_train, y_train)
y_pred_lgb = lgb_model.predict(X_test)

# Test-set error metrics.
rmse = np.sqrt(mean_squared_error(y_test, y_pred_lgb))
mae = mean_absolute_error(y_test, y_pred_lgb)
r2 = r2_score(y_test, y_pred_lgb)

print(
    "\n LightGBM Basic Model Results:",
    f" RMSE: {rmse:,.2f}",
    f" MAE : {mae:,.2f}",
    f" R²  : {r2:.4f}",
    sep="\n",
)

### Feature Selection in LightGBM

In [None]:
# Re-bind `lgb` to the lightgbm module. Earlier tuning cells assign an
# LGBMRegressor instance to the name `lgb`, which would make the
# lgb.Dataset / lgb.train calls below fail with AttributeError.
import lightgbm as lgb

lgb_train_dataset = lgb.Dataset(X_train, label=y_train)

params = {
    'objective': 'regression',
    'metric': 'rmse',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    'random_state': 42
}

# Native-API booster (100 rounds) trained only to rank features by gain.
lgb_model = lgb.train(
    params,
    lgb_train_dataset,
    num_boost_round=100
)

# Gain-based importance per feature, most important first.
importance_df = pd.DataFrame({
    'feature': X_train.columns,
    'importance': lgb_model.feature_importance(importance_type='gain')
}).sort_values(by='importance', ascending=False)

# Select top 20 features
TOP_N = 20
top_features = importance_df.head(TOP_N)['feature'].tolist()

# Create reduced feature sets
X_train_select_LGBM = X_train[top_features]
X_test_select_LGBM = X_test[top_features]

# Horizontal bar chart; the order is reversed so the most important feature
# is drawn at the top.
plt.figure(figsize=(10, 6))
plt.barh(importance_df.head(TOP_N)['feature'][::-1], importance_df.head(TOP_N)['importance'][::-1])
plt.title('Top LightGBM Feature Importances (by Gain)')
plt.xlabel('Importance')
plt.tight_layout()
plt.show()

print("Selected features for LightGBM:", top_features)

In [None]:
# Raw gain importances and matching feature names from the trained booster.
feature_names = lgb_model.feature_name()
feature_importances = lgb_model.feature_importance(importance_type='gain')

# Express each feature's gain as a percentage of the total gain.
total_gain = np.sum(feature_importances)
importance_percent = 100.0 * feature_importances / total_gain

# Table sorted from most to least important, percentages rounded to 4 d.p.
normalized_importance_df = (
    pd.DataFrame({
        'Feature': feature_names,
        'Normalized Importance (%)': importance_percent
    })
    .sort_values(by='Normalized Importance (%)', ascending=False)
    .assign(**{
        'Normalized Importance (%)':
            lambda frame: frame['Normalized Importance (%)'].round(4)
    })
)

print("Normalized Feature Importances (by Gain):")
for feat_name, feat_pct in zip(
    normalized_importance_df['Feature'],
    normalized_importance_df['Normalized Importance (%)'],
):
    print(f"{feat_name}: {feat_pct:.4f}%")

### RandomizedSearch using Selected features

### threshold = 0.5%

In [None]:
threshold = 0.5  # in percent

# Keep every feature whose normalized gain share is strictly above the threshold.
selected_features = normalized_importance_df.loc[
    lambda frame: frame['Normalized Importance (%)'] > threshold,
    'Feature'
].tolist()

# Reduced train / test matrices restricted to the selected columns.
X_train_select_LGBM, X_test_select_LGBM = (
    X_train[selected_features],
    X_test[selected_features],
)

print(f"Selected {len(selected_features)} features with > {threshold}% importance:")
print(selected_features)

In [None]:
# Search space. scipy's uniform(loc, scale) samples from [loc, loc + scale];
# randint(low, high) samples integers from [low, high).
# NOTE(review): 'subsample' is searched without subsample_freq being set;
# LightGBM only bags rows when the bagging frequency is non-zero — confirm.
param_dist = {
    'num_leaves': randint(20, 150),
    'max_depth': randint(3, 15),
    'learning_rate': uniform(0.005, 0.05),
    'n_estimators': randint(300, 1000),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
    'reg_alpha': uniform(0, 1.0),
    'reg_lambda': uniform(0, 1.0),
    'min_child_samples': randint(10, 100),
    'min_split_gain': uniform(0.0, 0.5)
}

# Named `lgb_estimator` rather than `lgb` so the `import lightgbm as lgb`
# module alias is not overwritten (later cells call lgb.LGBMRegressor).
lgb_estimator = LGBMRegressor(random_state=42)

# 50 sampled configurations on the selected-feature training matrix.
# `tscv` is the cross-validation splitter defined in an earlier cell.
random_search = RandomizedSearchCV(
    estimator=lgb_estimator,
    param_distributions=param_dist,
    n_iter=50,
    scoring='neg_root_mean_squared_error',
    cv=tscv,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train_select_LGBM, y_train)

# Score the best configuration on the held-out test split.
y_pred_lgb_final = random_search.best_estimator_.predict(X_test_select_LGBM)

rmse = np.sqrt(mean_squared_error(y_test, y_pred_lgb_final))
mae = mean_absolute_error(y_test, y_pred_lgb_final)
r2 = r2_score(y_test, y_pred_lgb_final)

print(f"\n Final Optimized LightGBM (Selected Features):")
print(f" RMSE: {rmse:,.2f}")
print(f" MAE : {mae:,.2f}")
print(f" R²  : {r2:.4f}")
print("\n Best Parameters:")
print(random_search.best_params_)

### threshold = 0.01%

In [None]:
threshold = 0.01  # in percent

# Keep every feature whose normalized gain share is strictly above the threshold.
selected_features = normalized_importance_df.loc[
    lambda frame: frame['Normalized Importance (%)'] > threshold,
    'Feature'
].tolist()

# Reduced train / test matrices restricted to the selected columns.
X_train_select_LGBM, X_test_select_LGBM = (
    X_train[selected_features],
    X_test[selected_features],
)

print(f"Selected {len(selected_features)} features with > {threshold}% importance:")
print(selected_features)

In [None]:
# Search space. scipy's uniform(loc, scale) samples from [loc, loc + scale];
# randint(low, high) samples integers from [low, high).
# NOTE(review): 'subsample' is searched without subsample_freq being set;
# LightGBM only bags rows when the bagging frequency is non-zero — confirm.
param_dist = {
    'num_leaves': randint(20, 150),
    'max_depth': randint(3, 15),
    'learning_rate': uniform(0.005, 0.05),
    'n_estimators': randint(300, 1500),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
    'reg_alpha': uniform(0, 1.0),
    'reg_lambda': uniform(0, 1.0),
    'min_child_samples': randint(10, 100),
    'min_split_gain': uniform(0.0, 0.5)
}

# Expanding-window CV so each validation fold comes after its training rows.
tscv = TimeSeriesSplit(n_splits=5)

# Named `lgb_estimator` rather than `lgb` so the `import lightgbm as lgb`
# module alias is not overwritten (later cells call lgb.LGBMRegressor).
lgb_estimator = LGBMRegressor(random_state=42)

# 50 sampled configurations on the selected-feature training matrix.
random_search = RandomizedSearchCV(
    estimator=lgb_estimator,
    param_distributions=param_dist,
    n_iter=50,
    scoring='neg_root_mean_squared_error',
    cv=tscv,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train_select_LGBM, y_train)

# Score the best configuration on the held-out test split.
y_pred_lgb_final = random_search.best_estimator_.predict(X_test_select_LGBM)

rmse = np.sqrt(mean_squared_error(y_test, y_pred_lgb_final))
mae = mean_absolute_error(y_test, y_pred_lgb_final)
r2 = r2_score(y_test, y_pred_lgb_final)

print(f"\n Final Optimized LightGBM (Selected Features):")
print(f" RMSE: {rmse:,.2f}")
print(f" MAE : {mae:,.2f}")
print(f" R²  : {r2:.4f}")
print("\n Best Parameters:")
print(random_search.best_params_)

### threshold = 0.05%

In [None]:
threshold = 0.05  # in percent

# Keep every feature whose normalized gain share is strictly above the threshold.
selected_features = normalized_importance_df.loc[
    lambda frame: frame['Normalized Importance (%)'] > threshold,
    'Feature'
].tolist()

# Reduced train / test matrices restricted to the selected columns.
X_train_select_LGBM, X_test_select_LGBM = (
    X_train[selected_features],
    X_test[selected_features],
)

print(f"Selected {len(selected_features)} features with > {threshold}% importance:")
print(selected_features)

In [None]:
# Search space. scipy's uniform(loc, scale) samples from [loc, loc + scale];
# randint(low, high) samples integers from [low, high).
# NOTE(review): 'subsample' is searched without subsample_freq being set;
# LightGBM only bags rows when the bagging frequency is non-zero — confirm.
param_dist = {
    'num_leaves': randint(20, 150),
    'max_depth': randint(3, 15),
    'learning_rate': uniform(0.005, 0.05),
    'n_estimators': randint(300, 1500),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
    'reg_alpha': uniform(0, 1.0),
    'reg_lambda': uniform(0, 1.0),
    'min_child_samples': randint(10, 100),
    'min_split_gain': uniform(0.0, 0.5)
}

# Expanding-window CV so each validation fold comes after its training rows.
tscv = TimeSeriesSplit(n_splits=5)

# Named `lgb_estimator` rather than `lgb` so the `import lightgbm as lgb`
# module alias is not overwritten (later cells call lgb.LGBMRegressor).
lgb_estimator = LGBMRegressor(random_state=42)

# 50 sampled configurations on the selected-feature training matrix.
random_search = RandomizedSearchCV(
    estimator=lgb_estimator,
    param_distributions=param_dist,
    n_iter=50,
    scoring='neg_root_mean_squared_error',
    cv=tscv,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train_select_LGBM, y_train)

# Score the best configuration on the held-out test split.
y_pred_lgb_final = random_search.best_estimator_.predict(X_test_select_LGBM)

rmse = np.sqrt(mean_squared_error(y_test, y_pred_lgb_final))
mae = mean_absolute_error(y_test, y_pred_lgb_final)
r2 = r2_score(y_test, y_pred_lgb_final)

print(f"\n Final Optimized LightGBM (Selected Features):")
print(f" RMSE: {rmse:,.2f}")
print(f" MAE : {mae:,.2f}")
print(f" R²  : {r2:.4f}")
print("\n Best Parameters:")
print(random_search.best_params_)

### Grid search (0.05) - Selected Parameters

In [None]:
# Narrow grid over tree size, learning rate, number of trees and depth.
param_grid = {
    'num_leaves': [60, 75, 90],
    'learning_rate': [0.0065, 0.0075, 0.0085],
    'n_estimators': [1400, 1450, 1500],
    'max_depth': [6, 7]
}

# Remaining hyperparameters are held fixed at hard-coded values.
# Named `lgb_estimator` rather than `lgb` so the `import lightgbm as lgb`
# module alias is not overwritten (later cells call lgb.LGBMRegressor).
# NOTE(review): subsample is set without subsample_freq; LightGBM only bags
# rows when the bagging frequency is non-zero, so it may be inert — confirm.
lgb_estimator = LGBMRegressor(
    subsample=0.673,
    colsample_bytree=0.934,
    reg_alpha=0.24,
    reg_lambda=0.09,
    min_child_samples=10,
    min_split_gain=0.087,
    random_state=42
)

# `tscv` is the cross-validation splitter defined in an earlier cell.
grid_search = GridSearchCV(
    estimator=lgb_estimator,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=tscv,
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X_train_select_LGBM, y_train)

# Score the best configuration on the held-out test split.
y_pred_grid = grid_search.best_estimator_.predict(X_test_select_LGBM)

rmse = np.sqrt(mean_squared_error(y_test, y_pred_grid))
mae = mean_absolute_error(y_test, y_pred_grid)
r2 = r2_score(y_test, y_pred_grid)

print(f"\n Final GridSearch LightGBM Results:")
print(f" RMSE: {rmse:,.2f}")
print(f" MAE : {mae:,.2f}")
print(f" R²  : {r2:.4f}")
print("\n Best Parameters:")
print(grid_search.best_params_)

In [None]:
# Second narrow grid: larger num_leaves and a wider n_estimators range.
param_grid = {
    'num_leaves': [85, 110, 130],
    'learning_rate': [0.0065, 0.0075],
    'n_estimators': [1400, 1500, 1700],
    'max_depth': [6, 7]
}

# Remaining hyperparameters are held fixed at hard-coded values.
# Named `lgb_estimator` rather than `lgb` so the `import lightgbm as lgb`
# module alias is not overwritten (later cells call lgb.LGBMRegressor).
# NOTE(review): subsample is set without subsample_freq; LightGBM only bags
# rows when the bagging frequency is non-zero, so it may be inert — confirm.
lgb_estimator = LGBMRegressor(
    subsample=0.673,
    colsample_bytree=0.934,
    reg_alpha=0.24,
    reg_lambda=0.09,
    min_child_samples=10,
    min_split_gain=0.087,
    random_state=42
)

# `tscv` is the cross-validation splitter defined in an earlier cell.
grid_search = GridSearchCV(
    estimator=lgb_estimator,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=tscv,
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X_train_select_LGBM, y_train)

# Score the best configuration on the held-out test split.
y_pred_grid = grid_search.best_estimator_.predict(X_test_select_LGBM)

rmse = np.sqrt(mean_squared_error(y_test, y_pred_grid))
mae = mean_absolute_error(y_test, y_pred_grid)
r2 = r2_score(y_test, y_pred_grid)

print(f"\n Final GridSearch LightGBM Results:")
print(f" RMSE: {rmse:,.2f}")
print(f" MAE : {mae:,.2f}")
print(f" R²  : {r2:.4f}")
print("\n Best Parameters:")
print(grid_search.best_params_)

### Optuna Tuning for LightGBM - Selected features

In [None]:
def objective(trial):
    """Optuna objective on the selected features: mean CV validation RMSE.

    Samples tree-shape, regularisation and subsampling hyperparameters from
    ``trial`` (learning rate, number of estimators and depth are fixed), fits
    one LightGBM regressor per fold of a 5-split time-series CV over the
    module-level ``X_train_select_LGBM`` / ``y_train`` with early stopping on
    that fold's validation part, and returns the mean of the per-fold RMSE
    values (lower is better).
    """
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'random_state': 42,
        'learning_rate': 0.01,
        'n_estimators': 700,
        'max_depth': -1,
        'num_leaves': trial.suggest_int('num_leaves', 31, 100),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
        'min_split_gain': trial.suggest_float('min_split_gain', 0.0, 0.5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10)
    }

    splitter = TimeSeriesSplit(n_splits=5)
    rmse_scores = []

    for train_idx, val_idx in splitter.split(X_train_select_LGBM):
        X_train_cv, X_val_cv = X_train_select_LGBM.iloc[train_idx], X_train_select_LGBM.iloc[val_idx]
        y_train_cv, y_val_cv = y_train.iloc[train_idx], y_train.iloc[val_idx]

        # Use the imported class directly. Earlier tuning cells rebind the
        # name `lgb` to an estimator instance, after which
        # `lgb.LGBMRegressor` raises AttributeError.
        model = LGBMRegressor(**params)
        model.fit(
            X_train_cv, y_train_cv,
            eval_set=[(X_val_cv, y_val_cv)],
            eval_metric='rmse',
            # Stop after 50 rounds without improvement; log_evaluation(0)
            # silences per-iteration logging.
            callbacks=[early_stopping(50), log_evaluation(0)]
        )

        preds = model.predict(X_val_cv)
        rmse_scores.append(np.sqrt(mean_squared_error(y_val_cv, preds)))

    return np.mean(rmse_scores)


# Start the study. The sampler is seeded so repeated runs of this cell explore
# the same sequence of trials, matching the fixed random_state=42 used for
# every other stochastic step in this notebook.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42)
)
study.optimize(objective, n_trials=50, show_progress_bar=True)

# Best parameters
print(" Best Params Found by Optuna:")
print(study.best_params)

In [None]:
# Lowest mean CV RMSE reached by any trial of the study.
print("Best RMSE:", study.best_value)

# One line per hyperparameter sampled in the best trial.
print("Best hyperparameters:")
best_trial_params = study.best_trial.params
for param_name in best_trial_params:
    print(f"  {param_name}: {best_trial_params[param_name]}")

In [None]:
# Final model on the selected features: the best trial's hyperparameters plus
# the settings that objective() keeps fixed.
# NOTE(review): the CV folds used early stopping, while this refit always
# trains all 700 rounds — confirm that is intended.
best_params = study.best_params
best_params.update({
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.01,
    'n_estimators': 700,
    'random_state': 42
})

# Use the imported class directly. Earlier tuning cells rebind the name `lgb`
# to an estimator instance, after which `lgb.LGBMRegressor` raises
# AttributeError.
best_model = LGBMRegressor(**best_params)
best_model.fit(X_train_select_LGBM, y_train)

y_pred = best_model.predict(X_test_select_LGBM)

# Test-set error metrics (the metric functions come from the notebook's
# top-level import cell).
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"\n Optuna-Tuned LightGBM:")
print(f" RMSE: {rmse:,.2f}")
print(f" MAE : {mae:,.2f}")
print(f" R²  : {r2:.4f}")

### Optimal model - Light GBM

In [None]:
# Single-candidate "grid": GridSearchCV is used here only to cross-validate
# and refit the chosen configuration.
param_grid = {
    'num_leaves': [130],
    'learning_rate': [0.0065],
    'n_estimators': [1400],
    'max_depth': [7]
}

# Remaining hyperparameters are held fixed at hard-coded values.
# Named `lgb_estimator` rather than `lgb` so the `import lightgbm as lgb`
# module alias is not overwritten by this cell.
# NOTE(review): subsample is set without subsample_freq; LightGBM only bags
# rows when the bagging frequency is non-zero, so it may be inert — confirm.
lgb_estimator = LGBMRegressor(
    subsample=0.673,
    colsample_bytree=0.934,
    reg_alpha=0.24,
    reg_lambda=0.09,
    min_child_samples=10,
    min_split_gain=0.087,
    random_state=42
)

# Expanding-window CV so each validation fold comes after its training rows.
tscv = TimeSeriesSplit(n_splits=5)

grid_search = GridSearchCV(
    estimator=lgb_estimator,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=tscv,
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X_train_select_LGBM, y_train)

# Score the refit model on the held-out test split.
y_pred_grid = grid_search.best_estimator_.predict(X_test_select_LGBM)

rmse = np.sqrt(mean_squared_error(y_test, y_pred_grid))
mae = mean_absolute_error(y_test, y_pred_grid)
r2 = r2_score(y_test, y_pred_grid)

print(f"\n Final GridSearch LightGBM Results:")
print(f" RMSE: {rmse:,.2f}")
print(f" MAE : {mae:,.2f}")
print(f" R²  : {r2:.4f}")
print("\n Best Parameters:")
print(grid_search.best_params_)

In [None]:
# Single-candidate "grid": GridSearchCV is used here only to cross-validate
# and refit the chosen configuration. This cell's grid_search.best_estimator_
# is what the following cell pickles.
param_grid = {
    'num_leaves': [130],
    'learning_rate': [0.0065],
    'n_estimators': [1400],
    'max_depth': [7]
}

# Remaining hyperparameters are held fixed at hard-coded values.
# Named `lgb_estimator` rather than `lgb` so the `import lightgbm as lgb`
# module alias is not overwritten by this cell.
# NOTE(review): subsample is set without subsample_freq; LightGBM only bags
# rows when the bagging frequency is non-zero, so it may be inert — confirm.
lgb_estimator = LGBMRegressor(
    subsample=0.673,
    colsample_bytree=0.934,
    reg_alpha=0.24,
    reg_lambda=0.09,
    min_child_samples=10,
    min_split_gain=0.087,
    random_state=42
)

# Expanding-window CV so each validation fold comes after its training rows.
tscv = TimeSeriesSplit(n_splits=5)

grid_search = GridSearchCV(
    estimator=lgb_estimator,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=tscv,
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X_train_select_LGBM, y_train)

# Score the refit model on the held-out test split.
y_pred_grid = grid_search.best_estimator_.predict(X_test_select_LGBM)

rmse = np.sqrt(mean_squared_error(y_test, y_pred_grid))
mae = mean_absolute_error(y_test, y_pred_grid)
r2 = r2_score(y_test, y_pred_grid)

print(f"\n Final GridSearch LightGBM Results:")
print(f" RMSE: {rmse:,.2f}")
print(f" MAE : {mae:,.2f}")
print(f" R²  : {r2:.4f}")
print("\n Best Parameters:")
print(grid_search.best_params_)

### Saving the optimal LGBM model

In [None]:
# Pickle the fitted LightGBM estimator and, next to it, the list of feature
# columns it was trained on (needed to rebuild the input matrix at load time).
lgbm_artifacts = (
    ("lgbm_best_model.pkl", grid_search.best_estimator_),
    ("lgbm_selected_features.pkl", X_train_select_LGBM.columns.tolist()),
)
for artifact_path, artifact in lgbm_artifacts:
    with open(artifact_path, "wb") as f:
        pickle.dump(artifact, f)

print("LightGBM model and selected features saved successfully.")


In [None]:
# Read back the pickled estimator and its feature-column list.
# (Only unpickle files written by this notebook.)
with open("lgbm_best_model.pkl", "rb") as f_model, \
        open("lgbm_selected_features.pkl", "rb") as f_features:
    lgbm_model = pickle.load(f_model)
    selected_features_lgbm = pickle.load(f_features)

# Restrict the test matrix to the saved columns and predict.
X_test_lgbm = X_test[selected_features_lgbm]
y_pred_grid = lgbm_model.predict(X_test_lgbm)

# Test-set error metrics for the reloaded model.
rmse = np.sqrt(mean_squared_error(y_test, y_pred_grid))
mae = mean_absolute_error(y_test, y_pred_grid)
r2 = r2_score(y_test, y_pred_grid)

print(
    "\n Final LightGBM Results:",
    f" RMSE: {rmse:,.2f}",
    f" MAE : {mae:,.2f}",
    f" R²  : {r2:.4f}",
    sep="\n",
)

### Branch wise MAE

In [None]:
# Original BRANCHID values for the test rows, looked up in df_cleaned by the
# test matrix's index labels.
# NOTE(review): assumes df_cleaned still carries the same index labels as
# X_test_select_LGBM — confirm.
branch_ids_test = df_cleaned.loc[X_test_select_LGBM.index, "BRANCHID"].values

# DataFrame with predictions and actuals
results_df = X_test_select_LGBM.copy()
results_df["Actual"] = y_test.values
results_df["Predicted"] = y_pred_grid
results_df["BRANCHID"] = branch_ids_test

# Branch-wise MAE = mean absolute error within each branch. Computed with a
# vectorized groupby-mean instead of groupby.apply + a Python lambda, which is
# slower and triggers pandas' deprecation warning about apply operating on the
# grouping column.
branch_mae = (
    (results_df["Actual"] - results_df["Predicted"])
    .abs()
    .groupby(results_df["BRANCHID"])
    .mean()
    .reset_index(name="MAE")
)

# Top 10 branches by MAE
print("\n Branch-wise MAE (Top 10 branches with highest error):")
print(branch_mae.sort_values("MAE", ascending=False).head(10))


In [None]:
# Ten branches with the largest MAE, worst first.
top_branches = branch_mae.sort_values("MAE", ascending=False).head(10)

# Bar chart; branch ids are cast to str so they are drawn as categories
# rather than placed on a numeric axis.
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(top_branches["BRANCHID"].astype(str), top_branches["MAE"], color="tomato")
ax.set_xlabel("Branch ID")
ax.set_ylabel("MAE")
ax.set_title("Top 10 Branches by MAE")
plt.xticks(rotation=45)
fig.tight_layout()
ax.grid(True, axis='y')
plt.show()


In [None]:
# Absolute error per test row.
results_df["AbsError"] = abs(results_df["Actual"] - results_df["Predicted"])

# Per-branch MAE, mean actual value, and MAE as a percentage of that mean.
# The percentage divides by the magnitude of the mean: with a signed
# denominator, a branch whose mean actual value is negative would get a
# negative MAE% and sort below every other branch regardless of its error.
relative_mae = results_df.groupby("BRANCHID").agg(
    MAE=("AbsError", "mean"),
    AvgCashFlow=("Actual", "mean")
).assign(MAE_Percent=lambda x: 100 * x["MAE"] / x["AvgCashFlow"].abs())


In [None]:
# Original BRANCHID values for the test rows, looked up in df_cleaned by the
# test matrix's index labels.
# NOTE(review): assumes df_cleaned still carries the same index labels as
# X_test_select_LGBM — confirm.
branch_ids_test = df_cleaned.loc[X_test_select_LGBM.index, "BRANCHID"].values

# DataFrame with predictions and actuals
results_df = X_test_select_LGBM.copy()
results_df["Actual"] = y_test.values
results_df["Predicted"] = y_pred_grid
results_df["BRANCHID"] = branch_ids_test

# Compute absolute error per row
results_df["AbsError"] = abs(results_df["Actual"] - results_df["Predicted"])

# Per-branch MAE, mean actual value, and MAE as a percentage of that mean.
# The percentage divides by the magnitude of the mean: with a signed
# denominator, a branch whose mean actual value is negative would get a
# negative MAE% and drop out of the "highest % MAE" ranking below.
relative_mae = results_df.groupby("BRANCHID").agg(
    MAE=("AbsError", "mean"),
    AvgCashFlow=("Actual", "mean")
).assign(
    MAE_Percent=lambda x: 100 * x["MAE"] / x["AvgCashFlow"].abs()
).reset_index()

# Display Top 10 branches with highest % MAE
print("\n Branch-wise MAE (% of Avg Daily Cash Flow) – Top 10:")
print(relative_mae.sort_values("MAE_Percent", ascending=False).head(10))

In [None]:
# Position-indexed copies of the targets and the LightGBM predictions so they
# line up with the re-indexed test_df built below.
# NOTE(review): rows are paired purely by position — this assumes test_df,
# y_test and y_pred_grid share the same row order; confirm against the split.
y_test_series = pd.Series(y_test).reset_index(drop=True)
y_pred_series = pd.Series(y_pred_grid).reset_index(drop=True)

# Rebuild test_df with a fresh 0..n-1 index, TXNDATE parsed as datetime, and
# the actual / predicted values attached as columns.
test_df = test_df.reset_index(drop=True).assign(
    TXNDATE=lambda frame: pd.to_datetime(frame['TXNDATE']),
    Actual=y_test_series,
    Predicted=y_pred_series,
)

# Rows for branch 210, in chronological order.
branch_df = test_df.loc[test_df['BRANCHID'] == 210].sort_values('TXNDATE')

# Actual vs predicted over time for this branch.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(branch_df["TXNDATE"], branch_df["Actual"], label="Actual", color="black")
ax.plot(branch_df["TXNDATE"], branch_df["Predicted"], label="Predicted", color="dodgerblue", linestyle='--')
ax.set_title("Actual vs Predicted NetCashFlow (Test Set) — BranchID = 210")
ax.set_xlabel("Date")
ax.set_ylabel("NetCashFlow")
ax.legend()
ax.grid(True)
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

# Model Blending

### Optimize Blend Weight

In [None]:
# Candidate blend: equal weight on the XGB predictions (y_pred_best) and the
# LightGBM predictions (y_pred_grid).
# NOTE(review): the candidate weights are compared on y_test itself, so the
# winning blend's test score is optimistically biased.
y_pred_blend_weighted = 0.5 * y_pred_best + 0.5 * y_pred_grid

# Error metrics of the blend against the test targets.
rmse_weighted = np.sqrt(mean_squared_error(y_test, y_pred_blend_weighted))
mae_weighted = mean_absolute_error(y_test, y_pred_blend_weighted)
r2_weighted = r2_score(y_test, y_pred_blend_weighted)

print(
    "\n Weighted Blended Model Evaluation (50% XGB, 50% LGBM):",
    f"   RMSE: {rmse_weighted:,.2f}",
    f"   MAE : {mae_weighted:,.2f}",
    f"   R²  : {r2_weighted:.4f}",
    sep="\n",
)


In [None]:
# Candidate blend: 0.4 on the XGB predictions (y_pred_best), 0.6 on the
# LightGBM predictions (y_pred_grid).
y_pred_blend_weighted = 0.4 * y_pred_best + 0.6 * y_pred_grid

# Error metrics of the blend against the test targets.
rmse_weighted = np.sqrt(mean_squared_error(y_test, y_pred_blend_weighted))
mae_weighted = mean_absolute_error(y_test, y_pred_blend_weighted)
r2_weighted = r2_score(y_test, y_pred_blend_weighted)

print(
    "\n Weighted Blended Model Evaluation (40% XGB, 60% LGBM):",
    f"   RMSE: {rmse_weighted:,.2f}",
    f"   MAE : {mae_weighted:,.2f}",
    f"   R²  : {r2_weighted:.4f}",
    sep="\n",
)

In [None]:
# Candidate blend: 0.3 on the XGB predictions (y_pred_best), 0.7 on the
# LightGBM predictions (y_pred_grid).
y_pred_blend_weighted = 0.3 * y_pred_best + 0.7 * y_pred_grid

# Error metrics of the blend against the test targets.
rmse_weighted = np.sqrt(mean_squared_error(y_test, y_pred_blend_weighted))
mae_weighted = mean_absolute_error(y_test, y_pred_blend_weighted)
r2_weighted = r2_score(y_test, y_pred_blend_weighted)

print(
    "\n Weighted Blended Model Evaluation (30% XGB, 70% LGBM):",
    f"   RMSE: {rmse_weighted:,.2f}",
    f"   MAE : {mae_weighted:,.2f}",
    f"   R²  : {r2_weighted:.4f}",
    sep="\n",
)

In [None]:
# Candidate blend: 0.6 on the XGB predictions (y_pred_best), 0.4 on the
# LightGBM predictions (y_pred_grid).
y_pred_blend_weighted = 0.6 * y_pred_best + 0.4 * y_pred_grid

# Error metrics of the blend against the test targets.
rmse_weighted = np.sqrt(mean_squared_error(y_test, y_pred_blend_weighted))
mae_weighted = mean_absolute_error(y_test, y_pred_blend_weighted)
r2_weighted = r2_score(y_test, y_pred_blend_weighted)

print(
    "\n Weighted Blended Model Evaluation (60% XGB, 40% LGBM):",
    f"   RMSE: {rmse_weighted:,.2f}",
    f"   MAE : {mae_weighted:,.2f}",
    f"   R²  : {r2_weighted:.4f}",
    sep="\n",
)

In [None]:
# Candidate blend: 0.7 on the XGB predictions (y_pred_best), 0.3 on the
# LightGBM predictions (y_pred_grid).
y_pred_blend_weighted = 0.7 * y_pred_best + 0.3 * y_pred_grid

# Error metrics of the blend against the test targets.
rmse_weighted = np.sqrt(mean_squared_error(y_test, y_pred_blend_weighted))
mae_weighted = mean_absolute_error(y_test, y_pred_blend_weighted)
r2_weighted = r2_score(y_test, y_pred_blend_weighted)

print(
    "\n Weighted Blended Model Evaluation (70% XGB, 30% LGBM):",
    f"   RMSE: {rmse_weighted:,.2f}",
    f"   MAE : {mae_weighted:,.2f}",
    f"   R²  : {r2_weighted:.4f}",
    sep="\n",
)

In [None]:
# Candidate blend: 0.8 on the XGB predictions (y_pred_best), 0.2 on the
# LightGBM predictions (y_pred_grid).
y_pred_blend_weighted = 0.8 * y_pred_best + 0.2 * y_pred_grid

# Error metrics of the blend against the test targets.
rmse_weighted = np.sqrt(mean_squared_error(y_test, y_pred_blend_weighted))
mae_weighted = mean_absolute_error(y_test, y_pred_blend_weighted)
r2_weighted = r2_score(y_test, y_pred_blend_weighted)

print(
    "\n Weighted Blended Model Evaluation (80% XGB, 20% LGBM):",
    f"   RMSE: {rmse_weighted:,.2f}",
    f"   MAE : {mae_weighted:,.2f}",
    f"   R²  : {r2_weighted:.4f}",
    sep="\n",
)

### Optimal Blending model (Using saved XGB & LGBM models)

In [None]:
# Reload both pickled models together with the feature lists they expect.
# (Only unpickle files written by this notebook.)
with open("xgb_best_model.pkl", "rb") as f_model, \
        open("xgb_selected_features.pkl", "rb") as f_features:
    xgb_model = pickle.load(f_model)
    xgb_features = pickle.load(f_features)

with open("lgbm_best_model.pkl", "rb") as f_model, \
        open("lgbm_selected_features.pkl", "rb") as f_features:
    lgbm_model = pickle.load(f_model)
    lgbm_features = pickle.load(f_features)

# Each model sees only its own column subset of the test matrix.
X_test_xgb, X_test_lgbm = X_test[xgb_features], X_test[lgbm_features]

# Individual predictions, then the fixed 0.6 / 0.4 weighted sum.
y_pred_xgb = xgb_model.predict(X_test_xgb)
y_pred_lgbm = lgbm_model.predict(X_test_lgbm)
y_pred_blend = 0.6 * y_pred_xgb + 0.4 * y_pred_lgbm

# Error metrics of the blend against the test targets.
rmse_blend = np.sqrt(mean_squared_error(y_test, y_pred_blend))
mae_blend = mean_absolute_error(y_test, y_pred_blend)
r2_blend = r2_score(y_test, y_pred_blend)

print(
    "\n Weighted Blended Model Evaluation (60% XGB, 40% LGBM):",
    f"   RMSE: {rmse_blend:,.2f}",
    f"   MAE : {mae_blend:,.2f}",
    f"   R²  : {r2_blend:.4f}",
    sep="\n",
)


### Saving the optimal model

In [None]:
class WeightedBlender:
    """Two-model ensemble returning a fixed weighted sum of their predictions.

    Each model is paired with the list of columns it is fed, so a single input
    frame can serve two models that use different feature subsets.
    """

    def __init__(self, model1, model2, weight1, weight2, features1, features2):
        """Store both models with their blend weights and feature-column lists.

        Args:
            model1, model2: objects exposing ``predict(X)``.
            weight1, weight2: multipliers applied to each model's predictions.
            features1, features2: column selectors used to index ``X`` for
                the corresponding model.
        """
        self.model1, self.model2 = model1, model2
        self.weight1, self.weight2 = weight1, weight2
        self.features1, self.features2 = features1, features2

    def predict(self, X):
        """Return ``weight1 * model1.predict(X[features1]) + weight2 * model2.predict(X[features2])``."""
        inputs_first, inputs_second = X[self.features1], X[self.features2]
        return (
            self.weight1 * self.model1.predict(inputs_first)
            + self.weight2 * self.model2.predict(inputs_second)
        )


In [None]:
# Create the weighted blender object
blender = WeightedBlender(
    model1=xgb_model,
    model2=lgbm_model,
    weight1=0.6,
    weight2=0.4,
    features1=xgb_features,
    features2=lgbm_features
)

# Save the blended model object
with open("weighted_blended_model.pkl", "wb") as f:
    pickle.dump(blender, f)

print("Blended model saved successfully as 'weighted_blended_model.pkl'")


In [None]:
# Load the blended model
#
# Unpickling resolves WeightedBlender by name in the running session, so the
# cell that defines the class must have been executed before this one. The
# class is intentionally not re-declared here: a second copy silently shadows
# the first, and the two definitions can drift apart unnoticed.
# NOTE: pickle.load can execute arbitrary code -- only load files written by
# this notebook.
with open("weighted_blended_model.pkl", "rb") as f:
    blended_model = pickle.load(f)

# Predict on test data (the blender selects each model's columns itself)
y_pred_blended = blended_model.predict(X_test)

# Evaluate the reloaded blender against y_test
rmse = np.sqrt(mean_squared_error(y_test, y_pred_blended))
mae = mean_absolute_error(y_test, y_pred_blended)
r2 = r2_score(y_test, y_pred_blended)

print("\n Loaded Blended Model Evaluation:")
print(f"   RMSE: {rmse:,.2f}")
print(f"   MAE : {mae:,.2f}")
print(f"   R²  : {r2:.4f}")


### Branchwise Evaluation

In [None]:
# Attach the blended predictions and their absolute error to the per-row results.
# NOTE(review): y_pred_blend_weighted is not assigned in the neighbouring cells
# (the blend computed above is named y_pred_blend) -- confirm it is defined upstream.
results_df["BlendedPred"] = y_pred_blend_weighted
results_df["AbsError_Blend"] = (results_df["Actual"] - results_df["BlendedPred"]).abs()

# Per-branch mean absolute error and mean actual value
branch_stats = results_df.groupby("BRANCHID").agg(
    MAE=("AbsError_Blend", "mean"),
    AvgCashFlow=("Actual", "mean"),
)
# MAE expressed as a percentage of the branch's mean actual value.
# NOTE(review): if a branch's mean actual is zero or negative this ratio is
# infinite or changes sign, which distorts the ranking below -- confirm intended.
branch_stats["MAE_Percent"] = 100 * branch_stats["MAE"] / branch_stats["AvgCashFlow"]
relative_mae_blend = branch_stats.reset_index()

# Branches with the largest relative error first
print("\n Top 10 Branches by MAE% (Blended Model):")
print(relative_mae_blend.sort_values("MAE_Percent", ascending=False).head(10))


In [None]:
# Equal-weight blend of the two upstream prediction vectors.
# NOTE(review): y_pred_best / y_pred_grid come from earlier cells -- confirm
# they are test-set predictions in the same row order as y_test.
blend_weight = 0.5
y_pred_blend = blend_weight * y_pred_best + (1 - blend_weight) * y_pred_grid

# Work on a copy with parsed dates and a fresh 0..n-1 index so the positional
# actual/predicted series below line up with the rows of test_df
test_df = test_df.copy()
test_df['TXNDATE'] = pd.to_datetime(test_df['TXNDATE'])
test_df = test_df.reset_index(drop=True)

y_test_series = pd.Series(y_test).reset_index(drop=True)
y_pred_series = pd.Series(y_pred_blend).reset_index(drop=True)

test_df['Actual'] = y_test_series
test_df['Predicted'] = y_pred_series


def plot_branch_blend(frame, branch_id):
    """Plot actual vs blended predicted NetCashFlow over time for one branch.

    Parameters
    ----------
    frame : DataFrame with 'BRANCHID', 'TXNDATE', 'Actual' and 'Predicted' columns.
    branch_id : value of 'BRANCHID' to plot.

    Returns
    -------
    The rows of ``frame`` for that branch, sorted by 'TXNDATE' (empty if the
    branch does not occur, in which case nothing is plotted).
    """
    branch_rows = frame[frame['BRANCHID'] == branch_id].sort_values('TXNDATE')
    if branch_rows.empty:
        # An empty figure would be misleading; say why there is no plot instead.
        print(f"No rows for BranchID = {branch_id}; nothing to plot.")
        return branch_rows

    plt.figure(figsize=(10, 5))
    plt.plot(branch_rows["TXNDATE"], branch_rows["Actual"], label="Actual", color="black")
    plt.plot(branch_rows["TXNDATE"], branch_rows["Predicted"], label="Blended Prediction",
             color="forestgreen", linestyle='--')
    plt.title(f"Actual vs Blended Predicted NetCashFlow — BranchID = {branch_id}")
    plt.xlabel("Date")
    plt.ylabel("NetCashFlow")
    plt.legend()
    plt.grid(True)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
    return branch_rows


# BranchID = 210
branch_df = plot_branch_blend(test_df, 210)


In [None]:
# Equal-weight blend of the two upstream prediction vectors
blend_weight = 0.5
y_pred_blend = blend_weight * y_pred_best + (1 - blend_weight) * y_pred_grid

# Copy, parse dates and renumber rows so positional series align with test_df
test_df = test_df.copy()
test_df['TXNDATE'] = pd.to_datetime(test_df['TXNDATE'])
test_df = test_df.reset_index(drop=True)

# Actuals and predictions, both re-indexed 0..n-1, become columns of test_df
y_test_series = pd.Series(y_test).reset_index(drop=True)
y_pred_series = pd.Series(y_pred_blend).reset_index(drop=True)
test_df['Actual'] = y_test_series
test_df['Predicted'] = y_pred_series

# BranchID = 2, in chronological order
is_branch_2 = test_df['BRANCHID'] == 2
branch_df = test_df[is_branch_2].sort_values('TXNDATE')

plt.figure(figsize=(10, 5))
dates = branch_df["TXNDATE"]
plt.plot(dates, branch_df["Actual"], label="Actual", color="black")
plt.plot(dates, branch_df["Predicted"], label="Blended Prediction",
         color="forestgreen", linestyle='--')
plt.title("Actual vs Blended Predicted NetCashFlow — BranchID = 2")
plt.xlabel("Date")
plt.ylabel("NetCashFlow")
plt.legend()
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


# Model Stacking

### Meta-Model (Ridge Regression)

In [None]:
# Base models
# Fresh, unfitted estimators with fixed hyperparameters; note that xgb_model
# rebinds the name previously used for the model loaded from disk.
xgb_model = xgb.XGBRegressor(
    n_estimators=1550, max_depth=5, learning_rate=0.1, subsample=1.0,
    colsample_bytree=1.0, reg_alpha=0.1, reg_lambda=10, random_state=42
)

lgb_model = LGBMRegressor(
    num_leaves=130, learning_rate=0.0065, n_estimators=1400, max_depth=7,
    subsample=0.673, colsample_bytree=0.934, reg_alpha=0.24, reg_lambda=0.09,
    min_child_samples=10, min_split_gain=0.087, random_state=42
)

# TimeSeriesSplit
# Expanding-window CV: every fold trains on an initial segment and validates
# on the block of rows that follows it.
tscv = TimeSeriesSplit(n_splits=5)

# Initialize out-of-fold prediction arrays
# WARNING: TimeSeriesSplit never uses the earliest block of rows as a
# validation fold, so those positions are never written below and keep this
# 0.0 placeholder in both arrays.
xgb_oof = np.zeros(len(X_train_selected))
lgb_oof = np.zeros(len(X_train_select_LGBM))

# Loop to generate OOF predictions
# The indices from splitting X_train_selected are reused for the LGBM frame.
# NOTE(review): assumes both frames hold the same rows in the same order -- confirm.
for train_idx, val_idx in tscv.split(X_train_selected):
    # XGB
    xgb_model.fit(X_train_selected.iloc[train_idx], y_train.iloc[train_idx])
    xgb_oof[val_idx] = xgb_model.predict(X_train_selected.iloc[val_idx])
    
    # LGB
    lgb_model.fit(X_train_select_LGBM.iloc[train_idx], y_train.iloc[train_idx])
    lgb_oof[val_idx] = lgb_model.predict(X_train_select_LGBM.iloc[val_idx])
    
# Combine OOF predictions
# One row per training sample; these two columns are the meta-model's inputs.
stacked_train = pd.DataFrame({
    "XGB_Pred": xgb_oof,
    "LGB_Pred": lgb_oof
})

In [None]:
# Meta-model (Ridge)
meta_model = Ridge(alpha=1.0)

# TimeSeriesSplit never validates on the earliest block of training rows, so
# their out-of-fold slots still hold the 0.0 placeholder from np.zeros().
# Exclude those rows so the meta-model is not fitted on fake all-zero inputs.
oof_rows = (stacked_train[["XGB_Pred", "LGB_Pred"]] != 0).any(axis=1).to_numpy()
meta_model.fit(stacked_train.loc[oof_rows], y_train.loc[oof_rows])

# Retrain base models on full training set
xgb_model.fit(X_train_selected, y_train)
lgb_model.fit(X_train_select_LGBM, y_train)

# Predict on test set
xgb_test_pred = xgb_model.predict(X_test_selected)
lgb_test_pred = lgb_model.predict(X_test_select_LGBM)

# Create stacked features for test set (same column names/order as stacked_train)
stacked_test = pd.DataFrame({
    "XGB_Pred": xgb_test_pred,
    "LGB_Pred": lgb_test_pred
})

# Final prediction using meta-model
final_stacked_pred = meta_model.predict(stacked_test)

rmse_stack = np.sqrt(mean_squared_error(y_test, final_stacked_pred))
mae_stack = mean_absolute_error(y_test, final_stacked_pred)
r2_stack = r2_score(y_test, final_stacked_pred)

print("\n Basic Stacked Model (Time Series CV) Evaluation:")
print(f"   RMSE: {rmse_stack:,.2f}")
print(f"   MAE : {mae_stack:,.2f}")
print(f"   R²  : {r2_stack:.4f}")

### GradientBoostingRegressor as Meta-Model

In [None]:
# Replace Ridge with Gradient Boosting
meta_model = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42
)

# TimeSeriesSplit never validates on the earliest block of training rows, so
# their out-of-fold slots still hold the 0.0 placeholder from np.zeros().
# Exclude those rows so the meta-model is not fitted on fake all-zero inputs.
oof_rows = (stacked_train[["XGB_Pred", "LGB_Pred"]] != 0).any(axis=1).to_numpy()
meta_model.fit(stacked_train.loc[oof_rows], y_train.loc[oof_rows])

# Test-set meta-features from the base models' test predictions
stacked_test = pd.DataFrame({
    "XGB_Pred": xgb_test_pred,
    "LGB_Pred": lgb_test_pred
})

final_stack_pred = meta_model.predict(stacked_test)

rmse = np.sqrt(mean_squared_error(y_test, final_stack_pred))
mae = mean_absolute_error(y_test, final_stack_pred)
r2 = r2_score(y_test, final_stack_pred)

print("\n Final Stacked Model (GradientBoosting Meta) Evaluation:")
print(f"   RMSE : {rmse:,.2f}")
print(f"   MAE  : {mae:,.2f}")
print(f"   R²   : {r2:.4f}")

### Randomised Search

In [None]:
# Search space for the GradientBoosting meta-model
param_dist = {
    'n_estimators': np.arange(50, 1001, 50),
    'learning_rate': np.linspace(0.005, 0.2, 20),
    'max_depth': [2, 3, 4, 5, 6],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0]
}

# Expanding-window CV so validation rows always come after training rows
tscv = TimeSeriesSplit(n_splits=5)

gbr = GradientBoostingRegressor(random_state=42)

# Random Search
random_search = RandomizedSearchCV(
    estimator=gbr,
    param_distributions=param_dist,
    n_iter=50,
    scoring='neg_root_mean_squared_error',
    cv=tscv,
    random_state=42,
    verbose=2,
    n_jobs=-1
)

# Fit on stacked_train, restricted to rows with a genuine out-of-fold
# prediction. The earliest rows are never validated by TimeSeriesSplit and
# still hold the 0.0 placeholder; left in, the first search fold would train
# on nothing but all-zero features.
oof_rows = (stacked_train[["XGB_Pred", "LGB_Pred"]] != 0).any(axis=1).to_numpy()
random_search.fit(stacked_train.loc[oof_rows], y_train.loc[oof_rows])

# Show best parameters
print("\n Best Parameters from RandomizedSearchCV:")
print(random_search.best_params_)

# Get the best estimator from RandomizedSearchCV (refit on all rows passed to fit)
best_meta_model = random_search.best_estimator_

# Predict on stacked test data
final_stack_pred = best_meta_model.predict(stacked_test)

rmse = np.sqrt(mean_squared_error(y_test, final_stack_pred))
mae = mean_absolute_error(y_test, final_stack_pred)
r2 = r2_score(y_test, final_stack_pred)

# Print final performance
print("\n Evaluation of Best Meta-Model from RandomizedSearchCV:")
print(f"   RMSE : {rmse:,.2f}")
print(f"   MAE  : {mae:,.2f}")
print(f"   R²   : {r2:.4f}")

### Fine-Tuning with GridSearchCV

In [None]:
# Define fine-tuned grid based on previous best
param_grid = {
    'n_estimators': [125, 150, 175],
    'learning_rate': [0.09, 0.10, 0.11, 0.12],
    'max_depth': [4, 5, 6],
    'subsample': [0.65, 0.7, 0.75]
}

# Expanding-window CV so validation rows always come after training rows
tscv = TimeSeriesSplit(n_splits=5)

# Initialize model
gbr = GradientBoostingRegressor(random_state=42)

# Grid Search
grid_search = GridSearchCV(
    estimator=gbr,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=tscv,
    verbose=2,
    n_jobs=-1
)

# Fit Grid Search on stacked_train, restricted to rows with a genuine
# out-of-fold prediction. The earliest rows are never validated by
# TimeSeriesSplit and still hold the 0.0 placeholder; left in, the first
# search fold would train on nothing but all-zero features.
oof_rows = (stacked_train[["XGB_Pred", "LGB_Pred"]] != 0).any(axis=1).to_numpy()
grid_search.fit(stacked_train.loc[oof_rows], y_train.loc[oof_rows])

print("\n Best Parameters from GridSearchCV:")
print(grid_search.best_params_)

# Predict with best fine-tuned meta-model
final_meta_model = grid_search.best_estimator_
final_stack_pred = final_meta_model.predict(stacked_test)

rmse = np.sqrt(mean_squared_error(y_test, final_stack_pred))
mae = mean_absolute_error(y_test, final_stack_pred)
r2 = r2_score(y_test, final_stack_pred)

print("\n Final Tuned GradientBoosting Meta-Model Evaluation:")
print(f"   RMSE : {rmse:,.2f}")
print(f"   MAE  : {mae:,.2f}")
print(f"   R²   : {r2:.4f}")


### Tune min_samples_split & min_samples_leaf

In [None]:
# Hold the previously chosen values fixed and vary only the two
# leaf/split size constraints
param_grid = {
    'n_estimators': [175],
    'learning_rate': [0.09],
    'max_depth': [6],
    'subsample': [0.75],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Expanding-window CV so validation rows always come after training rows
tscv = TimeSeriesSplit(n_splits=5)

gbr = GradientBoostingRegressor(random_state=42)

# Grid Search
grid_search = GridSearchCV(
    estimator=gbr,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=tscv,
    verbose=2,
    n_jobs=-1
)

# Fit Grid Search on stacked_train, restricted to rows with a genuine
# out-of-fold prediction. The earliest rows are never validated by
# TimeSeriesSplit and still hold the 0.0 placeholder; left in, the first
# search fold would train on nothing but all-zero features.
oof_rows = (stacked_train[["XGB_Pred", "LGB_Pred"]] != 0).any(axis=1).to_numpy()
grid_search.fit(stacked_train.loc[oof_rows], y_train.loc[oof_rows])

# Show best parameters
print("\n Best Parameters from GridSearchCV:")
print(grid_search.best_params_)

# Predict with best fine-tuned meta-model
final_meta_model = grid_search.best_estimator_
final_stack_pred = final_meta_model.predict(stacked_test)

rmse = np.sqrt(mean_squared_error(y_test, final_stack_pred))
mae = mean_absolute_error(y_test, final_stack_pred)
r2 = r2_score(y_test, final_stack_pred)

print("\n Final Tuned GradientBoosting Meta-Model Evaluation:")
print(f"   RMSE : {rmse:,.2f}")
print(f"   MAE  : {mae:,.2f}")
print(f"   R²   : {r2:.4f}")

### With tuned Parameters

In [None]:
# Single-point grid: GridSearchCV is used here only to cross-validate and
# refit the one chosen configuration
param_grid = {
    'n_estimators': [175],
    'learning_rate': [0.09],
    'max_depth': [6],
    'subsample': [0.75],
    'min_samples_split': [2],
    'min_samples_leaf': [1]
}

# Time-aware CV
tscv = TimeSeriesSplit(n_splits=5)

# Initialize model
gbr = GradientBoostingRegressor(random_state=42)

# Grid Search
grid_search = GridSearchCV(
    estimator=gbr,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=tscv,
    verbose=2,
    n_jobs=-1
)

# Fit Grid Search on stacked_train, restricted to rows with a genuine
# out-of-fold prediction. The earliest rows are never validated by
# TimeSeriesSplit and still hold the 0.0 placeholder; left in, the first
# search fold would train on nothing but all-zero features.
oof_rows = (stacked_train[["XGB_Pred", "LGB_Pred"]] != 0).any(axis=1).to_numpy()
grid_search.fit(stacked_train.loc[oof_rows], y_train.loc[oof_rows])

# Show best parameters
print("\n Best Parameters from GridSearchCV:")
print(grid_search.best_params_)

# Predict with best fine-tuned meta-model
final_meta_model = grid_search.best_estimator_
final_stack_pred = final_meta_model.predict(stacked_test)

rmse = np.sqrt(mean_squared_error(y_test, final_stack_pred))
mae = mean_absolute_error(y_test, final_stack_pred)
r2 = r2_score(y_test, final_stack_pred)

print("\n Final Tuned GradientBoosting Meta-Model Evaluation:")
print(f"   RMSE : {rmse:,.2f}")
print(f"   MAE  : {mae:,.2f}")
print(f"   R²   : {r2:.4f}")

### Extended Meta Feature Set

In [None]:
# Initialize arrays
# Out-of-fold prediction buffers, one slot per training row.
# WARNING: TimeSeriesSplit never uses the earliest block of rows as a
# validation fold, so those slots are never written and keep the 0.0 placeholder.
xgb_oof = np.zeros(len(X_train_selected))
lgb_oof = np.zeros(len(X_train_select_LGBM))

# TimeSeriesSplit
# Expanding-window CV: each fold validates on the block following its training rows.
tscv = TimeSeriesSplit(n_splits=5)

# Loop over folds
# Both models are refit in place on every fold; indices from splitting
# X_train_selected are reused for the LGBM frame.
# NOTE(review): assumes both frames hold the same rows in the same order -- confirm.
for train_idx, val_idx in tscv.split(X_train_selected):
    # XGB
    xgb_model.fit(X_train_selected.iloc[train_idx], y_train.iloc[train_idx])
    xgb_oof[val_idx] = xgb_model.predict(X_train_selected.iloc[val_idx])
    
    # LGBM
    lgb_model.fit(X_train_select_LGBM.iloc[train_idx], y_train.iloc[train_idx])
    lgb_oof[val_idx] = lgb_model.predict(X_train_select_LGBM.iloc[val_idx])


In [None]:
# Train meta features: raw out-of-fold predictions plus their mean and difference
stacked_train = pd.DataFrame({"XGB_Pred": xgb_oof, "LGB_Pred": lgb_oof}).assign(
    Avg_Pred=lambda d: (d["XGB_Pred"] + d["LGB_Pred"]) / 2,
    Diff_Pred=lambda d: d["XGB_Pred"] - d["LGB_Pred"],
)

# Retrain base models on full training data before scoring the test set
xgb_model.fit(X_train_selected, y_train)
lgb_model.fit(X_train_select_LGBM, y_train)

# Test meta features, built with the same columns in the same order
xgb_test_pred = xgb_model.predict(X_test_selected)
lgb_test_pred = lgb_model.predict(X_test_select_LGBM)

stacked_test = pd.DataFrame({"XGB_Pred": xgb_test_pred, "LGB_Pred": lgb_test_pred}).assign(
    Avg_Pred=lambda d: (d["XGB_Pred"] + d["LGB_Pred"]) / 2,
    Diff_Pred=lambda d: d["XGB_Pred"] - d["LGB_Pred"],
)

In [None]:
# GradientBoosting meta-model with the previously tuned settings
meta_model = GradientBoostingRegressor(
    learning_rate=0.09,
    max_depth=6,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=175,
    subsample=0.75,
    random_state=42
)

# TimeSeriesSplit never validates on the earliest block of training rows, so
# their out-of-fold slots still hold the 0.0 placeholder from np.zeros().
# Exclude those rows so the meta-model is not fitted on fake all-zero inputs.
oof_rows = (stacked_train[["XGB_Pred", "LGB_Pred"]] != 0).any(axis=1).to_numpy()
meta_model.fit(stacked_train.loc[oof_rows], y_train.loc[oof_rows])

final_stack_pred = meta_model.predict(stacked_test)

rmse = np.sqrt(mean_squared_error(y_test, final_stack_pred))
mae = mean_absolute_error(y_test, final_stack_pred)
r2 = r2_score(y_test, final_stack_pred)

print("\n Final Stacked Model (Extended Features) Evaluation:")
print(f"   RMSE : {rmse:,.2f}")
print(f"   MAE  : {mae:,.2f}")
print(f"   R²   : {r2:.4f}")

In [None]:
def _extended_meta_features(xgb_values, lgb_values):
    """Build the seven-column meta-feature frame from two prediction vectors.

    Columns, in order: XGB_Pred, LGB_Pred, Avg_Pred, Diff_Pred, Ratio_Pred,
    Min_Pred, Max_Pred. The ratio's denominator is offset by 1e-6.
    """
    frame = pd.DataFrame({
        "XGB_Pred": xgb_values,
        "LGB_Pred": lgb_values
    })
    pair = ["XGB_Pred", "LGB_Pred"]
    frame["Avg_Pred"] = (frame["XGB_Pred"] + frame["LGB_Pred"]) / 2
    frame["Diff_Pred"] = frame["XGB_Pred"] - frame["LGB_Pred"]
    frame["Ratio_Pred"] = frame["XGB_Pred"] / (frame["LGB_Pred"] + 1e-6)
    frame["Min_Pred"] = frame[pair].min(axis=1)
    frame["Max_Pred"] = frame[pair].max(axis=1)
    return frame


# Extended meta-features for the train set (from out-of-fold predictions)
stacked_train = _extended_meta_features(xgb_oof, lgb_oof)

# Extended meta-features for the test set (from test predictions)
stacked_test = _extended_meta_features(xgb_test_pred, lgb_test_pred)


print("\n Extended Meta Features (train):")
print(stacked_train.head())


In [None]:
# Use tuned hyperparameters
meta_model = GradientBoostingRegressor(
    learning_rate=0.09,
    max_depth=6,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=175,
    subsample=0.75,
    random_state=42
)

# TimeSeriesSplit never validates on the earliest block of training rows, so
# their out-of-fold slots still hold the 0.0 placeholder from np.zeros().
# Exclude those rows so the meta-model is not fitted on fake all-zero inputs.
oof_rows = (stacked_train[["XGB_Pred", "LGB_Pred"]] != 0).any(axis=1).to_numpy()
meta_model.fit(stacked_train.loc[oof_rows], y_train.loc[oof_rows])

final_stack_pred = meta_model.predict(stacked_test)

rmse = np.sqrt(mean_squared_error(y_test, final_stack_pred))
mae = mean_absolute_error(y_test, final_stack_pred)
r2 = r2_score(y_test, final_stack_pred)

print("\n Final Stacked Model (Extended Meta-Features) Evaluation:")
print(f"   RMSE : {rmse:,.2f}")
print(f"   MAE  : {mae:,.2f}")
print(f"   R²   : {r2:.4f}")


### Randomised Search to improve the result

In [None]:
# Search space for the GradientBoosting meta-model on the extended features
param_dist = {
    "n_estimators": np.arange(100, 1001, 100),
    "learning_rate": np.linspace(0.01, 0.2, 20),
    "max_depth": [3, 5, 7],
    "subsample": [0.6, 0.8, 1.0],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}

# Time Series cross-validation
tscv = TimeSeriesSplit(n_splits=3)

# Initialize the model
gbr = GradientBoostingRegressor(random_state=42)

# RandomizedSearchCV setup
random_search = RandomizedSearchCV(
    estimator=gbr,
    param_distributions=param_dist,
    n_iter=30,
    scoring='neg_root_mean_squared_error',
    cv=tscv,
    random_state=42,
    verbose=2,
    n_jobs=-1
)

# Fit on extended stacked_train, restricted to rows with a genuine
# out-of-fold prediction. The earliest rows were never validated when the
# out-of-fold predictions were generated and still hold the 0.0 placeholder,
# so they would feed fake all-zero features into the search.
oof_rows = (stacked_train[["XGB_Pred", "LGB_Pred"]] != 0).any(axis=1).to_numpy()
random_search.fit(stacked_train.loc[oof_rows], y_train.loc[oof_rows])

# Show best parameters
print("\n Best Parameters from RandomizedSearchCV:")
print(random_search.best_params_)

# Evaluate the best estimator
best_meta_model = random_search.best_estimator_
final_stack_pred = best_meta_model.predict(stacked_test)

# The metric functions are already imported in the notebook's first cell,
# so they are not re-imported here.
rmse = np.sqrt(mean_squared_error(y_test, final_stack_pred))
mae = mean_absolute_error(y_test, final_stack_pred)
r2 = r2_score(y_test, final_stack_pred)

print("\n Tuned GradientBoosting Meta-Model (Extended Features) Evaluation:")
print(f"   RMSE : {rmse:,.2f}")
print(f"   MAE  : {mae:,.2f}")
print(f"   R²   : {r2:.4f}")

### Manually adjusting parameters

In [None]:
# Manual trial: tuned settings with n_estimators raised to 190.
# NOTE(review): these manual trials are compared on test-set metrics, so the
# final test score is optimistically biased -- consider a validation split.
meta_model = GradientBoostingRegressor(
    learning_rate=0.09,
    max_depth=6,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=190,
    subsample=0.75,
    random_state=42
)

# TimeSeriesSplit never validates on the earliest block of training rows, so
# their out-of-fold slots still hold the 0.0 placeholder from np.zeros().
# Exclude those rows so the meta-model is not fitted on fake all-zero inputs.
oof_rows = (stacked_train[["XGB_Pred", "LGB_Pred"]] != 0).any(axis=1).to_numpy()
meta_model.fit(stacked_train.loc[oof_rows], y_train.loc[oof_rows])

final_stack_pred = meta_model.predict(stacked_test)

rmse = np.sqrt(mean_squared_error(y_test, final_stack_pred))
mae = mean_absolute_error(y_test, final_stack_pred)
r2 = r2_score(y_test, final_stack_pred)

print("\n Final Stacked Model (Extended Meta-Features) Evaluation:")
print(f"   RMSE : {rmse:,.2f}")
print(f"   MAE  : {mae:,.2f}")
print(f"   R²   : {r2:.4f}")

In [None]:
# Manual trial: n_estimators=250, max_depth=6, learning_rate=0.09

meta_model = GradientBoostingRegressor(
    learning_rate=0.09,
    max_depth=6,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=250,
    subsample=0.75,
    random_state=42
)

# TimeSeriesSplit never validates on the earliest block of training rows, so
# their out-of-fold slots still hold the 0.0 placeholder from np.zeros().
# Exclude those rows so the meta-model is not fitted on fake all-zero inputs.
oof_rows = (stacked_train[["XGB_Pred", "LGB_Pred"]] != 0).any(axis=1).to_numpy()
meta_model.fit(stacked_train.loc[oof_rows], y_train.loc[oof_rows])

final_stack_pred = meta_model.predict(stacked_test)

rmse = np.sqrt(mean_squared_error(y_test, final_stack_pred))
mae = mean_absolute_error(y_test, final_stack_pred)
r2 = r2_score(y_test, final_stack_pred)

print("\n Final Stacked Model (Extended Meta-Features) Evaluation:")
print(f"   RMSE : {rmse:,.2f}")
print(f"   MAE  : {mae:,.2f}")
print(f"   R²   : {r2:.4f}")

In [None]:
# Manual trial: n_estimators=250, max_depth=7, learning_rate=0.09

meta_model = GradientBoostingRegressor(
    learning_rate=0.09,
    max_depth=7,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=250,
    subsample=0.75,
    random_state=42
)

# TimeSeriesSplit never validates on the earliest block of training rows, so
# their out-of-fold slots still hold the 0.0 placeholder from np.zeros().
# Exclude those rows so the meta-model is not fitted on fake all-zero inputs.
oof_rows = (stacked_train[["XGB_Pred", "LGB_Pred"]] != 0).any(axis=1).to_numpy()
meta_model.fit(stacked_train.loc[oof_rows], y_train.loc[oof_rows])

final_stack_pred = meta_model.predict(stacked_test)

rmse = np.sqrt(mean_squared_error(y_test, final_stack_pred))
mae = mean_absolute_error(y_test, final_stack_pred)
r2 = r2_score(y_test, final_stack_pred)

print("\n Final Stacked Model (Extended Meta-Features) Evaluation:")
print(f"   RMSE : {rmse:,.2f}")
print(f"   MAE  : {mae:,.2f}")
print(f"   R²   : {r2:.4f}")

In [None]:
# Manual trial: n_estimators=500, max_depth=7, learning_rate=0.09
meta_model = GradientBoostingRegressor(
    learning_rate=0.09,
    max_depth=7,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=500,
    subsample=0.75,
    random_state=42
)

# TimeSeriesSplit never validates on the earliest block of training rows, so
# their out-of-fold slots still hold the 0.0 placeholder from np.zeros().
# Exclude those rows so the meta-model is not fitted on fake all-zero inputs.
oof_rows = (stacked_train[["XGB_Pred", "LGB_Pred"]] != 0).any(axis=1).to_numpy()
meta_model.fit(stacked_train.loc[oof_rows], y_train.loc[oof_rows])

final_stack_pred = meta_model.predict(stacked_test)

rmse = np.sqrt(mean_squared_error(y_test, final_stack_pred))
mae = mean_absolute_error(y_test, final_stack_pred)
r2 = r2_score(y_test, final_stack_pred)

print("\n Final Stacked Model (Extended Meta-Features) Evaluation:")
print(f"   RMSE : {rmse:,.2f}")
print(f"   MAE  : {mae:,.2f}")
print(f"   R²   : {r2:.4f}")

In [None]:
# Manual trial: n_estimators=250, max_depth=7, learning_rate=0.1
meta_model = GradientBoostingRegressor(
    learning_rate=0.1,
    max_depth=7,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=250,
    subsample=0.75,
    random_state=42
)

# TimeSeriesSplit never validates on the earliest block of training rows, so
# their out-of-fold slots still hold the 0.0 placeholder from np.zeros().
# Exclude those rows so the meta-model is not fitted on fake all-zero inputs.
oof_rows = (stacked_train[["XGB_Pred", "LGB_Pred"]] != 0).any(axis=1).to_numpy()
meta_model.fit(stacked_train.loc[oof_rows], y_train.loc[oof_rows])

final_stack_pred = meta_model.predict(stacked_test)

rmse = np.sqrt(mean_squared_error(y_test, final_stack_pred))
mae = mean_absolute_error(y_test, final_stack_pred)
r2 = r2_score(y_test, final_stack_pred)

print("\n Final Stacked Model (Extended Meta-Features) Evaluation:")
print(f"   RMSE : {rmse:,.2f}")
print(f"   MAE  : {mae:,.2f}")
print(f"   R²   : {r2:.4f}")

In [None]:
# Manual trial: n_estimators=500, max_depth=7, learning_rate=0.1
meta_model = GradientBoostingRegressor(
    learning_rate=0.1,
    max_depth=7,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=500,
    subsample=0.75,
    random_state=42
)

# TimeSeriesSplit never validates on the earliest block of training rows, so
# their out-of-fold slots still hold the 0.0 placeholder from np.zeros().
# Exclude those rows so the meta-model is not fitted on fake all-zero inputs.
oof_rows = (stacked_train[["XGB_Pred", "LGB_Pred"]] != 0).any(axis=1).to_numpy()
meta_model.fit(stacked_train.loc[oof_rows], y_train.loc[oof_rows])

final_stack_pred = meta_model.predict(stacked_test)

rmse = np.sqrt(mean_squared_error(y_test, final_stack_pred))
mae = mean_absolute_error(y_test, final_stack_pred)
r2 = r2_score(y_test, final_stack_pred)

print("\n Final Stacked Model (Extended Meta-Features) Evaluation:")
print(f"   RMSE : {rmse:,.2f}")
print(f"   MAE  : {mae:,.2f}")
print(f"   R²   : {r2:.4f}")

### Optimised Stack Model

In [None]:
### Save the optimised model ###

# Load saved base models and features
# NOTE: pickle.load can execute arbitrary code -- only load artifacts written by this notebook.
with open("xgb_best_model.pkl", "rb") as f:
    xgb_model = pickle.load(f)

with open("xgb_selected_features.pkl", "rb") as f:
    xgb_features = pickle.load(f)

with open("lgbm_best_model.pkl", "rb") as f:
    lgbm_model = pickle.load(f)

with open("lgbm_selected_features.pkl", "rb") as f:
    lgbm_features = pickle.load(f)


# Prepare training and test sets (each model gets its own column subset)
X_train_xgb = X_train[xgb_features]
X_test_xgb = X_test[xgb_features]
X_train_lgb = X_train[lgbm_features]
X_test_lgb = X_test[lgbm_features]


# Generate Out-of-Fold Predictions
tscv = TimeSeriesSplit(n_splits=5)
xgb_oof = np.zeros(len(X_train_xgb))
lgb_oof = np.zeros(len(X_train_lgb))
# Marks the rows that received a genuine out-of-fold prediction. The earliest
# block is never a validation fold under TimeSeriesSplit, so its slots keep the
# 0.0 placeholder and must not be used to train the meta-model.
has_oof = np.zeros(len(X_train_xgb), dtype=bool)

for train_idx, val_idx in tscv.split(X_train_xgb):
    # Train and predict with XGB
    xgb_model.fit(X_train_xgb.iloc[train_idx], y_train.iloc[train_idx])
    xgb_oof[val_idx] = xgb_model.predict(X_train_xgb.iloc[val_idx])

    # Train and predict with LGB
    lgbm_model.fit(X_train_lgb.iloc[train_idx], y_train.iloc[train_idx])
    lgb_oof[val_idx] = lgbm_model.predict(X_train_lgb.iloc[val_idx])

    has_oof[val_idx] = True


# Build Extended Meta-Features for Train
stacked_train = pd.DataFrame({
    "XGB_Pred": xgb_oof,
    "LGB_Pred": lgb_oof
})
stacked_train["Avg_Pred"] = (stacked_train["XGB_Pred"] + stacked_train["LGB_Pred"]) / 2
stacked_train["Diff_Pred"] = stacked_train["XGB_Pred"] - stacked_train["LGB_Pred"]
stacked_train["Ratio_Pred"] = stacked_train["XGB_Pred"] / (stacked_train["LGB_Pred"] + 1e-6)
stacked_train["Min_Pred"] = stacked_train[["XGB_Pred", "LGB_Pred"]].min(axis=1)
stacked_train["Max_Pred"] = stacked_train[["XGB_Pred", "LGB_Pred"]].max(axis=1)


# Train base models on full training set
# NOTE(review): these refit base models are not re-saved here, while later
# cells reload xgb_best_model.pkl / lgbm_best_model.pkl next to the meta-model
# -- confirm the pickled base models match the ones fitted here.
xgb_model.fit(X_train_xgb, y_train)
lgbm_model.fit(X_train_lgb, y_train)


# Predict base models on test set
xgb_test_pred = xgb_model.predict(X_test_xgb)
lgb_test_pred = lgbm_model.predict(X_test_lgb)


# Build Extended Meta-Features for Test (same columns, same order as train)
stacked_test = pd.DataFrame({
    "XGB_Pred": xgb_test_pred,
    "LGB_Pred": lgb_test_pred
})
stacked_test["Avg_Pred"] = (stacked_test["XGB_Pred"] + stacked_test["LGB_Pred"]) / 2
stacked_test["Diff_Pred"] = stacked_test["XGB_Pred"] - stacked_test["LGB_Pred"]
stacked_test["Ratio_Pred"] = stacked_test["XGB_Pred"] / (stacked_test["LGB_Pred"] + 1e-6)
stacked_test["Min_Pred"] = stacked_test[["XGB_Pred", "LGB_Pred"]].min(axis=1)
stacked_test["Max_Pred"] = stacked_test[["XGB_Pred", "LGB_Pred"]].max(axis=1)


# Train Meta-Model on Extended Features
meta_model = GradientBoostingRegressor(
    learning_rate=0.09,
    max_depth=6,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=190,
    subsample=0.75,
    random_state=42
)

# Fit only on rows that carry real out-of-fold predictions
meta_model.fit(stacked_train.loc[has_oof], y_train.loc[has_oof])


# Predict & Evaluate Final Stacked Model
final_stack_pred = meta_model.predict(stacked_test)

rmse = np.sqrt(mean_squared_error(y_test, final_stack_pred))
mae = mean_absolute_error(y_test, final_stack_pred)
r2 = r2_score(y_test, final_stack_pred)

print("\n Final Stacked Model (Extended Meta-Features) Evaluation:")
print(f"   RMSE : {rmse:,.2f}")
print(f"   MAE  : {mae:,.2f}")
print(f"   R²   : {r2:.4f}")


# Save Final Meta Model
with open("stacked_meta_model.pkl", "wb") as f:
    pickle.dump(meta_model, f)

print("Final stacked meta-model saved as 'stacked_meta_model.pkl'")


In [None]:
# Reload every persisted artifact: both base models, the meta-model and the
# feature list that belongs to each base model
with open("xgb_best_model.pkl", "rb") as f:
    xgb_model = pickle.load(f)

with open("lgbm_best_model.pkl", "rb") as f:
    lgbm_model = pickle.load(f)

with open("stacked_meta_model.pkl", "rb") as f:
    meta_model = pickle.load(f)

with open("xgb_selected_features.pkl", "rb") as f:
    xgb_features = pickle.load(f)

with open("lgbm_selected_features.pkl", "rb") as f:
    lgbm_features = pickle.load(f)

# Column subsets of the test frame for each base model
X_test_xgb = X_test[xgb_features]
X_test_lgb = X_test[lgbm_features]

# Base predictions on the test set
xgb_pred = xgb_model.predict(X_test_xgb)
lgb_pred = lgbm_model.predict(X_test_lgb)

# Meta features: the two base predictions plus five derived columns, in the
# order XGB, LGB, Avg, Diff, Ratio, Min, Max
stacked_test = pd.DataFrame({
    "XGB_Pred": xgb_pred,
    "LGB_Pred": lgb_pred,
    "Avg_Pred": (xgb_pred + lgb_pred) / 2,
    "Diff_Pred": xgb_pred - lgb_pred,
    "Ratio_Pred": xgb_pred / (lgb_pred + 1e-6),
    "Min_Pred": np.minimum(xgb_pred, lgb_pred),
    "Max_Pred": np.maximum(xgb_pred, lgb_pred),
})

# Final stacked prediction and its test-set metrics
final_pred = meta_model.predict(stacked_test)

rmse, mae, r2 = (
    np.sqrt(mean_squared_error(y_test, final_pred)),
    mean_absolute_error(y_test, final_pred),
    r2_score(y_test, final_pred),
)

print("\n Final Stacked Model (Extended Meta-Features) Evaluation:")
print(f"   RMSE : {rmse:,.2f}")
print(f"   MAE  : {mae:,.2f}")
print(f"   R²   : {r2:.4f}")


# Comparison of findings

In [None]:
# --- Load models and selected features ---

# XGBoost
with open("xgb_best_model.pkl", "rb") as f:
    xgb_model = pickle.load(f)
with open("xgb_selected_features.pkl", "rb") as f:
    xgb_features = pickle.load(f)
X_test_xgb = X_test[xgb_features]
xgb_pred = xgb_model.predict(X_test_xgb)

# LightGBM
with open("lgbm_best_model.pkl", "rb") as f:
    lgbm_model = pickle.load(f)
with open("lgbm_selected_features.pkl", "rb") as f:
    lgbm_features = pickle.load(f)
X_test_lgb = X_test[lgbm_features]
lgb_pred = lgbm_model.predict(X_test_lgb)

# Weighted Blend
with open("weighted_blended_model.pkl", "rb") as f:
    blended_model = pickle.load(f)
y_pred_blended = blended_model.predict(X_test)

# Stacked Model
with open("stacked_meta_model.pkl", "rb") as f:
    meta_model = pickle.load(f)

stacked_test = pd.DataFrame({
    "XGB_Pred": xgb_pred,
    "LGB_Pred": lgb_pred,
    "Avg_Pred": (xgb_pred + lgb_pred) / 2,
    "Diff_Pred": xgb_pred - lgb_pred,
    "Ratio_Pred": xgb_pred / (lgb_pred + 1e-6),
    "Min_Pred": np.minimum(xgb_pred, lgb_pred),
    "Max_Pred": np.maximum(xgb_pred, lgb_pred)
})
final_pred = meta_model.predict(stacked_test)

# --- Evaluate all models ---
def evaluate(y_true, y_pred):
    """Score predictions against the ground truth.

    Returns a `(rmse, mae, r2)` tuple, where RMSE is the square root of
    scikit-learn's mean squared error.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return (
        np.sqrt(squared_error),
        mean_absolute_error(y_true, y_pred),
        r2_score(y_true, y_pred),
    )

# One entry per model; the prediction list below follows the same order as the labels.
model_labels = ["XGBoost", "LightGBM", "Weighted Blend", "Stacked Model"]
model_predictions = [xgb_pred, lgb_pred, y_pred_blended, final_pred]

results = {"Model": model_labels, "RMSE": [], "MAE": [], "R²": []}

# Score every model against the shared test target
for pred in model_predictions:
    rmse, mae, r2 = evaluate(y_test, pred)
    for metric_name, metric_value in (("RMSE", rmse), ("MAE", mae), ("R²", r2)):
        results[metric_name].append(metric_value)

results_df = pd.DataFrame(results)

# Turn the metric columns into display strings (thousands separators / fixed decimals)
metric_formats = {"RMSE": "{:,.2f}", "MAE": "{:,.2f}", "R²": "{:.4f}"}
for metric_name, template in metric_formats.items():
    results_df[metric_name] = results_df[metric_name].map(template.format)

print("\nModel Performance Summary:")
print(results_df)


### Plots

In [None]:
# Reload the persisted base models, their feature lists and the stacking meta-model
def _read_pickle(path):
    """Return the object unpickled from `path` (only use on files you produced)."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


xgb_model = _read_pickle("xgb_best_model.pkl")
xgb_features = _read_pickle("xgb_selected_features.pkl")
lgbm_model = _read_pickle("lgbm_best_model.pkl")
lgbm_features = _read_pickle("lgbm_selected_features.pkl")
meta_model = _read_pickle("stacked_meta_model.pkl")

# Work on a copy with a datetime TXNDATE and a clean 0..n-1 index so that the
# prediction Series created below line up with the rows by position.
test_df = (
    test_df
    .assign(TXNDATE=pd.to_datetime(test_df["TXNDATE"]))
    .reset_index(drop=True)
)

# Model-specific feature views and the positional target
X_test_xgb = test_df[xgb_features]
X_test_lgb = test_df[lgbm_features]
y_test_series = pd.Series(y_test).reset_index(drop=True)

# Base-model predictions
y_pred_xgb = pd.Series(xgb_model.predict(X_test_xgb)).reset_index(drop=True)
y_pred_lgb = pd.Series(lgbm_model.predict(X_test_lgb)).reset_index(drop=True)

# Fixed-weight blend: 60% XGB + 40% LGBM
y_pred_blend = 0.6 * y_pred_xgb + 0.4 * y_pred_lgb

# Meta-features fed to the stacked model
stacked_test = pd.DataFrame({"XGB_Pred": y_pred_xgb, "LGB_Pred": y_pred_lgb}).assign(
    Avg_Pred=(y_pred_xgb + y_pred_lgb) / 2,
    Diff_Pred=y_pred_xgb - y_pred_lgb,
    Ratio_Pred=y_pred_xgb / (y_pred_lgb + 1e-6),  # epsilon guards against division by zero
    Min_Pred=np.minimum(y_pred_xgb, y_pred_lgb),
    Max_Pred=np.maximum(y_pred_xgb, y_pred_lgb),
)

# Stacked prediction
y_pred_stack = pd.Series(meta_model.predict(stacked_test)).reset_index(drop=True)

# Attach the target and every prediction to the test frame
prediction_columns = {
    "Actual": y_test_series,
    "XGB_Pred": y_pred_xgb,
    "LGB_Pred": y_pred_lgb,
    "Blended_Pred": y_pred_blend,
    "Stacked_Pred": y_pred_stack,
}
for column_name, column_values in prediction_columns.items():
    test_df[column_name] = column_values


# Actual vs. predicted series for one branch over the whole test frame
branch_id = 210
branch_df = test_df.loc[test_df["BRANCHID"] == branch_id].sort_values("TXNDATE")

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(branch_df["TXNDATE"], branch_df["Actual"], label="Actual", color="black", linewidth=2)

# (column, legend label, colour) for each dashed prediction line, in drawing order
prediction_lines = [
    ("XGB_Pred", "XGB", "dodgerblue"),
    ("LGB_Pred", "LightGBM", "green"),
    ("Blended_Pred", "Blended (60% XGB + 40% LGBM)", "purple"),
    ("Stacked_Pred", "Stacked", "red"),
]
for column_name, legend_label, line_color in prediction_lines:
    ax.plot(branch_df["TXNDATE"], branch_df[column_name],
            label=legend_label, color=line_color, linestyle="--")

ax.set_title(f"Actual vs Predicted NetCashFlow — BranchID = {branch_id}")
ax.set_xlabel("Date")
ax.set_ylabel("NetCashFlow")
ax.legend()
ax.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

### ML Prediction for a given date range

In [None]:
# Reload the persisted base models, their feature lists and the stacking meta-model
def _read_pickle(path):
    """Return the object unpickled from `path` (only use on files you produced)."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


xgb_model = _read_pickle("xgb_best_model.pkl")
xgb_features = _read_pickle("xgb_selected_features.pkl")
lgbm_model = _read_pickle("lgbm_best_model.pkl")
lgbm_features = _read_pickle("lgbm_selected_features.pkl")
meta_model = _read_pickle("stacked_meta_model.pkl")

# Work on a copy with a datetime TXNDATE and a clean 0..n-1 index so that the
# prediction Series created below line up with the rows by position.
test_df = (
    test_df
    .assign(TXNDATE=pd.to_datetime(test_df["TXNDATE"]))
    .reset_index(drop=True)
)

# Model-specific feature views and the positional target
X_test_xgb = test_df[xgb_features]
X_test_lgb = test_df[lgbm_features]
y_test_series = pd.Series(y_test).reset_index(drop=True)

# Base-model predictions
y_pred_xgb = pd.Series(xgb_model.predict(X_test_xgb)).reset_index(drop=True)
y_pred_lgb = pd.Series(lgbm_model.predict(X_test_lgb)).reset_index(drop=True)

# Fixed-weight blend: 60% XGB + 40% LGBM
y_pred_blend = 0.6 * y_pred_xgb + 0.4 * y_pred_lgb

# Meta-features fed to the stacked model
stacked_test = pd.DataFrame({"XGB_Pred": y_pred_xgb, "LGB_Pred": y_pred_lgb}).assign(
    Avg_Pred=(y_pred_xgb + y_pred_lgb) / 2,
    Diff_Pred=y_pred_xgb - y_pred_lgb,
    Ratio_Pred=y_pred_xgb / (y_pred_lgb + 1e-6),  # epsilon guards against division by zero
    Min_Pred=np.minimum(y_pred_xgb, y_pred_lgb),
    Max_Pred=np.maximum(y_pred_xgb, y_pred_lgb),
)

# Stacked prediction
y_pred_stack = pd.Series(meta_model.predict(stacked_test)).reset_index(drop=True)

# Attach the target and every prediction to the test frame
prediction_columns = {
    "Actual": y_test_series,
    "XGB_Pred": y_pred_xgb,
    "LGB_Pred": y_pred_lgb,
    "Blended_Pred": y_pred_blend,
    "Stacked_Pred": y_pred_stack,
}
for column_name, column_values in prediction_columns.items():
    test_df[column_name] = column_values

# Branch and inclusive date window to plot
branch_id = 21
start_date = "2025-03-04"
end_date = "2025-03-10"

# Rows of the chosen branch, in chronological order
branch_df = test_df[test_df["BRANCHID"] == branch_id].copy()
branch_df = branch_df.sort_values("TXNDATE")

# Keep only the rows whose date lies in [start_date, end_date]
mask = branch_df["TXNDATE"].between(pd.to_datetime(start_date), pd.to_datetime(end_date))
branch_df = branch_df.loc[mask]

# Plot: solid black for the actuals, one dashed line per model
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(branch_df["TXNDATE"], branch_df["Actual"], label="Actual", color="black", linewidth=2)

prediction_lines = [
    ("XGB_Pred", "XGB", "dodgerblue"),
    ("LGB_Pred", "LightGBM", "green"),
    ("Blended_Pred", "Blended (60% XGB + 40% LGBM)", "purple"),
    ("Stacked_Pred", "Stacked", "red"),
]
for column_name, legend_label, line_color in prediction_lines:
    ax.plot(branch_df["TXNDATE"], branch_df[column_name],
            label=legend_label, color=line_color, linestyle="--")

ax.set_title(f"Actual vs Predicted NetCashFlow — BranchID = {branch_id}\n({start_date} to {end_date})")
ax.set_xlabel("Date")
ax.set_ylabel("NetCashFlow")
ax.legend()
ax.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Date and branch to inspect
query_date = pd.to_datetime("2025-03-20")
branch_id = 210

# Rows of test_df for that branch on that exact date
subset = test_df[
    (test_df["BRANCHID"] == branch_id) &
    (test_df["TXNDATE"] == query_date)
]

if subset.empty:
    print(f"No data found for BranchID={branch_id} on {query_date.date()}")
else:
    # (legend label, source column, colour) for each point.
    # The values are read straight from the matching row instead of being stored in
    # `xgb_pred` / `lgb_pred`: those names hold the full prediction arrays used by the
    # comparison cell, and rebinding them to scalars here silently corrupted that state.
    point_specs = [
        ("Actual", "Actual", "black"),
        ("XGB", "XGB_Pred", "dodgerblue"),
        ("LightGBM", "LGB_Pred", "green"),
        ("Blended", "Blended_Pred", "purple"),
        ("Stacked", "Stacked_Pred", "red"),
    ]
    labels = [label for label, _, _ in point_specs]
    # If several rows match, the first one is shown (same as before).
    values = [subset[column].values[0] for _, column, _ in point_specs]
    colors = [color for _, _, color in point_specs]

    fig, ax = plt.subplots(figsize=(8, 5))
    # Plot each point individually so every model gets its own legend entry
    for label, val, color in zip(labels, values, colors):
        ax.scatter(label, val, color=color, s=150, label=label)
        ax.text(label, val, f"{val:,.0f}", ha='center', va='bottom', fontsize=12)

    ax.set_title(f"NetCashFlow on {query_date.date()} for BranchID {branch_id}")
    ax.set_ylabel("Amount")
    ax.grid(axis='y')
    ax.legend()
    plt.show()


### Predicted values in a table

In [None]:

# Date window (inclusive on both ends) and branch to tabulate
start_date = pd.to_datetime("2025-03-10")
end_date = pd.to_datetime("2025-03-15")
branch_id = 210

# Rows of the chosen branch inside the date window
subset = test_df[
    (test_df["BRANCHID"] == branch_id) &
    (test_df["TXNDATE"] >= start_date) &
    (test_df["TXNDATE"] <= end_date)
].copy()

# Show the table, or say so when nothing matches
if subset.empty:
    print(f"No data found for BranchID={branch_id} between {start_date.date()} and {end_date.date()}")
else:
    # Keep only the columns of interest and give them reader-friendly names
    display_df = subset[[
        "TXNDATE", "Actual", "XGB_Pred", "LGB_Pred", "Blended_Pred", "Stacked_Pred"
    ]].rename(columns={
        "TXNDATE": "Date",
        "XGB_Pred": "XGBoost",
        "LGB_Pred": "LightGBM",
        "Blended_Pred": "Blended",
        "Stacked_Pred": "Stacked"
    })

    # Chronological order, with the time-of-day component stripped from the date
    display_df = display_df.sort_values("Date")
    display_df["Date"] = display_df["Date"].dt.date

    # String copy for presentation: thousands separators, no decimals.
    # display_df keeps the numeric values for any further computation.
    formatted_df = display_df.copy()
    for col in ["Actual", "XGBoost", "LightGBM", "Blended", "Stacked"]:
        formatted_df[col] = formatted_df[col].map("{:,.0f}".format)

    # `display` comes from the notebook's import cell (from IPython.display import display)
    display(formatted_df)


# Part 02

## Part 02 - LSTM

### Check Data Types and Convert

In [None]:
# Seed every global RNG used below (Python, NumPy, TensorFlow) with one shared value
SEED = 42
os.environ['PYTHONHASHSEED'] = str(SEED)
for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    seed_fn(SEED)

In [None]:
# Start Part 02 from an independent copy of the engineered frame
df_step1 = df.copy()

# TXNDATE as datetime (day-first parsing)
df_step1['TXNDATE'] = pd.to_datetime(df_step1['TXNDATE'], dayfirst=True)

# Label-like columns become pandas 'category' dtype in a single astype call
categorical_cols = ['BRANCH', 'DISTRICT', 'PROVINCE', 'CODE']
df_step1 = df_step1.astype({name: 'category' for name in categorical_cols})

print(df_step1.dtypes)

### Missing values

In [None]:
# Drop every row that has a missing value in any column listed in `rolling_lag_cols`,
# then renumber the index from 0.
# NOTE(review): `rolling_lag_cols` is defined outside this section — presumably the
# rolling/lag feature columns, whose first rows per branch are NaN; confirm.
df_step1 = df_step1.dropna(subset=rolling_lag_cols).reset_index(drop=True)

In [None]:
df_step1.shape

### Drop 'BRANCH' column

In [None]:
# Keep a BRANCHID -> BRANCH lookup so branch names stay recoverable after the column is dropped
branch_pairs = df_step1[['BRANCHID', 'BRANCH']].drop_duplicates()
branch_mapping = branch_pairs.set_index('BRANCHID')['BRANCH'].to_dict()

# The model identifies branches by BRANCHID only
df_step2 = df_step1.drop(columns='BRANCH')

In [None]:
df_step2.head()

### Categorical Encoding and Binary conversion

In [None]:
# Categorical columns that get one-hot encoded
cat_cols = ['DISTRICT', 'PROVINCE']

# Flag columns that must be plain 0/1 integers
binary_cols = ['IsWeekend', 'IsHoliday', 'IsNonWorkingDay', 'IsMonthStart', 'IsMonthEnd', 'IsFirst5Days', 'IsLast5Days']

# Cast all flag columns in one assignment
df_step2[binary_cols] = df_step2[binary_cols].astype(int)

# One-hot encode; drop_first=True removes one level per column to avoid multicollinearity
df_encoded = pd.get_dummies(df_step2, columns=cat_cols, drop_first=True)

### Data Split

In [None]:
# TXNDATE must be datetime; order rows chronologically, then by branch
df_encoded["TXNDATE"] = pd.to_datetime(df_encoded["TXNDATE"], dayfirst=True)
df_encoded = df_encoded.sort_values(["TXNDATE", "BRANCHID"]).reset_index(drop=True)

# Date thresholds: train up to end of 2024, test on Q1 2025 (both ends inclusive)
train_end = pd.Timestamp("2024-12-31")
test_start = pd.Timestamp("2025-01-01")
test_end = pd.Timestamp("2025-03-31")

in_train_period = df_encoded["TXNDATE"] <= train_end
in_test_period = df_encoded["TXNDATE"].between(test_start, test_end)

train_df = df_encoded[in_train_period].copy()
test_df = df_encoded[in_test_period].copy()

print("Train dates:", train_df["TXNDATE"].min(), "to", train_df["TXNDATE"].max())
print("Test dates:", test_df["TXNDATE"].min(), "to", test_df["TXNDATE"].max())
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

# Columns removed from the model inputs, in both splits
drop_cols = [
    "NetCashFlow_Per_Customer",
    "TOTALTXNAMOUNT"
]
train_df, test_df = (split.drop(columns=drop_cols) for split in (train_df, test_df))

# Target column
target_col = "NetCashFlow"

# Features = everything except the target; the target is kept as its own Series
y_train = train_df[target_col]
X_train_full = train_df.drop(columns=[target_col])

y_test = test_df[target_col]
X_test_full = test_df.drop(columns=[target_col])

print("X_train columns:", X_train_full.columns.tolist())
print("X_train shape:", X_train_full.shape)
print("y_train shape:", y_train.shape)

### Correlation Check

In [None]:
# Give features and target the same 0..n-1 index so they can be joined by position
X_train_full_reset, y_train_reset = (
    part.reset_index(drop=True) for part in (X_train_full, y_train)
)

# Side-by-side frame: feature columns followed by the target
train_merged = pd.concat([X_train_full_reset, y_train_reset], axis="columns")

# Confirm columns
print(train_merged.columns)

In [None]:
# Datetime TXNDATE, rows grouped per branch in chronological order, index renumbered from 0
train_merged = (
    train_merged
    .assign(TXNDATE=pd.to_datetime(train_merged["TXNDATE"]))
    .sort_values(["BRANCHID", "TXNDATE"])
    .reset_index(drop=True)
)

In [None]:
# LSTM inputs: every column except the date, the branch id and the target
cols_to_exclude = ["TXNDATE", "BRANCHID", "NetCashFlow"]
numeric_cols = list(
    filter(lambda name: name not in cols_to_exclude, train_merged.columns)
)
print("Numeric features for LSTM:", numeric_cols)

### Sequence length = 7 - Without log & Scaling

In [None]:
# Set Random Seed for Reproducibility
def set_seed(seed=42):
    """Seed Python's `random`, NumPy and TensorFlow global RNGs and export PYTHONHASHSEED."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

set_seed(42)


# Sequence Generator Function
def generate_sequences(data, seq_len, numeric_cols, target_col="NetCashFlow"):
    """Build per-branch sliding windows for next-step prediction.

    For every BRANCHID group (rows ordered by TXNDATE), each run of `seq_len`
    consecutive rows becomes one input window and the `target_col` value of the
    row that follows it becomes the label. Windows never span two branches.

    Parameters
    ----------
    data : DataFrame with "BRANCHID", "TXNDATE", `numeric_cols` and `target_col`.
    seq_len : number of rows per window.
    numeric_cols : feature columns placed in each window.
    target_col : column holding the value to predict.

    Returns
    -------
    (X, branch_ids, y) as float32 `(n, seq_len, n_features)`, int32 `(n, 1)` and
    float32 `(n,)` arrays. When no branch has more than `seq_len` rows the
    arrays are empty but keep those shapes (n == 0).
    """
    numeric_cols = list(numeric_cols)
    X_num_seq, X_branch_seq, y_seq = [], [], []

    for branch_id, group in data.groupby("BRANCHID"):
        group = group.sort_values("TXNDATE")

        # Convert once per branch; slicing NumPy arrays in the loop is far cheaper
        # than a pandas .loc lookup per window.
        feature_matrix = group[numeric_cols].to_numpy(dtype=np.float32)
        target_vector = group[target_col].to_numpy(dtype=np.float32)

        for start in range(len(group) - seq_len):
            X_num_seq.append(feature_matrix[start:start + seq_len])
            X_branch_seq.append(branch_id)
            y_seq.append(target_vector[start + seq_len])

    if not X_num_seq:
        # Keep the rank/shape a model expects even when there is nothing to emit
        return (
            np.empty((0, seq_len, len(numeric_cols)), dtype=np.float32),
            np.empty((0, 1), dtype=np.int32),
            np.empty((0,), dtype=np.float32),
        )

    return (
        np.array(X_num_seq, dtype=np.float32),
        np.array(X_branch_seq).reshape(-1, 1).astype(np.int32),
        np.array(y_seq, dtype=np.float32)
    )

    
# Define Sequence Length and Columns
# Each sample is a window of SEQ_LEN consecutive rows of one branch; the label is the
# NetCashFlow of the row right after the window (see generate_sequences).
SEQ_LEN = 7
# Feature columns: everything in X_train_full except the date and the branch id.
# ("NetCashFlow" is listed defensively; the target was already removed from X_train_full.)
numeric_cols = [c for c in X_train_full.columns if c not in ["TXNDATE", "BRANCHID", "NetCashFlow"]]


# Prepare Train Data
# X_train_full and y_train share the same row order, so resetting both indexes and
# concatenating column-wise re-attaches the target to its features.
train_merged = pd.concat([X_train_full.reset_index(drop=True), y_train.reset_index(drop=True)], axis=1)
train_merged["TXNDATE"] = pd.to_datetime(train_merged["TXNDATE"])
train_merged = train_merged.sort_values(["BRANCHID", "TXNDATE"]).reset_index(drop=True)

X_num_seq, X_branch_seq, y_seq = generate_sequences(train_merged, SEQ_LEN, numeric_cols)


# Prepare Test Data
# Force the test features into the training column layout; a training column that is
# absent from the test frame is created and filled with 0.
X_test_aligned = (
    X_test_full.drop(columns=["TXNDATE"])
    .reindex(columns=[c for c in X_train_full.columns if c != "TXNDATE"], fill_value=0)
)
test_merged = pd.concat([
    X_test_aligned.reset_index(drop=True),
    X_test_full["TXNDATE"].reset_index(drop=True),
    y_test.reset_index(drop=True)
], axis=1)
test_merged["TXNDATE"] = pd.to_datetime(test_merged["TXNDATE"])
test_merged = test_merged.sort_values(["BRANCHID", "TXNDATE"]).reset_index(drop=True)

# Test windows are built from test-period rows only, so the first SEQ_LEN rows of each
# branch in the test period serve as context and get no prediction.
X_num_seq_test, X_branch_seq_test, y_seq_test = generate_sequences(test_merged, SEQ_LEN, numeric_cols)


# Model Architecture
# Raw BRANCHID values index the embedding table directly, so the table needs
# (largest id seen in either split) + 1 rows.
n_branches = int(max(X_branch_seq.max(), X_branch_seq_test.max())) + 1

# Inputs
seq_input = Input(shape=(SEQ_LEN, len(numeric_cols)), name="seq_input")
branch_input = Input(shape=(1,), name="branch_input")

# Branch Embedding
# The embedding output carries a length-1 sequence axis; [:, 0, :] removes it before the Dense layer.
branch_embedding = Embedding(input_dim=n_branches, output_dim=8, name="branch_embedding")(branch_input)
branch_embedding_flat = Dense(8, activation="relu")(branch_embedding[:, 0, :])

# LSTM Path
# return_sequences=False: only the final hidden state summarises the window.
lstm_out = LSTM(64, return_sequences=False)(seq_input)

# Merge
concat = Concatenate()([lstm_out, branch_embedding_flat])

# Dense Layers
x = Dense(64, activation="relu")(concat)
x = Dropout(0.3)(x)
x = Dense(32, activation="relu")(x)
# Linear output: a single unbounded regression value on the raw target scale
output = Dense(1, name="output")(x)

model = Model(inputs=[seq_input, branch_input], outputs=output)
model.compile(optimizer="adam", loss="mse", metrics=["mae"])
model.summary()


# Train Model
# NOTE(review): validation_split takes its samples from the end of the arrays, and the
# sequences are ordered by BRANCHID, so validation presumably covers only the last
# branches — confirm this is intended.
history = model.fit(
    [X_num_seq, X_branch_seq],
    y_seq,
    epochs=20,
    batch_size=64,
    validation_split=0.1,
    verbose=1
)


# Predict & Evaluate
# Errors are reported in the original NetCashFlow units.
y_pred = model.predict([X_num_seq_test, X_branch_seq_test])

mae_test = mean_absolute_error(y_seq_test, y_pred)
rmse_test = np.sqrt(mean_squared_error(y_seq_test, y_pred))

print(f"\n Test MAE: {mae_test:,.2f}")
print(f" Test RMSE: {rmse_test:,.2f}")


### Rescale The Target

In [None]:
# Standardize y (Target Variable)
# The scaler is fitted on the training labels only and then reused for the test labels.
y_scaler = StandardScaler()

y_train_scaled = y_scaler.fit_transform(y_seq.reshape(-1, 1)).flatten()
y_test_scaled = y_scaler.transform(y_seq_test.reshape(-1, 1)).flatten()


def build_lstm_model(seq_len, n_features, n_branches):
    """Return a freshly initialised, compiled LSTM + branch-embedding regressor.

    Same architecture as the unscaled experiment: an LSTM(64) over the feature window,
    an 8-dimensional BRANCHID embedding passed through Dense(8), then Dense(64) ->
    Dropout(0.3) -> Dense(32) -> Dense(1), compiled with Adam / MSE (MAE as metric).
    """
    seq_in = Input(shape=(seq_len, n_features), name="seq_input")
    branch_in = Input(shape=(1,), name="branch_input")

    branch_vec = Embedding(input_dim=n_branches, output_dim=8, name="branch_embedding")(branch_in)
    branch_vec = Dense(8, activation="relu")(branch_vec[:, 0, :])

    window_summary = LSTM(64, return_sequences=False)(seq_in)

    hidden = Concatenate()([window_summary, branch_vec])
    hidden = Dense(64, activation="relu")(hidden)
    hidden = Dropout(0.3)(hidden)
    hidden = Dense(32, activation="relu")(hidden)
    prediction = Dense(1, name="output")(hidden)

    lstm_model = Model(inputs=[seq_in, branch_in], outputs=prediction)
    lstm_model.compile(optimizer="adam", loss="mse", metrics=["mae"])
    return lstm_model


# Train Model on Scaled Target
# Start from a NEW network. Calling fit() on the existing `model` would continue from
# weights (and optimizer state) already fitted to the raw-scale target in the previous
# cell, so the scaled-target result would not be an independent experiment.
set_seed(42)
model = build_lstm_model(SEQ_LEN, len(numeric_cols), n_branches)

history = model.fit(
    [X_num_seq, X_branch_seq],
    y_train_scaled,
    epochs=20,
    batch_size=64,
    validation_split=0.1,
    verbose=1
)


# Predict & Inverse Transform
# Predictions come out in standardized units; map them back to NetCashFlow units.
y_pred_scaled = model.predict([X_num_seq_test, X_branch_seq_test])
y_pred = y_scaler.inverse_transform(y_pred_scaled)


# Evaluate Performance (original units, comparable with the unscaled run)
mae_test = mean_absolute_error(y_seq_test, y_pred)
rmse_test = np.sqrt(mean_squared_error(y_seq_test, y_pred))

print(f" Test MAE: {mae_test:,.2f}")
print(f" Test RMSE: {rmse_test:,.2f}")


### Sequence length = 30 - Without log & Scaling

In [None]:
# Set Random Seed for Reproducibility
def set_seed(seed=42):
    """Seed Python's `random`, NumPy and TensorFlow global RNGs and export PYTHONHASHSEED."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

set_seed(42)


# Sequence Generator Function
def generate_sequences(data, seq_len, numeric_cols, target_col="NetCashFlow"):
    """Build per-branch sliding windows for next-step prediction.

    For every BRANCHID group (rows ordered by TXNDATE), each run of `seq_len`
    consecutive rows becomes one input window and the `target_col` value of the
    row that follows it becomes the label. Windows never span two branches.

    Parameters
    ----------
    data : DataFrame with "BRANCHID", "TXNDATE", `numeric_cols` and `target_col`.
    seq_len : number of rows per window.
    numeric_cols : feature columns placed in each window.
    target_col : column holding the value to predict.

    Returns
    -------
    (X, branch_ids, y) as float32 `(n, seq_len, n_features)`, int32 `(n, 1)` and
    float32 `(n,)` arrays. When no branch has more than `seq_len` rows the
    arrays are empty but keep those shapes (n == 0).
    """
    numeric_cols = list(numeric_cols)
    X_num_seq, X_branch_seq, y_seq = [], [], []

    for branch_id, group in data.groupby("BRANCHID"):
        group = group.sort_values("TXNDATE")

        # Convert once per branch; slicing NumPy arrays in the loop is far cheaper
        # than a pandas .loc lookup per window.
        feature_matrix = group[numeric_cols].to_numpy(dtype=np.float32)
        target_vector = group[target_col].to_numpy(dtype=np.float32)

        for start in range(len(group) - seq_len):
            X_num_seq.append(feature_matrix[start:start + seq_len])
            X_branch_seq.append(branch_id)
            y_seq.append(target_vector[start + seq_len])

    if not X_num_seq:
        # Keep the rank/shape a model expects even when there is nothing to emit
        return (
            np.empty((0, seq_len, len(numeric_cols)), dtype=np.float32),
            np.empty((0, 1), dtype=np.int32),
            np.empty((0,), dtype=np.float32),
        )

    return (
        np.array(X_num_seq, dtype=np.float32),
        np.array(X_branch_seq).reshape(-1, 1).astype(np.int32),
        np.array(y_seq, dtype=np.float32)
    )


# Define Sequence Length and Columns
# Each sample is a window of SEQ_LEN consecutive rows of one branch; the label is the
# NetCashFlow of the row right after the window (see generate_sequences).
SEQ_LEN = 30
# Feature columns: everything in X_train_full except the date and the branch id.
# ("NetCashFlow" is listed defensively; the target was already removed from X_train_full.)
numeric_cols = [c for c in X_train_full.columns if c not in ["TXNDATE", "BRANCHID", "NetCashFlow"]]


# Prepare Train Data
# X_train_full and y_train share the same row order, so resetting both indexes and
# concatenating column-wise re-attaches the target to its features.
train_merged = pd.concat([X_train_full.reset_index(drop=True), y_train.reset_index(drop=True)], axis=1)
train_merged["TXNDATE"] = pd.to_datetime(train_merged["TXNDATE"])
train_merged = train_merged.sort_values(["BRANCHID", "TXNDATE"]).reset_index(drop=True)

X_num_seq, X_branch_seq, y_seq = generate_sequences(train_merged, SEQ_LEN, numeric_cols)


# Prepare Test Data
# Force the test features into the training column layout; a training column that is
# absent from the test frame is created and filled with 0.
X_test_aligned = (
    X_test_full.drop(columns=["TXNDATE"])
    .reindex(columns=[c for c in X_train_full.columns if c != "TXNDATE"], fill_value=0)
)
test_merged = pd.concat([
    X_test_aligned.reset_index(drop=True),
    X_test_full["TXNDATE"].reset_index(drop=True),
    y_test.reset_index(drop=True)
], axis=1)
test_merged["TXNDATE"] = pd.to_datetime(test_merged["TXNDATE"])
test_merged = test_merged.sort_values(["BRANCHID", "TXNDATE"]).reset_index(drop=True)

# Test windows are built from test-period rows only, so the first SEQ_LEN rows of each
# branch in the test period serve as context and get no prediction.
X_num_seq_test, X_branch_seq_test, y_seq_test = generate_sequences(test_merged, SEQ_LEN, numeric_cols)


# Model Architecture
# Raw BRANCHID values index the embedding table directly, so the table needs
# (largest id seen in either split) + 1 rows.
n_branches = int(max(X_branch_seq.max(), X_branch_seq_test.max())) + 1

# Inputs
seq_input = Input(shape=(SEQ_LEN, len(numeric_cols)), name="seq_input")
branch_input = Input(shape=(1,), name="branch_input")

# Branch Embedding
# The embedding output carries a length-1 sequence axis; [:, 0, :] removes it before the Dense layer.
branch_embedding = Embedding(input_dim=n_branches, output_dim=8, name="branch_embedding")(branch_input)
branch_embedding_flat = Dense(8, activation="relu")(branch_embedding[:, 0, :])

# LSTM Path
# return_sequences=False: only the final hidden state summarises the window.
lstm_out = LSTM(64, return_sequences=False)(seq_input)

# Merge
concat = Concatenate()([lstm_out, branch_embedding_flat])

# Dense Layers
x = Dense(64, activation="relu")(concat)
x = Dropout(0.3)(x)
x = Dense(32, activation="relu")(x)
# Linear output: a single unbounded regression value on the raw target scale
output = Dense(1, name="output")(x)

model = Model(inputs=[seq_input, branch_input], outputs=output)
model.compile(optimizer="adam", loss="mse", metrics=["mae"])
model.summary()


# Train Model
# NOTE(review): validation_split takes its samples from the end of the arrays, and the
# sequences are ordered by BRANCHID, so validation presumably covers only the last
# branches — confirm this is intended.
history = model.fit(
    [X_num_seq, X_branch_seq],
    y_seq,
    epochs=20,
    batch_size=64,
    validation_split=0.1,
    verbose=1
)

# Predict & Evaluate
# Errors are reported in the original NetCashFlow units.
y_pred = model.predict([X_num_seq_test, X_branch_seq_test])

mae_test = mean_absolute_error(y_seq_test, y_pred)
rmse_test = np.sqrt(mean_squared_error(y_seq_test, y_pred))

print(f"\n Test MAE: {mae_test:,.2f}")
print(f" Test RMSE: {rmse_test:,.2f}")

### Sequence length = 30 - Log Transformation

In [None]:
# Log transform of the training target.
# NetCashFlow can be negative, so it is first shifted by |min| + 1, which makes every
# training value at least 1 before the logarithm is taken.
min_target = y_train.min()
shift_value = 1 + abs(min_target)

y_train_log = np.log1p(shift_value + y_train)

# Sequence Generator with Efficient NumPy Access
def create_lstm_sequences(X_array, y_array, branch_array, window):
    """Build sliding windows over row-aligned arrays without crossing branches.

    For each position `i >= window`, rows `i-window .. i-1` of `X_array` form one
    input window whose label is `y_array[i]` and whose branch id is
    `branch_array[i]`. A window is emitted only when all of its rows AND the
    label row belong to the same branch; windows that straddle a branch boundary
    would feed one branch's history as context for another and are skipped.

    Parameters
    ----------
    X_array : 2-D array `(n_rows, n_features)`.
    y_array : 1-D array of labels, row-aligned with `X_array`.
    branch_array : 1-D array of branch ids, row-aligned with `X_array`.
    window : number of rows per input window.

    Returns
    -------
    (X, branch_ids, y) as float32 `(n, window, n_features)`, int32 `(n, 1)` and
    float32 `(n,)` arrays; empty arrays keep these shapes.
    """
    X_array = np.asarray(X_array)
    branch_array = np.asarray(branch_array)

    X_numeric_seq = []
    X_branch_seq = []
    y_seq = []

    for i in range(window, len(X_array)):
        # Rows i-window..i must all carry the label row's branch id
        if np.any(branch_array[i - window:i + 1] != branch_array[i]):
            continue
        X_numeric_seq.append(X_array[i - window:i])
        X_branch_seq.append(branch_array[i])
        y_seq.append(y_array[i])

    if not X_numeric_seq:
        return (
            np.empty((0, window) + X_array.shape[1:], dtype=np.float32),
            np.empty((0, 1), dtype=np.int32),
            np.empty((0,), dtype=np.float32),
        )

    return (
        np.array(X_numeric_seq, dtype=np.float32),
        np.array(X_branch_seq, dtype=np.int32).reshape(-1, 1),
        np.array(y_seq, dtype=np.float32)
    )

# Prepare Train Data
# Sort by branch then date and remember the ORIGINAL row labels in that order.
# The labels must be captured before reset_index: indexing y_train_log with the
# post-reset 0..n-1 index would leave the targets in their original (date-sorted)
# order while the features are branch-sorted, pairing each window with the wrong label.
X_train_branch_sorted = X_train_full.sort_values(["BRANCHID", "TXNDATE"])
train_row_order = X_train_branch_sorted.index

X_train_full_sorted = X_train_branch_sorted.reset_index(drop=True)
X_array = X_train_full_sorted[numeric_cols].to_numpy(dtype=np.float32)
y_array = y_train_log.loc[train_row_order].to_numpy(dtype=np.float32)
branch_array = X_train_full_sorted["BRANCHID"].to_numpy(dtype=np.int32)

# Features, labels and branch ids must stay row-aligned
assert len(X_array) == len(y_array) == len(branch_array)

SEQ_LEN = 30
X_num_seq, X_branch_seq, y_seq_log = create_lstm_sequences(X_array, y_array, branch_array, SEQ_LEN)

# Prepare Test Data
# Features and target share the same row order, so a positional concat re-attaches them.
X_test_full_reset = X_test_full.reset_index(drop=True)
y_test_reset = y_test.reset_index(drop=True)
test_merged = pd.concat([X_test_full_reset, y_test_reset], axis=1)
test_merged["TXNDATE"] = pd.to_datetime(test_merged["TXNDATE"])
test_merged = test_merged.sort_values(["BRANCHID", "TXNDATE"]).reset_index(drop=True)

X_test_num_seq = []
X_test_branch_seq = []
y_test_seq = []

# Build the test windows branch by branch so no window mixes two branches.
# test_merged is already sorted by (BRANCHID, TXNDATE), so each group is chronological.
for branch_id, group in test_merged.groupby("BRANCHID"):
    group = group.reset_index(drop=True)
    numeric_array = group[numeric_cols].to_numpy(dtype=np.float32)
    # Test labels stay on the ORIGINAL scale; predictions are mapped back before scoring.
    target_array = group["NetCashFlow"].to_numpy(dtype=np.float32)
    
    for i in range(len(group) - SEQ_LEN):
        X_test_num_seq.append(numeric_array[i:i+SEQ_LEN])
        X_test_branch_seq.append(branch_id)
        y_test_seq.append(target_array[i + SEQ_LEN])

X_num_seq_test = np.array(X_test_num_seq, dtype=np.float32)
X_branch_seq_test = np.array(X_test_branch_seq, dtype=np.int32).reshape(-1, 1)
y_seq_test = np.array(y_test_seq, dtype=np.float32)

# Model Definition
# Raw BRANCHID values index the embedding table directly, so the table needs
# (largest id seen in either split) + 1 rows.
n_branches = int(max(X_branch_seq.max(), X_branch_seq_test.max())) + 1

seq_input = Input(shape=(SEQ_LEN, len(numeric_cols)), name="seq_input")
branch_input = Input(shape=(1,), name="branch_input")

# The embedding output carries a length-1 sequence axis; [:, 0, :] removes it before the Dense layer.
branch_embedding = Embedding(input_dim=n_branches, output_dim=8, name="branch_embedding")(branch_input)
branch_embedding_flat = Dense(8, activation="relu")(branch_embedding[:, 0, :])

# return_sequences=False: only the final hidden state summarises the window.
lstm_out = LSTM(64, return_sequences=False)(seq_input)

concat = Concatenate()([lstm_out, branch_embedding_flat])
x = Dense(64, activation="relu")(concat)
x = Dropout(0.3)(x)
x = Dense(32, activation="relu")(x)
output = Dense(1, name="output")(x)

model = Model(inputs=[seq_input, branch_input], outputs=output)
model.compile(optimizer="adam", loss="mse", metrics=["mae"])
model.summary()

# Train Model
# The network is fitted on the log-transformed labels, so loss/MAE shown during
# training are in log units, not NetCashFlow units.
history = model.fit(
    [X_num_seq, X_branch_seq],
    y_seq_log,
    epochs=20,
    batch_size=64,
    validation_split=0.1,
    verbose=1
)

# Predict and Inverse Transform
# Undo log1p with expm1, then remove the shift that was added before the logarithm.
y_pred_log = model.predict([X_num_seq_test, X_branch_seq_test])
y_pred = np.expm1(y_pred_log.flatten()) - shift_value

# Evaluate Performance (original NetCashFlow units)
mae = mean_absolute_error(y_seq_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_seq_test, y_pred))

print(f" Test MAE: {mae:,.2f}")
print(f" Test RMSE: {rmse:,.2f}")

### Sequence length = 60 - Without log & Scaling

In [None]:
# Set Random Seed for Reproducibility
def set_seed(seed=42):
    """Seed Python's `random`, NumPy and TensorFlow global RNGs and export PYTHONHASHSEED."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

set_seed(42)

# Sequence Generator Function
def generate_sequences(data, seq_len, numeric_cols, target_col="NetCashFlow"):
    """Build per-branch sliding windows for next-step prediction.

    For every BRANCHID group (rows ordered by TXNDATE), each run of `seq_len`
    consecutive rows becomes one input window and the `target_col` value of the
    row that follows it becomes the label. Windows never span two branches.

    Parameters
    ----------
    data : DataFrame with "BRANCHID", "TXNDATE", `numeric_cols` and `target_col`.
    seq_len : number of rows per window.
    numeric_cols : feature columns placed in each window.
    target_col : column holding the value to predict.

    Returns
    -------
    (X, branch_ids, y) as float32 `(n, seq_len, n_features)`, int32 `(n, 1)` and
    float32 `(n,)` arrays. When no branch has more than `seq_len` rows the
    arrays are empty but keep those shapes (n == 0).
    """
    numeric_cols = list(numeric_cols)
    X_num_seq, X_branch_seq, y_seq = [], [], []

    for branch_id, group in data.groupby("BRANCHID"):
        group = group.sort_values("TXNDATE")

        # Convert once per branch; slicing NumPy arrays in the loop is far cheaper
        # than a pandas .loc lookup per window.
        feature_matrix = group[numeric_cols].to_numpy(dtype=np.float32)
        target_vector = group[target_col].to_numpy(dtype=np.float32)

        for start in range(len(group) - seq_len):
            X_num_seq.append(feature_matrix[start:start + seq_len])
            X_branch_seq.append(branch_id)
            y_seq.append(target_vector[start + seq_len])

    if not X_num_seq:
        # Keep the rank/shape a model expects even when there is nothing to emit
        return (
            np.empty((0, seq_len, len(numeric_cols)), dtype=np.float32),
            np.empty((0, 1), dtype=np.int32),
            np.empty((0,), dtype=np.float32),
        )

    return (
        np.array(X_num_seq, dtype=np.float32),
        np.array(X_branch_seq).reshape(-1, 1).astype(np.int32),
        np.array(y_seq, dtype=np.float32)
    )

# Define Sequence Length and Columns
# Each sample is a window of SEQ_LEN consecutive rows of one branch; the label is the
# NetCashFlow of the row right after the window (see generate_sequences).
SEQ_LEN = 60
# Feature columns: everything in X_train_full except the date and the branch id.
# ("NetCashFlow" is listed defensively; the target was already removed from X_train_full.)
numeric_cols = [c for c in X_train_full.columns if c not in ["TXNDATE", "BRANCHID", "NetCashFlow"]]

# Prepare Train Data
# X_train_full and y_train share the same row order, so resetting both indexes and
# concatenating column-wise re-attaches the target to its features.
train_merged = pd.concat([X_train_full.reset_index(drop=True), y_train.reset_index(drop=True)], axis=1)
train_merged["TXNDATE"] = pd.to_datetime(train_merged["TXNDATE"])
train_merged = train_merged.sort_values(["BRANCHID", "TXNDATE"]).reset_index(drop=True)

X_num_seq, X_branch_seq, y_seq = generate_sequences(train_merged, SEQ_LEN, numeric_cols)

# Prepare Test Data
# Force the test features into the training column layout; a training column that is
# absent from the test frame is created and filled with 0.
X_test_aligned = (
    X_test_full.drop(columns=["TXNDATE"])
    .reindex(columns=[c for c in X_train_full.columns if c != "TXNDATE"], fill_value=0)
)
test_merged = pd.concat([
    X_test_aligned.reset_index(drop=True),
    X_test_full["TXNDATE"].reset_index(drop=True),
    y_test.reset_index(drop=True)
], axis=1)
test_merged["TXNDATE"] = pd.to_datetime(test_merged["TXNDATE"])
test_merged = test_merged.sort_values(["BRANCHID", "TXNDATE"]).reset_index(drop=True)

# Test windows are built from test-period rows only, so the first SEQ_LEN rows of each
# branch in the test period serve as context and get no prediction. With a 60-row
# window this leaves only the branches' rows beyond the 60th to score.
X_num_seq_test, X_branch_seq_test, y_seq_test = generate_sequences(test_merged, SEQ_LEN, numeric_cols)

# Model Architecture

# Raw BRANCHID values index the embedding table directly, so the table needs
# (largest id seen in either split) + 1 rows.
n_branches = int(max(X_branch_seq.max(), X_branch_seq_test.max())) + 1

# Inputs
seq_input = Input(shape=(SEQ_LEN, len(numeric_cols)), name="seq_input")
branch_input = Input(shape=(1,), name="branch_input")

# Branch Embedding
# The embedding output carries a length-1 sequence axis; [:, 0, :] removes it before the Dense layer.
branch_embedding = Embedding(input_dim=n_branches, output_dim=8, name="branch_embedding")(branch_input)
branch_embedding_flat = Dense(8, activation="relu")(branch_embedding[:, 0, :])

# LSTM Path
# return_sequences=False: only the final hidden state summarises the window.
lstm_out = LSTM(64, return_sequences=False)(seq_input)

# Merge
concat = Concatenate()([lstm_out, branch_embedding_flat])

# Dense Layers
x = Dense(64, activation="relu")(concat)
x = Dropout(0.3)(x)
x = Dense(32, activation="relu")(x)
# Linear output: a single unbounded regression value on the raw target scale
output = Dense(1, name="output")(x)

model = Model(inputs=[seq_input, branch_input], outputs=output)
model.compile(optimizer="adam", loss="mse", metrics=["mae"])
model.summary()

# Train Model
# NOTE(review): validation_split takes its samples from the end of the arrays, and the
# sequences are ordered by BRANCHID, so validation presumably covers only the last
# branches — confirm this is intended.
history = model.fit(
    [X_num_seq, X_branch_seq],
    y_seq,
    epochs=20,
    batch_size=64,
    validation_split=0.1,
    verbose=1
)

# Predict & Evaluate
# Errors are reported in the original NetCashFlow units.
y_pred = model.predict([X_num_seq_test, X_branch_seq_test])

mae_test = mean_absolute_error(y_seq_test, y_pred)
rmse_test = np.sqrt(mean_squared_error(y_seq_test, y_pred))

print(f"\n Test MAE: {mae_test:,.2f}")
print(f" Test RMSE: {rmse_test:,.2f}")

### Sequence length = 60 - With Scaling

In [None]:
#Set Random Seed for Reproducibility
def set_seed(seed=42):
    """Seed Python's `random`, NumPy and TensorFlow global RNGs and export PYTHONHASHSEED."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

set_seed(42)

# Sequence Generator Function
def generate_sequences(data, seq_len, numeric_cols, target_col="NetCashFlow"):
    """Turn a long per-branch table into sliding windows for next-step prediction.

    Rows are grouped by "BRANCHID" and ordered by "TXNDATE" inside each group.
    Every run of `seq_len` consecutive rows yields one feature window; its label
    is the `target_col` value of the row immediately after the window.

    Returns `(windows, branch_ids, labels)` as float32 `(n, seq_len, n_features)`,
    int32 `(n, 1)` and float32 `(n,)` arrays.
    """
    windows, branch_ids, labels = [], [], []

    for branch_id, branch_rows in data.groupby("BRANCHID"):
        ordered = branch_rows.sort_values("TXNDATE").reset_index(drop=True)

        # One NumPy conversion per branch keeps the inner loop to plain array slicing
        feature_matrix = ordered[numeric_cols].to_numpy(dtype=np.float32)
        target_vector = ordered[target_col].to_numpy(dtype=np.float32)

        n_windows = len(ordered) - seq_len
        for start in range(n_windows):
            stop = start + seq_len
            windows.append(feature_matrix[start:stop])
            labels.append(target_vector[stop])
            branch_ids.append(branch_id)

    branch_column = np.array(branch_ids, dtype=np.int32).reshape(-1, 1)
    return (
        np.array(windows, dtype=np.float32),
        branch_column,
        np.array(labels, dtype=np.float32),
    )


# Define Sequence Length and Columns
# Each sample is a window of SEQ_LEN consecutive rows of one branch; the label is the
# NetCashFlow of the row right after the window (see generate_sequences).
SEQ_LEN = 60
# Feature columns: everything in X_train_full except the date and the branch id.
# ("NetCashFlow" is listed defensively; the target was already removed from X_train_full.)
numeric_cols = [c for c in X_train_full.columns if c not in ["TXNDATE", "BRANCHID", "NetCashFlow"]]

# Prepare and Scale Train Data
# X_train_full and y_train share the same row order, so resetting both indexes and
# concatenating column-wise re-attaches the target to its features.
train_merged = pd.concat([X_train_full.reset_index(drop=True), y_train.reset_index(drop=True)], axis=1)
train_merged["TXNDATE"] = pd.to_datetime(train_merged["TXNDATE"])
train_merged = train_merged.sort_values(["BRANCHID", "TXNDATE"]).reset_index(drop=True)

# Scale numeric columns in training data
# The scaler is fitted on training rows only (no test statistics leak in). Every column
# in numeric_cols is standardized, including the one-hot / flag columns; the target is
# left unscaled.
scaler = StandardScaler()
train_merged[numeric_cols] = scaler.fit_transform(train_merged[numeric_cols])

X_num_seq, X_branch_seq, y_seq = generate_sequences(train_merged, SEQ_LEN, numeric_cols)

# Prepare and Scale Test Data
# Force the test features into the training column layout; a training column that is
# absent from the test frame is created and filled with 0.
X_test_aligned = (
    X_test_full.drop(columns=["TXNDATE"])
    .reindex(columns=[c for c in X_train_full.columns if c != "TXNDATE"], fill_value=0)
)
test_merged = pd.concat([
    X_test_aligned.reset_index(drop=True),
    X_test_full["TXNDATE"].reset_index(drop=True),
    y_test.reset_index(drop=True)
], axis=1)
test_merged["TXNDATE"] = pd.to_datetime(test_merged["TXNDATE"])
test_merged = test_merged.sort_values(["BRANCHID", "TXNDATE"]).reset_index(drop=True)

# transform test numeric data
# transform() only: reuse the mean/std learned from the training rows
test_merged[numeric_cols] = scaler.transform(test_merged[numeric_cols])

X_num_seq_test, X_branch_seq_test, y_seq_test = generate_sequences(test_merged, SEQ_LEN, numeric_cols)

# Model Architecture
# Raw BRANCHID values index the embedding table directly, so the table needs
# (largest id seen in either split) + 1 rows.
n_branches = int(max(X_branch_seq.max(), X_branch_seq_test.max())) + 1

# Inputs
seq_input = Input(shape=(SEQ_LEN, len(numeric_cols)), name="seq_input")
branch_input = Input(shape=(1,), name="branch_input")

# Branch Embedding
# The embedding output carries a length-1 sequence axis; [:, 0, :] removes it before the Dense layer.
branch_embedding = Embedding(input_dim=n_branches, output_dim=8, name="branch_embedding")(branch_input)
branch_embedding_flat = Dense(8, activation="relu")(branch_embedding[:, 0, :])

# LSTM Path
# return_sequences=False: only the final hidden state summarises the window.
lstm_out = LSTM(64, return_sequences=False)(seq_input)

# Merge
concat = Concatenate()([lstm_out, branch_embedding_flat])

# Dense Layers
x = Dense(64, activation="relu")(concat)
x = Dropout(0.3)(x)
x = Dense(32, activation="relu")(x)
# Linear output: a single unbounded regression value on the raw target scale
output = Dense(1, name="output")(x)

model = Model(inputs=[seq_input, branch_input], outputs=output)
model.compile(optimizer="adam", loss="mse", metrics=["mae"])
model.summary()

# Train Model
# NOTE(review): validation_split takes its samples from the end of the arrays, and the
# sequences are ordered by BRANCHID, so validation presumably covers only the last
# branches — confirm this is intended.
history = model.fit(
    [X_num_seq, X_branch_seq],
    y_seq,
    epochs=20,
    batch_size=64,
    validation_split=0.1,
    verbose=1
)

# Predict & Evaluate
# The target was never scaled, so predictions are already in NetCashFlow units.
y_pred = model.predict([X_num_seq_test, X_branch_seq_test])

mae_test = mean_absolute_error(y_seq_test, y_pred)
rmse_test = np.sqrt(mean_squared_error(y_seq_test, y_pred))

print(f"\n Test MAE: {mae_test:,.2f}")
print(f" Test RMSE: {rmse_test:,.2f}")

### Sequence length = 60 - Log Transformation

In [None]:
# Log transform of the training target.
# NetCashFlow can be negative, so it is first shifted by |min| + 1, which makes every
# training value at least 1 before the logarithm is taken.
min_target = y_train.min()
shift_value = 1 + abs(min_target)

y_train_log = np.log1p(shift_value + y_train)

# Sequence Generator with Efficient NumPy Access
def create_lstm_sequences(X_array, y_array, branch_array, window):
    """Build sliding windows over row-aligned arrays without crossing branches.

    For each position `i >= window`, rows `i-window .. i-1` of `X_array` form one
    input window whose label is `y_array[i]` and whose branch id is
    `branch_array[i]`. A window is emitted only when all of its rows AND the
    label row belong to the same branch; windows that straddle a branch boundary
    would feed one branch's history as context for another and are skipped.

    Parameters
    ----------
    X_array : 2-D array `(n_rows, n_features)`.
    y_array : 1-D array of labels, row-aligned with `X_array`.
    branch_array : 1-D array of branch ids, row-aligned with `X_array`.
    window : number of rows per input window.

    Returns
    -------
    (X, branch_ids, y) as float32 `(n, window, n_features)`, int32 `(n, 1)` and
    float32 `(n,)` arrays; empty arrays keep these shapes.
    """
    X_array = np.asarray(X_array)
    branch_array = np.asarray(branch_array)

    X_numeric_seq = []
    X_branch_seq = []
    y_seq = []

    for i in range(window, len(X_array)):
        # Rows i-window..i must all carry the label row's branch id
        if np.any(branch_array[i - window:i + 1] != branch_array[i]):
            continue
        X_numeric_seq.append(X_array[i - window:i])
        X_branch_seq.append(branch_array[i])
        y_seq.append(y_array[i])

    if not X_numeric_seq:
        return (
            np.empty((0, window) + X_array.shape[1:], dtype=np.float32),
            np.empty((0, 1), dtype=np.int32),
            np.empty((0,), dtype=np.float32),
        )

    return (
        np.array(X_numeric_seq, dtype=np.float32),
        np.array(X_branch_seq, dtype=np.int32).reshape(-1, 1),
        np.array(y_seq, dtype=np.float32)
    )

# Prepare Train Data
# Features and target are aligned by POSITION (as in the other sequence cells):
# sort the features, remember the permutation that the sort applied, and apply
# that same permutation to the log target. Looking the target up with
# `.loc[<index after reset_index>]` would select labels 0..n-1 in their
# original order and leave the target unsorted next to sorted features.
assert len(X_train_full) == len(y_train_log), "features and target must have the same number of rows"

X_train_positional = X_train_full.reset_index(drop=True)
# Sort on real datetimes (the test side below does the same conversion).
X_train_positional["TXNDATE"] = pd.to_datetime(X_train_positional["TXNDATE"])

X_train_full_sorted = X_train_positional.sort_values(["BRANCHID", "TXNDATE"])
sorted_positions = X_train_full_sorted.index.to_numpy()  # original row position of each sorted row
X_train_full_sorted = X_train_full_sorted.reset_index(drop=True)

X_array = X_train_full_sorted[numeric_cols].to_numpy(dtype=np.float32)
y_array = y_train_log.to_numpy(dtype=np.float32)[sorted_positions]
branch_array = X_train_full_sorted["BRANCHID"].to_numpy(dtype=np.int32)

SEQ_LEN = 60
X_num_seq, X_branch_seq, y_seq_log = create_lstm_sequences(X_array, y_array, branch_array, SEQ_LEN)

# Prepare Test Data
# The test target stays on the original scale; predictions are mapped back
# from log space before scoring.
X_test_full_reset = X_test_full.reset_index(drop=True)
y_test_reset = y_test.reset_index(drop=True)
test_merged = pd.concat([X_test_full_reset, y_test_reset], axis=1)
test_merged["TXNDATE"] = pd.to_datetime(test_merged["TXNDATE"])
test_merged = test_merged.sort_values(["BRANCHID", "TXNDATE"]).reset_index(drop=True)

X_test_num_seq = []
X_test_branch_seq = []
y_test_seq = []

# Windows are built per branch: SEQ_LEN consecutive rows predict the next row.
for branch_id, group in test_merged.groupby("BRANCHID"):
    group = group.reset_index(drop=True)
    numeric_array = group[numeric_cols].to_numpy(dtype=np.float32)
    target_array = group["NetCashFlow"].to_numpy(dtype=np.float32)

    for i in range(len(group) - SEQ_LEN):
        X_test_num_seq.append(numeric_array[i:i+SEQ_LEN])
        X_test_branch_seq.append(branch_id)
        y_test_seq.append(target_array[i + SEQ_LEN])

X_num_seq_test = np.array(X_test_num_seq, dtype=np.float32)
X_branch_seq_test = np.array(X_test_branch_seq, dtype=np.int32).reshape(-1, 1)
y_seq_test = np.array(y_test_seq, dtype=np.float32)

# Model Definition
# Branch ids index the embedding table directly, so the vocabulary must cover
# the largest id present in either split.
n_branches = int(max(X_branch_seq.max(), X_branch_seq_test.max())) + 1

seq_input = Input(shape=(SEQ_LEN, len(numeric_cols)), name="seq_input")
branch_input = Input(shape=(1,), name="branch_input")

branch_embedding = Embedding(input_dim=n_branches, output_dim=8, name="branch_embedding")(branch_input)
# Embedding output is (batch, 1, 8); take the single position, then a small dense layer.
branch_embedding_flat = Dense(8, activation="relu")(branch_embedding[:, 0, :])

lstm_out = LSTM(64, return_sequences=False)(seq_input)

concat = Concatenate()([lstm_out, branch_embedding_flat])
x = Dense(64, activation="relu")(concat)
x = Dropout(0.3)(x)
x = Dense(32, activation="relu")(x)
output = Dense(1, name="output")(x)

model = Model(inputs=[seq_input, branch_input], outputs=output)
model.compile(optimizer="adam", loss="mse", metrics=["mae"])
model.summary()

# Train Model (loss and MAE reported here are in log space)
history = model.fit(
    [X_num_seq, X_branch_seq],
    y_seq_log,
    epochs=20,
    batch_size=64,
    validation_split=0.1,
    verbose=1
)

# Predict and Inverse Transform
# Inverse of log1p(y + shift_value) is expm1(pred) - shift_value.
y_pred_log = model.predict([X_num_seq_test, X_branch_seq_test])
y_pred = np.expm1(y_pred_log.flatten()) - shift_value

# Evaluate Performance (original scale)
mae = mean_absolute_error(y_seq_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_seq_test, y_pred))

print(f" Test MAE: {mae:,.2f}")
print(f" Test RMSE: {rmse:,.2f}")


## Improving the basic LSTM

### Common sequence

In [None]:
# Set Random Seed for Reproducibility
def set_seed(seed=42):
    """Seed Python's `random`, NumPy and TensorFlow, and export PYTHONHASHSEED.

    NOTE(review): PYTHONHASHSEED is written to os.environ of the running
    process; whether that influences the current interpreter is not shown
    here -- confirm if hash determinism matters.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
        seed_fn(seed)

set_seed(seed=42)

# Optimized Sequence Generator
def generate_sequences(data, seq_len, numeric_cols, target_col="NetCashFlow"):
    """Build per-branch sliding-window samples for next-step prediction.

    Rows are grouped by ``BRANCHID`` and ordered by ``TXNDATE`` inside each
    group. Every run of ``seq_len`` consecutive rows becomes one sample whose
    label is ``target_col`` of the row that follows the run, so windows never
    span two branches. Branches with ``seq_len`` rows or fewer contribute
    nothing.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain ``BRANCHID``, ``TXNDATE``, ``target_col`` and every
        column named in ``numeric_cols``.
    seq_len : int
        Number of time steps per sample.
    numeric_cols : list of str
        Feature columns, in the order they should appear on the last axis.
    target_col : str, default "NetCashFlow"
        Column to predict.

    Returns
    -------
    tuple of np.ndarray
        ``(X, branch, y)`` with shapes ``(n, seq_len, len(numeric_cols))``
        float32, ``(n, 1)`` int32 and ``(n,)`` float32. The shapes keep their
        rank when ``n == 0``.
    """
    X_num_seq, X_branch_seq, y_seq = [], [], []

    for branch_id, group in data.groupby("BRANCHID"):
        group = group.sort_values("TXNDATE")
        num_array = group[numeric_cols].to_numpy(dtype=np.float32)
        target_array = group[target_col].to_numpy(dtype=np.float32)

        for start in range(len(group) - seq_len):
            stop = start + seq_len
            X_num_seq.append(num_array[start:stop])
            X_branch_seq.append(branch_id)
            y_seq.append(target_array[stop])

    if X_num_seq:
        X_out = np.array(X_num_seq, dtype=np.float32)
    else:
        # np.array([]) would be rank-1; keep the 3-D layout the models expect.
        X_out = np.empty((0, seq_len, len(numeric_cols)), dtype=np.float32)

    return (
        X_out,
        np.array(X_branch_seq, dtype=np.int32).reshape(-1, 1),
        np.array(y_seq, dtype=np.float32)
    )

# Define Sequence Length and Feature Columns
SEQ_LEN = 60
# Every column except the date, the branch key and the target is used as a numeric feature.
numeric_cols = [c for c in X_train_full.columns if c not in ["TXNDATE", "BRANCHID", "NetCashFlow"]]

# Prepare Train Data
# Features and target are joined by position (both indexes are reset first),
# then ordered by branch and date before windowing.
train_merged = pd.concat([X_train_full.reset_index(drop=True), y_train.reset_index(drop=True)], axis=1)
train_merged["TXNDATE"] = pd.to_datetime(train_merged["TXNDATE"])
train_merged = train_merged.sort_values(["BRANCHID", "TXNDATE"]).reset_index(drop=True)

X_num_seq, X_branch_seq, y_seq = generate_sequences(train_merged, SEQ_LEN, numeric_cols)

# Prepare Test Data
# Force the test frame onto the training column layout: columns missing from
# the test set are created and filled with 0, extra test columns are dropped.
X_test_aligned = (
    X_test_full.drop(columns=["TXNDATE"])
    .reindex(columns=[c for c in X_train_full.columns if c != "TXNDATE"], fill_value=0)
)
# Re-attach the date and the target by position.
test_merged = pd.concat([
    X_test_aligned.reset_index(drop=True),
    X_test_full["TXNDATE"].reset_index(drop=True),
    y_test.reset_index(drop=True)
], axis=1)
test_merged["TXNDATE"] = pd.to_datetime(test_merged["TXNDATE"])
test_merged = test_merged.sort_values(["BRANCHID", "TXNDATE"]).reset_index(drop=True)

X_num_seq_test, X_branch_seq_test, y_seq_test = generate_sequences(test_merged, SEQ_LEN, numeric_cols)

### Tuned Architecture (Deep LSTM with Dropout & Dense Layers - 60 day sequence)

In [None]:
# Model Architecture
# Two stacked LSTMs (64 -> 32) with dropout, concatenated with a learned
# 8-dim branch embedding, followed by a small dense head.
# Branch ids index the embedding table directly, so the vocabulary must cover
# the largest id present in either split.
n_branches = int(max(X_branch_seq.max(), X_branch_seq_test.max())) + 1

# Inputs
seq_input = Input(shape=(SEQ_LEN, len(numeric_cols)), name="seq_input")
branch_input = Input(shape=(1,), name="branch_input")

# Branch Embedding
branch_embedding = Embedding(input_dim=n_branches, output_dim=8, name="branch_embedding")(branch_input)
# Embedding output is (batch, 1, 8); Reshape drops the length-1 axis.
branch_embedding_flat = Reshape((8,), name="reshape_embedding")(branch_embedding)

# Stacked LSTM layers
# The first layer returns the full sequence so the second LSTM can consume it.
lstm_1 = LSTM(64, return_sequences=True, name="lstm_layer_1")(seq_input)
dropout_1 = Dropout(0.3, name="dropout_1")(lstm_1)

lstm_2 = LSTM(32, return_sequences=False, name="lstm_layer_2")(dropout_1)
dropout_2 = Dropout(0.3, name="dropout_2")(lstm_2)

# Merge LSTM and branch embedding
merged = Concatenate(name="concat_lstm_branch")([dropout_2, branch_embedding_flat])

# Dense layers
dense_1 = Dense(64, activation="relu", name="dense_1")(merged)
dropout_3 = Dropout(0.2, name="dropout_3")(dense_1)
dense_2 = Dense(32, activation="relu", name="dense_2")(dropout_3)

# Output
output = Dense(1, name="output")(dense_2)

# Compile model
model = Model(inputs=[seq_input, branch_input], outputs=output)
model.compile(optimizer=Adam(learning_rate=0.001), loss="mse", metrics=["mae"])
model.summary()

# Train Model
# NOTE(review): Keras documents validation_split as taking the LAST fraction
# of the arrays before shuffling, and generate_sequences emits samples branch
# by branch -- so the held-out 10% is presumably whole trailing branches.
# Confirm this is the intended validation scheme.
history = model.fit(
    [X_num_seq, X_branch_seq],
    y_seq,
    epochs=20,
    batch_size=64,
    validation_split=0.1,
    verbose=1
)

# Predict & Evaluate
y_pred = model.predict([X_num_seq_test, X_branch_seq_test])

mae_test = mean_absolute_error(y_seq_test, y_pred)
rmse_test = np.sqrt(mean_squared_error(y_seq_test, y_pred))

print(f"\n Test MAE: {mae_test:,.2f}")
print(f" Test RMSE: {rmse_test:,.2f}")

### Basic LSTM with Residual Connection

In [None]:
# --- LSTM Model with Residual Connection ---
# Single LSTM whose output is summed with a linear projection of the last
# input timestep, then merged with a branch embedding and a dense head.

# Model Architecture

# Branch ids index the embedding table directly, so the vocabulary must cover
# the largest id present in either split.
n_branches = int(max(X_branch_seq.max(), X_branch_seq_test.max())) + 1

# Inputs
seq_input = Input(shape=(SEQ_LEN, len(numeric_cols)), name="seq_input")
branch_input = Input(shape=(1,), name="branch_input")

# Branch Embedding
branch_embedding = Embedding(input_dim=n_branches, output_dim=8, name="branch_embedding")(branch_input)
# Embedding output is (batch, 1, 8); take the single position, then a small dense layer.
branch_embedding_flat = Dense(8, activation="relu")(branch_embedding[:, 0, :])

# Residual LSTM Block
lstm_out = LSTM(64, return_sequences=False, name="main_lstm")(seq_input)

# Project input to same shape for residual connection
# (the raw features have len(numeric_cols) channels, the LSTM output has 64)
residual_proj = Dense(64, name="residual_proj")(seq_input[:, -1, :])  # take last timestep input
residual_out = Add(name="residual_add")([lstm_out, residual_proj])

# Merge
concat = Concatenate(name="merge_lstm_branch")([residual_out, branch_embedding_flat])

# Dense Layers
x = Dense(64, activation="relu", name="dense_1")(concat)
x = Dropout(0.3, name="dropout_1")(x)
x = Dense(32, activation="relu", name="dense_2")(x)
output = Dense(1, name="output")(x)

model = Model(inputs=[seq_input, branch_input], outputs=output)
model.compile(optimizer="adam", loss="mse", metrics=["mae"])
model.summary()

# Train Model
# NOTE(review): Keras documents validation_split as taking the LAST fraction
# of the arrays before shuffling, and generate_sequences emits samples branch
# by branch -- so the held-out 10% is presumably whole trailing branches.
# Confirm this is the intended validation scheme.
history = model.fit(
    [X_num_seq, X_branch_seq],
    y_seq,
    epochs=20,
    batch_size=64,
    validation_split=0.1,
    verbose=1
)

# Predict & Evaluate
y_pred = model.predict([X_num_seq_test, X_branch_seq_test])

mae_test = mean_absolute_error(y_seq_test, y_pred)
rmse_test = np.sqrt(mean_squared_error(y_seq_test, y_pred))

print(f"\n Test MAE: {mae_test:,.2f}")
print(f" Test RMSE: {rmse_test:,.2f}")


### Basic LSTM with Attention Layer

In [None]:
# --- LSTM Model with Attention ---

# Model Architecture with Attention

from tensorflow.keras.layers import Layer, InputSpec
import tensorflow.keras.backend as K

# Custom Attention Layer
class Attention(Layer):
    """Attention pooling over axis 1 of a 3-D input.

    Each position along axis 1 gets a scalar score ``tanh(x . W + b)``; the
    scores are soft-maxed along axis 1 and used to form a weighted sum of the
    input, so a ``(batch, steps, features)`` tensor becomes
    ``(batch, features)``.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        n_steps, n_features = input_shape[1], input_shape[-1]
        # One scoring weight per feature channel.
        self.W = self.add_weight(
            name="att_weight",
            shape=(n_features, 1),
            initializer="normal",
            trainable=True,
        )
        # One bias per position along axis 1, so the layer is tied to a fixed length.
        self.b = self.add_weight(
            name="att_bias",
            shape=(n_steps, 1),
            initializer="zeros",
            trainable=True,
        )
        super().build(input_shape)

    def call(self, x):
        scores = K.tanh(K.dot(x, self.W) + self.b)
        weights = K.softmax(scores, axis=1)
        weighted = x * weights
        return K.sum(weighted, axis=1)

    def compute_output_shape(self, input_shape):
        batch_dim = input_shape[0]
        feature_dim = input_shape[-1]
        return (batch_dim, feature_dim)

# Number of branches
# Branch ids index the embedding table directly, so the vocabulary must cover
# the largest id present in either split.
n_branches = int(max(X_branch_seq.max(), X_branch_seq_test.max())) + 1

# Inputs
seq_input = Input(shape=(SEQ_LEN, len(numeric_cols)), name="seq_input")
branch_input = Input(shape=(1,), name="branch_input")

# Branch Embedding
branch_embedding = Embedding(input_dim=n_branches, output_dim=8, name="branch_embedding")(branch_input)
# Embedding output is (batch, 1, 8); take the single position, then a small dense layer.
branch_embedding_flat = Dense(8, activation="relu")(branch_embedding[:, 0, :])

# LSTM + Attention
# return_sequences=True keeps every timestep so the attention layer can pool over them.
lstm_out = LSTM(64, return_sequences=True)(seq_input)
attention_out = Attention()(lstm_out)

# Merge
concat = Concatenate()([attention_out, branch_embedding_flat])

# Dense Layers
x = Dense(64, activation="relu")(concat)
x = Dropout(0.3)(x)
x = Dense(32, activation="relu")(x)
output = Dense(1)(x)

# Compile Model
model = Model(inputs=[seq_input, branch_input], outputs=output)
model.compile(optimizer="adam", loss="mse", metrics=["mae"])
model.summary()

# Train Model
# NOTE(review): Keras documents validation_split as taking the LAST fraction
# of the arrays before shuffling, and generate_sequences emits samples branch
# by branch -- so the held-out 10% is presumably whole trailing branches.
# Confirm this is the intended validation scheme.
history = model.fit(
    [X_num_seq, X_branch_seq],
    y_seq,
    epochs=20,
    batch_size=64,
    validation_split=0.1,
    verbose=1
)

# Predict & Evaluate
y_pred = model.predict([X_num_seq_test, X_branch_seq_test])

mae_test = mean_absolute_error(y_seq_test, y_pred)
rmse_test = np.sqrt(mean_squared_error(y_seq_test, y_pred))

print(f"\n Test MAE: {mae_test:,.2f}")
print(f" Test RMSE: {rmse_test:,.2f}")


### Basic LSTM Model with Multi-Head Attention

In [None]:
# Model with Multi-Head Attention

from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, Embedding, Concatenate, MultiHeadAttention, LayerNormalization, GlobalAveragePooling1D, Reshape
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# Number of branches
# Branch ids index the embedding table directly, so the vocabulary must cover
# the largest id present in either split.
n_branches = int(max(X_branch_seq.max(), X_branch_seq_test.max())) + 1

# Inputs
seq_input = Input(shape=(SEQ_LEN, len(numeric_cols)), name="seq_input")
branch_input = Input(shape=(1,), name="branch_input")

# Branch Embedding
branch_embedding = Embedding(input_dim=n_branches, output_dim=8, name="branch_embedding")(branch_input)
# Embedding output is (batch, 1, 8); take the single position, then a small dense layer.
branch_embedding_flat = Dense(8, activation="relu")(branch_embedding[:, 0, :])

# LSTM to generate temporal representations
lstm_out = LSTM(64, return_sequences=True, name="lstm_output")(seq_input)

# Multi-Head Self-Attention
# Query and value are both the LSTM sequence (self-attention).
attention_out = MultiHeadAttention(num_heads=4, key_dim=16, name="multihead_attention")(lstm_out, lstm_out)
attention_out = LayerNormalization(epsilon=1e-6)(attention_out + lstm_out)  # Residual connection

# Global Pooling after attention
# Averages over the time axis to get one vector per sample.
context_vector = GlobalAveragePooling1D(name="global_avg_pool")(attention_out)

# Merge with branch embedding
merged = Concatenate(name="merged_features")([context_vector, branch_embedding_flat])

# Dense Layers
x = Dense(64, activation="relu")(merged)
x = Dropout(0.3)(x)
x = Dense(32, activation="relu")(x)
output = Dense(1)(x)

# Compile Model
model = Model(inputs=[seq_input, branch_input], outputs=output)
model.compile(optimizer=Adam(learning_rate=0.001), loss="mse", metrics=["mae"])
model.summary()

# Train Model
# NOTE(review): Keras documents validation_split as taking the LAST fraction
# of the arrays before shuffling, and generate_sequences emits samples branch
# by branch -- so the held-out 10% is presumably whole trailing branches.
# Confirm this is the intended validation scheme.
history = model.fit(
    [X_num_seq, X_branch_seq],
    y_seq,
    epochs=20,
    batch_size=64,
    validation_split=0.1,
    verbose=1
)

# Predict & Evaluate
y_pred = model.predict([X_num_seq_test, X_branch_seq_test])

mae_test = mean_absolute_error(y_seq_test, y_pred)
rmse_test = np.sqrt(mean_squared_error(y_seq_test, y_pred))

print(f"\n Test MAE: {mae_test:,.2f}")
print(f" Test RMSE: {rmse_test:,.2f}")


### Basic LSTM Model with Residuals + Multi-Head Attention

In [None]:
# --- Model Architecture ---
# LSTM sequence -> multi-head self-attention -> residual add + layer norm ->
# average over time -> concatenate with branch embedding -> dense head.
# Branch ids index the embedding table directly, so the vocabulary must cover
# the largest id present in either split.
n_branches = int(max(X_branch_seq.max(), X_branch_seq_test.max())) + 1

# Inputs
seq_input = Input(shape=(SEQ_LEN, len(numeric_cols)), name="seq_input")
branch_input = Input(shape=(1,), name="branch_input")

# Branch Embedding
branch_embedding = Embedding(input_dim=n_branches, output_dim=8, name="branch_embedding")(branch_input)
# Embedding output is (batch, 1, 8); take the single position, then a small dense layer.
branch_embedding_flat = Dense(8, activation="relu", name="branch_dense")(branch_embedding[:, 0, :])

# LSTM Layer (with return_sequences for attention)
lstm_out = LSTM(64, return_sequences=True, name="main_lstm")(seq_input)

# Multi-Head Attention
# Query and value are both the LSTM sequence (self-attention).
attn_out = MultiHeadAttention(num_heads=4, key_dim=16, name="multihead_attention")(lstm_out, lstm_out)

# Residual Connection and Normalization
attn_res = Add(name="attn_residual_add")([lstm_out, attn_out])
attn_norm = LayerNormalization(name="attn_norm")(attn_res)

# Global Average Pooling (to convert 3D to 2D before merge)
pooled_output = GlobalAveragePooling1D(name="avg_pool")(attn_norm)

# Merge attention output with branch embedding
merged = Concatenate(name="merge_lstm_branch")([pooled_output, branch_embedding_flat])

# Dense Layers
x = Dense(64, activation="relu", name="dense_1")(merged)
x = Dropout(0.3, name="dropout_1")(x)
x = Dense(32, activation="relu", name="dense_2")(x)
output = Dense(1, name="output")(x)

# Build & Compile Model
model = Model(inputs=[seq_input, branch_input], outputs=output)
model.compile(optimizer="adam", loss="mse", metrics=["mae"])
model.summary()

# --- Training ---
# NOTE(review): Keras documents validation_split as taking the LAST fraction
# of the arrays before shuffling, and generate_sequences emits samples branch
# by branch -- so the held-out 10% is presumably whole trailing branches.
# Confirm this is the intended validation scheme.
history = model.fit(
    [X_num_seq, X_branch_seq],
    y_seq,
    epochs=20,
    batch_size=64,
    validation_split=0.1,
    verbose=1
)

# --- Prediction & Evaluation ---
y_pred = model.predict([X_num_seq_test, X_branch_seq_test])

mae_test = mean_absolute_error(y_seq_test, y_pred)
rmse_test = np.sqrt(mean_squared_error(y_seq_test, y_pred))

print(f"\n Test MAE: {mae_test:,.2f}")
print(f" Test RMSE: {rmse_test:,.2f}")

### Tuned LSTM with Residual Connections

In [None]:
# Model Architecture
# Stacked LSTM (64 -> 32) where the first LSTM's output is summed with a
# per-timestep linear projection of the raw input before the second LSTM.
# Branch ids index the embedding table directly, so the vocabulary must cover
# the largest id present in either split.
n_branches = int(max(X_branch_seq.max(), X_branch_seq_test.max())) + 1

# Inputs
seq_input = Input(shape=(SEQ_LEN, len(numeric_cols)), name="seq_input")
branch_input = Input(shape=(1,), name="branch_input")

# Branch Embedding
branch_embedding = Embedding(input_dim=n_branches, output_dim=8, name="branch_embedding")(branch_input)
# Embedding output is (batch, 1, 8); Reshape drops the length-1 axis.
branch_embedding_flat = Reshape((8,), name="reshape_embedding")(branch_embedding)

# Project input for residual connection
# (64 units so the projection can be added to the 64-unit LSTM output)
proj_input = TimeDistributed(Dense(64), name="proj_input")(seq_input)

# First LSTM layer + residual
lstm_1 = LSTM(64, return_sequences=True, name="lstm_layer_1")(seq_input)
dropout_1 = Dropout(0.3, name="dropout_1")(lstm_1)
residual_1 = Add(name="residual_1")([proj_input, dropout_1])

# Second LSTM layer
lstm_2 = LSTM(32, return_sequences=False, name="lstm_layer_2")(residual_1)
dropout_2 = Dropout(0.3, name="dropout_2")(lstm_2)

# Merge LSTM and branch embedding
merged = Concatenate(name="concat_lstm_branch")([dropout_2, branch_embedding_flat])

# Dense layers
dense_1 = Dense(64, activation="relu", name="dense_1")(merged)
dropout_3 = Dropout(0.2, name="dropout_3")(dense_1)
dense_2 = Dense(32, activation="relu", name="dense_2")(dropout_3)

# Output
output = Dense(1, name="output")(dense_2)

# Compile model
model = Model(inputs=[seq_input, branch_input], outputs=output)
model.compile(optimizer=Adam(learning_rate=0.001), loss="mse", metrics=["mae"])
model.summary()

# Train Model
# NOTE(review): Keras documents validation_split as taking the LAST fraction
# of the arrays before shuffling, and generate_sequences emits samples branch
# by branch -- so the held-out 10% is presumably whole trailing branches.
# Confirm this is the intended validation scheme.
history = model.fit(
    [X_num_seq, X_branch_seq],
    y_seq,
    epochs=20,
    batch_size=64,
    validation_split=0.1,
    verbose=1
)

# Predict & Evaluate
y_pred = model.predict([X_num_seq_test, X_branch_seq_test])

mae_test = mean_absolute_error(y_seq_test, y_pred)
rmse_test = np.sqrt(mean_squared_error(y_seq_test, y_pred))

print(f"\n Test MAE: {mae_test:,.2f}")
print(f" Test RMSE: {rmse_test:,.2f}")


### Tuned LSTM with Residual + Multi-Head Attention

In [None]:
# Model parameters
# Branch ids index the embedding table directly, so the vocabulary must cover
# the largest id present in either split.
n_branches = int(max(X_branch_seq.max(), X_branch_seq_test.max())) + 1

# Inputs
seq_input = Input(shape=(SEQ_LEN, len(numeric_cols)), name="seq_input")
branch_input = Input(shape=(1,), name="branch_input")

# Branch Embedding
branch_embedding = Embedding(input_dim=n_branches, output_dim=8, name="branch_embedding")(branch_input)
# Embedding output is (batch, 1, 8); Reshape drops the length-1 axis.
branch_embedding_flat = Reshape((8,), name="reshape_embedding")(branch_embedding)

# Project input for residual
# (64 units so the projection can be added to the 64-unit LSTM output)
proj_input = TimeDistributed(Dense(64), name="proj_input")(seq_input)

# First LSTM layer + Dropout + Residual
lstm_1 = LSTM(64, return_sequences=True, name="lstm_1")(seq_input)
dropout_1 = Dropout(0.3, name="dropout_1")(lstm_1)
residual_1 = Add(name="residual_1")([proj_input, dropout_1])
norm_1 = LayerNormalization(name="norm_1")(residual_1)

# Multi-Head Attention
# Self-attention over the normalised sequence, followed by a second residual add + norm.
attention_output = MultiHeadAttention(num_heads=4, key_dim=16, name="multihead_attention")(norm_1, norm_1)
residual_2 = Add(name="residual_2")([norm_1, attention_output])
norm_2 = LayerNormalization(name="norm_2")(residual_2)

# Second LSTM layer (no return_sequences)
lstm_2 = LSTM(32, return_sequences=False, name="lstm_2")(norm_2)
dropout_2 = Dropout(0.3, name="dropout_2")(lstm_2)

# Merge with branch embedding
merged = Concatenate(name="merge")([dropout_2, branch_embedding_flat])

# Dense layers
dense_1 = Dense(64, activation="relu", name="dense_1")(merged)
dropout_3 = Dropout(0.2, name="dropout_3")(dense_1)
dense_2 = Dense(32, activation="relu", name="dense_2")(dropout_3)

# Output layer
output = Dense(1, name="output")(dense_2)

# Define and compile model
model = Model(inputs=[seq_input, branch_input], outputs=output)
model.compile(optimizer=Adam(learning_rate=0.001), loss="mse", metrics=["mae"])
model.summary()

# Train Model
# NOTE(review): Keras documents validation_split as taking the LAST fraction
# of the arrays before shuffling, and generate_sequences emits samples branch
# by branch -- so the held-out 10% is presumably whole trailing branches.
# Confirm this is the intended validation scheme.
history = model.fit(
    [X_num_seq, X_branch_seq],
    y_seq,
    epochs=20,
    batch_size=64,
    validation_split=0.1,
    verbose=1
)

# Predict & Evaluate
y_pred = model.predict([X_num_seq_test, X_branch_seq_test])

mae_test = mean_absolute_error(y_seq_test, y_pred)
rmse_test = np.sqrt(mean_squared_error(y_seq_test, y_pred))

print(f"\n Test MAE: {mae_test:,.2f}")
print(f" Test RMSE: {rmse_test:,.2f}")


### Tuned LSTM with Residual Connections + Early Stopping + Batch normalization 

In [None]:
# Model Architecture
# Stacked LSTM (64 -> 32) with batch normalisation after each recurrent and
# dense layer, a residual add around the first LSTM, and early stopping.
# Branch ids index the embedding table directly, so the vocabulary must cover
# the largest id present in either split.
n_branches = int(max(X_branch_seq.max(), X_branch_seq_test.max())) + 1

# Inputs
seq_input = Input(shape=(SEQ_LEN, len(numeric_cols)), name="seq_input")
branch_input = Input(shape=(1,), name="branch_input")

# Branch Embedding
branch_embedding = Embedding(input_dim=n_branches, output_dim=8, name="branch_embedding")(branch_input)
# Embedding output is (batch, 1, 8); Reshape drops the length-1 axis.
branch_embedding_flat = Reshape((8,), name="reshape_embedding")(branch_embedding)

# Project input for residual connection
# (64 units so the projection can be added to the 64-unit LSTM output)
proj_input = TimeDistributed(Dense(64), name="proj_input")(seq_input)

# First LSTM layer + dropout + batch norm + residual
lstm_1 = LSTM(64, return_sequences=True, name="lstm_layer_1")(seq_input)
lstm_1 = BatchNormalization(name="bn_lstm1")(lstm_1)
dropout_1 = Dropout(0.3, name="dropout_1")(lstm_1)
residual_1 = Add(name="residual_1")([proj_input, dropout_1])

# Second LSTM layer + dropout + batch norm
lstm_2 = LSTM(32, return_sequences=False, name="lstm_layer_2")(residual_1)
lstm_2 = BatchNormalization(name="bn_lstm2")(lstm_2)
dropout_2 = Dropout(0.3, name="dropout_2")(lstm_2)

# Merge LSTM and branch embedding
merged = Concatenate(name="concat_lstm_branch")([dropout_2, branch_embedding_flat])

# Dense layers with BatchNorm + Dropout
dense_1 = Dense(64, activation="relu", name="dense_1")(merged)
dense_1 = BatchNormalization(name="bn_dense1")(dense_1)
dropout_3 = Dropout(0.2, name="dropout_3")(dense_1)

dense_2 = Dense(32, activation="relu", name="dense_2")(dropout_3)
dense_2 = BatchNormalization(name="bn_dense2")(dense_2)

# Output layer
output = Dense(1, name="output")(dense_2)

# Compile model
model = Model(inputs=[seq_input, branch_input], outputs=output)
model.compile(optimizer=Adam(learning_rate=0.001), loss="mse", metrics=["mae"])
model.summary()

# EarlyStopping Callback
# Named `early_stop` on purpose: the notebook's import cell does
# `from lightgbm import ... early_stopping ...`, and binding a Keras callback
# to the global name `early_stopping` would overwrite that LightGBM callback
# factory for every cell executed afterwards.
early_stop = EarlyStopping(
    monitor="val_loss",
    patience=5,
    restore_best_weights=True,
    verbose=1
)

# Train Model
# NOTE(review): Keras documents validation_split as taking the LAST fraction
# of the arrays before shuffling, and generate_sequences emits samples branch
# by branch -- so val_loss (which drives early stopping here) is presumably
# measured on whole trailing branches. Confirm this is intended.
history = model.fit(
    [X_num_seq, X_branch_seq],
    y_seq,
    epochs=30,
    batch_size=64,
    validation_split=0.1,
    callbacks=[early_stop],
    verbose=1
)

# Predict & Evaluate
y_pred = model.predict([X_num_seq_test, X_branch_seq_test])

mae_test = mean_absolute_error(y_seq_test, y_pred)
rmse_test = np.sqrt(mean_squared_error(y_seq_test, y_pred))

print(f"\n Test MAE: {mae_test:,.2f}")
print(f" Test RMSE: {rmse_test:,.2f}")


## BiLSTM

### Basic Bidirectional LSTM + Dropout + Dense Layers

In [None]:
# Define Model Architecture
# Single bidirectional LSTM with dropout, concatenated with a branch embedding.
# Branch ids index the embedding table directly, so the vocabulary must cover
# the largest id present in either split.
n_branches = int(max(X_branch_seq.max(), X_branch_seq_test.max())) + 1

# Define inputs
input_numeric = Input(shape=(SEQ_LEN, len(numeric_cols)), name="numeric_input")
input_branch = Input(shape=(1,), name="branch_input")

# Embedding layer for branch ID
embedding = Embedding(input_dim=n_branches, output_dim=8, name="branch_embedding")(input_branch)
# Embedding output is (batch, 1, 8); Reshape drops the length-1 axis.
embedding_flat = Reshape((8,), name="reshape_embedding")(embedding)

# Bidirectional LSTM + Dropout
lstm_out = Bidirectional(LSTM(64, return_sequences=False), name="bilstm")(input_numeric)
dropout = Dropout(0.3, name="dropout")(lstm_out)

# Concatenate LSTM and branch embedding
concat = Concatenate(name="concat")([dropout, embedding_flat])

# Additional Dense layer
dense = Dense(32, activation="relu", name="dense_1")(concat)
output = Dense(1, name="output")(dense)

# Define and compile the model
model = Model(inputs=[input_numeric, input_branch], outputs=output)
model.compile(loss="mse", optimizer="adam", metrics=["mae"])
model.summary()


# Train the Model
# NOTE(review): Keras documents validation_split as taking the LAST fraction
# of the arrays before shuffling, and generate_sequences emits samples branch
# by branch -- so the held-out 10% is presumably whole trailing branches.
# Confirm this is the intended validation scheme.
history = model.fit(
    [X_num_seq, X_branch_seq],
    y_seq,
    epochs=20,
    batch_size=64,
    validation_split=0.1,
    verbose=1
)


# Predict & Evaluate
y_pred = model.predict([X_num_seq_test, X_branch_seq_test])
mae_test = mean_absolute_error(y_seq_test, y_pred)
rmse_test = np.sqrt(mean_squared_error(y_seq_test, y_pred))

print(f"\n Test MAE: {mae_test:,.2f}")
print(f" Test RMSE: {rmse_test:,.2f}")

## Tuned Architecture (Stacked BiLSTM + Dropout + Dense)

### 01

In [None]:
# Bidirectional LSTM Model
# One bidirectional LSTM, merged with a branch embedding, then a 64 -> 32 dense head.
# Branch ids index the embedding table directly, so the vocabulary must cover
# the largest id present in either split.
n_branches = int(max(X_branch_seq.max(), X_branch_seq_test.max())) + 1

# Inputs
seq_input = Input(shape=(SEQ_LEN, len(numeric_cols)), name="seq_input")
branch_input = Input(shape=(1,), name="branch_input")

# Branch Embedding
branch_embedding = Embedding(input_dim=n_branches, output_dim=8, name="branch_embedding")(branch_input)
# Embedding output is (batch, 1, 8); take the single position, then a small dense layer.
branch_embedding_flat = Dense(8, activation="relu")(branch_embedding[:, 0, :])

# Bidirectional LSTM Path
bilstm_out = Bidirectional(LSTM(64, return_sequences=False))(seq_input)

# Merge
concat = Concatenate()([bilstm_out, branch_embedding_flat])

# Dense Layers
x = Dense(64, activation="relu")(concat)
x = Dropout(0.3)(x)
x = Dense(32, activation="relu")(x)
output = Dense(1, name="output")(x)

model = Model(inputs=[seq_input, branch_input], outputs=output)
model.compile(optimizer="adam", loss="mse", metrics=["mae"])
model.summary()

# Train the Model
# NOTE(review): Keras documents validation_split as taking the LAST fraction
# of the arrays before shuffling, and generate_sequences emits samples branch
# by branch -- so the held-out 10% is presumably whole trailing branches.
# Confirm this is the intended validation scheme.
history = model.fit(
    [X_num_seq, X_branch_seq],
    y_seq,
    epochs=20,
    batch_size=64,
    validation_split=0.1,
    verbose=1
)

# Predict & Evaluate
y_pred = model.predict([X_num_seq_test, X_branch_seq_test])
mae_test = mean_absolute_error(y_seq_test, y_pred)
rmse_test = np.sqrt(mean_squared_error(y_seq_test, y_pred))

print(f"\n Test MAE: {mae_test:,.2f}")
print(f" Test RMSE: {rmse_test:,.2f}")


### 02

In [None]:
# Bidirectional LSTM Model
# Branch ids index the embedding table directly, so the vocabulary must cover
# the largest id present in either split.
n_branches = int(max(X_branch_seq.max(), X_branch_seq_test.max())) + 1

# Bidirectional LSTM Stack with Dropout and Dense
seq_input = Input(shape=(SEQ_LEN, len(numeric_cols)), name="seq_input")
branch_input = Input(shape=(1,), name="branch_input")

# Embedding + Reshape
# Embedding output is (batch, 1, 8); Reshape drops the length-1 axis.
branch_embedding = Embedding(input_dim=n_branches, output_dim=8)(branch_input)
embedding_flat = Reshape((8,))(branch_embedding)

# BiLSTM Stack
# The first layer returns the full sequence so the second BiLSTM can consume it.
bilstm_1 = Bidirectional(LSTM(64, return_sequences=True))(seq_input)
dropout_1 = Dropout(0.3)(bilstm_1)
bilstm_2 = Bidirectional(LSTM(32, return_sequences=False))(dropout_1)
dropout_2 = Dropout(0.3)(bilstm_2)

# Merge and Dense
concat = Concatenate()([dropout_2, embedding_flat])
x = Dense(64, activation="relu")(concat)
x = Dropout(0.3)(x)
x = Dense(32, activation="relu")(x)
output = Dense(1)(x)

model = Model(inputs=[seq_input, branch_input], outputs=output)
model.compile(optimizer=Adam(learning_rate=0.001), loss="mse", metrics=["mae"])
model.summary()

# Train the Model
# NOTE(review): Keras documents validation_split as taking the LAST fraction
# of the arrays before shuffling, and generate_sequences emits samples branch
# by branch -- so the held-out 10% is presumably whole trailing branches.
# Confirm this is the intended validation scheme.
history = model.fit(
    [X_num_seq, X_branch_seq],
    y_seq,
    epochs=20,
    batch_size=64,
    validation_split=0.1,
    verbose=1
)

# Predict & Evaluate
y_pred = model.predict([X_num_seq_test, X_branch_seq_test])
mae_test = mean_absolute_error(y_seq_test, y_pred)
rmse_test = np.sqrt(mean_squared_error(y_seq_test, y_pred))

print(f"\n Test MAE: {mae_test:,.2f}")
print(f" Test RMSE: {rmse_test:,.2f}")

### Attention Mechanism - BILSTM Tuned 01

In [None]:
# Bidirectional LSTM + Attention Model
# Branch ids index the embedding table directly, so the vocabulary must cover
# the largest id present in either split.
n_branches = int(max(X_branch_seq.max(), X_branch_seq_test.max())) + 1

# Inputs
seq_input = Input(shape=(SEQ_LEN, len(numeric_cols)), name="seq_input")
branch_input = Input(shape=(1,), name="branch_input")

# Branch Embedding
branch_embedding = Embedding(input_dim=n_branches, output_dim=8)(branch_input)
# Embedding output is (batch, 1, 8); take the single position, then a small dense layer.
branch_embedding_flat = Dense(8, activation="relu")(branch_embedding[:, 0, :])

# BiLSTM
bilstm_out = Bidirectional(LSTM(64, return_sequences=True))(seq_input)

# Attention
# Built from stock layers: score each timestep, softmax over the time axis,
# then take the weighted sum of the BiLSTM outputs.
score = Dense(1)(bilstm_out)  # one unnormalised score per timestep
attention_weights = Lambda(lambda x: K.softmax(x, axis=1))(score)
context_vector = Multiply()([bilstm_out, attention_weights])
context_vector = Lambda(lambda x: K.sum(x, axis=1))(context_vector)

# Merge
concat = Concatenate()([context_vector, branch_embedding_flat])

# Dense Layers
x = Dense(64, activation="relu")(concat)
x = Dropout(0.3)(x)
x = Dense(32, activation="relu")(x)
output = Dense(1, name="output")(x)

# Model
model = Model(inputs=[seq_input, branch_input], outputs=output)
model.compile(optimizer=Adam(learning_rate=0.001), loss="mse", metrics=["mae"])
model.summary()

# Train the Model
# NOTE(review): Keras documents validation_split as taking the LAST fraction
# of the arrays before shuffling, and generate_sequences emits samples branch
# by branch -- so the held-out 10% is presumably whole trailing branches.
# Confirm this is the intended validation scheme.
history = model.fit(
    [X_num_seq, X_branch_seq],
    y_seq,
    epochs=20,
    batch_size=64,
    validation_split=0.1,
    verbose=1
)

# Predict & Evaluate
y_pred = model.predict([X_num_seq_test, X_branch_seq_test])
mae_test = mean_absolute_error(y_seq_test, y_pred)
rmse_test = np.sqrt(mean_squared_error(y_seq_test, y_pred))

print(f"\n Test MAE: {mae_test:,.2f}")
print(f" Test RMSE: {rmse_test:,.2f}")


### Multi-Head Attention + BiLSTM Tuned 01

In [None]:
# BiLSTM + Multi-Head Attention Model
# Branch ids index the embedding table directly, so the vocabulary must cover
# the largest id present in either split.
n_branches = int(max(X_branch_seq.max(), X_branch_seq_test.max())) + 1

# Inputs
seq_input = Input(shape=(SEQ_LEN, len(numeric_cols)), name="seq_input")
branch_input = Input(shape=(1,), name="branch_input")

# Branch Embedding
branch_embedding = Embedding(input_dim=n_branches, output_dim=8, name="branch_embedding")(branch_input)
# Embedding output is (batch, 1, 8); take the single position, then a small dense layer.
branch_embedding_flat = Dense(8, activation="relu")(branch_embedding[:, 0, :])

# BiLSTM Layer (return_sequences=True for attention)
bilstm_out = Bidirectional(LSTM(64, return_sequences=True))(seq_input)

# Multi-Head Attention
# Self-attention over the BiLSTM sequence, then residual add + layer norm.
attention_out = MultiHeadAttention(num_heads=4, key_dim=32)(bilstm_out, bilstm_out)
attention_out = LayerNormalization(epsilon=1e-6)(attention_out + bilstm_out)

# Global Average Pooling (or Flatten last timestep)
attn_flat = tf.keras.layers.GlobalAveragePooling1D()(attention_out)

# Merge Attention Output + Branch Embedding
concat = Concatenate()([attn_flat, branch_embedding_flat])

# Dense Layers
x = Dense(64, activation="relu")(concat)
x = Dropout(0.3)(x)
x = Dense(32, activation="relu")(x)
output = Dense(1, name="output")(x)

# Model Compile
model = Model(inputs=[seq_input, branch_input], outputs=output)
model.compile(optimizer="adam", loss="mse", metrics=["mae"])
model.summary()

# Model Training
# NOTE(review): Keras documents validation_split as taking the LAST fraction
# of the arrays before shuffling, and generate_sequences emits samples branch
# by branch -- so the held-out 10% is presumably whole trailing branches.
# Confirm this is the intended validation scheme.
history = model.fit(
    [X_num_seq, X_branch_seq],
    y_seq,
    epochs=20,
    batch_size=64,
    validation_split=0.1,
    verbose=1
)

# Evaluation
y_pred = model.predict([X_num_seq_test, X_branch_seq_test])
mae_test = mean_absolute_error(y_seq_test, y_pred)
rmse_test = np.sqrt(mean_squared_error(y_seq_test, y_pred))

print(f"\n Test MAE : {mae_test:,.2f}")
print(f" Test RMSE: {rmse_test:,.2f}")


### Multi-Head Attention + Residual Connections + EarlyStopping + BiLSTM Tuned 01

In [None]:
# Inputs and Embeddings
# Branch ids index the embedding table directly, so the vocabulary must cover
# the largest id present in either split.
n_branches = int(max(X_branch_seq.max(), X_branch_seq_test.max())) + 1

seq_input = Input(shape=(SEQ_LEN, len(numeric_cols)), name="seq_input")
branch_input = Input(shape=(1,), name="branch_input")

branch_embedding = Embedding(input_dim=n_branches, output_dim=8, name="branch_embedding")(branch_input)
# Embedding output is (batch, 1, 8); take the single position, then a small dense layer.
branch_embedding_flat = Dense(8, activation="relu")(branch_embedding[:, 0, :])

# BiLSTM Block
bilstm_out = Bidirectional(LSTM(64, return_sequences=True))(seq_input)

# Multi-Head Attention + Residual
attention_out = MultiHeadAttention(num_heads=4, key_dim=32)(bilstm_out, bilstm_out)
attention_out = Add()([attention_out, bilstm_out])  # Residual connection
attention_out = LayerNormalization(epsilon=1e-6)(attention_out)

# Global Pooling
attn_flat = GlobalAveragePooling1D()(attention_out)

# Concatenate with Branch Embedding
concat = Concatenate()([attn_flat, branch_embedding_flat])

# Dense Layers with Residual Connection
dense_input = concat
x = Dense(64, activation="relu")(dense_input)
x = Dropout(0.3)(x)
x = Dense(32, activation="relu")(x)

# Residual projection (if input dim ≠ x dim)
# The concatenated input is wider than 32, so it is linearly projected to 32
# units before being added to the dense output.
residual_proj = Dense(32)(dense_input)
x = Add()([x, residual_proj])  # Residual connection
x = LayerNormalization(epsilon=1e-6)(x)

# Output
output = Dense(1, name="output")(x)

# Compile Model
model = Model(inputs=[seq_input, branch_input], outputs=output)
model.compile(optimizer="adam", loss="mse", metrics=["mae"])
model.summary()

# Callbacks
# Stop after 5 epochs without val_loss improvement and roll back to the best weights.
early_stop = EarlyStopping(
    monitor="val_loss", patience=5, restore_best_weights=True, verbose=1
)

# Training
# NOTE(review): Keras documents validation_split as taking the LAST fraction
# of the arrays before shuffling, and generate_sequences emits samples branch
# by branch -- so val_loss (which drives early stopping here) is presumably
# measured on whole trailing branches. Confirm this is intended.
history = model.fit(
    [X_num_seq, X_branch_seq],
    y_seq,
    epochs=20,
    batch_size=64,
    validation_split=0.1,
    callbacks=[early_stop],
    verbose=1
)

# Evaluation
y_pred = model.predict([X_num_seq_test, X_branch_seq_test])
mae_test = mean_absolute_error(y_seq_test, y_pred)
rmse_test = np.sqrt(mean_squared_error(y_seq_test, y_pred))

print(f"\n Test MAE : {mae_test:,.2f}")
print(f" Test RMSE: {rmse_test:,.2f}")


### Feature Selection

### Common Sequence

In [None]:
# Reproducibility
def set_seed(seed=42):
    """Apply one seed to Python's `random`, NumPy and TensorFlow, and store it in PYTHONHASHSEED."""
    seed_text = str(seed)
    os.environ['PYTHONHASHSEED'] = seed_text
    random.seed(seed)
    tf.random.set_seed(seed)
    np.random.seed(seed)

set_seed(seed=42)

# Feature Selection
# Rank candidate features with a shallow XGBoost regressor fitted on the
# training rows, then keep the TOP_N_FEATURES most important ones.
TOP_N_FEATURES = 50
exclude_cols = ["TXNDATE", "BRANCHID"]
all_numeric_cols = [col for col in X_train_full.columns if col not in exclude_cols]

# NOTE(review): only int64/float64/bool columns survive this filter; columns
# stored as e.g. int32/float32/uint8 would be silently left out of the
# ranking -- confirm that matches the dtypes actually present in X_train_full.
X_temp_selected = X_train_full[all_numeric_cols].select_dtypes(include=['int64', 'float64', 'bool']).copy()
y_temp_selected = y_train

xgb_model = xgb.XGBRegressor(n_estimators=100, max_depth=3, random_state=42)
xgb_model.fit(X_temp_selected, y_temp_selected)

# Importances are sorted in descending order; head() keeps the strongest features.
importances = xgb_model.feature_importances_
feat_imp = pd.Series(importances, index=X_temp_selected.columns).sort_values(ascending=False)
top_features_selected = feat_imp.head(TOP_N_FEATURES).index.tolist()
print(f"Selected Top {TOP_N_FEATURES} Features:\n", top_features_selected)

# Use for modeling
numeric_cols_selected = top_features_selected

# Sequence Generator (Per Branch)
def generate_sequences(data, seq_len, numeric_cols, target_col="NetCashFlow"):
    """Build sliding-window samples separately for every branch.

    Rows are grouped by ``BRANCHID`` and ordered by ``TXNDATE`` inside each
    group. Every run of ``seq_len`` consecutive rows becomes one input window
    and the target of the row that follows the window becomes its label, so a
    window never mixes rows from two branches.

    Args:
        data: DataFrame holding ``BRANCHID``, ``TXNDATE``, ``numeric_cols`` and
            ``target_col``.
        seq_len: Number of time steps per window.
        numeric_cols: Column names used as per-step features.
        target_col: Name of the target column.

    Returns:
        Tuple ``(X_num_seq, X_branch_seq, y_seq)``:
        ``X_num_seq`` float32 of shape ``(n, seq_len, len(numeric_cols))``,
        ``X_branch_seq`` int32 of shape ``(n, 1)`` and ``y_seq`` float32 of
        shape ``(n,)``. Branches with ``seq_len`` rows or fewer contribute no
        samples.
    """
    numeric_cols = list(numeric_cols)
    windows, branch_ids, targets = [], [], []

    for branch_id, group in data.groupby("BRANCHID"):
        group = group.sort_values("TXNDATE")
        feature_rows = group[numeric_cols].to_numpy(dtype=np.float32)
        target_rows = group[target_col].to_numpy(dtype=np.float32)

        # range() is empty when the branch has seq_len rows or fewer.
        for start in range(len(group) - seq_len):
            stop = start + seq_len
            windows.append(feature_rows[start:stop])
            branch_ids.append(branch_id)
            targets.append(target_rows[stop])

    if windows:
        X_num_seq = np.stack(windows).astype(np.float32, copy=False)
    else:
        # Keep the rank the models expect even when nothing was produced, so a
        # too-short split yields an empty 3-D array rather than a 1-D one.
        X_num_seq = np.empty((0, seq_len, len(numeric_cols)), dtype=np.float32)

    return (
        X_num_seq,
        np.array(branch_ids, dtype=np.int32).reshape(-1, 1),
        np.array(targets, dtype=np.float32)
    )

# Prepare Train Sequences
SEQ_LEN = 60

# Features and target are joined positionally (both indexes are reset first).
# NOTE(review): generate_sequences reads the target from a "NetCashFlow"
# column, so y_train is presumably a Series with that name - confirm.
train_merged_selected = pd.concat([X_train_full.reset_index(drop=True), y_train.reset_index(drop=True)], axis=1)
train_merged_selected["TXNDATE"] = pd.to_datetime(train_merged_selected["TXNDATE"])
train_merged_selected = train_merged_selected.sort_values(["BRANCHID", "TXNDATE"]).reset_index(drop=True)

X_num_seq_selected, X_branch_seq_selected, y_seq_selected = generate_sequences(
    train_merged_selected, SEQ_LEN, numeric_cols_selected
)

# Prepare Test Sequences
# Give the test frame exactly the training columns (minus TXNDATE); a column
# missing from the test frame is created and filled with 0.
X_test_aligned_selected = (
    X_test_full.drop(columns=["TXNDATE"])
    .reindex(columns=[c for c in X_train_full.columns if c != "TXNDATE"], fill_value=0)
)

# TXNDATE and the target are added back positionally after the alignment.
test_merged_selected = pd.concat([
    X_test_aligned_selected.reset_index(drop=True),
    X_test_full["TXNDATE"].reset_index(drop=True),
    y_test.reset_index(drop=True)
], axis=1)

test_merged_selected["TXNDATE"] = pd.to_datetime(test_merged_selected["TXNDATE"])
test_merged_selected = test_merged_selected.sort_values(["BRANCHID", "TXNDATE"]).reset_index(drop=True)

X_num_seq_test_selected, X_branch_seq_test_selected, y_seq_test_selected = generate_sequences(
    test_merged_selected, SEQ_LEN, numeric_cols_selected
)


### Feature selection - basic LSTM model

In [None]:
# Define Model Architecture
# Embedding table size: largest branch id found in the train or test
# sequences, plus one, because the raw ids are used as embedding indices.
n_branches = int(max(X_branch_seq_selected.max(), X_branch_seq_test_selected.max())) + 1

# Two inputs: a (SEQ_LEN, n_features) numeric window and a single branch id.
seq_input = Input(shape=(SEQ_LEN, len(numeric_cols_selected)), name="seq_input")
branch_input = Input(shape=(1,), name="branch_input")

branch_embedding = Embedding(input_dim=n_branches, output_dim=8, name="branch_embedding")(branch_input)
# [:, 0, :] drops the length-1 axis of the embedding output before the Dense layer.
branch_embedding_flat = Dense(8, activation="relu")(branch_embedding[:, 0, :])

# return_sequences=False: only the LSTM's final output is passed on.
lstm_out = LSTM(64, return_sequences=False)(seq_input)

concat = Concatenate()([lstm_out, branch_embedding_flat])
x = Dense(64, activation="relu")(concat)
x = Dropout(0.3)(x)
x = Dense(32, activation="relu")(x)
output = Dense(1, name="output")(x)

model = Model(inputs=[seq_input, branch_input], outputs=output)
model.compile(optimizer="adam", loss="mse", metrics=["mae"])
model.summary()

# Train Model
# No callbacks are passed here, so all 20 epochs always run.
history = model.fit(
    [X_num_seq_selected, X_branch_seq_selected],
    y_seq_selected,
    epochs=20,
    batch_size=64,
    validation_split=0.1,
    verbose=1
)

# Evaluate Model
y_pred_selected = model.predict([X_num_seq_test_selected, X_branch_seq_test_selected])

mae_test_selected = mean_absolute_error(y_seq_test_selected, y_pred_selected)
rmse_test_selected = np.sqrt(mean_squared_error(y_seq_test_selected, y_pred_selected))

print(f"\nTest MAE: {mae_test_selected:,.2f}")
print(f"Test RMSE: {rmse_test_selected:,.2f}")


### Feature selection - Tuned LSTM with Residual Connections

In [None]:
# Model Architecture
# Embedding table size: largest branch id in the train or test sequences, plus one.
n_branches = int(max(X_branch_seq_selected.max(), X_branch_seq_test_selected.max())) + 1

# Inputs
seq_input = Input(shape=(SEQ_LEN, len(numeric_cols_selected)), name="seq_input")
branch_input = Input(shape=(1,), name="branch_input")

# Branch Embedding
branch_embedding = Embedding(input_dim=n_branches, output_dim=8, name="branch_embedding")(branch_input)
branch_embedding_flat = Reshape((8,), name="reshape_embedding")(branch_embedding)

# Project input for residual connection
# The per-step Dense(64) gives the raw input the same width as lstm_layer_1
# (64 units), which the Add layer below needs.
proj_input = TimeDistributed(Dense(64), name="proj_input")(seq_input)

# First LSTM layer + residual
lstm_1 = LSTM(64, return_sequences=True, name="lstm_layer_1")(seq_input)
dropout_1 = Dropout(0.3, name="dropout_1")(lstm_1)
residual_1 = Add(name="residual_1")([proj_input, dropout_1])

# Second LSTM layer
# return_sequences=False collapses the sequence to a single vector.
lstm_2 = LSTM(32, return_sequences=False, name="lstm_layer_2")(residual_1)
dropout_2 = Dropout(0.3, name="dropout_2")(lstm_2)

# Merge LSTM and branch embedding
merged = Concatenate(name="concat_lstm_branch")([dropout_2, branch_embedding_flat])

# Dense layers
dense_1 = Dense(64, activation="relu", name="dense_1")(merged)
dropout_3 = Dropout(0.2, name="dropout_3")(dense_1)
dense_2 = Dense(32, activation="relu", name="dense_2")(dropout_3)

# Output layer
output = Dense(1, name="output")(dense_2)

# Compile model
model = Model(inputs=[seq_input, branch_input], outputs=output)
model.compile(optimizer=Adam(learning_rate=0.001), loss="mse", metrics=["mae"])
model.summary()

# --- Train Model ---
# No callbacks are passed here, so all epochs always run.
history = model.fit(
    [X_num_seq_selected, X_branch_seq_selected],
    y_seq_selected,
    epochs=20,#20
    batch_size=64,
    validation_split=0.1,
    verbose=1
)

# Predict & Evaluate
y_pred_selected = model.predict([X_num_seq_test_selected, X_branch_seq_test_selected])

mae_test_selected = mean_absolute_error(y_seq_test_selected, y_pred_selected)
rmse_test_selected = np.sqrt(mean_squared_error(y_seq_test_selected, y_pred_selected))

print(f"\nTest MAE: {mae_test_selected:,.2f}")
print(f"Test RMSE: {rmse_test_selected:,.2f}")


### Feature Selection - tuned 01 BiLSTM (BiLSTM + Multi-Head Attention)

In [None]:
# Define Inputs using selected features
# Embedding table size: largest branch id in the train or test sequences, plus one.
n_branches = int(max(X_branch_seq_selected.max(), X_branch_seq_test_selected.max())) + 1

input_numeric = Input(shape=(SEQ_LEN, len(numeric_cols_selected)), name="numeric_input")
input_branch = Input(shape=(1,), name="branch_input")

# Branch Embedding
embedding = Embedding(input_dim=n_branches, output_dim=8, name="branch_embedding")(input_branch)
embedding_flat = Reshape((8,), name="reshape_embedding")(embedding)

# BiLSTM + Multi-Head Self-Attention
bilstm_out = Bidirectional(LSTM(64, return_sequences=True), name="bilstm")(input_numeric)

# Self-attention: the BiLSTM output is passed as both query and value.
attention_out = MultiHeadAttention(num_heads=4, key_dim=32, name="multihead_attention")(
    bilstm_out, bilstm_out
)
# Residual sum of the attention output and its input, then layer normalisation.
attention_out = LayerNormalization(epsilon=1e-6, name="layer_norm")(
    attention_out + bilstm_out
)

# Flatten & Merge
# Averaging over the time axis turns the sequence into one vector per sample.
attn_flat = GlobalAveragePooling1D(name="global_avg_pool")(attention_out)
concat = Concatenate(name="concat")([attn_flat, embedding_flat])

# Dense Layers
x = Dense(64, activation="relu", name="dense_64")(concat)
x = Dropout(0.3, name="dropout")(x)
x = Dense(32, activation="relu", name="dense_32")(x)
output = Dense(1, name="output")(x)

# Compile Model
model = Model(inputs=[input_numeric, input_branch], outputs=output)
model.compile(optimizer=Adam(learning_rate=0.001), loss="mse", metrics=["mae"])
model.summary()

# Train Model using selected feature sequences
# No callbacks are passed here, so all 20 epochs always run.
history = model.fit(
    [X_num_seq_selected, X_branch_seq_selected],
    y_seq_selected,
    epochs=20,
    batch_size=64,
    validation_split=0.1,
    verbose=1
)

# Evaluate on test set with selected features
y_pred_selected = model.predict([X_num_seq_test_selected, X_branch_seq_test_selected])
mae_test_selected = mean_absolute_error(y_seq_test_selected, y_pred_selected)
rmse_test_selected = np.sqrt(mean_squared_error(y_seq_test_selected, y_pred_selected))

print(f"\nTest MAE (Selected Features): {mae_test_selected:,.2f}")
print(f"Test RMSE (Selected Features): {rmse_test_selected:,.2f}")



### Improved model - Feature Selection + Scaling

In [None]:
# Reproducibility Setup
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)
random.seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)


# Sequence Configuration
SEQ_LEN = 60


# Feature Selection
# A shallow XGBoost regressor fitted on the training split ranks the features;
# the TOP_N_FEATURES most important ones are kept.
exclude_cols = ["TXNDATE", "BRANCHID"]
numeric_cols = [col for col in X_train_full.columns if col not in exclude_cols]

TOP_N_FEATURES = 50
# Only int64 / float64 / bool columns are candidates for selection.
X_temp = X_train_full[numeric_cols].select_dtypes(include=['int64', 'float64', 'bool']).copy()
y_temp = y_train

xgb_model = xgb.XGBRegressor(n_estimators=100, max_depth=3, random_state=SEED)
xgb_model.fit(X_temp, y_temp)

feature_importances = pd.Series(xgb_model.feature_importances_, index=X_temp.columns)
top_features = feature_importances.nlargest(TOP_N_FEATURES).index.tolist()

print(f"Selected Top {TOP_N_FEATURES} Features:\n", top_features)


# Feature Scaling
# The scaler is fitted on the training split only and then applied unchanged
# to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_full[top_features])
X_test_scaled = scaler.transform(X_test_full[top_features])

# Back to DataFrames; they get a fresh 0..n-1 index, so later pairing with the
# target and BRANCHID is positional.
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=top_features)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=top_features)


# Sequence Generator
def create_lstm_sequences(X_df, y_series, branch_series, window):
    """Build sliding-window samples without mixing rows of different branches.

    The three inputs are paired by position. Rows are split by branch id
    (branches in order of first appearance, rows in their original order) and
    each run of ``window`` consecutive rows of one branch becomes an input
    window; the label is the target of that branch's next row. Sliding over the
    whole frame instead would let a window contain rows from several branches
    while being labelled with only the last row's branch.

    Args:
        X_df: DataFrame of per-row features.
        y_series: Targets, positionally aligned with ``X_df``.
        branch_series: Branch id per row, positionally aligned with ``X_df``.
        window: Number of time steps per input window.

    Returns:
        Tuple ``(X_numeric_seq, X_branch_seq, y_seq)``:
        float32 ``(n, window, n_features)``, int32 ``(n, 1)`` and float32
        ``(n,)``. Branches with ``window`` rows or fewer contribute nothing;
        with a single branch the result equals a plain sliding window.
    """
    # Convert once up front instead of slicing the DataFrame with .iloc on
    # every iteration.
    features = X_df.to_numpy(dtype=np.float32)
    targets = np.asarray(y_series, dtype=np.float32)
    branches = np.asarray(branch_series)

    X_numeric_seq, X_branch_seq, y_seq = [], [], []

    # pd.unique keeps first-appearance order, so the output order is stable.
    for branch_id in pd.unique(branches):
        rows = np.flatnonzero(branches == branch_id)
        branch_features = features[rows]
        branch_targets = targets[rows]

        for end in range(window, len(rows)):
            X_numeric_seq.append(branch_features[end - window:end])
            X_branch_seq.append(branch_id)
            y_seq.append(branch_targets[end])

    if X_numeric_seq:
        X_out = np.stack(X_numeric_seq).astype(np.float32, copy=False)
    else:
        # Keep the 3-D rank the models expect even when there are no samples.
        X_out = np.empty((0, window, features.shape[1]), dtype=np.float32)

    return (
        X_out,
        np.array(X_branch_seq, dtype=np.int32).reshape(-1, 1),
        np.array(y_seq, dtype=np.float32)
    )


# Create Sequences
# Size the branch-embedding table from BOTH splits: a branch id that is larger
# than every training id would otherwise index past the end of the Embedding.
n_branches = int(max(X_train_full["BRANCHID"].max(), X_test_full["BRANCHID"].max())) + 1

# Features, target and branch id are paired by position, hence the index resets.
X_num_seq_train, X_branch_seq_train, y_seq_train = create_lstm_sequences(
    X_train_scaled_df.reset_index(drop=True),
    y_train.reset_index(drop=True),
    X_train_full["BRANCHID"].reset_index(drop=True),
    window=SEQ_LEN
)

X_num_seq_test, X_branch_seq_test, y_seq_test = create_lstm_sequences(
    X_test_scaled_df.reset_index(drop=True),
    y_test.reset_index(drop=True),
    X_test_full["BRANCHID"].reset_index(drop=True),
    window=SEQ_LEN
)

In [None]:
# Model Architecture
# Relies on SEQ_LEN, top_features, n_branches and the *_seq_train / *_seq_test
# arrays created in the preceding cell.
input_numeric = Input(shape=(SEQ_LEN, len(top_features)), name="numeric_input")
input_branch = Input(shape=(1,), name="branch_input")

embedding = Embedding(input_dim=n_branches, output_dim=16, name="branch_embedding")(input_branch)
embedding_flat = Reshape((16,), name="reshape_embedding")(embedding)

# return_sequences=False: the BiLSTM emits one vector per sample.
lstm_out = Bidirectional(LSTM(64, return_sequences=False), name="bilstm")(input_numeric)
dropout = Dropout(0.3, name="dropout")(lstm_out)

concat = Concatenate(name="concat")([dropout, embedding_flat])
dense = Dense(32, activation="relu", name="dense_1")(concat)
output = Dense(1, name="output")(dense)

model = Model(inputs=[input_numeric, input_branch], outputs=output)
model.compile(optimizer="adam", loss="mse", metrics=["mae"])
model.summary()

# Callbacks
# Stop after 5 epochs without val_loss improvement and restore the best
# weights; halve the learning rate after 3 stagnant epochs (floor 1e-6).
early_stop = EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=3, min_lr=1e-6)

# Train Model
history = model.fit(
    [X_num_seq_train, X_branch_seq_train],
    y_seq_train,
    epochs=30,
    batch_size=64,
    validation_split=0.1,
    callbacks=[early_stop, reduce_lr],
    verbose=1
)


# Evaluate Model
y_pred = model.predict([X_num_seq_test, X_branch_seq_test])
mae_test = mean_absolute_error(y_seq_test, y_pred)
mse_test = mean_squared_error(y_seq_test, y_pred)
rmse_test = np.sqrt(mse_test)

print(f"\nTest MAE: {mae_test:,.2f}")
print(f"Test MSE: {mse_test:,.2f}")
print(f"Test RMSE: {rmse_test:,.2f}")


### + Attention

In [None]:
# Define Custom Attention Layer
class Attention(tf.keras.layers.Layer):
    """Additive attention pooling over axis 1 of its input.

    Each position along axis 1 gets a scalar score from a one-unit Dense layer
    followed by tanh. The scores are softmax-normalised over axis 1 and used to
    form a weighted sum of the inputs along that axis.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # One-unit projection that turns every position into a scalar score.
        self.score_dense = Dense(1)

    def call(self, inputs):
        """Return the attention-weighted sum of ``inputs`` over axis 1."""
        # In this notebook the input is a BiLSTM output with
        # return_sequences=True, so axis 1 is the time axis.
        weights = tf.nn.softmax(tf.nn.tanh(self.score_dense(inputs)), axis=1)
        return tf.reduce_sum(weights * inputs, axis=1)

# Define BiLSTM + Attention Model
# Relies on SEQ_LEN, top_features, n_branches and the sequence arrays created
# in the "Feature Selection + Scaling" cell.
input_numeric = Input(shape=(SEQ_LEN, len(top_features)), name="numeric_input")
input_branch = Input(shape=(1,), name="branch_input")

# Embedding for branch
embedding = Embedding(input_dim=n_branches, output_dim=16, name="branch_embedding")(input_branch)
embedding_flat = Reshape((16,), name="reshape_embedding")(embedding)

# BiLSTM + Attention
# return_sequences=True keeps every time step so the attention layer can
# weight them; the attention layer reduces them to one vector per sample.
lstm_out = Bidirectional(LSTM(64, return_sequences=True), name="bilstm")(input_numeric)
attention_out = Attention(name="attention")(lstm_out)
dropout = Dropout(0.3, name="dropout")(attention_out)

# Concatenate with branch embedding
concat = Concatenate(name="concat")([dropout, embedding_flat])
dense = Dense(32, activation="relu", name="dense_1")(concat)
output = Dense(1, name="output")(dense)

# Compile model
model = Model(inputs=[input_numeric, input_branch], outputs=output)
model.compile(optimizer="adam", loss="mse", metrics=["mae"])
model.summary()

# Callbacks
# Early stopping restores the best weights; the learning rate is halved after
# 3 epochs without val_loss improvement.
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6)

# Train the Model
history = model.fit(
    [X_num_seq_train, X_branch_seq_train],
    y_seq_train,
    epochs=30,
    batch_size=64,
    validation_split=0.1,
    callbacks=[early_stop, reduce_lr],
    verbose=1
)


# Evaluate on Test Set
y_pred = model.predict([X_num_seq_test, X_branch_seq_test])
mae_test = mean_absolute_error(y_seq_test, y_pred)
mse_test = mean_squared_error(y_seq_test, y_pred)
rmse_test = np.sqrt(mse_test)

print(f" Test MAE: {mae_test:,.2f}")
print(f" Test MSE: {mse_test:,.2f}")
print(f" Test RMSE: {rmse_test:,.2f}")

### Improved model - Feature Selection + Scaling + Residual Connection

In [None]:
# Model Architecture with Residual
input_numeric = Input(shape=(SEQ_LEN, len(top_features)), name="numeric_input")
input_branch = Input(shape=(1,), name="branch_input")

# Project input for residual connection
# The raw window is averaged over time and linearly projected to 128 units.
# 128 = 2 x 64: it has to equal the width of the BiLSTM output below so that
# the Add layer accepts both tensors.
x_proj = GlobalAveragePooling1D()(input_numeric)
x_proj = Dense(128, activation=None, name="residual_projection")(x_proj)

# Embedding
embedding = Embedding(input_dim=n_branches, output_dim=16, name="branch_embedding")(input_branch)
embedding_flat = Reshape((16,), name="reshape_embedding")(embedding)

# BiLSTM Block
lstm_out = Bidirectional(LSTM(64, return_sequences=False), name="bilstm")(input_numeric)

# Residual Connection
residual_out = Add(name="residual_add")([lstm_out, x_proj])

# Continue with Dropout and Dense
dropout = Dropout(0.3, name="dropout")(residual_out)
concat = Concatenate(name="concat")([dropout, embedding_flat])
dense = Dense(32, activation="relu", name="dense_1")(concat)
output = Dense(1, name="output")(dense)

# Compile
model = Model(inputs=[input_numeric, input_branch], outputs=output)
model.compile(optimizer="adam", loss="mse", metrics=["mae"])
model.summary()


# Callbacks
# Early stopping restores the best weights; the learning rate is halved after
# 3 epochs without val_loss improvement.
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6)


# Train the Model
history = model.fit(
    [X_num_seq_train, X_branch_seq_train],
    y_seq_train,
    epochs=30,
    batch_size=64,
    validation_split=0.1,
    callbacks=[early_stop, reduce_lr],
    verbose=1
)


# Evaluate on Test Set
# mse_test is computed but only MAE and RMSE are printed in this cell.
y_pred = model.predict([X_num_seq_test, X_branch_seq_test])
mae_test = mean_absolute_error(y_seq_test, y_pred)
mse_test = mean_squared_error(y_seq_test, y_pred)
rmse_test = np.sqrt(mse_test)

print(f" Test MAE: {mae_test:,.2f}")
print(f" Test RMSE: {rmse_test:,.2f}")

### Randomised search

In [None]:
# Reproducibility
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)
random.seed(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)


# Constants
SEQ_LEN = 60
TOP_N_FEATURES = 50


# Feature selection: a shallow XGBoost regressor fitted on the training split
# ranks the features and the TOP_N_FEATURES most important ones are kept.
exclude_cols = ["TXNDATE", "BRANCHID"]
numeric_cols = [col for col in X_train_full.columns if col not in exclude_cols]

# Only int64 / float64 / bool columns are candidates for selection.
X_temp = X_train_full[numeric_cols].select_dtypes(include=['int64', 'float64', 'bool']).copy()
y_temp = y_train

xgb_model = xgb.XGBRegressor(n_estimators=100, max_depth=3, random_state=SEED)
xgb_model.fit(X_temp, y_temp)

feature_importances = pd.Series(xgb_model.feature_importances_, index=X_temp.columns)
top_features = feature_importances.nlargest(TOP_N_FEATURES).index.tolist()


# Scaling
# The scaler is fitted on the training split only and reused for the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_full[top_features])
X_test_scaled = scaler.transform(X_test_full[top_features])

# The rebuilt DataFrames carry a fresh 0..n-1 index (positional pairing later).
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=top_features)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=top_features)


# Sequence Generator
def create_lstm_sequences(X_df, y_series, branch_series, window):
    """Build sliding-window samples without mixing rows of different branches.

    The three inputs are paired by position. Rows are split by branch id
    (branches in order of first appearance, rows in their original order) and
    each run of ``window`` consecutive rows of one branch becomes an input
    window; the label is the target of that branch's next row. Sliding over the
    whole frame instead would let a window contain rows from several branches
    while being labelled with only the last row's branch.

    Args:
        X_df: DataFrame of per-row features.
        y_series: Targets, positionally aligned with ``X_df``.
        branch_series: Branch id per row, positionally aligned with ``X_df``.
        window: Number of time steps per input window.

    Returns:
        Tuple ``(X_numeric_seq, X_branch_seq, y_seq)``:
        float32 ``(n, window, n_features)``, int32 ``(n, 1)`` and float32
        ``(n,)``. Branches with ``window`` rows or fewer contribute nothing;
        with a single branch the result equals a plain sliding window.
    """
    # Convert once up front instead of slicing the DataFrame with .iloc on
    # every iteration.
    features = X_df.to_numpy(dtype=np.float32)
    targets = np.asarray(y_series, dtype=np.float32)
    branches = np.asarray(branch_series)

    X_numeric_seq, X_branch_seq, y_seq = [], [], []

    # pd.unique keeps first-appearance order, so the output order is stable.
    for branch_id in pd.unique(branches):
        rows = np.flatnonzero(branches == branch_id)
        branch_features = features[rows]
        branch_targets = targets[rows]

        for end in range(window, len(rows)):
            X_numeric_seq.append(branch_features[end - window:end])
            X_branch_seq.append(branch_id)
            y_seq.append(branch_targets[end])

    if X_numeric_seq:
        X_out = np.stack(X_numeric_seq).astype(np.float32, copy=False)
    else:
        # Keep the 3-D rank the models expect even when there are no samples.
        X_out = np.empty((0, window, features.shape[1]), dtype=np.float32)

    return (
        X_out,
        np.array(X_branch_seq, dtype=np.int32).reshape(-1, 1),
        np.array(y_seq, dtype=np.float32)
    )

# Size the branch-embedding table from BOTH splits: a branch id that is larger
# than every training id would otherwise index past the end of the Embedding.
n_branches = int(max(X_train_full["BRANCHID"].max(), X_test_full["BRANCHID"].max())) + 1

# Features, target and branch id are paired by position, hence the index resets.
X_num_seq_train, X_branch_seq_train, y_seq_train = create_lstm_sequences(
    X_train_scaled_df.reset_index(drop=True),
    y_train.reset_index(drop=True),
    X_train_full["BRANCHID"].reset_index(drop=True),
    SEQ_LEN
)

X_num_seq_test, X_branch_seq_test, y_seq_test = create_lstm_sequences(
    X_test_scaled_df.reset_index(drop=True),
    y_test.reset_index(drop=True),
    X_test_full["BRANCHID"].reset_index(drop=True),
    SEQ_LEN
)

In [None]:
# Keras Tuner Model
def build_model(hp):
    """Build and compile one BiLSTM candidate for Keras Tuner.

    Tuned hyperparameters: ``embed_dim``, ``lstm_units``, ``dropout``,
    ``dense_units`` and ``learning_rate``. The input shape and the embedding
    table size come from the notebook-level ``SEQ_LEN``, ``top_features`` and
    ``n_branches``.

    Args:
        hp: Keras Tuner hyperparameter container.

    Returns:
        A compiled Keras model taking ``[numeric window, branch id]`` and
        producing one regression output (MSE loss, MAE metric).
    """
    numeric_in = Input(shape=(SEQ_LEN, len(top_features)), name="numeric_input")
    branch_in = Input(shape=(1,), name="branch_input")

    # Branch id -> dense vector of tunable width.
    embed_dim = hp.Int("embed_dim", min_value=4, max_value=32, step=4)
    branch_vec = Embedding(input_dim=n_branches, output_dim=embed_dim, name="branch_embedding")(branch_in)
    branch_vec = Reshape((embed_dim,), name="reshape_embedding")(branch_vec)

    # Sequence encoder: one vector per sample (return_sequences=False).
    lstm_units = hp.Int("lstm_units", min_value=32, max_value=128, step=16)
    encoded = Bidirectional(LSTM(lstm_units, return_sequences=False), name="bilstm")(numeric_in)

    dropout_rate = hp.Float("dropout", min_value=0.1, max_value=0.5, step=0.1)
    encoded = Dropout(dropout_rate)(encoded)

    # Regression head on the encoded sequence joined with the branch vector.
    dense_units = hp.Int("dense_units", min_value=16, max_value=128, step=16)
    merged = Concatenate()([encoded, branch_vec])
    hidden = Dense(dense_units, activation="relu")(merged)
    prediction = Dense(1, name="output")(hidden)

    candidate = Model(inputs=[numeric_in, branch_in], outputs=prediction)

    # Learning rate is sampled on a log scale.
    lr = hp.Float("learning_rate", min_value=1e-4, max_value=1e-2, sampling="log")
    candidate.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr), loss="mse", metrics=["mae"])

    return candidate

# Tuner Setup
# seed=SEED makes the sampled hyperparameter combinations repeatable; without
# it every run of the random search draws different trials.
# NOTE(review): results already stored under directory/project_name are
# presumably reused on re-run because `overwrite` is not set - confirm.
tuner = kt.RandomSearch(
    build_model,
    objective="val_mae",
    max_trials=10,
    executions_per_trial=1,
    seed=SEED,
    directory="tuner_results",
    project_name="bilstm_cashflow"
)

# Callbacks
# Early stopping restores the best weights; the learning rate is halved after
# 3 epochs without val_loss improvement.
early_stop = EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=3, min_lr=1e-6)

# Search
tuner.search(
    [X_num_seq_train, X_branch_seq_train],
    y_seq_train,
    validation_split=0.1,
    epochs=25,
    batch_size=64,
    callbacks=[early_stop, reduce_lr],
    verbose=2
)

# Get Best Model
best_model = tuner.get_best_models(num_models=1)[0]
best_hp = tuner.get_best_hyperparameters(1)[0]

# Evaluate
y_pred = best_model.predict([X_num_seq_test, X_branch_seq_test])
mae = mean_absolute_error(y_seq_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_seq_test, y_pred))

print("\nBest Hyperparameters:")
print(best_hp.values)
print(f"\nTest MAE: {mae:,.2f}")
print(f"Test RMSE: {rmse:,.2f}")


In [None]:
# Save best hyperparameters as dictionary
# `.values` is the name -> value mapping of the best trial; it is written to
# p2.json, which the later "Load Saved Hyperparameters" cells read back.
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
hp_dict = best_hps.values

# json is already imported in the notebook's first cell; this repeats it.
import json
with open("p2.json", "w") as f:
    json.dump(hp_dict, f)

In [None]:
# Load the saved hyperparameters
# Reads back the p2.json file to check what was stored.
with open("p2.json", "r") as f:
    saved_hps = json.load(f)

# Print the hyperparameters nicely
print("Saved Hyperparameters:")
for key, value in saved_hps.items():
    print(f"{key}: {value}")

### Common Sequence

In [None]:
# Reproducibility Setup
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)
random.seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)

# Sequence Configuration
SEQ_LEN = 60

# Feature Selection
# A shallow XGBoost regressor fitted on the training split ranks the features;
# the TOP_N_FEATURES most important ones are kept.
exclude_cols = ["TXNDATE", "BRANCHID"]
numeric_cols = [col for col in X_train_full.columns if col not in exclude_cols]

TOP_N_FEATURES = 50
# Only int64 / float64 / bool columns are candidates for selection.
X_temp = X_train_full[numeric_cols].select_dtypes(include=['int64', 'float64', 'bool']).copy()
y_temp = y_train

xgb_model = xgb.XGBRegressor(n_estimators=100, max_depth=3, random_state=SEED)
xgb_model.fit(X_temp, y_temp)

feature_importances = pd.Series(xgb_model.feature_importances_, index=X_temp.columns)
top_features = feature_importances.nlargest(TOP_N_FEATURES).index.tolist()

print(f"Selected Top {TOP_N_FEATURES} Features:\n", top_features)

# Feature Scaling
# The scaler is fitted on the training split only and reused for the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_full[top_features])
X_test_scaled = scaler.transform(X_test_full[top_features])

# The rebuilt DataFrames carry a fresh 0..n-1 index (positional pairing later).
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=top_features)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=top_features)

# Sequence Generator
def create_lstm_sequences(X_df, y_series, branch_series, window):
    """Build sliding-window samples without mixing rows of different branches.

    The three inputs are paired by position. Rows are split by branch id
    (branches in order of first appearance, rows in their original order) and
    each run of ``window`` consecutive rows of one branch becomes an input
    window; the label is the target of that branch's next row. Sliding over the
    whole frame instead would let a window contain rows from several branches
    while being labelled with only the last row's branch.

    Args:
        X_df: DataFrame of per-row features.
        y_series: Targets, positionally aligned with ``X_df``.
        branch_series: Branch id per row, positionally aligned with ``X_df``.
        window: Number of time steps per input window.

    Returns:
        Tuple ``(X_numeric_seq, X_branch_seq, y_seq)``:
        float32 ``(n, window, n_features)``, int32 ``(n, 1)`` and float32
        ``(n,)``. Branches with ``window`` rows or fewer contribute nothing;
        with a single branch the result equals a plain sliding window.
    """
    # Convert once up front instead of slicing the DataFrame with .iloc on
    # every iteration.
    features = X_df.to_numpy(dtype=np.float32)
    targets = np.asarray(y_series, dtype=np.float32)
    branches = np.asarray(branch_series)

    X_numeric_seq, X_branch_seq, y_seq = [], [], []

    # pd.unique keeps first-appearance order, so the output order is stable.
    for branch_id in pd.unique(branches):
        rows = np.flatnonzero(branches == branch_id)
        branch_features = features[rows]
        branch_targets = targets[rows]

        for end in range(window, len(rows)):
            X_numeric_seq.append(branch_features[end - window:end])
            X_branch_seq.append(branch_id)
            y_seq.append(branch_targets[end])

    if X_numeric_seq:
        X_out = np.stack(X_numeric_seq).astype(np.float32, copy=False)
    else:
        # Keep the 3-D rank the models expect even when there are no samples.
        X_out = np.empty((0, window, features.shape[1]), dtype=np.float32)

    return (
        X_out,
        np.array(X_branch_seq, dtype=np.int32).reshape(-1, 1),
        np.array(y_seq, dtype=np.float32)
    )

# Create Sequences
# Size the branch-embedding table from BOTH splits: a branch id that is larger
# than every training id would otherwise index past the end of the Embedding.
n_branches = int(max(X_train_full["BRANCHID"].max(), X_test_full["BRANCHID"].max())) + 1

# Features, target and branch id are paired by position, hence the index resets.
X_num_seq_train, X_branch_seq_train, y_seq_train = create_lstm_sequences(
    X_train_scaled_df.reset_index(drop=True),
    y_train.reset_index(drop=True),
    X_train_full["BRANCHID"].reset_index(drop=True),
    window=SEQ_LEN
)

X_num_seq_test, X_branch_seq_test, y_seq_test = create_lstm_sequences(
    X_test_scaled_df.reset_index(drop=True),
    # Wrapping in pd.Series also accepts a y_test that is a plain array.
    pd.Series(y_test).reset_index(drop=True),
    X_test_full["BRANCHID"].reset_index(drop=True),
    window=SEQ_LEN
)

### With validation

In [None]:
# Load Saved Hyperparameters
# p2.json is the file an earlier cell of this notebook writes with the tuner's
# best hyperparameter values.
with open("p2.json", "r") as f:
    best_params = json.load(f)

# Rebuild Model Using Saved Hyperparameters
def build_final_lstm_model(input_shape, n_branches, best_params):
    """Rebuild the tuned BiLSTM regressor from saved hyperparameters.

    Args:
        input_shape: Shape of one numeric window, ``(seq_len, n_features)``.
        n_branches: Size of the branch-embedding table (largest id + 1).
        best_params: Dict of tuned values. Recognised keys and their fallbacks:
            ``embed_dim`` (16), ``lstm_units`` (64), ``dropout`` (0.3),
            ``dense_units`` (32), ``learning_rate`` (1e-3).

    Returns:
        A compiled Keras model taking ``[numeric window, branch id]`` and
        producing one regression output (MSE loss, MAE metric).
    """
    input_numeric = Input(shape=input_shape, name="numeric_input")
    input_branch = Input(shape=(1,), name="branch_input")

    # Embedding Layer
    embed_dim = best_params.get("embed_dim", 16)
    embedding = Embedding(input_dim=n_branches, output_dim=embed_dim, name="branch_embedding")(input_branch)
    embedding_flat = Reshape((embed_dim,), name="reshape_embedding")(embedding)

    # LSTM Block
    lstm_units = best_params.get("lstm_units", 64)
    dropout_rate = best_params.get("dropout", 0.3)
    lstm_out = Bidirectional(LSTM(lstm_units, return_sequences=False), name="bilstm")(input_numeric)
    dropout = Dropout(dropout_rate, name="dropout")(lstm_out)

    # Fully Connected Layers
    dense_units = best_params.get("dense_units", 32)
    concat = Concatenate(name="concat")([dropout, embedding_flat])
    dense = Dense(dense_units, activation="relu", name="dense_1")(concat)

    # Output
    output = Dense(1, name="output")(dense)

    # Build and Compile
    # Use the tuned learning rate instead of silently dropping it; the fallback
    # of 1e-3 is Adam's default, so a file without the key behaves as before.
    learning_rate = best_params.get("learning_rate", 1e-3)
    model = Model(inputs=[input_numeric, input_branch], outputs=output)
    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss="mse",
        metrics=["mae"]
    )
    return model

# Build the Model
input_shape = (SEQ_LEN, len(top_features))  # assuming SEQ_LEN and top_features are defined
model_1 = build_final_lstm_model(input_shape, n_branches, best_params)
model_1.summary()

# Callbacks
# Early stopping restores the best weights; the learning rate is halved after
# 3 epochs without val_loss improvement.
early_stop = EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=3, min_lr=1e-6)

# Internal Train-Test Split from Training Sequences
# shuffle=False: the last 10% of the training sequences become the validation set.
X_num_train, X_num_val, X_branch_train, X_branch_val, y_train_new, y_val = train_test_split(
    X_num_seq_train, X_branch_seq_train, y_seq_train, test_size=0.1, shuffle=False
)

# Train Final Model with Manual Validation Set
# "batch_size" is not one of the hyperparameters tuned in this notebook, so the
# fallback of 64 applies unless p2.json was edited by hand.
history = model_1.fit(
    [X_num_train, X_branch_train], y_train_new,
    validation_data=([X_num_val, X_branch_val], y_val),
    epochs=30,
    batch_size=best_params.get("batch_size", 64),
    callbacks=[early_stop, reduce_lr],
    shuffle=False,
    verbose=1
)


# Evaluate on Test Set
y_pred_1 = model_1.predict([X_num_seq_test, X_branch_seq_test])
mae_test = mean_absolute_error(y_seq_test, y_pred_1)
rmse_test = np.sqrt(mean_squared_error(y_seq_test, y_pred_1))

print(f"\nTest MAE: {mae_test:,.2f}")
print(f"Test RMSE: {rmse_test:,.2f}")

### Improved structure

In [None]:
# Load Saved Hyperparameters
# p2.json is the file an earlier cell of this notebook writes with the tuner's
# best hyperparameter values.
with open("p2.json", "r") as f:
    best_params = json.load(f)

# Custom Attention Layer
@register_keras_serializable()
class CustomAttention(Layer):
    """Additive attention pooling over axis 1, registered for Keras serialisation.

    Each position along axis 1 gets a scalar score from a one-unit Dense layer
    followed by tanh. The scores are softmax-normalised over axis 1 and used to
    form a weighted sum of the inputs along that axis.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.supports_masking = True
        # One-unit projection that turns every position into a scalar score.
        self.W = Dense(1)

    def call(self, inputs):
        """Return the attention-weighted sum of ``inputs`` over axis 1."""
        weights = tf.nn.softmax(tf.nn.tanh(self.W(inputs)), axis=1)
        return tf.reduce_sum(weights * inputs, axis=1)

    def get_config(self):
        """Return the base layer config; the layer adds no constructor arguments."""
        return super().get_config()

# Rebuild Enhanced LSTM Model Using Saved Hyperparameters
def build_final_lstm_model_with_attention(input_shape, n_branches, best_params):
    """Build a BiLSTM + attention regressor from saved hyperparameters.

    The branch embedding is repeated along the time axis and concatenated to
    every step of the numeric window before the BiLSTM; attention pooling then
    reduces the BiLSTM outputs to one vector.

    Args:
        input_shape: Shape of one numeric window, ``(seq_len, n_features)``.
        n_branches: Size of the branch-embedding table (largest id + 1).
        best_params: Dict of tuned values. Recognised keys and their fallbacks:
            ``embed_dim`` (16), ``lstm_units`` (64), ``dense_units`` (32),
            ``dropout`` (0.3), ``learning_rate`` (1e-3).

    Returns:
        A compiled Keras model taking ``[numeric window, branch id]`` and
        producing one regression output (MSE loss, MAE metric).
    """
    # Inputs
    numeric_input = Input(shape=input_shape, name="numeric_input")
    branch_input = Input(shape=(1,), name="branch_input")

    # Branch Embedding + Repeat
    embed_dim = best_params.get("embed_dim", 16)
    branch_embedding = Embedding(input_dim=n_branches, output_dim=embed_dim, name="branch_embedding")(branch_input)
    # Drop the length-1 axis, then repeat the vector once per time step.
    branch_embedding = branch_embedding[:, 0, :]
    branch_embedding_repeated = RepeatVector(input_shape[0])(branch_embedding)

    # Concatenate Embedding with Numeric Input
    x = Concatenate(name="concat_embedding_numeric")([numeric_input, branch_embedding_repeated])

    # BiLSTM Block with Return Sequences
    lstm_units = best_params.get("lstm_units", 64)
    x = Bidirectional(LSTM(units=lstm_units, return_sequences=True), name="bilstm")(x)

    # Attention Layer
    x = CustomAttention(name="attention")(x)

    # Dense + Dropout
    dense_units = best_params.get("dense_units", 32)
    dropout_rate = best_params.get("dropout", 0.3)
    x = Dense(dense_units, activation="relu", name="dense")(x)
    x = Dropout(dropout_rate, name="dropout")(x)

    # Output Layer
    output = Dense(1, name="output")(x)

    # Build and Compile
    # Use the tuned learning rate instead of silently dropping it; the fallback
    # of 1e-3 is Adam's default, so a file without the key behaves as before.
    learning_rate = best_params.get("learning_rate", 1e-3)
    model = Model(inputs=[numeric_input, branch_input], outputs=output)
    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss="mse",
        metrics=["mae"]
    )

    return model

# Build the Model
input_shape = (SEQ_LEN, len(top_features))  # assuming SEQ_LEN and top_features are defined
model_2 = build_final_lstm_model_with_attention(input_shape, n_branches, best_params)
model_2.summary()

# Callbacks
# Early stopping restores the best weights; the learning rate is halved after
# 3 epochs without val_loss improvement.
early_stop = EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=3, min_lr=1e-6)

# Internal Train-Test Split from Training Sequences
# shuffle=False: the last 10% of the training sequences become the validation set.
X_num_train, X_num_val, X_branch_train, X_branch_val, y_train_new, y_val = train_test_split(
    X_num_seq_train, X_branch_seq_train, y_seq_train, test_size=0.1, shuffle=False
)

# Train Final Model
# "batch_size" is not one of the hyperparameters tuned in this notebook, so the
# fallback of 64 applies unless p2.json was edited by hand.
history = model_2.fit(
    [X_num_train, X_branch_train], y_train_new,
    validation_data=([X_num_val, X_branch_val], y_val),
    epochs=30,
    batch_size=best_params.get("batch_size", 64),
    callbacks=[early_stop, reduce_lr],
    shuffle=False,
    verbose=1
)

# Evaluate on Test Set
y_pred_2 = model_2.predict([X_num_seq_test, X_branch_seq_test])
mae_test = mean_absolute_error(y_seq_test, y_pred_2)
rmse_test = np.sqrt(mean_squared_error(y_seq_test, y_pred_2))

print(f"\n Final Test MAE: {mae_test:,.2f}")
print(f" Final Test RMSE: {rmse_test:,.2f}")


### Residual BiLSTM + Attention for Cash Flow Forecasting (Tuned Parameters)

In [None]:
# Custom Attention Layer
@register_keras_serializable()
class CustomAttention(Layer):
    """Additive attention pooling over axis 1, registered for Keras serialisation.

    Each position along axis 1 gets a scalar score from a one-unit Dense layer
    followed by tanh. The scores are softmax-normalised over axis 1 and used to
    form a weighted sum of the inputs along that axis.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.supports_masking = True
        # One-unit projection that turns every position into a scalar score.
        self.W = Dense(1)

    def call(self, inputs):
        """Return the attention-weighted sum of ``inputs`` over axis 1."""
        weights = tf.nn.softmax(tf.nn.tanh(self.W(inputs)), axis=1)
        return tf.reduce_sum(weights * inputs, axis=1)

    def get_config(self):
        """Return the base layer config; the layer adds no constructor arguments."""
        return super().get_config()

# Load Saved Hyperparameters
# p2.json is the file an earlier cell of this notebook writes with the tuner's
# best hyperparameter values.
with open("p2.json", "r") as f:
    best_params = json.load(f)

# Build Residual BiLSTM + Attention Model
def build_lstm_attention_residual_model(input_shape, n_branches, best_params):
    """Build a residual BiLSTM + attention regressor from saved hyperparameters.

    The branch embedding is repeated along the time axis and concatenated to
    the numeric window. The BiLSTM output is summed with its (projected) input,
    pooled by attention and fed to a dense head, which gets a second residual
    connection when the widths allow it.

    Args:
        input_shape: Shape of one numeric window, ``(seq_len, n_features)``.
        n_branches: Size of the branch-embedding table (largest id + 1).
        best_params: Dict of tuned values. Recognised keys and their fallbacks:
            ``embed_dim`` (16), ``lstm_units`` (64), ``dense_units`` (32),
            ``dropout`` (0.3), ``learning_rate`` (1e-3).

    Returns:
        A compiled Keras model taking ``[numeric window, branch id]`` and
        producing one regression output (MSE loss, MAE metric).
    """
    numeric_input = Input(shape=input_shape, name="numeric_input")
    branch_input = Input(shape=(1,), name="branch_input")

    # Embedding + Repeat
    embed_dim = best_params.get("embed_dim", 16)
    branch_embedding = Embedding(input_dim=n_branches, output_dim=embed_dim, name="branch_embedding")(branch_input)
    # Drop the length-1 axis, then repeat the vector once per time step.
    branch_embedding = branch_embedding[:, 0, :]
    branch_embedding_repeated = RepeatVector(input_shape[0])(branch_embedding)

    # Merge numeric + embedding
    x = Concatenate(name="concat_embedding_numeric")([numeric_input, branch_embedding_repeated])

    # BiLSTM Layer
    lstm_units = best_params.get("lstm_units", 64)
    lstm_out = Bidirectional(LSTM(units=lstm_units, return_sequences=True), name="bilstm")(x)

    # Residual Connection (project input if needed)
    # Add needs equal feature widths, so the input is projected when they differ.
    if x.shape[-1] != lstm_out.shape[-1]:
        x_proj = Dense(lstm_out.shape[-1], name="residual_projection")(x)
        x = Add(name="residual_lstm")([lstm_out, x_proj])
    else:
        x = Add(name="residual_lstm")([lstm_out, x])

    # Attention
    x = CustomAttention(name="attention")(x)

    # Dense + Dropout
    dense_units = best_params.get("dense_units", 32)
    dropout_rate = best_params.get("dropout", 0.3)
    dense_out = Dense(dense_units, activation="relu", name="dense")(x)
    dense_out = Dropout(dropout_rate, name="dropout")(dense_out)

    # Optional Residual on Dense
    # Only applied when the attention output and the dense head share a width.
    if x.shape[-1] == dense_out.shape[-1]:
        x = Add(name="residual_dense")([x, dense_out])
    else:
        x = dense_out

    # Output
    output = Dense(1, name="output")(x)

    # Use the tuned learning rate instead of silently dropping it; the fallback
    # of 1e-3 is Adam's default, so a file without the key behaves as before.
    learning_rate = best_params.get("learning_rate", 1e-3)
    model = Model(inputs=[numeric_input, branch_input], outputs=output)
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss="mse", metrics=["mae"])
    return model

# Set Input Shape and Build Model
input_shape = (SEQ_LEN, len(top_features))
model_3 = build_lstm_attention_residual_model(input_shape, n_branches, best_params)
model_3.summary()

# Callbacks
# Early stopping restores the best weights; the learning rate is halved after
# 3 epochs without val_loss improvement.
early_stop = EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=3, min_lr=1e-6)

# Internal Train-Test Split
# shuffle=False: the last 10% of the training sequences become the validation set.
X_num_train, X_num_val, X_branch_train, X_branch_val, y_train_new, y_val = train_test_split(
    X_num_seq_train, X_branch_seq_train, y_seq_train, test_size=0.1, shuffle=False
)

# Train Final Model
# "batch_size" is not one of the hyperparameters tuned in this notebook, so the
# fallback of 64 applies unless p2.json was edited by hand.
history = model_3.fit(
    [X_num_train, X_branch_train], y_train_new,
    validation_data=([X_num_val, X_branch_val], y_val),
    epochs=30,
    batch_size=best_params.get("batch_size", 64),
    callbacks=[early_stop, reduce_lr],
    shuffle=False,
    verbose=1
)

# Evaluate on Test Set
# flatten() turns the predictions into a 1-D array before scoring.
y_pred_3 = model_3.predict([X_num_seq_test, X_branch_seq_test]).flatten()
mae_test = mean_absolute_error(y_seq_test, y_pred_3)
rmse_test = np.sqrt(mean_squared_error(y_seq_test, y_pred_3))

print(f"\n Final Test MAE: {mae_test:,.2f}")
print(f" Final Test RMSE: {rmse_test:,.2f}")


### Deep Residual BiLSTM + Attention Model

### Deep Residual BiLSTM + Attention + LayerNormalization

In [None]:
# Custom Attention Layer
@register_keras_serializable()
class CustomAttention(Layer):
    """Attention pooling that collapses axis 1 of its input.

    A one-unit ``Dense`` projection scores every position along axis 1; the
    scores go through ``tanh`` and a softmax over axis 1, and the input is
    summed along axis 1 using the resulting weights.

    Note: ``supports_masking`` is enabled, but ``call`` accepts no ``mask``
    argument, so an incoming mask is not applied to the attention weights.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.supports_masking = True
        # Scoring projection: one scalar per position.
        self.W = Dense(1)

    def call(self, inputs):
        projected = self.W(inputs)
        weights = tf.nn.softmax(tf.nn.tanh(projected), axis=1)
        weighted_inputs = weights * inputs
        return tf.reduce_sum(weighted_inputs, axis=1)

    def get_config(self):
        # The layer has no constructor arguments of its own, so the base config is complete.
        return super().get_config()


# Load Saved Hyperparameters
# p2.json is read into the global `best_params`; the model builder below looks keys up
# with .get(...) and a default, so a key missing from the file falls back silently.
with open("p2.json", "r") as f:
    best_params = json.load(f)


# Build Residual BiLSTM + Attention + LayerNormalization Model
def build_bilstm_attention_with_normalization(input_shape, n_branches, best_params):
    """Assemble and compile the two-stage residual BiLSTM + attention regressor.

    Parameters
    ----------
    input_shape : tuple
        Shape of one numeric sample; ``input_shape[0]`` is the number of
        steps the branch embedding is repeated over.
    n_branches : int
        ``input_dim`` of the branch embedding table.
    best_params : dict
        Hyperparameters, each read with a default: ``embed_dim`` (16),
        ``lstm_units`` (64), ``dense_units`` (32), ``dropout`` (0.3).

    Returns
    -------
    Model
        Two-input model (``numeric_input``, ``branch_input``) with a single
        linear output, compiled with Adam, MSE loss and an MAE metric.
    """
    numeric_input = Input(shape=input_shape, name="numeric_input")
    branch_input = Input(shape=(1,), name="branch_input")

    # Branch id -> embedding vector, copied onto every step of the window.
    embedded_branch = Embedding(
        input_dim=n_branches,
        output_dim=best_params.get("embed_dim", 16),
        name="branch_embedding",
    )(branch_input)
    embedded_branch = embedded_branch[:, 0, :]
    tiled_branch = RepeatVector(input_shape[0])(embedded_branch)

    sequence = Concatenate(name="concat_embedding_numeric")([numeric_input, tiled_branch])

    # Two identical stages: BiLSTM -> LayerNorm -> residual add.
    # The skip path is projected with a Dense layer only when channel widths differ.
    lstm_units = best_params.get("lstm_units", 64)
    for stage in (1, 2):
        recurrent = Bidirectional(
            LSTM(units=lstm_units, return_sequences=True), name=f"bilstm_{stage}"
        )(sequence)
        recurrent = LayerNormalization(name=f"layernorm_{stage}")(recurrent)
        skip = sequence
        if sequence.shape[-1] != recurrent.shape[-1]:
            skip = Dense(recurrent.shape[-1], name=f"residual_proj_{stage}")(sequence)
        sequence = Add(name=f"residual_{stage}")([recurrent, skip])

    # Collapse the step axis with attention pooling.
    pooled = CustomAttention(name="attention")(sequence)

    head = Dense(best_params.get("dense_units", 32), activation="relu", name="dense")(pooled)
    head = Dropout(best_params.get("dropout", 0.3), name="dropout")(head)

    # The dense residual is only possible when both widths already agree.
    if pooled.shape[-1] == head.shape[-1]:
        head = Add(name="residual_dense")([pooled, head])

    output = Dense(1, name="output")(head)

    model = Model(inputs=[numeric_input, branch_input], outputs=output)
    model.compile(optimizer="adam", loss="mse", metrics=["mae"])
    return model


# Set Input Shape and Build Model
# One numeric sample is a (SEQ_LEN, number of selected features) window.
input_shape = (SEQ_LEN, len(top_features))
model_5 = build_bilstm_attention_with_normalization(input_shape, n_branches, best_params)
model_5.summary()


# Callbacks
# Early stopping restores the best val_loss weights after 5 stagnant epochs;
# the learning rate is halved after 3 stagnant epochs, with a floor of 1e-6.
early_stop = EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=3, min_lr=1e-6)


# Train-Validation Split
# shuffle=False keeps row order: the final 10% of the training sequences are held out for validation.
X_num_train, X_num_val, X_branch_train, X_branch_val, y_train_new, y_val = train_test_split(
    X_num_seq_train, X_branch_seq_train, y_seq_train, test_size=0.1, shuffle=False
)


# Train the Model
# Batch size comes from best_params when present, otherwise 64.
history = model_5.fit(
    [X_num_train, X_branch_train], y_train_new,
    validation_data=([X_num_val, X_branch_val], y_val),
    epochs=30,
    batch_size=best_params.get("batch_size", 64),
    callbacks=[early_stop, reduce_lr],
    shuffle=False,
    verbose=1
)


# Evaluate on Test Set
# flatten() turns the model output into a 1-D vector before computing the metrics.
y_pred_5 = model_5.predict([X_num_seq_test, X_branch_seq_test]).flatten()
mae_test = mean_absolute_error(y_seq_test, y_pred_5)
rmse_test = np.sqrt(mean_squared_error(y_seq_test, y_pred_5))

print(f"\n Final Test MAE: {mae_test:,.2f}")
print(f" Final Test RMSE: {rmse_test:,.2f}")


### Optimal Model - LSTM

In [None]:
## Creating Sequence


# Reproducibility Setup
# Seed NumPy, TensorFlow and Python's `random`, and export PYTHONHASHSEED.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)
random.seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)


# Sequence Configuration
# Number of consecutive rows in each input window.
SEQ_LEN = 60


# Feature Selection
# TXNDATE and BRANCHID are never candidates; BRANCHID is fed to the models separately.
exclude_cols = ["TXNDATE", "BRANCHID"]
numeric_cols = [col for col in X_train_full.columns if col not in exclude_cols]

TOP_N_FEATURES = 50
# Only int64 / float64 / bool columns are kept; columns of any other dtype are dropped here.
X_temp = X_train_full[numeric_cols].select_dtypes(include=['int64', 'float64', 'bool']).copy()
y_temp = y_train

# A small XGBoost model is fitted on the training data only, purely to rank the features.
xgb_model = xgb.XGBRegressor(n_estimators=100, max_depth=3, random_state=SEED)
xgb_model.fit(X_temp, y_temp)

# Keep the TOP_N_FEATURES columns with the highest importance scores.
feature_importances = pd.Series(xgb_model.feature_importances_, index=X_temp.columns)
top_features = feature_importances.nlargest(TOP_N_FEATURES).index.tolist()

print(f"Selected Top {TOP_N_FEATURES} Features:\n", top_features)


# Feature Scaling
# The scaler is fitted on the training rows; the test rows reuse those statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_full[top_features])
X_test_scaled = scaler.transform(X_test_full[top_features])

# Wrap the scaled arrays back into DataFrames (they get a fresh default index).
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=top_features)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=top_features)

# Sequence Generator
def create_lstm_sequences(X_df, y_series, branch_series, window):
    """Build sliding-window samples for the sequence models.

    Sample ``k`` uses rows ``k .. k + window - 1`` of ``X_df`` as its input
    window and takes its branch id and target from row ``k + window``, the
    row immediately after the window.

    Parameters
    ----------
    X_df : pandas.DataFrame
        Feature rows; windows follow the existing row order.
    y_series : pandas.Series
        Target per row, positionally aligned with ``X_df``.
    branch_series : pandas.Series
        Branch id per row, positionally aligned with ``X_df``.
    window : int
        Number of consecutive rows in each input window.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_numeric, X_branch, y)`` with shapes ``(n, window, n_features)``
        float32, ``(n, 1)`` int32 and ``(n,)`` float32, where
        ``n = max(len(X_df) - window, 0)``. The shapes keep their full rank
        even when ``n`` is 0.

    Raises
    ------
    ValueError
        If ``window`` is negative.

    Notes
    -----
    NOTE(review): windows are cut over consecutive rows without consulting
    ``branch_series``, so a window can span rows of more than one branch
    unless the caller has arranged the rows to prevent that.
    """
    if window < 0:
        raise ValueError("window must be non-negative")

    # Convert once up front instead of slicing the DataFrame on every iteration.
    features = X_df.to_numpy(dtype=np.float32)
    branch_ids = branch_series.to_numpy()
    targets = y_series.to_numpy()

    n_rows, n_features = features.shape
    n_samples = max(n_rows - window, 0)

    if n_samples == 0:
        # Not enough rows for a single window: return empty arrays of the right rank.
        return (
            np.empty((0, window, n_features), dtype=np.float32),
            np.empty((0, 1), dtype=np.int32),
            np.empty((0,), dtype=np.float32),
        )

    X_numeric_seq = np.stack(
        [features[start:start + window] for start in range(n_samples)]
    )

    return (
        X_numeric_seq,
        branch_ids[window:].astype(np.int32).reshape(-1, 1),
        targets[window:].astype(np.float32),
    )


# Create Sequences
# Embedding vocabulary size: largest training BRANCHID + 1.
# NOTE(review): a test row whose BRANCHID exceeds the training maximum would fall outside
# the embedding table — confirm both splits share the same id range.
n_branches = int(X_train_full["BRANCHID"].max()) + 1

# Indexes are reset so features, targets and branch ids line up positionally.
X_num_seq_train, X_branch_seq_train, y_seq_train = create_lstm_sequences(
    X_train_scaled_df.reset_index(drop=True),
    y_train.reset_index(drop=True),
    X_train_full["BRANCHID"].reset_index(drop=True),
    window=SEQ_LEN
)

# y_test is wrapped in pd.Series so this also works when it is not a pandas object.
X_num_seq_test, X_branch_seq_test, y_seq_test = create_lstm_sequences(
    X_test_scaled_df.reset_index(drop=True),
    pd.Series(y_test).reset_index(drop=True),
    X_test_full["BRANCHID"].reset_index(drop=True),
    window=SEQ_LEN
)


In [None]:
# Custom Attention Layer
@register_keras_serializable()
class CustomAttention(Layer):
    """Weighted-sum pooling over axis 1 driven by a learned one-unit score.

    Each position along axis 1 is scored with ``Dense(1)``, passed through
    ``tanh``, normalised by a softmax over axis 1, and used to weight the
    input before summing that axis away.

    Note: ``supports_masking`` is set, yet ``call`` has no ``mask``
    parameter, so masks are not taken into account when weighting.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.supports_masking = True
        # One score per position.
        self.W = Dense(1)

    def call(self, inputs):
        squashed_scores = tf.nn.tanh(self.W(inputs))
        attention = tf.nn.softmax(squashed_scores, axis=1)
        return tf.reduce_sum(attention * inputs, axis=1)

    def get_config(self):
        # Nothing beyond the base Layer configuration needs to be serialised.
        return super().get_config()


# Load Saved Hyperparameters
# Rebinds the global `best_params` from p2.json. The builder below reads every key with
# .get(...) and a default, so keys absent from the file do not raise.
with open("p2.json", "r") as f:
    best_params = json.load(f)


# Build Residual BiLSTM + Attention + LayerNormalization Model
def build_bilstm_attention_with_normalization(input_shape, n_branches, best_params):
    """Build and compile the residual BiLSTM + LayerNorm + attention model.

    Parameters
    ----------
    input_shape : tuple
        Shape of one numeric sample; its first entry is the number of steps
        the branch embedding is repeated over.
    n_branches : int
        Size of the branch embedding vocabulary (``input_dim``).
    best_params : dict
        Hyperparameters read with defaults: ``embed_dim`` (16),
        ``lstm_units`` (64), ``dense_units`` (32), ``dropout`` (0.3).

    Returns
    -------
    Model
        Model over ``[numeric_input, branch_input]`` with one linear output,
        compiled with the Adam optimizer, MSE loss and an MAE metric.
    """

    def residual_bilstm_stage(tensor, units, index):
        """BiLSTM -> LayerNorm, then add the stage input (projected if widths differ)."""
        encoded = Bidirectional(
            LSTM(units=units, return_sequences=True), name=f"bilstm_{index}"
        )(tensor)
        encoded = LayerNormalization(name=f"layernorm_{index}")(encoded)
        if tensor.shape[-1] != encoded.shape[-1]:
            projected = Dense(encoded.shape[-1], name=f"residual_proj_{index}")(tensor)
            return Add(name=f"residual_{index}")([encoded, projected])
        return Add(name=f"residual_{index}")([encoded, tensor])

    numeric_input = Input(shape=input_shape, name="numeric_input")
    branch_input = Input(shape=(1,), name="branch_input")

    # Embed the branch id and repeat the vector for every step of the window.
    embed_dim = best_params.get("embed_dim", 16)
    branch_vector = Embedding(
        input_dim=n_branches, output_dim=embed_dim, name="branch_embedding"
    )(branch_input)
    branch_vector = branch_vector[:, 0, :]
    branch_steps = RepeatVector(input_shape[0])(branch_vector)

    merged = Concatenate(name="concat_embedding_numeric")([numeric_input, branch_steps])

    lstm_units = best_params.get("lstm_units", 64)
    stage_one = residual_bilstm_stage(merged, lstm_units, 1)
    stage_two = residual_bilstm_stage(stage_one, lstm_units, 2)

    # Attention pooling removes the step axis.
    context = CustomAttention(name="attention")(stage_two)

    dense_units = best_params.get("dense_units", 32)
    dropout_rate = best_params.get("dropout", 0.3)
    hidden = Dense(dense_units, activation="relu", name="dense")(context)
    hidden = Dropout(dropout_rate, name="dropout")(hidden)

    # Residual around the dense head, only when the widths already match.
    if context.shape[-1] == hidden.shape[-1]:
        final_features = Add(name="residual_dense")([context, hidden])
    else:
        final_features = hidden

    output = Dense(1, name="output")(final_features)

    model = Model(inputs=[numeric_input, branch_input], outputs=output)
    model.compile(optimizer="adam", loss="mse", metrics=["mae"])
    return model


# Set Input Shape and Build Model
# One numeric sample is a (SEQ_LEN, number of selected features) window.
input_shape = (SEQ_LEN, len(top_features))
lstm_model = build_bilstm_attention_with_normalization(input_shape, n_branches, best_params)
lstm_model.summary()


# Callbacks
# Early stopping restores the best val_loss weights after 5 stagnant epochs;
# the learning rate is halved after 3 stagnant epochs, with a floor of 1e-6.
early_stop = EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=3, min_lr=1e-6)


# Train-Validation Split
# shuffle=False keeps row order: the final 10% of the training sequences are held out for validation.
X_num_train, X_num_val, X_branch_train, X_branch_val, y_train_new, y_val = train_test_split(
    X_num_seq_train, X_branch_seq_train, y_seq_train, test_size=0.1, shuffle=False
)


# Train the Model
# Batch size comes from best_params when present, otherwise 64.
history = lstm_model.fit(
    [X_num_train, X_branch_train], y_train_new,
    validation_data=([X_num_val, X_branch_val], y_val),
    epochs=30,
    batch_size=best_params.get("batch_size", 64),
    callbacks=[early_stop, reduce_lr],
    shuffle=False,
    verbose=1
)


# Evaluate on Test Set
# flatten() turns the model output into a 1-D vector before computing the metrics.
y_pred_lstm = lstm_model.predict([X_num_seq_test, X_branch_seq_test]).flatten()
mae_test = mean_absolute_error(y_seq_test, y_pred_lstm)
rmse_test = np.sqrt(mean_squared_error(y_seq_test, y_pred_lstm))

print(f"\n Final Test MAE: {mae_test:,.2f}")
print(f" Final Test RMSE: {rmse_test:,.2f}")


### Save the optimal model

In [None]:
# Save the trained model to a .keras file.
# NOTE: save() is given no custom_objects here; CustomAttention is declared with
# @register_keras_serializable() and is passed via custom_objects when the model is loaded.
lstm_model.save("best_lstm_model.keras")
print(" Model saved as 'best_lstm_model.keras'")


In [None]:
### Load the model later

# Re-register the custom layer so Keras can find it
@register_keras_serializable()
class CustomAttention(Layer):
    """Attention pooling layer; definition repeated so this cell can load the model.

    Scores each position along axis 1 with a one-unit ``Dense`` layer,
    applies ``tanh`` and a softmax over axis 1, and returns the input summed
    along axis 1 with those weights.

    Note: ``supports_masking`` is enabled, but ``call`` takes no ``mask``
    argument, so masks do not influence the attention weights.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.supports_masking = True
        # Scoring projection: one scalar per position.
        self.W = Dense(1)

    def call(self, inputs):
        scores = self.W(inputs)
        scores = tf.nn.tanh(scores)
        normalised = tf.nn.softmax(scores, axis=1)
        return tf.reduce_sum(normalised * inputs, axis=1)

    def get_config(self):
        # The base Layer config is sufficient; there are no extra constructor arguments.
        return super().get_config()

# Load the model for later use
# custom_objects maps the serialized class name to the CustomAttention class defined in this cell.
# Rebinds the global `lstm_model` to the instance restored from disk.
lstm_model = load_model("best_lstm_model.keras", custom_objects={"CustomAttention": CustomAttention})

# Example: Predict on test set
# Relies on X_num_seq_test / X_branch_seq_test / y_seq_test still being defined in the kernel.
y_pred_lstm = lstm_model.predict([X_num_seq_test, X_branch_seq_test]).flatten()

mae_test = mean_absolute_error(y_seq_test, y_pred_lstm)
rmse_test = np.sqrt(mean_squared_error(y_seq_test, y_pred_lstm))

print(f"\n Final Test MAE: {mae_test:,.2f}")
print(f" Final Test RMSE: {rmse_test:,.2f}")


# Temporal Convolutional Network (TCN)

### TCN Hyperparameter Tuning with KerasTuner

In [None]:
# Initial Setup 
# Seed NumPy, TensorFlow and Python's `random`, export PYTHONHASHSEED, and fix the window length.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)
random.seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)
SEQ_LEN = 60

# Feature Selection 
# TXNDATE and BRANCHID are never candidates; BRANCHID is fed to the models separately.
exclude_cols = ["TXNDATE", "BRANCHID"]
numeric_cols = [col for col in X_train_full.columns if col not in exclude_cols]

TOP_N_FEATURES = 50
# Only int64 / float64 / bool columns are kept; columns of any other dtype are dropped here.
X_temp = X_train_full[numeric_cols].select_dtypes(include=['int64', 'float64', 'bool']).copy()
y_temp = y_train

# A small XGBoost model is fitted on the training data only, purely to rank the features.
xgb_model = xgb.XGBRegressor(n_estimators=100, max_depth=3, random_state=SEED)
xgb_model.fit(X_temp, y_temp)

# Sort importances in descending order and keep the first TOP_N_FEATURES column names.
importances = xgb_model.feature_importances_
feat_imp = pd.Series(importances, index=X_temp.columns).sort_values(ascending=False)
top_features = feat_imp.head(TOP_N_FEATURES).index.tolist()

print(f"Selected Top {TOP_N_FEATURES} Features:\n", top_features)

# Scaling
# The scaler is fitted on the training rows; the test rows reuse those statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_full[top_features])
X_test_scaled = scaler.transform(X_test_full[top_features])

# The same names are rebound from ndarray to DataFrame (fresh default index).
X_train_scaled = pd.DataFrame(X_train_scaled, columns=top_features)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=top_features)

# Sequence Creation
def create_sequences(X_df, y_series, branch_series, window):
    """Build sliding-window samples for the TCN models.

    Sample ``k`` uses rows ``k .. k + window - 1`` of ``X_df`` as its input
    window and takes its branch id and target from row ``k + window``, the
    row immediately after the window.

    Parameters
    ----------
    X_df : pandas.DataFrame
        Feature rows; windows follow the existing row order.
    y_series : pandas.Series
        Target per row, positionally aligned with ``X_df``.
    branch_series : pandas.Series
        Branch id per row, positionally aligned with ``X_df``.
    window : int
        Number of consecutive rows in each input window.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_numeric, X_branch, y)`` with shapes ``(n, window, n_features)``
        float32, ``(n, 1)`` int32 and ``(n,)`` float32, where
        ``n = max(len(X_df) - window, 0)``. The shapes keep their full rank
        even when ``n`` is 0.

    Raises
    ------
    ValueError
        If ``window`` is negative.

    Notes
    -----
    NOTE(review): windows are cut over consecutive rows without consulting
    ``branch_series``, so a window can span rows of more than one branch
    unless the caller has arranged the rows to prevent that.
    """
    if window < 0:
        raise ValueError("window must be non-negative")

    # Convert once up front instead of slicing the DataFrame on every iteration.
    feature_matrix = X_df.to_numpy(dtype=np.float32)
    branch_values = branch_series.to_numpy()
    target_values = y_series.to_numpy()

    n_rows, n_features = feature_matrix.shape
    n_samples = max(n_rows - window, 0)

    if n_samples == 0:
        # Not enough rows for a single window: return empty arrays of the right rank.
        return (
            np.empty((0, window, n_features), dtype=np.float32),
            np.empty((0, 1), dtype=np.int32),
            np.empty((0,), dtype=np.float32),
        )

    windows = np.stack(
        [feature_matrix[first:first + window] for first in range(n_samples)]
    )

    return (
        windows,
        branch_values[window:].astype(np.int32).reshape(-1, 1),
        target_values[window:].astype(np.float32),
    )

# Embedding vocabulary size: largest training BRANCHID + 1.
n_branches = int(X_train_full["BRANCHID"].max()) + 1

# Indexes are reset so features, targets and branch ids line up positionally.
X_num_seq, X_branch_seq, y_seq = create_sequences(
    X_train_scaled.reset_index(drop=True),
    y_train.reset_index(drop=True),
    X_train_full["BRANCHID"].reset_index(drop=True),
    window=SEQ_LEN
)

# NOTE(review): y_test.reset_index requires y_test to be a pandas object. The later
# "Common sequence" cell rebinds y_test to a NumPy array (output of train_test_split),
# so running this cell after that one fails here.
X_num_seq_test, X_branch_seq_test, y_seq_test = create_sequences(
    X_test_scaled.reset_index(drop=True),
    y_test.reset_index(drop=True),
    X_test_full["BRANCHID"].reset_index(drop=True),
    window=SEQ_LEN
)

# TCN HyperModel
class TCNHyperModel(HyperModel):
    """KerasTuner search space for the TCN + branch-embedding regressor.

    Tuned hyperparameters: ``embed_dim`` (8/16/32), ``nb_filters``
    (32/64/128), ``kernel_size`` (2/3/5), ``dropout_rate`` (0.1-0.5 in steps
    of 0.1) and ``dense_units`` (32/64/128). The dilations are fixed at
    ``[1, 2, 4, 8]``.

    Parameters
    ----------
    input_shape : tuple
        Shape of one numeric sample passed to the numeric ``Input``.
    n_branches : int
        ``input_dim`` of the branch embedding table.
    """

    def __init__(self, input_shape, n_branches):
        # Initialise the HyperModel base class so its own attributes
        # (name, tunable, build wrapper) are set up before ours.
        super().__init__()
        self.input_shape = input_shape
        self.n_branches = n_branches

    def build(self, hp):
        """Create and compile one candidate model from the hyperparameters in ``hp``."""
        input_numeric = Input(shape=self.input_shape, name="numeric_input")
        input_branch = Input(shape=(1,), name="branch_input")

        # Branch id -> flat embedding vector of length embed_dim.
        embed_dim = hp.Choice("embed_dim", [8, 16, 32])
        embedding = Embedding(input_dim=self.n_branches, output_dim=embed_dim)(input_branch)
        embedding_flat = Reshape((embed_dim,), name="reshape_embedding")(embedding)

        nb_filters = hp.Choice("nb_filters", [32, 64, 128])
        kernel_size = hp.Choice("kernel_size", [2, 3, 5])
        dilations = [1, 2, 4, 8]

        # return_sequences=False: the TCN emits one vector per sample.
        tcn_out = TCN(
            nb_filters=nb_filters,
            kernel_size=kernel_size,
            dilations=dilations,
            return_sequences=False,
            dropout_rate=hp.Float("dropout_rate", 0.1, 0.5, step=0.1)
        )(input_numeric)

        # The branch embedding joins the TCN summary only after the temporal encoder.
        concat = Concatenate()([tcn_out, embedding_flat])
        dense_units = hp.Choice("dense_units", [32, 64, 128])
        dense = Dense(dense_units, activation="relu")(concat)

        output = Dense(1)(dense)

        model = Model(inputs=[input_numeric, input_branch], outputs=output)
        model.compile(optimizer="adam", loss="mse", metrics=["mae"])
        return model

# Tuning 
hypermodel = TCNHyperModel(
    input_shape=(SEQ_LEN, len(top_features)),
    n_branches=n_branches
)

# Random search: 10 trials with one training run each, ranked by validation MAE.
# overwrite=True starts a fresh search instead of reusing results already stored under `directory`.
tuner = RandomSearch(
    hypermodel,
    objective="val_mae",
    max_trials=10,
    executions_per_trial=1,
    directory="tcn_tuner_results",
    project_name="tcn_model_tuning",
    overwrite=True,
    seed=SEED
)

# Note: both callbacks watch val_loss (MSE), whereas the tuner objective above is val_mae.
early_stop = EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6)

# validation_split=0.1 lets Keras hold out a 10% validation slice of the arrays passed here.
tuner.search(
    [X_num_seq, X_branch_seq],
    y_seq,
    validation_split=0.1,
    epochs=20,
    batch_size=64,
    callbacks=[early_stop, reduce_lr],
    verbose=2
)

# Evaluation
# Take the top-ranked model from the search.
best_model = tuner.get_best_models(num_models=1)[0]

# Unlike the other evaluation cells, the prediction is not flattened before scoring.
y_pred = best_model.predict([X_num_seq_test, X_branch_seq_test])
mae_test = mean_absolute_error(y_seq_test, y_pred)
rmse_test = np.sqrt(mean_squared_error(y_seq_test, y_pred))

print(f"Best TCN Test MAE: {mae_test:,.2f}")
print(f"Best TCN Test RMSE: {rmse_test:,.2f}")


### Save Best Hyperparameters to JSON

In [None]:
# Extract the best hyperparameters
# `.values` is the name -> value mapping of the top-ranked trial; requires `tuner` from the tuning cell.
best_hps = tuner.get_best_hyperparameters(1)[0]
best_params_dict = best_hps.values

# Save to JSON
# This file is what the later TCN cells read back as `best_params`.
with open("best_tcn_hyperparams.json", "w") as f:
    json.dump(best_params_dict, f, indent=4)

print("Best hyperparameters saved to 'best_tcn_hyperparams.json'")


In [None]:
# Load and print the JSON file
json_file_path = "best_tcn_hyperparams.json"

# NOTE: this rebinds the global `best_params`, which the LSTM cells loaded from p2.json.
with open(json_file_path, "r") as f:
    best_params = json.load(f)

# Print one "key: value" line per hyperparameter
print("Best TCN Hyperparameters:")
for key, value in best_params.items():
    print(f"{key}: {value}")

### Common sequence preparation (shared by the TCN models below)

In [None]:
# Initial Setup
# Seed NumPy, TensorFlow and Python's `random`, export PYTHONHASHSEED, and fix the window length.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)
random.seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)
SEQ_LEN = 60

# Feature Selection
# TXNDATE and BRANCHID are never candidates; BRANCHID is fed to the models separately.
exclude_cols = ["TXNDATE", "BRANCHID"]
numeric_cols = [col for col in X_train_full.columns if col not in exclude_cols]

TOP_N_FEATURES = 50
# Only int64 / float64 / bool columns are kept; columns of any other dtype are dropped here.
X_temp = X_train_full[numeric_cols].select_dtypes(include=['int64', 'float64', 'bool']).copy()
y_temp = y_train

# A small XGBoost model is fitted on the training data only, purely to rank the features.
xgb_model = xgb.XGBRegressor(n_estimators=100, max_depth=3, random_state=SEED)
xgb_model.fit(X_temp, y_temp)

# Sort importances in descending order and keep the first TOP_N_FEATURES column names.
importances = xgb_model.feature_importances_
feat_imp = pd.Series(importances, index=X_temp.columns).sort_values(ascending=False)
top_features = feat_imp.head(TOP_N_FEATURES).index.tolist()

print(f"Selected Top {TOP_N_FEATURES} Features:\n", top_features)

# Scaling
# The scaler is fitted on the training rows; the test rows reuse those statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_full[top_features])
X_test_scaled = scaler.transform(X_test_full[top_features])

# The same names are rebound from ndarray to DataFrame (fresh default index).
X_train_scaled = pd.DataFrame(X_train_scaled, columns=top_features)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=top_features)

# Sequence Creation
def create_sequences(X_df, y_series, branch_series, window):
    """Build sliding-window samples shared by the TCN models.

    Sample ``k`` uses rows ``k .. k + window - 1`` of ``X_df`` as its input
    window and takes its branch id and target from row ``k + window``, the
    row immediately after the window.

    Parameters
    ----------
    X_df : pandas.DataFrame
        Feature rows; windows follow the existing row order.
    y_series : pandas.Series
        Target per row, positionally aligned with ``X_df``.
    branch_series : pandas.Series
        Branch id per row, positionally aligned with ``X_df``.
    window : int
        Number of consecutive rows in each input window.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_numeric, X_branch, y)`` with shapes ``(n, window, n_features)``
        float32, ``(n, 1)`` int32 and ``(n,)`` float32, where
        ``n = max(len(X_df) - window, 0)``. The shapes keep their full rank
        even when ``n`` is 0.

    Raises
    ------
    ValueError
        If ``window`` is negative.

    Notes
    -----
    NOTE(review): windows are cut over consecutive rows without consulting
    ``branch_series``, so a window can span rows of more than one branch
    unless the caller has arranged the rows to prevent that.
    """
    if window < 0:
        raise ValueError("window must be non-negative")

    # Convert once up front instead of slicing the DataFrame on every iteration.
    feature_matrix = X_df.to_numpy(dtype=np.float32)
    branch_values = branch_series.to_numpy()
    target_values = y_series.to_numpy()

    n_rows, n_features = feature_matrix.shape
    n_samples = max(n_rows - window, 0)

    if n_samples == 0:
        # Not enough rows for a single window: return empty arrays of the right rank.
        return (
            np.empty((0, window, n_features), dtype=np.float32),
            np.empty((0, 1), dtype=np.int32),
            np.empty((0,), dtype=np.float32),
        )

    windows = np.stack(
        [feature_matrix[first:first + window] for first in range(n_samples)]
    )

    return (
        windows,
        branch_values[window:].astype(np.int32).reshape(-1, 1),
        target_values[window:].astype(np.float32),
    )

# Embedding vocabulary size: largest training BRANCHID + 1.
n_branches = int(X_train_full["BRANCHID"].max()) + 1

# Indexes are reset so features, targets and branch ids line up positionally.
X_num_seq, X_branch_seq, y_seq = create_sequences(
    X_train_scaled.reset_index(drop=True),
    y_train.reset_index(drop=True),
    X_train_full["BRANCHID"].reset_index(drop=True),
    window=SEQ_LEN
)

# y_test is wrapped in pd.Series so the call also works when y_test is a NumPy array.
X_num_seq_test, X_branch_seq_test, y_seq_test = create_sequences(
    X_test_scaled.reset_index(drop=True),
    #y_test.reset_index(drop=True),
    pd.Series(y_test).reset_index(drop=True),
    X_test_full["BRANCHID"].reset_index(drop=True),
    window=SEQ_LEN
)

# NOTE(review): this split REBINDS the global `y_test` (and defines X_num_test / X_branch_test)
# to the last 20% of the TRAINING sequences. From here on `y_test` is no longer the hold-out
# target: the later cells use these names as validation data, and re-running this cell would
# build y_seq_test from the wrong values. The real hold-out set is X_num_seq_test /
# X_branch_seq_test / y_seq_test.
X_num_train, X_num_test, X_branch_train, X_branch_test, y_train_new, y_test = train_test_split(
    X_num_seq, X_branch_seq, y_seq, test_size=0.2, shuffle=False
)

# Per-sample input shapes (batch axis dropped) and embedding vocabulary size for the builders below.
# vocab_size is derived from the branch ids present in the training sequences only.
input_shape_num = X_num_seq.shape[1:]
input_shape_branch = X_branch_seq.shape[1:]
vocab_size = int(np.max(X_branch_seq)) + 1

### Final TCN model rebuilt from the best hyperparameters

In [None]:
# Load best parameters
# Reads the file written by the tuning section and rebinds the global `best_params`.
with open("best_tcn_hyperparams.json", "r") as f:
    best_params = json.load(f)

# Rebuild the model
def build_best_tcn_model(input_shape, n_branches, best_params):
    """Recreate the tuned TCN architecture from a plain hyperparameter dict.

    Parameters
    ----------
    input_shape : tuple
        Shape of one numeric sample.
    n_branches : int
        ``input_dim`` of the branch embedding table.
    best_params : dict
        Must contain ``embed_dim``, ``nb_filters``, ``kernel_size``,
        ``dropout_rate`` and ``dense_units``; a missing key raises ``KeyError``.

    Returns
    -------
    Model
        Model over ``[numeric_input, branch_input]`` with one linear output,
        compiled with Adam, MSE loss and an MAE metric.
    """
    numeric_in = Input(shape=input_shape, name="numeric_input")
    branch_in = Input(shape=(1,), name="branch_input")

    # Branch id -> flat embedding vector.
    emb_size = best_params["embed_dim"]
    branch_vec = Embedding(input_dim=n_branches, output_dim=emb_size)(branch_in)
    branch_vec = Reshape((emb_size,), name="reshape_embedding")(branch_vec)

    # The numeric window alone goes through the TCN; it emits one vector per sample.
    temporal_encoder = TCN(
        nb_filters=best_params["nb_filters"],
        kernel_size=best_params["kernel_size"],
        dilations=[1, 2, 4, 8],
        dropout_rate=best_params["dropout_rate"],
        return_sequences=False
    )
    temporal_vec = temporal_encoder(numeric_in)

    merged = Concatenate()([temporal_vec, branch_vec])
    hidden = Dense(best_params["dense_units"], activation="relu")(merged)
    prediction = Dense(1)(hidden)

    tcn_model = Model(inputs=[numeric_in, branch_in], outputs=prediction)
    tcn_model.compile(optimizer="adam", loss="mse", metrics=["mae"])
    return tcn_model

# Build final model
final_tcn_model = build_best_tcn_model(
    input_shape=(SEQ_LEN, len(top_features)),
    n_branches=n_branches,
    best_params=best_params
)

# Early stopping restores the best val_loss weights after 5 stagnant epochs;
# the learning rate is halved after 3 stagnant epochs, with a floor of 1e-6.
early_stop = EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6)

# Train the final model
# Trains on the 80% training slice from the "Common sequence" cell, with a further 10% held
# out through validation_split.
# NOTE(review): unlike the other fit calls in this notebook, shuffle=False is not passed here.
final_tcn_model.fit(
    [X_num_train, X_branch_train],
    y_train_new,
    epochs=20,
    batch_size=64,
    validation_split=0.1,
    callbacks=[early_stop, reduce_lr],
    verbose=2
)


# Evaluate on Test Set
# Uses the hold-out sequences (X_num_seq_test / y_seq_test), not the X_num_test validation slice.
y_pred = final_tcn_model.predict([X_num_seq_test, X_branch_seq_test]).flatten()
mae = mean_absolute_error(y_seq_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_seq_test, y_pred))

print(f"Test MAE: {mae:,.2f}")
print(f"Test RMSE: {rmse:,.2f}")

### Temporal Convolutional Network (TCN) based model

In [None]:
# Load Best Hyperparameters
# Re-read from disk so the global `best_params` holds the tuned TCN values.
with open("best_tcn_hyperparams.json", "r") as f:
    best_params = json.load(f)

# Echo each hyperparameter as "key: value".
print("Loaded best hyperparameters:")
for k, v in best_params.items():
    print(f"{k}: {v}")

# Build TCN Model
def build_tcn_model(best_params, input_shape_num, input_shape_branch, vocab_size):
    """Build and compile a TCN that sees the branch embedding at every step.

    The branch embedding is repeated along the step axis and concatenated
    with the numeric window before the TCN layer. The TCN is created without
    a ``dilations`` argument, so the library default applies.

    Parameters
    ----------
    best_params : dict
        Must contain ``embed_dim``, ``nb_filters``, ``kernel_size``,
        ``dropout_rate`` and ``dense_units``.
    input_shape_num : tuple
        Shape of one numeric sample; its first entry is the repeat count for
        the branch embedding.
    input_shape_branch : tuple
        Shape of one branch sample; only its first entry is used.
    vocab_size : int
        ``input_dim`` of the branch embedding table.

    Returns
    -------
    Model
        Model over ``[numeric_input, branch_input]`` with one linear output,
        compiled with Adam (learning rate 0.001), MSE loss and an MAE metric.
    """
    numeric_in = Input(shape=input_shape_num, name="numeric_input")
    branch_in = Input(shape=(input_shape_branch[0],), name="branch_input")

    # Embed the branch id, drop the length-1 axis, then tile across all steps.
    branch_vec = Embedding(input_dim=vocab_size, output_dim=best_params['embed_dim'])(branch_in)
    branch_vec = branch_vec[:, 0, :]
    branch_steps = RepeatVector(input_shape_num[0])(branch_vec)

    stacked = Concatenate()([numeric_in, branch_steps])

    n_filters = best_params['nb_filters']
    k_size = best_params['kernel_size']
    drop = best_params['dropout_rate']
    encoded = TCN(
        nb_filters=n_filters,
        kernel_size=k_size,
        dropout_rate=drop,
        return_sequences=False,
    )(stacked)

    # The same dropout rate is reused for the dense head.
    hidden = Dense(best_params['dense_units'], activation='relu')(encoded)
    hidden = Dropout(drop)(hidden)

    prediction = Dense(1)(hidden)

    tcn_model = Model(inputs=[numeric_in, branch_in], outputs=prediction)
    tcn_model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='mse',
        metrics=['mae'],
    )
    return tcn_model


# Build & Train Model
model_tcn_1 = build_tcn_model(best_params, input_shape_num, input_shape_branch, vocab_size)

# NOTE: X_num_test / X_branch_test / y_test here are the last 20% of the TRAINING sequences
# produced by the "Common sequence" cell; they serve as validation data, not as the hold-out set.
# EarlyStopping is created without `monitor`, so its default metric is used.
history = model_tcn_1.fit(
    [X_num_train, X_branch_train], y_train_new,
    validation_data=([X_num_test, X_branch_test], y_test),
    epochs=30,
    batch_size=64,
    callbacks=[EarlyStopping(patience=5, restore_best_weights=True)],
    shuffle=False,
    verbose=1
)

# Save the model
model_tcn_1.save("final_tcn_model.keras")
print(" TCN model saved as 'final_tcn_model.keras'")

# Reload it (round-trip check)
# The TCN layer class is supplied through custom_objects for deserialization.
model_tcn_1 = load_model("final_tcn_model.keras", custom_objects={"TCN": TCN})


# Evaluate on Test Set
# Scored on the hold-out sequences (X_num_seq_test / y_seq_test).
y_pred_tcn_1 = model_tcn_1.predict([X_num_seq_test, X_branch_seq_test]).flatten()

mae = mean_absolute_error(y_seq_test, y_pred_tcn_1)
rmse = np.sqrt(mean_squared_error(y_seq_test, y_pred_tcn_1))

print(f" Final TCN Test MAE: {mae:,.2f}")
print(f" Final TCN Test RMSE: {rmse:,.2f}")

### Deeper TCN (stacking multiple TCN layers)

In [None]:
# Load Best Hyperparameters
# Re-read from disk so the global `best_params` holds the tuned TCN values.
with open("best_tcn_hyperparams.json", "r") as f:
    best_params = json.load(f)

# Echo each hyperparameter as "key: value".
print("Loaded best hyperparameters:")
for k, v in best_params.items():
    print(f"{k}: {v}")

# TCN Block
def tcn_block(x, filters, kernel_size=3, dilation_rate=1):
    """Apply one causal dilated Conv1D followed by BatchNormalization and ReLU.

    Parameters
    ----------
    x : tensor
        Input to the convolution.
    filters : int
        Number of convolution filters.
    kernel_size : int, default 3
        Convolution kernel width.
    dilation_rate : int, default 1
        Dilation of the convolution.

    Returns
    -------
    tensor
        Output of the final ReLU.
    """
    convolved = Conv1D(
        filters,
        kernel_size,
        dilation_rate=dilation_rate,
        padding='causal',
    )(x)
    normalised = BatchNormalization()(convolved)
    return ReLU()(normalised)

# Build Deeper TCN Model
def build_deeper_tcn_model(best_params, input_shape_num, input_shape_branch, vocab_size):
    """Build and compile a four-block causal Conv1D stack with average pooling.

    The blocks use dilation rates 1, 2, 4 and 8. The kernel size is fixed at
    3 here; ``best_params['kernel_size']`` is not read by this builder.

    Parameters
    ----------
    best_params : dict
        Must contain ``embed_dim``, ``nb_filters``, ``dense_units`` and
        ``dropout_rate``.
    input_shape_num : tuple
        Shape of one numeric sample; its first entry is the repeat count for
        the branch embedding.
    input_shape_branch : tuple
        Shape of one branch sample; only its first entry is used.
    vocab_size : int
        ``input_dim`` of the branch embedding table.

    Returns
    -------
    Model
        Model over ``[numeric_input, branch_input]`` with one linear output,
        compiled with Adam (learning rate 0.001), MSE loss and an MAE metric.
    """
    numeric_in = Input(shape=input_shape_num, name="numeric_input")
    branch_in = Input(shape=(input_shape_branch[0],), name="branch_input")

    # Embed the branch id, drop the length-1 axis, then tile across all steps.
    branch_vec = Embedding(input_dim=vocab_size, output_dim=best_params['embed_dim'])(branch_in)
    branch_vec = branch_vec[:, 0, :]
    branch_steps = RepeatVector(input_shape_num[0])(branch_vec)

    features = Concatenate()([numeric_in, branch_steps])

    # Dilation doubles with every block: 1, 2, 4, 8.
    n_filters = best_params['nb_filters']
    for dilation in (1, 2, 4, 8):
        features = tcn_block(features, filters=n_filters, kernel_size=3, dilation_rate=dilation)

    # Average over the step axis to get one vector per sample.
    pooled = GlobalAveragePooling1D()(features)

    n_dense = best_params['dense_units']
    drop = best_params['dropout_rate']
    hidden = Dense(n_dense, activation='relu')(pooled)
    hidden = Dropout(drop)(hidden)

    prediction = Dense(1)(hidden)

    deeper_model = Model(inputs=[numeric_in, branch_in], outputs=prediction)
    deeper_model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='mse',
        metrics=['mae'],
    )
    return deeper_model


# Build and Train the Deeper TCN Model
model_tcn_4 = build_deeper_tcn_model(best_params, input_shape_num, input_shape_branch, vocab_size)

# NOTE: X_num_test / X_branch_test / y_test here are the last 20% of the TRAINING sequences
# produced by the "Common sequence" cell; they serve as validation data, not as the hold-out set.
# EarlyStopping is created without `monitor`, so its default metric is used.
history = model_tcn_4.fit(
    [X_num_train, X_branch_train], y_train_new,
    validation_data=([X_num_test, X_branch_test], y_test),
    epochs=30,
    batch_size=64,
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)],
    shuffle=False,
    verbose=1
)

# Save model
model_tcn_4.save("deeper_tcn_model.keras")
print(" Model saved as 'deeper_tcn_model.keras'")

# Reload model (round-trip check; loaded without custom_objects)
model_tcn_4 = tf.keras.models.load_model("deeper_tcn_model.keras")


# Evaluate on Test Set
# Scored on the hold-out sequences (X_num_seq_test / y_seq_test).
y_pred_tcn_4 = model_tcn_4.predict([X_num_seq_test, X_branch_seq_test]).flatten()

mae = mean_absolute_error(y_seq_test, y_pred_tcn_4)
rmse = np.sqrt(mean_squared_error(y_seq_test, y_pred_tcn_4))

print(f" Deeper TCN Test MAE: {mae:,.2f}")
print(f" Deeper TCN Test RMSE: {rmse:,.2f}")

### Residual TCN Blocks (skip connections inside TCN)

In [None]:
# Load Best Hyperparameters
# Re-read from disk so the global `best_params` holds the tuned TCN values.

with open("best_tcn_hyperparams.json", "r") as f:
    best_params = json.load(f)

# Echo each hyperparameter as "key: value".
print(" Loaded best hyperparameters:")
for k, v in best_params.items():
    print(f"{k}: {v}")

# Residual TCN Block

def residual_tcn_block(x, filters, kernel_size=3, dilation_rate=1):
    """Causal dilated Conv1D -> BatchNorm -> ReLU with a skip connection.

    The block input is added back to the convolution branch. When the channel
    counts differ, the input is first passed through a 1x1 Conv1D so both
    operands of the addition have ``filters`` channels. A final ReLU follows
    the addition.

    Parameters
    ----------
    x : tensor
        Block input.
    filters : int
        Number of convolution filters.
    kernel_size : int, default 3
        Width of the main convolution kernel.
    dilation_rate : int, default 1
        Dilation of the main convolution.

    Returns
    -------
    tensor
        Output of the final ReLU.
    """
    branch = Conv1D(filters, kernel_size, dilation_rate=dilation_rate, padding='causal')(x)
    branch = BatchNormalization()(branch)
    branch = ReLU()(branch)

    # Project the skip path only when its channel count differs from the branch.
    skip = x
    if skip.shape[-1] != branch.shape[-1]:
        skip = Conv1D(filters, kernel_size=1, padding='same')(skip)

    merged = Add()([branch, skip])
    return ReLU()(merged)

# Build Residual TCN Model

def build_residual_tcn_model(best_params, input_shape_num, input_shape_branch, vocab_size):
    """Build and compile a stack of four residual TCN blocks with average pooling.

    The blocks use dilation rates 1, 2, 4 and 8 and the tuned
    ``best_params['kernel_size']``.

    Parameters
    ----------
    best_params : dict
        Must contain ``embed_dim``, ``nb_filters``, ``kernel_size``,
        ``dense_units`` and ``dropout_rate``.
    input_shape_num : tuple
        Shape of one numeric sample; its first entry is the repeat count for
        the branch embedding.
    input_shape_branch : tuple
        Shape of one branch sample; only its first entry is used.
    vocab_size : int
        ``input_dim`` of the branch embedding table.

    Returns
    -------
    Model
        Model over ``[numeric_input, branch_input]`` with one linear output,
        compiled with Adam (learning rate 0.001), MSE loss and an MAE metric.
    """
    numeric_in = Input(shape=input_shape_num, name="numeric_input")
    branch_in = Input(shape=(input_shape_branch[0],), name="branch_input")

    # Embed the branch id, drop the length-1 axis, then tile across all steps.
    branch_vec = Embedding(input_dim=vocab_size, output_dim=best_params['embed_dim'])(branch_in)
    branch_vec = branch_vec[:, 0, :]
    branch_steps = RepeatVector(input_shape_num[0])(branch_vec)

    features = Concatenate()([numeric_in, branch_steps])

    # Dilation doubles with every residual block: 1, 2, 4, 8.
    n_filters = best_params['nb_filters']
    k_size = best_params['kernel_size']
    for dilation in (1, 2, 4, 8):
        features = residual_tcn_block(
            features, filters=n_filters, kernel_size=k_size, dilation_rate=dilation
        )

    # Average over the step axis to get one vector per sample.
    pooled = GlobalAveragePooling1D()(features)

    n_dense = best_params['dense_units']
    drop = best_params['dropout_rate']
    hidden = Dense(n_dense, activation='relu')(pooled)
    hidden = Dropout(drop)(hidden)

    prediction = Dense(1)(hidden)

    residual_model = Model(inputs=[numeric_in, branch_in], outputs=prediction)
    residual_model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='mse',
        metrics=['mae'],
    )
    return residual_model


# Train Model

model_tcn_5 = build_residual_tcn_model(best_params, input_shape_num, input_shape_branch, vocab_size)

# NOTE: X_num_test / X_branch_test / y_test here are the last 20% of the TRAINING sequences
# produced by the "Common sequence" cell; they serve as validation data, not as the hold-out set.
# EarlyStopping is created without `monitor`, so its default metric is used.
history = model_tcn_5.fit(
    [X_num_train, X_branch_train], y_train_new,
    validation_data=([X_num_test, X_branch_test], y_test),
    epochs=30,
    batch_size=64,
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)],
    shuffle=False,
    verbose=1
)

# Save model
model_tcn_5.save("residual_tcn_model.keras")
print(" Model saved as 'residual_tcn_model.keras'")

# Reload model (round-trip check; loaded without custom_objects)
model_tcn_5 = tf.keras.models.load_model("residual_tcn_model.keras")


# Evaluate on Test Set
# Scored on the hold-out sequences (X_num_seq_test / y_seq_test).
y_pred_tcn_5 = model_tcn_5.predict([X_num_seq_test, X_branch_seq_test]).flatten()

mae = mean_absolute_error(y_seq_test, y_pred_tcn_5)
rmse = np.sqrt(mean_squared_error(y_seq_test, y_pred_tcn_5))

print(f" Residual TCN Test MAE: {mae:,.2f}")
print(f" Residual TCN Test RMSE: {rmse:,.2f}")

### Residual TCN + MultiHeadAttention

In [None]:
# Load best hyperparameters
# Re-read from disk so the global `best_params` holds the tuned TCN values.
with open("best_tcn_hyperparams.json", "r") as f:
    best_params = json.load(f)

# Echo each hyperparameter as "key: value".
print("Loaded best hyperparameters:")
for k, v in best_params.items():
    print(f"{k}: {v}")


# Residual TCN Block
def residual_tcn_block(x, filters, kernel_size=3, dilation_rate=1):
    """Residual unit: causal dilated Conv1D, BatchNorm and ReLU plus a skip path.

    The block input is added to the convolution branch; if its channel count
    differs from ``filters`` it is first projected with a 1x1 Conv1D. A ReLU
    is applied after the addition.

    Parameters
    ----------
    x : tensor
        Block input.
    filters : int
        Number of convolution filters.
    kernel_size : int, default 3
        Width of the main convolution kernel.
    dilation_rate : int, default 1
        Dilation of the main convolution.

    Returns
    -------
    tensor
        Output of the final ReLU.
    """
    main_path = Conv1D(filters, kernel_size, padding='causal', dilation_rate=dilation_rate)(x)
    main_path = BatchNormalization()(main_path)
    main_path = ReLU()(main_path)

    # A 1x1 convolution aligns the channel count of the skip path when needed.
    channels_match = x.shape[-1] == main_path.shape[-1]
    skip_path = x if channels_match else Conv1D(filters, kernel_size=1, padding='same')(x)

    summed = Add()([main_path, skip_path])
    return ReLU()(summed)


# Build TCN + Attention Model
def build_tcn_attention_model(best_params, input_shape_num, input_shape_branch, vocab_size):
    """Build and compile three residual TCN blocks followed by self-attention.

    The blocks use dilation rates 1, 2 and 4 with a kernel size fixed at 3
    (``best_params['kernel_size']`` is not read here). A two-head
    ``MultiHeadAttention`` layer attends over the block output, is added back
    residually and layer-normalised, and the result is average-pooled.

    Parameters
    ----------
    best_params : dict
        Must contain ``embed_dim``, ``nb_filters``, ``dense_units`` and
        ``dropout_rate``.
    input_shape_num : tuple
        Shape of one numeric sample; its first entry is the repeat count for
        the branch embedding.
    input_shape_branch : tuple
        Shape of one branch sample; only its first entry is used.
    vocab_size : int
        ``input_dim`` of the branch embedding table.

    Returns
    -------
    Model
        Model over ``[numeric_input, branch_input]`` with one linear output,
        compiled with Adam (learning rate 0.001), MSE loss and an MAE metric.
    """
    numeric_in = Input(shape=input_shape_num, name="numeric_input")
    branch_in = Input(shape=(input_shape_branch[0],), name="branch_input")

    # Embed the branch id, drop the length-1 axis, then tile across all steps.
    branch_vec = Embedding(input_dim=vocab_size, output_dim=best_params['embed_dim'])(branch_in)
    branch_vec = branch_vec[:, 0, :]
    branch_steps = RepeatVector(input_shape_num[0])(branch_vec)

    features = Concatenate()([numeric_in, branch_steps])

    # Dilation doubles with every residual block: 1, 2, 4.
    n_filters = best_params['nb_filters']
    for dilation in (1, 2, 4):
        features = residual_tcn_block(features, filters=n_filters, kernel_size=3, dilation_rate=dilation)

    # Self-attention (query and value are the same tensor) with a residual add and LayerNorm.
    attended = MultiHeadAttention(num_heads=2, key_dim=n_filters)(features, features)
    features = Add()([features, attended])
    features = LayerNormalization()(features)

    # Regression head on the step-averaged representation.
    pooled = GlobalAveragePooling1D()(features)
    hidden = Dense(best_params['dense_units'], activation='relu')(pooled)
    hidden = Dropout(best_params['dropout_rate'])(hidden)
    prediction = Dense(1)(hidden)

    attention_model = Model(inputs=[numeric_in, branch_in], outputs=prediction)
    attention_model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='mse',
        metrics=['mae'],
    )
    return attention_model


# Build & Train Model
model_tcn_6 = build_tcn_attention_model(best_params, input_shape_num, input_shape_branch, vocab_size)

# NOTE: X_num_test / X_branch_test / y_test here are the last 20% of the TRAINING sequences
# produced by the "Common sequence" cell; they serve as validation data, not as the hold-out set.
# EarlyStopping is created without `monitor`, so its default metric is used.
history = model_tcn_6.fit(
    [X_num_train, X_branch_train], y_train_new,
    validation_data=([X_num_test, X_branch_test], y_test),
    epochs=30,
    batch_size=64,
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)],
    shuffle=False,
    verbose=1
)


# Save the model
model_tcn_6.save("tcn_attention_model.keras")
print("Model saved as 'tcn_attention_model.keras'")

# Reload model (round-trip check; loaded without custom_objects)
model_tcn_6 = tf.keras.models.load_model("tcn_attention_model.keras")


# Evaluate on Test Set
# Scored on the hold-out sequences (X_num_seq_test / y_seq_test).
y_pred_tcn_6 = model_tcn_6.predict([X_num_seq_test, X_branch_seq_test]).flatten()

mae = mean_absolute_error(y_seq_test, y_pred_tcn_6)
rmse = np.sqrt(mean_squared_error(y_seq_test, y_pred_tcn_6))

print(f" Test MAE: {mae:,.2f}")
print(f" Test RMSE: {rmse:,.2f}")

### TCN-Attention Model Using Dilated Convolutions (with the tuned TCN hyperparameters)

In [None]:
# Load best hyperparameters
# Re-read from disk so the global `best_params` holds the tuned TCN values.
with open("best_tcn_hyperparams.json", "r") as f:
    best_params = json.load(f)


# Residual Dilated TCN Block
def residual_dilated_tcn_block(x, filters, kernel_size=3, dilation_rate=1, dropout_rate=0.2):
    """Apply one residual block made of two causal, dilated Conv1D stages.

    Each stage is LayerNormalization -> ReLU -> Conv1D(padding='causal') -> Dropout.
    The block input is added back to the stage output; when their channel counts
    differ, the input is first projected with a 1x1 convolution.

    Args:
        x: Keras tensor entering the block.
        filters: Number of output channels of every convolution.
        kernel_size: Width of the two causal convolutions.
        dilation_rate: Dilation applied to both causal convolutions.
        dropout_rate: Dropout rate used after each convolution.

    Returns:
        Keras tensor holding the sum of the (possibly projected) input and the
        output of the second stage.
    """
    residual = x
    out = x

    # Two identical pre-activation stages, created in the same order as written out by hand.
    for _ in range(2):
        out = LayerNormalization()(out)
        out = Activation('relu')(out)
        out = Conv1D(filters, kernel_size, padding='causal', dilation_rate=dilation_rate)(out)
        out = Dropout(dropout_rate)(out)

    # Project the skip path only when the channel dimension changed.
    if residual.shape[-1] != out.shape[-1]:
        residual = Conv1D(filters, kernel_size=1, padding='same')(residual)

    return Add()([residual, out])


# Model Builder
def build_dilated_tcn_attention_model(best_params, input_shape_num, input_shape_branch, vocab_size):
    """Build and compile the dilated-TCN + self-attention regressor.

    Args:
        best_params: Dict providing 'embed_dim', 'nb_filters', 'dropout_rate'
            and 'dense_units'.
        input_shape_num: Shape of one numeric sample; its first entry is used as
            the number of time steps the branch embedding is repeated over.
        input_shape_branch: Shape of one branch-id sample; only its first entry is used.
        vocab_size: Size of the branch embedding table.

    Returns:
        A compiled Keras Model taking [numeric_input, branch_input] and producing
        a single regression output (Adam lr=0.001, MSE loss, MAE metric).
    """
    num_in = Input(shape=input_shape_num, name="numeric_input")
    branch_in = Input(shape=(input_shape_branch[0],), name="branch_input")

    # Embed the branch id, keep the first position and tile it across the time axis.
    emb = Embedding(input_dim=vocab_size, output_dim=best_params['embed_dim'])(branch_in)
    emb = emb[:, 0, :]
    emb = RepeatVector(input_shape_num[0])(emb)

    # Combine numeric features and branch embedding
    seq = Concatenate()([num_in, emb])

    # Stacked residual blocks with dilation 1, 2, 4.
    n_filters = best_params["nb_filters"]
    for dilation in (1, 2, 4):
        seq = residual_dilated_tcn_block(
            seq,
            filters=n_filters,
            kernel_size=3,
            dilation_rate=dilation,
            dropout_rate=best_params['dropout_rate'],
        )

    # Self-attention with a residual connection followed by layer normalisation.
    attended = MultiHeadAttention(num_heads=2, key_dim=n_filters)(seq, seq)
    seq = Add()([seq, attended])
    seq = LayerNormalization()(seq)

    # Regression head: average over time, one hidden dense layer, scalar output.
    head = GlobalAveragePooling1D()(seq)
    head = Dense(best_params['dense_units'], activation='relu')(head)
    head = Dropout(best_params['dropout_rate'])(head)
    prediction = Dense(1)(head)

    net = Model(inputs=[num_in, branch_in], outputs=prediction)
    net.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='mse',
        metrics=['mae'],
    )
    return net


# Build and Train model
model_tcn_2 = build_dilated_tcn_attention_model(best_params, input_shape_num, input_shape_branch, vocab_size)
# NOTE(review): the arrays named *_test / y_test passed as validation_data are presumably
# the validation split of the training sequences — confirm.
# EarlyStopping uses its default monitor (val_loss) and restores the best epoch's weights.
history = model_tcn_2.fit(
    [X_num_train, X_branch_train], y_train_new,
    validation_data=([X_num_test, X_branch_test], y_test),
    epochs=20,
    batch_size=64,
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)],
    shuffle=False,
    verbose=1
)

# Save model
model_tcn_2.save("dilated_tcn_attention_model.keras")


# Reload model
# The evaluation below therefore uses the copy read back from disk.
model_tcn_2 = tf.keras.models.load_model("dilated_tcn_attention_model.keras")


# Evaluate on Test Set
# Predictions are flattened to 1-D before scoring against y_seq_test.
y_pred_tcn_2 = model_tcn_2.predict([X_num_seq_test, X_branch_seq_test]).flatten()

mae = mean_absolute_error(y_seq_test, y_pred_tcn_2)
rmse = np.sqrt(mean_squared_error(y_seq_test, y_pred_tcn_2))


print(f"Test MAE: {mae:,.2f}")
print(f"Test RMSE: {rmse:,.2f}")

### TCN-Attention Model with Positional Encoding and Residual Dilated Blocks

In [None]:
# Positional Encoding Layer
class PositionalEncoding(tf.keras.layers.Layer):
    """Adds a fixed sinusoidal position signal to its input.

    The encoding is recomputed on every call from the runtime sizes of axis 1
    (used as the sequence length) and axis 2 (used as the model width), so the
    layer has no trainable weights.
    """

    def call(self, inputs):
        """Return `inputs` plus the sinusoidal encoding broadcast over axis 0."""
        dims = tf.shape(inputs)
        seq_len = dims[1]
        d_model = dims[2]

        # Column vector of step indices and row vector of channel indices, as floats.
        steps = tf.cast(tf.range(seq_len)[:, tf.newaxis], tf.float32)
        channels = tf.cast(tf.range(d_model)[tf.newaxis, :], tf.float32)

        # Per-channel rate 1 / 10000^(2*floor(c/2)/d_model); paired channels share a rate.
        exponent = (2 * (channels // 2)) / tf.cast(d_model, tf.float32)
        rates = 1 / tf.pow(10000., exponent)
        angles = steps * rates

        # Sine of the even-indexed columns followed by cosine of the odd-indexed
        # columns (concatenated, not interleaved).
        encoding = tf.concat(
            [tf.sin(angles[:, 0::2]), tf.cos(angles[:, 1::2])],
            axis=-1,
        )

        # Add a leading axis so the encoding broadcasts across the batch.
        encoding = encoding[tf.newaxis, ...]
        return inputs + encoding[:, :seq_len, :]


# Residual Dilated TCN Block
def residual_dilated_tcn_block(x, filters, kernel_size=3, dilation_rate=1, dropout_rate=0.2):
    """Apply one residual block made of two causal, dilated Conv1D stages.

    Each stage is LayerNormalization -> ReLU -> Conv1D(padding='causal') -> Dropout.
    The block input is added back to the stage output; when their channel counts
    differ, the input is first projected with a 1x1 convolution.

    Args:
        x: Keras tensor entering the block.
        filters: Number of output channels of every convolution.
        kernel_size: Width of the two causal convolutions.
        dilation_rate: Dilation applied to both causal convolutions.
        dropout_rate: Dropout rate used after each convolution.

    Returns:
        Keras tensor holding the sum of the (possibly projected) input and the
        output of the second stage.
    """
    residual = x
    out = x

    # Two identical pre-activation stages, created in the same order as written out by hand.
    for _ in range(2):
        out = LayerNormalization()(out)
        out = Activation('relu')(out)
        out = Conv1D(filters, kernel_size, padding='causal', dilation_rate=dilation_rate)(out)
        out = Dropout(dropout_rate)(out)

    # Project the skip path only when the channel dimension changed.
    if residual.shape[-1] != out.shape[-1]:
        residual = Conv1D(filters, kernel_size=1, padding='same')(residual)

    return Add()([residual, out])


# Model Builder
def build_improved_tcn_attention_model(best_params, input_shape_num, input_shape_branch, vocab_size):
    """Build and compile the TCN + attention regressor with positional encoding.

    Args:
        best_params: Dict providing 'embed_dim', 'nb_filters', 'dropout_rate' and
            'dense_units'; 'num_heads' (default 2) and 'dense_units_2' (adds a
            second hidden dense layer when present) are optional.
        input_shape_num: Shape of one numeric sample; its first entry is used as
            the number of time steps the branch embedding is repeated over.
        input_shape_branch: Shape of one branch-id sample; only its first entry is used.
        vocab_size: Size of the branch embedding table.

    Returns:
        A compiled Keras Model taking [numeric_input, branch_input] and producing
        a single regression output (Adam lr=0.001, MSE loss, MAE metric).
    """
    num_in = Input(shape=input_shape_num, name="numeric_input")
    branch_in = Input(shape=(input_shape_branch[0],), name="branch_input")

    # Embed the branch id, keep the first position and tile it across the time axis.
    emb = Embedding(input_dim=vocab_size, output_dim=best_params['embed_dim'])(branch_in)
    emb = emb[:, 0, :]
    emb = RepeatVector(input_shape_num[0])(emb)

    # Merge both inputs, then inject the position signal.
    seq = Concatenate()([num_in, emb])
    seq = PositionalEncoding()(seq)

    # Residual blocks with dilation 1, 2, 4.
    n_filters = best_params['nb_filters']
    for dilation in (1, 2, 4):
        seq = residual_dilated_tcn_block(
            seq,
            n_filters,
            kernel_size=3,
            dilation_rate=dilation,
            dropout_rate=best_params['dropout_rate'],
        )

    # Self-attention with residual connection, normalisation and dropout.
    attended = MultiHeadAttention(num_heads=best_params.get('num_heads', 2), key_dim=n_filters)(seq, seq)
    seq = Add()([seq, attended])
    seq = LayerNormalization()(seq)
    seq = Dropout(best_params['dropout_rate'])(seq)

    # Average over time, then the first (mandatory) hidden layer.
    head = GlobalAveragePooling1D()(seq)
    head = Dense(best_params['dense_units'], activation='relu')(head)
    head = Dropout(best_params['dropout_rate'])(head)

    # Second hidden layer only when its width is configured.
    second_width = best_params.get('dense_units_2') if 'dense_units_2' in best_params else None
    if 'dense_units_2' in best_params:
        head = Dense(second_width, activation='relu')(head)
        head = Dropout(best_params['dropout_rate'])(head)

    prediction = Dense(1)(head)

    net = Model(inputs=[num_in, branch_in], outputs=prediction)
    net.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='mse',
        metrics=['mae'],
    )
    return net


# Load Best Hyperparameters
with open("best_tcn_hyperparams.json", "r") as f:
    best_params = json.load(f)
# Manual overrides added on top of the loaded values: four attention heads and a
# second 64-unit dense layer (the builder only adds that layer when the key exists).
best_params['num_heads'] = 4
best_params['dense_units_2'] = 64


# Train Model
model_tcn_3 = build_improved_tcn_attention_model(best_params, input_shape_num, input_shape_branch, vocab_size)

# NOTE(review): the arrays named *_test / y_test passed as validation_data are presumably
# the validation split of the training sequences — confirm.
# EarlyStopping uses its default monitor (val_loss) and restores the best epoch's weights.
history = model_tcn_3.fit(
    [X_num_train, X_branch_train], y_train_new,
    validation_data=([X_num_test, X_branch_test], y_test),
    epochs=30,
    batch_size=64,
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)],
    shuffle=False,
    verbose=1
)


# Evaluate on Test Set
# Predictions are flattened to 1-D before scoring against y_seq_test.
y_pred_tcn_3 = model_tcn_3.predict([X_num_seq_test, X_branch_seq_test]).flatten()

mae = mean_absolute_error(y_seq_test, y_pred_tcn_3)
rmse = np.sqrt(mean_squared_error(y_seq_test, y_pred_tcn_3))

print(f"Test MAE: {mae:,.2f}")
print(f"Test RMSE: {rmse:,.2f}")

### Optimised TCN

In [None]:
## Creating Sequence

# Initial Setup
# Seed every RNG the notebook relies on (NumPy, TensorFlow, Python's random, hash seed).
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)
random.seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)
SEQ_LEN = 60  # look-back window length passed to create_sequences below

# Feature Selection
# Date and branch id are excluded from the numeric features; BRANCHID is fed to the
# models separately as the branch input.
exclude_cols = ["TXNDATE", "BRANCHID"]
numeric_cols = [col for col in X_train_full.columns if col not in exclude_cols]

TOP_N_FEATURES = 50
# Only int64 / float64 / bool columns are kept as candidates.
X_temp = X_train_full[numeric_cols].select_dtypes(include=['int64', 'float64', 'bool']).copy()
y_temp = y_train

# A small XGBoost regressor fitted on the training data is used purely to rank features.
xgb_model = xgb.XGBRegressor(n_estimators=100, max_depth=3, random_state=SEED)
xgb_model.fit(X_temp, y_temp)

importances = xgb_model.feature_importances_
feat_imp = pd.Series(importances, index=X_temp.columns).sort_values(ascending=False)
top_features = feat_imp.head(TOP_N_FEATURES).index.tolist()

print(f"Selected Top {TOP_N_FEATURES} Features:\n", top_features)

# Scaling
# The scaler is fitted on the training rows only and then applied to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_full[top_features])
X_test_scaled = scaler.transform(X_test_full[top_features])

# Wrap the scaled arrays back into DataFrames (fresh 0..n-1 index, original column names).
X_train_scaled = pd.DataFrame(X_train_scaled, columns=top_features)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=top_features)

# Sequence Creation
def create_sequences(X_df, y_series, branch_series, window):
    """Build sliding-window samples for the sequence models.

    For every row position ``idx`` in ``[window, len(X_df))`` one sample is
    produced: the ``window`` feature rows immediately preceding ``idx`` form the
    input, while the branch id and the target are taken at ``idx`` itself.

    Parameters
    ----------
    X_df : pandas.DataFrame
        Feature rows; values must be castable to float32.
    y_series : pandas.Series
        Targets, positionally aligned with ``X_df``.
    branch_series : pandas.Series
        Branch ids (castable to int32), positionally aligned with ``X_df``.
    window : int
        Number of look-back rows per sample; must be non-negative.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, branch, y)`` with shapes ``(n, window, n_features)`` float32,
        ``(n, 1)`` int32 and ``(n,)`` float32, where
        ``n = max(len(X_df) - window, 0)``.

    Raises
    ------
    ValueError
        If ``window`` is negative.
    IndexError
        If ``y_series`` or ``branch_series`` has fewer rows than ``X_df`` while
        at least one sample has to be produced.
    """
    if window < 0:
        raise ValueError(f"window must be non-negative, got {window}")

    n_rows = len(X_df)
    n_samples = max(n_rows - window, 0)

    # Convert once up front: slicing a NumPy array per window is much cheaper than
    # going through DataFrame.iloc on every iteration.
    features = X_df.to_numpy(dtype=np.float32)

    if n_samples == 0:
        # Keep the (samples, window, features) rank so callers can still read
        # `.shape[1:]` when the input is not longer than the window.
        return (
            np.empty((0, window) + features.shape[1:], dtype=np.float32),
            np.empty((0, 1), dtype=np.int32),
            np.empty((0,), dtype=np.float32),
        )

    if len(y_series) < n_rows or len(branch_series) < n_rows:
        raise IndexError("y_series and branch_series must have at least len(X_df) rows")

    X_numeric_seq = np.stack([features[idx - window:idx] for idx in range(window, n_rows)])
    # Branch id and target belong to the row right after each window.
    X_branch_seq = branch_series.to_numpy()[window:n_rows].astype(np.int32).reshape(-1, 1)
    y_seq = y_series.to_numpy()[window:n_rows].astype(np.float32)
    return X_numeric_seq, X_branch_seq, y_seq

# Branch ids are treated as 0-based integer codes, so max + 1 is used as the count.
n_branches = int(X_train_full["BRANCHID"].max()) + 1

# Training sequences; every index is reset so the three inputs align positionally.
X_num_seq, X_branch_seq, y_seq = create_sequences(
    X_train_scaled.reset_index(drop=True),
    y_train.reset_index(drop=True),
    X_train_full["BRANCHID"].reset_index(drop=True),
    window=SEQ_LEN
)

# Test sequences.
# NOTE(review): `y_test` is rebound by the train_test_split call below, so re-running this
# cell (or any later copy of it) wraps that validation array here instead of the original
# test targets — presumably why pd.Series(...) replaced .reset_index; confirm and
# consider a separate name for the validation target.
X_num_seq_test, X_branch_seq_test, y_seq_test = create_sequences(
    X_test_scaled.reset_index(drop=True),
    #y_test.reset_index(drop=True),
    pd.Series(y_test).reset_index(drop=True),
    X_test_full["BRANCHID"].reset_index(drop=True),
    window=SEQ_LEN
)

# Chronological 80/20 split of the *training* sequences (shuffle=False). Despite their
# names, X_num_test / X_branch_test / y_test hold the validation part of that split.
X_num_train, X_num_test, X_branch_train, X_branch_test, y_train_new, y_test = train_test_split(
    X_num_seq, X_branch_seq, y_seq, test_size=0.2, shuffle=False
)

# Per-sample input shapes (batch axis dropped) and embedding vocabulary size.
input_shape_num = X_num_seq.shape[1:]
input_shape_branch = X_branch_seq.shape[1:]
vocab_size = int(np.max(X_branch_seq)) + 1

In [None]:
# Load best hyperparameters
# Reads the tuned TCN hyperparameters from a JSON file in the working directory into the
# global `best_params` dict that is passed to the model builder in this cell.
with open("best_tcn_hyperparams.json", "r") as f:
    best_params = json.load(f)


# Residual Dilated TCN Block
def residual_dilated_tcn_block(x, filters, kernel_size=3, dilation_rate=1, dropout_rate=0.2):
    """Apply one residual block made of two causal, dilated Conv1D stages.

    Each stage is LayerNormalization -> ReLU -> Conv1D(padding='causal') -> Dropout.
    The block input is added back to the stage output; when their channel counts
    differ, the input is first projected with a 1x1 convolution.

    Args:
        x: Keras tensor entering the block.
        filters: Number of output channels of every convolution.
        kernel_size: Width of the two causal convolutions.
        dilation_rate: Dilation applied to both causal convolutions.
        dropout_rate: Dropout rate used after each convolution.

    Returns:
        Keras tensor holding the sum of the (possibly projected) input and the
        output of the second stage.
    """
    residual = x
    out = x

    # Two identical pre-activation stages, created in the same order as written out by hand.
    for _ in range(2):
        out = LayerNormalization()(out)
        out = Activation('relu')(out)
        out = Conv1D(filters, kernel_size, padding='causal', dilation_rate=dilation_rate)(out)
        out = Dropout(dropout_rate)(out)

    # Match shortcut dimensions
    if residual.shape[-1] != out.shape[-1]:
        residual = Conv1D(filters, kernel_size=1, padding='same')(residual)

    return Add()([residual, out])


# Model Builder
def build_dilated_tcn_attention_model(best_params, input_shape_num, input_shape_branch, vocab_size):
    """Build and compile the dilated-TCN + self-attention regressor.

    Args:
        best_params: Dict providing 'embed_dim', 'nb_filters', 'dropout_rate'
            and 'dense_units'.
        input_shape_num: Shape of one numeric sample; its first entry is used as
            the number of time steps the branch embedding is repeated over.
        input_shape_branch: Shape of one branch-id sample; only its first entry is used.
        vocab_size: Size of the branch embedding table.

    Returns:
        A compiled Keras Model taking [numeric_input, branch_input] and producing
        a single regression output (Adam lr=0.001, MSE loss, MAE metric).
    """
    num_in = Input(shape=input_shape_num, name="numeric_input")
    branch_in = Input(shape=(input_shape_branch[0],), name="branch_input")

    # Embed the branch id, keep the first position and tile it across the time axis.
    emb = Embedding(input_dim=vocab_size, output_dim=best_params['embed_dim'])(branch_in)
    emb = emb[:, 0, :]
    emb = RepeatVector(input_shape_num[0])(emb)

    # Combine numeric features and branch embedding
    seq = Concatenate()([num_in, emb])

    # Stacked residual blocks with dilation 1, 2, 4.
    n_filters = best_params["nb_filters"]
    for dilation in (1, 2, 4):
        seq = residual_dilated_tcn_block(
            seq,
            filters=n_filters,
            kernel_size=3,
            dilation_rate=dilation,
            dropout_rate=best_params['dropout_rate'],
        )

    # Self-attention with a residual connection followed by layer normalisation.
    attended = MultiHeadAttention(num_heads=2, key_dim=n_filters)(seq, seq)
    seq = Add()([seq, attended])
    seq = LayerNormalization()(seq)

    # Regression head: average over time, one hidden dense layer, scalar output.
    head = GlobalAveragePooling1D()(seq)
    head = Dense(best_params['dense_units'], activation='relu')(head)
    head = Dropout(best_params['dropout_rate'])(head)
    prediction = Dense(1)(head)

    net = Model(inputs=[num_in, branch_in], outputs=prediction)
    net.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='mse',
        metrics=['mae'],
    )
    return net


# Build and Train model
tcn_model = build_dilated_tcn_attention_model(best_params, input_shape_num, input_shape_branch, vocab_size)
# validation_data is the chronological 20% tail of the training sequences (the arrays
# named *_test / y_test come from the train_test_split in the sequence cell).
# EarlyStopping uses its default monitor (val_loss) and restores the best epoch's weights.
history = tcn_model.fit(
    [X_num_train, X_branch_train], y_train_new,
    validation_data=([X_num_test, X_branch_test], y_test),
    epochs=30,
    batch_size=64,
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)],
    shuffle=False,
    verbose=1
)

# Evaluate on Test Set
# Predictions are flattened to 1-D before scoring against y_seq_test.
y_pred_tcn = tcn_model.predict([X_num_seq_test, X_branch_seq_test]).flatten()


mae = mean_absolute_error(y_seq_test, y_pred_tcn)
rmse = np.sqrt(mean_squared_error(y_seq_test, y_pred_tcn))


print(f"Test MAE: {mae:,.2f}")
print(f"Test RMSE: {rmse:,.2f}")

### Save the best model

In [None]:
# Save the model in the .keras format
# Written to the current working directory; an existing file of the same name is replaced.
tcn_model.save("tcn_best_model.keras")
print(" Model saved as 'tcn_best_model.keras'")


In [None]:
## Load the saved model

# Load the model
# Rebinds `tcn_model` to the copy read back from disk.
tcn_model = tf.keras.models.load_model("tcn_best_model.keras")

print(" Model loaded successfully")

# Example: Predict on test data
# Predictions are flattened to 1-D before scoring against y_seq_test.
y_pred_tcn = tcn_model.predict([X_num_seq_test, X_branch_seq_test]).flatten()

mae = mean_absolute_error(y_seq_test, y_pred_tcn)
rmse = np.sqrt(mean_squared_error(y_seq_test, y_pred_tcn))


print(f"Test MAE: {mae:,.2f}")
print(f"Test RMSE: {rmse:,.2f}")

## Model Blending

In [None]:
# Ensure both are using same test inputs
# Both base models score the identical test sequences; outputs are flattened to 1-D so
# they can be combined element-wise. `lstm_model` must already exist in the kernel.
y_pred_lstm = lstm_model.predict([X_num_seq_test, X_branch_seq_test]).flatten()
y_pred_tcn = tcn_model.predict([X_num_seq_test, X_branch_seq_test]).flatten()


In [None]:
## Blend Predictions - Simple average

# Simple average
# Equal-weight mean of the two base models' test predictions.
y_pred_blend = (y_pred_lstm + y_pred_tcn) / 2

mae_blend = mean_absolute_error(y_seq_test, y_pred_blend)
rmse_blend = np.sqrt(mean_squared_error(y_seq_test, y_pred_blend))

print(f" Blended Test MAE: {mae_blend:,.2f}")
print(f" Blended Test RMSE: {rmse_blend:,.2f}")

### Search Best Blending Weights

In [None]:
# Scan LSTM weights 0.00, 0.05, ..., 1.00 (the TCN gets 1 - w) and keep the weight with
# the lowest MAE; ties keep the first (smallest) weight because the comparison is strict.
# NOTE(review): the weight is selected on y_seq_test, i.e. the same data later used to
# report the blend's performance, so that score is optimistically biased — consider
# choosing the weight on a validation split instead.
best_mae = float("inf")
best_weight = 0.0

for w in np.arange(0, 1.05, 0.05):
    y_blend = w * y_pred_lstm + (1 - w) * y_pred_tcn
    mae = mean_absolute_error(y_seq_test, y_blend)
    if mae < best_mae:
        best_mae = mae
        best_weight = w

print(f" Best Blending Weight (LSTM): {best_weight:.2f}")
print(f" Best Blended MAE: {best_mae:,.2f}")


### Manual Blending Evaluation

In [None]:
# Store predictions if not already
# Both predictions are recomputed unconditionally here (no "already computed" check).
y_pred_lstm = lstm_model.predict([X_num_seq_test, X_branch_seq_test]).flatten()
y_pred_tcn = tcn_model.predict([X_num_seq_test, X_branch_seq_test]).flatten()

# List of weights to try (LSTM weight, TCN weight is 1 - w)
manual_weights = [0.3, 0.4, 0.5, 0.55, 0.6, 0.7]

# Report MAE and RMSE on the test targets for each hand-picked weight.
print("Manual Blending Results:\n")
for w in manual_weights:
    y_blend = w * y_pred_lstm + (1 - w) * y_pred_tcn
    mae = mean_absolute_error(y_seq_test, y_blend)
    rmse = np.sqrt(mean_squared_error(y_seq_test, y_blend))
    print(f"LSTM Weight: {w:.2f} | TCN Weight: {1 - w:.2f} -> MAE: {mae:,.2f}, RMSE: {rmse:,.2f}")

### Optimal Blending model

### Saving the optimal blending model

In [None]:
# Package both models and best weight
# `best_weight` is the LSTM weight found by the weight search; the TCN weight is 1 - best_weight.
blend_package = {
    "lstm_model": lstm_model,
    "tcn_model": tcn_model,
    "best_weight": best_weight
}

# Save as pickle
# NOTE(review): this pickles live Keras model objects; whether that round-trips depends on
# the installed Keras/TensorFlow version — saving each model with model.save(...) and
# pickling only the weight would be more portable. Confirm.
with open("dl_best_blend_model.pkl", "wb") as f:
    pickle.dump(blend_package, f)

print(" Blended model saved as 'dl_best_blend_model.pkl'")


In [None]:
# Load blended model
# pickle.load executes code embedded in the file; only load packages produced by this notebook.
with open("dl_best_blend_model.pkl", "rb") as f:
    blend_package = pickle.load(f)

lstm_model_loaded = blend_package["lstm_model"]
tcn_model_loaded = blend_package["tcn_model"]
best_weight_loaded = blend_package["best_weight"]

print(f" Loaded best weight: {best_weight_loaded:.2f}")

# Predict using loaded models
y_pred_lstm_loaded = lstm_model_loaded.predict([X_num_seq_test, X_branch_seq_test]).flatten()
y_pred_tcn_loaded = tcn_model_loaded.predict([X_num_seq_test, X_branch_seq_test]).flatten()

# Blend predictions
# The stored weight applies to the LSTM; the TCN receives the complement.
y_pred_blend = best_weight_loaded * y_pred_lstm_loaded + (1 - best_weight_loaded) * y_pred_tcn_loaded

# Assuming y_seq_test is your true test targets

mae_blend = mean_absolute_error(y_seq_test, y_pred_blend)
rmse_blend = np.sqrt(mean_squared_error(y_seq_test, y_pred_blend))
r2_blend = r2_score(y_seq_test, y_pred_blend)

print(f"Blended Model Test MAE: {mae_blend:,.2f}")
print(f"Blended Model Test RMSE: {rmse_blend:,.2f}")
print(f"Blended Model Test R²: {r2_blend:.4f}")

## Model Stacking

### Get validation predictions from base models

In [None]:
# Get LSTM and TCN predictions on validation data
# NOTE(review): X_num_val, X_branch_val and y_val are not defined in the nearby cells;
# they are presumably left over from an earlier section — confirm they exist on a
# fresh Restart & Run All and match the sequence shape these models expect.

val_preds_lstm = lstm_model.predict([X_num_val, X_branch_val]).flatten()
val_preds_tcn  = tcn_model.predict([X_num_val, X_branch_val]).flatten()

# Stack base model predictions as features
# Result has one row per validation sample and two columns (LSTM, TCN).
X_meta_train = np.vstack((val_preds_lstm, val_preds_tcn)).T
y_meta_train = y_val


### Get test predictions for final stacking

In [None]:
# Get base model predictions on test set
test_preds_lstm = lstm_model.predict([X_num_seq_test, X_branch_seq_test]).flatten()
test_preds_tcn  = tcn_model.predict([X_num_seq_test, X_branch_seq_test]).flatten()

# Stack test predictions
# Same column order as the training meta features: (LSTM, TCN).
X_meta_test = np.vstack((test_preds_lstm, test_preds_tcn)).T


### Train meta-learner (Ridge)

In [None]:
# Meta-learner
# Ridge regression fitted on the base models' predictions (X_meta_train) against y_meta_train.
meta_model = Ridge(alpha=1.0)
meta_model.fit(X_meta_train, y_meta_train)

# Final prediction
y_pred_stack = meta_model.predict(X_meta_test)

# Evaluation
mae_stack = mean_absolute_error(y_seq_test, y_pred_stack)
rmse_stack = np.sqrt(mean_squared_error(y_seq_test, y_pred_stack))

print(f" Stacked Model Test MAE: {mae_stack:,.2f}")
print(f" Stacked Model Test RMSE: {rmse_stack:,.2f}")


### Try other meta-learners - GradientBoostingRegressor

In [None]:
# Meta-learner
# Gradient-boosted trees fitted on the base models' predictions.
# random_state is pinned so repeated runs of this cell give the same model; without it the
# estimator draws from NumPy's global RNG, whose state depends on every cell run before.
meta_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
meta_model.fit(X_meta_train, y_meta_train)

# Final prediction
y_pred_stack = meta_model.predict(X_meta_test)

# Evaluation
mae_stack = mean_absolute_error(y_seq_test, y_pred_stack)
rmse_stack = np.sqrt(mean_squared_error(y_seq_test, y_pred_stack))

print(f" Stacked Model Test MAE: {mae_stack:,.2f}")
print(f" Stacked Model Test RMSE: {rmse_stack:,.2f}")

### Try other meta-learners - XGBRegressor

In [None]:
# Meta-learner
# XGBoost regressor fitted on the base models' predictions; no random_state is set here,
# unlike the later XGBoost meta-model cells.
meta_model = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)
meta_model.fit(X_meta_train, y_meta_train)

# Final prediction
y_pred_stack = meta_model.predict(X_meta_test)

# Evaluation
mae_stack = mean_absolute_error(y_seq_test, y_pred_stack)
rmse_stack = np.sqrt(mean_squared_error(y_seq_test, y_pred_stack))

print(f" Stacked Model Test MAE: {mae_stack:,.2f}")
print(f" Stacked Model Test RMSE: {rmse_stack:,.2f}")

### Improving Ridge stacker

#### Generate OOF predictions and train Ridge stacker

In [None]:
# Derive the model input shape and branch vocabulary from the training sequences.
# Note that SEQ_LEN and n_branches are rebound here from the arrays themselves.
n_splits = 5
SEQ_LEN = X_num_train.shape[1]
input_shape = (SEQ_LEN, X_num_train.shape[2])
n_branches = int(X_branch_train.max()) + 1

# Containers
# NOTE(review): both arrays start as zeros and are only written at validation indices.
# TimeSeriesSplit never places the earliest block of samples in a validation fold, so
# those rows stay 0.0 and are later used as meta-features against real targets —
# consider masking them out before fitting the meta-model.
oof_preds_lstm = np.zeros(len(y_train_new))
oof_preds_tcn = np.zeros(len(y_train_new))

tscv = TimeSeriesSplit(n_splits=n_splits)

print("Generating OOF predictions using TimeSeriesSplit...\n")

for fold, (train_idx, val_idx) in enumerate(tscv.split(X_num_train)):
    print(f"Fold {fold+1}/{n_splits}")

    # Split data
    X_tr_num, X_val_num = X_num_train[train_idx], X_num_train[val_idx]
    X_tr_branch, X_val_branch = X_branch_train[train_idx], X_branch_train[val_idx]
    y_tr, y_val_part = y_train_new[train_idx], y_train_new[val_idx]

    # Build and fit LSTM
    # NOTE(review): the same `best_params` dict is passed to both the LSTM and the TCN
    # builder; confirm it holds the keys each builder needs at this point.
    # A fresh model is built per fold; the global `lstm_model` is rebound each time.
    lstm_model = build_bilstm_attention_with_normalization(input_shape, n_branches, best_params)
    lstm_model.fit([X_tr_num, X_tr_branch], y_tr,
                   validation_data=([X_val_num, X_val_branch], y_val_part),
                   epochs=10, batch_size=best_params.get("batch_size", 64),
                   callbacks=[EarlyStopping(patience=3, restore_best_weights=True)],
                   verbose=0, shuffle=False)

    oof_preds_lstm[val_idx] = lstm_model.predict([X_val_num, X_val_branch]).flatten()

    # Build and fit TCN
    # The branch input shape is given literally as (1,); the global `tcn_model` is rebound.
    tcn_model = build_dilated_tcn_attention_model(best_params, input_shape, (1,), n_branches)
    tcn_model.fit([X_tr_num, X_tr_branch], y_tr,
                  validation_data=([X_val_num, X_val_branch], y_val_part),
                  epochs=10, batch_size=64,
                  callbacks=[EarlyStopping(patience=3, restore_best_weights=True)],
                  verbose=0, shuffle=False)

    oof_preds_tcn[val_idx] = tcn_model.predict([X_val_num, X_val_branch]).flatten()


#### Build Meta Features, Train Ridge & Predict on Test Set

In [None]:
# Meta features for training meta-model
# One row per training sequence, columns = (LSTM OOF prediction, TCN OOF prediction).
# NOTE(review): OOF entries that no validation fold ever filled keep their initial value;
# confirm whether those rows should be excluded before fitting the meta-model.
X_meta_train = np.vstack((oof_preds_lstm, oof_preds_tcn)).T
y_meta_train = y_train_new


# Retrain full base models on all training data
# No validation_data is passed to these fits, so EarlyStopping's default monitor
# ("val_loss") is never logged and the callback would silently do nothing. Monitoring the
# training loss makes patience / restore_best_weights take effect.
lstm_model = build_bilstm_attention_with_normalization(input_shape, n_branches, best_params)
lstm_model.fit([X_num_train, X_branch_train], y_train_new,
               epochs=15, batch_size=best_params.get("batch_size", 64),
               callbacks=[EarlyStopping(monitor="loss", patience=3, restore_best_weights=True)],
               verbose=0, shuffle=False)

tcn_model = build_dilated_tcn_attention_model(best_params, input_shape, (1,), n_branches)
tcn_model.fit([X_num_train, X_branch_train], y_train_new,
              epochs=15, batch_size=64,
              callbacks=[EarlyStopping(monitor="loss", patience=3, restore_best_weights=True)],
              verbose=0, shuffle=False)

# Predict on test set
# Predictions are flattened to 1-D so they can be stacked column-wise.
y_pred_lstm_test = lstm_model.predict([X_num_seq_test, X_branch_seq_test]).flatten()
y_pred_tcn_test = tcn_model.predict([X_num_seq_test, X_branch_seq_test]).flatten()

# Same column order as X_meta_train: (LSTM, TCN).
X_meta_test = np.vstack((y_pred_lstm_test, y_pred_tcn_test)).T


#### Train Ridge Meta-Model and Evaluate

In [None]:
# Ridge meta-model trained on the OOF meta features, evaluated on the test meta features.
meta_model = Ridge(alpha=1.0)
meta_model.fit(X_meta_train, y_meta_train)

y_pred_stack = meta_model.predict(X_meta_test)

mae_stack = mean_absolute_error(y_seq_test, y_pred_stack)
rmse_stack = np.sqrt(mean_squared_error(y_seq_test, y_pred_stack))

print(f"\n Stacked Model Test MAE: {mae_stack:,.2f}")
print(f" Stacked Model Test RMSE: {rmse_stack:,.2f}")


### Enrich Meta Features

In [None]:
# Create Enriched Meta-Feature Set
# Columns: LSTM prediction, TCN prediction, absolute disagreement, mean of the two.
# The mean column is a linear combination of the first two.

# Train meta features
X_meta_train = np.column_stack([
    oof_preds_lstm,
    oof_preds_tcn,
    np.abs(oof_preds_lstm - oof_preds_tcn),
    (oof_preds_lstm + oof_preds_tcn) / 2
])


# Test meta features
# Built from the retrained base models' test predictions, in the same column order.
X_meta_test = np.column_stack([
    y_pred_lstm_test,
    y_pred_tcn_test,
    np.abs(y_pred_lstm_test - y_pred_tcn_test),
    (y_pred_lstm_test + y_pred_tcn_test) / 2
])


In [None]:
# Fit a Ridge Meta Model
# Uses whatever X_meta_train / y_meta_train currently hold in the kernel (the enriched
# features when the preceding cell was run last).
meta_model = Ridge(alpha=1.0)
meta_model.fit(X_meta_train, y_meta_train)

# Predict
y_meta_pred = meta_model.predict(X_meta_test)

mae = mean_absolute_error(y_seq_test, y_meta_pred)
print(f" Enriched Stacked MAE: {mae:,.2f}")

rmse = np.sqrt(mean_squared_error(y_seq_test, y_meta_pred))
print(f" Enriched Stacked RMSE: {rmse:,.2f}")

### Replace Ridge with XGBoost - 01

In [None]:
# Train meta features
# Columns: LSTM prediction, TCN prediction, absolute disagreement, mean of the two.
X_meta_train = np.column_stack([
    oof_preds_lstm,
    oof_preds_tcn,
    np.abs(oof_preds_lstm - oof_preds_tcn),
    (oof_preds_lstm + oof_preds_tcn) / 2
])


# Test meta features
# Same four columns, built from the retrained base models' test predictions.
X_meta_test = np.column_stack([
    y_pred_lstm_test,
    y_pred_tcn_test,
    np.abs(y_pred_lstm_test - y_pred_tcn_test),
    (y_pred_lstm_test + y_pred_tcn_test) / 2
])


# Fit XGBoost meta model
# Row and column subsampling (0.8 each) with a fixed seed for repeatability.
meta_model = XGBRegressor(
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    verbosity=0
)
meta_model.fit(X_meta_train, y_meta_train)


# Predict and Evaluate
y_meta_pred = meta_model.predict(X_meta_test)

mae = mean_absolute_error(y_seq_test, y_meta_pred)
print(f" XGBoost Stacked MAE: {mae:,.2f}")

rmse = np.sqrt(mean_squared_error(y_seq_test, y_meta_pred))
print(f" XGBoost Stacked RMSE: {rmse:,.2f}")

### More meta features

In [None]:
# Meta Features (Train) — OOF predictions only
# Sum, mean and std are all derived from the first two columns, so they add no new
# information for a linear model.

X_meta_train = np.column_stack([
    oof_preds_lstm,                                  # Model 1 OOF
    oof_preds_tcn,                                   # Model 2 OOF
    np.abs(oof_preds_lstm - oof_preds_tcn),          # Absolute difference
    oof_preds_lstm + oof_preds_tcn,                  # Sum of predictions
    (oof_preds_lstm + oof_preds_tcn) / 2,            # Mean prediction
    np.std(np.stack([oof_preds_lstm, oof_preds_tcn], axis=0), axis=0),  # Std dev
])


# Meta Features (Test) — NO use of y_test
# Built from y_pred_lstm_test / y_pred_tcn_test, i.e. the predictions of the base models
# retrained for stacking. The blending-section variables y_pred_lstm / y_pred_tcn were
# produced by earlier model objects and do not match the models behind the OOF features.
X_meta_test = np.column_stack([
    y_pred_lstm_test,                                # Model 1 prediction
    y_pred_tcn_test,                                 # Model 2 prediction
    np.abs(y_pred_lstm_test - y_pred_tcn_test),      # Absolute difference
    y_pred_lstm_test + y_pred_tcn_test,              # Sum of predictions
    (y_pred_lstm_test + y_pred_tcn_test) / 2,        # Mean prediction
    np.std(np.stack([y_pred_lstm_test, y_pred_tcn_test], axis=0), axis=0),  # Std dev
])


# Fit Ridge meta model
meta_model = Ridge(alpha=1.0)
meta_model.fit(X_meta_train, y_train_new)


# Predict and Evaluate

y_meta_pred = meta_model.predict(X_meta_test)

mae = mean_absolute_error(y_seq_test, y_meta_pred)
rmse = np.sqrt(mean_squared_error(y_seq_test, y_meta_pred))

print(f" Final Enriched Stacked MAE: {mae:,.4f}")
print(f" Final Enriched Stacked RMSE: {rmse:,.4f}")

### Replace Ridge with XGBoost

In [None]:
# Meta Features (Train) — OOF predictions only
# Columns: LSTM, TCN, absolute difference, sum, mean, standard deviation.
X_meta_train = np.column_stack([
    oof_preds_lstm,
    oof_preds_tcn,
    np.abs(oof_preds_lstm - oof_preds_tcn),
    oof_preds_lstm + oof_preds_tcn,
    (oof_preds_lstm + oof_preds_tcn) / 2,
    np.std(np.stack([oof_preds_lstm, oof_preds_tcn], axis=0), axis=0),
])


# Meta Features (Test)
# Built from y_pred_lstm_test / y_pred_tcn_test, i.e. the predictions of the base models
# retrained for stacking. The blending-section variables y_pred_lstm / y_pred_tcn were
# produced by earlier model objects and do not match the models behind the OOF features.
X_meta_test = np.column_stack([
    y_pred_lstm_test,
    y_pred_tcn_test,
    np.abs(y_pred_lstm_test - y_pred_tcn_test),
    y_pred_lstm_test + y_pred_tcn_test,
    (y_pred_lstm_test + y_pred_tcn_test) / 2,
    np.std(np.stack([y_pred_lstm_test, y_pred_tcn_test], axis=0), axis=0),
])


# Fit XGBoost meta model
# Row and column subsampling (0.8 each) with a fixed seed for repeatability.
meta_model = XGBRegressor(
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    verbosity=0
)
meta_model.fit(X_meta_train, y_train_new)


# Predict and Evaluate
y_meta_pred = meta_model.predict(X_meta_test)

mae = mean_absolute_error(y_seq_test, y_meta_pred)
rmse = np.sqrt(mean_squared_error(y_seq_test, y_meta_pred))

print(f" XGBoost Stacked MAE: {mae:,.4f}")
print(f" XGBoost Stacked RMSE: {rmse:,.4f}")

### Tune hyperparameters (Replace Ridge with XGBoost - 01)

In [None]:
#n=50


# Define meta features (already stacked)
# Columns: LSTM prediction, TCN prediction, absolute disagreement, mean of the two.
X_meta_train = np.column_stack([
    oof_preds_lstm,
    oof_preds_tcn,
    np.abs(oof_preds_lstm - oof_preds_tcn),
    (oof_preds_lstm + oof_preds_tcn) / 2
])

X_meta_test = np.column_stack([
    y_pred_lstm_test,
    y_pred_tcn_test,
    np.abs(y_pred_lstm_test - y_pred_tcn_test),
    (y_pred_lstm_test + y_pred_tcn_test) / 2
])


# Define MAE scorer (lower is better)
# greater_is_better=False makes scikit-learn negate the MAE so that "higher is better" holds.
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)


# Parameter search space
param_dist = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [2, 3, 4, 5],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_alpha': [0, 0.1, 0.5, 1.0],
    'reg_lambda': [0.5, 1.0, 1.5, 2.0]
}


# Cross-validation and tuning
# Forward-chaining folds keep every validation block after its training block.
tscv = TimeSeriesSplit(n_splits=5)
xgb_meta = XGBRegressor(random_state=42, verbosity=0)

# random_state pins which 50 candidates are sampled; without it the draw comes from
# NumPy's global RNG and changes with whatever cells ran before this one.
random_search = RandomizedSearchCV(
    xgb_meta,
    param_distributions=param_dist,
    scoring=mae_scorer,
    cv=tscv,
    n_iter=50,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

random_search.fit(X_meta_train, y_meta_train)


# Use best estimator to predict
# best_estimator_ is refit on all of X_meta_train with the winning parameters.
best_meta_model = random_search.best_estimator_
y_meta_pred = best_meta_model.predict(X_meta_test)


# Evaluation
mae = mean_absolute_error(y_seq_test, y_meta_pred)
rmse = np.sqrt(mean_squared_error(y_seq_test, y_meta_pred))

print(f" Tuned XGB Meta MAE: {mae:,.2f}")
print(f" Tuned XGB Meta RMSE: {rmse:,.2f}")
print(f"Best Params: {random_search.best_params_}")

In [None]:
# n=100


# Define meta features
# Columns: LSTM prediction, TCN prediction, absolute disagreement, mean of the two.

X_meta_train = np.column_stack([
    oof_preds_lstm,
    oof_preds_tcn,
    np.abs(oof_preds_lstm - oof_preds_tcn),
    (oof_preds_lstm + oof_preds_tcn) / 2
])

X_meta_test = np.column_stack([
    y_pred_lstm_test,
    y_pred_tcn_test,
    np.abs(y_pred_lstm_test - y_pred_tcn_test),
    (y_pred_lstm_test + y_pred_tcn_test) / 2
])


# Define MAE scorer (lower is better)
# greater_is_better=False makes scikit-learn negate the MAE so that "higher is better" holds.
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)


# Parameter search space
param_dist = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [2, 3, 4, 5],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_alpha': [0, 0.1, 0.5, 1.0],
    'reg_lambda': [0.5, 1.0, 1.5, 2.0]
}


# Cross-validation and tuning
# Forward-chaining folds keep every validation block after its training block.
tscv = TimeSeriesSplit(n_splits=5)
xgb_meta = XGBRegressor(random_state=42, verbosity=0)

# random_state pins which 100 candidates are sampled; without it the draw comes from
# NumPy's global RNG and changes with whatever cells ran before this one.
random_search = RandomizedSearchCV(
    xgb_meta,
    param_distributions=param_dist,
    scoring=mae_scorer,
    cv=tscv,
    n_iter=100,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

random_search.fit(X_meta_train, y_meta_train)


# Use best estimator to predict
# best_estimator_ is refit on all of X_meta_train with the winning parameters.
best_meta_model = random_search.best_estimator_
y_meta_pred = best_meta_model.predict(X_meta_test)


# Evaluation
mae = mean_absolute_error(y_seq_test, y_meta_pred)
rmse = np.sqrt(mean_squared_error(y_seq_test, y_meta_pred))

print(f" Tuned XGB Meta MAE: {mae:,.2f}")
print(f" Tuned XGB Meta RMSE: {rmse:,.2f}")
print(f"Best Params: {random_search.best_params_}")

In [None]:
# n=150


# Define meta features
# Columns: LSTM prediction, TCN prediction, absolute disagreement, mean of the two.
X_meta_train = np.column_stack([
    oof_preds_lstm,
    oof_preds_tcn,
    np.abs(oof_preds_lstm - oof_preds_tcn),
    (oof_preds_lstm + oof_preds_tcn) / 2
])

X_meta_test = np.column_stack([
    y_pred_lstm_test,
    y_pred_tcn_test,
    np.abs(y_pred_lstm_test - y_pred_tcn_test),
    (y_pred_lstm_test + y_pred_tcn_test) / 2
])


# Define MAE scorer (lower is better)
# greater_is_better=False makes scikit-learn negate the MAE so that "higher is better" holds.
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)


# Parameter search space
param_dist = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [2, 3, 4, 5],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_alpha': [0, 0.1, 0.5, 1.0],
    'reg_lambda': [0.5, 1.0, 1.5, 2.0]
}


# Cross-validation and tuning
# Forward-chaining folds keep every validation block after its training block.
tscv = TimeSeriesSplit(n_splits=5)
xgb_meta = XGBRegressor(random_state=42, verbosity=0)

# random_state pins which 150 candidates are sampled; without it the draw comes from
# NumPy's global RNG and changes with whatever cells ran before this one.
random_search = RandomizedSearchCV(
    xgb_meta,
    param_distributions=param_dist,
    scoring=mae_scorer,
    cv=tscv,
    n_iter=150,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

random_search.fit(X_meta_train, y_meta_train)


# Use best estimator to predict
# best_estimator_ is refit on all of X_meta_train with the winning parameters.
best_meta_model = random_search.best_estimator_
y_meta_pred = best_meta_model.predict(X_meta_test)


# Evaluation
mae = mean_absolute_error(y_seq_test, y_meta_pred)
rmse = np.sqrt(mean_squared_error(y_seq_test, y_meta_pred))

print(f" Tuned XGB Meta MAE: {mae:,.2f}")
print(f" Tuned XGB Meta RMSE: {rmse:,.2f}")
print(f"Best Params: {random_search.best_params_}")

### Grid Search Around Best Params

In [None]:
# MAE scorer
# greater_is_better=False makes scikit-learn negate the MAE so that "higher is better" holds.
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

# Focused grid around best randomized params
# Seven parameters with three values each: 3**7 = 2,187 candidates, each fitted on 5 folds.
param_grid = {
    'n_estimators': [40, 50, 60],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.005, 0.01, 0.02],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.5, 0.6, 0.7],
    'reg_alpha': [0.5, 1.0, 1.5],
    'reg_lambda': [0.5, 1.0, 1.5]
}

# Time series split
tscv = TimeSeriesSplit(n_splits=5)

# Model with fixed seed
xgb_meta = XGBRegressor(random_state=42, verbosity=0)

# Grid search
grid_search = GridSearchCV(
    xgb_meta,
    param_grid=param_grid,
    scoring=mae_scorer,
    cv=tscv,
    verbose=1,
    n_jobs=-1
)

# Fit to meta training data
# Uses whatever X_meta_train / y_meta_train currently hold in the kernel; this cell does
# not rebuild them.
grid_search.fit(X_meta_train, y_meta_train)

# Best model
best_meta_model_grid = grid_search.best_estimator_

# Predict on test set
y_meta_pred = best_meta_model_grid.predict(X_meta_test)

# Final evaluation
mae = mean_absolute_error(y_seq_test, y_meta_pred)
rmse = np.sqrt(mean_squared_error(y_seq_test, y_meta_pred))

print(f"Grid-Tuned XGB Meta MAE: {mae:,.2f}")
print(f"Grid-Tuned XGB Meta RMSE: {rmse:,.2f}")
print(f"Best Grid Params: {grid_search.best_params_}")


### Optimal XGBoost Meta-Model

In [None]:
# Create Enriched Meta-Feature Sets
# Columns: LSTM prediction, TCN prediction, absolute disagreement, mean of the two.
X_meta_train = np.column_stack([
    oof_preds_lstm,
    oof_preds_tcn,
    np.abs(oof_preds_lstm - oof_preds_tcn),
    (oof_preds_lstm + oof_preds_tcn) / 2
])

X_meta_test = np.column_stack([
    y_pred_lstm_test,
    y_pred_tcn_test,
    np.abs(y_pred_lstm_test - y_pred_tcn_test),
    (y_pred_lstm_test + y_pred_tcn_test) / 2
])


# Define and Train Optimal XGB Meta Model
# Hyperparameters are hard-coded here (presumably copied from the grid-search result —
# confirm they still match its best_params_ output).
best_xgb_meta_model = XGBRegressor(
    n_estimators=40,
    max_depth=4,
    learning_rate=0.01,
    subsample=0.8,
    colsample_bytree=0.5,
    reg_lambda=1.0,
    reg_alpha=0.5,
    objective='reg:squarederror',
    random_state=42,
    n_jobs=-1
)

best_xgb_meta_model.fit(X_meta_train, y_meta_train)


# Predict and Evaluate
y_meta_pred = best_xgb_meta_model.predict(X_meta_test)

mae = mean_absolute_error(y_seq_test, y_meta_pred)
rmse = np.sqrt(mean_squared_error(y_seq_test, y_meta_pred))

print(f" Optimal XGB Meta MAE: {mae:,.2f}")
print(f" Optimal XGB Meta RMSE: {rmse:,.2f}")


### Save the Optimal Stack model

In [None]:
# Package everything
# Only the meta-model and precomputed arrays are stored; the LSTM / TCN base models are
# not part of this package, so new data cannot be scored from this file alone.
stack_package = {
    "meta_model": best_xgb_meta_model,      # Trained meta model
    "oof_preds_lstm": oof_preds_lstm,       # OOF predictions from LSTM
    "oof_preds_tcn": oof_preds_tcn,         # OOF predictions from TCN
    "X_meta_train": X_meta_train,           # Meta features (train)
    "X_meta_test": X_meta_test,             # Meta features (test)
    "y_meta_train": y_meta_train,           # Meta target
    "y_seq_test": y_seq_test,               # True test values
    "y_pred_lstm_test": y_pred_lstm_test,   # LSTM test predictions
    "y_pred_tcn_test": y_pred_tcn_test      # TCN test predictions
}

# Save everything in one file
with open("dl_best_stack_model.pkl", "wb") as f:
    pickle.dump(stack_package, f)

print("Everything saved in dl_best_stack_model.pkl")


### Making the sequences

In [None]:
# Initial Setup
# Seed every RNG the notebook relies on (NumPy, TensorFlow, Python's random, hash seed).
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)
random.seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)
SEQ_LEN = 60  # look-back window length passed to create_sequences below

# Feature Selection
# Date and branch id are excluded from the numeric features; BRANCHID is fed to the
# models separately as the branch input.
exclude_cols = ["TXNDATE", "BRANCHID"]
numeric_cols = [col for col in X_train_full.columns if col not in exclude_cols]

TOP_N_FEATURES = 50
# Only int64 / float64 / bool columns are kept as candidates.
X_temp = X_train_full[numeric_cols].select_dtypes(include=['int64', 'float64', 'bool']).copy()
y_temp = y_train

# A small XGBoost regressor fitted on the training data is used purely to rank features.
xgb_model = xgb.XGBRegressor(n_estimators=100, max_depth=3, random_state=SEED)
xgb_model.fit(X_temp, y_temp)

importances = xgb_model.feature_importances_
feat_imp = pd.Series(importances, index=X_temp.columns).sort_values(ascending=False)
top_features = feat_imp.head(TOP_N_FEATURES).index.tolist()

print(f"Selected Top {TOP_N_FEATURES} Features:\n", top_features)

# Scaling
# The scaler is fitted on the training rows only and then applied to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_full[top_features])
X_test_scaled = scaler.transform(X_test_full[top_features])

# Wrap the scaled arrays back into DataFrames (fresh 0..n-1 index, original column names).
X_train_scaled = pd.DataFrame(X_train_scaled, columns=top_features)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=top_features)

# Sequence Creation
def create_sequences(X_df, y_series, branch_series, window):
    """Build sliding-window samples for the sequence models.

    One sample is produced for every row position ``idx`` in
    ``range(window, len(X_df))``: the ``window`` feature rows immediately
    before ``idx`` form the numeric input, while the branch id and the target
    are taken at ``idx`` itself.

    Rows are consumed purely by position. The function does not check that all
    rows inside a window belong to the same branch, and it expects the three
    inputs to share the same row order.

    Parameters
    ----------
    X_df : pandas.DataFrame
        Feature rows, one row per time step.
    y_series : pandas.Series
        Target values, positionally aligned with ``X_df``.
    branch_series : pandas.Series
        Integer branch ids, positionally aligned with ``X_df``.
    window : int
        Number of past rows in each input sequence (must be >= 0).

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_numeric_seq, X_branch_seq, y_seq)`` with shapes
        ``(n_samples, window, n_features)`` float32, ``(n_samples, 1)`` int32
        and ``(n_samples,)`` float32. When ``len(X_df) <= window`` the arrays
        are empty but keep these ranks.

    Raises
    ------
    ValueError
        If ``window`` is negative.
    IndexError
        If ``y_series`` or ``branch_series`` has fewer rows than ``X_df``.
    """
    if window < 0:
        raise ValueError(f"window must be non-negative, got {window}")

    n_rows = len(X_df)
    n_samples = n_rows - window

    # Not enough rows for a single window: return correctly shaped empties so
    # downstream code that inspects .shape[1:] keeps working.
    if n_samples <= 0:
        return (
            np.empty((0, window, X_df.shape[1]), dtype=np.float32),
            np.empty((0, 1), dtype=np.int32),
            np.empty((0,), dtype=np.float32),
        )

    # Plain slicing below would silently truncate, so fail loudly instead.
    if len(y_series) < n_rows or len(branch_series) < n_rows:
        raise IndexError(
            "y_series and branch_series must have at least as many rows as X_df "
            f"({len(y_series)} / {len(branch_series)} vs {n_rows})"
        )

    # Convert once to NumPy; slicing an ndarray per sample is far cheaper than
    # a pandas .iloc call per sample.
    features = X_df.to_numpy()
    X_numeric_seq = np.stack([features[end - window:end] for end in range(window, n_rows)])
    X_branch_seq = branch_series.to_numpy()[window:n_rows]
    y_seq = y_series.to_numpy()[window:n_rows]

    return (
        X_numeric_seq.astype(np.float32),
        X_branch_seq.astype(np.int32).reshape(-1, 1),
        y_seq.astype(np.float32),
    )

# One more than the largest BRANCHID in the training frame
# (presumably branch ids start at 0 and this sizes an embedding — TODO confirm).
n_branches = int(X_train_full["BRANCHID"].max()) + 1

# Training sequences. All three inputs get a fresh 0..n-1 index because
# create_sequences addresses rows by position.
# NOTE(review): create_sequences slides over rows by position only, so a window
# may mix rows of different branches unless the frame is ordered/filtered to
# prevent that — confirm this is intended.
X_num_seq, X_branch_seq, y_seq = create_sequences(
    X_train_scaled.reset_index(drop=True),
    y_train.reset_index(drop=True),
    X_train_full["BRANCHID"].reset_index(drop=True),
    window=SEQ_LEN
)

# Test sequences, built the same way from the scaled test frame.
X_num_seq_test, X_branch_seq_test, y_seq_test = create_sequences(
    X_test_scaled.reset_index(drop=True),
    # y_test is wrapped in pd.Series so that .iloc is available inside
    # create_sequences (presumably y_test may be a NumPy array here — TODO confirm).
    pd.Series(y_test).reset_index(drop=True),
    X_test_full["BRANCHID"].reset_index(drop=True),
    window=SEQ_LEN
)

# Ordered (shuffle=False) 80/20 split of the *training* sequences: the last 20%
# become X_num_test / X_branch_test / y_test. These are a hold-out slice of the
# training data, not the real test set (that is X_num_seq_test /
# X_branch_seq_test / y_seq_test).
# NOTE(review): this rebinds y_test, which the call above just read as the test
# target. Re-running this cell without first restoring the original y_test
# would feed the wrong labels into the test-sequence call.
X_num_train, X_num_test, X_branch_train, X_branch_test, y_train_new, y_test = train_test_split(
    X_num_seq, X_branch_seq, y_seq, test_size=0.2, shuffle=False
)

# Per-sample input shapes (leading sample axis dropped).
input_shape_num = X_num_seq.shape[1:]
input_shape_branch = X_branch_seq.shape[1:]
# One more than the largest branch id present in the training sequences.
vocab_size = int(np.max(X_branch_seq)) + 1

#### Loading the optimal stack model

In [None]:
# Round-trip check: restore the pickled stack bundle and re-score the meta model.
# NOTE: unpickling can execute arbitrary code; only load files this notebook wrote.
with open("dl_best_stack_model.pkl", mode="rb") as f:
    loaded_stack = pickle.load(f)

# Pull out the three entries needed for scoring.
meta_model_loaded, X_meta_test_loaded, y_seq_test_loaded = (
    loaded_stack[key] for key in ("meta_model", "X_meta_test", "y_seq_test")
)

# Meta-model predictions on the stored test meta-features
y_meta_pred_loaded = meta_model_loaded.predict(X_meta_test_loaded)

# Error metrics against the stored ground truth
rmse_loaded = np.sqrt(mean_squared_error(y_seq_test_loaded, y_meta_pred_loaded))
mae_loaded = mean_absolute_error(y_seq_test_loaded, y_meta_pred_loaded)

print(f"Loaded Meta Model MAE: {mae_loaded:,.2f}")
print(f"Loaded Meta Model RMSE: {rmse_loaded:,.2f}")


In [None]:
# Restore the saved ensembles.
# NOTE: unpickling can execute arbitrary code; only load files this notebook wrote.

# Blend bundle: LSTM model, TCN model and the blend weight
with open("dl_best_blend_model.pkl", mode="rb") as f:
    blend_package = pickle.load(f)

lstm_model, tcn_model, best_blend_weight = (
    blend_package[key] for key in ("lstm_model", "tcn_model", "best_weight")
)

# Stack bundle: meta model plus its stored test inputs / targets
with open("dl_best_stack_model.pkl", mode="rb") as f:
    stack_package = pickle.load(f)

stack_model = stack_package["meta_model"]


# Predictions

# Base models on the test sequences, flattened to 1-D
y_pred_lstm = lstm_model.predict([X_num_seq_test, X_branch_seq_test]).flatten()
y_pred_tcn = tcn_model.predict([X_num_seq_test, X_branch_seq_test]).flatten()

# Weighted blend of the two base models
y_pred_blend = best_blend_weight * y_pred_lstm + (1 - best_blend_weight) * y_pred_tcn

# Stacked model on the stored test meta-features
X_meta_test_loaded = stack_package["X_meta_test"]
y_pred_stack = stack_model.predict(X_meta_test_loaded)

# Ground truth comes from the stack bundle (this rebinds y_test)
y_test = stack_package["y_seq_test"]


# Evaluation function
def evaluate_model(y_true, y_pred):
    """Score predictions against ground truth.

    Returns a ``(rmse, mae, r2)`` tuple: root mean squared error, mean
    absolute error and the R² score, in that order.
    """
    return (
        np.sqrt(mean_squared_error(y_true, y_pred)),
        mean_absolute_error(y_true, y_pred),
        r2_score(y_true, y_pred),
    )


# Score every model against the same ground truth. Metrics are stored already
# formatted (thousands separator, two decimals) because the table is display-only.
results = {
    "Model": ["LSTM", "TCN", "Weighted Blend", "Stacked Model"],
    "RMSE": [],
    "MAE": [],
}

for preds in (y_pred_lstm, y_pred_tcn, y_pred_blend, y_pred_stack):
    # r2 is returned by evaluate_model but is not part of the summary table.
    rmse, mae, r2 = evaluate_model(y_test, preds)
    results["RMSE"] += [f"{rmse:,.2f}"]
    results["MAE"] += [f"{mae:,.2f}"]


# Summary table, printed without the row index
results_df = pd.DataFrame(data=results)
print("\n Model Performance Summary:")
print(results_df.to_string(index=False))


In [None]:
# Actual vs predicted time series for one branch.
#
# The cell reloads both saved bundles, so it only depends on test_df, SEQ_LEN
# and the test sequences (X_num_seq_test / X_branch_seq_test) already being
# defined.
# NOTE: unpickling can execute arbitrary code; only load files this notebook wrote.

# Load saved blend model (contains LSTM, TCN, and best weight)
with open("dl_best_blend_model.pkl", "rb") as f:
    blend_package = pickle.load(f)

lstm_model = blend_package["lstm_model"]
tcn_model = blend_package["tcn_model"]
best_blend_weight = blend_package["best_weight"]

# Load saved stack model (contains meta_model, X_meta_test, and y_seq_test)
with open("dl_best_stack_model.pkl", "rb") as f:
    stack_package = pickle.load(f)

stack_model = stack_package["meta_model"]
X_meta_test_loaded = stack_package["X_meta_test"]
y_seq_test = stack_package["y_seq_test"]

# Ensure TXNDATE is datetime and reset index
test_df = test_df.copy()
test_df["TXNDATE"] = pd.to_datetime(test_df["TXNDATE"])
test_df = test_df.reset_index(drop=True)

# Predict with base models
y_pred_lstm = lstm_model.predict([X_num_seq_test, X_branch_seq_test]).flatten()
y_pred_tcn = tcn_model.predict([X_num_seq_test, X_branch_seq_test]).flatten()

# Weighted blended prediction
y_pred_blend = best_blend_weight * y_pred_lstm + (1 - best_blend_weight) * y_pred_tcn

# Stacked prediction
y_pred_stack = stack_model.predict(X_meta_test_loaded)

# Align test_df by dropping first SEQ_LEN rows (sequence window).
# NOTE(review): this assumes test_df is in the same row order as the frame the
# test sequences were built from — confirm.
test_df_seq = test_df.iloc[SEQ_LEN:].copy().reset_index(drop=True)

# Confirm lengths match
assert len(test_df_seq) == len(y_seq_test), f"Mismatch: {len(test_df_seq)} vs {len(y_seq_test)}"

# Merge predictions into aligned DataFrame
test_df_seq["Actual"] = y_seq_test
test_df_seq["LSTM_Pred"] = y_pred_lstm
test_df_seq["TCN_Pred"] = y_pred_tcn
test_df_seq["Blended_Pred"] = y_pred_blend
test_df_seq["Stacked_Pred"] = y_pred_stack

# Plot for a specific BranchID
branch_id = 21
branch_df = test_df_seq[test_df_seq["BRANCHID"] == branch_id].sort_values("TXNDATE")

if branch_df.empty:
    # Without this guard an unknown branch id silently renders a blank chart.
    print(f"No data found for BranchID={branch_id}")
else:
    # (column, legend label, line style) for every series on the chart
    series_specs = [
        ("Actual", "Actual", dict(color="black", linewidth=2)),
        ("LSTM_Pred", "LSTM", dict(color="dodgerblue", linestyle="--")),
        ("TCN_Pred", "TCN", dict(color="green", linestyle="--")),
        ("Blended_Pred", "Blended", dict(color="purple", linestyle="--")),
        ("Stacked_Pred", "Stacked", dict(color="red", linestyle="--")),
    ]

    plt.figure(figsize=(12,6))
    for column, series_label, line_style in series_specs:
        plt.plot(branch_df["TXNDATE"], branch_df[column], label=series_label, **line_style)

    plt.title(f"Actual vs Predicted — BranchID = {branch_id}")
    plt.xlabel("Date")
    plt.ylabel("NetCashFlow")
    plt.legend()
    plt.grid(True)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [None]:
# Actual vs predicted time series for one branch, restricted to a date range.
#
# The cell reloads both saved bundles, so it only depends on test_df, SEQ_LEN
# and the test sequences (X_num_seq_test / X_branch_seq_test) already being
# defined.
# NOTE: unpickling can execute arbitrary code; only load files this notebook wrote.

# Load saved blend model (contains LSTM, TCN, and best weight)
with open("dl_best_blend_model.pkl", "rb") as f:
    blend_package = pickle.load(f)

lstm_model = blend_package["lstm_model"]
tcn_model = blend_package["tcn_model"]
best_blend_weight = blend_package["best_weight"]

# Load saved stack model (contains meta_model, X_meta_test, and y_seq_test)
with open("dl_best_stack_model.pkl", "rb") as f:
    stack_package = pickle.load(f)

stack_model = stack_package["meta_model"]
X_meta_test_loaded = stack_package["X_meta_test"]
y_seq_test = stack_package["y_seq_test"]

# Ensure TXNDATE is datetime and reset index
test_df = test_df.copy()
test_df["TXNDATE"] = pd.to_datetime(test_df["TXNDATE"])
test_df = test_df.reset_index(drop=True)

# Predict with base models
y_pred_lstm = lstm_model.predict([X_num_seq_test, X_branch_seq_test]).flatten()
y_pred_tcn = tcn_model.predict([X_num_seq_test, X_branch_seq_test]).flatten()

# Weighted blended prediction
y_pred_blend = best_blend_weight * y_pred_lstm + (1 - best_blend_weight) * y_pred_tcn

# Stacked prediction
y_pred_stack = stack_model.predict(X_meta_test_loaded)

# Align test_df by dropping first SEQ_LEN rows (sequence window).
# NOTE(review): this assumes test_df is in the same row order as the frame the
# test sequences were built from — confirm.
test_df_seq = test_df.iloc[SEQ_LEN:].copy().reset_index(drop=True)

# Confirm lengths match
assert len(test_df_seq) == len(y_seq_test), f"Mismatch: {len(test_df_seq)} vs {len(y_seq_test)}"

# Merge predictions into aligned DataFrame
test_df_seq["Actual"] = y_seq_test
test_df_seq["LSTM_Pred"] = y_pred_lstm
test_df_seq["TCN_Pred"] = y_pred_tcn
test_df_seq["Blended_Pred"] = y_pred_blend
test_df_seq["Stacked_Pred"] = y_pred_stack

# Plot for a specific BranchID with date range
branch_id = 21
start_date = "2025-03-04"
end_date   = "2025-03-10"

branch_df = test_df_seq[test_df_seq["BRANCHID"] == branch_id].copy()
branch_df = branch_df.sort_values("TXNDATE")

# Filter by date range (both bounds inclusive)
mask = branch_df["TXNDATE"].between(pd.to_datetime(start_date), pd.to_datetime(end_date))
branch_df = branch_df.loc[mask]

if branch_df.empty:
    # Without this guard an unknown branch id or an out-of-range date window
    # silently renders a blank chart.
    print(f"No data found for BranchID={branch_id} between {start_date} and {end_date}")
else:
    # (column, legend label, line style) for every series on the chart
    series_specs = [
        ("Actual", "Actual", dict(color="black", linewidth=2)),
        ("LSTM_Pred", "LSTM", dict(color="dodgerblue", linestyle="--")),
        ("TCN_Pred", "TCN", dict(color="green", linestyle="--")),
        ("Blended_Pred", "Blended", dict(color="purple", linestyle="--")),
        ("Stacked_Pred", "Stacked", dict(color="red", linestyle="--")),
    ]

    # Plot
    plt.figure(figsize=(12,6))
    for column, series_label, line_style in series_specs:
        plt.plot(branch_df["TXNDATE"], branch_df[column], label=series_label, **line_style)

    plt.title(f"Actual vs Predicted — BranchID = {branch_id}\n({start_date} to {end_date})")
    plt.xlabel("Date")
    plt.ylabel("NetCashFlow")
    plt.legend()
    plt.grid(True)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [None]:
# Single-day comparison: choose the branch and the date to inspect.
branch_id = 21
query_date = pd.to_datetime("2025-03-07")

# Rows of the aligned prediction frame for that branch on that exact timestamp
subset = test_df_seq.loc[
    (test_df_seq["BRANCHID"] == branch_id) & (test_df_seq["TXNDATE"] == query_date)
]

if not subset.empty:
    # Display labels, marker colours and the first matching row's values,
    # all listed in the same order.
    labels = ["Actual", "LSTM", "TCN", "Blended", "Stacked"]
    colors = ["black", "dodgerblue", "green", "purple", "red"]
    values = [
        subset[column].values[0]
        for column in ("Actual", "LSTM_Pred", "TCN_Pred", "Blended_Pred", "Stacked_Pred")
    ]
    actual, lstm_pred, tcn_pred, blended_pred, stacked_pred = values

    plt.figure(figsize=(8, 5))

    # One annotated marker per series; the label doubles as the x position.
    for label, val, color in zip(labels, values, colors):
        plt.scatter(label, val, color=color, s=150, label=label)
        plt.text(label, val, f"{val:,.0f}", ha='center', va='bottom', fontsize=12)

    plt.title(f"NetCashFlow on {query_date.date()} for BranchID {branch_id}")
    plt.ylabel("Amount")
    plt.grid(axis='y')
    plt.legend()
    plt.show()
else:
    print(f"No data found for BranchID={branch_id} on {query_date.date()}")

In [None]:
# Single-day comparison: choose the branch and the date to inspect.
branch_id = 21
query_date = pd.to_datetime("2025-01-24")

# Rows of the aligned prediction frame for that branch on that exact timestamp
subset = test_df_seq.loc[
    (test_df_seq["BRANCHID"] == branch_id) & (test_df_seq["TXNDATE"] == query_date)
]

if not subset.empty:
    # Display labels, marker colours and the first matching row's values,
    # all listed in the same order.
    labels = ["Actual", "LSTM", "TCN", "Blended", "Stacked"]
    colors = ["black", "dodgerblue", "green", "purple", "red"]
    values = [
        subset[column].values[0]
        for column in ("Actual", "LSTM_Pred", "TCN_Pred", "Blended_Pred", "Stacked_Pred")
    ]
    actual, lstm_pred, tcn_pred, blended_pred, stacked_pred = values

    plt.figure(figsize=(8, 5))

    # One annotated marker per series; the label doubles as the x position.
    for label, val, color in zip(labels, values, colors):
        plt.scatter(label, val, color=color, s=150, label=label)
        plt.text(label, val, f"{val:,.0f}", ha='center', va='bottom', fontsize=12)

    plt.title(f"NetCashFlow on {query_date.date()} for BranchID {branch_id}")
    plt.ylabel("Amount")
    plt.grid(axis='y')
    plt.legend()
    plt.show()
else:
    print(f"No data found for BranchID={branch_id} on {query_date.date()}")