In [1]:
from lightgbm import LGBMRegressor
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
import matplotlib.pyplot as plt

## Load Data

In [2]:
neighborhood_df = pd.read_csv("../../housing-data/all_neighborhood_features_rotterdam.csv")
transaction_df = pd.read_csv("../../housing-data/rotterdam_transaction_data.csv")

In [3]:
neighborhood_df

Unnamed: 0,BUURTCODE,YEAR,LEEFBAAROMETER,GROEN,EC,NO2,PM2_5,PM10,GELUIDSHINDERTOTAAL,AFSTANDTOTHUISARTSENPRAKTIJK,...,MOTORFIETSEN,AFSTANDTOTSCHOOL,SCHOLENBINNEN3KM,OPPERVLAKTETOTAAL,OPPERVLAKTELAND,OPPERVLAKTEWATER,MEESTVOORKOMENDEPOSTCODE,DEKKINGSPERCENTAGE,MATEVANSTEDELIJKHEID,OMGEVINGSADRESSENDICHTHEID
0,0,2014,4.085167,0.019481,0.000689,0.021756,0.010006,0.017313,0.323869,0.400000,...,9.999997,0.800000,9.099997,12.999996,12.999996,0.0,2990.999057,1.000000,1.999999,2086.999342
1,1,2014,4.034464,0.041062,0.000660,0.020684,0.009676,0.016755,0.304635,0.293726,...,9.796313,0.293398,8.797291,17.599499,17.599499,0.0,2923.542404,0.977446,1.954893,1982.046788
2,2,2014,4.110514,0.031598,0.000684,0.022700,0.009859,0.017139,0.322656,0.200000,...,29.999992,0.300000,12.199997,13.999996,13.999996,0.0,2991.999193,1.000000,1.999999,1891.999490
3,3,2014,4.110993,0.037557,0.000684,0.022711,0.009891,0.017191,0.325598,0.400000,...,9.999997,0.700000,11.199997,51.999987,51.999987,0.0,2990.999252,1.000000,1.999999,1676.999580
4,4,2014,4.048586,0.038952,0.000666,0.021961,0.009820,0.017036,0.301614,0.500000,...,19.999993,0.900000,9.099997,6.999998,6.999998,0.0,2990.999025,1.000000,1.999999,1578.999485
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6870,620,2024,4.042706,0.040761,0.000251,0.017493,0.007697,0.015455,0.327098,3.400000,...,30.000000,0.400000,1.500000,21.000000,20.000000,0.0,3238.000000,1.000000,5.000000,281.000000
6871,621,2024,4.061659,0.047161,0.000250,0.017436,0.007677,0.015409,0.321582,3.600000,...,60.000000,0.800000,1.000000,30.000000,27.000000,4.0,3238.000000,3.000000,5.000000,284.000000
6872,622,2024,4.269247,0.046067,0.000249,0.017399,0.007667,0.015397,0.330565,3.800000,...,10.000000,0.500000,1.000000,17.000000,11.000000,6.0,3238.000000,1.000000,5.000000,278.000000
6873,623,2024,4.166958,0.032288,0.000240,0.017008,0.007543,0.015186,0.315714,3.592785,...,25.000000,2.195591,1.197595,884.000000,809.000000,75.0,3238.000000,1.000000,5.000000,95.000000


In [4]:
transaction_df
transaction_df['DATUM'] = pd.to_datetime(transaction_df['DATUM'])
transaction_df.sort_values('DATUM', inplace=True)

transaction_df['YEAR'] = transaction_df['DATUM'].dt.year
transaction_df['MONTH'] = transaction_df['DATUM'].dt.month
transaction_df.drop([ "DATUM"], axis=1, inplace=True)


In [5]:
def combine_data(transactions, node_features):
    combined_df = transactions.merge(node_features,how="left", on=["BUURTCODE", "YEAR"])
    return combined_df


In [6]:
combined_df = combine_data(transactions=transaction_df, node_features=neighborhood_df)
combined_df

Unnamed: 0,TRANSID,WONINGTYPE,SOC,CALCOPP,KAVOPP,BOUWJAAR,BUURTCODE,LAT,LON,BESTEMMING,...,MOTORFIETSEN,AFSTANDTOTSCHOOL,SCHOLENBINNEN3KM,OPPERVLAKTETOTAAL,OPPERVLAKTELAND,OPPERVLAKTEWATER,MEESTVOORKOMENDEPOSTCODE,DEKKINGSPERCENTAGE,MATEVANSTEDELIJKHEID,OMGEVINGSADRESSENDICHTHEID
0,4329686,0.0,1182,90,0,1945,371,51.914485,4.320525,1.0,...,89.819022,0.283639,11.629200,66.182437,63.346047,2.836390,2961.191346,0.945463,1.890927,2190.638681
1,4329737,2.0,1425,164,292,1984,457,52.006799,4.546388,1.0,...,69.999978,0.400000,4.999998,26.999991,26.999991,0.000000,2664.999147,1.000000,3.999999,987.999684
2,4333802,0.0,1184,90,0,1984,178,51.925000,4.476842,1.0,...,352.517281,0.676276,44.044660,167.092214,130.386799,36.705415,2908.828268,0.966062,0.966062,6368.582264
3,4331781,3.0,1131,130,149,1964,159,51.931712,4.232454,1.0,...,89.999985,0.500000,8.699999,81.999986,61.999990,19.999997,3144.999468,1.000000,2.000000,1789.999697
4,4344392,4.0,1111,140,416,1991,523,51.831090,4.354248,1.0,...,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108994,8849514,0.0,1188,131,0,2007,192,51.902352,4.461150,1.0,...,105.000000,0.500000,20.600000,107.000000,47.000000,61.000000,3024.000000,5.000000,1.000000,3371.000000
108995,8846853,0.0,1188,87,0,1926,207,51.946392,4.463521,1.0,...,230.000000,0.400000,28.600000,127.000000,119.000000,8.000000,3051.000000,1.000000,1.000000,3168.000000
108996,8864162,3.0,1131,126,130,1998,583,51.844048,4.161957,1.0,...,75.000000,1.200000,5.300000,26.000000,22.000000,3.000000,3223.000000,1.000000,3.000000,1055.000000
108997,8854064,3.0,1136,213,104,1898,226,51.905242,4.510151,1.0,...,55.000000,0.400000,23.800000,100.000000,63.000000,37.000000,3071.000000,1.000000,1.000000,4005.000000


## LightGBM with an 80-20 split

In [7]:
# from sklearn.model_selection import train_test_split
# X = combined_df.drop(["LOG_KOOPSOM"], axis=1)
# y = combined_df["LOG_KOOPSOM"]

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# # Train the model
# params = {'objective': 'L2', 'n_estimators': 1000, 'learning_rate': 0.05, 
#           'num_leaves': 1000, 'min_data_in_leaves': 10, 'feature_fraction': 0.7, 'max_depth': 75,
#           'lambda_l2':10e-5, 'path_smooth': 10e-5, 'n_jobs': -1}
# model = LGBMRegressor(**params, verbose=-1)
# model.fit(X_train, y_train)

In [9]:
# # Evaluate the model
# log_y_pred = model.predict(X_test, verbose=-1)
# y_pred = np.exp(log_y_pred)
# mape = np.abs(y_pred / np.exp(y_test) - 1).mean()
# ratio =  np.mean(y_pred / np.exp(y_test))

# print(f"mean ratio: {ratio}, mape: {mape}")

# log_res = log_y_pred - y_test

# bins = np.linspace(-0.75, 0.75, 51)
# plt.hist(log_res, bins=bins, edgecolor='black')
# plt.axvline(x=0, color='red', linestyle='--')
# plt.show()

## LightGBM with Sliding Windows

In [10]:
from sklearn.metrics import root_mean_squared_error

In [11]:
combined_df["DATE"] = pd.to_datetime(combined_df["YEAR"].astype(str) + "-" + combined_df["MONTH"].astype(str))
combined_df = combined_df.sort_values("DATE")
# combined_df = combined_df.drop(columns=["YEAR", "MONTH"])

In [12]:
features = [col for col in combined_df.columns if col not in ["LOG_KOOPSOM", "DATE", "TRANSID"]]
target = "LOG_KOOPSOM"

In [21]:
from sklearn.preprocessing import StandardScaler, RobustScaler

min_date = combined_df["DATE"].min()
max_date = combined_df["DATE"].max()

start_train = min_date
end_train = start_train + pd.DateOffset(months=60)
test_month = end_train + pd.DateOffset(months=1)

model = None
scaler = StandardScaler()
all_window_preds = []

# To track relative errors per epoch
epoch_stats = []
prev_booster = None 

while test_month <= max_date:
    print("Train Start Date: ",start_train )
    print("Train End Date: ", end_train )
    print("Test Month", test_month)

    train_data = combined_df[(combined_df["DATE"] >= start_train) & (combined_df["DATE"] <= end_train)]
    test_data = combined_df[combined_df["DATE"] == test_month]
    print(len(test_data), "test data points")
    if test_data.empty:
        print("empty")
        break


    base_params = {'objective': 'L2', 'n_estimators': 50, 'learning_rate': 0.05, 
          'num_leaves': 1000, 'min_data_in_leaves': 10, 'feature_fraction': 0.7, 'max_depth': 75,
          'lambda_l2':10e-5, 'path_smooth': 10e-5, 'n_jobs': -1}
    
    # if model==None:
    #     scaler.fit(X_train)  # Fit scaler only on first window
    # else:
    #     scaler.partial_fit(X_train)  # Update scaler with new window

    # X_train_scaled = scaler.transform(X_train)
    # X_test_scaled = scaler.transform(X_test)

    if model is None:
        print("Training new model...")
        model = LGBMRegressor(**base_params, verbose=-1)
        model.fit(train_data[features], train_data[target])
    else:
        print("Updating model with new data...", flush=True)
        
        new_params = {**base_params, "n_estimators": 50}
        model = LGBMRegressor(**new_params, verbose=-1)
        model.fit(
            train_data[features],
            train_data[target],
            init_model=prev_booster           # Booster, not the wrapper
        )
        prev_booster = model.booster_ 
    print("current start:", start_train)
    start_train += pd.DateOffset(months=1)
    print("updated start", start_train)
    end_train += pd.DateOffset(months=1)
    test_month += pd.DateOffset(months=1)
    print("--------------------------------------------------")
    print("Updated Train Start Date: ", start_train)
    print("Updated Train End Date: ", end_train)
    print("Updated Test Month: ", test_month)
    print("--------------------------------------------------")

Train Start Date:  2014-10-01 00:00:00
Train End Date:  2019-10-01 00:00:00
Test Month 2019-11-01 00:00:00
918 test data points
Training new model...
current start: 2014-10-01 00:00:00
updated start 2014-11-01 00:00:00
--------------------------------------------------
Updated Train Start Date:  2014-11-01 00:00:00
Updated Train End Date:  2019-11-01 00:00:00
Updated Test Month:  2019-12-01 00:00:00
--------------------------------------------------
Train Start Date:  2014-11-01 00:00:00
Train End Date:  2019-11-01 00:00:00
Test Month 2019-12-01 00:00:00
1178 test data points
Updating model with new data...
current start: 2014-10-01 00:00:00
updated start 2014-11-01 00:00:00
--------------------------------------------------
Updated Train Start Date:  2014-11-01 00:00:00
Updated Train End Date:  2019-11-01 00:00:00
Updated Test Month:  2019-12-01 00:00:00
--------------------------------------------------
Train Start Date:  2014-11-01 00:00:00
Train End Date:  2019-11-01 00:00:00
Test 

KeyboardInterrupt: 

In [None]:
from lightgbm import LGBMRegressor
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import root_mean_squared_error
from sklearn.preprocessing import StandardScaler, RobustScaler

import matplotlib.pyplot as plt

# Load Data
neighborhood_df = pd.read_csv("../../housing-data/all_neighborhood_features_rotterdam.csv")
transaction_df = pd.read_csv("../../housing-data/rotterdam_transaction_data.csv")

# Preprocess transaction data
transaction_df['DATUM'] = pd.to_datetime(transaction_df['DATUM'])
transaction_df.sort_values('DATUM', inplace=True)
transaction_df['YEAR'] = transaction_df['DATUM'].dt.year
transaction_df['MONTH'] = transaction_df['DATUM'].dt.month
transaction_df.drop(["DATUM"], axis=1, inplace=True)

# Combine data
def combine_data(transactions, node_features):
    combined_df = transactions.merge(node_features, how="left", on=["BUURTCODE", "YEAR"])
    return combined_df

combined_df = combine_data(transactions=transaction_df, node_features=neighborhood_df)

# Add DATE column and sort
combined_df["DATE"] = pd.to_datetime(combined_df["YEAR"].astype(str) + "-" + combined_df["MONTH"].astype(str))
combined_df = combined_df.sort_values("DATE")

# Feature selection
features = [col for col in combined_df.columns if col not in ["LOG_KOOPSOM", "DATE", "TRANSID"]]
target = "LOG_KOOPSOM"

# LightGBM with Sliding Windows
min_date = combined_df["DATE"].min()
max_date = combined_df["DATE"].max()
start_train = min_date
end_train = start_train + pd.DateOffset(months=60)
test_month = end_train + pd.DateOffset(months=1)

model = None
scaler = StandardScaler()
all_window_preds = []
epoch_stats = []
prev_booster = None

while test_month <= max_date:
    print("Train Start Date: ", start_train)
    print("Train End Date: ", end_train)
    print("Test Month", test_month)

    train_data = combined_df[(combined_df["DATE"] >= start_train) & (combined_df["DATE"] <= end_train)]
    test_data = combined_df[combined_df["DATE"] == test_month]
    print(len(test_data), "test data points")
    if test_data.empty:
        print("empty")
        break

    base_params = {'objective': 'L2', 'n_estimators': 50, 'learning_rate': 0.05,
                   'num_leaves': 1000, 'min_data_in_leaves': 10, 'feature_fraction': 0.7, 'max_depth': 75,
                   'lambda_l2': 10e-5, 'path_smooth': 10e-5, 'n_jobs': -1}

    if model is None:
        print("Training new model...")
        model = LGBMRegressor(**base_params, verbose=-1)
        model.fit(train_data[features], train_data[target])
        prev_booster = model.booster_
    else:
        print("Updating model with new data...", flush=True)
        new_params = {**base_params, "n_estimators": 50}
        model = LGBMRegressor(**new_params, verbose=-1)
        model.fit(
            train_data[features],
            train_data[target],
            init_model=prev_booster
        )
        prev_booster = model.booster_

    # Make predictions
    predictions = model.predict(test_data[features])
    actuals = test_data[target].values

    # Train predictions for metrics
    train_preds = model.predict(train_data[features])
    train_mse = np.mean((train_data[target] - train_preds) ** 2)
    train_mape = np.mean(np.abs((np.exp(train_data[target]) - np.exp(train_preds)) / np.exp(train_data[target]))) * 100

    print(f"Train MSE: {train_mse}")
    print(f"Train MAPE: {train_mape}")

    mse = np.mean((actuals - predictions) ** 2)
    print(f"MSE: {mse}")
    mape = np.mean(np.abs((np.exp(actuals) - np.exp(predictions)) / np.exp(predictions))) * 100
    print(f"MAPE: {mape}")

    epoch_stats.append({
        "window_start": start_train.strftime('%Y-%m'),
        "epoch": 1,
        "train_mape": train_mape,
        "test_mape": mape,
        "train_mse": train_mse,
        "test_mse": mse,
    })
    preds_df = pd.DataFrame({
        "window_start": start_train.strftime('%Y-%m'),
        "BUURTCODE": test_data["BUURTCODE"].values,
        "YEAR": test_data["YEAR"].values,
        "MONTH": test_data["MONTH"].values,
        "TRANSID": test_data["TRANSID"].values,
        "y_true": actuals,
        "y_pred": predictions,
    })

    all_window_preds.append(preds_df)
    # Move window forward
    print("current start:", start_train)
    start_train += pd.DateOffset(months=1)
    print("updated start", start_train)
    end_train += pd.DateOffset(months=1)
    test_month += pd.DateOffset(months=1)
    print("--------------------------------------------------")
    print("Updated Train Start Date: ", start_train)
    print("Updated Train End Date: ", end_train)
    print("Updated Test Month: ", test_month)
    print("--------------------------------------------------")

# Save predictions and stats
final_preds_df = pd.concat(all_window_preds, ignore_index=True)
final_preds_df.to_csv("./outputs/all_test_predictions_ml.csv", index=False)

stats_df = pd.DataFrame(epoch_stats)
stats_df.to_csv("./outputs/training_stats_ml.csv", index=False)

# Online Learning Approach (example, not run above)
# df = combined_df
# scaler = RobustScaler()
# params = {'objective': 'L2', 'n_estimators': 1000, 'learning_rate': 0.05,
#           'num_leaves': 1000, 'min_data_in_leaves': 10, 'feature_fraction': 0.7, 'max_depth': 75,
#           'lambda_l2': 10e-5, 'path_smooth': 10e-5, 'n_jobs': -1}
# model = LGBMRegressor(**params, verbose=-1)
# min_date = df["DATE"].min()
# max_date = df["DATE"].max()
# start_train = min_date
# test_month = start_train + pd.DateOffset(months=60)
# predictions = []
# actuals = []
# while test_month <= max_date:
#     train_idx = df["DATE"] < test_month
#     test_idx = df["DATE"] == test_month
#     print("Test Month:", test_month)
#     X_train, y_train = df.loc[train_idx, features], df.loc[train_idx, target]
#     X_test, y_test = df.loc[test_idx, features], df.loc[test_idx, target]
#     if X_train.empty or X_test.empty:
#         break
#     if start_train == min_date:
#         model.fit(X_train, y_train)
#     else:
#         model.set_params(n_estimators=model.n_estimators + 20)
#         model.fit(X_train, y_train, init_model=model)
#     preds = model.predict(X_test)
#     predictions.extend(preds)
#     actuals.extend(y_test.values)
#     rmse = root_mean_squared_error(actuals, predictions)
#     print(f"RMSE: {rmse}")
#     mape = np.mean(np.abs((np.exp(actuals) - np.exp(predictions)) / np.exp(predictions))) * 100
#     print(f"MAPE: {mape}")
#     test_month += pd.DateOffset(months=1)
# rmse = root_mean_squared_error(actuals, predictions)
# print(f"Final RMSE: {rmse}")
# mape = np.mean(np.abs((np.exp(actuals) - np.exp(predictions)) / np.exp(predictions))) * 100
# print(f"Final MAPE: {mape}")

In [None]:
from sklearn.preprocessing import StandardScaler, RobustScaler


min_date = combined_df["DATE"].min()
max_date = combined_df["DATE"].max()

start_train = min_date
end_train = start_train + pd.DateOffset(months=60)
test_month = end_train + pd.DateOffset(months=1)
model = None
scaler = StandardScaler()
all_window_preds = []

# To track relative errors per epoch
epoch_stats = []


while test_month <= max_date:
    print("🔁 New loop iteration - start_train:", start_train)
    predictions = []
    actuals = []
    print("Train Start Date: ",start_train )
    print("Train End Date: ", end_train )
    print("Test Month", test_month)
    # Train/Test Split
    train_data = combined_df[(combined_df["DATE"] >= start_train) & (combined_df["DATE"] <= end_train)]
    test_data = combined_df[combined_df["DATE"] == test_month]
    print(len(test_data), "test data points")
    if test_data.empty:
        print("empty")
        break

    # LightGBM dataset
    # train_set = lgb.Dataset(train_data[features], label=train_data[target])
    # test_set = lgb.Dataset(test_data[features], label=test_data[target], reference=train_set)

    # Train model
 
    params = {'objective': 'L2', 'n_estimators': 50, 'learning_rate': 0.05, 
          'num_leaves': 1000, 'min_data_in_leaves': 10, 'feature_fraction': 0.7, 'max_depth': 75,
          'lambda_l2':10e-5, 'path_smooth': 10e-5, 'n_jobs': -1}
    
    # if model==None:
    #     scaler.fit(X_train)  # Fit scaler only on first window
    # else:
    #     scaler.partial_fit(X_train)  # Update scaler with new window

    # X_train_scaled = scaler.transform(X_train)
    # X_test_scaled = scaler.transform(X_test)

    if model is None:
        model = LGBMRegressor(**params, verbose=-1)
        model.fit(train_data[features], train_data[target])
    else:
        model.set_params(n_estimators=50)
        model.fit(train_data[features], train_data[target], init_model=model)

    # Make predictions
    preds = model.predict(test_data[features])

    # Store results
    predictions = preds
    actuals= test_data[target].values


    # Train predictions for metrics
    train_preds = model.predict(train_data[features])
    train_mse = np.mean((train_data[target] - train_preds) ** 2)
    train_mape = np.mean(np.abs((np.exp(train_data[target]) - np.exp(train_preds)) / np.exp(train_data[target]))) * 100

    print(f"Train MSE: {train_mse}")
    print(f"Train MAPE: {train_mape}")


    mse = np.mean((actuals - predictions) ** 2)
    print(f"MSE: {mse}")
    mape = np.mean(np.abs((np.exp(actuals) - np.exp(predictions)) / np.exp(predictions))) * 100
    print(f"MAPE: {mape}")

    epoch_stats.append({
        "window_start": start_train.strftime('%Y-%m'),
        "epoch":  1,
        "train_mape": train_mape,
        "test_mape": mape,
        "train_mse": train_mse,
        "test_mse": mse,
    })
    preds_df = pd.DataFrame({
        "window_start": start_train.strftime('%Y-%m'),
        "BUURTCODE": test_data["BUURTCODE"].values,
        "YEAR": test_data["YEAR"].values,
        "MONTH": test_data["MONTH"].values,
        "TRANSID": test_data["TRANSID"].values,
        "y_true": actuals,
        "y_pred": predictions,
    })

    all_window_preds.append(preds_df)
    # Move window forward
    print("current start:", start_train)
    start_train += pd.DateOffset(months=1)
    print("updated start", start_train)
    end_train += pd.DateOffset(months=1)
    test_month += pd.DateOffset(months=1)
    print("--------------------------------------------------")
    print("Updated Train Start Date: ", start_train)
    print("Updated Train End Date: ", end_train)
    print("Updated Test Month: ", test_month)
    print("--------------------------------------------------")



🔁 New loop iteration - start_train: 2014-10-01 00:00:00
Train Start Date:  2014-10-01 00:00:00
Train End Date:  2019-10-01 00:00:00
Test Month 2019-11-01 00:00:00
918 test data points
Train MSE: 0.012669311992354595
Train MAPE: 8.490367452729698
MSE: 0.01660803436109644
MAPE: 10.20702473330138
current start: 2014-10-01 00:00:00
updated start 2014-11-01 00:00:00
--------------------------------------------------
Updated Train Start Date:  2014-11-01 00:00:00
Updated Train End Date:  2019-11-01 00:00:00
Updated Test Month:  2019-12-01 00:00:00
--------------------------------------------------
🔁 New loop iteration - start_train: 2014-11-01 00:00:00
Train Start Date:  2014-11-01 00:00:00
Train End Date:  2019-11-01 00:00:00
Test Month 2019-12-01 00:00:00
1178 test data points
Train MSE: 0.012669311992354595
Train MAPE: 8.490367452729698
MSE: 0.01660803436109644
MAPE: 10.20702473330138
current start: 2014-10-01 00:00:00
updated start 2014-11-01 00:00:00
------------------------------------

KeyboardInterrupt: 

In [None]:
final_preds_df = pd.concat(all_window_preds, ignore_index=True)
final_preds_df.to_csv("./outputs/all_test_predictions_ml.csv", index=False)

# Save epoch stats
stats_df = pd.DataFrame(epoch_stats)
stats_df.to_csv("./outputs/training_stats_ml.csv", index=False)

## Online Learning Approach

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, RobustScaler
from lightgbm import LGBMRegressor
from sklearn.metrics import root_mean_squared_error

# # Load data
# df = pd.read_csv("your_data.csv")
# df["date"] = pd.to_datetime(df["year"].astype(str) + "-" + df["month"].astype(str))
# df = df.sort_values("date").drop(columns=["year", "month"])

# # Features and target
# features = [col for col in df.columns if col not in ["target", "date"]]
# target = "target"
df= combined_df
# Initialize scaler (StandardScaler for normal distribution, MinMaxScaler for range scaling)
scaler = RobustScaler()

# Initialize model
params = {'objective': 'L2', 'n_estimators': 1000, 'learning_rate': 0.05, 
          'num_leaves': 1000, 'min_data_in_leaves': 10, 'feature_fraction': 0.7, 'max_depth': 75,
          'lambda_l2':10e-5, 'path_smooth': 10e-5, 'n_jobs': -1}
model = LGBMRegressor(**params, verbose=-1)

# Define rolling window parameters
min_date = df["DATE"].min()
max_date = df["DATE"].max()
start_train = min_date
test_month = start_train + pd.DateOffset(months=60)

predictions = []
actuals = []

while test_month <= max_date:
    # Select training and testing data
    train_idx = df["DATE"] < test_month
    test_idx = df["DATE"] == test_month
    print("Test Month:", test_month)
    X_train, y_train = df.loc[train_idx, features], df.loc[train_idx, target]
    X_test, y_test = df.loc[test_idx, features], df.loc[test_idx, target]

    if X_train.empty or X_test.empty:
        break

    # **Online Feature Scaling**
    # if start_train == min_date:
    #     scaler.fit(X_train)  # Fit scaler only on first window
    # else:
    #     scaler.partial_fit(X_train)  # Update scaler with new window

    # X_train_scaled = scaler.transform(X_train)
    # X_test_scaled = scaler.transform(X_test)

    # **Online Model Training**
    if start_train == min_date:
        model.fit(X_train, y_train)  # Train on first batch
    else:
        model.set_params(n_estimators=model.n_estimators + 20)  # Add new trees
        model.fit(X_train, y_train, init_model=model)

    # Predictions
    preds = model.predict(X_test)

    # Store results
    predictions.extend(preds)
    actuals.extend(y_test.values)

    rmse = root_mean_squared_error(actuals, predictions)
    print(f"RMSE: {rmse}")
    mape = np.mean(np.abs((np.exp(actuals) - np.exp(predictions)) / np.exp(predictions))) * 100
    print(f"MAPE: {mape}")
    # Move window forward
    test_month += pd.DateOffset(months=1)

# Evaluate model
rmse = root_mean_squared_error(actuals, predictions)
print(f"Final RMSE: {rmse}")
mape = np.mean(np.abs((np.exp(actuals) - np.exp(predictions)) / np.exp(predictions))) * 100
print(f"Final MAPE: {mape}")


Test Month: 2019-10-01 00:00:00
RMSE: 0.1477269928581607
MAPE: 11.297929704263776
Test Month: 2019-11-01 00:00:00
RMSE: 0.1376991619843752
MAPE: 10.487919022062417
Test Month: 2019-12-01 00:00:00
RMSE: 0.13644349109291237
MAPE: 10.453727734098088
Test Month: 2020-01-01 00:00:00
RMSE: 0.13571400975141956
MAPE: 10.412903234560158
Test Month: 2020-02-01 00:00:00
RMSE: 0.13484406451349926
MAPE: 10.29009970190649
Test Month: 2020-03-01 00:00:00
RMSE: 0.13322527662717226
MAPE: 10.177226541421131
Test Month: 2020-04-01 00:00:00
RMSE: 0.13230793836131105
MAPE: 10.14182133546273
Test Month: 2020-05-01 00:00:00
RMSE: 0.13155308815695144
MAPE: 10.059542687602722
Test Month: 2020-06-01 00:00:00
RMSE: 0.13063058719143186
MAPE: 9.99038633711236
Test Month: 2020-07-01 00:00:00
RMSE: 0.13018470926450842
MAPE: 9.94484297618403
Test Month: 2020-08-01 00:00:00
RMSE: 0.12957488803178663
MAPE: 9.922042973513186
Test Month: 2020-09-01 00:00:00
RMSE: 0.12934474435491058
MAPE: 9.913381616781065
Test Month: 20