In [0]:
#importing libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LinearRegression
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from sklearn.multioutput import MultiOutputRegressor
import pmdarima as pm
import xgboost as xgb
from xgboost import XGBRegressor
from prophet import Prophet
from pyspark.sql.functions import to_date, to_timestamp, col, last, when, lit, current_date, date_sub, datediff, substring, sequence, explode, coalesce, sum as spark_sum, min as spark_min
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from datetime import datetime, timedelta
import joblib
import holidays
import mlflow
import re
mlflow.autolog(disable=True)

#importing price and meta data
# Three Spark tables are read: price observations, sales charges and per-flight dimensions.
# NOTE(review): `spark` is not defined in this notebook; presumably the runtime injects the session.

price_history = spark.read.table('data_prod.silver_sanezdb.priceinspection').select('segment', 'when', F.col('standardprice').cast('double'))
sales_history = spark.read.table('data_experience_commercial.cbt_1423_rtsuite.master_uat').select('flightkey', F.col('charge_dt').cast('date'), 'unt_pre', 'rev_pre', 'chargeproduct', 'dtg', 'bkg_is_ejhsss')
dimensions_history = spark.read.table('data_experience_commercial.cbt_0923_segmentfinder.dimensions_history').select('flightkey', 'onsale_dt', 'ty_capacity', 'routetype', 'region', 'flight_dt', 'flight_wk')
# Keep Domestic / UK-London flights departing on or after 2020-01-01 whose on-sale date is at least 168 days before the flight date.
filtered_dh = dimensions_history.filter((F.col('routetype') == 'Domestic') & (F.col('region') == 'UK-London') & (F.col('flight_dt') >= '2020-01-01') & (F.datediff(F.col('flight_dt'), F.col('onsale_dt')) >= 168))

#forward filling flightkey price history to 1d frequency

# Rename the price columns to match the other tables and truncate the observation time to a date.
ph_renamed = price_history.withColumnRenamed('when', 'charge_dt').withColumnRenamed('segment', 'flightkey').withColumn('charge_dt', F.col('charge_dt').cast('date')) 
dph = filtered_dh.join(ph_renamed, on='flightkey', how='inner')
# One average price per flightkey per charge date.
dphsmooth = dph.groupby('flightkey','charge_dt').agg(F.avg('standardprice').alias('price'), F.first('flight_dt').alias('flight_dt'), F.first('onsale_dt').alias('onsale_dt')).orderBy('charge_dt')
# Date span per flightkey: earliest on-sale date up to the earlier of the flight date and today.
date_range = dphsmooth.groupBy('flightkey').agg(F.min('onsale_dt').alias('start_date'), F.least(F.first('flight_dt'), F.lit(datetime.now().date())).alias('end_date'))
# Expand every span into one row per calendar day; this daily index is reused for the sales history below.
index = date_range.withColumn('charge_dt_ts', F.explode(F.sequence(F.col('start_date'), F.col('end_date')))).withColumn('charge_dt', F.col('charge_dt_ts').cast('date')).drop('charge_dt_ts')
dphjoin = index.join(dphsmooth, on=['flightkey', 'charge_dt'], how='left').drop('flight_dt', 'onsale_dt')
# ff window: default frame of an ordered window (start of partition up to the current row).
# bf window: the whole partition, so the first non-null price can be read from any row.
ff_window_spec = Window.partitionBy('flightkey').orderBy('charge_dt')
bf_window_spec = Window.partitionBy('flightkey').orderBy('charge_dt').rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
dphfilled = dphjoin.withColumn('price', F.coalesce(F.last('price', ignorenulls=True).over(ff_window_spec),F.first('price', ignorenulls=True).over(bf_window_spec))).drop('start_date', 'end_date') #forward fills from first flight price, then back fills from first price if first price date after onsale date
finaldph = dphfilled.join(filtered_dh, on='flightkey', how='left').drop('onsale_dt')
# Average the daily price over all flightkeys that share a (charge_dt, flight_dt) pair.
aggregated_price_history = finaldph.groupby('charge_dt', 'flight_dt').agg(F.avg('price').cast('double').alias('price'))

#assembling corresponding sales history

dsh = filtered_dh.join(sales_history, on='flightkey', how='left').drop('onsale_dt')
# Keep 'Ticket' charges with dtg >= 0 and bkg_is_ejhsss == 0, then drop the filter columns.
filtered_dsh = dsh.filter((F.col('chargeproduct') == 'Ticket') & (F.col('dtg') >= 0) & (F.col('bkg_is_ejhsss') == 0)).drop('chargeproduct', 'dtg', 'bkg_is_ejhsss')
dshsmooth = filtered_dsh.groupby('flightkey','charge_dt').agg(F.sum('unt_pre').alias('unt_pre'), F.sum('rev_pre').alias('rev_pre'))
# Put sales on the same daily index as prices; days without a sales row are filled with 0.
dshjoin = index.join(dshsmooth, on=['flightkey', 'charge_dt'], how='left').drop('start_date', 'end_date', 'region', 'routetype', 'flight_dt', 'ty_capacity').fillna(0)
window_spec2 = Window.partitionBy('flightkey').orderBy(F.col('charge_dt'))
# Running total of unt_pre per flightkey in charge-date order.
dsh_pax = dshjoin.withColumn('pax_net', F.sum('unt_pre').over(window_spec2))
# NOTE(review): this joins the unfiltered dimensions_history, whereas the price branch joins filtered_dh; confirm that is intended.
final_dsh = dsh_pax.join(dimensions_history, on='flightkey', how='left').drop('onsale_dt')
aggregated_sales_history = final_dsh.groupby('charge_dt', 'flight_dt').agg(F.sum('unt_pre').alias('unt_pre'), F.sum('ty_capacity').alias('ty_capacity'), F.sum('pax_net').alias('pax_net'), F.sum('rev_pre').alias('rev_pre'))

#final dataframe

# Prices drive the row set (left join); the result is collected to the driver as a pandas frame.
df = aggregated_price_history.join(aggregated_sales_history, on=['charge_dt', 'flight_dt'], how='left').toPandas()
# Days to go: whole days between the charge date and the flight date.
df['dtg'] = (df['flight_dt'] - df['charge_dt']).dt.days.astype(int)
# pax_net is computed above but removed again here, so it is not used by later cells.
df = df.drop(['pax_net'], axis=1)


df.info()
# Snapshot of the collected frame so later cells can restore it without re-running Spark.
df_original = df.copy()

In [0]:
# Reset the working frame from the snapshot taken after the Spark load.
df = df_original.copy()

In [0]:
# Data Optimisation

def optimize_df(df):
    """Reduce a DataFrame's memory footprint in place by narrowing column dtypes.

    Each column is handled by the first rule that matches:

    * name contains ``'_dt'``                      -> ``pd.to_datetime``
    * name contains ``'rev_pre'`` or starts with
      ``'R_'`` / ``'r_'`` / ``'r(target)'``        -> ``float32``
    * ``object`` dtype                             -> ``category``
    * any other float dtype                        -> ``float32``
    * any integer dtype                            -> smallest integer type that fits

    Floats are never narrowed below ``float32``. ``float16`` cannot represent
    values above 65504 (they become ``inf``) and stops representing every
    integer exactly above 2048, which corrupts prices, capacities and sales
    counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, for call chaining.
    """
    start_mem = df.memory_usage(deep=True).sum() / 1024**2
    print(f"Initial memory usage: {start_mem:.2f} MB")

    for col in df.columns:
        name = str(col)  # tolerate non-string column labels
        dtype_name = str(df[col].dtype)

        if '_dt' in name:
            df[col] = pd.to_datetime(df[col])

        elif 'rev_pre' in name or name.startswith(('R_', 'r_', 'r(target)')):
            df[col] = df[col].astype('float32')

        elif df[col].dtype == 'object':
            df[col] = df[col].astype('category')

        elif 'float' in dtype_name:
            # float32 is the floor: see the docstring for why float16 is unsafe here.
            df[col] = df[col].astype(np.float32)

        elif 'int' in dtype_name:
            df[col] = pd.to_numeric(df[col], downcast='integer')

    end_mem = df.memory_usage(deep=True).sum() / 1024**2
    # Guard the percentage against a zero-sized starting frame.
    reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print(f"Final memory usage: {end_mem:.2f} MB ({reduction:.2f}% reduction)")

    return df

# optimize_df modifies df in place, so the return value is not needed here.
optimize_df(df)
# Show the dtypes and memory after narrowing.
df.info()

In [0]:
# Keep rows between 1 and 275 days to go that were charged strictly before today.
dtg_in_window = (df['dtg'] > 0) & (df['dtg'] <= 275)
df = df[dtg_in_window]
# NOTE: this rebinds `current_date`, which the import cell also imports from pyspark.sql.functions.
current_date = datetime.now().strftime('%Y-%m-%d')
charged_before_today = df['charge_dt'] < current_date
df = df[charged_before_today]

In [0]:
# Sales over charge date: unt_pre summed across all flight dates for each charge date.

plt.style.use('ggplot')
total_sales = (
    df.groupby('charge_dt')['unt_pre']
    .sum()
    .reset_index()
)
total_sales.plot(
    x='charge_dt',
    y='unt_pre',
    style='-',
    figsize=(20, 5),
    title='sales by charge date',
)
plt.show()

In [0]:
# Drop the COVID-19-distorted history. The cut-off is 2021-10-01 rather than 2022-01-01 so that
# late-2021 rows are still available when the lag features are generated further down.

covid_cutoff = '2021-10-01'
df = df[df['charge_dt'] >= covid_cutoff]

plt.style.use('ggplot')
total_sales_by_charge_dt = (
    df.groupby('charge_dt')['unt_pre']
    .sum()
    .reset_index()
)
total_sales_by_charge_dt.plot(
    x='charge_dt',
    y='unt_pre',
    style='-',
    figsize=(20, 5),
    title='sales by charge date',
)
plt.show()

In [0]:
# Sales summed per flight date, restricted to flight dates strictly between 2022-11-01 and 2025-11-01.
sales_by_flight_dt = df.groupby('flight_dt')['unt_pre'].sum().reset_index()
flight_window = (
    (sales_by_flight_dt['flight_dt'] > '2022-11-01')
    & (sales_by_flight_dt['flight_dt'] < '2025-11-01')
)
sales_by_flight_dt = sales_by_flight_dt[flight_window]

plt.style.use('ggplot')
sales_by_flight_dt.plot(
    x='flight_dt',
    y='unt_pre',
    style='-',
    figsize=(20, 5),
    title='sales by flight date',
)
plt.show()

In [0]:
# Revenue (rev_pre) summed across all flight dates for each charge date.
plt.style.use('ggplot')
total_revenue_by_charge_dt = (
    df.groupby('charge_dt')['rev_pre']
    .sum()
    .reset_index()
)
total_revenue_by_charge_dt.plot(
    x='charge_dt',
    y='rev_pre',
    style='-',
    figsize=(20, 5),
    title='Revenue by charge date',
)
plt.show()

In [0]:
# Revenue summed per flight date, restricted to flight dates strictly between 2022-11-01 and 2025-11-01.
# The original cell aggregated and plotted 'unt_pre' under a "sales" title, duplicating the sales
# chart; it now uses the revenue column its variable name refers to.
revenue_by_flight_dt = df.groupby('flight_dt')['rev_pre'].sum().reset_index()
revenue_by_flight_dt = revenue_by_flight_dt[(revenue_by_flight_dt['flight_dt'] > '2022-11-01') & (revenue_by_flight_dt['flight_dt'] < '2025-11-01')]
plt.style.use('ggplot')
revenue_by_flight_dt.plot(style='-', figsize=(20,5), title = 'Revenue by flight date', y='rev_pre', x='flight_dt')
plt.show()

In [0]:
# Correlation between the per-charge-date means of the core numeric columns.
daily_mean_cols = ['dtg', 'unt_pre', 'price', 'ty_capacity', 'rev_pre']
daily_mean = df.groupby('charge_dt')[daily_mean_cols].mean()
daily_corr = daily_mean.corr()
sns.heatmap(
    daily_corr,
    xticklabels=daily_mean.columns,
    yticklabels=daily_mean.columns,
    annot=True,
    fmt=".2f",
    cmap='viridis',
    linewidths=.1,
)


In [0]:
from statsmodels.tsa.seasonal import seasonal_decompose
def plot_decomposition(target):
    """Plot an additive seasonal decomposition of ``target`` as four stacked panels.

    The panels share the x axis and show, top to bottom: the input series,
    the trend, the seasonal component and the residual, as returned by
    ``seasonal_decompose(target, model='additive')``.
    """
    parts = seasonal_decompose(target, model='additive')
    fig, axes = plt.subplots(4, 1, sharex=True, figsize=(30, 10))
    panels = (target, parts.trend, parts.seasonal, parts.resid)
    for axis, series in zip(axes, panels):
        axis.plot(series)
    plt.show()

# Index the daily totals by charge date so the decomposition receives a date-indexed series.
# The guard makes the cell re-runnable: an unconditional in-place set_index raises KeyError
# the second time because 'charge_dt' is no longer a column.
if 'charge_dt' in total_sales_by_charge_dt.columns:
    total_sales_by_charge_dt = total_sales_by_charge_dt.set_index('charge_dt')
plot_decomposition(total_sales_by_charge_dt['unt_pre'])

In [0]:
# ACF and PACF (70 lags) of total daily sales by charge date.
df_ACF = df.groupby('charge_dt')['unt_pre'].sum()
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(18, 8))

acf_pacf_panels = (
    (plot_acf, ax1, {}, 'Autocorrelation Function (ACF)'),
    (plot_pacf, ax2, {'method': 'ywm'}, 'Partial Autocorrelation Function (PACF)'),
)
for draw_panel, panel_ax, extra_kwargs, panel_title in acf_pacf_panels:
    draw_panel(df_ACF, ax=panel_ax, lags=70, **extra_kwargs)
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

In [0]:
# Typical booking curve: total sales at each number of days to go.

plt.style.use('ggplot')
total_sales_by_dtg = (
    df.groupby('dtg')['unt_pre']
    .sum()
    .reset_index()
)
total_sales_by_dtg.plot(
    x='dtg',
    y='unt_pre',
    style='-',
    figsize=(20, 5),
    title='sales by dtg since 2022',
)
plt.show()

In [0]:
# Booking curve over time: one animation frame per charge date.

# plotly needs the animation key as a plain string, hence the cast before grouping.
df_dynamic_plot = (
    df.reset_index()
    .assign(charge_dt=lambda frame: frame['charge_dt'].astype(str))
    .groupby(['dtg', 'charge_dt'])['unt_pre']
    .sum()
    .reset_index()
)
fig = px.line(
    df_dynamic_plot,
    x='dtg',
    y='unt_pre',
    animation_frame='charge_dt',
    title='Sales by DTG with charge date variations',
)
fig.update_layout(
    xaxis_title='Days To Go',
    yaxis_title='Sales',
    legend_title='Charge Date',
    height=900,
    width=1800,
)

fig.show()

In [0]:
def lag_array(df, charge_lags, flight_lags, features_config, fill_value=-1):
    """Append lagged copies of selected columns over a (charge date, flight date) grid.

    For every pair ``(c_lag, f_lag)`` and every feature, the new column
    ``"{prefix}_C{c_lag}F{f_lag}"`` holds the feature's value at
    ``(charge_dt - c_lag days, flight_dt - f_lag days)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'charge_dt'`` and ``'flight_dt'`` datetime columns plus
        every key of ``features_config``. Each (charge_dt, flight_dt) pair must
        occur at most once.
    charge_lags, flight_lags : iterable of int
        Lags in days; negative values look forward.
    features_config : dict[str, str]
        Maps a source column name to the prefix of its lag columns.
    fill_value : scalar, default -1
        Value written where the lagged key does not exist, or where the
        looked-up value is itself missing.

    Returns
    -------
    pandas.DataFrame
        A new frame: the original columns followed by the lag columns. Only the
        lag columns are filled; missing values in the original columns are
        preserved rather than being overwritten with ``fill_value``.

    Raises
    ------
    ValueError
        If (charge_dt, flight_dt) pairs are not unique.
    """
    feature_names = list(features_config.keys())
    df_lookup = df.set_index(['charge_dt', 'flight_dt'])[feature_names]
    if not df_lookup.index.is_unique:
        raise ValueError("lag_array requires unique (charge_dt, flight_dt) pairs")

    lagged_blocks = []
    for c_lag in charge_lags:
        # The shifted charge dates do not depend on f_lag, so compute them once per c_lag.
        target_charge_dts = df['charge_dt'] - pd.to_timedelta(c_lag, unit='d')
        for f_lag in flight_lags:
            target_flight_dts = df['flight_dt'] - pd.to_timedelta(f_lag, unit='d')
            target_index = pd.MultiIndex.from_arrays([target_charge_dts, target_flight_dts])

            # Look the shifted keys up; keys with no match come back as NaN rows.
            lagged_df = df_lookup.reindex(target_index)
            # Re-attach the caller's row labels so the block aligns with df on concat.
            lagged_df.index = df.index
            lagged_df.columns = [
                f"{features_config[name]}_C{c_lag}F{f_lag}" for name in feature_names
            ]
            lagged_blocks.append(lagged_df)

    if not lagged_blocks:
        return df.copy()

    # Fill only the lookup results, never the caller's own columns.
    lagged = pd.concat(lagged_blocks, axis=1).fillna(fill_value)
    return pd.concat([df, lagged], axis=1)

In [0]:
# Source column -> prefix of the generated lag columns (sales 'S', revenue 'R').
features_to_lag = {'unt_pre': 'S', 'rev_pre': 'R'}

# Lags are in days. Negative flight lags shift the looked-up flight date forward instead of back.
flight_lags = [-14, -7, 0, 7, 14, 21, 28, 35, 42, 49, 56, 63, 70]
charge_lags = [7, 14, 21, 28, 35, 42, 49, 56, 63, 70] 

# NOTE(review): df is overwritten with the widened frame, so re-running this cell on its own
# output adds a second, duplicate-named set of lag columns.
df = lag_array(df, charge_lags, flight_lags, features_to_lag)

In [0]:
def plot_correlation_heatmap(df, target_col, feature_prefix, title):
    """Draw a charge-lag x flight-lag heatmap of lag-feature correlations with a target.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``target_col`` and lag columns named
        ``"{feature_prefix}C{charge_lag}F{flight_lag}"`` (for example ``'S_C7F0'``).
    target_col : str
        Column the lag features are correlated against.
    feature_prefix : str
        Literal prefix selecting the lag columns, including the underscore
        (for example ``'S_'``).
    title : str
        Figure title.

    Raises
    ------
    ValueError
        If no column matches the lag naming pattern for ``feature_prefix``.
    """
    # 1. Select the lag columns for this prefix
    feature_cols = [name for name in df.columns if name.startswith(feature_prefix)]

    # 2. Correlate the subset and keep each feature's correlation with the target
    corr_matrix = df[[target_col] + feature_cols].corr()
    target_corrs = corr_matrix[target_col].drop(target_col)

    # 3. Parse the lags out of the column names. The prefix is escaped so that any
    #    regex metacharacter in it is matched literally.
    lag_pattern = re.compile(rf'{re.escape(feature_prefix)}C(\d+)F(-?\d+)')
    records = []
    for feature_name, correlation in target_corrs.items():
        parsed = lag_pattern.match(feature_name)
        if parsed is None:
            continue  # has the prefix but is not a lag column
        records.append({
            'charge_lag': int(parsed.group(1)),
            'flight_lag': int(parsed.group(2)),
            'correlation': correlation,
        })
    if not records:
        raise ValueError(f"no lag columns found for prefix {feature_prefix!r}")

    # 4. Pivot to a charge_lag (rows) x flight_lag (columns) matrix
    corr_pivot = pd.DataFrame(records).pivot_table(
        index='charge_lag', columns='flight_lag', values='correlation'
    )

    # 5. Plot the heatmap
    plt.figure(figsize=(16, 10))
    sns.heatmap(corr_pivot, annot=True, fmt=".2f", cmap='viridis', linewidths=.1)
    plt.title(title, fontsize=16)
    plt.xlabel('Flight Date Lag')
    plt.ylabel('Charge Date Lag')
    plt.show()

In [0]:
# Sales target against the sales ('S_') lag features.
plot_correlation_heatmap(df=df, target_col='unt_pre', feature_prefix='S_', title='Correlation of Target (unt_pre) with Sales Lag Features')

In [0]:
# Sales target against the revenue ('R_') lag features.
plot_correlation_heatmap(df=df, target_col='unt_pre', feature_prefix='R_', title='Correlation of Target (unt_pre) with Revenue Lag Features')

In [0]:
# Revenue target against the revenue ('R_') lag features.
plot_correlation_heatmap(df=df, target_col='rev_pre', feature_prefix='R_', title='Correlation of Target (rev_pre) with Revenue Lag Features')

In [0]:
# Revenue target against the sales ('S_') lag features.
plot_correlation_heatmap(df=df, target_col='rev_pre', feature_prefix='S_', title='Correlation of Target (rev_pre) with Sales Lag Features')

In [0]:
# Price vs. sales / revenue histograms, animated by days to go.
df = df.sort_values(by='dtg')

# 'P_C1F0' follows the lag naming used in this notebook: the price one charge day earlier for the
# same flight date. The lag-feature cell only generates 'S_' and 'R_' columns, so on a fresh run the
# column does not exist and this cell raised KeyError. It is derived here on a plotting-only frame
# so that the modelling frame `df` does not gain an extra column.
if 'P_C1F0' in df.columns:
    price_plot_df = df
else:
    price_by_key = df.set_index(['charge_dt', 'flight_dt'])['price']
    previous_day_keys = pd.MultiIndex.from_arrays([df['charge_dt'] - pd.Timedelta(days=1), df['flight_dt']])
    # Rows whose previous charge day is absent get NaN and drop out of the binning below.
    price_plot_df = df.assign(P_C1F0=price_by_key.reindex(previous_day_keys).to_numpy().astype('float32'))

max_price = price_plot_df['P_C1F0'].max() * 1.05

price_bins = pd.cut(price_plot_df['P_C1F0'], bins=250)
grouped_sales = price_plot_df.groupby(['dtg', price_bins])['unt_pre'].sum()
grouped_revenue = price_plot_df.groupby(['dtg', price_bins])['rev_pre'].sum()

# NOTE(review): these maxima are computed but the axis ranges below are hard-coded.
max_avg_sales = grouped_sales.max() * 1.05
max_avg_revenue = grouped_revenue.max() * 1.05

fig_sales = px.histogram(price_plot_df, x='P_C1F0', y='unt_pre', histfunc='sum', animation_frame='dtg', barmode='group', nbins=250, title='Price vs. Sales (by Days-to-Go)', range_x=[15, 225], range_y=[0,7000])
fig_sales.update_layout(xaxis_title="Price (£) - Bucketed", yaxis_title="Total Sales (Sum)")

fig_sales.update_layout(height=800, width=1600)
fig_sales.show()

fig_revenue = px.histogram(price_plot_df, x='P_C1F0', y='rev_pre', histfunc='sum', animation_frame='dtg', barmode='group', nbins=250, title='Price vs. Revenue (by Days-to-Go)', range_x=[15, 225], range_y=[0, 600000])

fig_revenue.update_layout(xaxis_title="Price (£) - Bucketed", yaxis_title="Total Revenue (Sum)")

fig_revenue.update_layout(height=800, width=1600)
fig_revenue.show()

In [0]:
# Keep only the "diagonal" lag features, i.e. those whose charge lag equals their flight lag
# (S_C7F7, R_C14F14, ...). Every column that is not an S_/R_ lag feature is kept.
n_cols_before = len(df.columns)  # captured before filtering so the report below is truthful

cols_to_keep = []

for col in df.columns:
    match = re.search(r'[SR]_C(\d+)F(-?\d+)', col)
    # The digit strings are compared as text, so a negative flight lag never equals a charge lag.
    if match is None or match.group(1) == match.group(2):
        cols_to_keep.append(col)

df = df[cols_to_keep]

# Verify the result
print(f"Original number of columns: {n_cols_before}")
print(f"Number of columns after filtering: {len(df.columns)}")
df.head()

In [0]:
# Modelling window: charge dates from 2022-01-01 onwards and 1 to 200 days to go.
charged_from_2022 = df['charge_dt'] >= '2022-01-01'
df = df[charged_from_2022]
dtg_1_to_200 = (df['dtg'] > 0) & (df['dtg'] <= 200)
df = df[dtg_1_to_200]

In [0]:
# Seasonality Analysis

def create_features(df, origin='2022-01-01'):
    """Add calendar features derived from ``flight_dt`` and ``charge_dt`` in place.

    For each of the two date columns the following integer columns are added,
    prefixed ``flight_`` / ``charge_``: ``month``, ``dow`` (Monday = 0),
    ``dom``, ``doy`` and ``year``. ``day_number`` is the number of whole days
    between ``charge_dt`` and ``origin``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain datetime columns ``'flight_dt'`` and ``'charge_dt'``.
        Modified in place.
    origin : str or datetime-like, default '2022-01-01'
        Reference date for ``day_number``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, matching the other in-place helpers
        in this notebook. Callers that ignore the return value are unaffected.
    """
    # (output suffix, attribute of the .dt accessor)
    calendar_parts = (
        ('month', 'month'),
        ('dow', 'dayofweek'),
        ('dom', 'day'),
        ('doy', 'dayofyear'),
        ('year', 'year'),
    )
    for prefix in ('flight', 'charge'):
        dates = df[f'{prefix}_dt'].dt
        for suffix, attribute in calendar_parts:
            df[f'{prefix}_{suffix}'] = getattr(dates, attribute).astype(int)

    df['day_number'] = (df['charge_dt'] - pd.to_datetime(origin)).dt.days.astype(int)
    return df

# The seasonality plots, the cyclic encoding and the scalers below all read the calendar columns
# (flight_month, charge_dow, day_number, ...), so the features must actually be created here;
# with this call commented out a fresh Run All fails with KeyError in the next cell.
df = df.copy()  # detach from the filtered selection before adding columns in place
create_features(df)

In [0]:
# Flight Month vs Charge Month: mean sales per row, grouped by calendar month.

df_melt = df.melt(id_vars='unt_pre', value_vars=['flight_month', 'charge_month'], var_name='month_type', value_name='month')
mean_sales = df_melt.groupby(['month', 'month_type'])['unt_pre'].mean().reset_index()

plt.style.use('ggplot')  # set before the figure is created so it applies to this chart
plt.figure(figsize=(20, 8))
sns.barplot(data=mean_sales, x='month', y='unt_pre', hue='month_type')
plt.title('Historic Sales by Month since 2022')
plt.xlabel('Month')
# The bars are group means, not totals; the axis label previously said "Total Sales".
plt.ylabel('Mean Sales')
plt.show()

In [0]:
# Flight DoW vs Charge DoW: mean sales per row, grouped by day of week.

df_melt = df.melt(id_vars='unt_pre', value_vars=['flight_dow', 'charge_dow'], var_name='dow_type', value_name='dow')
mean_sales = df_melt.groupby(['dow', 'dow_type'])['unt_pre'].mean().reset_index()
plt.style.use('ggplot')  # set before the figure is created so it applies to this chart
plt.figure(figsize=(15, 8))
sns.barplot(data=mean_sales, x='dow', y='unt_pre', hue='dow_type')
plt.title('Historic Sales by DoW since 2022')
plt.xlabel('DoW')
# The bars are group means, not totals; the axis label previously said "Total Sales".
plt.ylabel('Mean Sales')
# plt.show() returns None, so wrapping it in display() only displayed None.
plt.show()

In [0]:
# Weekly seasonality: daily sales for charge dates strictly between 2024-01-01 and 2024-01-31.

in_jan24 = (df['charge_dt'] > '2024-01-01') & (df['charge_dt'] < '2024-01-31')
jan24_daily_sales = df[in_jan24].groupby('charge_dt')['unt_pre'].sum()
jan24_daily_sales.plot(
    figsize=(20, 5),
    title='sales by charge date (Jan24)',
    y='unt_pre',
    linewidth=10,
)
plt.style.use('ggplot')
plt.show()

In [0]:
# Mean sales and mean price by charge day of month, on twin y axes.

price_vs_slaes = df.copy()

by_charge_dom = price_vs_slaes.groupby('charge_dom')
mean_sales = by_charge_dom['unt_pre'].mean().reset_index()
mean_price = by_charge_dom['price'].mean().reset_index()

fig, ax1 = plt.subplots(figsize=(15, 8))
plt.style.use('ggplot')

# Left axis: sales
sales_colour = 'tab:red'
ax1.plot(mean_sales['charge_dom'], mean_sales['unt_pre'], marker='o', linewidth=12, color=sales_colour, label='Mean Sales')
ax1.set_xlabel('Charge Day of Month')
ax1.set_ylabel('Mean Sales', color=sales_colour)
ax1.tick_params(axis='y', labelcolor=sales_colour)

# Right axis: price
price_colour = 'tab:blue'
ax2 = ax1.twinx()
ax2.plot(mean_price['charge_dom'], mean_price['price'], marker='s', linewidth=12, color=price_colour, label='Mean Price')
ax2.set_ylabel('Mean Price', color=price_colour)
ax2.tick_params(axis='y', labelcolor=price_colour)

fig.suptitle('Mean Sales and Price by Charge Day of Month (2022 - ToDate)')
fig.legend(loc='upper right', bbox_to_anchor=(0.9, 0.9))
plt.tight_layout()
plt.show()

In [0]:
# Holiday Features

# The modelling frame keeps charge dates from 2022-01-01 onwards, so the holiday calendar must
# start in 2022 as well. Generating only 2024-2025 left every 2022-2023 holiday produced by the
# library (Christmas, New Year, ...) unflagged in the training data.
HOLIDAY_YEARS = range(2022, 2026)  # 2022..2025 inclusive

uk_holidays = holidays.UK(years=HOLIDAY_YEARS)
holidays_df = pd.DataFrame(list(uk_holidays.items()), columns=['ds', 'holiday'])
holidays_df.sort_values(by='ds', inplace=True)
holidays_df

In [0]:
# This binds a second name to the same DataFrame object; it is not a copy.
holidays_curated = holidays_df

In [0]:
# Bank holidays added by hand on top of the library calendar.
additional_holidays = pd.DataFrame([
    {'ds': '2022-04-18', 'holiday': 'Easter Monday'},
    {'ds': '2022-08-29', 'holiday': 'Summer Bank Holiday'},
    {'ds': '2023-04-10', 'holiday': 'Easter Monday'},
    {'ds': '2023-08-28', 'holiday': 'Summer Bank Holiday'},
    {'ds': '2024-04-01', 'holiday': 'Easter Monday'},
    {'ds': '2024-08-26', 'holiday': 'Summer Bank Holiday'},
    {'ds': '2025-04-21', 'holiday': 'Easter Monday'},
    {'ds': '2025-08-25', 'holiday': 'Summer Bank Holiday'},
])
holidays_df = pd.concat([holidays_df, additional_holidays], ignore_index=True)
# Normalise 'ds' before de-duplicating: the library rows hold date objects and the manual rows
# hold strings, so the same day was never recognised as a duplicate while the types differed.
holidays_df['ds'] = pd.to_datetime(holidays_df['ds'])
# One row per date. Only a holiday flag is derived downstream, and a repeated date would
# duplicate rows if this table were merged onto the modelling frame.
holidays_df = (
    holidays_df
    .drop_duplicates(subset='ds', keep='first')
    .sort_values(by='ds')
    .reset_index(drop=True)
)
holidays_df
   

In [0]:
# Flag rows whose charge date / flight date falls on a listed holiday.
holidays_df["ds"] = pd.to_datetime(holidays_df["ds"])
holiday_dates = holidays_df["ds"].drop_duplicates()

# Membership tests replace the two left merges: a merge repeats every matching row of df once
# per holiday entry on that date, so a date listed twice silently duplicated training rows.
df = df.reset_index(drop=True)  # the merges also produced a fresh 0..n-1 index
df['is_charge_date_holiday'] = df['charge_dt'].isin(holiday_dates).astype(int)
df['is_flight_date_holiday'] = df['flight_dt'].isin(holiday_dates).astype(int)
df = df.sort_values(by=['charge_dt', 'dtg'], ascending=[True, True])
# Assigning the result keeps the cell from echoing the whole frame as its output.
df = optimize_df(df)

In [0]:
#df = original_df.copy()

In [0]:
# Snapshot taken before the cyclic encoding; the next cell reads charge_dt / flight_dt back from it.
original_df = df.copy()

In [0]:
# Column groups used by the preprocessing and by the cross-validation loops.
lag_prefixes = ('S_', 'R_')
lag_cols = [col for col in df.columns if col.startswith(lag_prefixes)]
# Naming caveat: the loops fit StandardScaler on normal_num_cols and MinMaxScaler on standard_num_cols.
normal_num_cols = ['dtg', 'ty_capacity'] + lag_cols
standard_num_cols=['charge_year','flight_year', 'day_number']
# Calendar columns replaced by sine/cosine pairs.
cyclic_cols=['flight_dom', 'flight_doy', 'charge_dom', 'charge_doy', 'flight_month', 'charge_month', 'flight_dow', 'charge_dow']
# Columns that become the MultiIndex of the modelling frame.
index_cols = ['charge_dt', 'flight_dt']
def encode_cyclic_features(df, cols, periods=None):
    """Replace cyclic integer columns with sine/cosine pairs, in place.

    Each column ``c`` in ``cols`` is coerced to numeric, mapped to
    ``c_sin = sin(2*pi*c / period)`` and ``c_cos = cos(2*pi*c / period)``
    (stored as float16) and then dropped.

    The period is the number of positions in the cycle. By default it is
    inferred as ``max - min + 1`` of the observed values. Using the maximum
    alone is wrong for zero-based columns: a day of week in 0..6 would get
    period 6 and map day 0 and day 6 to the same angle. For one-based columns
    that start at 1 the inferred period equals the maximum.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    cols : iterable of str
        Columns to encode.
    periods : dict[str, float], optional
        Explicit period per column. Use it when the data does not cover a full
        cycle, in which case the inferred period would be too small.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    periods = periods or {}
    for col in cols:
        values = pd.to_numeric(df[col], errors='coerce')
        period = periods.get(col)
        if period is None:
            # Cast to float so narrow integer dtypes cannot overflow in the subtraction.
            period = float(values.max()) - float(values.min()) + 1
        angle = 2 * np.pi * values / period
        df[col + '_sin'] = np.sin(angle).astype('float16')
        df[col + '_cos'] = np.cos(angle).astype('float16')
        df.drop(col, axis=1, inplace=True)
    return df

# Swap the calendar columns for their sine/cosine encodings.
df = encode_cyclic_features(df, cyclic_cols)

# Copy the date columns back from the pre-encoding snapshot, then use them as the MultiIndex.
for col in index_cols:
    df[col] = original_df[col]
df.set_index(index_cols, inplace=True)
# Full preprocessed frame, including 'price', 'unt_pre' and 'rev_pre'; later model cells start from it.
df_preprocessed = df.copy()
# Sales-model frame: 'unt_pre' stays as the target, 'price' and 'rev_pre' are removed.
df = df.drop(['price', 'rev_pre'], axis=1)
optimize_df(df)

In [0]:
#LR train test
# Expanding-window cross-validation of a linear regression on unt_pre:
# 12 folds, each validating on the next 7 distinct charge dates.

unique_dates = df.index.get_level_values('charge_dt').unique().sort_values()
tscv = TimeSeriesSplit(n_splits=12, test_size=7)

fig, axs = plt.subplots(4, 3, figsize=(16, 8), sharex=True)
plt.style.use('ggplot')
fold = 0
preds = []
LR_scores = []
all_y_test = []
all_y_pred_series = []

for train_index, val_index in tscv.split(unique_dates):
    # Folds are defined on distinct charge dates, then mapped back to rows.
    train_dates = unique_dates[train_index]
    val_dates = unique_dates[val_index]

    train_mask = df.index.get_level_values('charge_dt').isin(train_dates)
    val_mask = df.index.get_level_values('charge_dt').isin(val_dates)

    # Explicit copies: the scaled columns are assigned below, and writing into a boolean-mask
    # selection of df triggers SettingWithCopyWarning. This matches the XGB cells.
    train_data = df[train_mask].copy()
    val_data = df[val_mask].copy()

    total_sales_train = train_data.groupby('charge_dt')['unt_pre'].sum().reset_index()
    total_sales_val = val_data.groupby('charge_dt')['unt_pre'].sum().reset_index()

    ax = axs[fold // 3, fold % 3]

    total_sales_train.plot(ax=ax, label='Train', x='charge_dt', y='unt_pre', style='-')
    total_sales_val.plot(ax=ax, label='Val', x='charge_dt', y='unt_pre', style='-')
    ax.axvline(val_data.index.get_level_values('charge_dt').min(), linestyle='--', color='black')

    # Zoom each panel to the six months ending at the fold's last validation date.
    end_date = val_dates.max()
    start_date = end_date - pd.DateOffset(months=6)
    ax.set_xlim(start_date, end_date)

    ax.set_title(f'Fold {fold+1}')
    fold += 1

    # Scalers are fitted on the training fold only and then applied to the validation fold.
    std_scaler = StandardScaler()
    min_max_scaler = MinMaxScaler()

    train_data[normal_num_cols] = std_scaler.fit_transform(train_data[normal_num_cols])
    train_data[standard_num_cols] = min_max_scaler.fit_transform(train_data[standard_num_cols])

    val_data[normal_num_cols] = std_scaler.transform(val_data[normal_num_cols])
    val_data[standard_num_cols] = min_max_scaler.transform(val_data[standard_num_cols])

    X_train = train_data.drop('unt_pre', axis=1)
    y_train = train_data['unt_pre']

    X_test = val_data.drop('unt_pre', axis=1)
    y_test = val_data['unt_pre']

    LR_model = LinearRegression()
    LR_model.fit(X_train, y_train)

    y_pred = LR_model.predict(X_test)
    y_pred_series = pd.Series(y_pred, index=X_test.index, name='unt_pre_pred')
    all_y_test.append(y_test)
    all_y_pred_series.append(y_pred_series)
    preds.extend(y_pred)
    # Row-level RMSE for this fold.
    LR_score = np.sqrt(mean_squared_error(y_test, y_pred))
    LR_scores.append(LR_score)

plt.tight_layout()
plt.show()

In [0]:
#LR Results
# Per-fold row-level RMSE, its mean over the folds and its standard deviation.

print(f'individual scores: {LR_scores}')
print(f'combined score: {np.mean(LR_scores)}')
print(f'std: {np.std(LR_scores)}')  

In [0]:
# Stitch the validation folds together and compare actual vs. predicted totals per charge date.
full_y_test = pd.concat(all_y_test)
full_y_pred = pd.concat(all_y_pred_series)
y_test_total = full_y_test.groupby(level='charge_dt').sum()
y_pred_total = full_y_pred.groupby(level='charge_dt').sum()

rmse_LR = np.sqrt(mean_squared_error(y_test_total, y_pred_total))
print(f'Overall RMSE on Daily Totals (All Folds): {rmse_LR:.2f}')

plt.figure(figsize=(15, 6))
daily_total_lines = (
    (y_test_total, 'Actual Daily Total', '-'),
    (y_pred_total, 'Predicted Daily Total', '--'),
)
for daily_totals, line_label, line_style in daily_total_lines:
    plt.plot(daily_totals.index, daily_totals, label=line_label, marker='.', linestyle=line_style)
plt.title('LR Model: Actual vs. Predicted Daily Totals (All Folds Combined)')
plt.xlabel('Date')
plt.ylabel('Total Sales (unt_pre)')
plt.legend()
plt.grid(True)
plt.show()

In [0]:
# LR plot (dynamic): actual vs. predicted sales by days to go, one animation frame per charge date.

results_df = pd.DataFrame({'actual': full_y_test, 'predicted': full_y_pred})
results_df.reset_index(inplace=True)
for date_col in ('flight_dt', 'charge_dt'):
    results_df[date_col] = pd.to_datetime(results_df[date_col])

results_df['dtg'] = (results_df['flight_dt'] - results_df['charge_dt']).dt.days
# plotly needs the animation key as a string.
results_df['charge_dt'] = results_df['charge_dt'].astype(str)
results_df.sort_values(['charge_dt', 'dtg'], inplace=True)

frame_keys = ['charge_dt', 'dtg']
value_cols = ['actual', 'predicted']
agg_df = results_df.groupby(frame_keys)[value_cols].sum().reset_index()

melted_df = pd.melt(
    agg_df,
    id_vars=frame_keys,
    value_vars=value_cols,
    var_name='sales_type',
    value_name='sales_value',
)

fig = px.line(
    melted_df,
    x='dtg',
    y='sales_value',
    color='sales_type',
    animation_frame='charge_dt',
    title='LR Model: Actual vs. Predicted Sales by Days To Go (Linear Regression)',
    color_discrete_map={'actual': 'red', 'predicted': 'blue'},
)

fig.update_layout(height=800, width=1600)

fig.show()

In [0]:
#LR Revenue model
# Revenue-model frame: 'rev_pre' stays as the target, 'price' and 'unt_pre' are removed.
# NOTE(review): this rebinds the globals `df` and `normal_num_cols`. Any later cell that models
# 'unt_pre' from `df` must rebuild its own frame, because 'unt_pre' is dropped here.

df = df_preprocessed.copy()
df = df.drop(['price', 'unt_pre'], axis=1)
# From here on only these two columns are standard-scaled; the lag columns are no longer included.
normal_num_cols = ['dtg', 'ty_capacity']
optimize_df(df)

In [0]:
# Expanding-window cross-validation of a linear regression on rev_pre:
# 12 folds, each validating on the next 7 distinct charge dates.
unique_dates = df.index.get_level_values('charge_dt').unique().sort_values()
tscv = TimeSeriesSplit(n_splits=12, test_size=7)

fig, axs = plt.subplots(4, 3, figsize=(16, 8), sharex=True)
plt.style.use('ggplot')
fold = 0
preds = []
LR_scores = []
all_y_test = []
all_y_pred_series = []

for train_index, val_index in tscv.split(unique_dates):
    # Folds are defined on distinct charge dates, then mapped back to rows.
    train_dates = unique_dates[train_index]
    val_dates = unique_dates[val_index]

    train_mask = df.index.get_level_values('charge_dt').isin(train_dates)
    val_mask = df.index.get_level_values('charge_dt').isin(val_dates)

    # Explicit copies: the scaled columns are assigned below, and writing into a boolean-mask
    # selection of df triggers SettingWithCopyWarning. This matches the XGB cells.
    train_data = df[train_mask].copy()
    val_data = df[val_mask].copy()

    total_sales_train = train_data.groupby('charge_dt')['rev_pre'].sum().reset_index()
    total_sales_val = val_data.groupby('charge_dt')['rev_pre'].sum().reset_index()

    ax = axs[fold // 3, fold % 3]

    total_sales_train.plot(ax=ax, label='Train', x='charge_dt', y='rev_pre', style='-')
    total_sales_val.plot(ax=ax, label='Val', x='charge_dt', y='rev_pre', style='-')
    ax.axvline(val_data.index.get_level_values('charge_dt').min(), linestyle='--', color='black')

    # Zoom each panel to the six months ending at the fold's last validation date.
    end_date = val_dates.max()
    start_date = end_date - pd.DateOffset(months=6)
    ax.set_xlim(start_date, end_date)

    ax.set_title(f'Fold {fold+1}')
    fold += 1

    # Scalers are fitted on the training fold only and then applied to the validation fold.
    std_scaler = StandardScaler()
    min_max_scaler = MinMaxScaler()

    train_data[normal_num_cols] = std_scaler.fit_transform(train_data[normal_num_cols])
    train_data[standard_num_cols] = min_max_scaler.fit_transform(train_data[standard_num_cols])

    val_data[normal_num_cols] = std_scaler.transform(val_data[normal_num_cols])
    val_data[standard_num_cols] = min_max_scaler.transform(val_data[standard_num_cols])

    X_train = train_data.drop('rev_pre', axis=1)
    y_train = train_data['rev_pre']

    X_test = val_data.drop('rev_pre', axis=1)
    y_test = val_data['rev_pre']

    LR_model = LinearRegression()
    LR_model.fit(X_train, y_train)

    y_pred = LR_model.predict(X_test)
    # These are revenue predictions; the series was previously named 'unt_pre_pred'.
    y_pred_series = pd.Series(y_pred, index=X_test.index, name='rev_pre_pred')
    all_y_test.append(y_test)
    all_y_pred_series.append(y_pred_series)
    preds.extend(y_pred)
    # Row-level RMSE for this fold.
    LR_score = np.sqrt(mean_squared_error(y_test, y_pred))
    LR_scores.append(LR_score)

plt.tight_layout()
plt.show()

In [0]:
# Per-fold row-level RMSE of the revenue model, its mean over the folds and its standard deviation.
print(f'individual scores: {LR_scores}')
print(f'combined score: {np.mean(LR_scores)}')
print(f'std: {np.std(LR_scores)}')  

In [0]:
# Stitch the validation folds together and compare actual vs. predicted revenue per charge date.
full_y_test = pd.concat(all_y_test)
full_y_pred = pd.concat(all_y_pred_series)
y_test_total = full_y_test.groupby(level='charge_dt').sum()
y_pred_total = full_y_pred.groupby(level='charge_dt').sum()
rmse_LR = np.sqrt(mean_squared_error(y_test_total, y_pred_total))
print(f'Overall RMSE on Daily Totals (All Folds): {rmse_LR:.2f}')
plt.figure(figsize=(15, 6))
plt.plot(y_test_total.index, y_test_total, label='Actual Daily Total', marker='.', linestyle='-')
plt.plot(y_pred_total.index, y_pred_total, label='Predicted Daily Total', marker='.', linestyle='--')
# Title and axis label were copied from the sales chart; this figure shows rev_pre.
plt.title('LR Revenue Model: Actual vs. Predicted Daily Totals (All Folds Combined)')
plt.xlabel('Date')
plt.ylabel('Total Revenue (rev_pre)')
plt.legend()
plt.grid(True)
plt.show()

In [0]:
#custom grid search XGB

from sklearn.model_selection import ParameterGrid, TimeSeriesSplit
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# This cell models 'unt_pre', but by this point `df` has been rebound to the revenue-model frame,
# which no longer contains that column, so a fresh Run All failed with KeyError on
# drop('unt_pre'). Rebuild the sales-model frame from the preprocessed snapshot first.
df = df_preprocessed.drop(['price', 'rev_pre'], axis=1)
optimize_df(df)

param_grid = {'max_depth': [7]}
param_combinations = list(ParameterGrid(param_grid))
tuning_results = {}

unique_dates = df.index.get_level_values('charge_dt').unique().sort_values()
tscv = TimeSeriesSplit(n_splits=12, test_size=7)

for i, params in enumerate(param_combinations):
    print(f"Testing combination {i+1}/{len(param_combinations)}: {params}")
    fold_scores = []

    for train_index, val_index in tscv.split(unique_dates):
        # Folds are defined on distinct charge dates, then mapped back to rows.
        train_dates = unique_dates[train_index]
        val_dates = unique_dates[val_index]

        train_mask = df.index.get_level_values('charge_dt').isin(train_dates)
        val_mask = df.index.get_level_values('charge_dt').isin(val_dates)

        train_data = df[train_mask].copy()
        val_data = df[val_mask].copy()

        # Scalers are fitted on the training fold only and then applied to the validation fold.
        std_scaler = StandardScaler()
        min_max_scaler = MinMaxScaler()
        train_data[normal_num_cols] = std_scaler.fit_transform(train_data[normal_num_cols])
        train_data[standard_num_cols] = min_max_scaler.fit_transform(train_data[standard_num_cols])
        val_data[normal_num_cols] = std_scaler.transform(val_data[normal_num_cols])
        val_data[standard_num_cols] = min_max_scaler.transform(val_data[standard_num_cols])

        X_train = train_data.drop('unt_pre', axis=1)
        y_train = train_data['unt_pre']
        X_test = val_data.drop('unt_pre', axis=1)
        y_test = val_data['unt_pre']

        # NOTE(review): early stopping watches the same fold that is scored below, so the fold
        # RMSE is measured on data that also chose the number of trees.
        XGB_model = xgb.XGBRegressor(**params, base_score=0.5, booster='gbtree', n_estimators=2000, early_stopping_rounds=50, learning_rate=0.01, objective='reg:pseudohubererror', enable_categorical=True)
        XGB_model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=100)

        y_pred = XGB_model.predict(X_test)
        score = np.sqrt(mean_squared_error(y_test, y_pred))
        fold_scores.append(score)

    mean_score = np.mean(fold_scores)
    print(f"  -> Mean RMSE across folds: {mean_score:.4f}\n")
    # dicts are unhashable, so the parameter set is stored as a tuple of (name, value) pairs.
    tuning_results[tuple(params.items())] = mean_score

# Lowest mean RMSE wins.
best_params_tuple = min(tuning_results, key=tuning_results.get)
best_params = dict(best_params_tuple)
best_score = tuning_results[best_params_tuple]

print("--- Tuning Complete ---")
print(f"Best Parameters Found: {best_params}")
print(f"Best Mean RMSE: {best_score:.4f}")

In [0]:
# Re-run the cross-validation with the best parameters from the grid search, keeping every
# fold's predictions for the result cells below.
# Relies on `tscv`, `unique_dates` and `best_params` left behind by the grid-search cell.
fig, axs = plt.subplots(4, 3, figsize=(16, 8), sharex=True)
plt.style.use('ggplot')
fold = 0
preds = []
XGBscores = []
all_y_test = []
all_y_pred_series = []

for train_index, val_index in tscv.split(unique_dates):
    # Folds are defined on distinct charge dates, then mapped back to rows.
    train_dates = unique_dates[train_index]
    val_dates = unique_dates[val_index]

    train_mask = df.index.get_level_values('charge_dt').isin(train_dates)
    val_mask = df.index.get_level_values('charge_dt').isin(val_dates)

    train_data = df[train_mask].copy()
    val_data = df[val_mask].copy()
    
    # One panel per fold: train and validation totals, with a dashed line at the validation start.
    ax = axs[fold // 3, fold % 3]
    total_sales_train = train_data.groupby('charge_dt')['unt_pre'].sum().reset_index()
    total_sales_val = val_data.groupby('charge_dt')['unt_pre'].sum().reset_index()
    total_sales_train.plot(ax=ax, label='Train', x='charge_dt', y='unt_pre', style='-')
    total_sales_val.plot(ax=ax, label='Val', x='charge_dt', y='unt_pre', style='-')
    ax.axvline(val_data.index.get_level_values('charge_dt').min(), linestyle='--', color='black')
    end_date = val_dates.max()
    start_date = end_date - pd.DateOffset(months=6)
    ax.set_xlim(start_date, end_date)
    ax.set_title(f'Fold {fold+1}')
    
    # Scalers are fitted on the training fold only and then applied to the validation fold.
    std_scaler = StandardScaler()
    min_max_scaler = MinMaxScaler()
    train_data[normal_num_cols] = std_scaler.fit_transform(train_data[normal_num_cols])
    train_data[standard_num_cols] = min_max_scaler.fit_transform(train_data[standard_num_cols])
    val_data[normal_num_cols] = std_scaler.transform(val_data[normal_num_cols])
    val_data[standard_num_cols] = min_max_scaler.transform(val_data[standard_num_cols])

    X_train = train_data.drop('unt_pre', axis=1)
    y_train = train_data['unt_pre']
    X_test = val_data.drop('unt_pre', axis=1)
    y_test = val_data['unt_pre']

    # NOTE(review): early stopping watches the same fold that is scored below, so the fold RMSE
    # is measured on data that also chose the number of trees.
    XGB_model_final = xgb.XGBRegressor(**best_params, base_score=0.5, booster='gbtree', n_estimators=2000, early_stopping_rounds=50, learning_rate=0.01, objective='reg:pseudohubererror', enable_categorical=True)
    XGB_model_final.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=100)

    y_pred = XGB_model_final.predict(X_test)
    y_pred_series = pd.Series(y_pred, index=X_test.index, name='unt_pre_pred')
    
    all_y_test.append(y_test)
    all_y_pred_series.append(y_pred_series)
    preds.extend(y_pred)
    # Row-level RMSE for this fold.
    XGBscore = np.sqrt(mean_squared_error(y_test, y_pred))
    XGBscores.append(XGBscore)
    
    fold += 1

print(f"\nFinal Mean RMSE with best params: {np.mean(XGBscores):.4f}")
plt.tight_layout()
plt.show()

In [0]:
# XGB Results
# Per-fold row-level RMSE, its mean over the folds and its standard deviation.

print(f'individual scores: {XGBscores}')
print(f'combined score: {np.mean(XGBscores)}')
print(f'std: {np.std(XGBscores)}')      

In [0]:
#XGB Plot
# Concatenate the validation targets/predictions stored for every fold, sum
# them per charge_dt, then report and plot the RMSE of those daily totals.

full_y_test = pd.concat(all_y_test)
full_y_pred = pd.concat(all_y_pred_series)

y_test_total, y_pred_total = (
    fold_values.groupby(level='charge_dt').sum()
    for fold_values in (full_y_test, full_y_pred)
)

rmse_xgb = np.sqrt(mean_squared_error(y_test_total, y_pred_total))
print(f'Overall RMSE on Daily Totals (All Folds): {rmse_xgb:.2f}')

totals_fig, totals_ax = plt.subplots(figsize=(15, 6))
totals_ax.plot(y_test_total.index, y_test_total, label='Actual Daily Total', marker='.', linestyle='-')
totals_ax.plot(y_pred_total.index, y_pred_total, label='Predicted Daily Total', marker='.', linestyle='--')
totals_ax.set(
    title='XGB Model: Actual vs. Predicted Daily Totals (All Folds Combined)',
    xlabel='Date',
    ylabel='Total Sales (unt_pre)',
)
totals_ax.legend()
totals_ax.grid(True)
plt.show()

In [0]:
#XGB Plot (dynamic)
# Animated line chart (one frame per charge date) of actual vs. predicted
# values along the days-to-go axis, built from full_y_test / full_y_pred.
# NOTE(review): full_y_test / full_y_pred are re-assigned by every model's
# plot cell, so run this directly after the XGB plot cell.

results_df = pd.DataFrame({'actual': full_y_test, 'predicted': full_y_pred})
results_df.reset_index(inplace=True)
results_df['flight_dt'] = pd.to_datetime(results_df['flight_dt'])
results_df['charge_dt'] = pd.to_datetime(results_df['charge_dt'])

# Days to go = whole days between the charge date and the flight date.
results_df['dtg'] = (results_df['flight_dt'] - results_df['charge_dt']).dt.days
# The animation frame key is kept as a string.
results_df['charge_dt'] = results_df['charge_dt'].astype(str)
results_df.sort_values(['charge_dt', 'dtg'], inplace=True)

agg_df = results_df.groupby(['charge_dt', 'dtg'])[['actual', 'predicted']].sum().reset_index()

# Long format: one row per (charge_dt, dtg, sales_type).
melted_df = pd.melt(agg_df, id_vars=['charge_dt', 'dtg'], value_vars=['actual', 'predicted'], var_name='sales_type', value_name='sales_value')

# Title fixed: the previous text carried a copy-pasted "(Linear Regression)" suffix.
fig = px.line(melted_df, x='dtg', y='sales_value', color='sales_type', animation_frame='charge_dt', title='XGB Model: Actual vs. Predicted Sales by Days To Go', color_discrete_map={'actual': 'red', 'predicted': 'blue'})

fig.update_layout(height=800, width=1600)

fig.show()

In [0]:
#LGBM Sales Forecast

#custom grid search lgbm
# For each hyper-parameter combination, run a 12-fold TimeSeriesSplit over the
# sorted unique charge dates (7 dates per validation fold), fit LightGBM on
# the rows of the training dates, and keep the combination with the lowest
# mean per-fold RMSE on 'unt_pre'.

import lightgbm as lgb
# Imported in this cell so it does not depend on another cell having brought
# ParameterGrid into scope first.
from sklearn.model_selection import ParameterGrid

param_grid = {'max_depth': [7], 'learning_rate': [0.5]}
param_combinations = list(ParameterGrid(param_grid))
tuning_results = {}

unique_dates = df.index.get_level_values('charge_dt').unique().sort_values()
tscv = TimeSeriesSplit(n_splits=12, test_size=7)

for i, params in enumerate(param_combinations):
    print(f"Testing combination {i+1}/{len(param_combinations)}: {params}")
    fold_scores = []

    for train_index, val_index in tscv.split(unique_dates):
        train_dates = unique_dates[train_index]
        val_dates = unique_dates[val_index]

        # Split on dates, then select every row belonging to those dates.
        train_mask = df.index.get_level_values('charge_dt').isin(train_dates)
        val_mask = df.index.get_level_values('charge_dt').isin(val_dates)

        train_data = df[train_mask].copy()
        val_data = df[val_mask].copy()

        # Scalers are fitted on the training rows only and re-used on validation.
        std_scaler = StandardScaler()
        min_max_scaler = MinMaxScaler()
        train_data[normal_num_cols] = std_scaler.fit_transform(train_data[normal_num_cols])
        train_data[standard_num_cols] = min_max_scaler.fit_transform(train_data[standard_num_cols])
        val_data[normal_num_cols] = std_scaler.transform(val_data[normal_num_cols])
        val_data[standard_num_cols] = min_max_scaler.transform(val_data[standard_num_cols])

        X_train = train_data.drop('unt_pre', axis=1)
        y_train = train_data['unt_pre']
        X_test = val_data.drop('unt_pre', axis=1)
        y_test = val_data['unt_pre']

        # NOTE(review): the validation fold doubles as the early-stopping
        # eval_set, so the RMSE below is measured on the data that selected
        # the stopping iteration.
        LGBM_model = lgb.LGBMRegressor(**params, n_estimators=2000, objective='huber')
        LGBM_model.fit(X_train,y_train,eval_set=[(X_test, y_test)],callbacks=[lgb.early_stopping(stopping_rounds=100), lgb.log_evaluation(period=0)])

        y_pred = LGBM_model.predict(X_test)
        score = np.sqrt(mean_squared_error(y_test, y_pred))
        fold_scores.append(score)

    mean_score = np.mean(fold_scores)
    print(f"  -> Mean RMSE across folds: {mean_score:.4f}\n")
    # Dict keys must be hashable, so the params dict is stored as a tuple of items.
    tuning_results[tuple(params.items())] = mean_score

best_params_tuple = min(tuning_results, key=tuning_results.get)
best_params = dict(best_params_tuple)
best_score = tuning_results[best_params_tuple]

print("--- Tuning Complete ---")
print(f"Best Parameters Found: {best_params}")
print(f"Best Mean RMSE: {best_score:.4f}")

In [0]:
#LGBM CV
# Walks the 12 TimeSeriesSplit folds with fixed LightGBM settings. For each
# fold it plots the train/validation daily totals on its own subplot, fits the
# model, and stores the fold RMSE together with the validation targets and
# predictions used by the plotting cells.

import lightgbm as lgb

unique_dates = df.index.get_level_values('charge_dt').unique().sort_values()
tscv = TimeSeriesSplit(n_splits=12, test_size=7)

fig, axs = plt.subplots(4, 3, figsize=(16, 8), sharex=True)
plt.style.use('ggplot')
fold = 0
preds, LGBMscores = [], []
all_y_test, all_y_pred_series = [], []

for train_index, val_index in tscv.split(unique_dates):
    train_dates, val_dates = unique_dates[train_index], unique_dates[val_index]

    # Row-level masks derived from the date-level split.
    train_mask = df.index.get_level_values('charge_dt').isin(train_dates)
    val_mask = df.index.get_level_values('charge_dt').isin(val_dates)
    train_data, val_data = df[train_mask].copy(), df[val_mask].copy()

    # Subplot for this fold in the 4x3 grid (row-major order).
    ax = axs[divmod(fold, 3)]
    total_sales_train = train_data.groupby('charge_dt')['unt_pre'].sum().reset_index()
    total_sales_val = val_data.groupby('charge_dt')['unt_pre'].sum().reset_index()
    for split_totals, split_label in ((total_sales_train, 'Train'), (total_sales_val, 'Val')):
        split_totals.plot(ax=ax, label=split_label, x='charge_dt', y='unt_pre', style='-')
    ax.axvline(val_data.index.get_level_values('charge_dt').min(), linestyle='--', color='black')
    # Show only the six months leading up to the end of the validation window.
    end_date = val_dates.max()
    start_date = end_date - pd.DateOffset(months=6)
    ax.set_xlim(start_date, end_date)
    ax.set_title(f'Fold {fold+1}')

    # Fit each scaler on the training rows, then apply it to the validation rows.
    std_scaler, min_max_scaler = StandardScaler(), MinMaxScaler()
    for fold_scaler, scaled_cols in ((std_scaler, normal_num_cols), (min_max_scaler, standard_num_cols)):
        train_data[scaled_cols] = fold_scaler.fit_transform(train_data[scaled_cols])
        val_data[scaled_cols] = fold_scaler.transform(val_data[scaled_cols])

    y_train, y_test = train_data['unt_pre'], val_data['unt_pre']
    X_train = train_data.drop(columns='unt_pre')
    X_test = val_data.drop(columns='unt_pre')

    LGBM_model_final = lgb.LGBMRegressor(max_depth = 7, learning_rate = 0.5, n_estimators=2000, objective='huber')
    LGBM_model_final.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test)],
        callbacks=[lgb.early_stopping(stopping_rounds=100), lgb.log_evaluation(period=0)],
    )

    y_pred = LGBM_model_final.predict(X_test)
    # Keep the validation index so predictions can later be grouped by charge_dt.
    y_pred_series = pd.Series(y_pred, index=X_test.index, name='unt_pre_pred')

    all_y_test.append(y_test)
    all_y_pred_series.append(y_pred_series)
    preds.extend(y_pred)
    LGBMscore = np.sqrt(mean_squared_error(y_test, y_pred))
    LGBMscores.append(LGBMscore)

    fold += 1

print(f"\nFinal Mean RMSE with best params: {np.mean(LGBMscores):.4f}")
plt.tight_layout()
plt.show()

In [0]:
# LGBM Results
# Per-fold RMSE values gathered in LGBMscores, followed by their mean and
# standard deviation (np.std default, i.e. ddof=0).

print(
    f'individual scores: {LGBMscores}',
    f'combined score: {np.mean(LGBMscores)}',
    f'std: {np.std(LGBMscores)}',
    sep='\n',
)

In [0]:
#LGBM Plot
# Concatenate the validation targets/predictions stored for every fold, sum
# them per charge_dt, then report and plot the RMSE of those daily totals.

full_y_test = pd.concat(all_y_test)
full_y_pred = pd.concat(all_y_pred_series)

y_test_total, y_pred_total = (
    fold_values.groupby(level='charge_dt').sum()
    for fold_values in (full_y_test, full_y_pred)
)

rmse_lgbm = np.sqrt(mean_squared_error(y_test_total, y_pred_total))
print(f'Overall RMSE on Daily Totals (All Folds): {rmse_lgbm:.2f}')

totals_fig, totals_ax = plt.subplots(figsize=(15, 6))
totals_ax.plot(y_test_total.index, y_test_total, label='Actual Daily Total', marker='.', linestyle='-')
totals_ax.plot(y_pred_total.index, y_pred_total, label='Predicted Daily Total', marker='.', linestyle='--')
totals_ax.set(
    title='LGBM Model: Actual vs. Predicted Daily Totals (All Folds Combined)',
    xlabel='Date',
    ylabel='Total Sales (unt_pre)',
)
totals_ax.legend()
totals_ax.grid(True)
plt.show()

In [0]:
#LGBM Plot (dynamic)
# Animated line chart (one frame per charge date) of actual vs. predicted
# values along the days-to-go axis, built from full_y_test / full_y_pred.

results_df = (
    pd.DataFrame({'actual': full_y_test, 'predicted': full_y_pred})
    .reset_index()
    .assign(
        flight_dt=lambda frame: pd.to_datetime(frame['flight_dt']),
        charge_dt=lambda frame: pd.to_datetime(frame['charge_dt']),
    )
    # Days to go = whole days between the charge date and the flight date.
    .assign(dtg=lambda frame: (frame['flight_dt'] - frame['charge_dt']).dt.days)
    # The animation frame key is kept as a string.
    .assign(charge_dt=lambda frame: frame['charge_dt'].astype(str))
    .sort_values(['charge_dt', 'dtg'])
)

agg_df = results_df.groupby(['charge_dt', 'dtg'])[['actual', 'predicted']].sum().reset_index()

# Long format: one row per (charge_dt, dtg, sales_type).
melted_df = agg_df.melt(
    id_vars=['charge_dt', 'dtg'],
    value_vars=['actual', 'predicted'],
    var_name='sales_type',
    value_name='sales_value',
)

fig = px.line(
    melted_df,
    x='dtg',
    y='sales_value',
    color='sales_type',
    animation_frame='charge_dt',
    title='LGBM Model: Actual vs. Predicted Sales by Days To Go',
    color_discrete_map={'actual': 'red', 'predicted': 'blue'},
)
fig.update_layout(height=800, width=1600)
fig.show()

In [0]:
#LGBM Revenue Forecast
# Rebuild the modelling frame from a fresh copy of df_preprocessed without the
# 'price' and 'unt_pre' columns, then pass it to optimize_df (defined
# elsewhere in the notebook).

df = df_preprocessed.copy().drop(columns=['price', 'unt_pre'])
optimize_df(df)

In [0]:
#LGBM Revenue Forecast

#custom grid search lgbm
# For each hyper-parameter combination, run a 12-fold TimeSeriesSplit over the
# sorted unique charge dates (7 dates per validation fold), fit LightGBM on
# the rows of the training dates, and keep the combination with the lowest
# mean per-fold RMSE on 'rev_pre'.
from sklearn.model_selection import ParameterGrid
import lightgbm as lgb

# learning_rate is the shrinkage applied to every boosted tree (LightGBM
# default: 0.1). The previous grid of 10/50/100 scales each tree up instead of
# shrinking it, so the search now covers conventional values.
param_grid = {'max_depth': [7], 'learning_rate': [0.01, 0.05, 0.1]}
param_combinations = list(ParameterGrid(param_grid))
tuning_results = {}

unique_dates = df.index.get_level_values('charge_dt').unique().sort_values()
tscv = TimeSeriesSplit(n_splits=12, test_size=7)

for i, params in enumerate(param_combinations):
    print(f"Testing combination {i+1}/{len(param_combinations)}: {params}")
    fold_scores = []

    for train_index, val_index in tscv.split(unique_dates):
        train_dates = unique_dates[train_index]
        val_dates = unique_dates[val_index]

        # Split on dates, then select every row belonging to those dates.
        train_mask = df.index.get_level_values('charge_dt').isin(train_dates)
        val_mask = df.index.get_level_values('charge_dt').isin(val_dates)

        train_data = df[train_mask].copy()
        val_data = df[val_mask].copy()

        # Scalers are fitted on the training rows only and re-used on validation.
        std_scaler = StandardScaler()
        min_max_scaler = MinMaxScaler()
        train_data[normal_num_cols] = std_scaler.fit_transform(train_data[normal_num_cols])
        train_data[standard_num_cols] = min_max_scaler.fit_transform(train_data[standard_num_cols])
        val_data[normal_num_cols] = std_scaler.transform(val_data[normal_num_cols])
        val_data[standard_num_cols] = min_max_scaler.transform(val_data[standard_num_cols])

        X_train = train_data.drop('rev_pre', axis=1)
        y_train = train_data['rev_pre']
        X_test = val_data.drop('rev_pre', axis=1)
        y_test = val_data['rev_pre']

        # NOTE(review): the validation fold doubles as the early-stopping
        # eval_set, so the RMSE below is measured on the data that selected
        # the stopping iteration.
        LGBM_model = lgb.LGBMRegressor(**params, n_estimators=2000, objective='huber')
        LGBM_model.fit(X_train,y_train,eval_set=[(X_test, y_test)],callbacks=[lgb.early_stopping(stopping_rounds=100), lgb.log_evaluation(period=0)])

        y_pred = LGBM_model.predict(X_test)
        score = np.sqrt(mean_squared_error(y_test, y_pred))
        fold_scores.append(score)

    mean_score = np.mean(fold_scores)
    print(f"  -> Mean RMSE across folds: {mean_score:.4f}\n")
    # Dict keys must be hashable, so the params dict is stored as a tuple of items.
    tuning_results[tuple(params.items())] = mean_score

best_params_tuple = min(tuning_results, key=tuning_results.get)
best_params = dict(best_params_tuple)
best_score = tuning_results[best_params_tuple]

print("--- Tuning Complete ---")
print(f"Best Parameters Found: {best_params}")
print(f"Best Mean RMSE: {best_score:.4f}")

In [0]:
#LGBM Revenue CV
# Walks the 12 TimeSeriesSplit folds for the 'rev_pre' target. For each fold
# it plots the train/validation daily totals on its own subplot, fits
# LightGBM, and stores the fold RMSE together with the validation targets and
# predictions used by the plotting cells.
import lightgbm as lgb
unique_dates = df.index.get_level_values('charge_dt').unique().sort_values()
tscv = TimeSeriesSplit(n_splits=12, test_size=7)

fig, axs = plt.subplots(4, 3, figsize=(16, 8), sharex=True)
plt.style.use('ggplot')
fold = 0
preds = []
LGBMscores = []
all_y_test = []
all_y_pred_series = []

for train_index, val_index in tscv.split(unique_dates):
    train_dates = unique_dates[train_index]
    val_dates = unique_dates[val_index]

    # Split on dates, then select every row belonging to those dates.
    train_mask = df.index.get_level_values('charge_dt').isin(train_dates)
    val_mask = df.index.get_level_values('charge_dt').isin(val_dates)

    train_data = df[train_mask].copy()
    val_data = df[val_mask].copy()
    
    # Subplot for this fold in the 4x3 grid (row-major order).
    ax = axs[fold // 3, fold % 3]
    total_sales_train = train_data.groupby('charge_dt')['rev_pre'].sum().reset_index()
    total_sales_val = val_data.groupby('charge_dt')['rev_pre'].sum().reset_index()
    total_sales_train.plot(ax=ax, label='Train', x='charge_dt', y='rev_pre', style='-')
    total_sales_val.plot(ax=ax, label='Val', x='charge_dt', y='rev_pre', style='-')
    ax.axvline(val_data.index.get_level_values('charge_dt').min(), linestyle='--', color='black')
    # Show only the six months leading up to the end of the validation window.
    end_date = val_dates.max()
    start_date = end_date - pd.DateOffset(months=6)
    ax.set_xlim(start_date, end_date)
    ax.set_title(f'Fold {fold+1}')
    
    # Scalers are fitted on the training rows only and re-used on validation.
    std_scaler = StandardScaler()
    min_max_scaler = MinMaxScaler()
    train_data[normal_num_cols] = std_scaler.fit_transform(train_data[normal_num_cols])
    train_data[standard_num_cols] = min_max_scaler.fit_transform(train_data[standard_num_cols])
    val_data[normal_num_cols] = std_scaler.transform(val_data[normal_num_cols])
    val_data[standard_num_cols] = min_max_scaler.transform(val_data[standard_num_cols])

    X_train = train_data.drop('rev_pre', axis=1)
    y_train = train_data['rev_pre']
    X_test = val_data.drop('rev_pre', axis=1)
    y_test = val_data['rev_pre']

    # Use the tuned hyper-parameters (as the other final-CV cells do) rather
    # than a hard-coded learning_rate of 50, which is far outside LightGBM's
    # usable range. Requires the revenue grid-search cell to have run so that
    # best_params holds max_depth / learning_rate.
    LGBM_model_final = lgb.LGBMRegressor(**best_params, n_estimators=2000, objective='huber')
    LGBM_model_final.fit(X_train,y_train,eval_set=[(X_test, y_test)],callbacks=[lgb.early_stopping(stopping_rounds=100), lgb.log_evaluation(period=0)])

    y_pred = LGBM_model_final.predict(X_test)
    # Keep the validation index so predictions can later be grouped by charge_dt.
    y_pred_series = pd.Series(y_pred, index=X_test.index, name='rev_pre_pred')
    
    all_y_test.append(y_test)
    all_y_pred_series.append(y_pred_series)
    preds.extend(y_pred)
    LGBMscore = np.sqrt(mean_squared_error(y_test, y_pred))
    LGBMscores.append(LGBMscore)
    
    fold += 1

print(f"\nFinal Mean RMSE with best params: {np.mean(LGBMscores):.4f}")
plt.tight_layout()
plt.show()

In [0]:

# Per-fold RMSE values gathered in LGBMscores, followed by their mean and
# standard deviation (np.std default, i.e. ddof=0).
print(
    f'individual scores: {LGBMscores}',
    f'combined score: {np.mean(LGBMscores)}',
    f'std: {np.std(LGBMscores)}',
    sep='\n',
)

In [0]:
#LGBM Revenue Plot
# Concatenate the validation targets/predictions stored for every fold of the
# revenue model, sum them per charge_dt, then report and plot the RMSE of
# those daily totals.

full_y_test = pd.concat(all_y_test)
full_y_pred = pd.concat(all_y_pred_series)

y_test_total = full_y_test.groupby(level='charge_dt').sum()
y_pred_total = full_y_pred.groupby(level='charge_dt').sum()

# NOTE(review): this re-uses the name rmse_lgbm that the LGBM sales plot cell
# also assigns, so the sales RMSE is overwritten here — confirm that is intended.
rmse_lgbm = np.sqrt(mean_squared_error(y_test_total, y_pred_total))
print(f'Overall RMSE on Daily Totals (All Folds): {rmse_lgbm:.2f}')

plt.figure(figsize=(15, 6))
plt.plot(y_test_total.index, y_test_total, label='Actual Daily Total', marker='.', linestyle='-')
plt.plot(y_pred_total.index, y_pred_total, label='Predicted Daily Total', marker='.', linestyle='--')
# Labels fixed: the plotted series is rev_pre (revenue), not sales.
plt.title('LGBM Revenue Model: Actual vs. Predicted Daily Totals (All Folds Combined)')
plt.xlabel('Date')
plt.ylabel('Total Revenue (rev_pre)')
plt.legend()
plt.grid(True)
plt.show()

In [0]:
#RF Grid Search
# For each hyper-parameter combination, run a 12-fold TimeSeriesSplit over the
# sorted unique charge dates (7 dates per validation fold), fit a random
# forest on the rows of the training dates, and keep the combination with the
# lowest mean per-fold RMSE on 'unt_pre'.
# NOTE(review): the most recent visible assignment of df (the LGBM revenue
# setup cell) drops 'unt_pre'; confirm df has been rebuilt with the sales
# target before running this cell.

from sklearn.ensemble import RandomForestRegressor

param_grid = {'max_depth': [5]}
param_combinations = list(ParameterGrid(param_grid))
tuning_results = {}

unique_dates = df.index.get_level_values('charge_dt').unique().sort_values()
tscv = TimeSeriesSplit(n_splits=12, test_size=7)

for i, params in enumerate(param_combinations):
    print(f"Testing combination {i+1}/{len(param_combinations)}: {params}")
    fold_scores = []

    for train_index, val_index in tscv.split(unique_dates):
        train_dates = unique_dates[train_index]
        val_dates = unique_dates[val_index]

        # Split on dates, then select every row belonging to those dates.
        train_mask = df.index.get_level_values('charge_dt').isin(train_dates)
        val_mask = df.index.get_level_values('charge_dt').isin(val_dates)

        train_data = df[train_mask].copy() 
        val_data = df[val_mask].copy()

        # Scalers are fitted on the training rows only and re-used on validation.
        std_scaler = StandardScaler()
        min_max_scaler = MinMaxScaler()
        train_data[normal_num_cols] = std_scaler.fit_transform(train_data[normal_num_cols])
        train_data[standard_num_cols] = min_max_scaler.fit_transform(train_data[standard_num_cols])
        val_data[normal_num_cols] = std_scaler.transform(val_data[normal_num_cols])
        val_data[standard_num_cols] = min_max_scaler.transform(val_data[standard_num_cols])

        X_train = train_data.drop('unt_pre', axis=1)
        y_train = train_data['unt_pre']
        X_test = val_data.drop('unt_pre', axis=1)
        y_test = val_data['unt_pre']

        # Fixed seed: the forest bootstraps rows and samples features, so
        # without it the scores (and the winning combination) change per run.
        RF_model = RandomForestRegressor(**params, n_jobs=-1, n_estimators=50, random_state=42)
        RF_model.fit(X_train, y_train)

        y_pred = RF_model.predict(X_test)
        score = np.sqrt(mean_squared_error(y_test, y_pred))
        fold_scores.append(score)

    mean_score = np.mean(fold_scores)
    print(f"  -> Mean RMSE across folds: {mean_score:.4f}\n")
    # Dict keys must be hashable, so the params dict is stored as a tuple of items.
    tuning_results[tuple(params.items())] = mean_score

best_params_tuple = min(tuning_results, key=tuning_results.get)
best_params = dict(best_params_tuple)
best_score = tuning_results[best_params_tuple]

print("--- Tuning Complete ---")
print(f"Best Parameters Found: {best_params}")
print(f"Best Mean RMSE: {best_score:.4f}")

In [0]:
#RF cv
# Walks the folds of the existing tscv / unique_dates split with the tuned
# random-forest parameters (best_params). For each fold it plots the
# train/validation daily totals on its own subplot, fits the model, and stores
# the fold RMSE together with the validation targets and predictions used by
# the plotting cells.

fig, axs = plt.subplots(4, 3, figsize=(16, 8), sharex=True)
plt.style.use('ggplot')
fold = 0
preds = []
RFscores = []
all_y_test = []
all_y_pred_series = []

for train_index, val_index in tscv.split(unique_dates):
    train_dates = unique_dates[train_index]
    val_dates = unique_dates[val_index]

    # Split on dates, then select every row belonging to those dates.
    train_mask = df.index.get_level_values('charge_dt').isin(train_dates)
    val_mask = df.index.get_level_values('charge_dt').isin(val_dates)

    train_data = df[train_mask].copy()
    val_data = df[val_mask].copy()
    
    # Subplot for this fold in the 4x3 grid (row-major order).
    ax = axs[fold // 3, fold % 3]
    total_sales_train = train_data.groupby('charge_dt')['unt_pre'].sum().reset_index()
    total_sales_val = val_data.groupby('charge_dt')['unt_pre'].sum().reset_index()
    total_sales_train.plot(ax=ax, label='Train', x='charge_dt', y='unt_pre', style='-')
    total_sales_val.plot(ax=ax, label='Val', x='charge_dt', y='unt_pre', style='-')
    ax.axvline(val_data.index.get_level_values('charge_dt').min(), linestyle='--', color='black')
    # Show only the six months leading up to the end of the validation window.
    end_date = val_dates.max()
    start_date = end_date - pd.DateOffset(months=6)
    ax.set_xlim(start_date, end_date)
    ax.set_title(f'Fold {fold+1}')
    
    # Scalers are fitted on the training rows only and re-used on validation.
    std_scaler = StandardScaler()
    min_max_scaler = MinMaxScaler()
    train_data[normal_num_cols] = std_scaler.fit_transform(train_data[normal_num_cols])
    train_data[standard_num_cols] = min_max_scaler.fit_transform(train_data[standard_num_cols])
    val_data[normal_num_cols] = std_scaler.transform(val_data[normal_num_cols])
    val_data[standard_num_cols] = min_max_scaler.transform(val_data[standard_num_cols])

    X_train = train_data.drop('unt_pre', axis=1)
    y_train = train_data['unt_pre']
    X_test = val_data.drop('unt_pre', axis=1)
    y_test = val_data['unt_pre']

    # Fixed seed: the forest bootstraps rows and samples features, so without
    # it the reported fold scores differ on every run.
    RF_model_final = RandomForestRegressor(**best_params, n_estimators=50, n_jobs=-1, random_state=42)
    RF_model_final.fit(X_train, y_train)

    y_pred = RF_model_final.predict(X_test)
    # Keep the validation index so predictions can later be grouped by charge_dt.
    y_pred_series = pd.Series(y_pred, index=X_test.index, name='unt_pre_pred')
    
    all_y_test.append(y_test)
    all_y_pred_series.append(y_pred_series)
    preds.extend(y_pred)
    RFscore = np.sqrt(mean_squared_error(y_test, y_pred))
    RFscores.append(RFscore)
    
    fold += 1

print(f"\nFinal Mean RMSE with best params: {np.mean(RFscores):.4f}")
plt.tight_layout()
plt.show()

In [0]:
#RF Results
# Per-fold RMSE values gathered in RFscores, followed by their mean and
# standard deviation (np.std default, i.e. ddof=0).

print(
    f'individual scores: {RFscores}',
    f'combined score: {np.mean(RFscores)}',
    f'std: {np.std(RFscores)}',
    sep='\n',
)

In [0]:
#RF Plot
# Concatenate the validation targets/predictions stored for every fold, sum
# them per charge_dt, then report and plot the RMSE of those daily totals.

full_y_test = pd.concat(all_y_test)
full_y_pred = pd.concat(all_y_pred_series)

y_test_total, y_pred_total = (
    fold_values.groupby(level='charge_dt').sum()
    for fold_values in (full_y_test, full_y_pred)
)

rmse_rf = np.sqrt(mean_squared_error(y_test_total, y_pred_total))
print(f'Overall RMSE on Daily Totals (All Folds): {rmse_rf:.2f}')

totals_fig, totals_ax = plt.subplots(figsize=(15, 6))
totals_ax.plot(y_test_total.index, y_test_total, label='Actual Daily Total', marker='.', linestyle='-')
totals_ax.plot(y_pred_total.index, y_pred_total, label='Predicted Daily Total', marker='.', linestyle='--')
totals_ax.set(
    title='RF Model: Actual vs. Predicted Daily Totals (All Folds Combined)',
    xlabel='Date',
    ylabel='Total Sales (unt_pre)',
)
totals_ax.legend()
totals_ax.grid(True)
plt.show()

In [0]:
#RF Plot (dynamic)
# Animated line chart (one frame per charge date) of actual vs. predicted
# values along the days-to-go axis, built from full_y_test / full_y_pred.

results_df = (
    pd.DataFrame({'actual': full_y_test, 'predicted': full_y_pred})
    .reset_index()
    .assign(
        flight_dt=lambda frame: pd.to_datetime(frame['flight_dt']),
        charge_dt=lambda frame: pd.to_datetime(frame['charge_dt']),
    )
    # Days to go = whole days between the charge date and the flight date.
    .assign(dtg=lambda frame: (frame['flight_dt'] - frame['charge_dt']).dt.days)
    # The animation frame key is kept as a string.
    .assign(charge_dt=lambda frame: frame['charge_dt'].astype(str))
    .sort_values(['charge_dt', 'dtg'])
)

agg_df = results_df.groupby(['charge_dt', 'dtg'])[['actual', 'predicted']].sum().reset_index()

# Long format: one row per (charge_dt, dtg, sales_type).
melted_df = agg_df.melt(
    id_vars=['charge_dt', 'dtg'],
    value_vars=['actual', 'predicted'],
    var_name='sales_type',
    value_name='sales_value',
)

fig = px.line(
    melted_df,
    x='dtg',
    y='sales_value',
    color='sales_type',
    animation_frame='charge_dt',
    title='RF Model: Actual vs. Predicted Sales by Days To Go',
    color_discrete_map={'actual': 'red', 'predicted': 'blue'},
)
fig.update_layout(height=800, width=1600)
fig.show()

In [0]:
#AB Grid search CV
# For each hyper-parameter combination, run a 12-fold TimeSeriesSplit over the
# sorted unique charge dates (7 dates per validation fold), fit AdaBoost (with
# a depth-5 decision tree as base estimator) on the rows of the training
# dates, and keep the combination with the lowest mean per-fold RMSE on
# 'unt_pre'. Unlike the other tuning cells, the features are not scaled here.
# NOTE(review): the most recent visible assignment of df (the LGBM revenue
# setup cell) drops 'unt_pre'; confirm df has been rebuilt with the sales
# target before running this cell.

from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import ParameterGrid, TimeSeriesSplit
from sklearn.metrics import mean_squared_error

param_grid = {'n_estimators': [1], 'learning_rate': [0.1, 1.0], 'loss': ['square']}
param_combinations = list(ParameterGrid(param_grid))
tuning_results = {}

unique_dates = df.index.get_level_values('charge_dt').unique().sort_values()
tscv = TimeSeriesSplit(n_splits=12, test_size=7)

for i, params in enumerate(param_combinations):
    print(f"Testing combination {i+1}/{len(param_combinations)}: {params}")
    fold_scores = []

    for train_index, val_index in tscv.split(unique_dates):
        train_dates = unique_dates[train_index]
        val_dates = unique_dates[val_index]
        # Split on dates, then select every row belonging to those dates.
        train_mask = df.index.get_level_values('charge_dt').isin(train_dates)
        val_mask = df.index.get_level_values('charge_dt').isin(val_dates)
        train_data = df[train_mask].copy()
        val_data = df[val_mask].copy()

        X_train = train_data.drop('unt_pre', axis=1)
        y_train = train_data['unt_pre']
        X_test = val_data.drop('unt_pre', axis=1)
        y_test = val_data['unt_pre']

        # Fixed seeds: AdaBoost resamples the training rows for every
        # estimator, so without them the comparison between combinations is
        # dominated by sampling noise.
        ada_model = AdaBoostRegressor(estimator=DecisionTreeRegressor(max_depth=5, random_state=42), random_state=42, **params)
        ada_model.fit(X_train, y_train)

        y_pred = ada_model.predict(X_test)
        score = np.sqrt(mean_squared_error(y_test, y_pred))
        fold_scores.append(score)

    mean_score = np.mean(fold_scores)
    print(f"  -> Mean RMSE across folds: {mean_score:.4f}\n")
    # Dict keys must be hashable, so the params dict is stored as a tuple of items.
    tuning_results[tuple(params.items())] = mean_score

best_params_tuple = min(tuning_results, key=tuning_results.get)
best_params = dict(best_params_tuple)
best_score = tuning_results[best_params_tuple]

print("--- Tuning Complete ---")
print(f"Best Parameters Found: {best_params}")
print(f"Best Mean RMSE: {best_score:.4f}")


In [0]:
#AB cross validation
# Walks the folds of the existing tscv / unique_dates split with the tuned
# AdaBoost parameters (best_params). For each fold it plots the
# train/validation daily totals on its own subplot, fits the model, and stores
# the fold RMSE together with the validation targets and predictions used by
# the plotting cells.

fig, axs = plt.subplots(4, 3, figsize=(16, 8), sharex=True)
plt.style.use('ggplot')
fold = 0
preds = []
ABscores = []
all_y_test = []
all_y_pred_series = []

for train_index, val_index in tscv.split(unique_dates):
    train_dates = unique_dates[train_index]
    val_dates = unique_dates[val_index]

    # Split on dates, then select every row belonging to those dates.
    train_mask = df.index.get_level_values('charge_dt').isin(train_dates)
    val_mask = df.index.get_level_values('charge_dt').isin(val_dates)

    train_data = df[train_mask].copy()
    val_data = df[val_mask].copy()
    
    # Subplot for this fold in the 4x3 grid (row-major order).
    ax = axs[fold // 3, fold % 3]
    total_sales_train = train_data.groupby('charge_dt')['unt_pre'].sum().reset_index()
    total_sales_val = val_data.groupby('charge_dt')['unt_pre'].sum().reset_index()
    total_sales_train.plot(ax=ax, label='Train', x='charge_dt', y='unt_pre', style='-')
    total_sales_val.plot(ax=ax, label='Val', x='charge_dt', y='unt_pre', style='-')
    ax.axvline(val_data.index.get_level_values('charge_dt').min(), linestyle='--', color='black')
    # Show only the six months leading up to the end of the validation window.
    end_date = val_dates.max()
    start_date = end_date - pd.DateOffset(months=6)
    ax.set_xlim(start_date, end_date)
    ax.set_title(f'Fold {fold+1}')
    
    # Scalers are fitted on the training rows only and re-used on validation.
    std_scaler = StandardScaler()
    min_max_scaler = MinMaxScaler()
    train_data[normal_num_cols] = std_scaler.fit_transform(train_data[normal_num_cols])
    train_data[standard_num_cols] = min_max_scaler.fit_transform(train_data[standard_num_cols])
    val_data[normal_num_cols] = std_scaler.transform(val_data[normal_num_cols])
    val_data[standard_num_cols] = min_max_scaler.transform(val_data[standard_num_cols])

    X_train = train_data.drop('unt_pre', axis=1)
    y_train = train_data['unt_pre']
    X_test = val_data.drop('unt_pre', axis=1)
    y_test = val_data['unt_pre']

    # Fixed seeds: AdaBoost resamples the training rows for every estimator,
    # so without them the reported fold scores differ on every run.
    AB_model_final = AdaBoostRegressor(estimator=DecisionTreeRegressor(max_depth=5, random_state=42), random_state=42, **best_params)
    AB_model_final.fit(X_train, y_train)

    y_pred = AB_model_final.predict(X_test)
    # Keep the validation index so predictions can later be grouped by charge_dt.
    y_pred_series = pd.Series(y_pred, index=X_test.index, name='unt_pre_pred')
    
    all_y_test.append(y_test)
    all_y_pred_series.append(y_pred_series)
    preds.extend(y_pred)
    ABscore = np.sqrt(mean_squared_error(y_test, y_pred))
    ABscores.append(ABscore)
    
    fold += 1

print(f"\nFinal Mean RMSE with best params: {np.mean(ABscores):.4f}")
plt.tight_layout()
plt.show()

In [0]:
#AB Results
# Per-fold RMSE values gathered in ABscores, followed by their mean and
# standard deviation (np.std default, i.e. ddof=0).

print(
    f'individual scores: {ABscores}',
    f'combined score: {np.mean(ABscores)}',
    f'std: {np.std(ABscores)}',
    sep='\n',
)

In [0]:
#AB Plot
# Concatenate the validation targets/predictions stored for every fold, sum
# them per charge_dt, then report and plot the RMSE of those daily totals.

full_y_test = pd.concat(all_y_test)
full_y_pred = pd.concat(all_y_pred_series)

y_test_total, y_pred_total = (
    fold_values.groupby(level='charge_dt').sum()
    for fold_values in (full_y_test, full_y_pred)
)

rmse_ab = np.sqrt(mean_squared_error(y_test_total, y_pred_total))
print(f'Overall RMSE on Daily Totals (All Folds): {rmse_ab:.2f}')

totals_fig, totals_ax = plt.subplots(figsize=(15, 6))
totals_ax.plot(y_test_total.index, y_test_total, label='Actual Daily Total', marker='.', linestyle='-')
totals_ax.plot(y_pred_total.index, y_pred_total, label='Predicted Daily Total', marker='.', linestyle='--')
totals_ax.set(
    title='AB Model: Actual vs. Predicted Daily Totals (All Folds Combined)',
    xlabel='Date',
    ylabel='Total Sales (unt_pre)',
)
totals_ax.legend()
totals_ax.grid(True)
plt.show()

In [0]:
#AB Plot (dynamic)
# Animated line chart (one frame per charge date) of actual vs. predicted
# values along the days-to-go axis, built from full_y_test / full_y_pred.

results_df = (
    pd.DataFrame({'actual': full_y_test, 'predicted': full_y_pred})
    .reset_index()
    .assign(
        flight_dt=lambda frame: pd.to_datetime(frame['flight_dt']),
        charge_dt=lambda frame: pd.to_datetime(frame['charge_dt']),
    )
    # Days to go = whole days between the charge date and the flight date.
    .assign(dtg=lambda frame: (frame['flight_dt'] - frame['charge_dt']).dt.days)
    # The animation frame key is kept as a string.
    .assign(charge_dt=lambda frame: frame['charge_dt'].astype(str))
    .sort_values(['charge_dt', 'dtg'])
)

agg_df = results_df.groupby(['charge_dt', 'dtg'])[['actual', 'predicted']].sum().reset_index()

# Long format: one row per (charge_dt, dtg, sales_type).
melted_df = agg_df.melt(
    id_vars=['charge_dt', 'dtg'],
    value_vars=['actual', 'predicted'],
    var_name='sales_type',
    value_name='sales_value',
)

fig = px.line(
    melted_df,
    x='dtg',
    y='sales_value',
    color='sales_type',
    animation_frame='charge_dt',
    title='AB Model: Actual vs. Predicted Sales by Days To Go',
    color_discrete_map={'actual': 'red', 'predicted': 'blue'},
)
fig.update_layout(height=800, width=1600)
fig.show()

Multi-Horizon Forecast

In [0]:
# Start the multi-horizon work from a fresh copy of df_original without the
# 'price' column, then pass it to optimize_df (defined elsewhere in the notebook).
df = df_original.copy().drop(columns='price')
optimize_df(df)

In [0]:
# Keep rows with 0 <= dtg <= 252 whose charge date is on or after 2021-10-01
# and strictly before today's date, then print a summary of the result.
# NOTE(review): `current_date` rebinds the name imported from
# pyspark.sql.functions at the top of the notebook — confirm no later Spark
# code still needs that function.
current_date = datetime.now().strftime('%Y-%m-%d')
df = df[df['dtg'].between(0, 252)]
df = df[(df['charge_dt'] >= '2021-10-01') & (df['charge_dt'] < current_date)]
df.info()

In [0]:
def _add_temporal_features(df, date_col, prefix):
    """Add integer calendar features derived from ``df[date_col]`` to ``df`` in place.

    Creates ``{prefix}_dow`` (day of week, Monday=0), ``_dom`` (day of month),
    ``_doy`` (day of year), ``_wom`` (week of month, 1-based blocks of 7 days),
    ``_woy`` (ISO week of year), ``_moy`` (month), ``_day`` (days since
    2022-01-01) and ``_year``.

    Args:
        df: DataFrame holding a datetime64 column named ``date_col``.
        date_col: Name of the datetime column to derive the features from.
        prefix: Prefix used for the new column names.

    Returns:
        None; ``df`` is modified in place.
    """
    dates = df[date_col].dt
    df[f'{prefix}_dow'] = dates.dayofweek.astype(int)
    df[f'{prefix}_dom'] = dates.day.astype(int)
    df[f'{prefix}_doy'] = dates.dayofyear.astype(int)
    df[f'{prefix}_wom'] = ((dates.day - 1) // 7 + 1).astype(int)
    # `.dt.weekofyear` was removed in pandas 2.0; isocalendar().week is the
    # documented replacement and yields the same ISO week number.
    df[f'{prefix}_woy'] = dates.isocalendar().week.astype(int)
    df[f'{prefix}_moy'] = dates.month.astype(int)
    df[f'{prefix}_day'] = (df[date_col] - pd.to_datetime('2022-01-01')).dt.days.astype(int)
    df[f'{prefix}_year'] = dates.year.astype(int)


def charge_dt_temporal_features(df):
    """Add the ``charge_*`` calendar features (from ``charge_dt``) to ``df`` in place."""
    _add_temporal_features(df, 'charge_dt', 'charge')


def flight_dt_temporal_features(df):
    """Add the ``flight_*`` calendar features (from ``flight_dt``) to ``df`` in place."""
    _add_temporal_features(df, 'flight_dt', 'flight')

# Add the charge_* and flight_* calendar columns to df; both helpers assign
# the new columns directly on the frame they are given.
charge_dt_temporal_features(df)
flight_dt_temporal_features(df)

In [0]:
# Holiday flags.
# Builds a table of UK holidays (library dates plus manually listed Easter
# Monday / Summer Bank Holiday dates) and adds two 0/1 columns to df:
#   is_charge_date_holiday - charge_dt is itself a holiday
#   is_flight_date_holiday - flight_dt equals the Monday of a week that
#                            contains a holiday
# NOTE(review): the holiday years are hard-coded to 2022-2025 while an earlier
# filter keeps charge dates from 2021-10-01; confirm the missing years are
# acceptable.
uk_holidays = holidays.UK(years=range(2022, 2026))
holidays_df = pd.DataFrame(list(uk_holidays.items()), columns=['ds', 'holiday'])
additional_holidays = pd.DataFrame([
    {'ds': '2022-04-18', 'holiday': 'Easter Monday'},
    {'ds': '2022-08-29', 'holiday': 'Summer Bank Holiday'},
    {'ds': '2023-04-10', 'holiday': 'Easter Monday'},
    {'ds': '2023-08-28', 'holiday': 'Summer Bank Holiday'},
    {'ds': '2024-04-01', 'holiday': 'Easter Monday'},
    {'ds': '2024-08-26', 'holiday': 'Summer Bank Holiday'},
    {'ds': '2025-04-21', 'holiday': 'Easter Monday'},
    {'ds': '2025-08-25', 'holiday': 'Summer Bank Holiday'},
])
holidays_df = pd.concat([holidays_df, additional_holidays], ignore_index=True)

# Normalise 'ds' to datetime BEFORE de-duplicating: the library rows hold date
# objects and the manual rows hold strings, so the same day never compared
# equal and could survive twice. One row per date is required, otherwise the
# left merge on charge_dt below would duplicate rows of df.
holidays_df['ds'] = pd.to_datetime(holidays_df['ds'])
holidays_df = (
    holidays_df
    .drop_duplicates(subset='ds')
    .sort_values(by='ds')
    .reset_index(drop=True)
)

# validate='many_to_one' makes pandas raise if the holiday side ever contains
# duplicate keys again, instead of silently inflating df.
df = df.merge(
    holidays_df.rename(columns={'ds': 'charge_dt', 'holiday': 'charge_dt_holiday'}),
    how='left',
    on='charge_dt',
    validate='many_to_one',
)

# For the flight side each holiday is moved back to the Monday of its week and
# collapsed to one row per Monday.
# NOTE(review): this only matches when flight_dt itself is a Monday —
# presumably flight_dt is a week-start date; confirm.
holidays_df = holidays_df.rename(columns={'ds': 'flight_dt', 'holiday': 'flight_dt_holiday'})
holidays_df['flight_dt'] = holidays_df['flight_dt'] - pd.to_timedelta(holidays_df['flight_dt'].dt.dayofweek, unit='d')
holidays_df = holidays_df.groupby('flight_dt').agg({'flight_dt_holiday': 'first'}).reset_index()
df = df.merge(holidays_df, how='left', on='flight_dt', validate='many_to_one')

df['is_charge_date_holiday'] = df['charge_dt_holiday'].notnull().astype(int)
df['is_flight_date_holiday'] = df['flight_dt_holiday'].notnull().astype(int)
# The holiday names were only needed to derive the flags.
df = df.drop(['charge_dt_holiday', 'flight_dt_holiday'], axis=1)
df = df.sort_values(by=['charge_dt', 'dtg'], ascending=[True, True])
optimize_df(df)

In [0]:
def lag_array(df, charge_lags, flight_lags, features_config):
    """Append lagged copies of selected columns, looked up by (charge_dt, flight_dt).

    For every pair ``(c_lag, f_lag)`` and every row, the value of each feature
    is read from the row whose ``charge_dt`` is ``c_lag`` days earlier and whose
    ``flight_dt`` is ``f_lag`` days earlier (negative lags look forward). The
    result is stored in a column named ``{prefix}_C{c_lag}F{f_lag}``.

    Args:
        df: DataFrame with datetime columns ``charge_dt`` and ``flight_dt`` plus
            every column named in ``features_config``. The
            ``(charge_dt, flight_dt)`` pairs must be unique, otherwise the
            reindex lookup raises.
        charge_lags: Iterable of charge-date lags in days.
        flight_lags: Iterable of flight-date lags in days.
        features_config: Mapping ``{source column: prefix}`` for the new columns.

    Returns:
        A new DataFrame: the original columns followed by one column per
        (feature, c_lag, f_lag). Missing lag values are filled with -1; NaNs
        in the original columns are left untouched.
    """
    # Lookup table keyed by (charge_dt, flight_dt) holding only the lagged features.
    feature_names = list(features_config)
    df_lookup = df.set_index(['charge_dt', 'flight_dt'])[feature_names]
    all_new_features = [df]

    for c_lag in charge_lags:
        # Depends only on the charge lag, so compute it once per outer iteration.
        target_charge_dts = df['charge_dt'] - pd.to_timedelta(c_lag, unit='d')
        for f_lag in flight_lags:
            target_flight_dts = df['flight_dt'] - pd.to_timedelta(f_lag, unit='d')
            target_index = pd.MultiIndex.from_arrays([target_charge_dts, target_flight_dts])

            # Rows come back in df's row order; keys without a match become NaN.
            lagged_df = df_lookup.reindex(target_index)
            lagged_df.index = df.index

            new_column_names = {
                original_name: f"{prefix}_C{c_lag}F{f_lag}"
                for original_name, prefix in features_config.items()
            }
            # Fill only the lag columns: previously fillna(-1) ran on the whole
            # concatenated frame and also overwrote NaNs in the original columns.
            all_new_features.append(lagged_df.rename(columns=new_column_names).fillna(-1))

    return pd.concat(all_new_features, axis=1)

In [0]:
# Long-Term Lags
# Weekly lag grid: charge-date lags of 7..70 days combined with flight-date
# lags of -14..70 days, applied to 'unt_pre' (prefix 'S') and 'rev_pre'
# (prefix 'R').

features_to_lag = dict(unt_pre='S', rev_pre='R')

flight_lags = list(range(-14, 71, 7))
charge_lags = list(range(7, 71, 7))

df = lag_array(df, charge_lags, flight_lags, features_to_lag)

In [0]:
def plot_correlation_heatmap(df, target_col, feature_prefix, title):
    """Plot the target's correlation with lag features as a charge-lag x flight-lag heatmap.

    Columns are selected by ``feature_prefix`` and must be named
    ``{feature_prefix}C{charge lag}F{flight lag}`` (e.g. ``S_C7F0``). Either lag
    may be negative. Selected columns that do not follow this pattern are left
    out of the heatmap.

    Args:
        df: Frame holding ``target_col`` and the lag feature columns.
        target_col: Column the features are correlated against.
        feature_prefix: Literal prefix that selects the feature columns. It is
            escaped before being used in the regex, so characters such as
            ``(`` or ``.`` are matched literally.
        title: Figure title.

    Raises:
        ValueError: If no column with the prefix matches the lag naming pattern.
    """
    # 1. Select the feature columns; the target is excluded so it can never be
    #    duplicated in the correlation subset when it shares the prefix.
    feature_cols = [
        name for name in df.columns
        if name.startswith(feature_prefix) and name != target_col
    ]

    # 2. Correlate the target with every selected feature
    corr_matrix = df[[target_col] + feature_cols].corr()
    target_corrs = corr_matrix[target_col].drop(target_col)

    # 3. Parse the lags out of each feature name. Matching is anchored at the
    #    start of the name, as the prefix selection above already requires.
    lag_pattern = re.compile(rf'{re.escape(feature_prefix)}C(-?\d+)F(-?\d+)')
    records = []
    for feature_name, correlation in target_corrs.items():
        match = lag_pattern.match(feature_name)
        if match is None:
            continue
        records.append({
            'charge_lag': int(match.group(1)),
            'flight_lag': int(match.group(2)),
            'correlation': correlation,
        })

    if not records:
        raise ValueError(
            f"No columns matching '{feature_prefix}C<charge lag>F<flight lag>' found in df"
        )

    # 4. Pivot into the charge-lag (rows) x flight-lag (columns) matrix
    corr_pivot = pd.DataFrame(records).pivot_table(
        index='charge_lag', columns='flight_lag', values='correlation'
    )

    # 5. Plot the heatmap
    fig, ax = plt.subplots(figsize=(16, 10))
    sns.heatmap(corr_pivot, annot=True, fmt=".2f", cmap='viridis', linewidths=.1, ax=ax)
    ax.set_title(title, fontsize=16)
    ax.set_xlabel('Flight Date Lag')
    ax.set_ylabel('Charge Date Lag')
    plt.show()

In [0]:
# Heatmap of unt_pre against the long-term 'S_' lag columns
plot_correlation_heatmap(
    df=df,
    target_col='unt_pre',
    feature_prefix='S_',
    title='Correlation of Target (unt_pre) with Sales Lag Features',
)

In [0]:
# Keep every non-lag column; of the long-term lag columns keep only those whose
# charge lag equals the flight lag (e.g. S_C7F7, R_C14F14).
#
# The loop variable is deliberately not named `col`: at cell scope that would
# overwrite the `col` function imported from pyspark.sql.functions and break
# any Spark cell that is (re-)run afterwards.
cols_to_keep = []

for column_name in df.columns:
    match = re.search(r'[SR]_C(\d+)F(-?\d+)', column_name)
    # No match -> not a long-term lag column -> always kept.
    if match is None or match.group(1) == match.group(2):
        cols_to_keep.append(column_name)

df = df[cols_to_keep]

In [0]:
# Short-Term Lags

# Source column -> prefix used in the generated lag column names
features_to_lag = dict(unt_pre='s', rev_pre='r')

# Daily steps from 0 to 8 days on both axes
flight_lags = list(range(9))
charge_lags = list(range(9))

df = lag_array(
    df,
    charge_lags=charge_lags,
    flight_lags=flight_lags,
    features_config=features_to_lag,
)

In [0]:

# Heatmap of unt_pre against the short-term 's_' lag columns
plot_correlation_heatmap(
    df=df,
    target_col='unt_pre',
    feature_prefix='s_',
    title='Correlation of Target (unt_pre) with Sales Lag Features',
)

In [0]:
# Keep every non-lag column; of the short-term lag columns keep only those with
# flight lag 0 and a charge lag of 1..7 days (s_C1F0 ... s_C7F0, r_C1F0 ... r_C7F0).
#
# The loop variable is deliberately not named `col`: at cell scope that would
# overwrite the `col` function imported from pyspark.sql.functions and break
# any Spark cell that is (re-)run afterwards.
cols_to_keep = []

for column_name in df.columns:
    match = re.search(r'[sr]_C(\d+)F(-?\d+)', column_name)
    if match is None:
        # Not a short-term lag column -> always kept.
        cols_to_keep.append(column_name)
        continue

    charge_lag = int(match.group(1))
    flight_lag = int(match.group(2))
    if flight_lag == 0 and 0 < charge_lag < 8:
        cols_to_keep.append(column_name)

df = df[cols_to_keep]
# NOTE(review): the return value is discarded — presumably optimize_df modifies df in place; confirm.
optimize_df(df)

In [0]:
# Keep rows charged on or after 2022-01-01 whose dtg lies in (0, 168]
df = df[
    (df['charge_dt'] >= '2022-01-01')
    & df['dtg'].gt(0)
    & df['dtg'].le(168)
]

In [0]:
# Snapshot taken before any encoding or scaling is applied to df
original_df = df.copy()

# Lag columns are recognised by the prefixes produced by the lag cells
lag_prefixes = ('S_', 'R_', 's_', 'r_')
lag_cols = [name for name in df.columns if name.startswith(lag_prefixes)]

# Column groups used by the encoding / scaling steps
index_cols = ['charge_dt', 'flight_dt']
normal_num_cols = ['dtg', 'ty_capacity', *lag_cols]
standard_num_cols = ['charge_year', 'flight_year', 'flight_day', 'charge_day']
cyclic_cols = [
    f'{date_kind}_{unit}'
    for date_kind in ('flight', 'charge')
    for unit in ('dow', 'dom', 'doy', 'wom', 'woy', 'moy')
]

def encode_cyclic_features(df, cols, periods=None):
    """Replace each cyclic column with a sin/cos pair, modifying ``df`` in place.

    Each column is coerced to numeric (unparseable values become NaN), mapped to
    the angle ``2 * pi * value / period`` and stored as ``<col>_sin`` and
    ``<col>_cos`` in float16. The original column is then dropped.

    Args:
        df: Frame to encode. It is mutated and also returned.
        cols: Names of the columns to encode.
        periods: Optional mapping of column name -> cycle length (e.g.
            ``{'flight_dow': 7}``). Columns missing from the mapping fall back
            to the column's observed maximum, which is the original behaviour.
            NOTE(review): that fallback makes the encoding depend on the sample,
            and for zero-based fields (0..6) it maps the first and last value to
            the same angle — pass explicit periods where the cycle is known.

    Returns:
        The same ``df`` object, with the encoded columns appended.
    """
    explicit_periods = periods or {}
    for name in cols:
        df[name] = pd.to_numeric(df[name], errors='coerce')
        period = explicit_periods[name] if name in explicit_periods else df[name].max()
        angle = 2 * np.pi * df[name] / period
        df[name + '_sin'] = np.sin(angle).astype('float16')
        df[name + '_cos'] = np.cos(angle).astype('float16')
        df.drop(name, axis=1, inplace=True)
    return df

# Replace every column listed in cyclic_cols with its <col>_sin / <col>_cos pair.
df = encode_cyclic_features(df, cyclic_cols)

# Min-max scale the columns in standard_num_cols.
# NOTE(review): the scaler is fitted on every row of df, i.e. before the
# time-series CV split further down, so validation-fold rows contribute to the
# fitted min/max — confirm this is acceptable.
min_max_scaler = MinMaxScaler()
df[standard_num_cols] = min_max_scaler.fit_transform(df[standard_num_cols])

# NOTE(review): the return value is discarded — presumably optimize_df modifies df in place; confirm.
optimize_df(df)

In [0]:
# creating targets

# Source column -> prefix of the generated target columns
features_to_lag = dict([('unt_pre', 's(target)'), ('rev_pre', 'r(target)')])

# Same flight date; charge lags 0, -1, ..., -6 (negative lags read rows with a later charge_dt)
flight_lags = [0]
charge_lags = [-days_ahead for days_ahead in range(7)]

df = lag_array(df, charge_lags, flight_lags, features_to_lag).drop(columns=['unt_pre', 'rev_pre'])
optimize_df(df)

In [0]:
# Restore the index columns from the pre-encoding snapshot, then move them into the index.
# The loop variable is deliberately not named `col`: at cell scope that would
# overwrite the `col` function imported from pyspark.sql.functions.
for index_col in index_cols:
    df[index_col] = original_df[index_col]
df.set_index(index_cols, inplace=True)

In [0]:
# defining targets + features for each horizon

# Target columns created earlier: s/r(target)_C0F0, s/r(target)_C-1F0, ..., s/r(target)_C-6F0
all_target_cols = [
    f'{kind}(target)_C{-offset}F0'
    for offset in range(0, 7)
    for kind in ('s', 'r')
]

# Everything that is not a target is a candidate feature
base_features = [name for name in df.columns if name not in all_target_cols]

# Dictionaries to hold the data for each horizon
X_data_dict = {}
y_data_dict = {}

for h in range(1, 8):
    horizon_key = f'h{h}'

    # Short-term lags 1..h-1 are withheld from the features of horizon h
    # NOTE(review): the target for horizon h is read at charge lag 1-h (a later
    # charge date) while these lags are earlier than the row's charge date —
    # confirm the intended forecast origin.
    withheld = [
        f'{kind}_C{j}F0'
        for j in range(1, h)
        for kind in ('s', 'r')
    ]
    kept_features = [name for name in base_features if name not in withheld]

    # Store features and both targets for this horizon
    X_data_dict[horizon_key] = df[kept_features]
    y_data_dict[f'sales_{horizon_key}'] = df[f's(target)_C{1-h}F0']
    y_data_dict[f'revenue_{horizon_key}'] = df[f'r(target)_C{1-h}F0']

In [0]:
import lightgbm as lgb

# Master dictionary to store all scores
all_horizon_scores = {}
unique_dates = df.index.get_level_values('charge_dt').unique().sort_values()

# Outer loop: Iterate through each forecast horizon
for h in range(1, 8):    

    tscv = TimeSeriesSplit(n_splits=12, test_size=h)

    X = X_data_dict[f'h{h}']
    y_sales = y_data_dict[f'sales_h{h}']
    y_revenue = y_data_dict[f'revenue_h{h}']
    valid_sales_mask = (y_sales != -1)
    valid_revenue_mask = (y_revenue != -1)
    valid_mask = valid_sales_mask & valid_revenue_mask
    X_clean = X[valid_mask]
    y_sales_clean = y_sales[valid_mask]
    y_revenue_clean = y_revenue[valid_mask]
    unique_dates_clean = X_clean.index.get_level_values('charge_dt').unique().sort_values()

    plt.style.use('ggplot')
    fig, axs = plt.subplots(4, 3, figsize=(16, 8), sharex=True)
    fig.suptitle(f"CV Folds for Horizon h={h}", fontsize=16)

    fold = 0
    
    sales_scores = []
    revenue_scores = []
    all_y_test_sales = []
    all_y_pred_sales_series = []
    all_y_test_revenue = []
    all_y_pred_revenue_series = []

    # Inner loop: Iterate through each CV fold
    for train_index, val_index in tscv.split(unique_dates_clean):
        train_dates = unique_dates_clean[train_index]
        val_dates = unique_dates_clean[val_index]

        train_mask = X_clean.index.get_level_values('charge_dt').isin(train_dates)
        val_mask = X_clean.index.get_level_values('charge_dt').isin(val_dates)
        
        X_train, X_test = X_clean.loc[train_mask], X_clean.loc[val_mask]
        
        y_train_sales, y_test_sales = y_sales_clean.loc[train_mask], y_sales_clean.loc[val_mask]
        y_train_revenue, y_test_revenue = y_revenue_clean.loc[train_mask], y_revenue_clean.loc[val_mask]

        ax = axs[fold // 3, fold % 3]
        y_train_sales.groupby('charge_dt').sum().plot(ax=ax, label='Train Sales')
        y_test_sales.groupby('charge_dt').sum().plot(ax=ax, label='Val Sales')
        ax.axvline(y_test_sales.index.get_level_values('charge_dt').min(), linestyle='--', color='black')
        ax.set_title(f'Fold {fold+1}')
        end_date = val_dates.max()
        start_date = end_date - pd.DateOffset(months=6)
        ax.set_xlim(start_date, end_date)
        ax.set_title(f'Fold {fold+1}')

        # --- Feature Scaling ---
        #std_scaler = StandardScaler()
        #current_normal_cols = [col for col in normal_num_cols if col in X_train.columns]
        #X_train[current_normal_cols] = std_scaler.fit_transform(X_train[current_normal_cols])
        #X_test[current_normal_cols] = std_scaler.transform(X_test[current_normal_cols])

        # --- Train DUAL models ---
        
        # Sales Model
        sales_model = lgb.LGBMRegressor(max_depth=5, learning_rate=0.5, n_estimators=2000, objective='huber', verbose=-1)
        sales_model.fit(X_train, y_train_sales, eval_set=[(X_test, y_test_sales)], callbacks=[lgb.early_stopping(stopping_rounds=100), lgb.log_evaluation(period=0)])
        
        # Revenue Model
        revenue_model = lgb.LGBMRegressor(max_depth=7, learning_rate=50, n_estimators=2000, objective='huber', verbose=-1)
        revenue_model.fit(X_train, y_train_revenue, eval_set=[(X_test, y_test_revenue)], callbacks=[lgb.early_stopping(stopping_rounds=100), lgb.log_evaluation(period=0)])

        # --- Predict and Score ---
        
        y_pred_sales = sales_model.predict(X_test)
        y_pred_revenue = revenue_model.predict(X_test)
        
        sales_rmse = np.sqrt(mean_squared_error(y_test_sales, y_pred_sales))
        revenue_rmse = np.sqrt(mean_squared_error(y_test_revenue, y_pred_revenue))
        
        sales_scores.append(sales_rmse)
        revenue_scores.append(revenue_rmse)

        y_pred_sales_series = pd.Series(y_pred_sales, index=X_test.index, name='sales_pred')
        y_pred_revenue_series = pd.Series(y_pred_revenue, index=X_test.index, name='revenue_pred')
        
        all_y_test_sales.append(y_test_sales)
        all_y_pred_sales_series.append(y_pred_sales_series)
        
        all_y_test_revenue.append(y_test_revenue)
        all_y_pred_revenue_series.append(y_pred_revenue_series)
        
        fold += 1

    plt.tight_layout()
    plt.show()

    # --- Report and Store Scores for this Horizon ---
    mean_sales_rmse = np.mean(sales_scores)
    mean_revenue_rmse = np.mean(revenue_scores)
    
    print(f"--- Results for h={h} ---")
    print(f"Mean Sales RMSE: {mean_sales_rmse:.4f}")
    print(f"Mean Revenue RMSE: {mean_revenue_rmse:.4f}")
    
    all_horizon_scores[f'h{h}'] = {'sales_rmse_mean': mean_sales_rmse, 'revenue_rmse_mean': mean_revenue_rmse, 'sales_scores_fold': sales_scores, 'revenue_scores_fold': revenue_scores}

In [0]:
print(f'individual scores: {sales_scores}')
print(f'individual scores: {revenue_scores}')

In [0]:

for h in range(1,8):
    # 1. Sales Plot
    full_y_test_sales = pd.concat(all_y_test_sales)
    full_y_pred_sales = pd.concat(all_y_pred_sales_series)

    y_test_sales_total = full_y_test_sales.groupby(level='charge_dt').sum()
    y_pred_sales_total = full_y_pred_sales.groupby(level='charge_dt').sum()

    plt.figure(figsize=(15, 6))
    plt.plot(y_test_sales_total.index, y_test_sales_total, label='Actual Daily Sales', marker='.', linestyle='-')
    plt.plot(y_pred_sales_total.index, y_pred_sales_total, label='Predicted Daily Sales', marker='.', linestyle='--')
    plt.title(f'Sales: Actual vs. Predicted Daily Totals (All Folds) - Horizon h={h}')
    plt.xlabel('Date')
    plt.ylabel('Total Sales')
    plt.legend()
    plt.grid(True)
    plt.show()

    # 2. Revenue Plot
    full_y_test_revenue = pd.concat(all_y_test_revenue)
    full_y_pred_revenue = pd.concat(all_y_pred_revenue_series)

    y_test_revenue_total = full_y_test_revenue.groupby(level='charge_dt').sum()
    y_pred_revenue_total = full_y_pred_revenue.groupby(level='charge_dt').sum()

    plt.figure(figsize=(15, 6))
    plt.plot(y_test_revenue_total.index, y_test_revenue_total, label='Actual Daily Revenue', marker='.', linestyle='-')
    plt.plot(y_pred_revenue_total.index, y_pred_revenue_total, label='Predicted Daily Revenue', marker='.', linestyle='--')
    plt.title(f'Revenue: Actual vs. Predicted Daily Totals (All Folds) - Horizon h={h}')
    plt.xlabel('Date')
    plt.ylabel('Total Revenue')
    plt.legend()
    plt.grid(True)
    plt.show()