In [0]:
#importing data

import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LinearRegression
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import pmdarima as pm
import xgboost as xgb
from xgboost import XGBRegressor
from prophet import Prophet
from pyspark.sql.functions import to_date, to_timestamp, col, last, when, lit, dayofweek, dayofmonth, dayofyear, month, weekofyear, current_date, date_sub, datediff, sum as spark_sum
from pyspark.sql.window import Window
from pyspark.sql import functions as F
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import joblib
import holidays
import mlflow

# Switch MLflow autologging off for everything that follows.
mlflow.autolog(disable=True)

# Transaction-level rows and per-flight dimensions, joined on flightkey.
df = spark.read.table('data_experience_commercial.cbt_1423_rtsuite.master').select('charge_dt','dtg','chargeproduct','unt_pre','flightkey','flight_dt','rev_pre','channel','unt_net')
dmh = spark.read.table('data_experience_commercial.cbt_0923_segmentfinder.dimensions_history').select('flightkey','onsale_dt','ty_capacity','routetype','region','route')
df = df.join(dmh, on='flightkey', how='left')
# Keep ticket rows with non-negative dtg on domestic routes in the two UK regions.
df = df[(df['chargeproduct']=='Ticket') & (df['dtg'] >= 0) & (df['region'].isin(['UK-London','UK-Regions'])) & (df['routetype'] == 'Domestic')]
# Drop rows whose dtg falls in the first 25 days after on-sale, and anything charged before 2019.
df = df.filter((col('dtg') < (datediff(col('flight_dt'), col('onsale_dt')) - 25)) & (col('charge_dt') >= '2019-01-01'))
# Running sum of unt_net per flight, ordered from the highest dtg downwards
# (relies on Spark's default window frame for an ordered window).
window_spec = Window.partitionBy('flightkey').orderBy(col('dtg').desc())
df = df.withColumn('pax_net', spark_sum('unt_net').over(window_spec))
df = df.withColumn('load_factor', col('pax_net')/col('ty_capacity'))
# Clamp negative units and revenue to zero; yield is revenue per unit, or 0 when no units were sold.
df = df.withColumn('unt_pre', F.when(col('unt_pre') < 0, 0).otherwise(col('unt_pre')))
df = df.withColumn('rev_pre', F.when(col('rev_pre') < 0, 0).otherwise(col('rev_pre')))
df = df.withColumn('yield', F.when(col('unt_pre') == 0, 0).otherwise(col('rev_pre')/col('unt_pre')))
# Only the last 364 days before departure are kept.
df = df[df['dtg'] <= 364]
#df = df.withColumn('channel_index', when(col('channel') == 'Web/App', 1).otherwise(0))
# One row per route / charge_dt / dtg: units and capacity are summed, load factor and yield averaged.
# NOTE(review): ty_capacity is summed over every contributing row; if a flight
# contributes more than one row per key its capacity is counted repeatedly - confirm.
df = df.groupby('route','charge_dt','dtg').agg(F.sum('unt_pre').alias('unt_pre'), F.sum('ty_capacity').alias('ty_capacity'), F.avg('load_factor').alias('load_factor'), F.avg('yield').alias('yield')).orderBy('charge_dt','dtg')
# From here on `df` is a pandas DataFrame held on the driver.
df = df.toPandas()
df.info()

In [0]:
def optimize_df(df):
    """Reduce a DataFrame's memory footprint in place and print the saving.

    Per column, first match wins:
      * label contains ``'_dt'``  -> converted with ``pd.to_datetime``
      * ``object`` dtype          -> ``category``
      * float dtype               -> downcast, but never below ``float32``
      * int dtype                 -> smallest integer dtype that fits

    Floats are deliberately not reduced to ``float16``: that type overflows
    to ``inf`` above 65504 and keeps only about three significant digits,
    which corrupts summed sales, capacities and proportions.

    Args:
        df: DataFrame to optimise. It is modified in place.

    Returns:
        The same DataFrame object, for convenience.
    """
    start_mem = df.memory_usage(deep=True).sum() / 1024**2
    print(f"Initial memory usage: {start_mem:.2f} MB")

    for col in df.columns:
        dtype = df[col].dtype

        # str() so non-string column labels do not raise on the substring test.
        if '_dt' in str(col):
            df[col] = pd.to_datetime(df[col])

        elif dtype == 'object':
            df[col] = df[col].astype('category')

        elif 'float' in str(dtype):
            if dtype == np.float16:
                # Widen columns left behind by earlier float16 casts so later
                # arithmetic is at least safe (lost precision cannot be recovered).
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = pd.to_numeric(df[col], downcast='float')

        elif 'int' in str(dtype):
            df[col] = pd.to_numeric(df[col], downcast='integer')

    end_mem = df.memory_usage(deep=True).sum() / 1024**2
    # Guard the percentage against a zero-sized starting frame.
    reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print(f"Final memory usage: {end_mem:.2f} MB ({reduction:.2f}% reduction)")

    return df

# optimize_df modifies df in place, so the returned frame is not reassigned.
optimize_df(df)
df.info()

In [0]:
# Simple top-to-bottom diagram of the forecasting hierarchy levels.
layers = ['Network', 'Region', 'RouteType', 'Route']
fig, ax = plt.subplots(figsize=(8, 12))
_box = {'boxstyle':'square,pad=0.5', 'facecolor':'white'}

for i, layer in enumerate(layers):
    # Each level sits 0.2 lower than the previous one, with a connector drawn beneath it.
    ax.text(0.5, (0.8 - i * 0.2), layer, fontsize=18, ha='center', va='center', bbox=_box)
    _connector_y = [(0.75 - i * 0.2), (0.65 - i * 0.2)]
    ax.plot([0.5, 0.5], _connector_y)

ax.axis('off')
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)

plt.show()

In [0]:
# Units sold per charge date, summed over every route and dtg.
plt.style.use('ggplot')
total_sales = (
    df.groupby('charge_dt')['unt_pre']
    .sum()
    .reset_index()
)
total_sales.plot(x='charge_dt', y='unt_pre', style='-', figsize=(20,5), title='sales by charge date')
plt.show()

In [0]:
# From here on only charge dates from 2022-01-01 onwards are kept; this rebinds
# `df`, so re-running earlier cells afterwards sees the reduced frame.
df = df[df['charge_dt'] >= '2022-01-01']
total_sales_by_charge_dt = df.groupby('charge_dt')['unt_pre'].sum().reset_index()
plt.style.use('ggplot')
total_sales_by_charge_dt.plot(style='-', figsize=(20,5), title = 'sales by charge date', y='unt_pre', x='charge_dt')
plt.show()

In [0]:
# Units sold per days-to-go value, summed over routes and charge dates.
plt.style.use('ggplot')
total_sales_by_dtg = (
    df.groupby('dtg')['unt_pre']
    .sum()
    .reset_index()
)
total_sales_by_dtg.plot(x='dtg', y='unt_pre', style='-', figsize=(20,5), title='sales by dtg since 2022')
plt.show()

In [0]:
# Animated booking curve: one frame per charge date, units by dtg.
df_dynamic_plot = df.reset_index()
# Charge dates are turned into strings before being used as animation frames.
df_dynamic_plot['charge_dt'] = df_dynamic_plot['charge_dt'].astype(str)
df_dynamic_plot = df_dynamic_plot.groupby(['dtg', 'charge_dt'])['unt_pre'].sum().reset_index()
fig = px.line(df_dynamic_plot, x='dtg', y='unt_pre', animation_frame='charge_dt', title='Sales by DTG with charge date variations')
fig.update_layout(xaxis_title='Days To Go', yaxis_title='Sales', legend_title='Charge Date', height=900, width=1400)

fig.show()

In [0]:
#ACF
# Autocorrelation and partial autocorrelation of daily total units, up to 56 lags.

df_ACF = df.groupby('charge_dt')['unt_pre'].sum()
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(18, 8))

plot_acf(df_ACF, ax=ax1, lags=56)
ax1.set_title('Autocorrelation Function (ACF)')

plot_pacf(df_ACF, ax=ax2, lags=56, method='ywm') 
ax2.set_title('Partial Autocorrelation Function (PACF)')

plt.tight_layout()
plt.show()

In [0]:
# Lag Features
# Lags (passed to generate_lag_features) and the columns they are applied to.

lag_days = [7, 14, 21, 28, 364]
cols_to_lag = ['unt_pre', 'load_factor', 'yield']

def generate_lag_features(df, columns, lag_days):
    """Add calendar-aligned lag columns for every (route, dtg) series.

    For each column in ``columns`` and each lag in ``lag_days`` a column named
    ``{column}_lag{lag}`` is added, holding the value the same route/dtg had
    exactly ``lag`` days earlier by ``charge_dt``. Dates with no matching
    earlier row get 0.

    Lags are looked up by date rather than by row position, because a
    positional ``shift`` silently returns the wrong day whenever a route/dtg
    series has missing charge dates.

    Args:
        df: frame with ``route``, ``dtg``, ``charge_dt`` and the columns to
            lag. Must hold at most one row per (route, dtg, charge_dt).
            The input is not modified.
        columns: names of the columns to lag.
        lag_days: lag sizes in days.

    Returns:
        A new DataFrame sorted by route, dtg, charge_dt with a fresh integer
        index; the key columns come first, then the remaining input columns,
        then the lag columns grouped by source column.

    Raises:
        pandas.errors.MergeError: if (route, dtg, charge_dt) is not unique.
    """
    keys = ['route', 'dtg', 'charge_dt']
    columns = list(columns)
    lag_days = list(lag_days)
    lag_cols = [f'{col}_lag{lag}' for col in columns for lag in lag_days]

    out = df.assign(charge_dt=pd.to_datetime(df['charge_dt']))
    # Drop lag columns left over from a previous run so re-running is safe.
    out = out.drop(columns=[c for c in lag_cols if c in out.columns])
    out = out.sort_values(keys).reset_index(drop=True)
    other_cols = [c for c in out.columns if c not in keys]

    source = out[keys + columns]
    for lag in lag_days:
        # Moving the source dates forward by `lag` days makes an equality join
        # on charge_dt return the value observed `lag` days earlier.
        shifted = source.rename(columns={col: f'{col}_lag{lag}' for col in columns})
        shifted['charge_dt'] = shifted['charge_dt'] + pd.to_timedelta(lag, unit='D')
        out = out.merge(shifted, on=keys, how='left', validate='many_to_one')

    if lag_cols:
        out[lag_cols] = out[lag_cols].fillna(0)

    return out[keys + other_cols + lag_cols]

# Add the lag columns, then shrink dtypes again (optimize_df works in place).
df = generate_lag_features(df, cols_to_lag, lag_days)
optimize_df(df)

In [0]:
# Spot check: July 2025 rows for route LTNEDI at dtg 10, raw values next to their lags.
df[(df['route'] == 'LTNEDI') & (df['dtg'] == 10) & (df['charge_dt'] >= '2025-07-01') & (df['charge_dt'] <= '2025-07-31')][['dtg','route','charge_dt', 'unt_pre', 'unt_pre_lag7', 'unt_pre_lag14', 'unt_pre_lag21', 'unt_pre_lag28', 'load_factor', 'load_factor_lag7', 'load_factor_lag14', 'load_factor_lag21', 'load_factor_lag28', 'yield', 'yield_lag7', 'yield_lag14', 'yield_lag21', 'yield_lag28']].head(31)

In [0]:
# Repeats the in-place dtype optimisation already run at the end of the lag cell.
optimize_df(df)
df.info()

In [0]:
# Seasonality Analysis

def create_features(df):
    """Add flight-date and charge-date calendar features to ``df`` in place.

    ``flight_dt`` is derived as ``charge_dt`` plus ``dtg`` days. For both
    ``flight_dt`` and ``charge_dt`` the month, day of week, day of month,
    day of year and year are added as integer columns named
    ``{flight|charge}_{month|dow|dom|doy|year}``.

    Args:
        df: frame with a ``charge_dt`` column (datetime, or anything
            ``pd.to_datetime`` accepts) and an integer-like ``dtg`` column.

    Returns:
        The same DataFrame object, to allow chaining.
    """
    # Convert first: adding a timedelta to unparsed string dates would raise.
    df['charge_dt'] = pd.to_datetime(df['charge_dt'])
    df['flight_dt'] = df['charge_dt'] + pd.to_timedelta(df['dtg'], unit='D')

    for prefix in ('flight', 'charge'):
        dates = df[f'{prefix}_dt'].dt
        df[f'{prefix}_month'] = dates.month.astype(int)
        df[f'{prefix}_dow'] = dates.dayofweek.astype(int)
        df[f'{prefix}_dom'] = dates.day.astype(int)
        df[f'{prefix}_doy'] = dates.dayofyear.astype(int)
        df[f'{prefix}_year'] = dates.year.astype(int)

    return df

# create_features adds its columns to df in place.
create_features(df)
df.tail()

In [0]:
# Downcast the newly added calendar columns (in place).
optimize_df(df)
df.info()

In [0]:
# Total units by month, shown separately for the flight month and the charge month.
df_melt = df.melt(id_vars='unt_pre', value_vars=['flight_month', 'charge_month'], var_name='month_type', value_name='month')
# Despite the name, this holds sums rather than means.
mean_sales = df_melt.groupby(['month', 'month_type'])['unt_pre'].sum().reset_index()

plt.figure(figsize=(20, 8))
sns.barplot(data=mean_sales, x='month', y='unt_pre', hue='month_type')
# NOTE(review): the style is set after the bars are drawn, so it only matters for later figures.
plt.style.use('ggplot')
plt.title('Historic Sales by Month since 2022')
plt.xlabel('Month')
plt.ylabel('Total Sales')
plt.show()

In [0]:
# Total units by day of week, shown separately for the flight day and the charge day.
df_melt = df.melt(id_vars='unt_pre', value_vars=['flight_dow', 'charge_dow'], var_name='dow_type', value_name='dow')
# Despite the name, this holds sums rather than means.
mean_sales = df_melt.groupby(['dow', 'dow_type'])['unt_pre'].sum().reset_index()
# Set the style before the figure is created so it applies to this plot.
plt.style.use('ggplot')
plt.figure(figsize=(15, 8))
sns.barplot(data=mean_sales, x='dow', y='unt_pre', hue='dow_type')
plt.title('Historic Sales by DoW since 2022')
plt.xlabel('DoW')
plt.ylabel('Total Sales')
# plt.show() returns None, so wrapping it in display() only displayed that None.
plt.show()

In [0]:
# Daily total units for January 2024.
# NOTE(review): both comparisons are strict, so rows stamped exactly 2024-01-01
# or 2024-01-31 are left out - confirm that is intended.
df[(df['charge_dt'] > '2024-01-01') & (df['charge_dt'] < '2024-01-31')].groupby('charge_dt')['unt_pre'].sum().plot(figsize=(20,5), title = 'sales by charge date (Jan24)', y='unt_pre', linewidth=10)
plt.style.use('ggplot')
plt.show()

In [0]:
# Mean units and mean yield per charge day-of-month, drawn on twin y-axes.
mean_sales = df.groupby('charge_dom')['unt_pre'].mean().reset_index()
mean_yield = df.groupby('charge_dom')['yield'].mean().reset_index()

fig, ax1 = plt.subplots(figsize=(15, 8))
plt.style.use('ggplot')

# Left axis: sales.
ax1.plot(mean_sales['charge_dom'], mean_sales['unt_pre'], marker='o', linewidth=12, color='tab:red', label='Mean Sales')
ax1.set_xlabel('Charge Day of Month')
ax1.set_ylabel('Mean Sales', color='tab:red')
ax1.tick_params(axis='y', labelcolor='tab:red')

# Right axis: yield, sharing the same x-axis.
ax2 = ax1.twinx()
ax2.plot(mean_yield['charge_dom'], mean_yield['yield'], marker='s', linewidth=12, color='tab:blue', label='Mean Yield')
ax2.set_ylabel('Mean Yield', color='tab:blue')
ax2.tick_params(axis='y', labelcolor='tab:blue')

fig.suptitle('Mean Sales and Yield by Charge Day of Month (2022 - ToDate)')
# A figure-level legend collects the labelled lines from both axes.
fig.legend(loc='upper right', bbox_to_anchor=(0.9, 0.9))
plt.tight_layout()
plt.show()

In [0]:
# UK holidays for 2022-2025 inclusive (the range end is exclusive), as a ds / holiday table.
# NOTE(review): flight dates can run up to 364 days past the charge date, so
# flights beyond 2025 have no holiday coverage - confirm the year range.
uk_holidays = holidays.UK(years=range(2022, 2026))
holidays_df = pd.DataFrame([(date, name) for date, name in uk_holidays.items()], columns=['ds', 'holiday'])
holidays_df.sort_values(by='ds', inplace=True)
holidays_df

In [0]:
# Holidays added by hand on top of the library-provided list.
additional_holidays = pd.DataFrame([
    {'ds': '2022-04-18', 'holiday': 'Easter Monday'},
    {'ds': '2022-08-29', 'holiday': 'Summer Bank Holiday'},
    {'ds': '2023-04-10', 'holiday': 'Easter Monday'},
    {'ds': '2023-08-28', 'holiday': 'Summer Bank Holiday'},
    {'ds': '2024-04-01', 'holiday': 'Easter Monday'},
    {'ds': '2024-08-26', 'holiday': 'Summer Bank Holiday'},
    {'ds': '2025-04-21', 'holiday': 'Easter Monday'},
    {'ds': '2025-08-25', 'holiday': 'Summer Bank Holiday'},
])
holidays_df = pd.concat([holidays_df, additional_holidays], ignore_index=True)
# Normalise `ds` BEFORE de-duplicating: the library rows hold date objects and
# the manual rows hold strings, so the same day never compared equal otherwise
# (and re-running this cell kept appending copies).
holidays_df['ds'] = pd.to_datetime(holidays_df['ds'])
# Keep one row per date (the first one seen after a stable sort), so joins or
# lookups keyed on `ds` cannot multiply rows downstream.
holidays_df = (
    holidays_df
    .sort_values(by='ds', kind='stable')
    .drop_duplicates(subset='ds', keep='first')
    .reset_index(drop=True)
)
holidays_df
   

In [0]:
# Flag rows whose charge date / flight date falls on a holiday.
holidays_df["ds"] = pd.to_datetime(holidays_df["ds"])
# Membership tests are used instead of left merges: a merge duplicates every
# matching row of df whenever a date occurs more than once in holidays_df,
# which would silently inflate the sales totals for that date.
df['is_charge_date_holiday'] = df['charge_dt'].isin(holidays_df['ds']).astype(int)
df['is_flight_date_holiday'] = df['flight_dt'].isin(holidays_df['ds']).astype(int)
df = df.sort_values(by=['charge_dt', 'dtg'], ascending=[True, True])
# charge_dt becomes the index from here on.
df.set_index('charge_dt', inplace=True)
df.head()

In [0]:
#Feature Engineering
# Snapshot taken before columns are dropped and encoded; the SARIMAX cell reads from it.
original_df = df.copy()

# Same-day load factor and yield are removed (their lagged versions stay), as is flight_dt.
df.drop(['load_factor'], axis=1, inplace=True)
df.drop(['yield'], axis=1, inplace=True)
df.drop(['flight_dt'], axis=1, inplace=True)

# Calendar columns to be sin/cos encoded, and numeric columns to be standardised.
cyclic_cols=['flight_dom', 'flight_doy', 'charge_dom', 'charge_doy', 'flight_month', 'charge_month', 'flight_dow', 'charge_dow']
num_cols=['charge_year','flight_year']

def encode_cyclic_features(df, cyclic_cols, periods=None):
    """Replace cyclic integer columns with sin/cos encodings, in place.

    Each column ``c`` in ``cyclic_cols`` is replaced by ``c_sin`` and
    ``c_cos`` computed as ``sin/cos(2 * pi * value / period)``.

    The period is resolved in this order:
      1. ``periods[c]`` if given;
      2. a fixed period chosen by the column name's last ``_``-separated
         token: ``dow`` -> 7, ``month`` -> 12, ``dom`` -> 31, ``doy`` -> 366;
      3. the column's observed maximum (previous behaviour) otherwise.

    Fixed periods matter because the observed maximum is not the cycle
    length for zero-based features: day of week runs 0..6, so dividing by 6
    encodes Monday and Sunday identically.

    Args:
        df: DataFrame to modify in place.
        cyclic_cols: names of the columns to encode.
        periods: optional mapping of column name to cycle length.

    Returns:
        The same DataFrame object.
    """
    default_periods = {'dow': 7, 'month': 12, 'dom': 31, 'doy': 366}
    periods = periods or {}

    for col in cyclic_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')

        suffix = str(col).rsplit('_', 1)[-1]
        period = periods.get(col, default_periods.get(suffix))
        if period is None:
            # Unknown feature: fall back to the data-driven period.
            period = df[col].max()

        angle = 2 * np.pi * df[col] / period
        df[col + '_sin'] = np.sin(angle)
        df[col + '_cos'] = np.cos(angle)
        df.drop(col, axis=1, inplace=True)
    return df

def scale_num_cols(df, num_cols):
    """Standardise the given numeric columns of ``df`` in place.

    All columns are scaled together by one ``StandardScaler`` fit. The
    earlier per-column loop refitted and rescaled the whole column set on
    every iteration without changing the result.

    NOTE(review): the scaler is fitted on whatever rows ``df`` holds and is
    then discarded; if this runs before the train/test split, test rows
    influence the scaling.

    Args:
        df: DataFrame to modify in place.
        num_cols: names of the columns to standardise; may be empty.

    Returns:
        The same DataFrame object.
    """
    num_cols = list(num_cols)
    if not num_cols:
        # Nothing to scale; fitting a scaler on zero columns would raise.
        return df

    df[num_cols] = StandardScaler().fit_transform(df[num_cols])
    return df

# Both helpers modify df in place, so their return values are not reassigned.
encode_cyclic_features(df, cyclic_cols)
scale_num_cols(df, num_cols)
df.head()

In [0]:
# Downcast the new sin/cos and scaled columns (in place).
optimize_df(df)
df.info()

In [0]:
#capacity proportions by route for disaggregation 
# Each route's share of total capacity per (charge_dt, dtg); used later to
# split the top-down forecast back out to routes.
df_cap = df.reset_index()
total_daily_capacity = df_cap.groupby(['charge_dt','dtg'])['ty_capacity'].sum().reset_index().rename(columns={'ty_capacity':'capacity_total'})
route_daily_capacity = df_cap[['charge_dt','dtg','route','ty_capacity']]
capacity_proportions = route_daily_capacity.merge(total_daily_capacity, on=['charge_dt','dtg'], how='left')
# NOTE(review): a zero capacity_total gives NaN or inf here - confirm it cannot occur.
capacity_proportions['capacity_proportion'] = capacity_proportions['ty_capacity'] / capacity_proportions['capacity_total']
capacity_proportions = capacity_proportions[['dtg','charge_dt','route','capacity_proportion']]
optimize_df(capacity_proportions)

In [0]:
# SARIMA
# charge_dt goes back to being a regular column for the modelling cells below.
df = df.reset_index()
df_SARIMA = df.groupby('charge_dt')['unt_pre'].sum().reset_index()
# Stepwise order search with weekly seasonality (m=7) and no non-seasonal differencing (d=0).
# NOTE(review): the search runs on the full series, including the dates later
# held out as the test window.
auto_model = pm.auto_arima(df_SARIMA['unt_pre'], start_p=1, start_q=1, test='adf', max_p=3, max_q=3, m=7, d=0, seasonal=True, start_P=0, D=None, trace=True, error_action='ignore', suppress_warnings=True, stepwise=True)
print(auto_model.summary())

In [0]:
#Rolling Forecast
# Walk-forward backtest over the last 168 days: refit on all history, forecast
# one horizon ahead, then append that horizon's actuals and repeat.

split_date = pd.to_datetime(spark.sql('SELECT current_date()').collect()[0][0]) - pd.DateOffset(days=168)
df_SARIMA_train = df_SARIMA.loc[df_SARIMA['charge_dt'] < split_date]
df_SARIMA_test = df_SARIMA.loc[df_SARIMA['charge_dt'] >= split_date]

history = list(df_SARIMA_train['unt_pre'])
all_predictions = []
forecast_horizon = 7

# Step by the horizon itself so the step size and forecast length cannot drift apart.
for i in range(0, len(df_SARIMA_test), forecast_horizon):

    # The order is fixed by hand rather than taken from the auto_arima search.
    SARIMA_model = pm.ARIMA(order=(1,0,3), seasonal_order=(0,1,2,7))
    SARIMA_model.fit(history)

    next_forecast = SARIMA_model.predict(n_periods=forecast_horizon)
    all_predictions.extend(next_forecast)

    # .iloc makes the positional intent explicit; the test frame keeps the
    # parent's integer labels, which do not start at 0.
    actuals_for_period = df_SARIMA_test['unt_pre'].iloc[i : i + forecast_horizon]
    history.extend(actuals_for_period)


# The final window may overshoot the end of the test set; trim to its length.
final_predictions = all_predictions[:len(df_SARIMA_test)]

rmse_SARIMA = np.sqrt(mean_squared_error(df_SARIMA_test['unt_pre'], final_predictions))
mae_SARIMA = mean_absolute_error(df_SARIMA_test['unt_pre'], final_predictions)

print(f"RMSE: {rmse_SARIMA:.2f}")
print(f"MAE: {mae_SARIMA:.2f}")

# Plot the results against charge dates (the frame's index is only a row label).
plt.figure(figsize=(14, 7))
plt.plot(df_SARIMA_test['charge_dt'], df_SARIMA_test['unt_pre'], color='green', label='Actual Test Data')
plt.plot(df_SARIMA_test['charge_dt'], final_predictions, color='red', linestyle='--', label='Rolling Forecast')
plt.title('SARIMA Rolling Forecast Backtest')
plt.legend()
plt.grid(True)
plt.show()

In [0]:
# SARIMAX
# Daily series with exogenous regressors, built from the pre-encoding snapshot.
optimize_df(original_df)
df_SARIMAX = original_df.reset_index()
split_date = pd.to_datetime(spark.sql('SELECT current_date()').collect()[0][0]) - pd.DateOffset(days=168)

# Volumes are additive across route/dtg rows. Calendar fields and the holiday
# flag are identical for every row of a charge date, so take one value per day;
# summing them would scale each feature by the number of rows on that date.
sarimax_sum_cols = ['unt_pre', 'ty_capacity', 'unt_pre_lag7', 'unt_pre_lag14', 'unt_pre_lag21', 'unt_pre_lag28', 'unt_pre_lag364']
sarimax_calendar_cols = ['charge_dom', 'charge_doy', 'charge_month', 'charge_dow', 'is_charge_date_holiday']
sarimax_agg = {name: 'sum' for name in sarimax_sum_cols}
sarimax_agg.update({name: 'first' for name in sarimax_calendar_cols})
df_SARIMAX = df_SARIMAX.groupby('charge_dt').agg(sarimax_agg).reset_index()
df_SARIMAX = df_SARIMAX.set_index('charge_dt')

y_SARIMAX = df_SARIMAX[['unt_pre']]
X_SARIMAX = df_SARIMAX.drop('unt_pre', axis=1)

X_SARIMAX_train = X_SARIMAX.loc[X_SARIMAX.index < split_date]
X_SARIMAX_test = X_SARIMAX.loc[X_SARIMAX.index >= split_date]
y_SARIMAX_train = y_SARIMAX.loc[y_SARIMAX.index < split_date]
y_SARIMAX_test = y_SARIMAX.loc[y_SARIMAX.index >= split_date]
# Order search on the training window only, with weekly seasonality (m=7).
SARIMAX_model = pm.auto_arima(y_SARIMAX_train, X=X_SARIMAX_train, seasonal=True,trace=True, start_p=1, start_q=1, test='adf', max_p=3, max_q=3, m=7, d=0, start_P=0, D=None, error_action='ignore', suppress_warnings=True, stepwise=True)
predictions = SARIMAX_model.predict(n_periods=len(y_SARIMAX_test), X=X_SARIMAX_test)

print(SARIMAX_model.summary())

In [0]:
# Walk-forward SARIMAX backtest: refit on all history, forecast the next
# window from its exogenous rows, then append that window's actuals.
history_y = list(y_SARIMAX_train['unt_pre'])
history_X = X_SARIMAX_train.copy()
all_predictions = []
forecast_horizon = 7

for i in range(0, len(y_SARIMAX_test), 7):

    # Exogenous rows for the window being forecast (positional slice).
    X_future = X_SARIMAX_test[i : i + forecast_horizon]
    
    # The order is fixed by hand rather than taken from the auto_arima search.
    SARIMAX_model = pm.ARIMA(order=(0,0,1), seasonal_order=(0,0,0,7))
    SARIMAX_model.fit(history_y, X=history_X)
    
    next_forecast = SARIMAX_model.predict(n_periods=len(X_future), X=X_future)[:7]
    all_predictions.extend(next_forecast)
    
    actuals_y_for_period = y_SARIMAX_test['unt_pre'][i : i + 7]
    actuals_X_for_period = X_SARIMAX_test[i : i + 7]

    history_y.extend(actuals_y_for_period)
    history_X = pd.concat([history_X, actuals_X_for_period])


# Trim to the test length.
final_predictions = all_predictions[:len(y_SARIMAX_test)]

rmse_SARIMAX = np.sqrt(mean_squared_error(y_SARIMAX_test['unt_pre'], final_predictions))
mae_SARIMAX = mean_absolute_error(y_SARIMAX_test['unt_pre'], final_predictions)

print(f"RMSE: {rmse_SARIMAX:.2f}")
print(f"MAE: {mae_SARIMAX:.2f}")

# Plot the results
plt.figure(figsize=(14, 7))
plt.plot(y_SARIMAX_test.index, y_SARIMAX_test['unt_pre'], color='green', label='Actual Test Data')
plt.plot(y_SARIMAX_test.index, final_predictions, color='red', linestyle='--', label='Rolling Forecast')
plt.title('SARIMAX Rolling Forecast Backtest')
plt.legend()
plt.grid(True)
plt.show()

In [0]:
#Top-Down Linear Regression
# Collapse routes into one row per (charge_dt, dtg): the columns in sum_cols
# are summed, the columns in avg_cols are averaged.

sum_cols = ['unt_pre', 'ty_capacity', 'unt_pre_lag7', 'unt_pre_lag14', 'unt_pre_lag21', 'unt_pre_lag28', 'unt_pre_lag364']
avg_cols = ['load_factor_lag7','load_factor_lag14','load_factor_lag21','load_factor_lag28','load_factor_lag364','yield_lag7','yield_lag14','yield_lag21','yield_lag28','yield_lag364','flight_year','charge_year','is_charge_date_holiday','is_flight_date_holiday','flight_dom_sin','flight_dom_cos','flight_doy_sin','flight_doy_cos','charge_dom_sin','charge_dom_cos','charge_doy_cos','charge_doy_sin','flight_month_cos','flight_month_sin','charge_month_sin','charge_month_cos','flight_dow_sin','flight_dow_cos','charge_dow_sin','charge_dow_cos']
agg_dict = {col: 'mean' for col in avg_cols}
agg_dict.update({col: 'sum' for col in sum_cols})
df_LR = df.groupby(['charge_dt', 'dtg']).agg(agg_dict)
df_LR = df_LR.reset_index() 
df_LR['charge_dt'] = pd.to_datetime(df_LR['charge_dt']) 
optimize_df(df_LR)
df_LR.head()

In [0]:
# Hold out the most recent 168 days (relative to today's date from Spark) as the test window.
split_date = pd.to_datetime(spark.sql("SELECT current_date()").collect()[0][0]) - pd.DateOffset(days=168)
df_LR_train = df_LR.loc[df_LR['charge_dt'] < split_date]
df_LR_test = df_LR.loc[df_LR['charge_dt'] >= split_date]

# Daily totals, used only for the plot below.
df_LR_train_plot = df_LR_train.groupby('charge_dt')['unt_pre'].sum().reset_index()
df_LR_test_plot = df_LR_test.groupby('charge_dt')['unt_pre'].sum().reset_index()

# Plot the data
plt.figure(figsize=(18, 6))
plt.plot(df_LR_train_plot['charge_dt'], df_LR_train_plot['unt_pre'], label='Train', color='blue')
plt.plot(df_LR_test_plot['charge_dt'], df_LR_test_plot['unt_pre'], label='Validation', color='red')
plt.axvline(pd.to_datetime(split_date), linestyle='--', color='black', label='Split Date')
plt.legend()
plt.title('Sales Before and After Split Date')
plt.xlabel('Charge Date')
plt.ylabel('Sales')
plt.show()

In [0]:
# Fit on the (charge_dt, dtg) totals, then split each prediction across routes by capacity share.
df_LR_train = df_LR_train.set_index('charge_dt')
df_LR_test = df_LR_test.set_index('charge_dt')

X_train = df_LR_train.drop(['unt_pre'], axis=1)
X_test = df_LR_test.drop(['unt_pre'], axis=1)
y_train = df_LR_train['unt_pre']
y_test = df_LR_test['unt_pre']

LR_model = LinearRegression()
LR_model.fit(X_train, y_train)
y_pred_LR = LR_model.predict(X_test)

predictions_df = X_test.copy()
predictions_df['total_prediction_LR'] = y_pred_LR
predictions_df = predictions_df.reset_index()[['charge_dt', 'dtg', 'total_prediction_LR']]

# Route-level forecast = total forecast x that route's capacity proportion.
disaggregated_df = pd.merge(predictions_df, capacity_proportions, on=['charge_dt', 'dtg'], how='left')
disaggregated_df['route_forecast_LR'] = disaggregated_df['total_prediction_LR'] * disaggregated_df['capacity_proportion']

# Attach route-level actuals for scoring.
df_with_actuals = df.reset_index()
actuals_to_merge = df_with_actuals[['charge_dt', 'dtg', 'route', 'unt_pre']]
final_df = pd.merge(disaggregated_df, actuals_to_merge, on=['charge_dt', 'dtg', 'route'], how='left')

# NOTE(review): both merges are left joins; any unmatched key would leave NaN
# and make the metric calls below raise - confirm the keys always align.
rmse_LR = np.sqrt(mean_squared_error(final_df['unt_pre'], final_df['route_forecast_LR']))
mae_LR = mean_absolute_error(final_df['unt_pre'], final_df['route_forecast_LR'])
print(f'RMSE: {rmse_LR:.2f}')
print(f'MAE: {mae_LR:.2f}')

In [0]:
# Daily totals of actual vs. route-level LR forecast, summed over dtg and route.
LR_agg = (
    final_df.groupby(['charge_dt'])[['unt_pre', 'route_forecast_LR']]
    .sum()
    .reset_index()
)

plt.figure(figsize=(12, 6))
for _series, _label, _marker in (
    ('unt_pre', 'Actual', 'o'),
    ('route_forecast_LR', 'Forecast (LR)', 'x'),
):
    plt.plot(LR_agg['charge_dt'], LR_agg[_series], label=_label, marker=_marker)
plt.title('LR Forecast vs Actual – Aggregated Over DTG')
plt.xlabel('Charge Date')
plt.ylabel('Total Sales Across All DTGs')
plt.xticks(rotation=45)
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()

In [0]:
# Animated actual-vs-predicted booking curve, one frame per charge date.
# Columns are selected with a list: tuple-style selection on a groupby raises in pandas >= 2.0.
LR_agg_dynamic = final_df.groupby(['dtg', 'charge_dt'])[['unt_pre', 'route_forecast_LR']].sum().reset_index()
LR_agg_dynamic['charge_dt'] = LR_agg_dynamic['charge_dt'].astype(str)

# Reshape to long form: one row per (charge_dt, dtg, sales_type).
actual = LR_agg_dynamic[['charge_dt','dtg','unt_pre']].copy()
actual['sales_type'] = 'actual'
actual['sales_value'] = actual['unt_pre']

pred = LR_agg_dynamic[['charge_dt','dtg','route_forecast_LR']].copy()
pred['sales_type'] = 'predicted'
pred['sales_value'] = pred['route_forecast_LR']

melted = pd.concat([actual, pred]).sort_values(['charge_dt','dtg','sales_type'])

# The colour map keys must match the sales_type values, otherwise the map is ignored.
fig = px.line(melted, x='dtg', y='sales_value', color='sales_type', animation_frame='charge_dt', title='Actual vs Predicted by DTG across Charge Dates', color_discrete_map={'actual':'red', 'predicted':'blue'})

fig.update_xaxes(range=[0, 200])
fig.update_yaxes(range=[0, 800])
# The legend distinguishes sales types, not charge dates.
fig.update_layout(xaxis_title='Days To Go', yaxis_title='Sales', legend_title='Sales Type', height=900, width=1400)

fig.show()

In [0]:
#Prophet
# One Prophet model per dtg value, each fitted on that dtg's daily series with
# capacity and lagged sales as extra regressors. Reuses split_date from the
# linear-regression cells.
df_prophet = df.groupby(['dtg', 'charge_dt'])[['unt_pre', 'ty_capacity', 'unt_pre_lag7', 'unt_pre_lag14', 'unt_pre_lag21', 'unt_pre_lag28']].sum().reset_index()
train_prophet = df_prophet.loc[df_prophet['charge_dt'] < split_date]
val_prophet = df_prophet.loc[df_prophet['charge_dt'] >= split_date]
train_df = train_prophet.reset_index()
val_df = val_prophet.reset_index()
all_forecasts = []

for dtg_value in train_df['dtg'].unique():
    # Prophet expects the date column as 'ds' and the target as 'y'.
    train_subset = train_df[train_df['dtg'] == dtg_value][['charge_dt', 'unt_pre', 'ty_capacity', 'unt_pre_lag7', 'unt_pre_lag14', 'unt_pre_lag21', 'unt_pre_lag28']].copy()
    train_subset.rename(columns={'charge_dt': 'ds', 'unt_pre': 'y'}, inplace=True)
    train_subset.sort_values('ds', inplace=True)

    val_subset = val_df[val_df['dtg'] == dtg_value][['charge_dt', 'unt_pre', 'ty_capacity', 'unt_pre_lag7', 'unt_pre_lag14', 'unt_pre_lag21', 'unt_pre_lag28']].copy()
    val_subset.rename(columns={'charge_dt': 'ds'}, inplace=True)
    val_subset.sort_values('ds', inplace=True)

    # Skip dtg values with fewer than 10 training rows or nothing to validate on.
    if len(train_subset) < 10 or val_subset.empty:
        continue

    model = Prophet(holidays=holidays_df)
    regressors = ['ty_capacity', 'unt_pre_lag7', 'unt_pre_lag14', 'unt_pre_lag21', 'unt_pre_lag28']
    for reg in regressors:
        model.add_regressor(reg)
    model.fit(train_subset)

    forecast = model.predict(val_subset)

    forecast['dtg'] = dtg_value
    # Line the actuals up with the forecast dates.
    actual_values = val_df[val_df['dtg'] == dtg_value].set_index('charge_dt').reindex(forecast['ds'])['unt_pre'].values
    forecast['actual'] = actual_values

    all_forecasts.append(forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper', 'dtg', 'actual']])

# NOTE(review): pd.concat raises if every dtg was skipped and the list is empty.
prophet_val_forecasts = pd.concat(all_forecasts, ignore_index=True)

In [0]:
# Sum the per-dtg forecasts and bounds to daily totals and plot them with the full actual history.
# NOTE(review): the shaded band is the sum of per-dtg bounds, not an interval for the total.
agg_forecast = (prophet_val_forecasts.groupby('ds').agg({'yhat': 'sum','yhat_lower': 'sum', 'yhat_upper': 'sum', 'actual': 'sum'}).reset_index())
actuals_all = pd.concat([train_df, val_df])
actuals_all = actuals_all.rename(columns={'charge_dt': 'ds', 'unt_pre': 'y'})
agg_actuals = (actuals_all.groupby('ds').agg({'y': 'sum'}).reset_index())
# Outer merge: training dates carry no forecast columns.
plot_df = pd.merge(agg_forecast, agg_actuals, on='ds', how='outer').sort_values('ds')

plt.style.use('seaborn-v0_8-darkgrid')
fig, ax = plt.subplots(figsize=(18, 6))

plot_df_val = plot_df[plot_df['ds'] >= split_date]
plot_df_train = plot_df[plot_df['ds'] < split_date]

ax.plot(plot_df_val['ds'], plot_df_val['y'], 'k.', alpha=0.6, label='Actual')
ax.plot(plot_df_train['ds'], plot_df_train['y'], 'red', linewidth=1.5, label='Train') 

ax.plot(plot_df['ds'], plot_df['yhat'], color='blue', label='Forecast')
ax.fill_between(plot_df['ds'], plot_df['yhat_lower'], plot_df['yhat_upper'], color='skyblue', alpha=0.3, label='Confidence Interval')

ax.axvline(x=split_date, color='black', linestyle='--', lw=2, label='Forecast Start')
ax.set_title('Forecasted Demand with Prophet (Aggregated)', fontsize=18, fontweight='bold')
ax.set_xlabel('Date', fontsize=14)
ax.set_ylabel('Sales', fontsize=14)
ax.legend(loc='upper left')
plt.tight_layout()
plt.show()

In [0]:
# Validation window only: summed Prophet forecast against summed actuals.
plt.figure(figsize=(12, 6))
for _series, _label, _marker in (
    ('actual', 'Actual', 'o'),
    ('yhat', 'Forecast (Prophet)', 'x'),
):
    plt.plot(agg_forecast['ds'], agg_forecast[_series], label=_label, marker=_marker)
plt.title('Prophet Forecast vs Actual – Aggregated Over DTG')
plt.xlabel('Charge Date')
plt.ylabel('Total Sales Across All DTGs')
plt.xticks(rotation=45)
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()


In [0]:
# The square root of the MSE is the RMSE, so name and label it as such.
rmse_prophet = np.sqrt(mean_squared_error(agg_forecast['actual'], agg_forecast['yhat']))
mse_prophet = rmse_prophet  # old name kept in case other cells still read it
#mape_prophet = mean_absolute_percentage_error(agg_forecast['actual'], agg_forecast['yhat'])
print(f'RMSE: {rmse_prophet:.2f}')
#print(f'MAPE: {mape_prophet:.2f}')

In [0]:
# Disabled experiment: the whole grid search below is wrapped in a string
# literal, so none of it runs. It refers to `preprocessor` and a 'unt_net'
# column, neither of which is defined in the cells visible here.
# NOTE(review): as a bare expression the string is probably echoed as the
# cell's output - consider removing the cell.
'''%python
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from xgboost import XGBRegressor

param_grid = {
    'learning_rate': [0.005, 0.01, 0.1],
    #'subsample': [0.8, 1.0],
    #'colsample_bytree': [0.8, 1.0],
    #'lambda': [0.1, 1.0],
    #'alpha': [0, 0.1],
}

rmse_scorer = make_scorer(mean_squared_error, squared=False)

outer_cv = TimeSeriesSplit(n_splits=5)
inner_cv = TimeSeriesSplit(n_splits=3)

outer_scores = []

for train_idx, test_idx in outer_cv.split(df):
    X_train_outer = df.iloc[train_idx].drop('unt_net', axis=1).copy()
    y_train_outer = df.iloc[train_idx]['unt_net'].copy()
    X_test_outer = df.iloc[test_idx].drop('unt_net', axis=1).copy()
    y_test_outer = df.iloc[test_idx]['unt_net'].copy()

    X_train_processed = preprocessor.fit_preprocess(X_train_outer)
    X_test_processed = preprocessor.transform_preprocess(X_test_outer)

    model = XGBRegressor(objective='reg:pseudohubererror', base_score=0.5, boosting='gbtree', early_stopping_rounds=50, max_depth=3, n_estimators=500, learning_rate=0.01)
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=inner_cv, n_jobs=-1, verbose=1, scoring=rmse_scorer)

    grid_search.fit(X_train_processed, y_train_outer, eval_set=[(X_test_processed, y_test_outer)])

    best_model = grid_search.best_estimator_
    test_score = best_model.score(X_test_processed, y_test_outer)
    outer_scores.append(test_score)

    print(f'Outer fold score: {test_score}, best params: {grid_search.best_params_}')'''

In [0]:
#XGBoost
# Expanding-window cross-validation over charge dates: 5 folds, each validating
# on the next 168 distinct dates.
# NOTE(review): set_index is not re-runnable - a second run fails because
# charge_dt is already the index.
df = df.set_index('charge_dt')
unique_dates = df.index.sort_values().unique()
tscv = TimeSeriesSplit(n_splits=5, test_size=168)

fig, axs = plt.subplots(5, 1, figsize=(20, 10), sharex=True)
plt.style.use('ggplot')
fold=0
preds = []
scores = []
# Single-level time-series cross-validation (there is no inner loop, so this is not nested CV).
for train_index, val_index in tscv.split(unique_dates):

    # Splitting is done on distinct dates, then mapped back to all rows for those dates.
    train_dates = unique_dates[train_index]
    val_dates = unique_dates[val_index]

    train_data = df.loc[train_dates]
    val_data = df.loc[val_dates]

    # One panel per fold showing the train/validation split of daily totals.
    total_sales_train = train_data.groupby('charge_dt')['unt_pre'].sum().reset_index()
    total_sales_val = val_data.groupby('charge_dt')['unt_pre'].sum().reset_index()
    total_sales_train.plot(ax=axs[fold], label='Train', x='charge_dt', y='unt_pre', style='-')
    total_sales_val.plot(ax=axs[fold], label='Val', x='charge_dt', y='unt_pre', style='-')
    axs[fold].axvline(val_data.index.min(), linestyle='--', color='black')
    axs[fold].set_title(f'Fold {fold+1}')
    fold += 1

    X_train = train_data.drop('unt_pre', axis=1)
    y_train = train_data['unt_pre']

    X_test = val_data.drop('unt_pre', axis=1)
    y_test = val_data['unt_pre']

    # NOTE(review): early stopping watches the same fold that is scored below,
    # so the reported RMSE is optimistic - consider a separate early-stopping split.
    XGB_model = xgb.XGBRegressor(base_score=0.5, booster ='gbtree', n_estimators=500, early_stopping_rounds=50, max_depth=3, learning_rate=0.01, objective='reg:pseudohubererror', enable_categorical=True)
    XGB_model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=100)

    y_pred = XGB_model.predict(X_test)
    preds.extend(y_pred)
    # Row-level RMSE for this fold.
    score = np.sqrt(mean_squared_error(y_test, y_pred))
    scores.append(score)

In [0]:
# Per-fold RMSE values, their mean (printed as "combined score") and their standard deviation.
print(f'individual scores: {scores}')
print(f'combined score: {np.mean(scores)}')
print(f'std: {np.std(scores)}')      

In [0]:
# X_test, y_test and y_pred are whatever the LAST cross-validation fold left
# behind, so everything below describes that fold only.
predictions = pd.DataFrame({'date': X_test.index, 'prediction': y_pred, 'dtg': X_test['dtg'], 'route': X_test['route']})
actual = pd.DataFrame({'date': X_test.index, 'unt_pre': y_test, 'dtg': X_test['dtg'], 'route': X_test['route']})
# Daily totals across routes and dtg.
y_pred_total = predictions.groupby('date')['prediction'].sum().reset_index()
y_test_total = actual.groupby('date')['unt_pre'].sum().reset_index()
y_pred_total.set_index('date', inplace=True)
y_test_total.set_index('date', inplace=True)
#mape_xgb = mean_absolute_percentage_error(y_test_total, y_pred_total)
# The square root of the MSE is the RMSE, so name and label it as such.
rmse_xgb = np.sqrt(mean_squared_error(y_test_total, y_pred_total))
mse_xgb = rmse_xgb  # old name kept in case other cells still read it
print(f'RMSE: {rmse_xgb:.2f}')
#print(f'MAPE: {mape_xgb:.2f}')


In [0]:
# Daily totals: XGBoost forecast against actuals.
plt.figure(figsize=(12, 6))
for _frame, _series, _label, _marker in (
    (y_test_total, 'unt_pre', 'Actual', 'o'),
    (y_pred_total, 'prediction', 'Forecast', 'x'),
):
    plt.plot(_frame.index, _frame[_series], label=_label, marker=_marker)
plt.title('XGB Forecast vs Actual – Aggregated Over DTG')
plt.xlabel('Charge Date')
plt.ylabel('Total Sales Across All DTGs')
plt.xticks(rotation=45)
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()

In [0]:
# Animated actual-vs-predicted booking curve for the XGBoost fold, one frame per date.
df_reset_actual = actual.reset_index()
df_reset_actual['date'] = df_reset_actual['date'].astype(str)
df_reset_actual = df_reset_actual.groupby(['dtg', 'date'])['unt_pre'].sum().reset_index()

df_reset_pred = predictions.reset_index()
df_reset_pred['date'] = df_reset_pred['date'].astype(str)
df_reset_pred = df_reset_pred.groupby(['dtg', 'date'])['prediction'].sum().reset_index()

df_plot_actual = df_reset_actual[df_reset_actual['unt_pre'].notnull()].copy()
df_plot_pred = df_reset_pred[df_reset_pred['prediction'].notnull()].copy()

# Reshape to long form: one row per (date, dtg, sales_type).
df_actual = df_plot_actual[['date','dtg','unt_pre']].copy()
df_actual['sales_type'] = 'actual'
df_actual['sales_value'] = df_actual['unt_pre']

df_pred = df_plot_pred[['date','dtg','prediction']].copy()
df_pred['sales_type'] = 'predicted'
df_pred['sales_value'] = df_pred['prediction']

df_melted = pd.concat([df_actual, df_pred]).sort_values(['date','dtg','sales_type'])

# Fixed axis ranges keep the frames comparable while the animation plays.
min_dtg = 0
max_dtg = 200
min_sales = 0
max_sales = 1200

# The colour map keys must match the sales_type values, otherwise the map is ignored.
fig = px.line(df_melted, x='dtg', y='sales_value', color='sales_type', animation_frame='date', title='Actual vs Predicted by DTG across Charge Dates', color_discrete_map={'actual':'red', 'predicted':'blue'})
fig.update_xaxes(range=[min_dtg, max_dtg], title='DTG')
fig.update_yaxes(range=[min_sales, max_sales], title='Sales')
fig.update_layout(legend_title='sales type', height=900, width=1200)
fig.show()