In [0]:
#importing libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LinearRegression
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import pmdarima as pm
import xgboost as xgb
from xgboost import XGBRegressor
from prophet import Prophet
from pyspark.sql.functions import to_date, to_timestamp, col, last, when, lit, current_date, date_sub, datediff, substring, sequence, explode, coalesce, sum as spark_sum, min as spark_min
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from datetime import datetime, timedelta
import joblib
import holidays
import mlflow
mlflow.autolog(disable=True)

#importing price and meta data
# NOTE(review): `spark` is never created in this notebook; presumably the runtime injects the session — confirm.

# Raw price observations (standardprice cast to double), ticket sales and flight dimensions.
price_history = spark.read.table('data_prod.silver_sanezdb.priceinspection').select('segment', 'when', F.col('standardprice').cast('double'))
sales_history = spark.read.table('data_experience_commercial.cbt_1423_rtsuite.master').select('flightkey', F.col('charge_dt').cast('date'), 'unt_pre', 'chargeproduct', 'dtg')
dimensions_history = spark.read.table('data_experience_commercial.cbt_0923_segmentfinder.dimensions_history').select('flightkey', 'onsale_dt', 'ty_capacity', 'routetype', 'region', 'flight_dt')
# Scope: Domestic routes in the two UK regions with flight_dt on or after 2022-01-01.
filtered_dh = dimensions_history.filter((F.col('routetype') == 'Domestic') & (F.col('region').isin(['UK-London', 'UK-Regions'])) & (F.col('flight_dt') >= '2022-01-01'))

#forward filling flightkey price history to 1d frequency

# Align the price table's column names with the other tables and truncate the timestamp to a date.
ph_renamed = price_history.withColumnRenamed('when', 'charge_dt').withColumnRenamed('segment', 'flightkey').withColumn('charge_dt', F.col('charge_dt').cast('date')) 
dph = filtered_dh.join(ph_renamed, on='flightkey', how='inner')
# Average price per flightkey per charge date.
# NOTE(review): F.first() is applied without an explicit ordering, so this presumably relies on
# flight_dt/onsale_dt being constant per flightkey — confirm. The trailing orderBy is not relied on below.
dphsmooth = dph.groupby('flightkey','charge_dt').agg(F.avg('standardprice').alias('price'), F.first('flight_dt').alias('flight_dt'), F.first('onsale_dt').alias('onsale_dt')).orderBy('charge_dt')
# Per flightkey: start at the earliest onsale_dt, end at the earlier of flight_dt and today.
date_range = dphsmooth.groupBy('flightkey').agg(F.min('onsale_dt').alias('start_date'), F.least(F.first('flight_dt'), F.lit(datetime.now().date())).alias('end_date'))
# Explode the start_date..end_date sequence into one row per flightkey per day.
index = date_range.withColumn('charge_dt_ts', F.explode(F.sequence(F.col('start_date'), F.col('end_date')))).withColumn('charge_dt', F.col('charge_dt_ts').cast('date')).drop('charge_dt_ts')
# Left join onto the daily index: days without a price observation get a null price.
dphjoin = index.join(dphsmooth, on=['flightkey', 'charge_dt'], how='left').drop('flight_dt', 'onsale_dt')
ff_window_spec = Window.partitionBy('flightkey').orderBy('charge_dt')
bf_window_spec = Window.partitionBy('flightkey').orderBy('charge_dt').rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
dphfilled = dphjoin.withColumn('price', F.coalesce(F.last('price', ignorenulls=True).over(ff_window_spec),F.first('price', ignorenulls=True).over(bf_window_spec))).drop('start_date', 'end_date') #forward fills from first flight price, then back fills from first price if first price date after onsale date
# Re-attach the flight dimensions, then average the filled price per region/routetype/charge_dt/flight_dt.
finaldph = dphfilled.join(filtered_dh, on='flightkey', how='left').drop('onsale_dt')
aggregated_price_history = finaldph.groupby('region', 'routetype', 'charge_dt', 'flight_dt').agg(F.avg('price').cast('double').alias('price'))

#assembling corresponding sales history

dsh = filtered_dh.join(sales_history, on='flightkey', how='left').drop('onsale_dt')
# Keep only 'Ticket' charges with dtg >= 0.
filtered_dsh = dsh.filter((F.col('chargeproduct') == 'Ticket') & (F.col('dtg') >= 0)).drop('chargeproduct', 'dtg')
# Units sold per flightkey per charge date.
dshsmooth = filtered_dsh.groupby('flightkey','charge_dt').agg(F.sum('unt_pre').alias('unt_pre'))
# Reuse the daily index built for prices; fillna(0) turns days with no sales rows into 0.
dshjoin = index.join(dshsmooth, on=['flightkey', 'charge_dt'], how='left').drop('start_date', 'end_date', 'region', 'routetype', 'flight_dt', 'ty_capacity').fillna(0)
window_spec2 = Window.partitionBy('flightkey').orderBy(F.col('charge_dt'))
# pax_net: sum of unt_pre over the per-flightkey window ordered by charge_dt.
dsh_pax = dshjoin.withColumn('pax_net', F.sum('unt_pre').over(window_spec2))
# NOTE(review): this joins the unfiltered dimensions_history, while the price side joins filtered_dh — confirm this is intended.
final_dsh = dsh_pax.join(dimensions_history, on='flightkey', how='left').drop('onsale_dt')
aggregated_sales_history = final_dsh.groupby('region', 'routetype', 'charge_dt', 'flight_dt').agg(F.sum('unt_pre').alias('unt_pre'), F.sum('ty_capacity').alias('ty_capacity'), F.sum('pax_net').alias('pax_net'))

#final dataframe

# Price rows drive the result (left join); toPandas() converts the joined Spark frame to pandas.
df = aggregated_price_history.join(aggregated_sales_history, on=['region', 'routetype', 'charge_dt', 'flight_dt'], how='left').toPandas()

df.info()
# Untouched copy so later cells can restart from the freshly loaded data.
df_original = df.copy()

In [0]:
# Restart from the untouched frame that the loading cell saved as `df_original`
# (the previous spelling `df_orginal` is never defined and raised NameError).
df = df_original.copy()

In [0]:
# Data Optimisation

def optimize_df(df):
    """Reduce the memory footprint of ``df`` in place and return the same object.

    Per column, the first matching rule wins:
      * name contains ``'_dt'``  -> parsed with ``pd.to_datetime``
      * object dtype             -> ``category``
      * float dtype              -> ``float32``
      * integer dtype            -> smallest integer type that fits

    Floats go to float32 rather than float16: half precision overflows to
    ``inf`` above 65504 and keeps only ~3 significant digits, which corrupts
    prices and summed quantities.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise; its columns are overwritten.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so both ``optimize_df(df)`` and
        ``df = optimize_df(df)`` work.
    """
    start_mem = df.memory_usage(deep=True).sum() / 1024**2
    print(f"Initial memory usage: {start_mem:.2f} MB")

    for col in df.columns:
        dtype_name = str(df[col].dtype)

        # str() keeps the substring test safe for non-string column labels.
        if '_dt' in str(col):
            df[col] = pd.to_datetime(df[col])

        elif df[col].dtype == 'object':
            df[col] = df[col].astype('category')

        elif 'float' in dtype_name:
            df[col] = df[col].astype(np.float32)

        elif 'int' in dtype_name:
            df[col] = pd.to_numeric(df[col], downcast='integer')

    end_mem = df.memory_usage(deep=True).sum() / 1024**2
    # Guard the percentage against a zero-byte starting size.
    reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print(f"Final memory usage: {end_mem:.2f} MB ({reduction:.2f}% reduction)")

    return df

# optimize_df mutates and returns the same frame, so rebinding is equivalent to the bare call.
df = optimize_df(df)
df.info()

In [0]:
#Sales Over Charge Date

# Total unt_pre per charge date across all rows.
total_sales = df.groupby('charge_dt')['unt_pre'].sum().reset_index()
plt.style.use('ggplot')
total_sales.plot(x='charge_dt', y='unt_pre', style='-', title = 'sales by charge date', figsize=(20,5))
plt.show()

In [0]:
# Removing covid-19 outlier data: only charge dates from 2022-01-01 onwards are kept here.
# (No 2021 rows survive this filter; lag features that need earlier history must start from a wider slice.)

from_2022 = df['charge_dt'] >= '2022-01-01'
df = df.loc[from_2022]
total_sales_by_charge_dt = df.groupby('charge_dt')['unt_pre'].sum().reset_index()
plt.style.use('ggplot')
total_sales_by_charge_dt.plot(x='charge_dt', y='unt_pre', style='-', title = 'sales by charge date', figsize=(20,5))
plt.show()

In [0]:
# Index the daily totals by charge date so the series carries its dates.
total_sales_by_charge_dt = total_sales_by_charge_dt.set_index('charge_dt')
total_sales_by_charge_dt.head()

In [0]:
from statsmodels.tsa.seasonal import seasonal_decompose
def plot_decomposition(target):
    """Plot ``target`` and its additive seasonal decomposition on four stacked axes.

    Top to bottom: the series itself, then the trend, seasonal and residual
    components returned by ``seasonal_decompose(target, model='additive')``.
    No period is passed, so the decomposition relies on what statsmodels can
    derive from the index of ``target``.
    """
    parts = seasonal_decompose(target, model='additive')
    fig, axes = plt.subplots(4, 1, sharex=True, figsize=(30, 10))
    panels = (target, parts.trend, parts.seasonal, parts.resid)
    for panel_ax, series in zip(axes, panels):
        panel_ax.plot(series)
    plt.show()

# Decompose the daily sales totals.
plot_decomposition(total_sales_by_charge_dt['unt_pre'])

In [0]:
# Mean unt_pre per charge date, drawn as one line per route type.
sales_by_routetype = df.groupby(['charge_dt', 'routetype'])['unt_pre'].mean().reset_index()
fig, ax = plt.subplots(figsize=(20,5))
for routetype in sales_by_routetype['routetype'].unique():
    is_route = sales_by_routetype['routetype'] == routetype
    subset = sales_by_routetype.loc[is_route]
    ax.plot(subset['charge_dt'], subset['unt_pre'], label=routetype)
ax.set_title('Sales by routetype')
ax.set_xlabel('Charge Date')
ax.set_ylabel('Sales')
ax.legend()
plt.show()

In [0]:
# Typical Booking Curve

# Total units sold at each days-to-go value.
total_sales_by_dtg = df.groupby('dtg')['unt_pre'].sum().reset_index()
plt.style.use('ggplot')
total_sales_by_dtg.plot(x='dtg', y='unt_pre', style='-', title = 'sales by dtg since 2022', figsize=(20,5))
plt.show()

In [0]:
# Booking curve on a logarithmic days-to-go axis.
total_sales_by_log_dtg = df.groupby('dtg')['unt_pre'].sum().reset_index()
# log1p (= log(1 + dtg)) keeps dtg == 0 on the chart; plain log maps it to -inf.
# Casting to float64 first avoids a low-precision result when dtg is a small integer dtype.
total_sales_by_log_dtg['log_dtg'] = np.log1p(total_sales_by_log_dtg['dtg'].astype('float64'))
plt.style.use('ggplot')
total_sales_by_log_dtg.plot(style='-', figsize=(20,5), title = 'sales by dtg since 2022', y='unt_pre', x='log_dtg')
plt.show()

In [0]:
# Booking Curve Over Time

# One animation frame per charge date; the date is stringified before being used as the frame key.
df_dynamic_plot = (
    df.reset_index()
    .assign(charge_dt=lambda frame: frame['charge_dt'].astype(str))
    .groupby(['dtg', 'charge_dt'])['unt_pre']
    .sum()
    .reset_index()
)
fig = px.line(df_dynamic_plot, x='dtg', y='unt_pre', animation_frame='charge_dt', title='Sales by DTG with charge date variations')
fig.update_layout(xaxis_title='Days To Go', yaxis_title='Sales', legend_title='Charge Date', height=900, width=1400)

fig.show()

In [0]:
#ACF charge-date
# Autocorrelation structure of total sales per charge date, out to 56 lags.
df_ACF = df.groupby('charge_dt')['unt_pre'].sum()
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(18, 8))

plot_acf(df_ACF, lags=56, ax=ax1)
plot_pacf(df_ACF, lags=56, method='ywm', ax=ax2)

ax1.set_title('Autocorrelation Function (ACF)')
ax2.set_title('Partial Autocorrelation Function (PACF)')

plt.tight_layout()
plt.show()

In [0]:
#ACF flight-date
# Same diagnostics as the charge-date view, with sales totalled per flight date.
df_ACF = df.groupby('flight_dt')['unt_pre'].sum()
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(18, 8))

plot_acf(df_ACF, lags=56, ax=ax1)
plot_pacf(df_ACF, lags=56, method='ywm', ax=ax2)

ax1.set_title('Autocorrelation Function (ACF)')
ax2.set_title('Partial Autocorrelation Function (PACF)')

plt.tight_layout()
plt.show()

In [0]:
# Start again from the untouched frame saved as `df_original`
# (the previous spelling `df_orginal` is never defined and raised NameError).
df = df_original.copy()
df = optimize_df(df)
# Keep charge dates from 2021-10-01 so lag features have some history before 2022.
df = df[df['charge_dt'] >= '2021-10-01']

In [0]:
def generate_lag_features(df, charge_lags, flight_lags, group_cols=None, fill_value=-1):
    """Add lagged copies of ``unt_pre`` for every (charge lag, flight lag) pair.

    For each pair, column ``C{c}F{f}`` holds the ``unt_pre`` of the row whose
    ``charge_dt`` is ``c`` days earlier and whose ``flight_dt`` is ``f`` days
    earlier than the current row.

    Changes from the previous implementation:
      * only the new lag column is filled with ``fill_value``; NaNs in other
        columns (target, price, ...) are no longer overwritten;
      * the caller's frame is not mutated;
      * ``group_cols`` can name extra key columns (e.g. ``['region']``) so rows
        are only matched within the same group. With the default, matching is
        on the two dates alone, so ``(charge_dt, flight_dt)`` should be unique
        or rows multiply on every merge.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``charge_dt``, ``flight_dt`` and ``unt_pre``.
    charge_lags, flight_lags : iterable of int
        Lags in days.
    group_cols : list of str, optional
        Additional columns that must match between a row and its lagged source.
    fill_value : scalar, default -1
        Value written where no lagged observation exists.

    Returns
    -------
    pandas.DataFrame
        A new frame with one extra column per lag pair; the row order of ``df``
        is preserved (left merge).
    """
    extra_keys = list(group_cols) if group_cols else []
    merge_keys = ['charge_dt', 'flight_dt'] + extra_keys

    # assign() builds a new frame, leaving the caller's object untouched.
    df = df.assign(
        charge_dt=pd.to_datetime(df['charge_dt']),
        flight_dt=pd.to_datetime(df['flight_dt']),
    )
    # Snapshot taken before any lag column exists, so every lag reads the raw target.
    source = df[merge_keys + ['unt_pre']]

    for c_lag in charge_lags:
        for f_lag in flight_lags:
            feature_col_name = f'C{c_lag}F{f_lag}'

            # Shift the source dates forward so they line up with the rows they feed.
            shifted = source.rename(columns={'unt_pre': feature_col_name}).assign(
                charge_dt=source['charge_dt'] + pd.Timedelta(days=c_lag),
                flight_dt=source['flight_dt'] + pd.Timedelta(days=f_lag),
            )
            df = df.merge(shifted, on=merge_keys, how='left')
            df[feature_col_name] = df[feature_col_name].fillna(fill_value)

    return df

# Weekly steps out to ten weeks plus a 364-day lag; flight lags additionally include 0.
weekly_steps = list(range(7, 71, 7))
flight_lags = [0] + weekly_steps + [364]
charge_lags = weekly_steps + [364]
df = generate_lag_features(df, charge_lags, flight_lags)
df.info()


In [0]:
# Keep the base columns, the C7..C70 x F0..F70 weekly lag grid and the C364F364 lag.
base_cols = ['charge_dt', 'flight_dt', 'dtg', 'unt_pre', 'ty_capacity', 'load_factor', 'yield', 'price']
weekly_lag_cols = [f'C{c_lag}F{f_lag}' for c_lag in range(7, 71, 7) for f_lag in range(0, 71, 7)]
df = df[base_cols + weekly_lag_cols + ['C364F364']]

In [0]:
# Drop the pre-2022 rows that were only loaded to supply lag history.
in_modelling_window = df['charge_dt'] >= '2022-01-01'
df = df[in_modelling_window]
optimize_df(df)

In [0]:
import re
# Correlate every column whose name starts with 'C' (the lag features) with the target.
target_col = 'unt_pre'
feature_cols = [name for name in df.columns if name.startswith('C')]

unique_feature_cols = list(set(feature_cols))
all_cols_for_corr = [target_col, *unique_feature_cols]
corr_matrix = df[all_cols_for_corr].dropna().corr()
# Keep only the target's column and drop its self-correlation row.
target_corrs = (
    corr_matrix[[target_col]]
    .drop(target_col)
    .reset_index()
    .rename(columns={'index': 'feature', target_col: 'correlation'})
)

def parse_lags(feature_name):
    """Split a lag feature name such as ``'C7F14'`` into ``(charge_lag, flight_lag)``.

    The name must start with ``C<digits>F<optional minus><digits>``; anything
    after that prefix is ignored. Returns ``(None, None)`` when the name does
    not start with that pattern.
    """
    parsed = re.match(r'C(\d+)F(-?\d+)', feature_name)
    if parsed is None:
        return None, None
    charge_part, flight_part = parsed.groups()
    return int(charge_part), int(flight_part)

# Recover the numeric lags from each feature name.
target_corrs[['charge_lag', 'flight_lag']] = target_corrs['feature'].apply(lambda x: pd.Series(parse_lags(x)))

target_corrs = target_corrs.drop_duplicates(subset=['feature'])
# Charge lag on rows, flight lag on columns, both sorted ascending.
corr_pivot = (
    target_corrs
    .pivot(index='charge_lag', columns='flight_lag', values='correlation')
    .sort_index()
    .sort_index(axis=1)
)
plt.figure(figsize=(16, 12))
sns.heatmap(corr_pivot, annot=True, fmt=".2f", cmap='viridis', cbar=True, linewidths=.1)
plt.title('Correlation of Lag Features with Target ("unt_pre")')
plt.xlabel("Flight Date Lag (days)")
plt.ylabel("Charge Date Lag (days)")
plt.tight_layout()
plt.show()

In [0]:
# Seasonality Analysis

def create_features(df, origin='2022-01-01'):
    """Add calendar features for the charge and flight dates, in place.

    ``charge_dt`` is parsed first, so string or ``date`` input works; the
    previous version used it in date arithmetic before converting it.
    ``flight_dt`` is then (re)computed as ``charge_dt + dtg`` days, and for
    each of the two dates the month, day-of-week, day-of-month, day-of-year
    and year are added as integer columns. ``day_number`` is the number of
    days between ``charge_dt`` and ``origin``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``charge_dt`` and ``dtg``; modified in place.
    origin : str or datetime-like, default '2022-01-01'
        Reference date for ``day_number``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object (callers that ignore the return value still work).
    """
    df['charge_dt'] = pd.to_datetime(df['charge_dt'])
    df['flight_dt'] = df['charge_dt'] + pd.to_timedelta(df['dtg'], unit='D')

    # Flight features first, then charge features, matching the original column order.
    for prefix in ('flight', 'charge'):
        stamps = df[f'{prefix}_dt'].dt
        df[f'{prefix}_month'] = stamps.month.astype(int)
        df[f'{prefix}_dow'] = stamps.dayofweek.astype(int)
        df[f'{prefix}_dom'] = stamps.day.astype(int)
        df[f'{prefix}_doy'] = stamps.dayofyear.astype(int)
        df[f'{prefix}_year'] = stamps.year.astype(int)

    df['day_number'] = (df['charge_dt'] - pd.to_datetime(origin)).dt.days.astype(int)
    return df

# create_features adds its columns to df in place, so the return value is not needed here.
create_features(df)
# Downcast the newly added columns; as the cell's last expression this also displays the frame.
optimize_df(df)

In [0]:
# Flight Month vs Charge Month

# Long format: one row per (sale, month_type) so both month definitions share an axis.
df_melt = pd.melt(df, id_vars='unt_pre', value_vars=['flight_month', 'charge_month'], var_name='month_type', value_name='month')
# Despite the variable name, these are totals (sum), matching the y-axis label.
mean_sales = df_melt.groupby(['month', 'month_type'])['unt_pre'].sum().reset_index()

plt.figure(figsize=(20, 8))
sns.barplot(x='month', y='unt_pre', hue='month_type', data=mean_sales)
plt.style.use('ggplot')
plt.title('Historic Sales by Month since 2022')
plt.xlabel('Month')
plt.ylabel('Total Sales')
plt.show()

In [0]:
# Flight DoW vs Charge DoW

# Long format: one row per (sale, dow_type) so both day-of-week definitions share an axis.
df_melt = df.melt(id_vars='unt_pre', value_vars=['flight_dow', 'charge_dow'], var_name='dow_type', value_name='dow')
# Despite the variable name, these are totals (sum), matching the y-axis label.
mean_sales = df_melt.groupby(['dow', 'dow_type'])['unt_pre'].sum().reset_index()
plt.figure(figsize=(15, 8))
sns.barplot(data=mean_sales, x='dow', y='unt_pre', hue='dow_type')
plt.style.use('ggplot')
plt.title('Historic Sales by DoW since 2022')
plt.xlabel('DoW')
plt.ylabel('Total Sales')
# plt.show() returns None, so wrapping it in display() only displayed None.
plt.show()

In [0]:
# Weekly Seasonality

# Whole of January 2024: the previous strict bounds dropped 1 Jan and 31 Jan
# although the title covers the full month.
in_jan24 = (df['charge_dt'] >= '2024-01-01') & (df['charge_dt'] < '2024-02-01')
df[in_jan24].groupby('charge_dt')['unt_pre'].sum().plot(figsize=(20,5), title = 'sales by charge date (Jan24)', y='unt_pre', linewidth=10)
plt.style.use('ggplot')
plt.show()

In [0]:
# DoM Price vs Sales

price_vs_slaes = df.copy()
# Optional month exclusion that was tried here and left disabled:
#[(df['charge_month'] != 12) & (df['charge_month'] != 3) & (df['charge_month'] != 4) & ((df['charge_month'] != 1))]

# Average sales and average price for each day of the charge month.
by_charge_dom = price_vs_slaes.groupby('charge_dom')
mean_sales = by_charge_dom['unt_pre'].mean().reset_index()
mean_price = by_charge_dom['price'].mean().reset_index()

fig, ax1 = plt.subplots(figsize=(15, 8))
plt.style.use('ggplot')

# Left axis: sales.
ax1.plot(mean_sales['charge_dom'], mean_sales['unt_pre'], label='Mean Sales', color='tab:red', marker='o', linewidth=12)
ax1.set_xlabel('Charge Day of Month')
ax1.set_ylabel('Mean Sales', color='tab:red')
ax1.tick_params(axis='y', labelcolor='tab:red')

# Right axis: price, sharing the x-axis.
ax2 = ax1.twinx()
ax2.plot(mean_price['charge_dom'], mean_price['price'], label='Mean Price', color='tab:blue', marker='s', linewidth=12)
ax2.set_ylabel('Mean Price', color='tab:blue')
ax2.tick_params(axis='y', labelcolor='tab:blue')

fig.suptitle('Mean Sales and Price by Charge Day of Month (2022 - ToDate)')
fig.legend(loc='upper right', bbox_to_anchor=(0.9, 0.9))
plt.tight_layout()
plt.show()

In [0]:
# Holiday Features

# Holidays from the `holidays` package for 2022-2025 as a (ds, holiday) table sorted by date.
uk_holidays = holidays.UK(years=range(2022, 2026))
holidays_df = pd.DataFrame(list(uk_holidays.items()), columns=['ds', 'holiday'])
holidays_df = holidays_df.sort_values(by='ds')
holidays_df

In [0]:
# Holidays added by hand on top of the library calendar.
additional_holidays = pd.DataFrame([
    {'ds': '2022-04-18', 'holiday': 'Easter Monday'},
    {'ds': '2022-08-29', 'holiday': 'Summer Bank Holiday'},
    {'ds': '2023-04-10', 'holiday': 'Easter Monday'},
    {'ds': '2023-08-28', 'holiday': 'Summer Bank Holiday'},
    {'ds': '2024-04-01', 'holiday': 'Easter Monday'},
    {'ds': '2024-08-26', 'holiday': 'Summer Bank Holiday'},
    {'ds': '2025-04-21', 'holiday': 'Easter Monday'},
    {'ds': '2025-08-25', 'holiday': 'Summer Bank Holiday'},
])
holidays_df = pd.concat([holidays_df, additional_holidays], ignore_index=True)
# Normalise 'ds' BEFORE de-duplicating: the library rows and the string rows above
# never compared equal, so duplicates survived the old drop_duplicates().
holidays_df['ds'] = pd.to_datetime(holidays_df['ds'])
# One row per date (first name wins) so later joins on the date cannot multiply rows.
holidays_df = (
    holidays_df
    .drop_duplicates(subset='ds', keep='first')
    .sort_values(by='ds')
    .reset_index(drop=True)
)
holidays_df
   

In [0]:
holidays_df["ds"] = pd.to_datetime(holidays_df["ds"])
# Flag holidays with a membership test instead of two left merges: a merge duplicates
# every df row whose date appears more than once in holidays_df.
holiday_dates = holidays_df["ds"].drop_duplicates()
df = df.assign(
    is_charge_date_holiday=df['charge_dt'].isin(holiday_dates).astype(int),
    is_flight_date_holiday=df['flight_dt'].isin(holiday_dates).astype(int),
)
df = df.sort_values(by=['charge_dt', 'dtg'], ascending=[True, True])
df.set_index('charge_dt', inplace=True)
optimize_df(df)

In [0]:
# Column groups used by the encoding and scaling helpers.
index_cols = ['charge_dt', 'flight_dt']
cyclic_cols = ['flight_dom', 'flight_doy', 'charge_dom', 'charge_doy', 'flight_month', 'charge_month', 'flight_dow', 'charge_dow']
# dtg and capacity, the C7..C70 x F0..F70 weekly lag grid, and the C364F364 lag.
lag_grid_cols = [f'C{c_lag}F{f_lag}' for c_lag in range(7, 71, 7) for f_lag in range(0, 71, 7)]
normal_num_cols = ['dtg', 'ty_capacity'] + lag_grid_cols + ['C364F364']
standard_num_cols = ['charge_year', 'flight_year', 'day_number']
# Drop the columns not used as model inputs, then move the index back into a column.
df = df.drop(columns=['load_factor', 'yield', 'price']).reset_index()
# Snapshot kept so the index columns can be restored after encoding and scaling.
original_df = df.copy()

def encode_cyclic_features(df, cols, periods=None):
    """Replace each cyclic column with ``<col>_sin`` / ``<col>_cos`` (float16).

    The angle is ``2*pi*value/period``. The previous version used the column
    maximum as the period, which is wrong for 0-based features: with
    day-of-week 0..6 the maximum is 6, so Monday (0) and Sunday (6) landed on
    the same point of the circle. The period is now resolved in this order:

      1. ``periods[col]`` when supplied;
      2. a known period chosen by the column-name suffix
         (``_dow`` 7, ``_month`` 12, ``_dom`` 31, ``_doy`` 366);
      3. the column maximum (the old behaviour) for any other column.

    Parameters
    ----------
    df : pandas.DataFrame
        Modified in place: source columns are dropped, sin/cos columns added.
    cols : iterable of str
        Columns to encode.
    periods : dict, optional
        Explicit ``{column: period}`` overrides.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    known_periods = {'dow': 7, 'month': 12, 'dom': 31, 'doy': 366}
    overrides = periods or {}

    for col in cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')

        if col in overrides:
            period = overrides[col]
        else:
            suffix = str(col).rsplit('_', 1)[-1]
            period = known_periods.get(suffix, df[col].max())

        # Do the trigonometry in float64 regardless of how the column was downcast.
        angle = 2 * np.pi * df[col].astype('float64') / period
        df[col + '_sin'] = np.sin(angle).astype('float16')
        df[col + '_cos'] = np.cos(angle).astype('float16')
        df.drop(col, axis=1, inplace=True)
    return df

def scale_normal_cols(df, cols):
    """Overwrite each listed column with its ``StandardScaler``-transformed values.

    A new scaler is fitted per column on the values in ``df`` itself.
    Returns the same frame. NOTE(review): if ``df`` also contains rows later
    used for validation, their statistics are included in the fit.
    """
    for name in cols:
        # Double brackets give the 2-D input the scaler is called with.
        column_2d = df[[name]]
        df[name] = StandardScaler().fit_transform(column_2d)
    return df

def scale_standard_cols(df, cols):
    """Overwrite each listed column with its ``MinMaxScaler``-transformed values.

    A new scaler is fitted per column on the values in ``df`` itself.
    Returns the same frame. NOTE(review): if ``df`` also contains rows later
    used for validation, their range is included in the fit.
    """
    for name in cols:
        # Double brackets give the 2-D input the scaler is called with.
        column_2d = df[[name]]
        df[name] = MinMaxScaler().fit_transform(column_2d)
    return df

# Encode and scale the feature groups.
df = encode_cyclic_features(df, cyclic_cols)
df = scale_normal_cols(df, normal_num_cols)
df = scale_standard_cols(df, standard_num_cols)
#df = pd.get_dummies(df, columns=['region', 'routetype'])

# Restore the unscaled date columns and use them as the index.
# The loop variable is deliberately not called `col`: at module level that name
# would overwrite the `col` function imported from pyspark.sql.functions.
for index_col in index_cols:
    df[index_col] = original_df[index_col]
df.set_index(index_cols, inplace=True)

df = optimize_df(df)
df.head()

In [0]:
#LR train test
# Expanding-window cross-validation of a plain LinearRegression.
# The folds are built over the distinct charge dates (not rows), so all rows of a
# charge date fall on the same side of a split.

unique_dates = df.index.get_level_values('charge_dt').unique().sort_values()
# 12 folds, each validating on 7 consecutive charge dates.
tscv = TimeSeriesSplit(n_splits=12, test_size=7)

# 4 x 3 grid: one panel per fold.
fig, axs = plt.subplots(4, 3, figsize=(16, 8), sharex=True)
plt.style.use('ggplot')
fold = 0
preds = []
LR_scores = []          # row-level RMSE per fold
all_y_test = []         # actuals per fold, read by the result cells
all_y_pred_series = []  # predictions per fold, indexed like the actuals

for train_index, val_index in tscv.split(unique_dates):
    train_dates = unique_dates[train_index]
    val_dates = unique_dates[val_index]

    # Map the date split back onto rows.
    train_mask = df.index.get_level_values('charge_dt').isin(train_dates)
    val_mask = df.index.get_level_values('charge_dt').isin(val_dates)

    train_data = df[train_mask]
    val_data = df[val_mask]

    # Daily totals, used only for the fold illustration.
    total_sales_train = train_data.groupby('charge_dt')['unt_pre'].sum().reset_index()
    total_sales_val = val_data.groupby('charge_dt')['unt_pre'].sum().reset_index()
    
    ax = axs[fold // 3, fold % 3]

    total_sales_train.plot(ax=ax, label='Train', x='charge_dt', y='unt_pre', style='-')
    total_sales_val.plot(ax=ax, label='Val', x='charge_dt', y='unt_pre', style='-')
    # Dashed line at the first validation date.
    ax.axvline(val_data.index.get_level_values('charge_dt').min(), linestyle='--', color='black')
    
    # Show the six months leading up to the end of the validation window.
    end_date = val_dates.max()
    start_date = end_date - pd.DateOffset(months=6)
    ax.set_xlim(start_date, end_date)

    ax.set_title(f'Fold {fold+1}')
    fold += 1

    # Every column except the target is used as a feature.
    X_train = train_data.drop('unt_pre', axis=1)
    y_train = train_data['unt_pre']

    X_test = val_data.drop('unt_pre', axis=1)
    y_test = val_data['unt_pre']

    LR_model = LinearRegression()
    LR_model.fit(X_train, y_train)

    y_pred = LR_model.predict(X_test)
    # Keep the (charge_dt, flight_dt) index so predictions can be aggregated later.
    y_pred_series = pd.Series(y_pred, index=X_test.index, name='unt_pre_pred')
    all_y_test.append(y_test)
    all_y_pred_series.append(y_pred_series)
    preds.extend(y_pred)
    LR_score = np.sqrt(mean_squared_error(y_test, y_pred))
    LR_scores.append(LR_score)

plt.tight_layout()
plt.show()

In [0]:
#LR Results

# Per-fold RMSE, then its mean and spread across folds.
lr_mean_rmse = np.mean(LR_scores)
lr_std_rmse = np.std(LR_scores)
print(f'individual scores: {LR_scores}')
print(f'combined score: {lr_mean_rmse}')
print(f'std: {lr_std_rmse}')

In [0]:
#LR Plot

# Stitch the folds together and compare actual vs predicted daily totals.
full_y_test = pd.concat(all_y_test)
full_y_pred = pd.concat(all_y_pred_series)

y_test_total = full_y_test.groupby(level='charge_dt').sum()
y_pred_total = full_y_pred.groupby(level='charge_dt').sum()

rmse_LR = np.sqrt(mean_squared_error(y_test_total, y_pred_total))
print(f'Overall RMSE on Daily Totals (All Folds): {rmse_LR:.2f}')

fig, ax = plt.subplots(figsize=(15, 6))
ax.plot(y_test_total.index, y_test_total, label='Actual Daily Total', marker='.', linestyle='-')
ax.plot(y_pred_total.index, y_pred_total, label='Predicted Daily Total', marker='.', linestyle='--')
ax.set_title('LR Model: Actual vs. Predicted Daily Totals (All Folds Combined)')
ax.set_xlabel('Date')
ax.set_ylabel('Total Sales (unt_pre)')
ax.legend()
ax.grid(True)
plt.show()

In [0]:
#LR Plot (dynamic)

# Row-level actuals and predictions with the index moved back into columns.
results_df = pd.DataFrame({'actual': full_y_test, 'predicted': full_y_pred}).reset_index()
for date_col in ('flight_dt', 'charge_dt'):
    results_df[date_col] = pd.to_datetime(results_df[date_col])

# Days to go is recomputed from the two dates; charge_dt becomes the animation key.
results_df['dtg'] = (results_df['flight_dt'] - results_df['charge_dt']).dt.days
results_df['charge_dt'] = results_df['charge_dt'].astype(str)
results_df = results_df.sort_values(['charge_dt', 'dtg'])

agg_df = results_df.groupby(['charge_dt', 'dtg'])[['actual', 'predicted']].sum().reset_index()

melted_df = agg_df.melt(id_vars=['charge_dt', 'dtg'], value_vars=['actual', 'predicted'], var_name='sales_type', value_name='sales_value')

fig = px.line(melted_df, x='dtg', y='sales_value', color='sales_type', animation_frame='charge_dt', title='LR Model: Actual vs. Predicted Sales by Days To Go (Linear Regression)', color_discrete_map={'actual': 'red', 'predicted': 'blue'})

fig.update_layout(height=800, width=1600)

fig.show()

In [0]:
#XGB Train Test
# Expanding-window cross-validation of an XGBoost regressor.
# The folds are built over the distinct charge dates (not rows), so all rows of a
# charge date fall on the same side of a split.

unique_dates = df.index.get_level_values('charge_dt').unique().sort_values()
# 12 folds, each validating on 7 consecutive charge dates.
tscv = TimeSeriesSplit(n_splits=12, test_size=7)

# 4 x 3 grid: one panel per fold.
fig, axs = plt.subplots(4, 3, figsize=(16, 8), sharex=True)
plt.style.use('ggplot')
fold = 0
preds = []
XGBscores = []          # row-level RMSE per fold
all_y_test = []         # actuals per fold, read by the result cells
all_y_pred_series = []  # predictions per fold, indexed like the actuals

for train_index, val_index in tscv.split(unique_dates):
    train_dates = unique_dates[train_index]
    val_dates = unique_dates[val_index]

    # Map the date split back onto rows.
    train_mask = df.index.get_level_values('charge_dt').isin(train_dates)
    val_mask = df.index.get_level_values('charge_dt').isin(val_dates)

    train_data = df[train_mask]
    val_data = df[val_mask]

    # Daily totals, used only for the fold illustration.
    total_sales_train = train_data.groupby('charge_dt')['unt_pre'].sum().reset_index()
    total_sales_val = val_data.groupby('charge_dt')['unt_pre'].sum().reset_index()

    ax = axs[fold // 3, fold % 3]

    total_sales_train.plot(ax=ax, label='Train', x='charge_dt', y='unt_pre', style='-')
    total_sales_val.plot(ax=ax, label='Val', x='charge_dt', y='unt_pre', style='-')
    # Dashed line at the first validation date.
    ax.axvline(val_data.index.get_level_values('charge_dt').min(), linestyle='--', color='black')

    # Show the six months leading up to the end of the validation window.
    end_date = val_dates.max()
    start_date = end_date - pd.DateOffset(months=6)
    ax.set_xlim(start_date, end_date)

    ax.set_title(f'Fold {fold+1}')
    fold += 1

    # Every column except the target is used as a feature.
    X_train = train_data.drop('unt_pre', axis=1)
    y_train = train_data['unt_pre']

    X_test = val_data.drop('unt_pre', axis=1)
    y_test = val_data['unt_pre']

    XGB_model = xgb.XGBRegressor(base_score=0.5,booster='gbtree',n_estimators=2000,early_stopping_rounds=50,max_depth=5,learning_rate=0.01,objective='reg:pseudohubererror',enable_categorical=True)
    # NOTE(review): early stopping is monitored on the same rows that are scored below,
    # so the fold scores are likely optimistic — consider a separate early-stopping split.
    XGB_model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=100)

    y_pred = XGB_model.predict(X_test)
    # Keep the (charge_dt, flight_dt) index so predictions can be aggregated later.
    y_pred_series = pd.Series(y_pred, index=X_test.index, name='unt_pre_pred')
    all_y_test.append(y_test)
    all_y_pred_series.append(y_pred_series)
    preds.extend(y_pred)
    XGBscore = np.sqrt(mean_squared_error(y_test, y_pred))
    # Was `scores.append(...)`: `scores` is never defined; the list read by the results cell is XGBscores.
    XGBscores.append(XGBscore)

plt.tight_layout()
plt.show()

In [0]:
# XGB Results

# Per-fold RMSE, then its mean and spread across folds.
xgb_mean_rmse = np.mean(XGBscores)
xgb_std_rmse = np.std(XGBscores)
print(f'individual scores: {XGBscores}')
print(f'combined score: {xgb_mean_rmse}')
print(f'std: {xgb_std_rmse}')

In [0]:
#XGB Plot

# Stitch the folds together and compare actual vs predicted daily totals.
full_y_test = pd.concat(all_y_test)
full_y_pred = pd.concat(all_y_pred_series)

y_test_total = full_y_test.groupby(level='charge_dt').sum()
y_pred_total = full_y_pred.groupby(level='charge_dt').sum()

rmse_xgb = np.sqrt(mean_squared_error(y_test_total, y_pred_total))
print(f'Overall RMSE on Daily Totals (All Folds): {rmse_xgb:.2f}')

fig, ax = plt.subplots(figsize=(15, 6))
ax.plot(y_test_total.index, y_test_total, label='Actual Daily Total', marker='.', linestyle='-')
ax.plot(y_pred_total.index, y_pred_total, label='Predicted Daily Total', marker='.', linestyle='--')
ax.set_title('XGB Model: Actual vs. Predicted Daily Totals (All Folds Combined)')
ax.set_xlabel('Date')
ax.set_ylabel('Total Sales (unt_pre)')
ax.legend()
ax.grid(True)
plt.show()

In [0]:
#XGB Plot (dynamic)

# Row-level actuals and predictions with the index moved back into columns.
results_df = pd.DataFrame({'actual': full_y_test, 'predicted': full_y_pred})
results_df.reset_index(inplace=True)
results_df['flight_dt'] = pd.to_datetime(results_df['flight_dt'])
results_df['charge_dt'] = pd.to_datetime(results_df['charge_dt'])

# Days to go is recomputed from the two dates; charge_dt becomes the animation key.
results_df['dtg'] = (results_df['flight_dt'] - results_df['charge_dt']).dt.days
results_df['charge_dt'] = results_df['charge_dt'].astype(str)
results_df.sort_values(['charge_dt', 'dtg'], inplace=True)

agg_df = results_df.groupby(['charge_dt', 'dtg'])[['actual', 'predicted']].sum().reset_index()

melted_df = pd.melt(agg_df, id_vars=['charge_dt', 'dtg'], value_vars=['actual', 'predicted'], var_name='sales_type', value_name='sales_value')

# Title fixed: it was copied from the LR cell and labelled this model "(Linear Regression)".
fig = px.line(melted_df, x='dtg', y='sales_value', color='sales_type', animation_frame='charge_dt', title='XGB Model: Actual vs. Predicted Sales by Days To Go (XGBoost)', color_discrete_map={'actual': 'red', 'predicted': 'blue'})

fig.update_layout(height=800, width=1600)

fig.show()