In [0]:
import pandas as pd

# Location of the raw CSV extract.
# NOTE(review): absolute, user-specific workspace path — consider moving it to a
# shared config cell so the notebook runs for other users.
source_csv = '/Workspace/Users/barney.hodge@easyjet.com/LGWBHD - Test.csv'

df = pd.read_csv(source_csv)
# Print column names, dtypes and non-null counts of the loaded frame.
df.info()

In [0]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [0]:
%pip install holidays

In [0]:
import holidays
# Holiday calendar from the `holidays` package for calendar years 2022-2025
# (range() excludes the upper bound, 2026).
uk_holidays = holidays.UK(years=range(2022, 2026))

# One row per (date, holiday name) pair, ordered by date.
holidays_df = pd.DataFrame(
    list(uk_holidays.items()),
    columns=['ds', 'holiday'],
).sort_values(by='ds')
holidays_df

In [0]:
# Bank holidays appended by hand on top of the library calendar.
# NOTE(review): presumably these are dates the `holidays.UK` default calendar
# does not return for this region — confirm against the installed version.
additional_holidays = pd.DataFrame([
    {'ds': '2022-04-18', 'holiday': 'Easter Monday'},
    {'ds': '2022-08-29', 'holiday': 'Summer Bank Holiday'},
    {'ds': '2023-04-10', 'holiday': 'Easter Monday'},
    {'ds': '2023-08-28', 'holiday': 'Summer Bank Holiday'},
    {'ds': '2024-04-01', 'holiday': 'Easter Monday'},
    {'ds': '2024-08-26', 'holiday': 'Summer Bank Holiday'},
    {'ds': '2025-04-21', 'holiday': 'Easter Monday'},
    {'ds': '2025-08-25', 'holiday': 'Summer Bank Holiday'},
])

holidays_df = pd.concat([holidays_df, additional_holidays], ignore_index=True)

# Normalise 'ds' to datetime BEFORE de-duplicating: the library rows hold
# datetime.date objects while the manual rows hold strings, and those never
# compare equal, so de-duplicating first would let the same date through twice.
holidays_df['ds'] = pd.to_datetime(holidays_df['ds'])

# De-duplicate on the date alone (the same day may carry differently worded
# names). keep='first' retains the library's name when both sources have the
# date. A unique 'ds' is required so that joining on it cannot multiply rows,
# and it makes this cell safe to re-run.
holidays_df = (
    holidays_df
    .drop_duplicates(subset='ds', keep='first')
    .sort_values(by='ds')
    .reset_index(drop=True)
)
holidays_df

In [0]:
# Parse the date columns so they can be compared with the holiday calendar.
df['charge_dt'] = pd.to_datetime(df['charge_dt'])
df['flight_dt'] = pd.to_datetime(df['flight_dt'])
holidays_df['ds'] = pd.to_datetime(holidays_df['ds'])

# Flag holiday dates with a membership test rather than a left merge: a merge
# would silently multiply rows of `df` if the calendar ever contained the same
# date twice, whereas isin() can never change the row count.
holiday_dates = holidays_df['ds']
df['is_charge_date_holiday'] = df['charge_dt'].isin(holiday_dates).astype(int)
df['is_flight_date_holiday'] = df['flight_dt'].isin(holiday_dates).astype(int)
df.head()

In [0]:
# Columns removed from the working frame; none of them is referenced by the
# later cells visible in this notebook.
columns_to_drop = [
    'flight_time',
    'sector',
    'route',
    'region',
    'routetype',
    'total_optionality_score',
    'combined_bp',
    'combined_bp_DoW',
    'base',
    'dest',
    'prop_from_base',
    'prop_from_dest',
    'time_quality_score',
]
df = df.drop(columns=columns_to_drop)
df.info()

In [0]:
# Output flag column -> date column it is derived from.
weekday_flags = {
    'is_charge_date_weekday': 'charge_dt',
    'is_flight_date_weekday': 'flight_dt',
}
for flag_col, date_col in weekday_flags.items():
    # pandas numbers Monday as 0 and Sunday as 6, so values below 5 are Mon-Fri.
    df[flag_col] = df[date_col].dt.dayofweek.lt(5).astype(int)

In [0]:
# Data Optimisation
import numpy as np
def optimize_df(df):
    """Shrink a DataFrame's memory footprint in place and report the saving.

    Columns whose name contains 'flightkey' are converted to ``category``;
    columns with a float dtype are downcast with
    ``pd.to_numeric(downcast='float')`` and columns with an int dtype with
    ``pd.to_numeric(downcast='integer')``. Every other column is left alone.

    Args:
        df: Frame to optimise. It is modified in place.

    Returns:
        The same ``df`` object, after conversion.
    """
    def _size_mb(frame):
        # Deep accounting so object/string payloads are included.
        return frame.memory_usage(deep=True).sum() / 1024**2

    start_mem = _size_mb(df)
    print(f"Initial memory usage: {start_mem:.2f} MB")

    for col in df.columns:
        # The name-based rule takes precedence over the dtype-based rules.
        if 'flightkey' in col:
            df[col] = df[col].astype('category')
            continue

        dtype_name = str(df[col].dtype)
        if 'float' in dtype_name:
            df[col] = pd.to_numeric(df[col], downcast='float')
        elif 'int' in dtype_name:
            df[col] = pd.to_numeric(df[col], downcast='integer')

    end_mem = _size_mb(df)
    reduction = 100 * (start_mem - end_mem) / start_mem
    print(f"Final memory usage: {end_mem:.2f} MB ({reduction:.2f}% reduction)")

    return df

# optimize_df converts the columns of the frame it is given in place, so the
# returned reference does not need to be captured; df.info() then shows the
# resulting dtypes.
optimize_df(df)
df.info()

In [0]:
# Columns that encode_cyclic_features replaces with <col>_sin / <col>_cos pairs.
# NOTE(review): presumably day-of-week, day-of-month and month for the flight
# and charge dates — confirm against the data dictionary.
cyclic_cols = ['flight_dow', 'charge_dow', 'flight_dom', 'charge_dom', 'flight_mth', 'charge_mth']

# Cycle length for each calendar column encoded in this notebook.
# NOTE(review): assumes *_dow is day-of-week, *_dom day-of-month and *_mth
# month-of-year — confirm against the data dictionary.
_CYCLE_PERIODS = {
    'flight_dow': 7, 'charge_dow': 7,
    'flight_dom': 31, 'charge_dom': 31,
    'flight_mth': 12, 'charge_mth': 12,
}


def encode_cyclic_features(df, cols, periods=None):
    """Replace cyclic columns with sine/cosine encodings, in place.

    For every column in ``cols`` the values are coerced to numeric
    (unparseable entries become NaN), mapped to an angle
    ``2 * pi * value / period`` and stored as ``<col>_sin`` / ``<col>_cos``
    (float16). The source column is then dropped.

    The period is a property of the cycle, not of the data. Deriving it from
    the observed maximum makes the first and last value of a zero-based
    encoding (e.g. day-of-week 0..6) coincide, and gives a wrong period
    whenever the data does not contain the largest value. The period is
    therefore looked up, in order, from:

    1. ``periods`` (caller-supplied ``{column: period}`` overrides),
    2. the built-in calendar periods for the notebook's known columns,
    3. the column's observed maximum, as a last resort for unknown columns.

    Args:
        df: Frame holding the columns; it is modified in place.
        cols: Names of the columns to encode.
        periods: Optional mapping of column name to cycle length.

    Returns:
        The same ``df`` object with the encoded columns added.

    Raises:
        ValueError: If the period resolved for a column is not a positive
            number (e.g. the fallback maximum is 0 or NaN).
    """
    known_periods = dict(_CYCLE_PERIODS)
    if periods:
        known_periods.update(periods)

    for col in cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')

        period = known_periods.get(col)
        if period is None:
            period = df[col].max()
        # `not (period > 0)` also rejects NaN, which compares False to everything.
        if not (period > 0):
            raise ValueError(
                f"Cannot encode '{col}': cycle period must be positive, got {period!r}"
            )

        angle = 2 * np.pi * df[col] / period
        df[col + '_sin'] = np.sin(angle).astype('float16')
        df[col + '_cos'] = np.cos(angle).astype('float16')
        df.drop(col, axis=1, inplace=True)
    return df

# Swap each raw cyclic column for its <col>_sin / <col>_cos pair.
df = encode_cyclic_features(df, cyclic_cols)

In [0]:
# Output year column -> date column it is extracted from.
year_features = {
    'flight_year': 'flight_dt',
    'charge_year': 'charge_dt',
}
for year_col, date_col in year_features.items():
    df[year_col] = df[date_col].dt.year.astype(int)

# Whole days elapsed between the fixed reference date and the charge date.
reference_date = pd.to_datetime('2022-10-01')
days_since_reference = (df['charge_dt'] - reference_date).dt.days
df['day_number'] = days_since_reference.astype(int)

In [0]:
# Run the in-place downcast again so the columns added since the first pass
# are shrunk as well, then show the resulting dtypes.
optimize_df(df)
df.info()

In [0]:
# Order rows by charge date, then by dtg within a date. The TimeSeriesSplit
# folds in the modelling cells slice rows by position, so this ordering decides
# which rows count as "earlier" (train) and "later" (test).
df = df.sort_values(by=['charge_dt','dtg'])
df.head()

In [0]:
# Inspect the last rows of the sorted frame.
df.tail()

In [0]:
# Keep only rows whose dtg is strictly greater than 7.
# NOTE(review): dtg presumably means "days to go"; the cutoff equals the longest
# forecast horizon (sales_lead_7) used by the modelling loops — confirm intent.
df = df[df['dtg'] > 7]

In [0]:
# Move the identifying columns into a MultiIndex so they stay attached to each
# row; the modelling cells recover them with reset_index() when they collect
# per-row predictions.
df.set_index(['flightkey','charge_dt','flight_dt'], inplace=True)
df.head()

In [0]:
# Inspect the last rows of the re-indexed frame.
df.tail()

In [0]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import mlflow
# Switch MLflow autologging off.
# NOTE(review): presumably to avoid recording a run for every fold fitted
# below — confirm.
mlflow.autolog(disable=True)


# --- Feature groups ----------------------------------------------------------
# Columns standardised by the ColumnTransformer before the regressor sees them.
features_to_scale = [
    'dtg', 'ty_capacity', 'cumulative_sales', 'Loadfactor',
    'sales_lag_1', 'sales_lag_2', 'sales_lag_3', 'sales_lag_4',
    'sales_lag_5', 'sales_lag_6', 'sales_lag_7',
    'sales_lag_14', 'sales_lag_21', 'sales_lag_28',
    'sale_length', 'sale_period_progress',
    'SF7C7', 'SF14C14', 'SF21C21', 'SF28C28',
    'flight_year', 'charge_year', 'day_number',
]

# Columns passed through untouched (sin/cos encodings and 0/1 flags).
features_already_scaled = [
    'flight_dow_sin', 'flight_dow_cos', 'charge_dow_sin', 'charge_dow_cos',
    'flight_dom_sin', 'flight_dom_cos', 'charge_dom_sin', 'charge_dom_cos',
    'flight_mth_sin', 'flight_mth_cos', 'charge_mth_sin', 'charge_mth_cos',
    'is_charge_date_holiday', 'is_flight_date_holiday',
    'is_charge_date_weekday', 'is_flight_date_weekday',
]

feature_cols = features_to_scale + features_already_scaled

# --- Model -------------------------------------------------------------------
preprocessor = ColumnTransformer(
    transformers=[('num', StandardScaler(), features_to_scale)],
    remainder='passthrough',
)
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

# --- Accumulators filled by the cross-validation loop -------------------------
results = []                  # one metrics record per (horizon, fold)
feature_importance_list = []  # one coefficient record per (horizon, fold, feature)
prediction_data = []          # one actual-vs-predicted frame per (horizon, fold)

tscv = TimeSeriesSplit(n_splits=5)

# One model per forecast horizon: the target for horizon h is sales_lead_<h>.
for horizon in range(1, 8):
    horizon_label = f'Lead_{horizon}'
    target_col = f'sales_lead_{horizon}'

    X = df[feature_cols]
    y = df[target_col]

    for fold_idx, (train_index, test_index) in enumerate(tscv.split(X)):
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]

        # The pipeline is re-fitted from scratch on every fold.
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)

        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test, y_pred)
        results.append({'Horizon': horizon_label, 'Fold': fold_idx, 'MAE': mae, 'RMSE': rmse, 'R2': r2})

        # The transformer emits the scaled columns first and the passthrough
        # remainder after them, which matches this name order.
        model = pipeline.named_steps['regressor']
        coefs = model.coef_
        transformed_feature_names = features_to_scale + features_already_scaled

        # Keep the row identifiers (the frame's index levels) next to each prediction.
        batch_df = pd.DataFrame(
            {'Actual': y_test.values, 'Predicted': y_pred},
            index=y_test.index,
        ).reset_index()
        batch_df['Horizon'] = horizon_label
        batch_df['Fold'] = fold_idx
        prediction_data.append(batch_df)

        feature_importance_list.extend(
            {'Horizon': horizon_label, 'Fold': fold_idx, 'Feature': feat_name, 'Coefficient': coef_val}
            for feat_name, coef_val in zip(transformed_feature_names, coefs)
        )

# --- Collect results ----------------------------------------------------------
metrics_df = pd.DataFrame(results)
importance_df = pd.DataFrame(feature_importance_list)
all_predictions_df = pd.concat(prediction_data, ignore_index=True)

# Fold-averaged error metrics per horizon.
summary_metrics = metrics_df.groupby('Horizon')[['MAE', 'RMSE', 'R2']].mean()

print("\n--- Average Performance per Horizon ---")
print(summary_metrics)

# Fold-averaged coefficient per feature and horizon, shown as a magnitude and
# ranked by the Lead_1 column.
avg_importance = importance_df.groupby(['Feature', 'Horizon'])['Coefficient'].mean()
importance_table = (
    avg_importance
    .unstack()
    .abs()
    .sort_values(by='Lead_1', ascending=False)
)

print("\n--- Feature Importance Table ---")
print(importance_table)

In [0]:
import plotly.express as px
import plotly.graph_objects as go

# Sum actual and predicted sales over every row sharing a charge date and
# horizon, ordered by date.
plot_df = (
    all_predictions_df
    .groupby(['charge_dt', 'Horizon'])[['Actual', 'Predicted']]
    .sum()
    .reset_index()
    .sort_values('charge_dt')
)

# Long format: one row per (date, horizon, series type), which is what
# px.line needs to colour by 'Type'.
plot_df_long = pd.melt(
    plot_df,
    id_vars=['charge_dt', 'Horizon'],
    value_vars=['Actual', 'Predicted'],
    var_name='Type',
    value_name='Sales',
)

# One stacked facet per horizon.
fig = px.line(
    plot_df_long,
    x='charge_dt',
    y='Sales',
    color='Type',
    facet_col='Horizon',
    facet_col_wrap=1,
    title='Total Daily Sales: Actual vs Predicted by Horizon',
    height=2400,
    color_discrete_map={'Actual': 'blue', 'Predicted': 'orange'},
)

# Unlink the facets' axes so each panel gets its own range and tick labels.
fig.update_xaxes(matches=None, showticklabels=True)
fig.update_yaxes(matches=None)

fig.show()

In [0]:
from sklearn.linear_model import Ridge

# --- Feature groups ----------------------------------------------------------
# Columns standardised by the ColumnTransformer before the regressor sees them.
features_to_scale = [
    'dtg', 'ty_capacity', 'cumulative_sales', 'Loadfactor',
    'sales_lag_1', 'sales_lag_2', 'sales_lag_3', 'sales_lag_4',
    'sales_lag_5', 'sales_lag_6', 'sales_lag_7',
    'sales_lag_14', 'sales_lag_21', 'sales_lag_28',
    'sale_length', 'sale_period_progress',
    'SF7C7', 'SF14C14', 'SF21C21', 'SF28C28',
    'flight_year', 'charge_year', 'day_number',
]

# Columns passed through untouched (sin/cos encodings and 0/1 flags).
features_already_scaled = [
    'flight_dow_sin', 'flight_dow_cos', 'charge_dow_sin', 'charge_dow_cos',
    'flight_dom_sin', 'flight_dom_cos', 'charge_dom_sin', 'charge_dom_cos',
    'flight_mth_sin', 'flight_mth_cos', 'charge_mth_sin', 'charge_mth_cos',
    'is_charge_date_holiday', 'is_flight_date_holiday',
    'is_charge_date_weekday', 'is_flight_date_weekday',
]

feature_cols = features_to_scale + features_already_scaled

# --- Model: same preprocessing as the linear baseline, L2-regularised fit -----
preprocessor = ColumnTransformer(
    transformers=[('num', StandardScaler(), features_to_scale)],
    remainder='passthrough',
)
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', Ridge(alpha=1.0)),
])

# --- Accumulators filled by the cross-validation loop -------------------------
results = []                  # one metrics record per (horizon, fold)
feature_importance_list = []  # one coefficient record per (horizon, fold, feature)
prediction_data = []          # one actual-vs-predicted frame per (horizon, fold)

tscv = TimeSeriesSplit(n_splits=5)

# One model per forecast horizon: the target for horizon h is sales_lead_<h>.
for horizon in range(1, 8):
    horizon_label = f'Lead_{horizon}'
    target_col = f'sales_lead_{horizon}'

    X = df[feature_cols]
    y = df[target_col]

    for fold_idx, (train_index, test_index) in enumerate(tscv.split(X)):
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]

        # The pipeline is re-fitted from scratch on every fold.
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)

        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test, y_pred)
        results.append({'Horizon': horizon_label, 'Fold': fold_idx, 'MAE': mae, 'RMSE': rmse, 'R2': r2})

        # The transformer emits the scaled columns first and the passthrough
        # remainder after them, which matches this name order.
        model = pipeline.named_steps['regressor']
        coefs = model.coef_
        transformed_feature_names = features_to_scale + features_already_scaled

        # Keep the row identifiers (the frame's index levels) next to each prediction.
        batch_df = pd.DataFrame(
            {'Actual': y_test.values, 'Predicted': y_pred},
            index=y_test.index,
        ).reset_index()
        batch_df['Horizon'] = horizon_label
        batch_df['Fold'] = fold_idx
        prediction_data.append(batch_df)

        feature_importance_list.extend(
            {'Horizon': horizon_label, 'Fold': fold_idx, 'Feature': feat_name, 'Coefficient': coef_val}
            for feat_name, coef_val in zip(transformed_feature_names, coefs)
        )

# --- Collect results ----------------------------------------------------------
metrics_df = pd.DataFrame(results)
importance_df = pd.DataFrame(feature_importance_list)
all_predictions_df = pd.concat(prediction_data, ignore_index=True)

# Fold-averaged error metrics per horizon.
summary_metrics = metrics_df.groupby('Horizon')[['MAE', 'RMSE', 'R2']].mean()

print("\n--- Average Performance per Horizon ---")
print(summary_metrics)

# Fold-averaged coefficient per feature and horizon, shown as a magnitude and
# ranked by the Lead_1 column.
avg_importance = importance_df.groupby(['Feature', 'Horizon'])['Coefficient'].mean()
importance_table = (
    avg_importance
    .unstack()
    .abs()
    .sort_values(by='Lead_1', ascending=False)
)

print("\n--- Feature Importance Table ---")
print(importance_table)

In [0]:
import plotly.express as px
import plotly.graph_objects as go

# Sum actual and predicted sales over every row sharing a charge date and
# horizon, ordered by date.
plot_df = (
    all_predictions_df
    .groupby(['charge_dt', 'Horizon'])[['Actual', 'Predicted']]
    .sum()
    .reset_index()
    .sort_values('charge_dt')
)

# Long format: one row per (date, horizon, series type), which is what
# px.line needs to colour by 'Type'.
plot_df_long = pd.melt(
    plot_df,
    id_vars=['charge_dt', 'Horizon'],
    value_vars=['Actual', 'Predicted'],
    var_name='Type',
    value_name='Sales',
)

# One stacked facet per horizon.
fig = px.line(
    plot_df_long,
    x='charge_dt',
    y='Sales',
    color='Type',
    facet_col='Horizon',
    facet_col_wrap=1,
    title='Total Daily Sales: Actual vs Predicted by Horizon',
    height=2400,
    color_discrete_map={'Actual': 'blue', 'Predicted': 'orange'},
)

# Unlink the facets' axes so each panel gets its own range and tick labels.
fig.update_xaxes(matches=None, showticklabels=True)
fig.update_yaxes(matches=None)

fig.show()

In [0]:
from xgboost import XGBRegressor 

# --- Feature groups ----------------------------------------------------------
# Columns standardised by the ColumnTransformer before the regressor sees them.
features_to_scale = [
    'dtg', 'ty_capacity', 'cumulative_sales', 'Loadfactor',
    'sales_lag_1', 'sales_lag_2', 'sales_lag_3', 'sales_lag_4',
    'sales_lag_5', 'sales_lag_6', 'sales_lag_7',
    'sales_lag_14', 'sales_lag_21', 'sales_lag_28',
    'sale_length', 'sale_period_progress',
    'SF7C7', 'SF14C14', 'SF21C21', 'SF28C28',
    'flight_year', 'charge_year', 'day_number',
]

# Columns passed through untouched (sin/cos encodings and 0/1 flags).
features_already_scaled = [
    'flight_dow_sin', 'flight_dow_cos', 'charge_dow_sin', 'charge_dow_cos',
    'flight_dom_sin', 'flight_dom_cos', 'charge_dom_sin', 'charge_dom_cos',
    'flight_mth_sin', 'flight_mth_cos', 'charge_mth_sin', 'charge_mth_cos',
    'is_charge_date_holiday', 'is_flight_date_holiday',
    'is_charge_date_weekday', 'is_flight_date_weekday',
]

feature_cols = features_to_scale + features_already_scaled

# The ColumnTransformer emits the scaled columns first and the passthrough
# remainder after them, so the model's feature order equals feature_cols.
# Built once here because it does not change between folds.
transformed_feature_names = features_to_scale + features_already_scaled

# --- Model -------------------------------------------------------------------
preprocessor = ColumnTransformer(
    transformers=[('num', StandardScaler(), features_to_scale)],
    remainder='passthrough',
)

# `verbosity=0` is XGBoost's switch for silent training. The previous
# `verbose=-1` is a LightGBM option that XGBoost does not recognise (it is
# forwarded as an unused booster parameter and triggers a warning).
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(
        n_estimators=1000,
        max_depth=5,
        learning_rate=0.1,
        verbosity=0,
        objective='reg:squarederror',
    )),
])

# --- Accumulators filled by the cross-validation loop -------------------------
results = []                  # one metrics record per (horizon, fold)
feature_importance_list = []  # one importance record per (horizon, fold, feature)
prediction_data = []          # one actual-vs-predicted frame per (horizon, fold)

# NOTE(review): TimeSeriesSplit cuts by row position, not by date. Several rows
# share a charge_dt, so a fold boundary can fall inside a single day, and the
# sales_lead_<h> targets of the last training rows look h days ahead into the
# test window. Consider splitting on unique charge dates with a gap of
# `horizon` days — confirm whether this matters for the reported metrics.
tscv = TimeSeriesSplit(n_splits=5)

# One model per forecast horizon: the target for horizon h is sales_lead_<h>.
for horizon in range(1, 8):
    horizon_label = f'Lead_{horizon}'
    target_col = f'sales_lead_{horizon}'

    X = df[feature_cols]
    y = df[target_col]

    for fold_idx, (train_index, test_index) in enumerate(tscv.split(X)):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # The pipeline is re-fitted from scratch on every fold.
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)

        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test, y_pred)

        results.append({'Horizon': horizon_label, 'Fold': fold_idx, 'MAE': mae, 'RMSE': rmse, 'R2': r2})

        model = pipeline.named_steps['regressor']
        importances = model.feature_importances_

        # Keep the row identifiers (the frame's index levels) next to each prediction.
        batch_df = pd.DataFrame({'Actual': y_test.values, 'Predicted': y_pred}, index=y_test.index).reset_index()
        batch_df['Horizon'] = horizon_label
        batch_df['Fold'] = fold_idx
        prediction_data.append(batch_df)

        for feat_name, imp_val in zip(transformed_feature_names, importances):
            feature_importance_list.append({'Horizon': horizon_label, 'Fold': fold_idx, 'Feature': feat_name, 'Importance': imp_val})

# --- Collect results ----------------------------------------------------------
metrics_df = pd.DataFrame(results)
importance_df = pd.DataFrame(feature_importance_list)
all_predictions_df = pd.concat(prediction_data, ignore_index=True)

# Fold-averaged error metrics per horizon.
summary_metrics = metrics_df.groupby('Horizon')[['MAE', 'RMSE', 'R2']].mean()

print("\n--- Average Performance per Horizon (XGBoost) ---")
print(summary_metrics)

# Fold-averaged importance per feature and horizon, ranked by the Lead_1 column.
avg_importance = importance_df.groupby(['Feature', 'Horizon'])['Importance'].mean()
importance_table = avg_importance.unstack()
importance_table = importance_table.sort_values(by='Lead_1', ascending=False)

print("\n--- Feature Importance Table ---")
print(importance_table)

In [0]:
import plotly.express as px
import plotly.graph_objects as go

# Sum actual and predicted sales over every row sharing a charge date and
# horizon, ordered by date.
plot_df = (
    all_predictions_df
    .groupby(['charge_dt', 'Horizon'])[['Actual', 'Predicted']]
    .sum()
    .reset_index()
    .sort_values('charge_dt')
)

# Long format: one row per (date, horizon, series type), which is what
# px.line needs to colour by 'Type'.
plot_df_long = pd.melt(
    plot_df,
    id_vars=['charge_dt', 'Horizon'],
    value_vars=['Actual', 'Predicted'],
    var_name='Type',
    value_name='Sales',
)

# One stacked facet per horizon.
fig = px.line(
    plot_df_long,
    x='charge_dt',
    y='Sales',
    color='Type',
    facet_col='Horizon',
    facet_col_wrap=1,
    title='Total Daily Sales: Actual vs Predicted by Horizon',
    height=2400,
    color_discrete_map={'Actual': 'blue', 'Predicted': 'orange'},
)

# Unlink the facets' axes so each panel gets its own range and tick labels.
fig.update_xaxes(matches=None, showticklabels=True)
fig.update_yaxes(matches=None)

fig.show()