In [0]:
# Imports and data loading

import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
import xgboost as xgb
from xgboost import XGBRegressor
from pyspark.sql.functions import to_date, to_timestamp, col, last, when, lit, dayofweek, dayofmonth, dayofyear, month, weekofyear, current_date, date_sub, datediff
from pyspark.sql.window import Window
from pyspark.sql import functions as F
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import ticker
from datetime import datetime, timedelta
from itertools import product
import joblib

# Raw sales rows plus the on-sale date of every flight.
sales_raw = spark.read.table('data_experience_commercial.cbt_1423_rtsuite.master').select(
    'charge_dt', 'dtg', 'chargeproduct', 'unt_net', 'channel', 'flightkey', 'flight_dt'
)
dmh = spark.read.table('data_experience_commercial.cbt_0923_segmentfinder.dimensions_history').select(
    'flightkey', 'onsale_dt'
)

# Row filters, named so the chain below reads top-down.
is_web_ticket = (col('chargeproduct') == 'Ticket') & (col('dtg') >= 0) & (col('channel') == 'Web/App')
below_onsale_span = col('dtg') < (datediff(col('flight_dt'), col('onsale_dt')) - 25)
from_2019 = col('charge_dt') >= '2019-01-01'

sales_daily = (
    sales_raw
    .join(dmh, on='flightkey', how='left')
    .drop('flightkey')
    .filter(is_web_ticket)
    .drop('chargeproduct', 'channel')
    .filter(below_onsale_span & from_2019)
    # Negative net units are floored at zero before aggregation.
    .withColumn('unt_net', F.when(col('unt_net') < 0, 0).otherwise(col('unt_net')))
    .groupby('charge_dt', 'dtg')
    .agg(F.sum('unt_net').alias('unt_net'))
    .orderBy('charge_dt', 'dtg')
)

# Collect to pandas: one row per (charge_dt, dtg) with summed unt_net.
df = sales_daily.toPandas()
df.info()

In [0]:
#Feature Engineering
# Parse charge_dt to pandas datetimes up front: create_features adds a timedelta to this column.
df['charge_dt'] = pd.to_datetime(df['charge_dt'])

def create_features(df):
    """Add calendar features for the charge and flight dates, in place.

    The flight date is derived as ``charge_dt + dtg`` days. For both dates the
    month, day-of-week, day-of-month, day-of-year and year are stored as
    string columns named ``flight_*`` / ``charge_*``. Afterwards ``charge_dt``
    becomes the index and the frame is sorted by it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``charge_dt`` (datetime-like or parseable) and ``dtg``
        (days to go) columns. The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so the call can be used either for its side
        effect or for its return value.
    """
    # Parse first so the date arithmetic below also works on string input.
    df['charge_dt'] = pd.to_datetime(df['charge_dt'])
    flight_dt = df['charge_dt'] + pd.to_timedelta(df['dtg'], unit='D')

    # Flight features first, then charge features (same column order as before).
    for prefix, dates in (('flight', flight_dt), ('charge', df['charge_dt'])):
        df[f'{prefix}_month'] = dates.dt.month.astype(str)
        df[f'{prefix}_dow'] = dates.dt.dayofweek.astype(str)
        df[f'{prefix}_dom'] = dates.dt.day.astype(str)
        df[f'{prefix}_doy'] = dates.dt.dayofyear.astype(str)
        df[f'{prefix}_year'] = dates.dt.year.astype(str)

    df.set_index('charge_dt', inplace=True)
    # Sort in place: rebinding the local name (`df = df.sort_index()`) would
    # never reach the caller's frame. A stable sort keeps the existing order
    # of rows that share a charge date.
    df.sort_index(kind='mergesort', inplace=True)
    return df

# create_features works in place: it adds the feature columns to df and makes charge_dt the index.
create_features(df)
df.head()

In [0]:
# Realised sales keyed by (charge date, days-to-go). Built with zip instead of
# iterrows; the dict is reused later to look up lags for future rows.
target_map = dict(zip(zip(df.index, df['dtg']), df['unt_net']))
lag_periods = [1, 2, 3, 4, 5, 6, 7, 14, 21, 28, 364]
for lag in lag_periods:
    # Same dtg, `lag` days earlier; pairs missing from the map give None (NaN in the column).
    lagged_dates = df.index - pd.Timedelta(days=lag)
    df[f'lag{lag}'] = [target_map.get(key) for key in zip(lagged_dates, df['dtg'])]

df.tail()

In [0]:
# Total sales per charge date across all days-to-go.
plt.style.use('ggplot')
total_sales = df.groupby('charge_dt')['unt_net'].sum().reset_index()
line_kwargs = dict(x='charge_dt', y='unt_net', style='-', figsize=(20, 5), title='sales by charge date')
total_sales.plot(**line_kwargs)
plt.show()

In [0]:
# Keep charge dates from 2022 onwards. The explicit copy makes df an independent
# frame, so the in-place column edits in later cells do not act on a slice.
df = df[df.index >= '2022-01-01'].copy()

# Set the style before plotting so it applies to this figure.
plt.style.use('ggplot')
total_sales = df.groupby('charge_dt')['unt_net'].sum().reset_index()
total_sales.plot(style='-', figsize=(20,5), title = 'sales by charge date', y='unt_net', x='charge_dt')
plt.show()

In [0]:
# Charge dates as strings; they are used below as the animation frame labels.
df_reset = df.reset_index().assign(charge_dt=lambda frame: frame['charge_dt'].astype(str))

fig = px.scatter(
    df_reset,
    x='dtg',
    y='unt_net',
    animation_frame='charge_dt',  # one frame per charge date
    title='Sales by DTG with charge date variations',
)

# Fixed axis ranges so every frame is drawn on the same scale.
fig.update_xaxes(range=[0, 200])
fig.update_yaxes(range=[0, 5000])
fig.update_layout(xaxis_title='Days To Go', yaxis_title='Sales', legend_title='Charge Date')

fig.show()

In [0]:
# Sales over charge date, animated over days-to-go.
fig = px.scatter(
    df,
    x=df.index,
    y='unt_net',
    animation_frame='dtg',  # one frame per dtg value
    title='Sales by Charge Date with DTG Variations',
)
fig.update_layout(xaxis_title='Charge Date', yaxis_title='Sales', legend_title='DTG')
fig.show()

# Example: at DTG 0, charge date 21/03/2025, the Heathrow fire is visible.

In [0]:
# Long format: one row per sales value and month kind, so flight and charge months share an axis.
df_melt = df.melt(id_vars='unt_net', value_vars=['flight_month', 'charge_month'], var_name='month_type', value_name='month')

# Mean sales for every (month, month type) pair.
mean_sales = df_melt.groupby(['month', 'month_type'])['unt_net'].mean().reset_index()

# Months are strings here; an ordered categorical makes the bars run 1..12.
mean_sales['month'] = pd.Categorical(mean_sales['month'], categories=[str(i) for i in range(1, 13)], ordered=True)

# Bar chart. The style is set before the figure is created so it applies to this plot.
plt.style.use('ggplot')
plt.figure(figsize=(20, 8))
sns.barplot(data=mean_sales, x='month', y='unt_net', hue='month_type')
plt.title('Mean Sales by Month')
plt.xlabel('Month')
plt.ylabel('Mean Sales')
plt.show()

In [0]:
# Long format: one row per sales value and day-of-week kind (flight vs charge).
df_melt = df.melt(id_vars='unt_net', value_vars=['flight_dow', 'charge_dow'], var_name='dow_type', value_name='dow')
mean_sales = df_melt.groupby(['dow', 'dow_type'])['unt_net'].mean().reset_index()
# Days of week are strings '0'..'6'; the ordered categorical fixes the bar order.
mean_sales['dow'] = pd.Categorical(mean_sales['dow'], categories=[str(i) for i in range(0, 7)], ordered=True)

# Style first so it applies to this figure.
plt.style.use('ggplot')
plt.figure(figsize=(15, 8))
sns.barplot(data=mean_sales, x='dow', y='unt_net', hue='dow_type')
plt.title('Mean Sales by DoW')
plt.xlabel('DoW')
plt.ylabel('Mean Sales')
# plt.show() returns None, so wrapping it in display() only displayed None.
plt.show()

In [0]:
# Mean sales per charge day of month.
mean_sales = df.groupby('charge_dom')['unt_net'].mean().reset_index()

# charge_dom is stored as strings; convert to numbers so the days sort 1..31 rather than lexically.
mean_sales['charge_dom'] = pd.to_numeric(mean_sales['charge_dom'])
mean_sales = mean_sales.sort_values(by='charge_dom')

# Line plot. The style is set before the figure is created so it applies fully.
plt.style.use('ggplot')
plt.figure(figsize=(15, 8))
plt.plot(mean_sales['charge_dom'], mean_sales['unt_net'], marker='o', linewidth=15)
plt.title('Mean Sales by Charge Day of Month')
plt.xlabel('Charge Day of Month')
plt.ylabel('Mean Sales')
# plt.show() returns None, so wrapping it in display() only displayed None.
plt.show()

In [0]:
plt.style.use('ggplot')
# Whole of January 2024: from the 1st inclusive up to, but not including, 1 February.
# (The previous strict bounds dropped both 1 Jan and 31 Jan.)
in_jan24 = (df.index >= '2024-01-01') & (df.index < '2024-02-01')
df[in_jan24].groupby('charge_dt')['unt_net'].mean().plot(figsize=(20,5), title = 'sales by charge date (Jan24)', linewidth=10)
plt.show()

In [0]:
# Disabled experiment: the grid-search code below is wrapped in a string literal,
# so none of it executes; the cell only evaluates to the string.
# NOTE(review): the snippet calls `preprocessor`, which is not defined anywhere in the
# visible notebook, and passes `boosting=` where the training cell uses `booster=`.
# Confirm both before re-enabling.
'''%python
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from xgboost import XGBRegressor

param_grid = {
    'learning_rate': [0.005, 0.01, 0.1],
    #'subsample': [0.8, 1.0],
    #'colsample_bytree': [0.8, 1.0],
    #'lambda': [0.1, 1.0],
    #'alpha': [0, 0.1],
}

rmse_scorer = make_scorer(mean_squared_error, squared=False)

outer_cv = TimeSeriesSplit(n_splits=5)
inner_cv = TimeSeriesSplit(n_splits=3)

outer_scores = []

for train_idx, test_idx in outer_cv.split(df):
    X_train_outer = df.iloc[train_idx].drop('unt_net', axis=1).copy()
    y_train_outer = df.iloc[train_idx]['unt_net'].copy()
    X_test_outer = df.iloc[test_idx].drop('unt_net', axis=1).copy()
    y_test_outer = df.iloc[test_idx]['unt_net'].copy()

    X_train_processed = preprocessor.fit_preprocess(X_train_outer)
    X_test_processed = preprocessor.transform_preprocess(X_test_outer)

    model = XGBRegressor(objective='reg:pseudohubererror', base_score=0.5, boosting='gbtree', early_stopping_rounds=50, max_depth=3, n_estimators=500, learning_rate=0.01)
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=inner_cv, n_jobs=-1, verbose=1, scoring=rmse_scorer)

    grid_search.fit(X_train_processed, y_train_outer, eval_set=[(X_test_processed, y_test_outer)])

    best_model = grid_search.best_estimator_
    test_score = best_model.score(X_test_processed, y_test_outer)
    outer_scores.append(test_score)

    print(f'Outer fold score: {test_score}, best params: {grid_search.best_params_}')'''

In [0]:
# Calendar features that wrap around; these are sin/cos encoded.
cyclic_cols = [
    'flight_dom',
    'flight_doy',
    'charge_dom',
    'charge_doy',
    'flight_month',
    'charge_month',
    'flight_dow',
    'charge_dow',
]
# Non-cyclic numeric features; these are standardised.
num_cols = ['charge_year', 'flight_year']

# Fixed cycle length per calendar feature, matched on the column-name suffix.
# month: 1..12, dow: 0..6, dom: 1..31, doy: 1..366.
_CYCLE_LENGTHS = {'_month': 12, '_dow': 7, '_dom': 31, '_doy': 366}


def encode_cyclic_features(df, cyclic_cols, periods=None):
    """Replace cyclic columns with sine/cosine encodings, in place.

    Each column ``c`` in ``cyclic_cols`` is converted to numeric and replaced
    by ``c_sin`` and ``c_cos`` computed from ``2 * pi * value / period``.

    The period is a fixed calendar constant rather than the column's observed
    maximum. Scaling by the data maximum makes the encoding depend on which
    rows are present: a one-day frame encodes differently from the training
    frame, a single value of 0 yields 0/0 = NaN, and for day-of-week (0..6)
    the values 0 and 6 land on the same angle.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to encode; modified in place.
    cyclic_cols : iterable of str
        Names of the columns to encode and drop.
    periods : dict, optional
        Explicit ``{column name: period}`` overrides. Columns not listed fall
        back to the suffix table above, and finally to the column maximum
        (the previous behaviour) when the suffix is unknown.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    periods = periods or {}
    for name in cyclic_cols:
        values = pd.to_numeric(df[name], errors='coerce')

        period = periods.get(name)
        if period is None:
            period = next(
                (length for suffix, length in _CYCLE_LENGTHS.items() if name.endswith(suffix)),
                None,
            )
        if period is None:
            # Unknown column: keep the original data-driven scaling.
            period = values.max()

        angle = 2 * np.pi * values / period
        df[name + '_sin'] = np.sin(angle)
        df[name + '_cos'] = np.cos(angle)
        df.drop(name, axis=1, inplace=True)
    return df

def scale_num_cols(df, num_cols, scaler=None):
    """Scale ``num_cols`` of ``df`` in place and return ``(df, scaler)``.

    With no ``scaler`` a new ``StandardScaler`` is fitted on ``df[num_cols]``
    and applied; otherwise the supplied scaler's ``transform`` is applied
    without refitting. The scaler that was used is returned alongside the frame.
    """
    fit_needed = scaler is None
    if fit_needed:
        scaler = StandardScaler()
    # Only the chosen method is looked up, so a transform-only scaler is fine.
    apply_scaler = scaler.fit_transform if fit_needed else scaler.transform
    df[num_cols] = apply_scaler(df[num_cols])
    return df, scaler

# Encode the cyclic calendar columns (the function edits df in place and returns it),
# then standardise the year columns and persist the fitted scaler for later reuse.
df = encode_cyclic_features(df, cyclic_cols)
df, scaler = scale_num_cols(df, num_cols, scaler=None)
joblib.dump(scaler, 'scaler.pkl')
df.head()

In [0]:
N_SPLITS = 5      # number of CV folds; also the number of subplot rows
VAL_DATES = 168   # distinct charge dates held out in each fold

unique_dates = df.index.sort_values().unique()
tscv = TimeSeriesSplit(n_splits=N_SPLITS, test_size=VAL_DATES)

# Style first so it applies to the figure created on the next line.
plt.style.use('ggplot')
fig, axs = plt.subplots(N_SPLITS, 1, figsize=(20, 10), sharex=True)
preds = []
scores = []
# Time-series cross-validation over distinct charge dates: a single loop of
# folds (there is no inner tuning loop), each validated on later dates.
for fold, (train_index, val_index) in enumerate(tscv.split(unique_dates)):

    train_dates = unique_dates[train_index]
    val_dates = unique_dates[val_index]

    # All rows (every dtg) for the selected dates. Copies, because later cells
    # add columns to val_data from the last fold.
    train_data = df.loc[train_dates].copy()
    val_data = df.loc[val_dates].copy()

    # Plot total sales per charge date for this fold's train and validation spans.
    total_sales_train = train_data.groupby('charge_dt')['unt_net'].sum().reset_index()
    total_sales_val = val_data.groupby('charge_dt')['unt_net'].sum().reset_index()
    total_sales_train.plot(ax=axs[fold], label='Train', x='charge_dt', y='unt_net', style='-')
    total_sales_val.plot(ax=axs[fold], label='Val', x='charge_dt', y='unt_net', style='-')
    axs[fold].axvline(val_data.index.min(), linestyle='--', color='black')
    axs[fold].set_title(f'Fold {fold+1}')

    X_train = train_data.drop('unt_net', axis=1)
    y_train = train_data['unt_net']

    X_test = val_data.drop('unt_net', axis=1)
    y_test = val_data['unt_net']

    # NOTE(review): the fold that is scored below is also the early-stopping
    # eval_set, so the reported RMSE may be optimistic.
    model = xgb.XGBRegressor(base_score=0.5, booster ='gbtree', n_estimators=1000, early_stopping_rounds=50, max_depth=3, learning_rate=0.01, objective='reg:pseudohubererror', enable_categorical=True)
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=100)

    y_pred=model.predict(X_test)
    preds.extend(y_pred)
    # RMSE on this fold's validation rows.
    score = np.sqrt(mean_squared_error(y_test, y_pred))
    scores.append(score)

In [0]:
# Per-fold RMSE summary: mean and spread across folds.
average_score = np.mean(scores)
score_spread = np.std(scores)
print(f'individual scores: {scores}')
print(f'combined score: {average_score}')
print(f'std: {score_spread}')

In [0]:
# Feature importances of the most recently fitted model, largest first.
feature_importance = pd.DataFrame(
    data=model.feature_importances_,
    index=model.feature_names_in_,
    columns=['importance'],
)
feature_importance.sort_values('importance', ascending=False)

In [0]:
# Predictions for the last fold's validation rows, summed per charge date.
val_data['prediction'] = model.predict(X_test)
total_sales = val_data.groupby('charge_dt')['unt_net'].sum()
total_sales_prediction = val_data.groupby('charge_dt')['prediction'].sum()

# Both objects are Series indexed by charge date, so the index is the x-axis.
# The former x=/y= arguments (including a non-existent 'day_count') were
# ignored for Series plots and have been removed.
ax = total_sales.plot(figsize=(20, 8), label='Actual')
total_sales_prediction.plot(ax=ax, style='-', label='Predicted')

plt.legend(['Actual', 'Predicted'])
plt.title('Actual vs Predicted Sales')
plt.xlabel('Charge Date')
plt.ylabel('Sales')
plt.show()

In [0]:
# Charge dates as strings; they are used below as the animation frame labels.
df_reset = val_data.reset_index()
df_reset['charge_dt'] = df_reset['charge_dt'].astype(str)

df_plot = df_reset[df_reset['prediction'].notnull()].copy()

# Stack actuals and predictions into one long frame with a sales_type label.
df_actual = df_plot[['charge_dt','dtg','unt_net']].copy()
df_actual['sales_type'] = 'actual'
df_actual['sales_value'] = df_actual['unt_net']

df_pred = df_plot[['charge_dt','dtg','prediction']].copy()
df_pred['sales_type'] = 'predicted'
df_pred['sales_value'] = df_pred['prediction']

df_melted = pd.concat([df_actual, df_pred]).sort_values(['charge_dt','dtg','sales_type'])

min_dtg = 0
max_dtg = 200
min_sales = 0
max_sales = 5000

# The colour map is keyed by the values of the `color` column ('actual' /
# 'predicted'); the previous keys were source column names and never matched.
fig = px.line(df_melted, x='dtg', y='sales_value', color='sales_type', animation_frame='charge_dt', title='Actual vs Predicted by DTG across Charge Dates', color_discrete_map={'actual':'red', 'predicted':'blue'})
fig.update_xaxes(range=[min_dtg, max_dtg], title='DTG')
fig.update_yaxes(range=[min_sales, max_sales], title='Sales')
fig.update_layout(legend_title='sales type')
fig.show()

In [0]:
# Charge dates as strings for the x-axis of the animated line plot.
df_reset2 = val_data.reset_index()
df_reset2['charge_dt'] = df_reset2['charge_dt'].astype(str)

df_plot = df_reset2[df_reset2['prediction'].notnull()].copy()

# Stack actuals and predictions into one long frame with a sales_type label.
df_actual = df_plot[['charge_dt','dtg','unt_net']].copy()
df_actual['sales_type'] = 'actual'
df_actual['sales_value'] = df_actual['unt_net']

df_pred = df_plot[['charge_dt','dtg','prediction']].copy()
df_pred['sales_type'] = 'predicted'
df_pred['sales_value'] = df_pred['prediction']

df_melted = pd.concat([df_actual, df_pred]).sort_values(['charge_dt','dtg','sales_type'])

min_sales = 0
max_sales = 5000

# The colour map is keyed by the values of the `color` column ('actual' /
# 'predicted'); the previous keys never matched. The title now describes this
# plot (x = charge date, one frame per dtg) rather than repeating the
# previous cell's title.
fig = px.line(df_melted, x='charge_dt', y='sales_value', color='sales_type', animation_frame='dtg', title='Actual vs Predicted by Charge Date across DTG', color_discrete_map={'actual':'red', 'predicted':'blue'})
fig.update_xaxes(title='Charge Date')
fig.update_yaxes(range=[min_sales, max_sales], title='Sales')
fig.update_layout(legend_title='sales type')
fig.show()

In [0]:
# Absolute error per row, from val_data's own columns so that actual and
# prediction are paired row by row. Subtracting the separate y_test Series
# relied on index alignment, and the charge-date index contains duplicates.
val_data['error'] = np.abs(val_data['unt_net'] - val_data['prediction'])
# The ten charge dates with the largest mean absolute error.
val_data.groupby(val_data.index)['error'].mean().sort_values(ascending=False).head(10)

In [0]:
# Mean absolute error per charge date.
mean_error_by_date = val_data.groupby(val_data.index)['error'].mean()
mean_error_by_date.plot(figsize=(20, 5))
plt.show()

In [0]:
# The ten dtg values with the largest mean absolute error.
mean_error_by_dtg = val_data.groupby('dtg')['error'].mean()
mean_error_by_dtg.sort_values(ascending=False).head(10)

In [0]:
# Mean absolute error per dtg.
val_data.groupby('dtg')['error'].mean().plot(figsize=(20,5))
plt.title('Absolute Error By DTG')
plt.xlabel('DTG')
# The plotted series is a mean of absolute errors, not an RMSE.
plt.ylabel('Mean Absolute Error')
plt.show()

In [0]:
min_std = 1  # floor for the per-dtg standard deviation, avoids dividing by ~0

# Independent copy of the dtg <= 300 rows, so the column assignments below do not act on a slice.
val_data = val_data[val_data['dtg'] <= 300].copy()

# Standard deviation of actual sales within each dtg.
sales_by_dtg = (
    val_data.groupby('dtg')['unt_net'].std()
    .rename('std_unt_net')
    .reset_index()
)

# Attach the per-dtg std by lookup instead of a merge. This keeps the
# charge-date index and lets the cell be re-run safely: a second merge would
# have produced std_unt_net_x / std_unt_net_y columns.
std_lookup = sales_by_dtg.set_index('dtg')['std_unt_net']
val_data['std_unt_net'] = val_data['dtg'].map(std_lookup).fillna(min_std).clip(lower=min_std)

val_data['absolute_error'] = np.abs(val_data['unt_net'] - val_data['prediction'])
val_data['relative_error'] = val_data['absolute_error'] / val_data['std_unt_net']

# Mean relative error per dtg, worst ten first.
relative_error = val_data.groupby('dtg')['relative_error'].mean()
relative_error.sort_values(ascending=False).head(10)

In [0]:
# Mean relative error per dtg.
ax = relative_error.plot(figsize=(20, 5))
ax.set_title('Relative Error By DTG')
ax.set_xlabel('DTG')
ax.set_ylabel('Relative Error')
plt.show()

In [0]:
# One future charge date: the day after the last date in df, for dtg 0..300.
next_day = df.index.max() + pd.DateOffset(days=1)
future = pd.date_range(start=next_day, end=next_day, freq='1d')
dtg_values = list(range(0, 301))
future_combinations = list(product(future, dtg_values))
future_df = pd.DataFrame(future_combinations, columns=['charge_dt', 'dtg'])

# Same feature pipeline as the training frame; the first two calls edit future_df in place.
create_features(future_df)
encode_cyclic_features(future_df, cyclic_cols)
future_df, _ = scale_num_cols(future_df, num_cols, scaler = joblib.load('scaler.pkl'))
for lag in lag_periods:
    future_df[f'lag{lag}'] = future_df.apply(lambda row: target_map.get((row.name - pd.Timedelta(f'{lag} day'), row['dtg']), None), axis=1)
future_df['isFuture'] = True

# Flag the historical rows on a copy only. Writing df['isFuture'] would leave
# an extra column on the training frame, and a re-run of the CV cell would
# then train on it.
df_and_future = pd.concat([df.assign(isFuture=False), future_df])
df_and_future.tail()

In [0]:
# Future rows only, without the target and the helper flag, so the columns match the model's inputs.
future_w_features = df_and_future.query('isFuture').copy()
future_w_features = future_w_features.drop(columns=['unt_net', 'isFuture'])
future_w_features['pred'] = model.predict(future_w_features)

# Predicted sales against days-to-go; the x-axis is dtg, not a date.
future_w_features.plot(figsize=(20,5), x='dtg', y='pred')
plt.title('Predicted Sales')
plt.xlabel('DTG')
plt.ylabel('Sales')
plt.show()