In [0]:
#importing data

import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
import xgboost as xgb
from xgboost import XGBRegressor
from prophet import Prophet
from pyspark.sql.functions import to_date, to_timestamp, col, last, when, lit, dayofweek, dayofmonth, dayofyear, month, weekofyear, current_date, date_sub, datediff
from pyspark.sql.window import Window
from pyspark.sql import functions as F
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import ticker
from datetime import datetime, timedelta
from itertools import product
import joblib

# --- Load: charge-level sales joined to flight dimensions (Spark) ---
df = (
    spark.read.table('data_experience_commercial.cbt_1423_rtsuite.master')
    .select('charge_dt', 'dtg', 'chargeproduct', 'unt_pre', 'flightkey', 'flight_dt', 'rev_pre')
)
dmh = (
    spark.read.table('data_experience_commercial.cbt_0923_segmentfinder.dimensions_history')
    .select('flightkey', 'onsale_dt', 'ty_capacity', 'routetype', 'region')
)

# Disabled promo-seat enrichment, kept for reference:
#pi = spark.read.table('data_prod.silver_sanezdb.priceinspection').select('segment','when','promoseats')
#pi = pi.withColumnRenamed('segment', 'flightkey')
#pi = pi.withColumnRenamed('when', 'charge_dt')
#pi = pi.withColumn('charge_dt', to_date('charge_dt'))
#pi = pi.withColumn('promoseats', col('promoseats').cast('int'))
#pi = pi.fillna({'promoseats': 0})
#pi = pi.groupby('flightkey','charge_dt').agg(F.max('promoseats').alias('promoseats')).orderBy('flightkey','charge_dt')
#df = df.join(dmh, on='flightkey', how='left').join(pi, on=['flightkey','charge_dt'], how='left').drop('flightkey')
df = df.join(dmh, on='flightkey', how='left')

# Keep ticket charges with non-negative dtg on UK domestic routes.
is_uk_domestic_ticket = (
    (col('chargeproduct') == 'Ticket')
    & (col('dtg') >= 0)
    & col('region').isin(['UK-London', 'UK-Regions'])
    & (col('routetype') == 'Domestic')
)
df = df.filter(is_uk_domestic_ticket).drop('chargeproduct')

# Keep rows whose dtg is below (days between on-sale and flight date) - 25,
# and whose charge date is 2019-01-01 or later.
days_on_sale = datediff(col('flight_dt'), col('onsale_dt'))
df = df.filter((col('dtg') < (days_on_sale - 25)) & (col('charge_dt') >= '2019-01-01'))

# Floor negative unit and revenue values at zero.
for measure in ('unt_pre', 'rev_pre'):
    df = df.withColumn(measure, F.when(col(measure) < 0, 0).otherwise(col(measure)))

# Aggregate to one row per (charge date, dtg), then bring the result to pandas.
#df = df.groupby('charge_dt','dtg').agg(F.sum('unt_pre').alias('unt_pre'), F.sum('rev_pre').alias('rev_pre'), F.sum('ty_capacity').alias('ty_capacity'), F.sum('promoseats').alias('promoseats')).orderBy('charge_dt','dtg')
df = (
    df.groupby('charge_dt', 'dtg')
    .agg(
        F.sum('unt_pre').alias('unt_pre'),
        F.sum('rev_pre').alias('rev_pre'),
        F.sum('ty_capacity').alias('ty_capacity'),
    )
    .orderBy('charge_dt', 'dtg')
)
#df=df.fillna({'promoseats': 0})
df = df.toPandas()
df.info()

In [0]:
#Feature Engineering
df['charge_dt'] = pd.to_datetime(df['charge_dt'])

df['rev_pre'] = df['rev_pre'].round(0).astype(int)
# yield = revenue per unit sold. 0/0 yields NaN and x/0 yields +/-inf; both are
# mapped to 0 so the integer cast below cannot fail on non-finite values.
df['yield'] = (
    (df['rev_pre'] / df['unt_pre'])
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
    .round(0)
    .astype(int)
)
df.drop(['rev_pre'], axis=1, inplace=True)

def create_features(df):
    """Add calendar features for the charge date and the implied flight date, in place.

    The flight date is derived as ``charge_dt + dtg`` days. For both dates the
    month, day-of-week, day-of-month, day-of-year and year are stored as strings.
    The temporary ``flight_dt`` column is dropped, ``charge_dt`` becomes the index
    and the frame is sorted by that index.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``charge_dt`` (datetime-like) and ``dtg`` (days) columns.
        The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience; callers that ignore
        the return value still see every change.
    """
    # Parse charge_dt first so the timedelta addition below also works when the
    # column arrives as strings.
    df['charge_dt'] = pd.to_datetime(df['charge_dt'])
    df['flight_dt'] = df['charge_dt'] + pd.to_timedelta(df['dtg'], unit='D')

    # Same column order as before: all flight_* features, then all charge_*.
    for prefix in ('flight', 'charge'):
        dates = df[f'{prefix}_dt'].dt
        df[f'{prefix}_month'] = dates.month.astype(str)
        df[f'{prefix}_dow'] = dates.dayofweek.astype(str)
        df[f'{prefix}_dom'] = dates.day.astype(str)
        df[f'{prefix}_doy'] = dates.dayofyear.astype(str)
        df[f'{prefix}_year'] = dates.year.astype(str)

    df.drop(['flight_dt'], axis=1, inplace=True)
    df.set_index('charge_dt', inplace=True)
    # Sort in place: rebinding the local name (`df = df.sort_index()`) never
    # reached the caller. A stable sort keeps the existing row order within
    # each charge date.
    df.sort_index(inplace=True, kind='mergesort')
    return df

# create_features mutates df in place (adds date features, sets charge_dt as index).
create_features(df)
# Show the last rows to inspect the new feature columns.
df.tail()

In [0]:
# Lookup of observed sales keyed by (charge date, dtg); also reused when the
# lag features are built for future rows.
target_map = dict(zip(zip(df.index, df['dtg']), df['unt_pre']))
lag_periods = [7, 14, 21, 28, 364]


def _lagged_sales(row, days_back):
    """Return sales for the same dtg `days_back` days before the row's charge date, or None."""
    return target_map.get((row.name - pd.Timedelta(f'{days_back} day'), row['dtg']), None)


for lag in lag_periods:
    df[f'lag{lag}'] = df.apply(_lagged_sales, axis=1, days_back=lag)

df.tail()

In [0]:
# Daily total sales, summed over every dtg for each charge date (the index level).
total_sales = df['unt_pre'].groupby(level='charge_dt').sum().reset_index()
plt.style.use('ggplot')
total_sales.plot(
    x='charge_dt',
    y='unt_pre',
    style='-',
    figsize=(20, 5),
    title = 'sales by charge date',
)
plt.show()

In [0]:
# Disabled experiment kept as a bare string literal, so none of it executes.
# It shaded promotion periods using a 'promoseats' column.
'''df = df[df.index >= '2022-01-01']
total_sales = df.groupby('charge_dt')[['unt_pre','promoseats']].sum()
total_sales['promoseats'] = (total_sales['promoseats'] > 0).astype(int)
total_sales['promo_change'] = total_sales['promoseats'].ne(total_sales['promoseats'].shift()).cumsum()
promo_segments = total_sales.groupby('promo_change')
fig, ax = plt.subplots(figsize=(20, 5))
ax.plot(total_sales.index, total_sales['unt_pre'], label='Sales')
plt.title('Sales by Charge Date')
plt.style.use('ggplot')

#Shade background where promoseats == 1
label_added = False
for _, segment in promo_segments:
    if segment['promoseats'].iloc[0] == 1:
        start = segment.index[0]
        end = segment.index[-1]
        ax.axvspan(start, end, color='lightblue', alpha=1,  label='Promotion Period' if not label_added else None)
        label_added = True

ax.legend()
plt.show()'''

# Restrict the working frame to charge dates from 2022 onwards.
# .copy() detaches the result from the unfiltered frame: later cells mutate df
# in place (drop(..., inplace=True), new columns), which on a boolean-mask
# slice triggers SettingWithCopyWarning.
df = df[df.index >= '2022-01-01'].copy()
total_sales = df.groupby('charge_dt')['unt_pre'].sum().reset_index()
plt.style.use('ggplot')
total_sales.plot(style='-', figsize=(20,5), title = 'sales by charge date', y='unt_pre', x='charge_dt')
plt.show()

In [0]:
# The charge date is cast to text so it can serve as the animation frame key.
df_reset = df.reset_index().assign(
    charge_dt=lambda frame: frame['charge_dt'].astype(str)
)

fig = px.bar(
    df_reset,
    x='dtg',
    y='unt_pre',
    animation_frame='charge_dt',  # one animation frame per charge date
    title='Sales by DTG with charge date variations',
)

fig.update_layout(
    xaxis_title='Days To Go',
    yaxis_title='Sales',
    legend_title='Charge Date',
)

# Fixed axis ranges so the frames are comparable while the animation plays.
fig.update_xaxes(range=[0, 200])
fig.update_yaxes(range=[0, 800])

fig.show()

In [0]:
# Sales over charge date, with one animation frame per dtg value.
fig = px.line(
    df,
    x=df.index,
    y='unt_pre',
    animation_frame='dtg',
    title='Sales by Charge Date with DTG Variations',
)

fig.update_yaxes(range=[0, 1000])

fig.update_layout(
    xaxis_title='Charge Date',
    yaxis_title='Sales',
    legend_title='DTG'
)

fig.show()

#on DTG 0, charge date 21/03/2025 an example can be seen with the Heathrow fire

In [0]:
# Long format: every sales value appears twice, once tagged with its flight
# month and once with its charge month.
df_melt = df.melt(
    id_vars='unt_pre',
    value_vars=['flight_month', 'charge_month'],
    var_name='month_type',
    value_name='month',
)

# Mean sales for each (month, month kind) pair.
mean_sales = (
    df_melt.groupby(['month', 'month_type'])['unt_pre']
    .mean()
    .reset_index()
)

# Months are stored as text; an ordered categorical makes the bars run 1..12.
month_labels = list(map(str, range(1, 13)))
mean_sales['month'] = pd.Categorical(mean_sales['month'], categories=month_labels, ordered=True)

# Grouped bar chart: flight month vs charge month.
plt.figure(figsize=(20, 8))
sns.barplot(data=mean_sales, x='month', y='unt_pre', hue='month_type')
plt.style.use('ggplot')
plt.title('Mean Sales by Month')
plt.xlabel('Month')
plt.ylabel('Mean Sales')
plt.show()

In [0]:
# Mean sales by day of week, compared between flight date and charge date.
# Long format: every sales value appears once per day-of-week kind.
df_melt = df.melt(id_vars='unt_pre', value_vars=['flight_dow', 'charge_dow'], var_name='dow_type', value_name='dow')
mean_sales = df_melt.groupby(['dow', 'dow_type'])['unt_pre'].mean().reset_index()
# Day-of-week values are text '0'..'6'; an ordered categorical fixes the bar order.
mean_sales['dow'] = pd.Categorical(mean_sales['dow'], categories=[str(i) for i in range(0, 7)], ordered=True)
plt.figure(figsize=(15, 8))
sns.barplot(data=mean_sales, x='dow', y='unt_pre', hue='dow_type')
plt.style.use('ggplot')
plt.title('Mean Sales by DoW')
plt.xlabel('DoW')
plt.ylabel('Mean Sales')
# plt.show() returns None, so wrapping it in display() only displayed that
# None after the figure.
plt.show()

In [0]:
# Mean sales and mean yield per charge day-of-month. charge_dom is stored as
# text, so it is cast to a number before sorting to get 1, 2, ..., 31.
mean_sales = (
    df.groupby('charge_dom')['unt_pre'].mean().reset_index()
    .assign(charge_dom=lambda table: pd.to_numeric(table['charge_dom']))
    .sort_values(by='charge_dom')
)
mean_yield = (
    df.groupby('charge_dom')['yield'].mean().reset_index()
    .assign(charge_dom=lambda table: pd.to_numeric(table['charge_dom']))
    .sort_values(by='charge_dom')
)

fig, ax1 = plt.subplots(figsize=(15, 8))
plt.style.use('ggplot')

# Left axis: mean sales.
ax1.plot(
    mean_sales['charge_dom'], mean_sales['unt_pre'],
    marker='o', linewidth=12, color='tab:red', label='Mean Sales',
)
ax1.set_xlabel('Charge Day of Month')
ax1.set_ylabel('Mean Sales', color='tab:red')
ax1.tick_params(axis='y', labelcolor='tab:red')

# Right axis (shares x with the left one): mean yield.
ax2 = ax1.twinx()
ax2.plot(
    mean_yield['charge_dom'], mean_yield['yield'],
    marker='s', linewidth=12, color='tab:blue', label='Mean Yield',
)
ax2.set_ylabel('Mean Yield', color='tab:blue')
ax2.tick_params(axis='y', labelcolor='tab:blue')

# Figure-level title and a legend covering both axes.
fig.suptitle('Mean Sales and Yield by Charge Day of Month (2022 - ToDate)')
fig.legend(loc='upper right', bbox_to_anchor=(0.9, 0.9))
plt.tight_layout()
plt.show()

# Disabled earlier version of the plot, kept as a bare string literal; none of
# it executes. NOTE(review): as the last expression in the cell, the string
# itself is presumably echoed as the cell output - consider removing it.
'''
# Plot the line graph
plt.figure(figsize=(15, 8))
plt.style.use('ggplot')
plt.plot(mean_sales['charge_dom'], mean_sales['unt_pre'], marker='o', linewidth=15)
plt.title('Mean Sales by Charge Day of Month')
plt.xlabel('Charge Day of Month')
plt.ylabel('Mean Sales')
display(plt.show())'''

In [0]:
# Same day-of-month view, restricted to charge dates in [2024-06-01, 2025-06-01).
in_last_year = (df.index >= '2024-06-01') & (df.index < '2025-06-01')
last_year = df[in_last_year]

mean_sales_1y = last_year.groupby('charge_dom')['unt_pre'].mean().reset_index()
mean_yield_1y = last_year.groupby('charge_dom')['yield'].mean().reset_index()

# charge_dom is text; cast it so the x axis sorts numerically.
mean_sales_1y['charge_dom'] = pd.to_numeric(mean_sales_1y['charge_dom'])
mean_sales_1y = mean_sales_1y.sort_values(by='charge_dom')
mean_yield_1y['charge_dom'] = pd.to_numeric(mean_yield_1y['charge_dom'])
mean_yield_1y = mean_yield_1y.sort_values(by='charge_dom')

fig, ax1 = plt.subplots(figsize=(15, 8))
plt.style.use('ggplot')

# Left axis: mean sales.
ax1.plot(
    mean_sales_1y['charge_dom'], mean_sales_1y['unt_pre'],
    marker='o', linewidth=12, color='tab:red', label='Mean Sales',
)
ax1.set_xlabel('Charge Day of Month')
ax1.set_ylabel('Mean Sales', color='tab:red')
ax1.tick_params(axis='y', labelcolor='tab:red')

# Right axis: mean yield.
ax2 = ax1.twinx()
ax2.plot(
    mean_yield_1y['charge_dom'], mean_yield_1y['yield'],
    marker='s', linewidth=12, color='tab:blue', label='Mean Yield',
)
ax2.set_ylabel('Mean Yield', color='tab:blue')
ax2.tick_params(axis='y', labelcolor='tab:blue')

# Figure-level title and a legend covering both axes.
fig.suptitle('Mean Sales and Yield by Charge Day of Month (06.2024 - 06.2025)')
fig.legend(loc='upper right', bbox_to_anchor=(0.9, 0.9))
plt.tight_layout()
plt.show()


In [0]:
# Mean sales per charge date in January 2024.
# NOTE(review): both bounds are strict, so rows dated exactly 2024-01-01 and
# 2024-01-31 are excluded - confirm that is intended.
in_jan24 = (df.index > '2024-01-01') & (df.index < '2024-01-31')
jan24_mean_sales = df[in_jan24].groupby('charge_dt')['unt_pre'].mean()
jan24_mean_sales.plot(
    figsize=(20,5),
    title = 'sales by charge date (Jan24)',
    y='unt_pre',
    linewidth=10,
)
plt.style.use('ggplot')
plt.show()

In [0]:
# 'yield' is removed here, before the frame is encoded and scaled below.
df.drop(columns=['yield'], inplace=True)

# Calendar columns that get a sin/cos encoding.
cyclic_cols = [
    'flight_dom', 'flight_doy',
    'charge_dom', 'charge_doy',
    'flight_month', 'charge_month',
    'flight_dow', 'charge_dow',
]
# Columns that get standardised.
num_cols = ['charge_year', 'flight_year', 'ty_capacity']

def encode_cyclic_features(df, cyclic_cols, periods=None):
    """Replace each cyclic column by its sine/cosine encoding, in place.

    For every column ``c`` in ``cyclic_cols`` the columns ``c_sin`` and
    ``c_cos`` are added as ``sin/cos(2 * pi * value / period)`` and ``c`` itself
    is dropped.

    The period is a fixed property of the calendar unit rather than the largest
    value seen in the data. Using the observed maximum made day-of-week 0 and 6
    encode to the same point (6 / 6 is a full turn) and produced a different
    encoding for any frame whose maximum differed, e.g. a frame holding a
    single charge date.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to encode; modified in place.
    cyclic_cols : iterable of str
        Names of the columns to encode. Values are coerced with
        ``pd.to_numeric(errors='coerce')``.
    periods : dict, optional
        Explicit ``{column name: period}`` overrides. Without an override the
        period is chosen from the text after the last underscore in the column
        name: ``dow`` -> 7, ``month`` -> 12, ``dom`` -> 31, ``doy`` -> 366.
        Columns matching neither fall back to the column's observed maximum.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    default_periods = {'dow': 7, 'month': 12, 'dom': 31, 'doy': 366}
    overrides = periods or {}
    for name in cyclic_cols:
        values = pd.to_numeric(df[name], errors='coerce')
        period = overrides.get(name, default_periods.get(name.rsplit('_', 1)[-1]))
        if period is None:
            # Unknown cycle: keep the previous data-driven behaviour.
            period = values.max()
        angle = 2 * np.pi * values / period
        df[name + '_sin'] = np.sin(angle)
        df[name + '_cos'] = np.cos(angle)
        df.drop(name, axis=1, inplace=True)
    return df

def scale_num_cols(df, num_cols, scaler=None):
    """Scale the columns in `num_cols` of `df` in place.

    When `scaler` is given, it is only applied (``transform``). When it is
    omitted, a new StandardScaler is fitted on ``df[num_cols]`` and applied.

    Returns the tuple ``(df, scaler)`` where `scaler` is the object that was used.
    """
    if scaler is not None:
        # Reuse an existing scaler without refitting it.
        df[num_cols] = scaler.transform(df[num_cols])
        return df, scaler

    fitted = StandardScaler()
    df[num_cols] = fitted.fit_transform(df[num_cols])
    return df, fitted

# Encode the calendar columns in place (the return value is not needed here).
encode_cyclic_features(df, cyclic_cols)
# No scaler is passed, so a new one is fitted on df[num_cols].
# NOTE(review): the scaler is fitted on the whole frame, before any
# train/validation split - confirm that is acceptable.
df, scaler = scale_num_cols(df, num_cols)
# Persist the fitted scaler; the future-frame cell reloads it from this file.
joblib.dump(scaler, 'scaler.pkl')
df.head()

In [0]:
#Error Metric

def mean_absolute_percentage_error(y_true, y_pred):
    """Mean absolute percentage error, in percent.

    Observations whose true value is exactly 0 are skipped: the percentage
    error is undefined there and would otherwise turn the whole result into
    inf/NaN.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values with the same number of elements. Inputs
        are flattened, so single-column DataFrames are accepted.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100`` over the non-zero actuals,
        or NaN when every actual is 0 (or the input is empty).
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    nonzero = y_true != 0
    if not nonzero.any():
        return np.nan
    return np.mean(np.abs((y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero])) * 100

In [0]:
# Prophet

# Daily totals (summed over dtg) before and from the split date.
split_date = '2024-10-01'
train_prophet = (
    df.loc[df.index < split_date]
    .groupby('charge_dt')['unt_pre']
    .sum()
    .reset_index()
)
val_prophet = (
    df.loc[df.index >= split_date]
    .groupby('charge_dt')['unt_pre']
    .sum()
    .reset_index()
)

# Train and validation series on one chart, with the split date marked.
fig, ax = plt.subplots(figsize=(18, 6))
ax.plot(train_prophet['charge_dt'], train_prophet['unt_pre'], label='Train', color='blue')
ax.plot(val_prophet['charge_dt'], val_prophet['unt_pre'], label='Validation', color='red')
ax.axvline(pd.to_datetime(split_date), linestyle='--', color='black', label='Split Date')
ax.legend()
ax.set_title('Sales Before and After Split Date')
ax.set_xlabel('Charge Date')
ax.set_ylabel('Sales')
plt.show()

In [0]:
%%time
# Rename to the 'ds' / 'y' column names used with Prophet below.
# The extra reset_index() was dropped: train_prophet already has a default
# index, so it only added a spurious 'index' column.
train_prophet = train_prophet.rename(columns={'charge_dt': 'ds', 'unt_pre': 'y'})
model_prophet = Prophet()
model_prophet.fit(train_prophet)

In [0]:
# Rename to the 'ds' / 'y' names; no reset_index() is needed because
# val_prophet already has a default index (it only added an 'index' column).
val_prophet = val_prophet.rename(columns={'charge_dt': 'ds', 'unt_pre': 'y'})
prophet_forecast = model_prophet.predict(val_prophet)
sns.set(style='darkgrid')
fig, ax = plt.subplots(figsize=(18, 6))
model_prophet.plot(prophet_forecast, ax=ax)
ax.set_title('Forecasted Demand with Prophet', fontsize=18, fontweight='bold')
ax.set_xlabel('Date', fontsize=14)
ax.set_ylabel('Sales', fontsize=14)
ax.plot(train_prophet['ds'], train_prophet['y'], 'red', linewidth=1.5, label='Train')
ax.plot(val_prophet['ds'], val_prophet['y'], 'k.', alpha=0.6, label='Actual')
ax.plot(prophet_forecast['ds'], prophet_forecast['yhat'], color='blue', label='Forecast')
ax.fill_between(prophet_forecast['ds'],
                prophet_forecast['yhat_lower'],
                prophet_forecast['yhat_upper'],
                color='skyblue', alpha=0.3, label='Confidence Interval')
# The forecast starts after the last training date.
forecast_start = train_prophet['ds'].max()
ax.axvline(x=forecast_start, color='black', linestyle='--', lw=2, label='Forecast Start')
# Build the legend only after every labelled artist exists; it used to be
# created before the 'Forecast Start' line and therefore omitted it.
ax.legend(loc='upper left')
plt.tight_layout()
plt.show()


In [0]:
# Plot the components of the fitted Prophet model for the validation forecast.
fig = model_prophet.plot_components(prophet_forecast)
plt.show()

In [0]:
# Zoom the forecast plot to a 2025 window and overlay the actuals.
lower_bound = pd.Timestamp('2025-01-01')
upper_bound = pd.Timestamp('2025-07-14')

fig, ax = plt.subplots(figsize=(18, 6))
ax.scatter(val_prophet['ds'], val_prophet['y'], color='black')
fig = model_prophet.plot(prophet_forecast, ax=ax)

# Restrict the visible window: x to the bounds above, y to 0..50000.
ax.set_xbound(lower=lower_bound, upper=upper_bound)
ax.set_ylim(0, 50000)
ax.legend()
# Assigning the return value keeps the Text object from being echoed as output.
plot = plt.suptitle('Forecasted Demand with Prophet 2025 vs actual')

In [0]:
# Validation error of the Prophet forecast on daily totals.
# The square root of the MSE is the RMSE; the variable keeps its original name
# but the printed label now says what the number is.
mse_prophet = np.sqrt(mean_squared_error(val_prophet['y'], prophet_forecast['yhat']))
mape_prophet = mean_absolute_percentage_error(val_prophet['y'], prophet_forecast['yhat'])
print(f'RMSE: {mse_prophet:.2f}')
print(f'MAPE: {mape_prophet:.2f}')

In [0]:
# Disabled nested grid-search experiment, kept as a bare string literal; none of
# it executes. NOTE(review): it refers to 'unt_net' and 'preprocessor', neither
# of which is defined in the visible notebook, so it would need rework before
# being re-enabled.
'''%python
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from xgboost import XGBRegressor

param_grid = {
    'learning_rate': [0.005, 0.01, 0.1],
    #'subsample': [0.8, 1.0],
    #'colsample_bytree': [0.8, 1.0],
    #'lambda': [0.1, 1.0],
    #'alpha': [0, 0.1],
}

rmse_scorer = make_scorer(mean_squared_error, squared=False)

outer_cv = TimeSeriesSplit(n_splits=5)
inner_cv = TimeSeriesSplit(n_splits=3)

outer_scores = []

for train_idx, test_idx in outer_cv.split(df):
    X_train_outer = df.iloc[train_idx].drop('unt_net', axis=1).copy()
    y_train_outer = df.iloc[train_idx]['unt_net'].copy()
    X_test_outer = df.iloc[test_idx].drop('unt_net', axis=1).copy()
    y_test_outer = df.iloc[test_idx]['unt_net'].copy()

    X_train_processed = preprocessor.fit_preprocess(X_train_outer)
    X_test_processed = preprocessor.transform_preprocess(X_test_outer)

    model = XGBRegressor(objective='reg:pseudohubererror', base_score=0.5, boosting='gbtree', early_stopping_rounds=50, max_depth=3, n_estimators=500, learning_rate=0.01)
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=inner_cv, n_jobs=-1, verbose=1, scoring=rmse_scorer)

    grid_search.fit(X_train_processed, y_train_outer, eval_set=[(X_test_processed, y_test_outer)])

    best_model = grid_search.best_estimator_
    test_score = best_model.score(X_test_processed, y_test_outer)
    outer_scores.append(test_score)

    print(f'Outer fold score: {test_score}, best params: {grid_search.best_params_}')'''

In [0]:
# Split on unique charge dates so that all dtg rows of one date stay in the
# same fold; each validation fold covers 168 charge dates.
unique_dates = df.index.sort_values().unique()
tscv = TimeSeriesSplit(n_splits=5, test_size=168)

fig, axs = plt.subplots(5, 1, figsize=(20, 10), sharex=True)
plt.style.use('ggplot')
preds = []
scores = []
# Single-level time-series cross-validation (there is no inner tuning loop).
for fold, (train_index, val_index) in enumerate(tscv.split(unique_dates)):

    train_dates = unique_dates[train_index]
    val_dates = unique_dates[val_index]

    train_data = df.loc[train_dates]
    # Copy so the prediction column added below is written to an independent
    # frame rather than to a slice of df.
    val_data = df.loc[val_dates].copy()

    # One panel per fold: daily totals of the train and validation windows.
    total_sales_train = train_data.groupby('charge_dt')['unt_pre'].sum().reset_index()
    total_sales_val = val_data.groupby('charge_dt')['unt_pre'].sum().reset_index()
    total_sales_train.plot(ax=axs[fold], label='Train', x='charge_dt', y='unt_pre', style='-')
    total_sales_val.plot(ax=axs[fold], label='Val', x='charge_dt', y='unt_pre', style='-')
    axs[fold].axvline(val_data.index.min(), linestyle='--', color='black')
    axs[fold].set_title(f'Fold {fold+1}')

    X_train = train_data.drop('unt_pre', axis=1)
    y_train = train_data['unt_pre']

    X_test = val_data.drop('unt_pre', axis=1)
    y_test = val_data['unt_pre']

    # NOTE(review): the validation fold is also the early-stopping eval_set, so
    # the fold scores are likely optimistic - confirm this is acceptable.
    model = xgb.XGBRegressor(base_score=0.5, booster ='gbtree', n_estimators=1000, early_stopping_rounds=50, max_depth=3, learning_rate=0.01, objective='reg:pseudohubererror', enable_categorical=True)
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=100)

    y_pred=model.predict(X_test)
    # Keep the predictions next to the actuals: the analysis cells that follow
    # read val_data['prediction'], which was never created before.
    # It is added only after X_test was built, so it is not a model feature.
    val_data['prediction'] = y_pred
    preds.extend(y_pred)
    score = np.sqrt(mean_squared_error(y_test, y_pred))
    scores.append(score)

In [0]:
# Per-fold scores from the cross-validation loop, with their mean and spread.
average_score = np.mean(scores)
score_spread = np.std(scores)
print(f'individual scores: {scores}')
print(f'combined score: {average_score}')
print(f'std: {score_spread}')

In [0]:
# Daily-total error of the XGBoost model. X_test, y_test and y_pred are the
# variables left over from the cross-validation loop, i.e. its last fold only.
predictions_with_dates = pd.DataFrame({'Date': X_test.index, 'Prediction': y_pred})
actuals_with_dates = pd.DataFrame({'Date': X_test.index, 'Actual': y_test})
# Sum over dtg so each charge date has one predicted and one actual total.
y_pred_total = predictions_with_dates.groupby('Date')['Prediction'].sum().reset_index()
y_test_total = actuals_with_dates.groupby('Date')['Actual'].sum().reset_index()
y_pred_total.set_index('Date', inplace=True)
y_test_total.set_index('Date', inplace=True)
mape_xgb = mean_absolute_percentage_error(y_test_total, y_pred_total)
# The square root of the MSE is the RMSE; the variable keeps its original name
# but the printed label now says what the number is.
mse_xgb = np.sqrt(mean_squared_error(y_test_total, y_pred_total))
print(f'RMSE: {mse_xgb:.2f}')
print(f'MAPE: {mape_xgb:.2f}')


In [0]:
# Feature importances of `model` (the one fitted last in the CV loop), largest first.
feature_importance = pd.DataFrame(
    data=model.feature_importances_,
    index=model.feature_names_in_,
    columns=['importance'],
)
feature_importance.sort_values('importance', ascending=False)

In [0]:
# Actual vs predicted sales over dtg, animated across charge dates.
# Requires val_data to carry a 'prediction' column.
df_reset = val_data.reset_index()
# The charge date is cast to text so it can serve as the animation frame key.
df_reset['charge_dt'] = df_reset['charge_dt'].astype(str)

df_plot = df_reset[df_reset['prediction'].notnull()].copy()

# Stack actuals and predictions into one long frame with a 'sales_type' tag.
df_actual = df_plot[['charge_dt','dtg','unt_pre']].copy()
df_actual['sales_type'] = 'actual'
df_actual['sales_value'] = df_actual['unt_pre']

df_pred = df_plot[['charge_dt','dtg','prediction']].copy()
df_pred['sales_type'] = 'predicted'
df_pred['sales_value'] = df_pred['prediction']

df_melted = pd.concat([df_actual, df_pred]).sort_values(['charge_dt','dtg','sales_type'])

min_dtg = 0
max_dtg = 200
min_sales = 0
max_sales = 800

# The colour map must be keyed by the values in 'sales_type' ('actual' /
# 'predicted'); the previous keys matched no trace, so the colours were ignored.
fig = px.line(df_melted, x='dtg', y='sales_value', color='sales_type', animation_frame='charge_dt', title='Actual vs Predicted by DTG across Charge Dates', color_discrete_map={'actual':'red', 'predicted':'blue'})
fig.update_xaxes(range=[min_dtg, max_dtg], title='DTG')
fig.update_yaxes(range=[min_sales, max_sales], title='Sales')
fig.update_layout(legend_title='sales type')
fig.show()

In [0]:
# Actual vs predicted sales over charge date, animated across dtg values.
# Requires val_data to carry a 'prediction' column.
df_reset2 = val_data.reset_index()
df_reset2['charge_dt'] = df_reset2['charge_dt'].astype(str)

df_plot = df_reset2[df_reset2['prediction'].notnull()].copy()

# Stack actuals and predictions into one long frame with a 'sales_type' tag.
df_actual = df_plot[['charge_dt','dtg','unt_pre']].copy()
df_actual['sales_type'] = 'actual'
df_actual['sales_value'] = df_actual['unt_pre']

df_pred = df_plot[['charge_dt','dtg','prediction']].copy()
df_pred['sales_type'] = 'predicted'
df_pred['sales_value'] = df_pred['prediction']

df_melted = pd.concat([df_actual, df_pred]).sort_values(['charge_dt','dtg','sales_type'])

min_sales = 0
max_sales = 1000

# The colour map must be keyed by the values in 'sales_type' ('actual' /
# 'predicted'); the previous keys matched no trace. The title now describes
# this chart (x = charge date, one frame per dtg) instead of repeating the
# title of the by-DTG chart.
fig = px.line(df_melted, x='charge_dt', y='sales_value', color='sales_type', animation_frame='dtg', title='Actual vs Predicted by Charge Date across DTGs', color_discrete_map={'actual':'red', 'predicted':'blue'})
fig.update_xaxes(title='Charge Date')
fig.update_yaxes(range=[min_sales, max_sales], title='Sales')
fig.update_layout(legend_title='sales type')
fig.show()

In [0]:
# Absolute error per row, then the ten charge dates with the largest mean error.
val_data['error'] = (y_test - val_data['prediction']).abs()
mean_error_by_date = val_data.groupby(val_data.index)['error'].mean()
mean_error_by_date.sort_values(ascending=False).head(10)

In [0]:
# Mean absolute error per charge date, drawn as a line.
daily_mean_error = val_data.groupby(val_data.index)['error'].mean()
daily_mean_error.plot(figsize=(20,5))
plt.show()

In [0]:
# The ten dtg values with the largest mean absolute error.
mean_error_by_dtg = val_data.groupby('dtg')['error'].mean()
mean_error_by_dtg.sort_values(ascending=False).head(10)

In [0]:
# Mean absolute error per dtg.
val_data.groupby('dtg')['error'].mean().plot(figsize=(20,5))
plt.title('Absolute Error By DTG')
plt.xlabel('DTG')
# 'error' holds absolute errors and is averaged per dtg, so the axis shows a
# mean absolute error, not an RMSE as previously labelled.
plt.ylabel('Mean Absolute Error')
plt.show()

In [0]:
# Error relative to the spread of sales at each dtg (dtg <= 300 only).
val_data = val_data[val_data['dtg'] <= 300]

# Standard deviation of actual sales per dtg, attached to every row by a merge.
sales_by_dtg = (
    val_data.groupby('dtg')['unt_pre']
    .std()
    .rename('std_unt_pre')
    .reset_index()
)
val_data = val_data.merge(sales_by_dtg, on='dtg', how='left')

# Floor the divisor at min_std; missing standard deviations also become min_std.
min_std = 1
val_data['std_unt_pre'] = val_data['std_unt_pre'].fillna(min_std).clip(lower=min_std)

val_data['absolute_error'] = (val_data['unt_pre'] - val_data['prediction']).abs()
val_data['relative_error'] = val_data['absolute_error'] / val_data['std_unt_pre']

relative_error = val_data.groupby('dtg')['relative_error'].mean()
relative_error.sort_values(ascending=False).head(10)

In [0]:
# Mean relative error per dtg, drawn as a line.
ax = relative_error.plot(figsize=(20,5))
ax.set_title('Relative Error By DTG')
ax.set_xlabel('DTG')
ax.set_ylabel('Relative Error')
plt.show()

In [0]:
# Build a scoring frame for the day after the last charge date in df.
# start and end are the same expression, so `future` holds exactly one date.
future = pd.date_range(start=df.index.max() + pd.DateOffset(days=1), end=(df.index.max() + pd.DateOffset(days=1)), freq='1d')
# One row per dtg value 0..300 for that date.
dtg_values = list(range(0, 301))
future_combinations = list(product(future, dtg_values))
future_df = pd.DataFrame(future_combinations, columns=['charge_dt', 'dtg'])
# Same feature pipeline as the training frame; create_features works in place
# and makes charge_dt the index.
create_features(future_df)
# NOTE(review): verify that the cyclic encoding applied here uses the same
# periods as on the training frame - this frame holds a single charge date.
encode_cyclic_features(future_df, cyclic_cols)
# NOTE(review): num_cols includes 'ty_capacity', but nothing above adds that
# column to future_df, so this call looks likely to raise a KeyError - confirm
# and supply a capacity value for future rows.
future_df, _ = scale_num_cols(future_df, num_cols, scaler = joblib.load('scaler.pkl'))
# Lags are looked up in target_map by (charge date - lag days, dtg); keys that
# are not present yield None.
for lag in lag_periods:
    future_df[f'lag{lag}'] = future_df.apply(lambda row: target_map.get((row.name - pd.Timedelta(f'{lag} day'), row['dtg']), None), axis=1)
# Flag the rows so history and future can be separated after concatenation.
future_df['isFuture'] = True
df['isFuture'] = False
df_and_future = pd.concat([df, future_df])
df_and_future.tail()

In [0]:
# Score the future rows with `model` (the one fitted last in the CV loop).
future_w_features = df_and_future.query('isFuture').copy()
# Drop the target and the flag so only the remaining columns are passed to the model.
future_w_features = future_w_features.drop(columns=['unt_pre', 'isFuture'])
future_w_features['pred'] = model.predict(future_w_features)
future_w_features.plot(figsize=(20,5), x='dtg', y='pred')
plt.title('Predicted Sales')
# The x axis is dtg, not a calendar date, so it is labelled accordingly.
plt.xlabel('DTG')
plt.ylabel('Sales')
plt.show()