In [0]:
# Imports and data loading

import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
import xgboost as xgb
from xgboost import XGBRegressor
from prophet import Prophet
from pyspark.sql.functions import to_date, to_timestamp, col, last, when, lit, dayofweek, dayofmonth, dayofyear, month, weekofyear, current_date, date_sub, datediff
from pyspark.sql.window import Window
from pyspark.sql import functions as F
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import ticker
from datetime import datetime, timedelta
from itertools import product
import joblib
import holidays

# Flight dimensions (on-sale date, capacity, route type, region) keyed by flightkey.
# The earlier promo-seat (priceinspection) join is disabled and no longer part of this load.
dmh = (
    spark.read.table('data_experience_commercial.cbt_0923_segmentfinder.dimensions_history')
    .select('flightkey', 'onsale_dt', 'ty_capacity', 'routetype', 'region')
)

# Ticket sales on UK domestic routes with a non-negative days-to-go.
is_uk_domestic_ticket = (
    (col('chargeproduct') == 'Ticket')
    & (col('dtg') >= 0)
    & col('region').isin(['UK-London', 'UK-Regions'])
    & (col('routetype') == 'Domestic')
)

# Drop the first 25 days after on-sale and anything charged before 2019.
is_in_booking_window = (
    (col('dtg') < (datediff(col('flight_dt'), col('onsale_dt')) - 25))
    & (col('charge_dt') >= '2019-01-01')
)

df = (
    spark.read.table('data_experience_commercial.cbt_1423_rtsuite.master')
    .select('charge_dt', 'dtg', 'chargeproduct', 'unt_pre', 'flightkey', 'flight_dt', 'rev_pre', 'channel')
    .join(dmh, on='flightkey', how='left')
    .filter(is_uk_domestic_ticket)
    .filter(is_in_booking_window)
    # Negative units / revenue are floored at zero before aggregating.
    .withColumn('unt_pre', when(col('unt_pre') < 0, 0).otherwise(col('unt_pre')))
    .withColumn('rev_pre', when(col('rev_pre') < 0, 0).otherwise(col('rev_pre')))
    # 1 for Web/App rows, 0 otherwise; its row average becomes channel_mix below.
    .withColumn('channel_index', when(col('channel') == 'Web/App', 1).otherwise(0))
)

# One row per (charge_dt, dtg).
# NOTE(review): ty_capacity is summed over every joined row; if the master table holds several
# rows per flight for the same (charge_dt, dtg) the capacity is counted more than once - confirm.
df = (
    df.groupby('charge_dt', 'dtg')
    .agg(
        F.sum('unt_pre').alias('unt_pre'),
        F.sum('rev_pre').alias('rev_pre'),
        F.sum('ty_capacity').alias('ty_capacity'),
        F.avg('channel_index').alias('channel_mix'),
    )
    .orderBy('charge_dt', 'dtg')
)

df = df.toPandas()
df.info()

In [0]:
# Lags are whole multiples of a week (1-4 weeks, plus 52 weeks = 364 days).
lag_days = [7 * weeks for weeks in (1, 2, 3, 4, 52)]
# Columns that receive one lagged copy per entry in lag_days.
cols_to_lag = ['unt_pre', 'channel_mix']

def generate_lag_features(df, value_columns, lag_days, date_col='charge_dt', dtg_col='dtg'):
    """
    Generate calendar-day lag features for multiple value columns.

    For every row, `<col>_lag<k>` holds the value of `<col>` observed for the same
    dtg exactly `k` calendar days earlier, or NaN when that (date, dtg) pair is not
    present. Lags are resolved by date rather than by row position, so gaps in the
    date sequence of a dtg do not silently shift the lag.

    Parameters:
    - df: pandas.DataFrame (not modified)
    - value_columns: list of column names to generate lag features for
    - lag_days: list of integers, each representing a lag in days
    - date_col: name of the charge date column (anything pd.to_datetime accepts)
    - dtg_col: name of the dtg column (lags are looked up within the same dtg)

    Returns:
    - new DataFrame sorted by (dtg, date) with a fresh RangeIndex, `date_col` and
      `dtg_col` as the first two columns, and the lag columns appended
    """
    out = df.sort_values([dtg_col, date_col]).reset_index(drop=True)
    leading = [date_col, dtg_col]
    out = out[leading + [name for name in out.columns if name not in leading]]

    # Work on a datetime view of the dates; the original column values are left untouched.
    dates = pd.to_datetime(out[date_col])

    # Lookup table keyed by (date, dtg). If a key repeats, the last row wins.
    lookup = out[list(value_columns)].copy()
    lookup.index = pd.MultiIndex.from_arrays([dates, out[dtg_col]])
    lookup = lookup[~lookup.index.duplicated(keep='last')]

    # The (date - lag, dtg) keys only depend on the lag, so build them once per lag.
    lagged_keys = {
        lag: pd.MultiIndex.from_arrays([dates - pd.Timedelta(days=int(lag)), out[dtg_col]])
        for lag in lag_days
    }

    for value_col in value_columns:
        for lag in lag_days:
            out[f'{value_col}_lag{lag}'] = lookup[value_col].reindex(lagged_keys[lag]).to_numpy()

    return out

# Add one lagged copy of each column in cols_to_lag per entry in lag_days, then preview the tail.
df = generate_lag_features(df=df, value_columns=cols_to_lag, lag_days=lag_days)
df.tail()


In [0]:
# Feature engineering: typed charge date and an integer yield (revenue per unit sold).
df['charge_dt'] = pd.to_datetime(df['charge_dt'])

df['rev_pre'] = df['rev_pre'].round(0).astype(int)

# Rows with zero units have no defined yield. Masking the zero denominator turns both
# 0/0 and x/0 into NaN, which is then mapped to 0; without the mask x/0 produces inf,
# which fillna(0) does not catch and astype(int) rejects.
units_sold = df['unt_pre'].where(df['unt_pre'] != 0)
df['yield'] = (df['rev_pre'] / units_sold).fillna(0).round(0).astype(int)

# rev_pre is only needed for yield; channel_mix survives only through its lagged copies.
df = df.drop(columns=['rev_pre', 'channel_mix'])

def create_features(df):
    """
    Add flight-date and calendar features to `df` in place.

    Expects `charge_dt` (anything pd.to_datetime accepts) and `dtg` (days to go,
    numeric) columns. `charge_dt` is converted to datetime first so that the
    flight date can be derived from it even when it arrives as strings.

    Adds:
    - flight_dt: charge_dt + dtg days
    - {flight,charge}_{month,dow,dom,doy,year}: calendar parts stored as strings

    Returns:
    - the same DataFrame object (mutated), so the call can also be chained
    """
    df['charge_dt'] = pd.to_datetime(df['charge_dt'])
    df['flight_dt'] = df['charge_dt'] + pd.to_timedelta(df['dtg'], unit='D')

    # flight_* columns are created before charge_* columns, matching the established column order.
    for prefix in ('flight', 'charge'):
        parts = df[f'{prefix}_dt'].dt
        df[f'{prefix}_month'] = parts.month.astype(str)
        df[f'{prefix}_dow'] = parts.dayofweek.astype(str)
        df[f'{prefix}_dom'] = parts.day.astype(str)
        df[f'{prefix}_doy'] = parts.dayofyear.astype(str)
        df[f'{prefix}_year'] = parts.year.astype(str)

    return df

# create_features adds its columns to df in place; preview the last rows afterwards.
create_features(df=df)
df.tail()

In [0]:
# UK holidays for 2022-2025 as a two-column (ds, holiday) frame, ordered by date.
# NOTE(review): the year range is hard-coded; flight dates outside it will never be flagged.
uk_holidays = holidays.UK(years=range(2022, 2026))
holidays_df = (
    pd.DataFrame(list(uk_holidays.items()), columns=['ds', 'holiday'])
    .sort_values(by='ds')
)
holidays_df

In [0]:
# Holidays added by hand on top of the library calendar.
additional_holidays = pd.DataFrame([
    {'ds': '2022-04-18', 'holiday': 'Easter Monday'},
    {'ds': '2022-08-29', 'holiday': 'Summer Bank Holiday'},
    {'ds': '2023-04-10', 'holiday': 'Easter Monday'},
    {'ds': '2023-08-28', 'holiday': 'Summer Bank Holiday'},
    {'ds': '2024-04-01', 'holiday': 'Easter Monday'},
    {'ds': '2024-08-26', 'holiday': 'Summer Bank Holiday'},
    {'ds': '2025-04-21', 'holiday': 'Easter Monday'},
    {'ds': '2025-08-25', 'holiday': 'Summer Bank Holiday'},
])
holidays_df = pd.concat([holidays_df, additional_holidays], ignore_index=True)

# Normalise the dates BEFORE de-duplicating: the library rows and the string rows above
# only compare equal once both are datetimes. Keep one row per date (first one wins) so
# that joining this calendar to the sales data can never multiply sales rows, and so the
# cell stays idempotent when re-run.
holidays_df['ds'] = pd.to_datetime(holidays_df['ds'])
holidays_df = (
    holidays_df
    .drop_duplicates(subset='ds', keep='first')
    .sort_values(by='ds')
    .reset_index(drop=True)
)
holidays_df
   

In [0]:
holidays_df["ds"] = pd.to_datetime(holidays_df["ds"])

# Flag holidays with a membership test instead of a left merge: a merge would duplicate
# every sales row whose date appears more than once in holidays_df, inflating unt_pre.
holiday_dates = holidays_df["ds"].drop_duplicates()
df['is_charge_date_holiday'] = df['charge_dt'].isin(holiday_dates).astype(int)
df['is_flight_date_holiday'] = df['flight_dt'].isin(holiday_dates).astype(int)

# From here on the frame is indexed by charge date, ordered by (charge_dt, dtg).
df = df.sort_values(by=['charge_dt', 'dtg'], ascending=[True, True])
df = df.set_index('charge_dt')
df.head()

In [0]:
# Total units sold per charge date over the full history.
plt.style.use('ggplot')
total_sales = df.groupby('charge_dt')['unt_pre'].sum().reset_index()
total_sales.plot(
    x='charge_dt',
    y='unt_pre',
    style='-',
    figsize=(20, 5),
    title='sales by charge date',
)
plt.show()

In [0]:
# Keep charge dates from 2022 onwards for everything that follows.
# (A disabled promo-period shading plot used to live here as a string literal; it relied on
# a 'promoseats' column that is no longer loaded, so it has been removed.)
ANALYSIS_START = '2022-01-01'

df = df[df.index >= ANALYSIS_START]
total_sales_by_charge_dt = df.groupby('charge_dt')['unt_pre'].sum().reset_index()
plt.style.use('ggplot')
total_sales_by_charge_dt.plot(style='-', figsize=(20, 5), title='sales by charge date', y='unt_pre', x='charge_dt')
plt.show()

In [0]:
# flight_dt is dropped here; the cells below work with dtg and the derived calendar columns.
df = df.drop(columns=['flight_dt'])

# Total units sold at each days-to-go value.
plt.style.use('ggplot')
total_sales_by_dtg = df.groupby('dtg')['unt_pre'].sum().reset_index()
total_sales_by_dtg.plot(
    x='dtg',
    y='unt_pre',
    style='-',
    figsize=(20, 5),
    title='sales by dtg since 2022',
)
plt.show()

In [0]:
# Animated booking curve: sales by dtg, one animation frame per charge date.
df_reset = df.reset_index().assign(charge_dt=lambda frame: frame['charge_dt'].astype(str))

fig = px.line(
    df_reset,
    x='dtg',
    y='unt_pre',
    animation_frame='charge_dt',  # each charge date becomes one frame of the animation
    title='Sales by DTG with charge date variations',
)

# Fixed axis ranges so the frames are comparable as the animation plays.
fig.update_xaxes(range=[0, 200])
fig.update_yaxes(range=[0, 800])

layout_settings = dict(
    xaxis_title='Days To Go',
    yaxis_title='Sales',
    legend_title='Charge Date',
    height=900,
    width=1400,
)
fig.update_layout(**layout_settings)

fig.show()

In [0]:
# Mean sales per month, shown side by side for the flight month and the charge month.
month_order = [str(month) for month in range(1, 13)]

# Long format: one row per (sale, month_type) with the month it falls in.
df_melt = df.melt(
    id_vars='unt_pre',
    value_vars=['flight_month', 'charge_month'],
    var_name='month_type',
    value_name='month',
)

mean_sales = df_melt.groupby(['month', 'month_type'])['unt_pre'].mean().reset_index()

# Months are stored as strings; an ordered categorical puts them in calendar order on the axis.
mean_sales['month'] = pd.Categorical(mean_sales['month'], categories=month_order, ordered=True)

plt.figure(figsize=(20, 8))
sns.barplot(data=mean_sales, x='month', y='unt_pre', hue='month_type')
plt.style.use('ggplot')
plt.title('Mean Sales by Month')
plt.xlabel('Month')
plt.ylabel('Mean Sales')
plt.show()

In [0]:
# Mean sales per day of week (0 = Monday ... 6 = Sunday), for flight date and charge date.
df_melt = df.melt(id_vars='unt_pre', value_vars=['flight_dow', 'charge_dow'], var_name='dow_type', value_name='dow')
mean_sales = df_melt.groupby(['dow', 'dow_type'])['unt_pre'].mean().reset_index()
# Day-of-week values are strings; an ordered categorical keeps them in weekday order.
mean_sales['dow'] = pd.Categorical(mean_sales['dow'], categories=[str(i) for i in range(0, 7)], ordered=True)
plt.figure(figsize=(15, 8))
sns.barplot(data=mean_sales, x='dow', y='unt_pre', hue='dow_type')
plt.style.use('ggplot')
plt.title('Mean Sales by DoW')
plt.xlabel('DoW')
plt.ylabel('Mean Sales')
# plt.show() returns None, so wrapping it in display() only displayed a stray None.
plt.show()

In [0]:
# Mean sales and mean yield by charge day of month, on twin y-axes.
mean_sales = df.groupby('charge_dom')['unt_pre'].mean().reset_index()
mean_yield = df.groupby('charge_dom')['yield'].mean().reset_index()

# charge_dom is stored as a string; convert so the x-axis sorts numerically (1..31).
mean_sales['charge_dom'] = pd.to_numeric(mean_sales['charge_dom'])
mean_yield['charge_dom'] = pd.to_numeric(mean_yield['charge_dom'])
mean_sales = mean_sales.sort_values(by='charge_dom')
mean_yield = mean_yield.sort_values(by='charge_dom')

plt.style.use('ggplot')
fig, ax1 = plt.subplots(figsize=(15, 8))

# Left axis: mean sales
ax1.plot(mean_sales['charge_dom'], mean_sales['unt_pre'], marker='o', linewidth=12, color='tab:red', label='Mean Sales')
ax1.set_xlabel('Charge Day of Month')
ax1.set_ylabel('Mean Sales', color='tab:red')
ax1.tick_params(axis='y', labelcolor='tab:red')

# Right axis: mean yield
ax2 = ax1.twinx()
ax2.plot(mean_yield['charge_dom'], mean_yield['yield'], marker='s', linewidth=12, color='tab:blue', label='Mean Yield')
ax2.set_ylabel('Mean Yield', color='tab:blue')
ax2.tick_params(axis='y', labelcolor='tab:blue')

# Title and a single legend covering both axes
fig.suptitle('Mean Sales and Yield by Charge Day of Month (2022 - ToDate)')
fig.legend(loc='upper right', bbox_to_anchor=(0.9, 0.9))
plt.tight_layout()
# The cell ends on plt.show(): the old single-axis plot that was parked in a trailing
# string literal was being echoed as the cell's output and has been removed.
plt.show()

In [0]:
# Mean units per (charge_dt, dtg) row for each charge date in January 2024.
# Bounds are inclusive so that 1 January and 31 January are part of the month.
plt.style.use('ggplot')
in_jan24 = (df.index >= '2024-01-01') & (df.index <= '2024-01-31')
df[in_jan24].groupby('charge_dt')['unt_pre'].mean().plot(
    figsize=(20, 5),
    title='sales by charge date (Jan24)',
    linewidth=10,
)
plt.show()

In [0]:
# yield was only used for the exploratory plots and is dropped here.
df = df.drop(columns=['yield'])

# Calendar parts encoded as sin/cos pairs (the list order fixes the order of the encoded columns).
cyclic_cols = [
    'flight_dom', 'flight_doy',
    'charge_dom', 'charge_doy',
    'flight_month', 'charge_month',
    'flight_dow', 'charge_dow',
]
# Columns that are standardised with StandardScaler.
num_cols = ['charge_year', 'flight_year', 'ty_capacity']

def encode_cyclic_features(df, cyclic_cols, periods=None):
    """
    Replace each cyclic column with a sin/cos pair, in place.

    The cycle length is fixed per feature instead of being read from the data,
    so the encoding is identical for any frame (training data, a single future
    date, ...) and the two ends of a cycle do not collapse onto the same point
    (e.g. day-of-week 0 and 6).

    Parameters:
    - df: pandas.DataFrame (mutated: each column in cyclic_cols is replaced by
      `<col>_sin` and `<col>_cos`)
    - cyclic_cols: list of column names holding numeric or numeric-string values
    - periods: optional dict {column name: cycle length} overriding the defaults

    The default cycle length is chosen from the part of the column name after
    the last underscore: dom -> 31, doy -> 366, month -> 12, dow -> 7. Columns
    that match neither `periods` nor a default fall back to the column maximum.

    Returns:
    - the same DataFrame object
    """
    default_periods = {'dom': 31, 'doy': 366, 'month': 12, 'dow': 7}
    overrides = periods or {}

    for name in cyclic_cols:
        df[name] = pd.to_numeric(df[name], errors='coerce')

        period = overrides.get(name, default_periods.get(name.rsplit('_', 1)[-1]))
        if period is None:
            # Unknown cycle: keep the previous data-driven behaviour.
            period = df[name].max()

        angle = 2 * np.pi * df[name] / period
        df[name + '_sin'] = np.sin(angle)
        df[name + '_cos'] = np.cos(angle)
        df.drop(name, axis=1, inplace=True)
    return df

def scale_num_cols(df, num_cols, scaler=None):
    """
    Scale `num_cols` of `df` in place and return the frame with the scaler used.

    When no scaler is passed, a new StandardScaler is fitted on `df[num_cols]`
    and applied; otherwise the given scaler is applied without refitting.

    Returns:
    - (df, scaler): the mutated frame and the fitted/provided scaler
    """
    fit_new_scaler = scaler is None
    if fit_new_scaler:
        scaler = StandardScaler()

    apply_scaling = scaler.fit_transform if fit_new_scaler else scaler.transform
    df[num_cols] = apply_scaling(df[num_cols])
    return df, scaler

# Encode the calendar columns (df is modified in place and handed back), standardise the
# numeric ones, and persist the fitted scaler so later cells can reload it.
df = encode_cyclic_features(df, cyclic_cols)
df, scaler = scale_num_cols(df, num_cols)
joblib.dump(scaler, 'scaler.pkl')
df.head()

In [0]:
#Error Metric

def mean_absolute_percentage_error(y_true, y_pred):
    """
    Mean absolute percentage error, in percent.

    Inputs may be lists, Series, single-column DataFrames or arrays; both are
    flattened to 1-D first so a column vector and a flat vector cannot be
    broadcast against each other by accident.

    Observations whose true value is 0 are excluded, because the percentage
    error is undefined there and would otherwise turn the whole mean into
    inf/nan. Returns nan when no non-zero true value remains.

    Raises:
    - ValueError if the two inputs do not contain the same number of values
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f'y_true and y_pred must have the same number of values, got {y_true.size} and {y_pred.size}'
        )

    defined = y_true != 0
    if not defined.any():
        return np.nan
    return np.mean(np.abs((y_true[defined] - y_pred[defined]) / y_true[defined])) * 100

In [0]:
# Hold out the most recent 168 days (relative to Spark's current_date) for validation.
today = spark.sql("SELECT current_date()").collect()[0][0]
split_date = pd.to_datetime(today) - pd.DateOffset(days=168)

# Daily totals on each side of the split, for plotting only.
train_prophet_plot = (
    df.loc[df.index < split_date]
    .groupby('charge_dt')['unt_pre']
    .sum()
    .reset_index()
)
val_prophet_plot = (
    df.loc[df.index >= split_date]
    .groupby('charge_dt')['unt_pre']
    .sum()
    .reset_index()
)

plt.figure(figsize=(18, 6))
plt.plot(train_prophet_plot['charge_dt'], train_prophet_plot['unt_pre'], color='blue', label='Train')
plt.plot(val_prophet_plot['charge_dt'], val_prophet_plot['unt_pre'], color='red', label='Validation')
plt.axvline(pd.to_datetime(split_date), color='black', linestyle='--', label='Split Date')
plt.legend()
plt.title('Sales Before and After Split Date')
plt.xlabel('Charge Date')
plt.ylabel('Sales')
plt.show()

In [0]:
# Prophet: one model per dtg value, trained before split_date and evaluated from it onwards.
train_prophet = df.loc[df.index < split_date]
val_prophet = df.loc[df.index >= split_date]
train_df = train_prophet.reset_index()
val_df = val_prophet.reset_index()

# Extra regressors, declared once and reused for column selection and add_regressor.
regressors = ['ty_capacity', 'unt_pre_lag7', 'unt_pre_lag14', 'unt_pre_lag21', 'unt_pre_lag28']
all_forecasts = []

for dtg_value in train_df['dtg'].unique():

    # Prophet rejects NaN in regressor columns, so rows with a missing capacity or lag
    # (e.g. the start of a dtg's history) are dropped from both the fit and the forecast.
    train_subset = (
        train_df.loc[train_df['dtg'] == dtg_value, ['charge_dt', 'unt_pre', *regressors]]
        .dropna(subset=regressors)
        .rename(columns={'charge_dt': 'ds', 'unt_pre': 'y'})
        .sort_values('ds')
    )

    val_rows = (
        val_df.loc[val_df['dtg'] == dtg_value, ['charge_dt', 'unt_pre', *regressors]]
        .dropna(subset=regressors)
        .sort_values('charge_dt')
    )
    val_subset = val_rows.drop(columns='unt_pre').rename(columns={'charge_dt': 'ds'})

    # Skip dtg values with too little history or nothing to validate against.
    if len(train_subset) < 10 or val_subset.empty:
        continue

    model = Prophet(holidays=holidays_df)
    for reg in regressors:
        model.add_regressor(reg)
    model.fit(train_subset)

    forecast = model.predict(val_subset)

    forecast['dtg'] = dtg_value
    # Actuals are looked up by date so they line up with the forecast rows.
    forecast['actual'] = val_rows.set_index('charge_dt').loc[forecast['ds'], 'unt_pre'].to_numpy()

    all_forecasts.append(forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper', 'dtg', 'actual']])

if not all_forecasts:
    raise ValueError('No dtg value had at least 10 usable training rows and a non-empty validation window')

prophet_val_forecasts = pd.concat(all_forecasts, ignore_index=True)

In [0]:
# Daily totals: forecasts summed over dtg, next to actuals for the whole period.
agg_forecast = (prophet_val_forecasts.groupby('ds').agg({'yhat': 'sum', 'yhat_lower': 'sum', 'yhat_upper': 'sum'}).reset_index())
actuals_all = pd.concat([train_df, val_df])
actuals_all = actuals_all.rename(columns={'charge_dt': 'ds', 'unt_pre': 'y'})
agg_actuals = (actuals_all.groupby('ds').agg({'y': 'sum'}).reset_index())
plot_df = pd.merge(agg_forecast, agg_actuals, on='ds', how='outer').sort_values('ds')

# Recent matplotlib releases only ship the seaborn styles under the 'seaborn-v0_8-' prefix;
# pick whichever name this installation provides.
darkgrid_style = 'seaborn-v0_8-darkgrid' if 'seaborn-v0_8-darkgrid' in plt.style.available else 'seaborn-darkgrid'
plt.style.use(darkgrid_style)
fig, ax = plt.subplots(figsize=(18, 6))

plot_df_val = plot_df[plot_df['ds'] >= split_date]
plot_df_train = plot_df[plot_df['ds'] < split_date]

ax.plot(plot_df_val['ds'], plot_df_val['y'], 'k.', alpha=0.6, label='Actual')
ax.plot(plot_df_train['ds'], plot_df_train['y'], 'red', linewidth=1.5, label='Train')

# yhat only exists for validation dates; the outer merge leaves it NaN elsewhere.
ax.plot(plot_df['ds'], plot_df['yhat'], color='blue', label='Forecast')
# NOTE(review): the band is the sum of the per-dtg bounds, which is not itself an
# interval for the daily total - treat it as indicative only.
ax.fill_between(plot_df['ds'], plot_df['yhat_lower'], plot_df['yhat_upper'], color='skyblue', alpha=0.3, label='Confidence Interval')

ax.axvline(x=split_date, color='black', linestyle='--', lw=2, label='Forecast Start')
ax.set_title('Forecasted Demand with Prophet (Aggregated)', fontsize=18, fontweight='bold')
ax.set_xlabel('Date', fontsize=14)
ax.set_ylabel('Sales', fontsize=14)
ax.legend(loc='upper left')
plt.tight_layout()
plt.show()

In [0]:
# Daily totals over all dtg values for the validation window.
summed_columns = ['yhat', 'actual', 'yhat_lower', 'yhat_upper']
agg_forecasts = prophet_val_forecasts.groupby('ds')[summed_columns].sum().reset_index()

# Forecast against actuals, one point per charge date.
plt.figure(figsize=(12, 6))
for column, label, marker in (
    ('actual', 'Actual', 'o'),
    ('yhat', 'Forecast (Prophet)', 'x'),
):
    plt.plot(agg_forecasts['ds'], agg_forecasts[column], label=label, marker=marker)
plt.title('Prophet Forecast vs Actual – Aggregated Over DTG')
plt.xlabel('Charge Date')
plt.ylabel('Total Sales Across All DTGs')
plt.legend()
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [0]:
# The square root is taken, so this value is an RMSE (the variable name is kept for
# compatibility). It is computed per (charge date, dtg) row, whereas the MAPE below
# is computed on the daily totals.
mse_prophet = np.sqrt(mean_squared_error(prophet_val_forecasts['actual'], prophet_val_forecasts['yhat']))
mape_prophet = mean_absolute_percentage_error(agg_forecasts['actual'], agg_forecasts['yhat'])
print(f'RMSE: {mse_prophet:.2f}')
print(f'MAPE: {mape_prophet:.2f}')

In [0]:
# Disabled experiment, removed from execution: nested TimeSeriesSplit (5 outer / 3 inner
# folds) with GridSearchCV over the XGBRegressor learning_rate in [0.005, 0.01, 0.1].
# It was parked here as a bare string literal, which the notebook echoes as the cell's
# output, and it no longer matches this notebook: it reads a 'unt_net' target and calls a
# `preprocessor` object, neither of which is defined here. Port it to the 'unt_pre' target
# and the current feature pipeline before bringing it back.

In [0]:
# Expanding-window cross-validation over distinct charge dates: 5 folds, each validating
# on the next 168 dates. All rows (every dtg) of a date go to the same side of the split.
unique_dates = df.index.sort_values().unique()
tscv = TimeSeriesSplit(n_splits=5, test_size=168)

fig, axs = plt.subplots(5, 1, figsize=(20, 10), sharex=True)
plt.style.use('ggplot')

xgb_params = dict(
    base_score=0.5,
    booster='gbtree',
    n_estimators=1000,
    early_stopping_rounds=50,
    max_depth=3,
    learning_rate=0.01,
    objective='reg:pseudohubererror',
    enable_categorical=True,
)

fold = 0
preds = []
scores = []
for train_index, val_index in tscv.split(unique_dates):

    train_dates = unique_dates[train_index]
    val_dates = unique_dates[val_index]

    train_data = df.loc[train_dates]
    val_data = df.loc[val_dates]

    # One panel per fold: daily totals for the train and validation windows.
    ax = axs[fold]
    for label, frame in (('Train', train_data), ('Val', val_data)):
        daily_total = frame.groupby('charge_dt')['unt_pre'].sum().reset_index()
        daily_total.plot(ax=ax, label=label, x='charge_dt', y='unt_pre', style='-')
    ax.axvline(val_data.index.min(), linestyle='--', color='black')
    ax.set_title(f'Fold {fold+1}')
    fold += 1

    X_train, y_train = train_data.drop('unt_pre', axis=1), train_data['unt_pre']
    X_test, y_test = val_data.drop('unt_pre', axis=1), val_data['unt_pre']

    # NOTE(review): the validation fold doubles as the early-stopping eval_set, so the
    # fold score below is measured on data that also chose the number of trees.
    model = xgb.XGBRegressor(**xgb_params)
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=100)

    y_pred = model.predict(X_test)
    preds.extend(y_pred)
    # Per-fold RMSE at (charge date, dtg) row level.
    score = np.sqrt(mean_squared_error(y_test, y_pred))
    scores.append(score)

In [0]:
# Summary of the per-fold scores collected in the cross-validation loop.
average_score = np.mean(scores)
score_spread = np.std(scores)
print(f'individual scores: {scores}')
print(f'combined score: {average_score}')
print(f'std: {score_spread}')

In [0]:
# Metrics on daily totals. X_test / y_test / y_pred are whatever the cross-validation
# loop left behind, i.e. the LAST fold only.
predictions = pd.DataFrame({'date': X_test.index, 'prediction': y_pred, 'dtg': X_test['dtg']})
actual = pd.DataFrame({'date': X_test.index, 'unt_pre': y_test, 'dtg': X_test['dtg']})
y_pred_total = predictions.groupby('date')['prediction'].sum().reset_index()
y_test_total = actual.groupby('date')['unt_pre'].sum().reset_index()
y_pred_total.set_index('date', inplace=True)
y_test_total.set_index('date', inplace=True)
mape_xgb = mean_absolute_percentage_error(y_test_total, y_pred_total)
# The square root is taken, so this value is an RMSE (variable name kept for compatibility).
mse_xgb = np.sqrt(mean_squared_error(y_test_total, y_pred_total))
print(f'RMSE: {mse_xgb:.2f}')
print(f'MAPE: {mape_xgb:.2f}')


In [0]:
# Feature importances of the most recently fitted model, largest first.
feature_importance = pd.DataFrame(
    {'importance': model.feature_importances_},
    index=model.feature_names_in_,
)
feature_importance.sort_values('importance', ascending=False)

In [0]:
# Animated comparison of actual and predicted sales by dtg, one frame per charge date.
df_reset_actual = actual.reset_index()
df_reset_actual['date'] = df_reset_actual['date'].astype(str)

df_reset_pred = predictions.reset_index()
df_reset_pred['date'] = df_reset_pred['date'].astype(str)

df_plot_actual = df_reset_actual[df_reset_actual['unt_pre'].notnull()].copy()
df_plot_pred = df_reset_pred[df_reset_pred['prediction'].notnull()].copy()

# Stack both series into one long frame with a common value column and a type label.
df_actual = df_plot_actual[['date', 'dtg', 'unt_pre']].copy()
df_actual['sales_type'] = 'actual'
df_actual['sales_value'] = df_actual['unt_pre']

df_pred = df_plot_pred[['date', 'dtg', 'prediction']].copy()
df_pred['sales_type'] = 'predicted'
df_pred['sales_value'] = df_pred['prediction']

df_melted = pd.concat([df_actual, df_pred]).sort_values(['date', 'dtg', 'sales_type'])

min_dtg = 0
max_dtg = 200
min_sales = 0
max_sales = 800

# The colour map must be keyed by the values of the `color` column ('actual' / 'predicted');
# the previous keys ('unt_net', 'prediction') matched nothing, so the colours were ignored.
fig = px.line(
    df_melted,
    x='dtg',
    y='sales_value',
    color='sales_type',
    animation_frame='date',
    title='Actual vs Predicted by DTG across Charge Dates',
    color_discrete_map={'actual': 'red', 'predicted': 'blue'},
)
fig.update_xaxes(range=[min_dtg, max_dtg], title='DTG')
fig.update_yaxes(range=[min_sales, max_sales], title='Sales')
fig.update_layout(legend_title='sales type', height=900, width=1200)
fig.show()

In [0]:
# Absolute error per row of the last validation fold, then the 10 charge dates with the
# highest mean error.
val_data['error'] = (y_test - y_pred).abs()
mean_error_by_date = val_data.groupby(val_data.index)['error'].mean()
mean_error_by_date.sort_values(ascending=False).head(10)

In [0]:
# Mean absolute error per charge date across the last validation fold.
daily_mean_error = val_data.groupby(val_data.index)['error'].mean()
daily_mean_error.plot(figsize=(20, 5))
plt.show()

In [0]:
# Build the feature rows for the next charge date (every dtg from 0 to 300) so that they
# carry the same columns and index as the training frame.
next_charge_dt = df.index.max() + pd.DateOffset(days=1)
future = pd.date_range(start=next_charge_dt, end=next_charge_dt, freq='1d')
dtg_values = list(range(0, 301))
future_combinations = list(product(future, dtg_values))
future_df = pd.DataFrame(future_combinations, columns=['charge_dt', 'dtg'])

# Calendar features, then holiday flags (same definition as in training), then drop flight_dt
# because the training frame no longer carries it.
create_features(future_df)
holiday_dates = pd.to_datetime(holidays_df['ds'])
future_df['is_charge_date_holiday'] = future_df['charge_dt'].isin(holiday_dates).astype(int)
future_df['is_flight_date_holiday'] = future_df['flight_dt'].isin(holiday_dates).astype(int)
future_df = future_df.drop(columns=['flight_dt'])

encode_cyclic_features(future_df, cyclic_cols)

# Capacity for the future date is unknown: carry forward the latest value seen for each dtg.
# df holds capacity already scaled, so it is converted back to raw units first and then
# scaled together with the year columns by the persisted scaler.
# NOTE(review): carrying capacity forward is an assumption - replace with planned capacity if available.
fitted_scaler = joblib.load('scaler.pkl')
capacity_pos = num_cols.index('ty_capacity')
latest_capacity_scaled = df.groupby('dtg')['ty_capacity'].last()
future_df['ty_capacity'] = (
    future_df['dtg'].map(latest_capacity_scaled) * fitted_scaler.scale_[capacity_pos]
    + fitted_scaler.mean_[capacity_pos]
)
future_df, _ = scale_num_cols(future_df, num_cols, scaler=fitted_scaler)

# Sales lags: look up unt_pre for the same dtg `lag` days before the future charge date.
# Column names follow the training convention (unt_pre_lag<k>). The channel_mix lags cannot be
# rebuilt here because channel_mix itself was dropped earlier; they stay NaN after the concat.
sales_history = df.set_index('dtg', append=True)['unt_pre']
sales_history = sales_history[~sales_history.index.duplicated(keep='last')]
for lag in lag_days:
    lag_keys = pd.MultiIndex.from_arrays(
        [future_df['charge_dt'] - pd.Timedelta(days=lag), future_df['dtg']]
    )
    future_df[f'unt_pre_lag{lag}'] = sales_history.reindex(lag_keys).to_numpy()

# Same index as df (charge date), then stack history and future with a marker column.
future_df = future_df.set_index('charge_dt')
future_df['isFuture'] = True
df['isFuture'] = False
df_and_future = pd.concat([df, future_df])
df_and_future.tail()

In [0]:
# Predict the future rows with the most recently fitted model.
future_w_features = df_and_future.query('isFuture').copy()
future_w_features = future_w_features.drop(columns=['unt_pre', 'isFuture'])

# Feed the model exactly the columns it was trained on, in the same order; any feature the
# future frame does not have is passed as NaN instead of failing on a column mismatch.
model_features = list(model.feature_names_in_)
future_w_features['pred'] = model.predict(future_w_features.reindex(columns=model_features))

future_w_features.plot(figsize=(20, 5), x='dtg', y='pred')
plt.title('Predicted Sales')
# The x-axis is dtg, not a calendar date.
plt.xlabel('Days To Go')
plt.ylabel('Sales')
plt.show()