In [43]:
### IMPORTS ###
import warnings
warnings.filterwarnings("ignore")
import copy
from pathlib import Path
import warnings

import lightning.pytorch as pl
from lightning.pytorch.callbacks import EarlyStopping, LearningRateMonitor
from lightning.pytorch.loggers import TensorBoardLogger
import numpy as np
import pandas as pd
import torch

from pytorch_forecasting import Baseline, TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.data import GroupNormalizer
from pytorch_forecasting.metrics import MAE, SMAPE, PoissonLoss, QuantileLoss
from pytorch_forecasting.models.temporal_fusion_transformer.tuning import (
    optimize_hyperparameters,
)
import pandas as pd

In [44]:
################## CLEANING THE PURCHASE ORDERS DATA ##############

orders = pd.read_csv("../data/kernel/purchase_orders.csv")

# Time is in GMT+2 which is Norway time
# Make delivery_date, created_date_time and modified_date_time to GMT +2
orders['delivery_date'] = pd.to_datetime(orders['delivery_date'], utc=True).dt.tz_convert('Etc/GMT-2')
orders['created_date_time'] = pd.to_datetime(orders['created_date_time'], utc=True).dt.tz_convert('Etc/GMT-2')
orders['modified_date_time'] = pd.to_datetime(orders['modified_date_time'], utc=True).dt.tz_convert('Etc/GMT-2')


################# CLEANING THE RECEIVALS DATA ########################
receivals = pd.read_csv("../data/kernel/receivals.csv")

# Make the date_arrival to GMT +2
receivals['date_arrival'] = pd.to_datetime(receivals['date_arrival'], utc=True).dt.tz_convert('Etc/GMT-2')


############### MERGE ORDERS AND RECEIVALS DATA ###########################
# --- Merge orders and receivals WITHOUT aggregation ---
orders_with_receivals = orders.merge(
    receivals,
    on=["purchase_order_id", "purchase_order_item_no"],
    how="left",
    suffixes=('_order', '_receival')
)

# --- Fill missing values for orders with no receivals ---
orders_with_receivals["net_weight"] = orders_with_receivals["net_weight"].fillna(0)
orders_with_receivals["date_arrival"] = pd.to_datetime(orders_with_receivals["date_arrival"])


# Make the orders with PUND in KGs, and change quantity accordingly
# 1 PUND = 0,45359237 kilogram
orders_with_receivals.loc[orders_with_receivals['unit'] == 'PUND', 'quantity'] = orders_with_receivals.loc[orders_with_receivals['unit'] == 'PUND', 'net_weight'] * 0.45359237
# Change the unit to KG too: orders_with_receivals.loc[orders_with_receivals['unit'] == 'PUND', 'unit'] = 'KG'
# Drop unit_id and unit columns
orders_with_receivals = orders_with_receivals.drop(columns=['unit_id', 'unit'])

# --- Derived features ---
orders_with_receivals["fill_fraction"] = orders_with_receivals["net_weight"] / orders_with_receivals["quantity"]
orders_with_receivals["lead_time"] = (
    orders_with_receivals["date_arrival"] - orders_with_receivals["delivery_date"]
).dt.days
orders_with_receivals["lead_time"] = orders_with_receivals["lead_time"].fillna(0)


####################### SELECT RELEVANT COLUMNS FROM THE MERGED DATAFRAME ##################################
orders_with_receivals = orders_with_receivals[orders_with_receivals['rm_id'].notnull() & orders_with_receivals['date_arrival'].notnull()]
# date_arrival = actual date of receival, delivery_date = expected date of receival
# lead_time = date_arrival - delivery_date
# quantity  = quantity, net_weight = weight in kg (the actual target per day etc)
selected = orders_with_receivals[["rm_id", "date_arrival", "net_weight", "supplier_id", "delivery_date", "product_id_receival", "quantity", "lead_time"]]
# Filter out the selected rows where rm_id is null or date_arrival is null
selected = selected[selected['rm_id'].notnull() & selected['date_arrival'].notnull()]



In [45]:
##################### CREATING TIME_IDX AND AGGREGATING TO DAILY LEVEL AND FILLING GAPS WITH 0 NET_WEIGHT RECEIVALS ############################
# make a copy and normalize date_arrival to date-only (drop time) so grouping is by year-month-day
df_agg = selected.copy()
# ensure date_arrival is a datetime and floor to day (sets time to 00:00:00)
df_agg['date_arrival'] = df_agg['date_arrival'].dt.floor('D')
# Remove timezone info if present
df_agg['date_arrival'] = df_agg['date_arrival'].dt.tz_localize(None)

df_agg = df_agg.groupby(['rm_id', 'date_arrival']).agg({
    'net_weight': 'sum',
    'quantity': 'sum',
}).reset_index()

# Add time_idx based on days since each rm_id's minimum date
df_agg = df_agg.sort_values(['rm_id', 'date_arrival'])
df_agg['local_time_idx'] = (df_agg['date_arrival'] - df_agg.groupby('rm_id')['date_arrival'].transform('min')).dt.days

# Fill gaps from each rm_id's min date to 2024-12-31 with 0 net_weight entries
end_date = pd.Timestamp('2024-12-31')
all_filled = []

for rm_id, group in df_agg.groupby('rm_id'):
    min_date = group['date_arrival'].min()
    max_idx = (end_date - min_date).days
    
    full_range = pd.DataFrame({
        'local_time_idx': range(0, max_idx + 1)
    })
    full_range['rm_id'] = rm_id
    full_range['date_arrival'] = min_date + pd.to_timedelta(full_range['local_time_idx'], unit='D')
    
    merged = pd.merge(full_range, group, on=['rm_id', 'local_time_idx', 'date_arrival'], how='left')
    merged['net_weight'] = merged['net_weight'].fillna(0)
    merged['quantity'] = merged['quantity'].fillna(0)
  
    all_filled.append(merged)

df_agg = pd.concat(all_filled, ignore_index=True)
selected_with_local_time = df_agg

In [46]:
######################### ADD ADDITIONAL FEATURES ##################################
# Add additional features
selected_with_local_time["month"] = selected_with_local_time["date_arrival"].dt.month.astype(str).astype("category")
selected_with_local_time["year"] = selected_with_local_time["date_arrival"].dt.year.astype(str).astype("category")
selected_with_local_time["day_of_week"] = selected_with_local_time["date_arrival"].dt.dayofweek.astype(str).astype("category")  # 0=Monday, 6=Sunday
selected_with_local_time["log_weight"] = np.log1p(selected_with_local_time["net_weight"])

# Norwegian special days/holidays
# Fixed holidays
def get_norwegian_holidays(year):
    """Return dictionary of Norwegian holidays for a given year"""
    from datetime import timedelta
    
    holidays = {}
    
    # Fixed date holidays
    holidays[f'{year}-01-01'] = 'New Year'
    holidays[f'{year}-05-01'] = 'Labour Day'
    holidays[f'{year}-05-17'] = 'Constitution Day'
    holidays[f'{year}-12-24'] = 'Christmas Eve'
    holidays[f'{year}-12-25'] = 'Christmas Day'
    holidays[f'{year}-12-26'] = 'Boxing Day'
    holidays[f'{year}-12-31'] = 'New Year Eve'
    
    # Easter-based holidays (Easter dates vary each year)
    # Approximate Easter calculation (Meeus/Jones/Butcher algorithm)
    a = year % 19
    b = year // 100
    c = year % 100
    d = b // 4
    e = b % 4
    f = (b + 8) // 25
    g = (b - f + 1) // 3
    h = (19 * a + b - d - g + 15) % 30
    i = c // 4
    k = c % 4
    l = (32 + 2 * e + 2 * i - h - k) % 7
    m = (a + 11 * h + 22 * l) // 451
    month = (h + l - 7 * m + 114) // 31
    day = ((h + l - 7 * m + 114) % 31) + 1
    
    easter = pd.Timestamp(year=year, month=month, day=day)
    
    # Easter-related holidays
    holidays[(easter - timedelta(days=3)).strftime('%Y-%m-%d')] = 'Maundy Thursday'
    holidays[(easter - timedelta(days=2)).strftime('%Y-%m-%d')] = 'Good Friday'
    holidays[easter.strftime('%Y-%m-%d')] = 'Easter Sunday'
    holidays[(easter + timedelta(days=1)).strftime('%Y-%m-%d')] = 'Easter Monday'
    holidays[(easter + timedelta(days=39)).strftime('%Y-%m-%d')] = 'Ascension Day'
    holidays[(easter + timedelta(days=49)).strftime('%Y-%m-%d')] = 'Whit Sunday'
    holidays[(easter + timedelta(days=50)).strftime('%Y-%m-%d')] = 'Whit Monday'
    
    return holidays

# Create a mapping of all dates to holidays
all_holidays = {}
for year in range(selected_with_local_time['date_arrival'].dt.year.min(), 
                  selected_with_local_time['date_arrival'].dt.year.max() + 1):
    all_holidays.update(get_norwegian_holidays(year))

# Add special day column
selected_with_local_time['date_str'] = selected_with_local_time['date_arrival'].dt.strftime('%Y-%m-%d')
selected_with_local_time['special_days'] = selected_with_local_time['date_str'].map(all_holidays).fillna('none').astype('category')
selected_with_local_time.drop('date_str', axis=1, inplace=True)

# Add binary flag for whether it's a holiday
selected_with_local_time['is_holiday'] = (selected_with_local_time['special_days'] != 'none').astype(int)

special_days = list(all_holidays.values())

# Make rm_id a string instead of numeric
selected_with_local_time["rm_id"] = selected_with_local_time["rm_id"].astype(int).astype(str).astype("category")
selected_with_local_time["is_holiday"] = selected_with_local_time["is_holiday"].astype(str).astype("category")
selected_with_local_time.drop("year", axis=1, inplace=True)

In [47]:
######################### CREATE TIME SERIES DATASET FOR PYTORCH FORECASTING ##################################
full_data = selected_with_local_time.copy()

max_prediction_length = 151
max_encoder_length = 365
# V: training_cutoff = data["time_idx"].max() - max_prediction_length

training = TimeSeriesDataSet(
    # V: data[lambda x: x.local_time_idx <= training_cutoff],
    data = full_data,
    time_idx="local_time_idx",
    target="net_weight",
    group_ids=["rm_id"],
    min_encoder_length=max_encoder_length
    // 2,  # keep encoder length long (as it is in the validation set)
    max_encoder_length=max_encoder_length,
    min_prediction_length=1,
    max_prediction_length=max_prediction_length,
    static_categoricals=["rm_id"],
    #static_reals= no static real yet,
    time_varying_known_categoricals=["special_days", "month", "day_of_week", "is_holiday"],
    #variable_groups={
    #    "special_days": special_days
    #},  # group of categorical variables can be treated as one variable
    time_varying_known_reals=["local_time_idx"],
    # CAN PUT YEAR IN TIME_VARYING_KNOWN_REALS
    time_varying_unknown_categoricals=[],
    time_varying_unknown_reals=[
        "quantity",
        "net_weight",
        "log_weight",
    ],
    target_normalizer=GroupNormalizer(
        groups=["rm_id"], transformation="softplus"
    ),  # use softplus and normalize by group
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
)

# create validation set (predict=True) which means to predict the last max_prediction_length points in time
# for each series
#V: validation = TimeSeriesDataSet.from_dataset(
#V:    training, data, predict=True, stop_randomization=True
#V:)

# create dataloaders for model
batch_size = 128  # set this between 32 to 128
train_dataloader = training.to_dataloader(
    train=True, batch_size=batch_size, num_workers=0
)
#V: val_dataloader = validation.to_dataloader(
#V:    train=False, batch_size=batch_size * 10, num_workers=0

In [48]:
################# DECIDING ON THE MODEL AND TRAINER PARAMETERS ##########################
lr_logger = LearningRateMonitor()  # log the learning rate
logger = TensorBoardLogger("lightning_logs")  # logging results to a tensorboard

# configure network and trainer
pl.seed_everything(42)
trainer = pl.Trainer(
    max_epochs=50,
    accelerator="gpu",
    enable_model_summary=True,
    gradient_clip_val=0.1,
    limit_train_batches=30,
    #fast_dev_run = True,
    callbacks=[lr_logger],
    logger=logger,
)


tft = TemporalFusionTransformer.from_dataset(
    training,
    # not meaningful for finding the learning rate but otherwise very important
    learning_rate=0.01,
    hidden_size=8,  # most important hyperparameter apart from learning rate
    # number of attention heads. Set to up to 4 for large datasets
    attention_head_size=1,
    dropout=0.1,  # between 0.1 and 0.3 are good values
    hidden_continuous_size=8,  # set to <= hidden_size
    loss=QuantileLoss(),
    #optimizer="ranger", OPTIMIZER FOR FINDING BEST LEARNING RATE
    # reduce learning rate if no improvement in validation loss after x epochs
    reduce_on_plateau_patience=4,
)
print(f"Number of parameters in network: {tft.size() / 1e3:.1f}k")

Seed set to 42
ðŸ’¡ Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


Number of parameters in network: 10.5k


In [49]:
######## TRAINING THE MODEL ##########

trainer.fit(
    tft,
    train_dataloaders=train_dataloader
)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

   | Name                               | Type                            | Params | Mode 
------------------------------------------------------------------------------------------------
0  | loss                               | QuantileLoss                    | 0      | train
1  | logging_metrics                    | ModuleList                      | 0      | train
2  | input_embeddings                   | MultiEmbedding                  | 1.8 K  | train
3  | prescalers                         | ModuleDict                      | 128    | train
4  | static_variable_selection          | VariableSelectionNetwork        | 1.2 K  | train
5  | encoder_variable_selection         | VariableSelectionNetwork        | 2.5 K  | train
6  | decoder_variable_selection         | VariableSelectionNetwork        | 1.2 K  | train
7  | static_context_variable_selection  | GatedResidualNetwork            | 304    | train
8  | static_context_initial_hidden_lstm |

Epoch 49: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 30/30 [00:09<00:00,  3.27it/s, v_num=1, train_loss_step=731.0, train_loss_epoch=584.0]   

`Trainer.fit` stopped: `max_epochs=50` reached.


Epoch 49: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 30/30 [00:09<00:00,  3.25it/s, v_num=1, train_loss_step=731.0, train_loss_epoch=584.0]


In [81]:
################# FULLL PREDICTION FOR ALL RM_IDs (NEED ENCODER AND DECODER DATA) ###############


rm_ids = full_data['rm_id'].unique().tolist()
predict_data = []
# Create prediction date range
pred_start = pd.Timestamp('2025-01-01')
pred_end = pd.Timestamp('2025-05-31')
pred_dates = pd.date_range(start=pred_start, end=pred_end, freq='D')

all_predict_dfs = []

for rm_id in rm_ids:
    test_rm_id = rm_id  # must match categorical rm_id type
    historical = full_data[full_data['rm_id'] == test_rm_id].copy()
    if historical.empty:
        continue
    min_date = historical['date_arrival'].min()

    # build prediction rows for this rm_id
    rows = []
    for date in pred_dates:
        time_idx = (date - min_date).days
        date_str = date.strftime('%Y-%m-%d')
        special_day = all_holidays.get(date_str, 'none')
        is_holiday = '1' if special_day != 'none' else '0'

        rows.append({
            'rm_id': test_rm_id,
            'date_arrival': date,
            'local_time_idx': time_idx,
            'month': str(date.month),
            'day_of_week': str(date.dayofweek),
            'special_days': special_day,
            'is_holiday': is_holiday,
            'net_weight': 0,   # placeholder
            'quantity': 0,     # placeholder
            'log_weight': 0    # placeholder
        })

    pred_df_rm = pd.DataFrame(rows)

    # encoder/context data (last max_encoder_length days)
    encoder_data = historical.tail(max_encoder_length).copy()
    encoder_data['local_time_idx'] = encoder_data['local_time_idx'].astype(int)

    # combine encoder + prediction for this rm_id and collect
    combined = pd.concat([encoder_data, pred_df_rm], ignore_index=True)
    all_predict_dfs.append(combined)

# final combined prediction dataframe for all rm_ids
predict_data = pd.concat(all_predict_dfs, ignore_index=True)

predict_data = pd.DataFrame(predict_data)

# Convert to categorical to match training data
predict_data['rm_id'] = predict_data['rm_id'].astype(str).astype('category')
predict_data['month'] = predict_data['month'].astype(str).astype('category')
predict_data['day_of_week'] = predict_data['day_of_week'].astype(str).astype('category')
predict_data['special_days'] = predict_data['special_days'].astype(str).astype('category')
predict_data['is_holiday'] = predict_data['is_holiday'].astype(str).astype('category')

# Ensure local_time_idx is integer (required by TimeSeriesDataSet)
predict_data['local_time_idx'] = predict_data['local_time_idx'].astype(int)

print(f"Prediction dataframe shape: {predict_data.shape}")
print(f"Date range: {predict_data['date_arrival'].min()} to {predict_data['date_arrival'].max()}")
print(f"Time index range: {predict_data['local_time_idx'].min()} to {predict_data['local_time_idx'].max()}")
print("\nFirst few rows:")
print(predict_data.head())

Prediction dataframe shape: (101473, 10)
Date range: 2024-01-02 00:00:00 to 2025-05-31 00:00:00
Time index range: 0 to 7655

First few rows:
   local_time_idx rm_id date_arrival  net_weight  quantity month day_of_week  \
0            7132   342   2024-01-02         0.0       0.0     1           1   
1            7133   342   2024-01-03         0.0       0.0     1           2   
2            7134   342   2024-01-04         0.0       0.0     1           3   
3            7135   342   2024-01-05         0.0       0.0     1           4   
4            7136   342   2024-01-06         0.0       0.0     1           5   

   log_weight special_days is_holiday  
0         0.0         none          0  
1         0.0         none          0  
2         0.0         none          0  
3         0.0         none          0  
4         0.0         none          0  


In [82]:
################## MAKE PREDICTIONS #######################

predictions = tft.predict(predict_data, return_x=True, return_index=True, return_decoder_lengths=True)

ltdx_and_rmid = predictions.index

output = predictions.output

pred = []

pred_start = pd.Timestamp('2025-01-01')
pred_end = pd.Timestamp('2025-05-31')
pred_dates = pd.date_range(start=pred_start, end=pred_end, freq='D')


for rm_id_index in range(0,192):
    rm_id_test = ltdx_and_rmid["rm_id"][rm_id_index]
    ltdx_test = ltdx_and_rmid["local_time_idx"][rm_id_index]
    for date in pred_dates:
        pred_weight = output[rm_id_index][(date-pred_start).days].item()
        pred.append({
            "rm_id": rm_id_test,
            "local_time_idx": ltdx_test,
            "date": date,
            "predicted_weight": pred_weight
        })


pred = pd.DataFrame(pred)
pred_over_0 = pred[pred["predicted_weight"]>0]

ðŸ’¡ Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


In [83]:
################## PREPARING THE SUBMISSION FILE #######################
sample_submission = pd.read_csv("../data/sample_submission.csv")
prediction_mapping = pd.read_csv("../data/prediction_mapping.csv", parse_dates=["forecast_start_date", "forecast_end_date"])

submission = sample_submission.merge(prediction_mapping, on="ID")
submission["forecast_end_date"] = pd.to_datetime(submission["forecast_end_date"])
submission["forecast_start_date"] = pd.to_datetime(submission["forecast_start_date"])

for p in pred_over_0.itertuples():
    rm_id = p.rm_id
    date_arrival = p.date.replace(tzinfo=None)
    predicted_weight = p.predicted_weight
    submission.loc[
        (submission['rm_id'] == int(rm_id)) & (submission['forecast_end_date'] >= date_arrival),
        'predicted_weight'
    ] += predicted_weight*0.8  # applying a scaling factor of 0.8 to predictions

In [84]:
############## PRINT THE MODEL's TOTAL PREDICTIONS PER RM_ID ##############
filtered = submission.copy()

agg_df = filtered.groupby("rm_id", as_index=False).agg({
    "predicted_weight": "max",
}).sort_values("predicted_weight", ascending=False)


print("TOTAL PREDICTED WEIGHTS PER RM_ID WITH THE ML MODEL:")
print(agg_df[agg_df["predicted_weight"]>0])

TOTAL PREDICTED WEIGHTS PER RM_ID WITH THE ML MODEL:
     rm_id  predicted_weight
75    2130      7.143864e+06
180   3865      5.190772e+06
176   3781      4.608997e+06
151   3126      3.721015e+06
150   3125      2.529422e+06
147   3122      2.082348e+06
149   3124      1.840926e+06
160   3282      1.600152e+06
148   3123      1.590398e+06
182   3901      9.931108e+05
85    2142      5.116467e+04
163   3421      2.996941e+04
27     387      8.345096e-33
47    1872      4.180805e-39


# BENEATH IS THE FALLBACK METHOD

In [85]:
################### FOR SOME RM_IDS THAT THE MODEL DIDN'T PREDICT MAKE FALLBACKS ###########################

receivals_2024 = orders_with_receivals[orders_with_receivals['date_arrival'].dt.year == 2024]
receivals_2024 = receivals_2024[['rm_id', 'date_arrival', 'net_weight']]
receivals_2023 = orders_with_receivals[orders_with_receivals['date_arrival'].dt.year == 2023]
receivals_2023 = receivals_2023[['rm_id', 'date_arrival', 'net_weight']]


########## FINDING WHICH RM_IDS ARE DEPRECATED BASED ON RECEIVALS DATES IN 2023 AND 2024 ##################
# I want to group the receivals by rm_id and get the latest date_arrival for each rm_id. If receivals_2024 end before month and day in receivals_2023 by 40 days, then I want to consider that rm_id as deprecated.
latest_2024 = receivals_2024.groupby('rm_id')['date_arrival'].max().reset_index()
latest_2023 = receivals_2023.groupby('rm_id')['date_arrival'].max().reset_index()

# I want to keep all 2024 even if they are not in 2023
merged_latest = latest_2023.merge(latest_2024, on='rm_id', suffixes=('_2023', '_2024'), how='right')
# You need to make the years the same in date arrival before calculating the difference in days
merged_latest['date_arrival_2023'] = merged_latest['date_arrival_2023'].apply(lambda x: x.replace(year=2024) if pd.notna(x) else x)
merged_latest['date_diff'] = (merged_latest['date_arrival_2023'] - merged_latest['date_arrival_2024']).dt.days
deprecated_rm_ids = merged_latest[merged_latest['date_diff'] >40]['rm_id'].tolist()
print(f"Deprecated RM IDs: {deprecated_rm_ids}")

Deprecated RM IDs: [2123.0, 2124.0, 2140.0, 2147.0, 2981.0, 3121.0, 3142.0, 3265.0, 3581.0, 3642.0, 3761.0, 4021.0, 4044.0]


In [86]:
############# FIND HOW MUCH WE WANT TO SCALE THE RM_IDS USING 2023 to 2024 AS A BASELINE ##################
# For both receivals_2023 and receivals_2024, I want to drop the rm_ids that are in deprecated_rm_ids
receivals_2023 = receivals_2023[~receivals_2023['rm_id'].isin(deprecated_rm_ids)]
receivals_2024 = receivals_2024[~receivals_2024['rm_id'].isin(deprecated_rm_ids)]

# Now I want to group by rm_id and sum net_weight for each rm_id by 2024-05-31
receivals_2024_grouped = receivals_2024[receivals_2024['date_arrival'] <= '2024-05-31'].groupby('rm_id')['net_weight'].sum().reset_index()
receivals_2023_grouped = receivals_2023[receivals_2023['date_arrival'] <= '2024-05-31'].groupby('rm_id')['net_weight'].sum().reset_index()
receivals_comparison = receivals_2023_grouped.merge(receivals_2024_grouped, on='rm_id', suffixes=('_2023', '_2024'), how='right')
receivals_comparison['weight_scale'] = receivals_comparison['net_weight_2024']/receivals_comparison['net_weight_2023']
# Make the weight which are 1 or more to be 1
receivals_comparison.loc[receivals_comparison['weight_scale'] >= 1, 'weight_scale'] = 1
# Fill weight scale nans with 0.7
receivals_comparison['weight_scale'] = receivals_comparison['weight_scale'].fillna(0.7)
# Make the weight_scale that are bigger than 0.7 to 0.7
receivals_comparison.loc[receivals_comparison['weight_scale'] > 0.7, 'weight_scale'] = 0.7

concluding_fallbacks = set(receivals_comparison["rm_id"].unique())
scale_mapping = dict(zip(receivals_comparison['rm_id'], receivals_comparison['weight_scale']))

In [87]:
test2024 = sample_submission.merge(prediction_mapping, on="ID")
# For every forecast_start_date and forecast_end_date in submission make it 2024 instead of 2025
test2024['forecast_start_date'] = test2024['forecast_start_date'].apply(lambda x: x.replace(year=2024))
test2024['forecast_end_date'] = test2024['forecast_end_date'].apply(lambda x: x.replace(year=2024))


############# CREATING A 2024 RECEIVALS SUBMISSION WITH THE DESIRED RM_IDS and SCALE ##################
# THIS IS METHOD 2 - MORE CONSERVATIVE - ONLY ADD IF RECEIVAL IS 2 DAYS BEFORE FORECAST END DATE 
test2024['predicted_weight'] = test2024['predicted_weight'].astype(float)

for receival in receivals_2024.itertuples():
    rm_id = receival.rm_id
    date_arrival = receival.date_arrival
    net_weight = receival.net_weight
    date_arrival_naive = date_arrival.replace(tzinfo=None)
    mask = (
        (test2024['rm_id'] == rm_id) &
        (test2024['forecast_end_date'] >= (date_arrival_naive+pd.Timedelta(days=2)))
    )
    # If rm_id is in scale_mapping + apply the scale, else don't do anything
    if rm_id in scale_mapping:
        test2024.loc[mask, 'predicted_weight'] += net_weight*scale_mapping[rm_id]

In [88]:
######## FIND WHICH RM_IDS THAT NEEDS FALLBACK AND DO FALLBACKS ON THE SUBMISSION FILE #######################

def do_fallback_on(submission, fallbacks):
    for rm_id in fallbacks:
        # Find IDs corresponding to this rm_id
        ids = prediction_mapping[prediction_mapping["rm_id"] == rm_id]["ID"].tolist()
        for id in ids:
            # Put predicted_weight from test2024 into submission
            weight = test2024[test2024["ID"] == id]["predicted_weight"]
            submission.loc[submission["ID"] == id, "predicted_weight"] = weight

best = submission.copy()

best_rmids = best.copy()
best_rmids = best_rmids[best_rmids["predicted_weight"]>0]
best_rmids = best_rmids[["ID", "predicted_weight", "rm_id"]]


best_rmids = best_rmids.groupby("rm_id", as_index=False).agg({
        "predicted_weight": "max",
        }).sort_values("predicted_weight", ascending=False)

best_rmids = set(best_rmids["rm_id"].unique())

fallbacks_to_do = concluding_fallbacks - best_rmids
print(f"RM IDs needing fallbacks: {fallbacks_to_do}")
# do fallbacks

do_fallback_on(best, fallbacks_to_do)

RM IDs needing fallbacks: {3201.0, 3601.0, 3362.0, 3621.0, 4263.0, 3883.0, 2741.0, 3381.0, 4161.0, 2125.0, 4302.0, 2129.0, 2131.0, 2132.0, 2133.0, 2134.0, 2135.0, 2143.0, 2144.0, 2145.0, 2161.0, 4081.0, 4222.0}


In [89]:
### REMOVE SMALL WEIGHTS BELOW 1 KG FROM THE FINAL SUBMISSION INTO 0 ####
best.loc[best["predicted_weight"]<1, "predicted_weight"] = 0

In [90]:
### PRINT THE FINAL MODEL + FALLBACKS TOTAL PREDICTIONS PER RM_ID ##############
print("TOTAL PREDICTED WEIGHTS PER RM_ID WITH THE ML MODEL + FALLBACKS:")
agg_df2 = best.copy().groupby("rm_id", as_index=False).agg({
    "predicted_weight": "max",
}).sort_values("predicted_weight", ascending=False)

print(agg_df2[agg_df2["predicted_weight"]>0])

TOTAL PREDICTED WEIGHTS PER RM_ID WITH THE ML MODEL + FALLBACKS:
     rm_id  predicted_weight
75    2130      7.143864e+06
180   3865      5.190772e+06
176   3781      4.608997e+06
151   3126      3.721015e+06
150   3125      2.529422e+06
147   3122      2.082348e+06
149   3124      1.840926e+06
160   3282      1.600152e+06
148   3123      1.590398e+06
182   3901      9.931108e+05
79    2134      2.976199e+05
80    2135      2.874560e+05
181   3883      1.355480e+05
136   2741      1.156820e+05
88    2145      1.135763e+05
86    2143      1.123164e+05
76    2131      1.099434e+05
87    2144      8.351958e+04
161   3362      6.725600e+04
190   4222      5.194000e+04
85    2142      5.116467e+04
77    2132      3.579853e+04
187   4081      3.409000e+04
163   3421      2.996941e+04
74    2129      2.591660e+04
191   4263      1.747200e+04
156   3201      1.678600e+04
192   4302      1.509200e+04
162   3381      1.421107e+04
78    2133      1.386660e+04
171   3621      1.017752e+04
71    2

In [91]:
best = best[["ID", "predicted_weight"]]
best.to_csv("FULL_PIPELINE.csv", index=False)