In [146]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Load the cleaned orders/receivals table, parsing every timestamp column up front.
orders_with_receivals = pd.read_csv(
    "../data_cleaned/orders_with_receivals_detailed.csv",
    parse_dates=["delivery_date", "created_date_time", "modified_date_time", "date_arrival"],
)

# Split the receivals by arrival year, keeping only the columns used downstream.
arrival_year = orders_with_receivals['date_arrival'].dt.year
receival_columns = ['rm_id', 'date_arrival', 'net_weight']
receivals_2024 = orders_with_receivals.loc[arrival_year == 2024, receival_columns]
receivals_2023 = orders_with_receivals.loc[arrival_year == 2023, receival_columns]

In [147]:
# Flag rm_ids whose deliveries stopped early in 2024 compared with 2023: take the last arrival
# per rm_id in each year, move the 2023 date onto the 2024 calendar, and treat the rm_id as
# deprecated when its last 2024 arrival is more than DEPRECATION_GAP_DAYS days earlier in the year.
# NOTE(review): the original note mentioned 60 days, but the threshold in code is 40 — confirm which is intended.
DEPRECATION_GAP_DAYS = 40


def _last_arrival_per_rm(receivals):
    """Return one row per rm_id holding its latest date_arrival."""
    return receivals.groupby('rm_id')['date_arrival'].max().reset_index()


def _onto_2024(ts):
    """Move a timestamp onto the 2024 calendar; missing values pass through unchanged."""
    return ts if pd.isna(ts) else ts.replace(year=2024)


latest_2024 = _last_arrival_per_rm(receivals_2024)
latest_2023 = _last_arrival_per_rm(receivals_2023)

# Right join: every rm_id seen in 2024 is kept, even when it has no 2023 history.
merged_latest = pd.merge(
    latest_2023,
    latest_2024,
    on='rm_id',
    how='right',
    suffixes=('_2023', '_2024'),
)

# The years must match before the two dates can be subtracted as days within a year.
merged_latest['date_arrival_2023'] = merged_latest['date_arrival_2023'].apply(_onto_2024)
days_earlier = merged_latest['date_arrival_2023'] - merged_latest['date_arrival_2024']
merged_latest['date_diff'] = days_earlier.dt.days

# rm_ids without 2023 history have a NaN date_diff and are therefore never flagged.
is_deprecated = merged_latest['date_diff'] > DEPRECATION_GAP_DAYS
deprecated_rm_ids = merged_latest.loc[is_deprecated, 'rm_id'].tolist()
print(len(deprecated_rm_ids))
print(deprecated_rm_ids)

13
[2123.0, 2124.0, 2140.0, 2147.0, 2981.0, 3121.0, 3142.0, 3265.0, 3581.0, 3642.0, 3761.0, 4021.0, 4044.0]


In [148]:
# Drop the deprecated rm_ids from both years before comparing volumes.
receivals_2023 = receivals_2023[~receivals_2023['rm_id'].isin(deprecated_rm_ids)]
receivals_2024 = receivals_2024[~receivals_2024['rm_id'].isin(deprecated_rm_ids)]

# Upper bound (and default) for the per-rm_id scale applied to 2024 receivals.
MAX_WEIGHT_SCALE = 0.7
# Last calendar month included in the year-over-year comparison (May -> through May 31).
COMPARISON_LAST_MONTH = 5


def _weight_through_cutoff(receivals, last_month=COMPARISON_LAST_MONTH):
    """Sum net_weight per rm_id over arrivals from January through `last_month` (inclusive).

    Each frame passed in holds a single calendar year, so a month filter is the same as
    "on or before the last day of `last_month`" in that year. Filtering on the month
    (rather than on a literal date string) applies the same window to both years and
    keeps arrivals that happen during the final day of the window.
    """
    in_window = receivals['date_arrival'].dt.month <= last_month
    return receivals[in_window].groupby('rm_id')['net_weight'].sum().reset_index()


# Compare like with like: January-May 2024 against January-May 2023.
# (Previously the 2023 frame was filtered with a 2024 cutoff date, which kept all of 2023.)
receivals_2024_grouped = _weight_through_cutoff(receivals_2024)
receivals_2023_grouped = _weight_through_cutoff(receivals_2023)

# Right join: keep every rm_id with 2024 receivals in the window, even without 2023 history.
receivals_comparison = receivals_2023_grouped.merge(
    receivals_2024_grouped, on='rm_id', suffixes=('_2023', '_2024'), how='right'
)

# Scale = 2024 volume relative to 2023, capped at MAX_WEIGHT_SCALE. rm_ids with no usable
# 2023 figure (missing, or 0/0) get the cap as well. A separate "cap at 1" step is not
# needed because the final cap is below 1.
weight_ratio = receivals_comparison['net_weight_2024'] / receivals_comparison['net_weight_2023']
receivals_comparison['weight_scale'] = weight_ratio.clip(upper=MAX_WEIGHT_SCALE).fillna(MAX_WEIGHT_SCALE)

concluding_fallbacks = set(receivals_comparison["rm_id"].unique())
scale_mapping = dict(zip(receivals_comparison['rm_id'], receivals_comparison['weight_scale']))

In [149]:
# Load the submission template and the ID -> (rm_id, forecast window) mapping, then join them on ID.
sample_submission = pd.read_csv("../data/sample_submission.csv")
prediction_mapping = pd.read_csv(
    "../data/prediction_mapping.csv",
    parse_dates=["forecast_start_date", "forecast_end_date"],
)
test2024 = pd.merge(sample_submission, prediction_mapping, on="ID")

In [150]:
# Move both ends of every forecast window from 2025 to 2024 so they can be compared
# directly with the 2024 receival dates.
for window_col in ('forecast_start_date', 'forecast_end_date'):
    test2024[window_col] = test2024[window_col].apply(lambda ts: ts.replace(year=2024))

In [151]:
# METHOD 2 (more conservative): a 2024 receival only counts towards a forecast row when the
# row's forecast_end_date is at least 2 days after the receival's arrival date.
test2024['predicted_weight'] = test2024['predicted_weight'].astype(float)

for receival in receivals_2024.itertuples():
    # Receivals whose rm_id has no scale are skipped entirely.
    if receival.rm_id not in scale_mapping:
        continue

    # Strip any timezone so the arrival can be compared with the forecast dates.
    earliest_end = receival.date_arrival.replace(tzinfo=None) + pd.Timedelta(days=2)
    eligible = test2024['rm_id'].eq(receival.rm_id) & test2024['forecast_end_date'].ge(earliest_end)

    # Add the scaled weight to every eligible forecast row of this rm_id.
    test2024.loc[eligible, 'predicted_weight'] += receival.net_weight * scale_mapping[receival.rm_id]

In [152]:
def do_fallback_on(submission, fallbacks, mapping=None, source=None):
    """Overwrite `submission` predictions with fallback predictions for the given rm_ids.

    For every ID that `mapping` assigns to one of the rm_ids in `fallbacks`, the
    `predicted_weight` in `submission` is replaced by the `predicted_weight` that
    `source` holds for the same ID. `submission` is modified in place.

    Weights are matched by ID value, not by row position or index label, so the rows
    of `submission` and `source` do not have to be in the same order. IDs that have
    no row in `source` are left untouched instead of being overwritten with NaN.

    Parameters
    ----------
    submission : pandas.DataFrame
        Frame with "ID" and "predicted_weight" columns; updated in place.
    fallbacks : iterable
        rm_id values whose predictions should be replaced.
    mapping : pandas.DataFrame, optional
        Frame with "ID" and "rm_id" columns. Defaults to the notebook-level
        `prediction_mapping`.
    source : pandas.DataFrame, optional
        Frame with "ID" and "predicted_weight" columns that supplies the fallback
        values. Defaults to the notebook-level `test2024`.

    Returns
    -------
    None
    """
    if mapping is None:
        mapping = prediction_mapping
    if source is None:
        source = test2024

    # IDs that belong to any of the fallback rm_ids.
    fallback_ids = mapping.loc[mapping["rm_id"].isin(list(fallbacks)), "ID"]

    # ID -> fallback weight lookup; the index must be unique for Series.map to work.
    weight_by_id = source.drop_duplicates("ID", keep="last").set_index("ID")["predicted_weight"]

    target_rows = submission["ID"].isin(fallback_ids) & submission["ID"].isin(weight_by_id.index)
    if not target_rows.any():
        return

    # Assign plain values (not a Series) so pandas does not re-align on index labels.
    new_weights = submission.loc[target_rows, "ID"].map(weight_by_id).to_numpy()
    submission.loc[target_rows, "predicted_weight"] = new_weights

In [153]:
best = pd.read_csv("DEADGE_submission.csv")

# rm_ids for which the current best submission already predicts a positive weight
# somewhere; rows whose ID has no rm_id in the mapping are ignored.
positive_predictions = best.loc[best["predicted_weight"] > 0].merge(
    prediction_mapping, on="ID", how="left"
)
best_rmids = set(positive_predictions["rm_id"].dropna().unique())

# Only fall back for rm_ids that the best submission predicts nothing for.
fallbacks_to_do = concluding_fallbacks - best_rmids

# Apply the fallbacks to `best` in place.
do_fallback_on(best, fallbacks_to_do)

In [154]:
# Write the updated submission. index=False keeps the pandas RangeIndex out of the file,
# so the CSV has the same columns as the submission that was read in.
best.to_csv("DEADGE_new_fallbacks.csv", index=False)