In [1]:
import pandas as pd

# Load the 2024 validation submission and the table that maps each ID to an rm_id.
test2024 = pd.read_csv("../validation/submission_2024receivals_scale07.csv")
prediction_mapping = pd.read_csv("../data/prediction_mapping.csv")

merged = pd.merge(test2024, prediction_mapping, on="ID")

# Keep only the rm_ids whose predicted weights sum to more than zero.
rm_total_weight = merged.groupby("rm_id")["predicted_weight"].transform("sum")
filtered = merged.loc[rm_total_weight > 0]

# One row per rm_id with its largest predicted weight, biggest first.
agg_df = (
    filtered
    .groupby("rm_id", as_index=False)[["predicted_weight"]]
    .max()
    .sort_values("predicted_weight", ascending=False)
)

ModuleNotFoundError: No module named 'pandas'

In [20]:
# Restrict the historical receivals to the rm_ids that survived in agg_df.
used_rm_ids = {rm_id for rm_id in agg_df["rm_id"]}

receivals = pd.read_csv("../data_cleaned/orders_with_receivals_detailed.csv")
is_used_rm = receivals["rm_id"].isin(used_rm_ids)
receivals_filtered = receivals.loc[is_used_rm]

In [21]:
# Narrow the filtered receivals to the three columns of interest and inspect them,
# together with the number of distinct rm_ids in use.
selected = receivals_filtered.loc[:, ["rm_id", "date_arrival", "net_weight"]]

print(selected)
print(len(used_rm_ids))

         rm_id               date_arrival  net_weight
34057   2130.0  2012-03-14 10:58:00+02:00      8920.0
34058   2130.0  2012-03-14 10:58:00+02:00      4120.0
34059   2130.0  2012-03-20 17:34:00+02:00      2236.0
34060   2130.0  2012-03-20 17:34:00+02:00      4188.0
34065   2142.0  2012-03-14 10:58:00+02:00       680.0
...        ...                        ...         ...
133275  2142.0  2024-12-12 12:41:00+02:00      3920.0
133276  2143.0  2024-12-12 12:41:00+02:00       260.0
133288  3381.0  2024-12-18 12:18:00+02:00      2806.0
133292  3901.0  2024-12-17 16:05:00+02:00     12540.0
133293  3901.0  2024-12-19 11:40:00+02:00     14040.0

[60444 rows x 3 columns]
46


In [22]:
# ARIMA-based time series forecasting for receival prediction
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.seasonal import seasonal_decompose
import warnings
import numpy as np
from datetime import timedelta
import pandas as pd

warnings.filterwarnings('ignore')  # silence library warnings emitted while the order grid is searched
np.random.seed(42)  # For reproducibility

FORECAST_YEAR = 2025        # calendar year the simulated receivals are generated for
HISTORY_AFTER_YEAR = 2017   # only receivals from years strictly after this one are used
MAX_YEARLY_GROWTH = 1.2     # forecast total may exceed the best historical year by at most 20%


def _fit_best_arima(series):
    """Grid-search ARIMA(p, d, q) for p, q in 0..2 and d in 0..1.

    Returns the fitted model with the lowest AIC, or None when no candidate
    order could be fitted.
    """
    best_aic = float('inf')
    best_model = None
    for p in range(0, 3):
        for d in range(0, 2):
            for q in range(0, 3):
                try:
                    fitted_model = ARIMA(series, order=(p, d, q)).fit()
                    if fitted_model.aic < best_aic:
                        best_aic = fitted_model.aic
                        best_model = fitted_model
                except Exception:
                    # One order failing to fit is tolerated; move on to the next.
                    # (Not a bare `except`, so Ctrl-C still interrupts the search.)
                    continue
    return best_model


def _sample_arrival_date(monthly_probs):
    """Draw one arrival date inside FORECAST_YEAR.

    `monthly_probs` is a Series indexed by month number whose values are the
    sampling probabilities. The month is drawn from that distribution and the
    day uniformly within the month. When the distribution is empty the date is
    drawn uniformly over the whole year.
    """
    if len(monthly_probs) > 0:
        sampled_month = int(np.random.choice(monthly_probs.index, p=monthly_probs.values))
        month_start = pd.Timestamp(year=FORECAST_YEAR, month=sampled_month, day=1)
        # days_in_month accounts for February / leap years, so the date is always valid.
        sampled_day = int(np.random.randint(1, month_start.days_in_month + 1))
        return month_start + timedelta(days=sampled_day - 1)

    year_start = pd.Timestamp(year=FORECAST_YEAR, month=1, day=1)
    days_in_year = 366 if year_start.is_leap_year else 365
    return year_start + timedelta(days=int(np.random.randint(0, days_in_year)))


simulated_receivals = []

for rm_id in agg_df["rm_id"]:
    df_rm = receivals_filtered[receivals_filtered["rm_id"] == rm_id].copy()
    if df_rm.empty:
        continue

    # Row order matters below: weights are sampled positionally from this frame.
    df_rm = df_rm.sort_values("date_arrival")
    df_rm["date_arrival"] = pd.to_datetime(df_rm["date_arrival"])

    # Use only data after HISTORY_AFTER_YEAR for forecasting
    df_hist = df_rm[df_rm["date_arrival"].dt.year > HISTORY_AFTER_YEAR].copy()
    if len(df_hist) < 10:  # Need more data points for ARIMA
        continue

    # Monthly time series of total net_weight.
    # ('M'/'Y' are the month-end/year-end aliases; newer pandas prefers 'ME'/'YE'.)
    df_hist.set_index("date_arrival", inplace=True)
    monthly_weights = df_hist["net_weight"].resample('M').sum()

    # Drop months without any received weight before fitting.
    # NOTE(review): this makes the series irregular and means `monthly_pred * 12`
    # below assumes a receival in every month -- confirm this is intended.
    monthly_weights = monthly_weights[monthly_weights > 0]
    if len(monthly_weights) < 6:  # Need at least 6 months for ARIMA
        continue

    # Share of historical receivals falling in each calendar month.
    monthly_counts = pd.Series(df_hist.index.month).value_counts().sort_index()
    monthly_probs = monthly_counts / monthly_counts.sum()

    try:
        best_model = _fit_best_arima(monthly_weights)

        # Receivals per year, estimated from the historical average count per month.
        # NOTE(review): the mean is truncated to an int before scaling to 12 months.
        historical_monthly_counts = df_hist.resample('M')["net_weight"].count()
        n_receivals = max(1, int(historical_monthly_counts.mean())) * 12

        if best_model is None:
            # No ARIMA order could be fitted: fall back to the historical monthly mean.
            monthly_pred = monthly_weights.mean()
        else:
            # Forecast 12 months ahead and use the mean monthly level, floored at zero.
            forecast_result = best_model.forecast(steps=12)
            monthly_pred = max(0, forecast_result.mean())
    except Exception:
        # Fallback if forecasting fails for this rm_id
        monthly_pred = monthly_weights.mean()
        n_receivals = max(1, len(df_hist) // 2)  # Conservative estimate

    # Predicted total weight for the forecast year.
    total_predicted_weight = monthly_pred * 12  # 12 months

    # Cap the total at MAX_YEARLY_GROWTH times the largest yearly total in the
    # retained history (all years after HISTORY_AFTER_YEAR).
    yearly_totals = df_hist.resample('Y')["net_weight"].sum()
    if len(yearly_totals) > 0:
        total_predicted_weight = min(total_predicted_weight, yearly_totals.max() * MAX_YEARLY_GROWTH)

    # Individual receival weights are sampled from the positive historical weights.
    historical_weights = df_hist["net_weight"][df_hist["net_weight"] > 0]
    if len(historical_weights) == 0:
        continue

    # Generate individual receivals until either the count or the weight budget is used up.
    receivals_list = []
    running_total = 0

    for _ in range(n_receivals):
        if running_total >= total_predicted_weight:
            break

        # Sample weight from historical distribution
        weight = np.random.choice(historical_weights)

        # Truncate the last receival so the total never exceeds the prediction.
        if running_total + weight > total_predicted_weight:
            weight = max(0, total_predicted_weight - running_total)
            if weight <= 0:
                break

        receivals_list.append({
            "rm_id": rm_id,
            "date_arrival": _sample_arrival_date(monthly_probs),
            "net_weight": weight
        })

        running_total += weight

    simulated_receivals.extend(receivals_list)

# Explicit columns keep the frame well-formed (and re-loadable from CSV) even when
# no receival was simulated at all.
simulated_df = pd.DataFrame(simulated_receivals, columns=["rm_id", "date_arrival", "net_weight"])
print("ARIMA-based forecasting complete.")
print(f"Total simulated receivals for {FORECAST_YEAR}: {len(simulated_df)}")
if len(simulated_df) > 0:
    print(f"Total predicted weight: {simulated_df['net_weight'].sum():,.0f}")
    print(f"Unique rm_ids with predictions: {simulated_df['rm_id'].nunique()}")
    print(simulated_df.head())

ARIMA-based forecasting complete.
Total simulated receivals for 2025: 5183
Total predicted weight: 71,085,581
Unique rm_ids with predictions: 36
   rm_id date_arrival  net_weight
0   3781   2025-10-15     13880.0
1   3781   2025-10-21      9340.0
2   3781   2025-03-11     20980.0
3   3781   2025-10-04     22340.0
4   3781   2025-08-22     22160.0


In [23]:
# Write the simulated receivals to disk without the positional index column.
simulated_df.to_csv(path_or_buf="simulated_receivals_2025.csv", index=False)

In [24]:
# Reload the simulated receivals, parsing the arrival date as a datetime.
simulated_df = pd.read_csv("simulated_receivals_2025.csv", parse_dates=["date_arrival"])

# Build the submission frame: the sample submission joined with the prediction
# mapping, whose forecast window bounds are parsed as datetimes.
sample_submission = pd.read_csv("../data/sample_submission.csv")
prediction_mapping = pd.read_csv(
    "../data/prediction_mapping.csv",
    parse_dates=["forecast_start_date", "forecast_end_date"],
)
submission = pd.merge(sample_submission, prediction_mapping, on="ID")

In [25]:
# Add 80% of every simulated receival to each submission row of the same rm_id
# whose forecast_end_date is on or after the arrival date.
# NOTE(review): forecast_start_date is not checked here -- confirm that every
# simulated arrival falls on or after the start of the forecast windows.
for rm_id, date_arrival, net_weight in zip(
    simulated_df["rm_id"], simulated_df["date_arrival"], simulated_df["net_weight"]
):
    # Drop any timezone info so the comparison is done on naive datetimes.
    date_arrival_naive = date_arrival.replace(tzinfo=None)
    same_material = submission["rm_id"] == rm_id
    window_covers_arrival = submission["forecast_end_date"] >= date_arrival_naive
    submission.loc[same_material & window_covers_arrival, "predicted_weight"] += net_weight * 0.8

In [26]:
# Reduce the frame to the two submission columns and write it to disk.
submission = submission.loc[:, ["ID", "predicted_weight"]]
submission.to_csv(path_or_buf="testing2025.csv", index=False)

In [17]:
# Load the submission file from disk.
submission = pd.read_csv(filepath_or_buffer="testing2025.csv")

In [27]:
# Attach rm_id to every submission row, then reduce to the largest predicted
# weight per rm_id, sorted from largest to smallest.
test_df = pd.merge(submission, prediction_mapping, on="ID")
test_df = (
    test_df
    .groupby("rm_id", as_index=False)[["predicted_weight"]]
    .max()
    .sort_values("predicted_weight", ascending=False)
)

# Show the first 46 rows of the ranking.
print(test_df.iloc[:46])

     rm_id  predicted_weight
75    2130      6.009917e+06
180   3865      2.117237e+06
151   3126      1.819248e+06
83    2140      1.670645e+06
147   3122      1.637264e+06
160   3282      1.525760e+06
150   3125      1.462576e+06
149   3124      1.024576e+06
148   3123      1.002832e+06
176   3781      9.871144e+05
79    2134      6.253800e+05
159   3265      4.949120e+05
182   3901      4.429440e+05
85    2142      3.663764e+05
87    2144      2.637948e+05
142   2981      2.551360e+05
80    2135      2.384537e+05
76    2131      2.293000e+05
77    2132      1.388832e+05
181   3883      1.175360e+05
88    2145      1.114832e+05
78    2133      1.029703e+05
152   3142      9.710400e+04
163   3421      9.254000e+04
136   2741      9.111520e+04
191   4263      8.025600e+04
172   3642      5.952000e+04
190   4222      5.915200e+04
86    2143      5.849040e+04
71    2125      3.602880e+04
161   3362      2.980800e+04
90    2147      2.336704e+04
70    2124      1.680160e+04
74    2129    

In [None]:
# Scale the predictions of a single raw material (rm_id 2130) down by a factor of 0.6.
testing_2130 = pd.merge(pd.read_csv("testing2025.csv"), prediction_mapping, on="ID")

is_rm_2130 = testing_2130["rm_id"] == 2130
testing_2130.loc[is_rm_2130, "predicted_weight"] *= 0.6

# Largest predicted weight per rm_id after scaling, printed as a visual check.
testing = (
    testing_2130
    .groupby("rm_id", as_index=False)[["predicted_weight"]]
    .max()
    .sort_values("predicted_weight", ascending=False)
)

print(testing)
# Keep only the two submission columns for export.
testing_2130 = testing_2130.loc[:, ["ID", "predicted_weight"]]


     rm_id  predicted_weight
75    2130      3.605950e+06
180   3865      2.117237e+06
151   3126      1.819248e+06
83    2140      1.670645e+06
147   3122      1.637264e+06
..     ...               ...
64    2001      0.000000e+00
65    2061      0.000000e+00
66    2102      0.000000e+00
67    2121      0.000000e+00
202   4501      0.000000e+00

[203 rows x 2 columns]


In [40]:
# Write the rescaled submission to its own file, without the positional index.
testing_2130.to_csv(path_or_buf="testing2025_2130_only.csv", index=False)