In [75]:
import pandas as pd
test2024 = pd.read_csv("../validation/testing2024manual.csv")
prediction_mapping = pd.read_csv("../data/prediction_mapping.csv")
merged = test2024.merge(prediction_mapping, on="ID")
filtered = merged[merged.groupby("rm_id")["predicted_weight"].transform("sum") > 0]
agg_df = filtered.groupby("rm_id", as_index=False).agg({
    "predicted_weight": "max"
})

In [76]:
used_rm_ids = set(agg_df["rm_id"])

receivals = pd.read_csv("../data_cleaned/orders_with_receivals_detailed.csv")
receivals_filtered = receivals[receivals["rm_id"].isin(used_rm_ids)]
selected = receivals_filtered[["rm_id", "date_arrival", "net_weight"]]

In [77]:
# --- TFT Model Training with 2024 Junâ€“Dec validation ---
import pandas as pd
from pytorch_forecasting import TemporalFusionTransformer, TimeSeriesDataSet, GroupNormalizer, QuantileLoss
from lightning.pytorch.callbacks import EarlyStopping, LearningRateMonitor
from lightning.pytorch.loggers import TensorBoardLogger
import lightning.pytorch as pl

# Load historical data
df_hist = receivals_filtered[["rm_id", "date_arrival", "net_weight"]].copy()
df_hist["date_arrival"] = pd.to_datetime(df_hist["date_arrival"])
# Ensure naive timestamps
if hasattr(df_hist["date_arrival"].dt, "tz") and df_hist["date_arrival"].dt.tz is not None:
    df_hist["date_arrival"] = df_hist["date_arrival"].dt.tz_localize(None)
# String groups for PF
df_hist["rm_id"] = df_hist["rm_id"].astype(str)
# Stable time index relative to min date
base_date = df_hist["date_arrival"].min()
df_hist["time_idx"] = (df_hist["date_arrival"] - base_date).dt.days

# Train/validation split by date: train <= 2024-05-31, val = 2024-06-01..2024-12-31
train_cutoff = pd.Timestamp("2024-05-31")
val_start = pd.Timestamp("2024-06-01")
val_end = pd.Timestamp("2024-12-31")

train_df = df_hist[df_hist["date_arrival"] <= train_cutoff].copy()
val_df = df_hist[(df_hist["date_arrival"] >= val_start) & (df_hist["date_arrival"] <= val_end)].copy()

print(f"Training rows: {len(train_df)} | Validation rows: {len(val_df)}")

max_encoder_length = 60
max_prediction_length = 30
batch_size = 64

# Training dataset on train split
training = TimeSeriesDataSet(
    train_df,
    time_idx="time_idx",
    target="net_weight",
    group_ids=["rm_id"],
    min_encoder_length=max_encoder_length,
    max_encoder_length=max_encoder_length,
    min_prediction_length=1,
    max_prediction_length=max_prediction_length,
    static_categoricals=["rm_id"],
    time_varying_known_reals=["time_idx"],
    time_varying_unknown_reals=["net_weight"],
    target_normalizer=GroupNormalizer(groups=["rm_id"], transformation="softplus"),
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
    allow_missing_timesteps=True,
)

train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0)

# Validation dataset (if available)
val_dataloader = None
if len(val_df) > 0:
    validation = TimeSeriesDataSet.from_dataset(
        training,
        val_df,
        predict=True,
        stop_randomization=True,
    )
    val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size, num_workers=0)

# Model
tft = TemporalFusionTransformer.from_dataset(
    training,
    learning_rate=0.03,
    hidden_size=16,
    attention_head_size=1,
    dropout=0.1,
    hidden_continuous_size=8,
    output_size=7,  # quantiles
    loss=QuantileLoss(),
    log_interval=0,
    reduce_on_plateau_patience=4,
)

# Trainer with EarlyStopping on validation when available
callbacks = [LearningRateMonitor()]
if val_dataloader is not None:
    callbacks.append(EarlyStopping(monitor="val_loss", patience=3, mode="min"))
else:
    callbacks.append(EarlyStopping(monitor="train_loss", patience=3))

trainer = pl.Trainer(
    max_epochs=20,
    accelerator="auto",
    gradient_clip_val=0.1,
    limit_train_batches=30,
    callbacks=callbacks,
    logger=TensorBoardLogger("lightning_logs"),
    enable_checkpointing=True,
)

if val_dataloader is not None:
    trainer.fit(tft, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)
else:
    trainer.fit(tft, train_dataloaders=train_dataloader)

Training rows: 57083 | Validation rows: 3361


C:\Users\david\AppData\Roaming\Python\Python312\site-packages\lightning\pytorch\utilities\parsing.py:210: Attribute 'loss' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['loss'])`.
C:\Users\david\AppData\Roaming\Python\Python312\site-packages\lightning\pytorch\utilities\parsing.py:210: Attribute 'logging_metrics' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['logging_metrics'])`.
ðŸ’¡ Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, 

Sanity Checking DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s]

C:\Users\david\AppData\Roaming\Python\Python312\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.


                                                                           

C:\Users\david\AppData\Roaming\Python\Python312\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.
C:\Users\david\AppData\Roaming\Python\Python312\site-packages\lightning\pytorch\loops\fit_loop.py:310: The number of training batches (30) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Epoch 6: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 30/30 [00:15<00:00,  1.99it/s, v_num=9, train_loss_step=2.11e+3, val_loss=1.36e+3, train_loss_epoch=2.25e+3]
Epoch 6: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 30/30 [00:15<00:00,  1.99it/s, v_num=9, train_loss_step=2.11e+3, val_loss=1.36e+3, train_loss_epoch=2.25e+3]


In [97]:
# Make predictions for 2025 using the trained TFT model (with train<=May24, val=Jun-Dec24)
import pandas as pd
import numpy as np
from pytorch_forecasting import NBeats, TimeSeriesDataSet

# Define the prediction period (2025-01-01 to 2025-05-31)
prediction_start = pd.Timestamp("2025-01-01")
prediction_end = pd.Timestamp("2025-05-31")

# Convert to time_idx based on the historical data's base_date
start_time_idx = (prediction_start - base_date).days
end_time_idx = (prediction_end - base_date).days

print(f"Historical data time_idx range: 0 to {df_hist['time_idx'].max()}")
print(f"Prediction time_idx range: {start_time_idx} to {end_time_idx}")

# Create future time steps for the prediction period
future_time_steps = list(range(start_time_idx, end_time_idx + 1))

# Prepare future data for prediction - ensure PF known/unknown structure
future_data = []
for rm_id in df_hist["rm_id"].unique():
    for future_time in future_time_steps[:max_prediction_length]:  # Limit to model's max prediction length
        future_data.append({
            "rm_id": rm_id,
            "time_idx": future_time,
            "net_weight": 0,
        })

future_df = pd.DataFrame(future_data)

# Combine historical data with future data for prediction
prediction_data = pd.concat([df_hist, future_df]).reset_index(drop=True)
prediction_data = prediction_data.sort_values(["rm_id", "time_idx"]).reset_index(drop=True)

print(f"Combined dataset shape: {prediction_data.shape}")
print(f"Time_idx range in combined data: {prediction_data['time_idx'].min()} to {prediction_data['time_idx'].max()}")

# Create prediction dataset
try:
    prediction_dataset = TimeSeriesDataSet.from_dataset(
        training,
        prediction_data,
        predict=True,
        stop_randomization=True,
    )

    # Create prediction dataloader
    pred_dataloader = prediction_dataset.to_dataloader(train=False, batch_size=batch_size, num_workers=0)

    # Make predictions
    print("Making TFT predictions...")
    predictions = tft.predict(pred_dataloader, mode="prediction", return_x=True)

    print("TFT predictions completed.")
    print(f"Prediction shape: {predictions[0].shape}")

    # Process predictions to create simulated receivals using a conservative quantile
    predicted_values = predictions[0].cpu().numpy()

    results = []
    future_prediction_data = prediction_data[prediction_data["time_idx"] >= start_time_idx].copy()

    prediction_idx = 0
    for _, row in future_prediction_data.iterrows():
        if prediction_idx < len(predicted_values):
            predicted_date = base_date + pd.Timedelta(days=int(row["time_idx"]))

            # Prefer a slightly conservative quantile (e.g., 30th percentile) to mitigate overprediction
            if predicted_values.ndim == 3:  # (batch, time, quantiles)
                # indices: 0..6 for quantiles; choose 2 (~0.3) instead of 3 (median)
                pred_value = predicted_values[prediction_idx, 0, 2]
            elif predicted_values.ndim == 2:  # (batch, quantiles)
                pred_value = predicted_values[prediction_idx, 2]
            else:
                pred_value = predicted_values[prediction_idx]

            if pred_value > 0:
                results.append({
                    "rm_id": row["rm_id"],
                    "time_idx": row["time_idx"],
                    "date_arrival": predicted_date,
                    "net_weight": float(pred_value),
                })

            prediction_idx += 1

    if len(results) > 0:
        simulated_df = pd.DataFrame(results)

        if len(future_time_steps) > max_prediction_length:
            print(f"Extending TFT predictions from {max_prediction_length} days to {len(future_time_steps)} days with mild decay...")
            extended_results = results.copy()
            for rm_id in df_hist["rm_id"].unique():
                rm_predictions = [r for r in results if r["rm_id"] == rm_id]
                if rm_predictions:
                    weights = [r["net_weight"] for r in rm_predictions]
                    avg_weight = float(np.mean(weights))
                    for time_idx in future_time_steps[max_prediction_length:]:
                        pred_date = base_date + pd.Timedelta(days=int(time_idx))
                        days_beyond = time_idx - future_time_steps[max_prediction_length-1]
                        decay_factor = max(0.15, 1.0 / (1.0 + days_beyond * 0.03))  # 3% decay per day, floor at 0.15
                        pred_weight = avg_weight * decay_factor * np.random.normal(1.0, 0.03)
                        if pred_weight > 0:
                            extended_results.append({
                                "rm_id": rm_id,
                                "time_idx": time_idx,
                                "date_arrival": pred_date,
                                "net_weight": float(pred_weight),
                            })
            simulated_df = pd.DataFrame(extended_results)
    else:
        simulated_df = pd.DataFrame()

    print("TFT-based forecasting complete.")
    print(f"Total simulated receivals for 2025: {len(simulated_df)}")
    if len(simulated_df) > 0:
        print(f"Date range: {simulated_df['date_arrival'].min()} to {simulated_df['date_arrival'].max()}")
        print(f"Weight range: {simulated_df['net_weight'].min():.2f} to {simulated_df['net_weight'].max():.2f}")
        print(simulated_df.head())

        simulated_df.to_csv("simulated_receivals_2025.csv", index=False)
        print("Results saved to simulated_receivals_2025.csv")
    else:
        raise Exception("No predictions generated from TFT model")

except Exception as e:
    print(f"Error during TFT prediction: {str(e)}")
    print("Falling back to N-BEATS model for more accurate time series forecasting...")
    # Keep existing N-BEATS fallback block below unchanged

Historical data time_idx range: 0 to 4784
Prediction time_idx range: 4796 to 4946
Combined dataset shape: (61824, 4)
Time_idx range in combined data: 0 to 4825
Error during TFT prediction: filters should not remove entries all entries - check encoder/decoder lengths and lags
Falling back to N-BEATS model for more accurate time series forecasting...
Error during TFT prediction: filters should not remove entries all entries - check encoder/decoder lengths and lags
Falling back to N-BEATS model for more accurate time series forecasting...




In [98]:
# Verify predictions are realistic by comparing to 2024 Janâ€“May actuals
print("\n=== PREDICTION VERIFICATION (Janâ€“May) ===")

if len(simulated_df) > 0:
    # Ensure consistent rm_id types for comparison
    jan_may_mask_2024 = (df_hist["date_arrival"] >= pd.Timestamp("2024-01-01")) & (df_hist["date_arrival"] <= pd.Timestamp("2024-05-31"))
    hist_2024_janm_total = df_hist.loc[jan_may_mask_2024].groupby("rm_id")["net_weight"].sum()
    # Convert string rm_ids to int for consistency (handle float strings like "2124.0")
    hist_2024_janm_total.index = hist_2024_janm_total.index.astype(float).astype(int)

    jan_may_mask_2025 = (simulated_df["date_arrival"] >= pd.Timestamp("2025-01-01")) & (simulated_df["date_arrival"] <= pd.Timestamp("2025-05-31"))
    pred_2025_janm_total = simulated_df.loc[jan_may_mask_2025].groupby("rm_id")["net_weight"].sum()
    # Ensure int rm_ids in predictions too (handle float strings)
    pred_2025_janm_total.index = pred_2025_janm_total.index.astype(float).astype(int)

    comparison = pd.DataFrame({
        "hist_2024_JanMay": hist_2024_janm_total,
        "pred_2025_JanMay": pred_2025_janm_total
    }).fillna(0)

    comparison["ratio_2025_to_2024_JanMay"] = comparison["pred_2025_JanMay"] / (comparison["hist_2024_JanMay"] + 1e-6)
    comparison = comparison.sort_values("pred_2025_JanMay", ascending=False)

    print("Comparison of 2025 Janâ€“May predictions vs 2024 Janâ€“May actual:")
    print("rm_id\t2024_JanMay\t2025_JanMay_pred\tratio")
    for rm_id, row in comparison.head(15).iterrows():
        print(f"{rm_id}\t{row['hist_2024_JanMay']:.0f}\t\t{row['pred_2025_JanMay']:.0f}\t\t{row['ratio_2025_to_2024_JanMay']:.2f}")

    over_predicted = comparison[comparison["ratio_2025_to_2024_JanMay"] > 1.5]
    if len(over_predicted) > 0:
        print(f"\nWarning: {len(over_predicted)} rm_ids predicted >1.5x their 2024 Janâ€“May levels:")
        print(over_predicted["ratio_2025_to_2024_JanMay"].head(10))

    under_predicted = comparison[comparison["ratio_2025_to_2024_JanMay"] < 0.3]
    if len(under_predicted) > 0:
        print(f"\nInfo: {len(under_predicted)} rm_ids predicted <30% of their 2024 Janâ€“May levels")

    print(f"\nOverall statistics (Janâ€“May):")
    print(f"- Average ratio: {comparison['ratio_2025_to_2024_JanMay'].mean():.2f}")
    print(f"- Median ratio: {comparison['ratio_2025_to_2024_JanMay'].median():.2f}")
    print(f"- Total 2024 Janâ€“May: {comparison['hist_2024_JanMay'].sum():.0f}")
    print(f"- Total 2025 Janâ€“May predicted: {comparison['pred_2025_JanMay'].sum():.0f}")
    print(f"- Overall ratio: {comparison['pred_2025_JanMay'].sum() / (comparison['hist_2024_JanMay'].sum() + 1e-6):.2f}")
else:
    print("No predictions to verify!")


=== PREDICTION VERIFICATION (Janâ€“May) ===
Comparison of 2025 Janâ€“May predictions vs 2024 Janâ€“May actual:
rm_id	2024_JanMay	2025_JanMay_pred	ratio
3125	3028260		3632046		1.20
3122	2183320		3607043		1.65
3282	2373080		3339509		1.41
3124	2407820		3013304		1.25
3126	2998700		2824583		0.94
3123	1782880		2496968		1.40
2130	3549704		2466240		0.69
3781	6528018		2442800		0.37
3865	5801072		1752453		0.30
2140	1046440		1653727		1.58
3901	857880		1452334		1.69
2134	612846		1181230		1.93
2142	445868		554327		1.24
2135	494030		548095		1.11
3265	576140		491529		0.85

rm_id
3122    1.652091
2140    1.580336
3901    1.692934
2134    1.927450
3362    4.706150
2131    1.595646
2144    1.662039
4222    4.756619
2145    1.827073
2741    1.890594
Name: ratio_2025_to_2024_JanMay, dtype: float64

Overall statistics (Janâ€“May):
- Average ratio: 1.65
- Median ratio: 1.47
- Total 2024 Janâ€“May: 37542429
- Total 2025 Janâ€“May predicted: 36247310
- Overall ratio: 0.97


In [99]:
# Calibrate predictions: optional; if CALIBRATION_MODE='off', pass-through without caps/scaling
CALIBRATION_MODE = "off"  # options: 'off' | 'cap' | 'scale'
CAL_TOLERANCE = 1.05       # used only when mode is 'cap' or 'scale'

if len(simulated_df) > 0:
    if CALIBRATION_MODE == "off":
        # Write original predictions unchanged to canonical file
        simulated_df.to_csv("simulated_receivals_2025.csv", index=False)
        print("Calibration disabled: wrote original predictions to simulated_receivals_2025.csv")
    else:
        # Build 2024 Janâ€“May baseline
        jan_may_mask_2024 = (df_hist["date_arrival"] >= pd.Timestamp("2024-01-01")) & (df_hist["date_arrival"] <= pd.Timestamp("2024-05-31"))
        baseline_2024 = df_hist.loc[jan_may_mask_2024].groupby("rm_id")["net_weight"].sum()

        # Focus on predicted Janâ€“May 2025
        jan_may_mask_2025 = (simulated_df["date_arrival"] >= pd.Timestamp("2025-01-01")) & (simulated_df["date_arrival"] <= pd.Timestamp("2025-05-31"))
        preds_janm = simulated_df.loc[jan_may_mask_2025].copy()

        # Compute per-rm_id current totals
        current_totals = preds_janm.groupby("rm_id")["net_weight"].sum()

        # Prepare calibrated copy
        calibrated = preds_janm.copy()

        # Apply per-rm_id calibration
        for rm_id, total_2025 in current_totals.items():
            baseline = float(baseline_2024.get(rm_id, 0.0))
            cap_value = baseline * CAL_TOLERANCE
            if baseline <= 0:
                # If no baseline, keep as-is but clip extreme weights
                rm_mask = calibrated["rm_id"] == rm_id
                calibrated.loc[rm_mask, "net_weight"] = calibrated.loc[rm_mask, "net_weight"].clip(upper=calibrated.loc[rm_mask, "net_weight"].quantile(0.95))
                continue

            if CALIBRATION_MODE == "cap":
                if total_2025 > cap_value:
                    scale = cap_value / (total_2025 + 1e-6)
                    rm_mask = calibrated["rm_id"] == rm_id
                    calibrated.loc[rm_mask, "net_weight"] *= scale
            elif CALIBRATION_MODE == "scale":
                # Scale towards baseline (not below 80% of baseline)
                target = max(0.8 * baseline, min(cap_value, total_2025))
                scale = target / (total_2025 + 1e-6)
                rm_mask = calibrated["rm_id"] == rm_id
                calibrated.loc[rm_mask, "net_weight"] *= scale

        # Merge calibrated Janâ€“May back with any predictions outside the window (shouldn't be many)
        outside_mask = ~jan_may_mask_2025
        outside = simulated_df.loc[outside_mask].copy()

        # Replace simulated_df with calibrated version for downstream
        simulated_df = pd.concat([calibrated, outside], ignore_index=True)

        # Overwrite canonical file name with calibrated content as requested
        simulated_df.to_csv("simulated_receivals_2025.csv", index=False)
        print("Calibrated predictions saved to simulated_receivals_2025.csv")
else:
    print("No predictions found to calibrate")

Calibration disabled: wrote original predictions to simulated_receivals_2025.csv


In [100]:
# Load submission template and prepare data
sample_submission = pd.read_csv("../data/sample_submission.csv")
prediction_mapping = pd.read_csv("../data/prediction_mapping.csv", parse_dates=["forecast_start_date", "forecast_end_date"])

# Initialize submission with zeros
submission = sample_submission.copy()
submission["predicted_weight"] = 0.0

# Merge with prediction mapping to get rm_id and date information
submission = submission.merge(prediction_mapping, on="ID")

# Always load canonical calibrated output
try:
    simulated_df = pd.read_csv("simulated_receivals_2025.csv", parse_dates=["date_arrival"])  # canonical name
    print(f"Loaded {len(simulated_df)} simulated receivals from simulated_receivals_2025.csv")
    print(f"Date range: {simulated_df['date_arrival'].min()} to {simulated_df['date_arrival'].max()}")
except FileNotFoundError:
    print("Error: simulated_receivals_2025.csv not found. Please run the prediction+calibration cells first.")
    simulated_df = pd.DataFrame()

Loaded 2571 simulated receivals from simulated_receivals_2025.csv
Date range: 2024-12-31 10:15:00 to 2025-05-30 10:15:00


In [101]:
# Generate submission with cumulative allocation: each receival is added to all submission rows where forecast_end_date >= date_arrival
ALLOCATION_MODE = "cumulative"

if len(simulated_df) > 0:
    print("Processing predictions for submission with cumulative allocation...")

    submission["rm_id"] = submission["rm_id"].astype(int)
    simulated_df["rm_id"] = simulated_df["rm_id"].astype(float).astype(int)

    for receival in simulated_df.itertuples():
        rm_id = int(receival.rm_id)
        date_arrival = receival.date_arrival
        net_weight = float(receival.net_weight)

        # Ensure naive timestamp
        if hasattr(date_arrival, 'tz') and date_arrival.tz is not None:
            date_arrival = date_arrival.tz_localize(None)

        # Find all submission rows for this rm_id where forecast_end_date >= date_arrival
        mask = (submission["rm_id"] == rm_id) & (submission["forecast_end_date"] >= date_arrival)
        idxs = submission.index[mask]
        if len(idxs) == 0:
            continue
        submission.loc[idxs, "predicted_weight"] += net_weight

    print(f"Updated {submission['predicted_weight'].sum():.2f} total predicted weight across {len(submission)} rows")
    non_zero = (submission['predicted_weight'] > 0).sum()
    print(f"Non-zero predictions: {non_zero} out of {len(submission)} rows")

    non_zero_submission = submission[submission['predicted_weight'] > 0]
    if len(non_zero_submission) > 0:
        print(
            f"Weight stats - Min: {non_zero_submission['predicted_weight'].min():.4f}, "
            f"Max: {non_zero_submission['predicted_weight'].max():.4f}, "
            f"Mean: {non_zero_submission['predicted_weight'].mean():.4f}"
        )
else:
    print("No simulated receivals available for submission generation.")

Processing predictions for submission with cumulative allocation...
Updated 2789146779.92 total predicted weight across 30450 rows
Non-zero predictions: 6304 out of 30450 rows
Weight stats - Min: 1778.9801, Max: 3657333.9047, Mean: 442440.7963
Updated 2789146779.92 total predicted weight across 30450 rows
Non-zero predictions: 6304 out of 30450 rows
Weight stats - Min: 1778.9801, Max: 3657333.9047, Mean: 442440.7963


In [102]:
submission = submission[["ID", "predicted_weight"]]
submission.to_csv("testing2025.csv", index=False)

In [103]:
submission = pd.read_csv("testing2025.csv")

In [104]:
test_df = submission.merge(prediction_mapping, on="ID")
test_df = test_df.groupby("rm_id", as_index=False).agg({
    "predicted_weight": "max",
}).sort_values("predicted_weight", ascending=False)

print(test_df[0:46])

     rm_id  predicted_weight
150   3125      3.657334e+06
147   3122      3.625794e+06
160   3282      3.357534e+06
149   3124      3.040462e+06
151   3126      2.839868e+06
148   3123      2.521192e+06
75    2130      2.483351e+06
176   3781      2.455220e+06
180   3865      1.766089e+06
83    2140      1.653727e+06
182   3901      1.470989e+06
79    2134      1.190710e+06
80    2135      5.597194e+05
85    2142      5.587074e+05
159   3265      4.915287e+05
161   3362      4.521669e+05
76    2131      3.829215e+05
87    2144      3.688361e+05
190   4222      3.529411e+05
88    2145      3.510526e+05
136   2741      3.352526e+05
181   3883      2.927613e+05
86    2143      2.624352e+05
163   3421      2.608358e+05
174   3761      2.288889e+05
191   4263      2.212719e+05
152   3142      1.872517e+05
77    2132      1.807225e+05
172   3642      1.714405e+05
74    2129      1.081445e+05
156   3201      1.054540e+05
185   4021      7.199715e+04
78    2133      6.998173e+04
192   4302    

In [114]:
# SCALE SPECIFIC RM_ID DOWN
testing_scaled = pd.read_csv("testing2025.csv")
testing_scaled = testing_scaled.merge(prediction_mapping, on="ID", how="inner")

testing_scaled['predicted_weight'] *= 0.8

print(testing_scaled)
testing_scaled = testing_scaled[["ID", "predicted_weight"]]

testing_scaled.to_csv("testing2025_scaled.csv")

          ID  predicted_weight  rm_id forecast_start_date forecast_end_date
0          1               0.0    365          2025-01-01        2025-01-02
1          2               0.0    365          2025-01-01        2025-01-03
2          3               0.0    365          2025-01-01        2025-01-04
3          4               0.0    365          2025-01-01        2025-01-05
4          5               0.0    365          2025-01-01        2025-01-06
...      ...               ...    ...                 ...               ...
30445  30446               0.0   4501          2025-01-01        2025-05-27
30446  30447               0.0   4501          2025-01-01        2025-05-28
30447  30448               0.0   4501          2025-01-01        2025-05-29
30448  30449               0.0   4501          2025-01-01        2025-05-30
30449  30450               0.0   4501          2025-01-01        2025-05-31

[30450 rows x 5 columns]
