In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
import os
import ipynbname

In [2]:
# Change cwd to project root to ensure relative paths work correctly.
# If moving the .ipynb file back to project root, simply remove the 2 lines below.
project_root = os.path.abspath(os.path.join(os.path.dirname(ipynbname.path()), '..'))
os.chdir(project_root)

print(sys.version)
print(f"cwd: {os.getcwd()}")
print(f"Pandas version: {pd.__version__}")

3.14.0 (main, Oct  7 2025, 15:35:21) [Clang 20.1.4 ]
cwd: /home/christian/Schreibtisch/semester7/moderne-maskinlæring-i-praksis-TDT4173/group-project/main_predict_stock
Pandas version: 2.3.3


In [3]:
# receivals_path = "data/kernel/receivals.csv"
receivals_path = "data_cleaned/receivals_cleaned.csv"  # use cleaned data

receivals = pd.read_csv(receivals_path)
pred_map = pd.read_csv("data/prediction_mapping.csv")

# preprocess receivals
receivals['date_arrival'] = pd.to_datetime(receivals['date_arrival'])

# Make datetime naive (remove timezone) to allow comparison with naive window_start/window_end
if receivals['date_arrival'].dt.tz is not None:
    receivals['date_arrival'] = receivals['date_arrival'].dt.tz_localize(None)

receivals = receivals.dropna(subset=['rm_id', 'net_weight'])

# build historical intervals — for every year, same windows as prediction
intervals = []
for year in range(2021, 2025):   # adjust as necessary
    for window_len in range(1, 152):  # e.g., Jan 1–May 31 == 151 days
        window_start = pd.Timestamp(year=year, month=1, day=1)
        window_end = window_start + pd.Timedelta(days=window_len-1)
        for rm_id in receivals['rm_id'].unique():
            mask = (
                (receivals['rm_id'] == rm_id)
                & (receivals['date_arrival'] >= window_start)
                & (receivals['date_arrival'] <= window_end)
            )
            total = receivals.loc[mask, 'net_weight'].sum()
            # build features: e.g., total last year, mean per interval, etc.
            features = {
                "rm_id": rm_id,
                "interval_days": window_len,
                "year": year,
                "month": window_end.month,
                "day_of_week": window_end.dayofweek,
                # add more features...
                # e.g.: "prev_year_total": ...
            }
            intervals.append({**features, "target": total})

train_df = pd.DataFrame(intervals)
print(train_df.head())

   rm_id  interval_days  year  month  day_of_week  target
0  365.0              1  2021      1            4     0.0
1  379.0              1  2021      1            4     0.0
2  389.0              1  2021      1            4     0.0
3  369.0              1  2021      1            4     0.0
4  366.0              1  2021      1            4     0.0


In [4]:
print(train_df.shape)

(122612, 6)


In [5]:
X = train_df.drop(columns=['target'])
y = train_df['target']

import xgboost as xgb

model = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=8,
    min_child_weight=6,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.5,
    reg_lambda=1.0,
    tree_method='hist',
    random_state=42,
    n_jobs=-1,
    objective='reg:quantileerror',
    quantile_alpha=0.2,
)

# (Optional) validation split—critical for time series generalization:
from sklearn.model_selection import train_test_split
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, shuffle=False)  # Do NOT shuffle in time series context!

model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    verbose=True,
)

# Evaluate
from sklearn.metrics import mean_absolute_error
y_pred = model.predict(X_val)
print("MAE on validation:", mean_absolute_error(y_val, y_pred))

[0]	validation_0-quantile:21309.17352
[1]	validation_0-quantile:21309.17352
[2]	validation_0-quantile:21298.90584
[3]	validation_0-quantile:21291.97148
[4]	validation_0-quantile:21281.33964
[5]	validation_0-quantile:21281.33964
[6]	validation_0-quantile:21274.40233
[7]	validation_0-quantile:21274.40233
[8]	validation_0-quantile:21260.93195
[9]	validation_0-quantile:21249.78588
[10]	validation_0-quantile:21249.78588
[11]	validation_0-quantile:21245.19723
[12]	validation_0-quantile:21245.19723
[13]	validation_0-quantile:21235.85673
[14]	validation_0-quantile:21226.44943
[15]	validation_0-quantile:21226.44943
[16]	validation_0-quantile:21226.44943
[17]	validation_0-quantile:21226.44943
[18]	validation_0-quantile:21226.44943
[19]	validation_0-quantile:21226.04740
[20]	validation_0-quantile:21227.22609
[21]	validation_0-quantile:21220.54371
[22]	validation_0-quantile:21222.62965
[23]	validation_0-quantile:21209.74596
[24]	validation_0-quantile:21209.74596
[25]	validation_0-quantile:21200.87

In [6]:
prediction_map = pd.read_csv("data/prediction_mapping.csv")

rm_ids = prediction_map['rm_id'].unique()
interval_days = range(1, 151)
year = 2025

predict_df = pd.DataFrame([
    {"rm_id": rm_id, "interval_days": interval, "year": year, "month": (pd.Timestamp(year=year, month=1, day=1) + pd.Timedelta(days=interval-1)).month,
     "day_of_week": (pd.Timestamp(year=year, month=1, day=1) + pd.Timedelta(days=interval-1)).dayofweek}
    for rm_id in rm_ids
    for interval in interval_days
])

print(predict_df.head(), predict_df.shape)




   rm_id  interval_days  year  month  day_of_week
0    365              1  2025      1            2
1    365              2  2025      1            3
2    365              3  2025      1            4
3    365              4  2025      1            5
4    365              5  2025      1            6 (30450, 5)


In [7]:
y_pred = model.predict(predict_df)
y_pred = np.maximum(0, y_pred)  # Ensure no negative predictions

pred2025 = pd.DataFrame({
    "ID": np.arange(1, len(y_pred) + 1),
    "predicted_weight": y_pred
})

pred2025.to_csv("christian/basic_xgboost_predictions_jan-may_2021-2024_day_of_week.csv", index=False)



In [8]:
prediction_mapping = pd.read_csv("data/prediction_mapping.csv")
merged = pred2025.merge(prediction_mapping, on="ID")

agg_df = merged.groupby("rm_id", as_index=False).agg({
    "predicted_weight": "max",
}).sort_values("predicted_weight", ascending=False)

print(agg_df.head(20))
# print(max(y_pred), min(y_pred), np.mean(y_pred), np.median(y_pred))

     rm_id  predicted_weight
83    2140      3.202167e+06
75    2130      2.121236e+06
151   3126      7.534909e+05
150   3125      7.283574e+05
147   3122      6.625316e+05
148   3123      6.307374e+05
149   3124      6.043996e+05
85    2142      4.709707e+05
146   3121      3.295725e+05
79    2134      3.283404e+05
80    2135      3.230294e+05
142   2981      3.159286e+05
76    2131      1.880870e+05
87    2144      1.848180e+05
77    2132      1.547058e+05
152   3142      1.117459e+05
88    2145      1.001855e+05
86    2143      7.004290e+04
74    2129      6.987972e+04
159   3265      6.944052e+04
