In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
import os
import ipynbname

In [2]:
# Change cwd to project root to ensure relative paths work correctly.
# If moving the .ipynb file back to project root, simply remove the 2 lines below.
project_root = os.path.abspath(os.path.join(os.path.dirname(ipynbname.path()), '..'))
os.chdir(project_root)

print(sys.version)
print(f"cwd: {os.getcwd()}")
print(f"Pandas version: {pd.__version__}")

3.14.0 (main, Oct  7 2025, 15:35:21) [Clang 20.1.4 ]
cwd: /home/christian/Schreibtisch/semester7/moderne-maskinlæring-i-praksis-TDT4173/group-project/main_predict_stock
Pandas version: 2.3.3


In [6]:
# receivals_path = "data/kernel/receivals.csv"
receivals_path = "data_cleaned/receivals_cleaned.csv"  # use cleaned data

receivals = pd.read_csv(receivals_path)
pred_map = pd.read_csv("data/prediction_mapping.csv")

# preprocess receivals
receivals['date_arrival'] = pd.to_datetime(receivals['date_arrival'])

# Make datetime naive (remove timezone) to allow comparison with naive window_start/window_end
if receivals['date_arrival'].dt.tz is not None:
    receivals['date_arrival'] = receivals['date_arrival'].dt.tz_localize(None)

receivals = receivals.dropna(subset=['rm_id', 'net_weight'])

# build historical intervals — for every year, same windows as prediction
intervals = []
for year in range(2010, 2025):   # adjust as necessary
    for window_len in range(1, 152):  # e.g., Jan 1–May 31 == 151 days
        window_start = pd.Timestamp(year=year, month=1, day=1)
        window_end = window_start + pd.Timedelta(days=window_len-1)
        for rm_id in receivals['rm_id'].unique():
            mask = (
                (receivals['rm_id'] == rm_id)
                & (receivals['date_arrival'] >= window_start)
                & (receivals['date_arrival'] <= window_end)
            )
            total = receivals.loc[mask, 'net_weight'].sum()
            # build features: e.g., total last year, mean per interval, etc.
            features = {
                "rm_id": rm_id,
                "interval_days": window_len,
                "year": year,
                # add more features...
                # e.g.: "prev_year_total": ...
            }
            intervals.append({**features, "target": total})

train_df = pd.DataFrame(intervals)
print(train_df.head())

   rm_id  interval_days  year  target
0  365.0              1  2010     0.0
1  379.0              1  2010     0.0
2  389.0              1  2010     0.0
3  369.0              1  2010     0.0
4  366.0              1  2010     0.0


In [7]:
print(train_df.shape)

(459795, 4)


In [8]:
X = train_df.drop(columns=['target'])
y = train_df['target']

import xgboost as xgb

model = xgb.XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=8,
    min_child_weight=6,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.5,
    reg_lambda=1.0,
    tree_method='hist',
    random_state=42,
    n_jobs=-1,
    objective='reg:quantileerror',
    quantile_alpha=0.2,
)

# (Optional) validation split—critical for time series generalization:
from sklearn.model_selection import train_test_split
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, shuffle=False)  # Do NOT shuffle in time series context!

model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    verbose=True,
)

# Evaluate
from sklearn.metrics import mean_absolute_error
y_pred = model.predict(X_val)
print("MAE on validation:", mean_absolute_error(y_val, y_pred))

[0]	validation_0-quantile:18453.13356
[1]	validation_0-quantile:18481.57702
[2]	validation_0-quantile:18481.57702
[3]	validation_0-quantile:18502.05475
[4]	validation_0-quantile:18517.06179
[5]	validation_0-quantile:18517.06179
[6]	validation_0-quantile:18517.06179
[7]	validation_0-quantile:18532.56401
[8]	validation_0-quantile:18479.42718
[9]	validation_0-quantile:18479.42718
[10]	validation_0-quantile:18479.42718
[11]	validation_0-quantile:18479.42718
[12]	validation_0-quantile:18509.19872
[13]	validation_0-quantile:18509.19872
[14]	validation_0-quantile:18532.46315
[15]	validation_0-quantile:18532.46315
[16]	validation_0-quantile:18558.59958
[17]	validation_0-quantile:18558.59958
[18]	validation_0-quantile:18552.46610
[19]	validation_0-quantile:18555.67064
[20]	validation_0-quantile:18552.99316
[21]	validation_0-quantile:18567.49026
[22]	validation_0-quantile:18567.49026
[23]	validation_0-quantile:18567.49026
[24]	validation_0-quantile:18527.08653
[25]	validation_0-quantile:18527.08

In [None]:
prediction_map = pd.read_csv("data/prediction_mapping.csv")

rm_ids = prediction_map['rm_id'].unique()
interval_days = range(1, 151)
year = 2025

predict_df = pd.DataFrame([
    {"rm_id": rm_id, "interval_days": interval, "year": year}
    for rm_id in rm_ids
    for interval in interval_days
])

print(predict_df.head(), predict_df.shape)




   rm_id  interval_days  year
0    365              1  2025
1    365              2  2025
2    365              3  2025
3    365              4  2025
4    365              5  2025 (30450, 3)


In [21]:
y_pred = model.predict(predict_df)
y_pred = np.maximum(0, y_pred)  # Ensure no negative predictions

pd.DataFrame({
    "ID": np.arange(1, len(y_pred) + 1),
    "predicted_weight": y_pred
}).to_csv("christian/basic_xgboost_predictions.csv", index=False)


In [22]:
print(max(y_pred), min(y_pred), np.mean(y_pred), np.median(y_pred))

4.287798e+06 0.0 27960.938 0.0
