In [None]:
# Imports
from math import sqrt
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

%matplotlib inline

In [None]:
# Paths
# The notebook is expected to run from 1_DatasetCharacteristics/notebooks, so the data
# directory is resolved one level up; adjust the paths below if it is launched elsewhere.
ROOT = Path('..').resolve()
DATA_DIR = Path('..') / 'processed_data'
TRAIN_FILE = DATA_DIR / 'train_features.csv'
TEST_FILE = DATA_DIR / 'test_features.csv'
print('Train file:', TRAIN_FILE)
print('Test file:', TEST_FILE)

In [None]:
# Load the train and test feature tables, asking pandas to parse 'Datum' as dates
train, test = (
    pd.read_csv(csv_path, parse_dates=['Datum'])
    for csv_path in (TRAIN_FILE, TEST_FILE)
)
train.shape, test.shape

In [None]:
# Prepare features and target (same as scripts)
FEATURE_COLUMNS = ['Temperatur','Niederschlag','Windgeschwindigkeit','Wettercode','is_holiday']
def prepare_xy(df):
    """Build the feature matrix and target vector from a feature table.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified. Feature columns absent from ``df`` are
        created as all-NaN before imputation.

    Returns
    -------
    X : pandas.DataFrame
        Exactly ``FEATURE_COLUMNS``, in that order, with a fresh 0..n-1 index.
        'Wettercode' is coerced to numeric (unparseable values become NaN), numeric
        NaNs are replaced by the column median computed on ``df`` itself (0.0 when
        the whole column is NaN), and 'is_holiday' is cast to int.
    y : pandas.Series or None
        The 'Umsatz' column if present, otherwise 'Umsatz_label', restricted to the
        rows where it is not NaN (``X`` is restricted to the same rows). ``None``
        when ``df`` has neither column; in that case no row of ``X`` is dropped.
    """
    # Prefer 'Umsatz', fall back to 'Umsatz_label'; a frame with neither is unlabeled
    # (e.g. a test table) and must not raise KeyError.
    target_col = next((c for c in ('Umsatz', 'Umsatz_label') if c in df.columns), None)

    # reindex returns a new frame holding only the feature columns (missing ones as NaN),
    # so the caller's frame is untouched and the assignments below do not write into a
    # slice of another frame.
    X = df.reindex(columns=FEATURE_COLUMNS)
    X['Wettercode'] = pd.to_numeric(X['Wettercode'], errors='coerce')

    # Fill numeric NaNs with the column median; an all-NaN column has a NaN median -> 0.0
    for col in X.select_dtypes(include=[np.number]).columns:
        median = X[col].median()
        X[col] = X[col].fillna(0.0 if pd.isna(median) else median)
    X['is_holiday'] = X['is_holiday'].fillna(False).astype(int)

    if target_col is None:
        return X.reset_index(drop=True), None

    y = df[target_col]
    labeled = y.notna()
    return X.loc[labeled].reset_index(drop=True), y.loc[labeled].reset_index(drop=True)
# Build the training feature matrix X and target y; prepare_xy drops rows whose target is NaN
X, y = prepare_xy(train)
# Cell output: the shapes of X and y
X.shape, y.shape

In [None]:
# Reproducible 80/20 shuffle split: permute row positions with a fixed seed, cut at 80%
rng = np.random.default_rng(42)
idx = rng.permutation(len(X))
cut = int(len(X) * 0.8)
train_idx = idx[:cut]
val_idx = idx[cut:]

X_train = X.iloc[train_idx].to_numpy()
X_val = X.iloc[val_idx].to_numpy()
y_train = y.iloc[train_idx].to_numpy()
y_val = y.iloc[val_idx].to_numpy()

# Least-squares linear regression with an intercept: a leading column of ones is
# prepended to each design matrix so the first coefficient is the intercept
X_train_aug = np.column_stack([np.ones(len(X_train)), X_train])
X_val_aug = np.column_stack([np.ones(len(X_val)), X_val])
coef = np.linalg.lstsq(X_train_aug, y_train, rcond=None)[0]

# Score the fit on the held-out 20%
y_pred = X_val_aug @ coef
rmse = sqrt(np.mean((y_val - y_pred) ** 2))
print(f'Validation RMSE: {rmse:.4f}')

# Show coefficients, labelled by feature name
cols = ['intercept', *X.columns]
for c, v in zip(cols, coef):
    print(f'{c}: {v:.6f}')

In [None]:
# Predict on test set and save predictions
# prepare_xy keeps only rows whose target is present and looks the target column up by
# name, but every test row needs a prediction. Passing a constant placeholder target
# guarantees that no row is dropped and that the call works when the test table carries
# no target column; only the feature matrix (element 0) is used.
X_test = prepare_xy(test.assign(Umsatz=0.0))[0]
assert len(X_test) == len(test), 'feature rows must stay aligned with test rows'
# NOTE(review): prepare_xy imputes NaNs with medians of the frame it is given, so test
# features are filled with test-set medians rather than the training medians.
X_test_aug = np.hstack([np.ones((X_test.shape[0],1)), X_test.to_numpy()])
preds = X_test_aug.dot(coef)
out = pd.DataFrame({'Datum': test['Datum'], 'Umsatz_pred': preds})
# Write next to the input files, reusing the data directory defined in the paths cell
out_path = DATA_DIR / 'test_predictions_baseline_from_nb.csv'
out.to_csv(out_path, index=False)
print('Wrote predictions to', out_path)

## Next steps
- Try regularized linear models (Ridge), tree-based models, or feature engineering (lagged values and rolling-window statistics).
- Add cross-validation and error analysis plots.
- Save model coefficients and predictions for reproducibility.