In [256]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import torch.nn as nn
import torch

In [257]:
f = './merged_omzet_weer_ticket.csv'

df = pd.read_csv(f, usecols=['Datum_uur', 'Omzet', 'Datum', 'Neerslag', "Tmin", "Tmax", "aantal_tickets", "aantal_opgedaagd"])

df.head()

Unnamed: 0,Datum_uur,Omzet,Datum,Tmax,Tmin,Neerslag,aantal_tickets,aantal_opgedaagd
0,2022-07-09 14:00:00,9.0,2022-07-09,21.6 °C,15.3 °C,"0,0 mm",1319.0,962.0
1,2022-07-09 16:00:00,182.35,2022-07-09,21.6 °C,15.3 °C,"0,0 mm",1319.0,962.0
2,2022-07-09 17:00:00,767.1,2022-07-09,21.6 °C,15.3 °C,"0,0 mm",1319.0,962.0
3,2022-07-09 18:00:00,1933.7,2022-07-09,21.6 °C,15.3 °C,"0,0 mm",1319.0,962.0
4,2022-07-09 19:00:00,2567.4,2022-07-09,21.6 °C,15.3 °C,"0,0 mm",1319.0,962.0


In [258]:
df.shape

(2103, 8)

In [259]:
def clean_neerslag(v: str):
  if (not isinstance(v, str)): return v
  
  return float(v.replace('mm', '').replace(',', '.').strip())

def clean_temp(v: str):
  if (not isinstance(v, str)): return v

  return float(v.replace('°C', '').strip())

In [260]:
df['Neerslag'] = df['Neerslag'].apply(clean_neerslag)
df['Tmin'] = df['Tmin'].apply(clean_temp)
df['Tmax'] = df['Tmax'].apply(clean_temp)

In [261]:
df['hour'] = pd.to_datetime(df['Datum_uur']).dt.hour
df['day_of_the_week'] = pd.to_datetime(df['Datum']).dt.dayofweek
df['month'] = pd.to_datetime(df['Datum']).dt.month
df["is_weekend"] = (df["day_of_the_week"] >= 5) & (df['day_of_the_week'] < 7)

df = df.dropna(subset=['Omzet'])

df.head()

Unnamed: 0,Datum_uur,Omzet,Datum,Tmax,Tmin,Neerslag,aantal_tickets,aantal_opgedaagd,hour,day_of_the_week,month,is_weekend
0,2022-07-09 14:00:00,9.0,2022-07-09,21.6,15.3,0.0,1319.0,962.0,14,5,7,True
1,2022-07-09 16:00:00,182.35,2022-07-09,21.6,15.3,0.0,1319.0,962.0,16,5,7,True
2,2022-07-09 17:00:00,767.1,2022-07-09,21.6,15.3,0.0,1319.0,962.0,17,5,7,True
3,2022-07-09 18:00:00,1933.7,2022-07-09,21.6,15.3,0.0,1319.0,962.0,18,5,7,True
4,2022-07-09 19:00:00,2567.4,2022-07-09,21.6,15.3,0.0,1319.0,962.0,19,5,7,True


In [262]:
omzet_vorige_uur = []

for i in range(len(df)):
  if i == 0:
    omzet_vorige_uur.append(900)
  else:
    vorige_rij = df.iloc[i - 1]	
    huidige_rij = df.iloc[i]

    if (huidige_rij['Datum'] == vorige_rij['Datum']) and (huidige_rij['hour'] - 1 == vorige_rij['hour']):
      omzet_vorige_uur.append(vorige_rij['Omzet'])
    else:
      omzet_vorige_uur.append(huidige_rij['Omzet'])
      
df['omzet_vorige_uur'] = omzet_vorige_uur

df.head()

Unnamed: 0,Datum_uur,Omzet,Datum,Tmax,Tmin,Neerslag,aantal_tickets,aantal_opgedaagd,hour,day_of_the_week,month,is_weekend,omzet_vorige_uur
0,2022-07-09 14:00:00,9.0,2022-07-09,21.6,15.3,0.0,1319.0,962.0,14,5,7,True,900.0
1,2022-07-09 16:00:00,182.35,2022-07-09,21.6,15.3,0.0,1319.0,962.0,16,5,7,True,182.35
2,2022-07-09 17:00:00,767.1,2022-07-09,21.6,15.3,0.0,1319.0,962.0,17,5,7,True,182.35
3,2022-07-09 18:00:00,1933.7,2022-07-09,21.6,15.3,0.0,1319.0,962.0,18,5,7,True,767.1
4,2022-07-09 19:00:00,2567.4,2022-07-09,21.6,15.3,0.0,1319.0,962.0,19,5,7,True,1933.7


In [263]:
# Basic cyclical encoding
df['hour_sin'] = np.sin(2 * np.pi * df['hour']/24)
df['hour_cos'] = np.cos(2 * np.pi * df['hour']/24)
df['day_sin'] = np.sin(2 * np.pi * df['day_of_the_week']/7)
df['day_cos'] = np.cos(2 * np.pi * df['day_of_the_week']/7)
df['month_sin'] = np.sin(2 * np.pi * df['month']/12)
df['month_cos'] = np.cos(2 * np.pi * df['month']/12)

# Feature interactions
df['temp_range'] = df['Tmax'] - df['Tmin']
df['temp_avg'] = (df['Tmax'] + df['Tmin']) / 2

# Ratio features
df['opkomst_ratio'] = df['aantal_opgedaagd'] / df['aantal_tickets'].clip(lower=1)

# Time-based aggregations (rolling averages)
df['rolling_tickets_avg'] = df.groupby('hour')['aantal_tickets'].transform(lambda x: x.rolling(window=7, min_periods=1).mean())
df['rolling_opgedaagd_avg'] = df.groupby('hour')['aantal_opgedaagd'].transform(lambda x: x.rolling(window=7, min_periods=1).mean())

In [264]:
feature_columns = [
    'hour_sin', 'hour_cos', 'day_sin', 'day_cos', 'month_sin', 'month_cos',
    'is_weekend', 'Neerslag', 'Tmin', 'Tmax', 'temp_range', 'temp_avg',
    'aantal_tickets', 'aantal_opgedaagd', 'opkomst_ratio',
    'rolling_tickets_avg', 'rolling_opgedaagd_avg', 'omzet_vorige_uur'
]

X = df[feature_columns]
y = df['Omzet']

In [265]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [266]:
# Normaliseer de features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

y_scaler = StandardScaler()
y_train_scaled = y_scaler.fit_transform(y_train.values.reshape(-1, 1))
y_test_scaled = y_scaler.transform(y_test.values.reshape(-1, 1))

In [267]:
X_train_tensor = torch.FloatTensor(X_train_scaled)
X_test_tensor = torch.FloatTensor(X_test_scaled)
y_train_tensor = torch.FloatTensor(y_train_scaled)
y_test_tensor = torch.FloatTensor(y_test_scaled)

In [268]:
print("X_train min:", X_train.min(), "X_train max:", X_train.max())
print("y_train min:", y_train.min(), "y_train max:", y_train.max())


X_train min: hour_sin                     -1.0
hour_cos                     -1.0
day_sin                 -0.974928
day_cos                 -0.900969
month_sin                    -1.0
month_cos                    -1.0
is_weekend                  False
Neerslag                      0.0
Tmin                         -2.3
Tmax                          2.3
temp_range                    1.3
temp_avg                      0.0
aantal_tickets              104.0
aantal_opgedaagd              0.0
opkomst_ratio                 0.0
rolling_tickets_avg         166.0
rolling_opgedaagd_avg       129.0
omzet_vorige_uur             0.01
dtype: object X_train max: hour_sin                         1.0
hour_cos                         1.0
day_sin                     0.974928
day_cos                          1.0
month_sin                        1.0
month_cos                        1.0
is_weekend                      True
Neerslag                        21.8
Tmin                            19.8
Tmax           

In [269]:
# class RevenuePredictor(nn.Module):
#     def __init__(self, input_size):
#         super(RevenuePredictor, self).__init__()
#         self.net = nn.Sequential(
#             nn.Linear(input_size, 32),
#             nn.ReLU(),
#             nn.BatchNorm1d(32),
#             nn.Dropout(0.1), 

#             nn.Linear(32, 16),
#             nn.ReLU(),
#             nn.BatchNorm1d(16),
#             nn.Dropout(0.1),

#             nn.Linear(16, 1)
#         )
    
#     def forward(self, x):
#         return self.net(x)

# model = RevenuePredictor(input_size=X_train.shape[1])

class RevenuePredictor(nn.Module):
    def __init__(self, input_size):
        super(RevenuePredictor, self).__init__()
        
        # L1 regularization voor feature selectie
        self.feature_selector = nn.Linear(input_size, input_size)
        self.sigmoid = nn.Sigmoid()
        
        self.net = nn.Sequential(
            nn.Linear(input_size, 16),
            nn.ReLU(),
            nn.BatchNorm1d(16),
            
            nn.Linear(16, 8),
            nn.ReLU(),
            nn.BatchNorm1d(8),
            
            nn.Linear(8, 1)
        )
    
    def forward(self, x):
        # Feature importance gates
        importance = self.sigmoid(self.feature_selector(x))
        x = x * importance
        return self.net(x)

In [270]:
# Training parameters instellen
batch_size = 32
num_epochs = 100
learning_rate = 0.001

# DataLoader maken voor batched training
train_dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Model, loss functie en optimizer initialiseren
model = RevenuePredictor(input_size=X_train_scaled.shape[1])
criterion = nn.HuberLoss(delta=0.5)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.001)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5, verbose=True)

# Training loop
best_loss = float('inf')
patience = 8
patience_counter = 0

for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    
    for batch_X, batch_y in train_loader:
        # Forward pass
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        
        # Backward pass en optimalisatie
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.5)
        optimizer.step()
        
        train_loss += loss.item()
    
    # Evaluatie op de testset
    model.eval()
    val_loss = 0
    with torch.no_grad():
        test_outputs = model(X_test_tensor)
        test_loss = criterion(test_outputs, y_test_tensor)
    
    # Print voortgang
    avg_train_loss = train_loss / len(train_loader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}, Test Loss: {test_loss:.4f}')
    
    # Learning rate scheduling
    scheduler.step(test_loss)
    
    # # Early stopping
    if test_loss < best_loss:
        best_loss = test_loss
        patience_counter = 0
    else:
        patience_counter += 1
        
    if patience_counter >= patience:
        print("Early stopping triggered")
        break

# Evaluatie van het finale model
model.eval()
with torch.no_grad():
    train_predictions = model(X_train_tensor)
    test_predictions = model(X_test_tensor)
    
    # Convert predictions terug naar originele schaal
    train_predictions = y_scaler.inverse_transform(train_predictions.numpy())
    test_predictions = y_scaler.inverse_transform(test_predictions.numpy())
    train_actual = y_scaler.inverse_transform(y_train_tensor.numpy())
    test_actual = y_scaler.inverse_transform(y_test_tensor.numpy())
    
    # Bereken metrics
    train_mse = mean_squared_error(train_actual, train_predictions)
    test_mse = mean_squared_error(test_actual, test_predictions)
    train_r2 = r2_score(train_actual, train_predictions)
    test_r2 = r2_score(test_actual, test_predictions)
    
    print(f"\nFinale resultaten:")
    print(f"Train MSE: {train_mse:.4f}")
    print(f"Test MSE: {test_mse:.4f}")
    print(f"Train R²: {train_r2:.4f}")
    print(f"Test R²: {test_r2:.4f}")

Epoch [1/100], Train Loss: 0.2201, Test Loss: 0.2229
Epoch [2/100], Train Loss: 0.1539, Test Loss: 0.1769
Epoch [3/100], Train Loss: 0.1154, Test Loss: 0.1492
Epoch [4/100], Train Loss: 0.0932, Test Loss: 0.0932




Epoch [5/100], Train Loss: 0.0843, Test Loss: 0.0799
Epoch [6/100], Train Loss: 0.0795, Test Loss: 0.1267
Epoch [7/100], Train Loss: 0.0742, Test Loss: 0.0740
Epoch [8/100], Train Loss: 0.0682, Test Loss: 0.1276
Epoch [9/100], Train Loss: 0.0724, Test Loss: 0.0718
Epoch [10/100], Train Loss: 0.0693, Test Loss: 0.0788
Epoch [11/100], Train Loss: 0.0687, Test Loss: 0.0639
Epoch [12/100], Train Loss: 0.0637, Test Loss: 0.0817
Epoch [13/100], Train Loss: 0.0608, Test Loss: 0.0590
Epoch [14/100], Train Loss: 0.0662, Test Loss: 0.0587
Epoch [15/100], Train Loss: 0.0618, Test Loss: 0.0969
Epoch [16/100], Train Loss: 0.0604, Test Loss: 0.0908
Epoch [17/100], Train Loss: 0.0593, Test Loss: 0.0624
Epoch [18/100], Train Loss: 0.0565, Test Loss: 0.0890
Epoch [19/100], Train Loss: 0.0596, Test Loss: 0.0569
Epoch [20/100], Train Loss: 0.0573, Test Loss: 0.0653
Epoch [21/100], Train Loss: 0.0594, Test Loss: 0.0775
Epoch [22/100], Train Loss: 0.0582, Test Loss: 0.1108
Epoch [23/100], Train Loss: 0.057

In [271]:
torch.save(model.state_dict(), "revenue_model.pth")