In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [5]:
df = pd.read_csv('sorting_event_volumes_2023.csv')

Unnamed: 0,sorting_center_name,event_location,event_type,scanning_date,input_belt,output_belt,position,no_of_events
0,VANTAA,1000388,LAJ,2023-01-02,,,,20
1,VANTAA,1000388,LAJ,2023-01-03,,,,18
2,VANTAA,1000388,LAJ,2023-01-04,,,,17
3,VANTAA,1000388,LAJ,2023-01-05,,,,24
4,VANTAA,1000388,LAJ,2023-01-09,,,,15
...,...,...,...,...,...,...,...,...
8949716,OULU,90940641,REK,2023-12-27,,,,22
8949717,OULU,90940641,REK,2023-12-28,,,,32
8949718,OULU,90940641,REK,2023-12-29,,,,21
8949719,OULU,90940641,REK,2023-12-30,,,,14


### Cleaning and datamanagement 

In [6]:
# Data cleaning
print("Number of rows is: " + str(df.shape[0]))
df = df.loc[df["event_type"] == "LAJ", :].copy()
df.drop(['event_location', 'input_belt', 'position'], axis=1, inplace = True)
df.dropna(inplace = True)
df['output_belt'] = df['output_belt'].astype(int)
df['scanning_date'] = pd.to_datetime(df['scanning_date'])

print("Number of rows cleaned data is: " + str(df.shape[0]))

Number of rows is: 8949721
Number of rows cleaned data is: 7450939


In [7]:
# Data preparation
df['day'] = df['scanning_date'].dt.day
df['month'] = df['scanning_date'].dt.month
df['weekday'] = df['scanning_date'].dt.day_of_week + 1
df['week'] = df['scanning_date'].dt.day_of_year // 7 + 1
df['week_of_month'] = (df['day'] - 1) // 7 + 1


sorting_center_names = df["sorting_center_name"].unique()
df["sorting_center_name"].value_counts()

sorting_center_name
VANTAA       4888466
TAMPERE      1154436
KUOPIO        578274
LIETO         443087
OULU          354047
SEINÄJOKI      32629
Name: count, dtype: int64

In [8]:
dfs = {}
for name, group in df.groupby('sorting_center_name'):
    df_grouped = group.groupby(['scanning_date', 'output_belt'], as_index=False)['no_of_events'].sum()
    df_grouped = df_grouped.merge(group[['scanning_date', 'day', 'month', 'weekday', 'week', 'week_of_month']].drop_duplicates(), 
                                   on='scanning_date', 
                                   how='left')
    dfs[name] = df_grouped

dfs['VANTAA']

Unnamed: 0,scanning_date,output_belt,no_of_events,day,month,weekday,week,week_of_month
0,2023-01-02,0,1,2,1,1,1,1
1,2023-01-02,1,533,2,1,1,1,1
2,2023-01-02,2,256,2,1,1,1,1
3,2023-01-02,3,1422,2,1,1,1,1
4,2023-01-02,4,1684,2,1,1,1,1
...,...,...,...,...,...,...,...,...
44001,2023-12-30,96,956,30,12,6,53,5
44002,2023-12-30,97,692,30,12,6,53,5
44003,2023-12-30,98,929,30,12,6,53,5
44004,2023-12-30,99,47,30,12,6,53,5


### Restructuring the data to 300 rows for the days

In [19]:
df_vantaa = dfs['VANTAA']
pivot_df = df_vantaa.pivot_table(
    index=['scanning_date', 'day', 'month', 'weekday', 'week', 'week_of_month'],
    columns='output_belt',
    values='no_of_events',
    fill_value=0  # Vul lege waarden met 0 als er geen events zijn voor bepaalde output_belts
)
pivot_df = pivot_df.reset_index()

pivot_df.columns.name = None
pivot_df = pivot_df.rename(columns=lambda x: f'no_of_events_output_belt_{x}' if isinstance(x, int) else x)

pivot_df


Unnamed: 0,scanning_date,day,month,weekday,week,week_of_month,no_of_events_output_belt_0,no_of_events_output_belt_1,no_of_events_output_belt_2,no_of_events_output_belt_3,...,no_of_events_output_belt_342,no_of_events_output_belt_343,no_of_events_output_belt_344,no_of_events_output_belt_345,no_of_events_output_belt_346,no_of_events_output_belt_347,no_of_events_output_belt_348,no_of_events_output_belt_349,no_of_events_output_belt_350,no_of_events_output_belt_351
0,2023-01-02,2,1,1,1,1,1.0,533.0,256.0,1422.0,...,681.0,725.0,598.0,387.0,419.0,611.0,739.0,660.0,751.0,546.0
1,2023-01-03,3,1,2,1,1,5.0,753.0,323.0,2333.0,...,346.0,854.0,682.0,264.0,621.0,231.0,1351.0,251.0,1173.0,479.0
2,2023-01-04,4,1,3,1,1,5.0,552.0,374.0,2460.0,...,756.0,753.0,685.0,306.0,570.0,425.0,1046.0,522.0,932.0,462.0
3,2023-01-05,5,1,4,1,1,2.0,713.0,263.0,2230.0,...,409.0,635.0,511.0,232.0,454.0,353.0,730.0,429.0,784.0,316.0
4,2023-01-06,6,1,5,1,1,1.0,128.0,135.0,879.0,...,0.0,66.0,23.0,0.0,37.0,0.0,66.0,0.0,61.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336,2023-12-26,26,12,2,52,4,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
337,2023-12-27,27,12,3,52,4,6.0,637.0,299.0,1435.0,...,185.0,791.0,1089.0,66.0,622.0,169.0,1014.0,159.0,1142.0,412.0
338,2023-12-28,28,12,4,52,4,17.0,870.0,332.0,2056.0,...,201.0,694.0,961.0,45.0,367.0,103.0,693.0,196.0,702.0,293.0
339,2023-12-29,29,12,5,52,5,4.0,513.0,290.0,1549.0,...,48.0,467.0,858.0,118.0,416.0,162.0,679.0,163.0,756.0,255.0


In [21]:
# Voeg 'week_of_month' toe aan de combinatie van categorische features
pivot_df['category_comb'] = (
    pivot_df['day'].astype(str) + '_' + 
    pivot_df['weekday'].astype(str) + '_' + 
    pivot_df['week_of_month'].astype(str)
)

# Encodeer de gecombineerde categorieën
pivot_df['category_encoded'] = pivot_df['category_comb'].astype('category').cat.codes

# Voeg de nieuwe kolommen in na 'week_of_month'
pivot_df.insert(pivot_df.columns.get_loc('week_of_month') + 1, 'category_comb', pivot_df.pop('category_comb'))
pivot_df.insert(pivot_df.columns.get_loc('week_of_month') + 2, 'category_encoded', pivot_df.pop('category_encoded'))

pivot_df


Unnamed: 0,scanning_date,day,month,weekday,week,week_of_month,category_comb,category_encoded,no_of_events_output_belt_0,no_of_events_output_belt_1,...,no_of_events_output_belt_342,no_of_events_output_belt_343,no_of_events_output_belt_344,no_of_events_output_belt_345,no_of_events_output_belt_346,no_of_events_output_belt_347,no_of_events_output_belt_348,no_of_events_output_belt_349,no_of_events_output_belt_350,no_of_events_output_belt_351
0,2023-01-02,2,1,1,1,1,2_1_1,138,1.0,533.0,...,681.0,725.0,598.0,387.0,419.0,611.0,739.0,660.0,751.0,546.0
1,2023-01-03,3,1,2,1,1,3_2_1,158,5.0,753.0,...,346.0,854.0,682.0,264.0,621.0,231.0,1351.0,251.0,1173.0,479.0
2,2023-01-04,4,1,3,1,1,4_3_1,166,5.0,552.0,...,756.0,753.0,685.0,306.0,570.0,425.0,1046.0,522.0,932.0,462.0
3,2023-01-05,5,1,4,1,1,5_4_1,174,2.0,713.0,...,409.0,635.0,511.0,232.0,454.0,353.0,730.0,429.0,784.0,316.0
4,2023-01-06,6,1,5,1,1,6_5_1,182,1.0,128.0,...,0.0,66.0,23.0,0.0,37.0,0.0,66.0,0.0,61.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336,2023-12-26,26,12,2,52,4,26_2_4,113,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
337,2023-12-27,27,12,3,52,4,27_3_4,121,6.0,637.0,...,185.0,791.0,1089.0,66.0,622.0,169.0,1014.0,159.0,1142.0,412.0
338,2023-12-28,28,12,4,52,4,28_4_4,128,17.0,870.0,...,201.0,694.0,961.0,45.0,367.0,103.0,693.0,196.0,702.0,293.0
339,2023-12-29,29,12,5,52,5,29_5_5,135,4.0,513.0,...,48.0,467.0,858.0,118.0,416.0,162.0,679.0,163.0,756.0,255.0


In [23]:
x_days = 5
output_belts = [col for col in pivot_df.columns if col.startswith('no_of_events_output_belt')]

input_sequences = []
target_sequences = []

# Sequenties genereren
for i in range(len(pivot_df) - x_days):
    # Voeg 'week_of_month' en 'category_encoded' toe aan de input sequentie
    input_seq = pivot_df.iloc[i:i + x_days][output_belts + ['day', 'weekday', 'week_of_month', 'category_encoded']].values
    target_features = pivot_df.iloc[i + x_days][['day', 'month', 'weekday', 'week', 'week_of_month', 'category_encoded']].values
    
    input_sequences.append(input_seq)
    target_sequences.append(pivot_df.iloc[i + x_days][output_belts].values)

# Data splitsen op volgorde
split_ratio = 0.7  # 70% training, 30% testing
split_index = int(len(input_sequences) * split_ratio)

input_train = input_sequences[:split_index]
input_test = input_sequences[split_index:]
target_train = target_sequences[:split_index]
target_test = target_sequences[split_index:]

# Convert to tensors
input_train_tensor = torch.tensor(input_train, dtype=torch.float32)
input_test_tensor = torch.tensor(input_test, dtype=torch.float32)
target_train_tensor = torch.tensor(target_train, dtype=torch.float32)
target_test_tensor = torch.tensor(target_test, dtype=torch.float32)

print(f"Train input tensor shape: {input_train_tensor.shape}")
print(f"Test input tensor shape: {input_test_tensor.shape}")
print(f"Train target tensor shape: {target_train_tensor.shape}")
print(f"Test target tensor shape: {target_test_tensor.shape}")



Train input tensor shape: torch.Size([235, 5, 154])
Test input tensor shape: torch.Size([101, 5, 154])
Train target tensor shape: torch.Size([235, 150])
Test target tensor shape: torch.Size([101, 150])


## RNN from youtube

In [24]:
# Parameters
input_size = input_train_tensor.shape[2]  # Aantal input features per dag
hidden_size = 64  # Aantal neuronen in de verborgen laag
output_size = target_train_tensor.shape[1]  # Aantal output belts (de te voorspellen no_of_events)


class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, dropout_prob=0.5):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_prob)  # Dropout layer

    def forward(self, input_tensor, hidden_tensor):
        combined = torch.cat((input_tensor, hidden_tensor), 1)
        hidden = self.i2h(combined)
        hidden = self.dropout(hidden)  # Apply dropout
        output = self.i2o(combined)  
        return output, hidden
    
    def init_hidden(self):
        return torch.zeros(1, self.hidden_size)

In [26]:
# Training
num_epochs = 300
for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0
    
    # Training loop
    for i in range(len(input_train_tensor)):
        input_seq = input_train_tensor[i].unsqueeze(0)  # Vorm: [1, sequence_length, input_size]
        target = target_train_tensor[i].unsqueeze(0)    # Vorm: [1, output_size]
        hidden = model.init_hidden().to(device)  # Initialiseer hidden op de GPU

        for t in range(input_seq.shape[1]):
            output, hidden = model(input_seq[:, t, :], hidden)
        
        loss = criterion(output, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_train_loss += loss.item()
    
    # Test/evaluatie na elke epoch
    model.eval()
    total_test_loss = 0
    with torch.no_grad():  # Geen gradients tijdens evaluatie
        for i in range(len(input_test_tensor)):
            input_seq = input_test_tensor[i].unsqueeze(0)  # Vorm: [1, sequence_length, input_size]
            target = target_test_tensor[i].unsqueeze(0)    # Vorm: [1, output_size]
            hidden = model.init_hidden().to(device)  # Initialiseer hidden op de GPU

            for t in range(input_seq.shape[1]):
                output, hidden = model(input_seq[:, t, :], hidden)
            
            loss = criterion(output, target)
            total_test_loss += loss.item()
    
    # Gemiddelde loss voor train en test
    avg_train_loss = total_train_loss / len(input_train_tensor)
    avg_test_loss = total_test_loss / len(input_test_tensor)

    # Elke 10 epochs printen van de loss
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}, Test Loss: {avg_test_loss:.4f}')


Epoch [10/300], Train Loss: 349.8331, Test Loss: 405.0298


KeyboardInterrupt: 

In [29]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

# Controleer of CUDA beschikbaar is
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# RNN-model aanpassen
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, dropout_prob=0.5):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_prob)  # Dropout layer

    def forward(self, input_tensor, hidden_tensor):
        combined = torch.cat((input_tensor, hidden_tensor), 1)
        hidden = self.i2h(combined)
        hidden = self.dropout(hidden)  # Apply dropout
        output = self.i2o(combined)
        return output, hidden

    def init_hidden(self, batch_size):
        return torch.zeros(batch_size, self.hidden_size).to(device)

# Parameters
input_size = input_train_tensor.shape[2]  # Aantal input features per dag
hidden_size = 64  # Aantal neuronen in de verborgen laag
output_size = target_train_tensor.shape[1]  # Aantal output belts (de te voorspellen no_of_events)
batch_size = 10  # Stel een batchgrootte in

# Model, loss en optimizer
model = RNN(input_size, hidden_size, output_size).to(device)
criterion = nn.L1Loss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Verplaats tensors naar het gekozen apparaat (GPU of CPU)
input_train_tensor = input_train_tensor.to(device)
input_test_tensor = input_test_tensor.to(device)
target_train_tensor = target_train_tensor.to(device)
target_test_tensor = target_test_tensor.to(device)

# Maak TensorDataset en DataLoader voor training en testen
train_dataset = TensorDataset(input_train_tensor, target_train_tensor)
test_dataset = TensorDataset(input_test_tensor, target_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Training
num_epochs = 300
for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0
    
    # Training loop met DataLoader
    for input_batch, target_batch in train_loader:
        hidden = model.init_hidden(input_batch.size(0))  # Initialiseer hidden met de juiste batchgrootte
        
        # Model output berekenen voor elke tijdstap in de sequentie
        for t in range(input_batch.shape[1]):
            output, hidden = model(input_batch[:, t, :], hidden)
        
        loss = criterion(output, target_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_train_loss += loss.item()
    
    # Test/evaluatie na elke epoch
    model.eval()
    total_test_loss = 0
    with torch.no_grad():  # Geen gradients tijdens evaluatie
        for input_batch, target_batch in test_loader:
            hidden = model.init_hidden(input_batch.size(0))  # Initialiseer hidden met de juiste batchgrootte
            
            # Model output berekenen voor elke tijdstap in de sequentie
            for t in range(input_batch.shape[1]):
                output, hidden = model(input_batch[:, t, :], hidden)
            
            loss = criterion(output, target_batch)
            total_test_loss += loss.item()
    
    # Gemiddelde loss voor train en test
    avg_train_loss = total_train_loss / len(train_loader)
    avg_test_loss = total_test_loss / len(test_loader)

    # Elke 10 epochs printen van de loss
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}, Test Loss: {avg_test_loss:.4f}')


Epoch [10/300], Train Loss: 341.2193, Test Loss: 437.4738
Epoch [20/300], Train Loss: 286.5108, Test Loss: 384.8653
Epoch [30/300], Train Loss: 280.4655, Test Loss: 379.9514
Epoch [40/300], Train Loss: 267.5373, Test Loss: 377.8096
Epoch [50/300], Train Loss: 262.5787, Test Loss: 380.7123
Epoch [60/300], Train Loss: 244.4293, Test Loss: 378.3222
Epoch [70/300], Train Loss: 243.6894, Test Loss: 385.2857
Epoch [80/300], Train Loss: 248.2501, Test Loss: 388.0321
Epoch [90/300], Train Loss: 241.0759, Test Loss: 395.3552
Epoch [100/300], Train Loss: 236.2579, Test Loss: 394.0209
Epoch [110/300], Train Loss: 225.0729, Test Loss: 393.9832
Epoch [120/300], Train Loss: 228.6318, Test Loss: 392.2353
Epoch [130/300], Train Loss: 221.9872, Test Loss: 399.3401
Epoch [140/300], Train Loss: 229.6845, Test Loss: 406.5865
Epoch [150/300], Train Loss: 212.1147, Test Loss: 414.4524
Epoch [160/300], Train Loss: 217.4417, Test Loss: 405.5456
Epoch [170/300], Train Loss: 219.0091, Test Loss: 436.2923
Epoch 