In [204]:


from _dfguru import DataFrameGuru as DFG
from _occupancy_forecasting import MasterTrainer
from _occupancy_forecasting import load_data
from _occupancy_forecasting import avoid_name_conflicts
from _evaluating import ParameterSearch

import torch
from torch import nn
from torch.utils.tensorboard import SummaryWriter

import torch.multiprocessing
torch.multiprocessing.set_sharing_strategy('file_system')

import numpy as np
import os
# Shared DataFrameGuru instance handed to the loader and the plotter.
dfg = DFG()
# Release cached CUDA memory held by PyTorch before doing anything else.
torch.cuda.empty_cache()


############ Inputs ############
# Command-line parsing is disabled here; the values are fixed by hand instead.
#args = parse_arguments()
#args = prompt_for_missing_arguments(args)
#n_run = args.n_run
#n_param = args.n_param

n_run, n_param = 10, 0
overwrite = True
################################

# Where the data lives and how it is resampled / split.
path_to_data = "data/occupancy_forecasting"
frequency, split_by = "5min", "time"

# Parameter, TensorBoard and checkpoint directories of the "wrap_up" runs.
param_dir = "_occupancy_forecasting/parameters/wrap_up"
tb_log_dir = "_occupancy_forecasting/training_logs/wrap_up"
cp_log_dir = "_occupancy_forecasting/checkpoints/wrap_up"


# Each returned object is indexed by room id (see the per-room cells below).
train_dict, val_dict, test_dict = load_data(
    path_to_data_dir=path_to_data,
    frequency=frequency,
    split_by=split_by,
    dfguru=dfg,
    with_examweek=False,
)


# Training frame of room 0, used for the feature plots.
data = train_dict[0]


In [205]:
import matplotlib.pyplot as plt

import datetime
import json


# Read the stored min/max bounds used to scale "registered" and "tl".
helper_file = os.path.join("data/occupancy_forecasting", "helpers_occpred.json")
with open(helper_file, "r") as helper_handle:
    helper = json.load(helper_handle)

normalization_bounds = helper["columns_to_normalize"]
norm_registered, norm_temperature = (
    normalization_bounds[name] for name in ("registered", "tl")
)

In [206]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio

from _plotting import DataPlotter

### Plot Some Features on Slide of 17.12 Presentation

In [207]:
# Plot window: 2024-04-16 from 07:00:00 to 21:00:00.
start = datetime.datetime(year=2024, month=4, day=16, hour=7, minute=0, second=0)
stop = datetime.datetime(year=2024, month=4, day=16, hour=21, minute=0, second=0)

plot_data = dfg.filter_by_timestamp(data, "datetime", start, stop)

# Min-max scale temperature ("tl") and registrations with the stored bounds,
# writing the scaled values back into the same columns.
for column, bounds in (("tl", norm_temperature), ("registered", norm_registered)):
    lower, upper = bounds["min"], bounds["max"]
    plot_data[column] = (plot_data[column] - lower) / (upper - lower)

plotter = DataPlotter(save_path="", dataframe_guru=dfg)

#plotter.plot_some_features(plot_data)


### Extract ranges and size of train/val/test set

In [208]:

split_frames = (("Train", train_dict), ("Val", val_dict), ("Test", test_dict))

for room_id in [0, 1]:
    # One line per split: number of samples and the covered datetime range.
    for split_name, frames in split_frames:
        timestamps = frames[room_id]["datetime"]
        print(f"{split_name} Room {room_id}: {len(frames[room_id])} samples from {timestamps.min()} to {timestamps.max()}")
    print()

Train Room 0: 17728 samples from 2024-04-08 00:00:00 to 2024-06-08 13:15:00
Val Room 0: 2216 samples from 2024-06-08 13:20:00 to 2024-06-16 05:55:00
Test Room 0: 2216 samples from 2024-06-16 06:00:00 to 2024-06-23 22:35:00

Train Room 1: 17728 samples from 2024-04-08 00:00:00 to 2024-06-08 13:15:00
Val Room 1: 2216 samples from 2024-06-08 13:20:00 to 2024-06-16 05:55:00
Test Room 1: 2216 samples from 2024-06-16 06:00:00 to 2024-06-23 22:35:00



### Show class imbalances -> zero problem

In [209]:
# Report how many timesteps have occrate == 0, per room and split and overall.

def _print_zero_share(label, occrate):
    """Print the absolute and relative number of entries in `occrate` equal to 0."""
    zero_count = len(occrate[occrate == 0])
    print(f"{label}: zero:{zero_count} total:{len(occrate)} relative:{zero_count / len(occrate)}")


for room_id in [0, 1]:
    for split_name, frames in (("Train", train_dict), ("Val", val_dict), ("Test", test_dict)):
        _print_zero_share(f"{split_name} Room {room_id}", frames[room_id]["occrate"])
    print()


# Same statistic over both rooms together.
train_occrate = np.concatenate([train_dict[0]["occrate"], train_dict[1]["occrate"]])
val_occrate = np.concatenate([val_dict[0]["occrate"], val_dict[1]["occrate"]])
test_occrate = np.concatenate([test_dict[0]["occrate"], test_dict[1]["occrate"]])

_print_zero_share("Train", train_occrate)
_print_zero_share("Val", val_occrate)
_print_zero_share("Test", test_occrate)

Train Room 0: zero:13653 total:17728 relative:0.7701376353790613
Val Room 0: zero:1820 total:2216 relative:0.8212996389891697
Test Room 0: zero:1727 total:2216 relative:0.7793321299638989

Train Room 1: zero:14132 total:17728 relative:0.7971570397111913
Val Room 1: zero:1842 total:2216 relative:0.8312274368231047
Test Room 1: zero:1868 total:2216 relative:0.8429602888086642

Train: zero:27785 total:35456 relative:0.7836473375451264
Val: zero:3662 total:4432 relative:0.8262635379061372
Test: zero:3595 total:4432 relative:0.8111462093862816


### Features that make sense

In [210]:
# Inspect which columns the room-0 training frame provides.
train_dict[0].columns

Index(['datetime', 'avgocc', 'occrate', 'occcount', 'occcountdiff',
       'occratediff', 'lecturerampafter', 'lecture', 'exam', 'test',
       'registered', 'cancelled', 'lecturerampbefore', 'level', 'type',
       'maxoccrateestimate', 'maxocccountestimate', 'maxocccount', 'tutorium',
       'studyarea', 'maxoccrate', 'offsite', 'coursenumber', 'ects', 'VL',
       'UE', 'KS', 'Informatik', 'None_sa', 'Volkswirtschaftslehre', 'Chemie',
       'Wirtschaftsinformatik', 'Maschinenbau', 'Betriebswirtschaftslehre',
       'Rechtswissenschaften', 'Mathematik', 'Mechatronik',
       'Informationselektronik', 'Biologische Chemie', 'Sozialwissenschaften',
       'Artificial Intelligence', 'Kunststofftechnik', 'Statistik',
       'Pädagogik', 'Medical Engineering', 'B1 - Bachelor 1. Jahr',
       'None_level', 'B2 - Bachelor 2. Jahr', 'M1 - Master 1. Jahr',
       'B3 - Bachelor 3. Jahr', 'D - Diplom', 'M2 - Master 2. Jahr', 'hod1',
       'hod2', 'dow1', 'dow2', 'week1', 'week2', 'holiday', '

In [211]:
# Feature names grouped by origin; each group is a plain set of strings.
course_features = {
    "maxocccount",
    "maxoccrate",
    "maxoccrateestimate",
    "maxocccountestimate",
    "coursenumber",
    "exam",
    "test",
    "tutorium",
    "cancelled",
    "offsite",
    "lecture",
    "lecturerampbefore",
    "lecturerampafter",
    "registered",
    "type",
    "studyarea",
    "ects",
    "level",
}
datetime_features = {
    "dow",
    "hod",
    "week",
    "holiday",
    "zwickltag",
}
general_features = {
    "occcount",
    "occrate",
    "avgocc",
}
weather_features = {
    "weather",
}
shift_features = {
    "occcount1week",
    "occrate1week",
    "occcount1day",
    "occrate1day",
}

In [212]:
# Same column listing again, for comparison with the feature groups defined above.
train_dict[0].columns

Index(['datetime', 'avgocc', 'occrate', 'occcount', 'occcountdiff',
       'occratediff', 'lecturerampafter', 'lecture', 'exam', 'test',
       'registered', 'cancelled', 'lecturerampbefore', 'level', 'type',
       'maxoccrateestimate', 'maxocccountestimate', 'maxocccount', 'tutorium',
       'studyarea', 'maxoccrate', 'offsite', 'coursenumber', 'ects', 'VL',
       'UE', 'KS', 'Informatik', 'None_sa', 'Volkswirtschaftslehre', 'Chemie',
       'Wirtschaftsinformatik', 'Maschinenbau', 'Betriebswirtschaftslehre',
       'Rechtswissenschaften', 'Mathematik', 'Mechatronik',
       'Informationselektronik', 'Biologische Chemie', 'Sozialwissenschaften',
       'Artificial Intelligence', 'Kunststofftechnik', 'Statistik',
       'Pädagogik', 'Medical Engineering', 'B1 - Bachelor 1. Jahr',
       'None_level', 'B2 - Bachelor 2. Jahr', 'M1 - Master 1. Jahr',
       'B3 - Bachelor 3. Jahr', 'D - Diplom', 'M2 - Master 2. Jahr', 'hod1',
       'hod2', 'dow1', 'dow2', 'week1', 'week2', 'holiday', '

#### Features Columns

Essential Features:
* Occupancy information: number of occupants, absolute or relative (divided by room capacity)
* Time stamp: Temporal resolution of t minutes

Course Features:
* Lecture: If a lecture takes place or not
* Date Specific Features: Exam, Test, Tutorium, Cancelled
* Course Specific Features: Registered students, Type (VL,UE,KS), Study area, Level, Course number

Time-related Features:
* Time, Weekday, (Calendar week)
* Holiday, Zwickltag

Weather Features:
* Temperature, Air pressure, Precipitation (sum over time interval), Wind speed, Air humidity, Sunshine duration

Additional Features:
* Average occupancy information of last k weeks

'VL', 'UE', 'KS', 

Study area: Maybe try with learnable parameter
'Informatik', 'None_sa',
'Volkswirtschaftslehre', 'Chemie', 'Wirtschaftsinformatik',
'Maschinenbau', 'Betriebswirtschaftslehre', 'Rechtswissenschaften',
'Mathematik', 'Mechatronik', 'Informationselektronik',
'Biologische Chemie', 'Sozialwissenschaften', 'Artificial Intelligence',
'Kunststofftechnik', 'Statistik', 'Pädagogik', 'Medical Engineering',

Level: Maybe try with learnable parameter
'B1 - Bachelor 1. Jahr', 'None_level', 'B2 - Bachelor 2. Jahr',
'M1 - Master 1. Jahr', 'B3 - Bachelor 3. Jahr', 'D - Diplom',
'M2 - Master 2. Jahr', 

In [213]:
# General Columns
# datetime, occupancy information (occcount, occrate=occcount/room_capacity)
# lecture ?

# Columns Concerning Specific Course Dates
# 

### Read out results files

In [214]:
# read txt file with training results

# Keep both the raw lines and the full text of the results log.
with open("results_wrapup_normal.txt", "r") as results_file:
    lines = list(results_file)
line_str = "".join(lines)

In [215]:
# Run sections are separated by two blank lines; empty chunks are removed.
# NOTE(review): the final chunk is discarded as well, presumably a trailing
# or incomplete entry; confirm against the results file.
list_of_runs = [chunk for chunk in line_str.split("\n\n\n") if chunk != ""][:-1]

# Work on a copy so the selection can be narrowed without touching the full list.
runs_of_interest = list(list_of_runs)

runs_of_interest

["################# Run: 0 #################\nTime: 2024-11-25 12:48:24\nDataset: train | Loss: MAE\nCombinations: [[0, 9], [0, 4], [0, 1], [0, 5], [0, 10], [0, 7], [0, 6], [0, 2], [0, 3], [0, 8], [0, 0]]\nModel Losses: [0.009174, 0.009215, 0.009297, 0.009307, 0.009342, 0.009409, 0.009479, 0.009513, 0.009543, 0.009601, 0.011038]\nBL zero Losses: [0.025558, 0.025558, 0.025558, 0.025558, 0.025558, 0.025558, 0.025558, 0.025558, 0.025558, 0.025558, 0.025558]\nBL naive Losses: [0.023304, 0.023304, 0.023304, 0.023304, 0.023304, 0.023304, 0.023304, 0.023304, 0.023304, 0.023304, 0.023304]\nBL avg Losses: [0.023492, 0.023492, 0.023492, 0.023492, 0.023492, 0.023492, 0.023492, 0.023492, 0.023492, 0.023492, 0.023492]\nHyperparameters: [('course_encoding_dim', [(np.str_('1'), np.int64(5))]), ('dataset_mode', [(np.str_('normal'), np.int64(5))]), ('dropout', [(np.str_('0'), np.int64(5))]), ('features', [(np.str_('occrate_lecture'), np.int64(1)), (np.str_('occrate_lecture_exam_cancelled_tutorium_test'

In [216]:
import ast

In [222]:
import pandas as pd

# Maps the label of a loss line in the results log to its DataFrame column.
LOSS_LABEL_TO_COLUMN = {
    "Model Losses": "model_losses",
    "BL zero Losses": "zero_baselines",
    "BL naive Losses": "naive_baselines",
    "BL avg Losses": "avg_baselines",
}


def parse_run(run, min_run_id=0, max_run_id=10):
    """Parse one run section of the results log into a DataFrame.

    A section starts with a ``... Run: <id> ...`` header line followed by a
    timestamp line. After that, ``Dataset: <name> | Loss: <type>`` lines
    announce the dataset/loss pair that the following
    ``<label>: <python literal>`` lines belong to.

    Args:
        run: Text of a single run section.
        min_run_id: Smallest run id to keep (inclusive).
        max_run_id: Largest run id to keep (inclusive).

    Returns:
        A DataFrame with one row per evaluated combination, or ``None`` when
        the run id lies outside ``[min_run_id, max_run_id]``.

    Raises:
        ValueError: If a result line has no ``label: value`` form, if a
            "Combinations" line appears before any ``Dataset | Loss`` line,
            or if the label is not known.
    """
    run_lines = [line for line in run.split("\n") if line != ""]

    # Header looks like "#### Run: 3 ####", so the id is the third token.
    run_id = int(run_lines[0].split(" ")[2])
    if run_id < min_run_id or run_id > max_run_id:
        return None

    columns = {
        "dataset": [],
        "loss_type": [],
        "combinations": [],
        "model_losses": [],
        "zero_baselines": [],
        "naive_baselines": [],
        "avg_baselines": [],
    }
    # Reset per run so values from a previous section can never leak in.
    dataset = loss_type = None

    # The first two lines are the run header and the timestamp.
    for line in run_lines[2:]:
        fields = line.split("|")

        if len(fields) == 2:
            dataset = fields[0].split(":")[1].strip()
            loss_type = fields[1].split(":")[1].strip()
            continue
        if len(fields) != 1:
            # Lines with more than one bar carry nothing that is parsed here.
            continue

        # Split on the first colon only, so colons inside the value
        # (e.g. in the hyperparameter summary) cannot break the unpacking.
        label, separator, payload = line.partition(":")
        if not separator:
            raise ValueError(f"Run {run_id}: cannot parse line {line!r}")
        label = label.strip()

        if label == "Hyperparameters":
            continue

        values = np.array(ast.literal_eval(payload.strip()))

        if label == "Combinations":
            if dataset is None or loss_type is None:
                raise ValueError(
                    f"Run {run_id}: 'Combinations' found before any 'Dataset | Loss' line"
                )
            columns["dataset"].extend([dataset] * len(values))
            columns["loss_type"].extend([loss_type] * len(values))
            columns["combinations"].extend(values)
        elif label in LOSS_LABEL_TO_COLUMN:
            columns[LOSS_LABEL_TO_COLUMN[label]].extend(values)
        else:
            raise ValueError(f"Run {run_id}: unknown entry {label!r} with value {values}")

    return pd.DataFrame({"run_id": run_id, **columns})


list_of_dfs = []
for run in runs_of_interest:
    run_df = parse_run(run)
    if run_df is not None:
        list_of_dfs.append(run_df)

In [223]:
# Stack the per-run frames into one table with a fresh 0..n-1 index.
results_df = pd.concat(list_of_dfs, ignore_index=True)

In [225]:

# iterate through all combinations and load hyperparameters
import json
path_to_checkpoints = "_occupancy_forecasting/checkpoints/wrap_up"
for idx, row in results_df.iterrows():
    comb = row["combinations"]

    # Checkpoints are stored as <root>/run_<run>/comb_<comb>/hyperparameters.json
    comb_path = os.path.join(path_to_checkpoints, f"run_{comb[0]}/comb_{comb[1]}")
    hyperparameters_path = os.path.join(comb_path, "hyperparameters.json")

    # Context manager closes the file right away; this loop opens one file
    # per row, so unclosed handles would pile up otherwise.
    with open(hyperparameters_path, "r") as hyperparameters_file:
        hyperparameters = json.load(hyperparameters_file)

    # overwrite combinations with tuple of run_id and comb_id
    results_df.at[idx, "combinations"] = (row["run_id"], comb[1])

    # add all hyperparameters to the dataframe, stored as strings
    for key, value in hyperparameters.items():
        results_df.at[idx, key] = str(value)

print(len(results_df))

1026


In [273]:
# filter out all runs with a certain loss type

def filter_dataframe_by_column_value(df, column, value):
    """Return the rows of `df` whose `column` equals `value`, re-indexed from 0.

    The input frame is left untouched; the result is a new frame.
    """
    matches = df[column] == value
    selected = df[matches]
    return selected.reset_index(drop=True)


# Settings every kept row must match. Hyperparameter columns were written
# with str(...), so the values compared here are strings.
fixed_settings = [
    ("loss_type", "MAE"),
    ("model_class", "ed_lstm"),
    ("split_by", "time"),
    ("frequency", "5min"),
    ("lr", "0.001"),
    ("batch_size", "32"),
    ("with_examweek", "False"),
    ("course_encoding_dim", "3"),
    ("room_ids", "[0]"),
    ("num_layers", "3"),
    ("hidden_size", "[32, 32]"),
    ("y_horizon", "36"),
]

results_filt = results_df
for column, value in fixed_settings:
    results_filt = filter_dataframe_by_column_value(results_filt, column, value)

# finalize filtering: test-set rows only, best loss first, and only the
# best-scoring row for each feature set
results_out = (
    filter_dataframe_by_column_value(results_filt, "dataset", "test")
    .sort_values(by="model_losses")
    .drop_duplicates(subset=["features"], keep="first")
)
results_out

Unnamed: 0,run_id,dataset,loss_type,combinations,model_losses,zero_baselines,naive_baselines,avg_baselines,info,model_class,...,layer_norm,weight_decay,forget_gate,include_x_features,zero_sample_drop_rate,x_size,y_features_size,y_size,occcount,feature_store
0,9,test,MAE,"(9, 5)",0.008764,0.017787,0.016627,0.011299,Run 8: ed lstm - longer forecasting horizon,ed_lstm,...,False,0,True,True,0.1,2,1,1,False,occrate_coursenumber_exam_test_tutorium_cancel...
1,9,test,MAE,"(9, 7)",0.009796,0.017787,0.016627,0.011299,Run 8: ed lstm - longer forecasting horizon,ed_lstm,...,False,0,True,True,0.1,12,11,1,False,occrate_coursenumber_exam_test_tutorium_cancel...
8,10,test,MAE,"(10, 3)",0.009796,0.017787,0.016627,0.011299,Run 9: weather,ed_lstm,...,False,0,True,True,0.1,9,8,1,False,occrate_coursenumber_exam_test_tutorium_cancel...
3,9,test,MAE,"(9, 2)",0.010636,0.017787,0.016627,0.011299,Run 8: ed lstm - longer forecasting horizon,ed_lstm,...,False,0,True,True,0.1,6,5,1,False,occrate_coursenumber_exam_test_tutorium_cancel...
5,9,test,MAE,"(9, 0)",0.013729,0.017787,0.016627,0.011299,Run 8: ed lstm - longer forecasting horizon,ed_lstm,...,False,0,True,True,0.1,1,0,1,False,occrate_coursenumber_exam_test_tutorium_cancel...
9,10,test,MAE,"(10, 0)",0.014079,0.017787,0.016627,0.011299,Run 9: weather,ed_lstm,...,False,0,True,True,0.1,3,2,1,False,occrate_coursenumber_exam_test_tutorium_cancel...


In [278]:
pretty_columns = ["features", "model_losses", "avg_baselines", "naive_baselines", "zero_baselines"]

# `assign` builds a new frame, so the column is not written into a slice of
# `results_out` (which triggers pandas' SettingWithCopyWarning).
pretty_results = results_out[pretty_columns].assign(
    # split features by "_"
    features=lambda frame: frame["features"].apply(lambda x: x.split("_"))
)
pretty_results

Unnamed: 0,features,model_losses,avg_baselines,naive_baselines,zero_baselines
0,"[occrate, avgocc, coursenumber]",0.008764,0.011299,0.016627,0.017787
1,"[occrate, avgocc, coursenumber, exam, tutorium...",0.009796,0.011299,0.016627,0.017787
8,"[occrate, avgocc, coursenumber, weather]",0.009796,0.011299,0.016627,0.017787
3,"[occrate, avgocc, coursenumber, exam, tutorium...",0.010636,0.011299,0.016627,0.017787
5,[occrate],0.013729,0.011299,0.016627,0.017787
9,"[occrate, avgocc, coursenumber, tl]",0.014079,0.011299,0.016627,0.017787


results_filt

### Old Stuff

In [15]:
"""# 2024-04-09 07:00:00
start = datetime.datetime(2024, 4, 9, 7, 0, 0)
# 2024-04-09 21:00:00
stop = datetime.datetime(2024, 4, 9, 21, 0, 0)

plot_data = dfg.filter_by_timestamp(data, "datetime", start, stop) 

# 3 subplots in 3 rows
fig, axs = plt.subplots(3, 1, figsize=(15, 10))
# plot occrate
axs[0].plot(plot_data["datetime"], plot_data["occrate"], label="ocrate")
# plot registered
registered = (plot_data["registered"] - norm_registered["min"]) / (norm_registered["max"] - norm_registered["min"])
axs[1].plot(plot_data["datetime"], registered, label="registered")
# temperature
temperature = (plot_data["tl"] - norm_temperature["min"]) / (norm_temperature["max"] - norm_temperature["min"])
axs[2].plot(plot_data["datetime"], temperature, label="temperature")
plt.show()"""

'# 2024-04-09 07:00:00\nstart = datetime.datetime(2024, 4, 9, 7, 0, 0)\n# 2024-04-09 21:00:00\nstop = datetime.datetime(2024, 4, 9, 21, 0, 0)\n\nplot_data = dfg.filter_by_timestamp(data, "datetime", start, stop) \n\n# 3 subplots in 3 rows\nfig, axs = plt.subplots(3, 1, figsize=(15, 10))\n# plot occrate\naxs[0].plot(plot_data["datetime"], plot_data["occrate"], label="ocrate")\n# plot registered\nregistered = (plot_data["registered"] - norm_registered["min"]) / (norm_registered["max"] - norm_registered["min"])\naxs[1].plot(plot_data["datetime"], registered, label="registered")\n# temperature\ntemperature = (plot_data["tl"] - norm_temperature["min"]) / (norm_temperature["max"] - norm_temperature["min"])\naxs[2].plot(plot_data["datetime"], temperature, label="temperature")\nplt.show()'

In [16]:

#fig = make_subplots(
#    rows=3, 
#    cols=1, 
#    subplot_titles=("Occupancy Rate", "Registered Students", "Temperature in Linz")
#    )
#x_col = "datetime"

## occupancy rate
#fig.add_trace(
#    go.Scatter(
#        x=plot_data[x_col], 
#        y=plot_data["occrate"],
#        mode='lines', 
#        name='Occupancy Rate'
#        ),
#    row=1, col=1
#    )
## registered students
#registered = (plot_data["registered"] - norm_registered["min"]) / (norm_registered["max"] - norm_registered["min"])
#fig.add_trace(
#    go.Scatter(
#        x=plot_data[x_col], 
#        y=registered,
#        mode='lines', 
#        name='Registered Students'
#        ),
#    row=2, col=1
#    )

## temperature
#temperature = (plot_data["tl"] - norm_temperature["min"]) / (norm_temperature["max"] - norm_temperature["min"])
#fig.add_trace(
#    go.Scatter(
#        x=plot_data[x_col], 
#        y=temperature,
#        mode='lines', 
#        name='Temperature in Linz'
#        ),
#    row=3, col=1
#    )

## set y axis between 0 and 1
#fig.update_yaxes(range=[-0.1, 1], row=1, col=1)
#fig.update_yaxes(range=[-0.1, 1], row=2, col=1)
#fig.update_yaxes(range=[-0.1, 1], row=3, col=1)
#fig.show()