# Occupancy Forecasting Notebook

In [9]:
# Imports
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split, KFold

In [10]:
# Load the cleaned dataset and apply the column dtypes stored alongside it
file_name = "frequency_data"

dtypes = pd.read_csv(f"data/cleaned_data/{file_name}_dtypes.csv", index_col=0)
data = pd.read_csv(f"data/cleaned_data/{file_name}.csv").astype(dtypes.to_dict()["0"])

# Store the timestamps under "datetime" and remove the original "time" column
data["datetime"] = data["time"]
data = data.drop(columns="time")


### Data Utility Functions

In [11]:
def filter_room_id(data, room_id):
    """Return the rows of `data` whose "room_id" equals `room_id`, re-indexed from 0."""
    room_mask = data["room_id"].eq(room_id)
    return data.loc[room_mask].reset_index(drop=True)

def filter_event_type(data, event_type):
    """Return the rows of `data` whose "event_type" equals `event_type`, re-indexed from 0."""
    event_mask = data["event_type"].eq(event_type)
    return data.loc[event_mask].reset_index(drop=True)

def resample(data, time_column, frequency, agg_func):
    """Aggregate rows into fixed time bins on a gap-free time grid.

    The grid starts at midnight of the day of the earliest timestamp (or at
    midnight of the first day of that month when `frequency` is "MS") and
    ends at midnight of the day after the latest timestamp, both ends
    included. Bins without any rows are filled with 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding a datetime column named `time_column` and an
        "event_type" column.
    time_column : str
        Name of the datetime column to bin on.
    frequency : str
        pandas offset alias for the bin width, e.g. "15min" or "MS".
    agg_func : {"sum", "count", "value_counts"}
        Aggregation applied per bin; forwarded to the resampler method of
        the same name.

    Returns
    -------
    pandas.DataFrame
        Columns `[time_column, "event_type"]`, one row per grid point.

    Raises
    ------
    ValueError
        If `data` has no rows or `agg_func` is not one of the supported names.
    """
    timestamps = data[time_column]
    if timestamps.empty:
        raise ValueError("cannot resample an empty DataFrame")

    # normalize() truncates to midnight including sub-second parts, so the
    # grid lines up with the bins produced by resample() below.
    min_time = timestamps.min().normalize()
    if frequency == "MS":
        min_time = min_time.replace(day=1)

    # Calendar arithmetic instead of replace(day=day + 1): the latter raises
    # on the last day of a month because day 31/32 does not exist.
    max_time = timestamps.max().normalize() + pd.DateOffset(days=1)

    idx = pd.date_range(start=min_time, end=max_time, freq=frequency, inclusive="both")

    df_resampled = data.set_index(time_column)\
                   .resample(frequency, label="left", closed="left")

    if agg_func == "sum":
        df_resampled = df_resampled.sum()
    elif agg_func == "count":
        df_resampled = df_resampled.count()
    elif agg_func == "value_counts":
        df_resampled = df_resampled.value_counts()
    else:
        raise ValueError("agg_func must be 'sum', 'count' or 'value_counts'")

    # Reindexing onto the full grid inserts the empty bins (as 0); the index
    # comes back as a column named "index" and is renamed to `time_column`.
    return df_resampled.reindex(idx, fill_value=0).reset_index().rename(columns={"index": time_column})[[time_column, "event_type"]]

def derive_day(data):
    """Add a "day" column holding the calendar date of each "datetime" value.

    The column is written onto `data` itself, and that same frame is returned.
    """
    timestamps = data["datetime"]
    data["day"] = timestamps.dt.date
    return data

def derive_week(data):
    """Add a "week" column holding the ISO calendar week of each "datetime" value.

    The column is written onto `data` itself, and that same frame is returned.
    """
    iso_calendar = data["datetime"].dt.isocalendar()
    data["week"] = iso_calendar.week
    return data

def derive_time(data):
    """Add a "time" column holding the time of day of each "datetime" value.

    The column is written onto `data` itself, and that same frame is returned.
    """
    timestamps = data["datetime"]
    data["time"] = timestamps.dt.time
    return data

def derive_weekday(data):
    """Add a "weekday" column (Monday = 0 ... Sunday = 6) for each "datetime" value.

    The column is written onto `data` itself, and that same frame is returned.
    """
    timestamps = data["datetime"]
    data["weekday"] = timestamps.dt.weekday
    return data

In [12]:
# Restrict the data to room 0 and event type 1
data_filterd = filter_event_type(filter_room_id(data, 0), 1)

## 0. Basic Statistics and Plots

Bar Charts

In [13]:
# Paper:
# Bar chart of how often each event count occurs per 5-minute bin
# Check if it follows some distribution

data_resampled = resample(data_filterd, "datetime", "5min", "count")

# keep only the bins that contain at least one event
data_trimmed = data_resampled.loc[data_resampled["event_type"] > 0]
value_counts = data_trimmed.value_counts("event_type")

# bar trace: x = events per bin, y = number of bins with that many events
fig = go.Figure(
    data=[
        go.Bar(
            x=value_counts.index,
            y=value_counts,
            name='value_counts',
        )
    ]
)
fig


Boxplot

In [14]:
# Box plot of the 15-minute event counts for every time of day
# Do for different events and rooms
data_resampled = resample(data_filterd, "datetime", "15min", "count")
data_resampled = derive_week(derive_weekday(derive_time(data_resampled)))
data_resampled = data_resampled.drop(columns="datetime")

# sum the counts per (week, time, weekday) and keep the rows with weekday == 0
data_plot = (
    data_resampled
    .groupby(["week", "time", "weekday"])
    .sum()
    .reset_index()
    .loc[lambda frame: frame["weekday"] == 0]
)

print(data_plot)
fig = px.box(data_plot, x="time", y="event_type")
fig.show()

      week      time  weekday  event_type
0       15  00:00:00        0           0
7       15  00:15:00        0           0
14      15  00:30:00        0           0
21      15  00:45:00        0           0
28      15  01:00:00        0           0
...    ...       ...      ...         ...
7848    26  22:45:00        0           0
7853    26  23:00:00        0           0
7858    26  23:15:00        0           0
7863    26  23:30:00        0           0
7868    26  23:45:00        0           0

[1152 rows x 4 columns]


More plots in the other notebook

## 1. Patterns

clustering -> different algorithms, k-means

Papers tried: 

### 1.1 Occupancy data analytics and prediction: A case study - Xin Liang, Tianzhen Hong, Geoffrey Qiping Shen

Distance Measures: Euclidean, Dynamic Time Warping, Correlation Similarity -> compared across all configurations

Clustering Algorithms: K-Means

Algorithm Evaluation: Davies-Bouldin index, to find best k

Insights of the paper:

* High variability in the data: statistical methods that rely on some kind of mean will fail miserably!

In [59]:
def filter_by_time(data, time_column, start_time, end_time):
    """Keep the rows whose time of day lies between `start_time` and `end_time`.

    Both bounds are inclusive; the original row index is preserved.
    """
    time_of_day = data[time_column].dt.time
    not_too_early = time_of_day >= start_time
    not_too_late = time_of_day <= end_time
    return data[not_too_early & not_too_late]



In [60]:
# split into daily samples
data_resampled = resample(data_filterd, "datetime", "15min", "count")
data_resampled = derive_day(derive_week(derive_weekday(derive_time(data_resampled))))

# keep only the 08:00-20:00 window of every day (both bounds inclusive)
data_resampled = filter_by_time(
    data_resampled,
    "datetime",
    datetime.strptime("08:00:00", "%H:%M:%S").time(),
    datetime.strptime("20:00:00", "%H:%M:%S").time(),
)

# one (day, frame) pair per calendar day; the last day is left out below
daily_groups = list(data_resampled.groupby("day"))[:-1]

samples = np.array([day_frame["event_type"].values for _, day_frame in daily_groups])
times = np.array([day_frame["datetime"].values for _, day_frame in daily_groups])
group_list = np.array([day for day, _ in daily_groups])


In [16]:
#train, test = train_test_split(samples, test_size=0.1, random_state=42)

#train = np.array(train)
#test = np.array(test)

#kf = KFold(n_splits=10)

#for train_index, valid_index in kf.split(train):
    
#    train_fold = train[train_index]
#    valid_fold = train[valid_index]
    
#    break

In [75]:
# Cluster the daily profiles with k-means (KMeans is imported in the imports cell)
n_clusters = 7

# random_state pins the random centroid initialisation so that the cluster
# labels are reproducible across kernel restarts. n_init is given explicitly
# because its default changed between scikit-learn releases; 10 is the value
# used for init="random" both before and after that change.
clustering_model = KMeans(
    n_clusters=n_clusters,
    init="random",
    n_init=10,
    random_state=42,
)
clustering_model.fit(samples)

indices = np.arange(len(samples))

In [76]:
def plot_cluster_centers(n_clusters, clustering_model, times):
    """Build a figure with one line-and-marker trace per cluster center.

    Parameters
    ----------
    n_clusters : int
        Number of centers to draw, taken from the start of
        `clustering_model.cluster_centers_`.
    clustering_model : fitted model exposing `cluster_centers_`
    times : sequence
        Shared x-axis values for every center.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    center_traces = [
        go.Scatter(
            x=times,
            y=clustering_model.cluster_centers_[cluster_id],
            name=f'cluster {cluster_id}',
            mode="lines+markers",
        )
        for cluster_id in range(n_clusters)
    ]
    return go.Figure(data=center_traces)

# Show all cluster centers on the time axis of the first daily sample
plot_cluster_centers(
    n_clusters=n_clusters,
    clustering_model=clustering_model,
    times=times[0],
).show()

In [77]:
def plot_cluster_details(cluster_number, clustering_model, samples, time_axis):
    """Plot one cluster's center together with every sample assigned to it.

    A banner with the cluster number is printed before the figure is built.

    Parameters
    ----------
    cluster_number : int
        Label of the cluster to show.
    clustering_model : fitted model exposing `labels_` and `cluster_centers_`
    samples : array-like
        The samples the model was fitted on, indexable by position.
    time_axis : sequence
        x-axis values used for the center and for every member sample.

    Returns
    -------
    plotly.graph_objects.Figure
        First trace is the center (named "Center"); one unnamed trace per
        member sample follows.
    """
    # positions of the samples that were assigned to this cluster
    member_indices = np.flatnonzero(clustering_model.labels_ == cluster_number)
    cluster_center = clustering_model.cluster_centers_[cluster_number]

    print(f"################ {cluster_number} ################")

    fig = go.Figure()

    # cluster center
    fig.add_trace(
        go.Scatter(
            x=time_axis,
            y=cluster_center,
            name='Center',
            mode="lines+markers"
        )
    )

    # member samples; drawn against the `time_axis` argument rather than a
    # notebook-level variable so the function depends only on its parameters
    for member_index in member_indices:
        fig.add_trace(
            go.Scatter(
                x=time_axis,
                y=samples[member_index],
                mode="lines+markers"
            )
        )

    return fig
    
# Show cluster 0 with its member samples on the time axis of the first daily sample
plot_cluster_details(
    cluster_number=0,
    clustering_model=clustering_model,
    samples=samples,
    time_axis=times[0],
).show()

################ 0 ################


In [20]:
# Print the member samples (positions and dates) of every cluster
for i in range(n_clusters):

    cluster_indices = np.flatnonzero(clustering_model.labels_ == i)
    cluster_group_list = group_list[cluster_indices]

    print(f"################ {i} ################")
    print(cluster_indices)
    print(cluster_group_list)
    print()

# Draw all cluster centers with the shared helper instead of repeating its loop
fig = plot_cluster_centers(n_clusters, clustering_model, times[0])
fig.show()

################ 0 ################
[ 6 12 13 19 20 23 25 26 27 31 32 33 34 40 41 42 43 46 47 48 52 53 54 55
 60 61 62 67 68 69 74 75 76]
[datetime.date(2024, 4, 14) datetime.date(2024, 4, 20)
 datetime.date(2024, 4, 21) datetime.date(2024, 4, 27)
 datetime.date(2024, 4, 28) datetime.date(2024, 5, 1)
 datetime.date(2024, 5, 3) datetime.date(2024, 5, 4)
 datetime.date(2024, 5, 5) datetime.date(2024, 5, 9)
 datetime.date(2024, 5, 10) datetime.date(2024, 5, 11)
 datetime.date(2024, 5, 12) datetime.date(2024, 5, 18)
 datetime.date(2024, 5, 19) datetime.date(2024, 5, 20)
 datetime.date(2024, 5, 21) datetime.date(2024, 5, 24)
 datetime.date(2024, 5, 25) datetime.date(2024, 5, 26)
 datetime.date(2024, 5, 30) datetime.date(2024, 5, 31)
 datetime.date(2024, 6, 1) datetime.date(2024, 6, 2)
 datetime.date(2024, 6, 7) datetime.date(2024, 6, 8)
 datetime.date(2024, 6, 9) datetime.date(2024, 6, 14)
 datetime.date(2024, 6, 15) datetime.date(2024, 6, 16)
 datetime.date(2024, 6, 21) datetime.date(2024,

## 2. Train and Test split

In [6]:
# Restrict the data to room 0 and event type 1
data_filterd = filter_event_type(filter_room_id(data, 0), 1)

## 3. Occupancy Forecasting

In [None]:
# check out some of the papers in the review
# start with some traditional methods

## 4. Count Forecasting
### Try more general approach if 1. is too hard, gather literature on count forecasting