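This notebook preprocesses the bicycle-count data: it loads the raw training and test sets, splits a validation set off the training data, standardizes the numerical weather features, and saves the resulting feature and target sets as CSV and pickle files.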

# Setup

In [26]:
import numpy as np
import matplotlib.pyplot as plt #for displaying plots
import pandas as pd
import seaborn as sns
import tensorflow as tf

from keras.layers import *
from keras.models import Sequential
from keras.models import Model
from keras.optimizers import *
from keras.callbacks import *
from keras import regularizers
from keras.utils import plot_model

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.inspection import permutation_importance
import datetime
import random
import properscoring as ps

from scipy.stats import norm

import shap
import xgboost
import pydot
import graphviz
import os
from pathlib import Path

In [2]:
# Seed Python's, NumPy's and TensorFlow's random number generators with the
# same value so that repeated runs are reproducible and consistent.
SEED = 16
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [3]:
# Change the working directory to the parent folder so that the relative
# "data" paths used below resolve.
# The move is guarded: an unconditional os.chdir("..") climbs one directory
# further every time the cell is re-run, which silently breaks later cells.
# NOTE(review): assumes the notebook's own folder does not contain a "data"
# directory — confirm against the repository layout.
if not Path("data").is_dir():
    os.chdir("..")
print(os.getcwd()) # print current working directory

c:\Users\aisti\OneDrive\Dokumente\Uni\Bachelorarbeit\Probabilistic-Forecasting-of-Bicycle-Counts-in-Karlsruhe-with-Neural-Networks


In [4]:
# Load the training data. The path is built with pathlib so it resolves on any
# operating system (the previous backslash-separated literal only worked on Windows).
# NOTE(review): the file carries a leftover index column ("Unnamed: 0") that is
# kept here and later ends up among the numeric features — consider
# index_col=0 after checking what downstream notebooks expect.
df_train = pd.read_csv(Path("data") / "df_train.csv")

In [5]:
# Load the test data. The path is built with pathlib so it resolves on any
# operating system (the previous backslash-separated literal only worked on Windows).
df_test = pd.read_csv(Path("data") / "df_test.csv")

In [6]:
# Keep independent copies of both frames exactly as they were loaded from disk.
df_train_raw, df_test_raw = df_train.copy(), df_test.copy()

In [7]:
# Preview the first rows of the training data.
df_train.head()

Unnamed: 0,date,bike_count,temperature,humidity,windspeed,wind_direction,visibility,precipitation,sun,windspeed_max,precip_indic,school_holiday,school_holiday_indicator,public_holiday,public_holiday_indicator,holiday_indicator,is_workday,season,day_of_week
0,2012-04-25,4593.0,12.038889,65.611111,3.033333,183.333333,53611.111111,0.0,294,12.0,0,No Holiday,0,No Holiday,0,0,1,0,2
1,2012-04-26,5849.0,14.194444,66.111111,2.844444,249.444444,52555.555556,0.0,176,10.0,0,No Holiday,0,No Holiday,0,0,1,0,3
2,2012-04-27,5846.0,15.233333,68.666667,1.505556,235.0,45055.555556,0.1,373,5.0,1,No Holiday,0,No Holiday,0,0,1,0,4
3,2012-04-28,4261.0,22.2,53.444444,2.7,153.888889,41500.0,0.0,678,8.0,0,No Holiday,0,No Holiday,0,0,0,0,5
4,2012-04-29,1901.0,17.994444,62.222222,3.95,214.444444,52777.777778,3.7,319,16.0,1,No Holiday,0,No Holiday,0,0,0,0,6


In [8]:
# Preview the first rows of the test data.
df_test.head()

Unnamed: 0,date,bike_count,temperature,humidity,windspeed,wind_direction,visibility,precipitation,sun,windspeed_max,precip_indic,school_holiday,school_holiday_indicator,public_holiday,public_holiday_indicator,holiday_indicator,is_workday,season,day_of_week
0,2023-01-01,1160.0,13.527778,66.555556,3.161111,207.777778,57907.777778,0.0,196,10.0,0,weihnachtsferien,1,Neujahr,1,1,0,3,6
1,2023-01-02,2996.0,10.316667,79.722222,3.072222,204.444444,50496.666667,1.6,99,12.6,1,weihnachtsferien,1,No Holiday,0,1,1,3,0
2,2023-01-03,3635.0,6.027778,88.222222,1.622222,199.444444,45582.777778,0.0,347,5.5,0,weihnachtsferien,1,No Holiday,0,1,1,3,1
3,2023-01-04,3265.0,8.911111,77.166667,8.944444,227.777778,46587.777778,0.3,0,17.2,1,weihnachtsferien,1,No Holiday,0,1,1,3,2
4,2023-01-05,3751.0,10.977778,81.944444,6.066667,233.333333,40345.555556,0.3,22,14.3,1,weihnachtsferien,1,No Holiday,0,1,1,3,3


In [9]:
# Number of rows in the loaded training and test frames.
len(df_train), len(df_test)

(3525, 390)

# Preprocessing

In [27]:
# Single scaler shared by all preproc() calls: it is fitted when preproc() is
# called with fit_scaler=True (the training split) and reused unchanged for
# the validation and test splits.
scaler = StandardScaler()

In [28]:
def preproc(df_in, fit_scaler, col_normalize=None):
    """Split a raw frame into features and target and standardize selected columns.

    The input frame is copied first, so the caller's data is not modified.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Raw data. Must contain a ``date`` column and every column named in
        ``col_normalize``; a ``bike_count`` column is optional.
    fit_scaler : bool
        If truthy, the module-level ``scaler`` is fitted on this frame before
        transforming it. Otherwise the ``scaler`` is applied as it currently
        is, so it must have been fitted by an earlier call.
    col_normalize : list of str, optional
        Columns to standardize (numerical, non-categorical values only).
        Defaults to the eight weather columns listed in the function body.
        A ``None`` sentinel is used instead of a list literal in the signature
        so that no mutable object is shared between calls.

    Returns
    -------
    X : pandas.DataFrame
        All feature columns, with ``col_normalize`` scaled and ``date``
        converted to datetime.
    y : pandas.Series or None
        The ``bike_count`` column, or ``None`` if the input has no such column.
    X_num : pandas.DataFrame
        The numeric-dtype columns of ``X`` only.
    """
    if col_normalize is None:
        col_normalize = ['temperature', 'humidity', 'windspeed', 'wind_direction',
                         'visibility', 'precipitation', 'sun', 'windspeed_max']

    X = df_in.copy()

    # Split features and target; the target is removed from the feature frame.
    y = X.pop('bike_count') if 'bike_count' in X.columns else None

    # Pick fit+transform or transform-only once instead of duplicating the assignment.
    scale = scaler.fit_transform if fit_scaler else scaler.transform
    X[col_normalize] = scale(X[col_normalize])

    X['date'] = pd.to_datetime(X['date'])

    # Numeric-only view of the features (selected by dtype).
    X_num = X.select_dtypes(include=['number'])

    return X, y, X_num

In [29]:
# Boundary between training and validation data. It is compared against the
# `date` column as loaded from the CSV: rows with date <= this value go to the
# training split, later rows to the validation split.
# NOTE(review): because of `<=`, 2022-01-01 itself lands in the training split
# (the validation split then has 364 rows) — confirm this is intended.
split_date_valid = '2022-01-01'

In [30]:
# Training split: rows up to and including the split date; the scaler is fitted on this split.
X_train, y_train, X_train_num = preproc(
    df_train.loc[df_train["date"] <= split_date_valid],
    fit_scaler=True,
)

In [31]:
# Validation split: rows after the split date; the scaler is applied without refitting.
X_valid, y_valid, X_valid_num = preproc(
    df_train.loc[df_train["date"] > split_date_valid],
    fit_scaler=False,
)

In [32]:
# Test split: the whole test frame; the scaler is applied without refitting.
X_test, y_test, X_test_num = preproc(df_test, fit_scaler=False)

In [33]:
# Give every split a fresh index starting at 0; the previous row labels are dropped.
X_train, X_train_num, y_train = (
    part.reset_index(drop=True) for part in (X_train, X_train_num, y_train)
)

X_valid, X_valid_num, y_valid = (
    part.reset_index(drop=True) for part in (X_valid, X_valid_num, y_valid)
)

X_test, X_test_num, y_test = (
    part.reset_index(drop=True) for part in (X_test, X_test_num, y_test)
)

In [34]:
# Spot check: target value at position 5 of the test set.
y_test.iloc[5]

1574.0

In [35]:
# Spot check: feature row at position 365 of the test set.
X_test.iloc[365]

date                        2024-01-01 00:00:00
temperature                           -0.573209
humidity                                -0.2248
windspeed                              2.236176
wind_direction                         0.649639
visibility                             1.281107
precipitation                         -0.397192
sun                                   -0.859681
windspeed_max                          1.688415
precip_indic                                  0
school_holiday                 weihnachtsferien
school_holiday_indicator                      1
public_holiday                          Neujahr
public_holiday_indicator                      1
holiday_indicator                             1
is_workday                                    0
season                                        3
day_of_week                                   0
Name: 365, dtype: object

## Checking the Data

In [36]:
# Shapes (rows, columns) of the full feature frames for each split.
X_train.shape, X_valid.shape, X_test.shape

((3161, 18), (364, 18), (390, 18))

In [37]:
# Shapes (rows, columns) of the numeric-only feature frames for each split.
X_train_num.shape, X_valid_num.shape, X_test_num.shape

((3161, 15), (364, 15), (390, 15))

In [38]:
# Shapes of the target series; the lengths should match the row counts of the feature frames.
y_train.shape, y_valid.shape, y_test.shape

((3161,), (364,), (390,))

In [39]:
# Preview the first rows of the preprocessed training features.
X_train.head()

Unnamed: 0,date,temperature,humidity,windspeed,wind_direction,visibility,precipitation,sun,windspeed_max,precip_indic,school_holiday,school_holiday_indicator,public_holiday,public_holiday_indicator,holiday_indicator,is_workday,season,day_of_week
0,2012-04-25,-0.039776,-0.645081,-0.218119,0.079113,1.461434,-0.397192,-0.031329,0.665623,0,No Holiday,0,No Holiday,0,0,1,0,2
1,2012-04-26,0.244625,-0.607815,-0.324284,1.042128,1.392951,-0.397192,-0.458165,0.154228,0,No Holiday,0,No Holiday,0,0,1,0,3
2,2012-04-27,0.381694,-0.417343,-1.07681,0.831722,0.90636,-0.371451,0.254435,-1.124262,1,No Holiday,0,No Holiday,0,0,1,0,4
3,2012-04-28,1.300863,-1.551894,-0.405469,-0.349793,0.67568,-0.397192,1.357699,-0.357168,0,No Holiday,0,No Holiday,0,0,0,0,5
4,2012-04-29,0.74599,-0.897664,0.297096,0.532297,1.407369,0.55525,0.059103,1.688415,1,No Holiday,0,No Holiday,0,0,0,0,6


In [40]:
# Preview the first values of the training target (bike_count).
y_train.head()

0    4593.0
1    5849.0
2    5846.0
3    4261.0
4    1901.0
Name: bike_count, dtype: float64

In [41]:
# Preview the first rows of the numeric-only training features.
X_train_num.head()

Unnamed: 0,temperature,humidity,windspeed,wind_direction,visibility,precipitation,sun,windspeed_max,precip_indic,school_holiday_indicator,public_holiday_indicator,holiday_indicator,is_workday,season,day_of_week
0,-0.039776,-0.645081,-0.218119,0.079113,1.461434,-0.397192,-0.031329,0.665623,0,0,0,0,1,0,2
1,0.244625,-0.607815,-0.324284,1.042128,1.392951,-0.397192,-0.458165,0.154228,0,0,0,0,1,0,3
2,0.381694,-0.417343,-1.07681,0.831722,0.90636,-0.371451,0.254435,-1.124262,1,0,0,0,1,0,4
3,1.300863,-1.551894,-0.405469,-0.349793,0.67568,-0.397192,1.357699,-0.357168,0,0,0,0,0,0,5
4,0.74599,-0.897664,0.297096,0.532297,1.407369,0.55525,0.059103,1.688415,1,0,0,0,0,0,6


In [42]:
# Summary statistics of the training features; the standardized columns should
# show a mean of approximately 0 and a standard deviation of approximately 1.
X_train.describe()

Unnamed: 0,date,temperature,humidity,windspeed,wind_direction,visibility,precipitation,sun,windspeed_max,precip_indic,school_holiday_indicator,public_holiday_indicator,holiday_indicator,is_workday,season,day_of_week
count,3161,3161.0,3161.0,3161.0,3161.0,3161.0,3161.0,3161.0,3161.0,3161.0,3161.0,3161.0,3161.0,3161.0,3161.0,3161.0
mean,2016-10-06 22:22:03.378677504,4.450726e-16,-3.281849e-16,4.832859e-17,1.719599e-16,-1.078964e-16,3.708939e-17,4.7204670000000005e-17,-1.185736e-16,0.401772,0.232521,0.033534,0.248972,0.68491,1.492249,3.001898
min,2012-04-25 00:00:00,-2.660948,-3.112937,-1.513961,-2.348657,-2.003308,-0.3971923,-1.094803,-1.891355,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2014-06-24 00:00:00,-0.7932892,-0.7651612,-0.7364558,-0.6573103,-0.823848,-0.3971923,-0.9899026,-0.6384357,0.0,0.0,0.0,0.0,0.0,1.0,1.0
50%,2016-08-22 00:00:00,-0.0009270793,0.09610389,-0.2056287,0.4432786,0.03399334,-0.3971923,-0.2085744,-0.1014702,0.0,0.0,0.0,0.0,1.0,1.0,3.0
75%,2018-11-02 00:00:00,0.7907021,0.779319,0.4969366,0.7507961,0.7585809,-0.1140338,0.8585175,0.5122046,1.0,0.0,0.0,0.0,1.0,2.0,5.0
max,2022-01-01 00:00:00,2.566737,1.918011,4.952762,2.361217,2.407294,14.61021,2.193287,5.523883,1.0,1.0,1.0,1.0,1.0,3.0,6.0
std,,1.000158,1.000158,1.000158,1.000158,1.000158,1.000158,1.000158,1.000158,0.490334,0.422506,0.180054,0.432486,0.464625,1.101504,2.001106


## Saving the Data

In [46]:
def conv_to_csv(df, filename):
    """Write ``df`` to ``data/<filename>.csv`` without the index column.

    The target path is built with ``pathlib`` so it resolves on every
    operating system. The previous hand-written, backslash-separated prefix
    was only treated as a directory separator on Windows.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Object to save; its ``to_csv`` method is called with ``index=False``.
    filename : str
        File name without directory or extension.
    """
    complete_filename = Path("data") / (filename + ".csv")
    df.to_csv(complete_filename, index=False)

In [48]:
def conv_to_pickle(df, filename):
    """Pickle ``df`` to ``data/<filename>.pkl``.

    The target path is built with ``pathlib`` so it resolves on every
    operating system. The previous hand-written, backslash-separated prefix
    was only treated as a directory separator on Windows.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Object to save; its ``to_pickle`` method is called.
    filename : str
        File name without directory or extension.
    """
    complete_filename = Path("data") / (filename + ".pkl")
    df.to_pickle(complete_filename)

In [49]:
# Save every split as CSV: full features, targets, then numeric-only features.
csv_exports = {
    "X_train": X_train,
    "X_valid": X_valid,
    "X_test": X_test,
    "y_train": y_train,
    "y_valid": y_valid,
    "y_test": y_test,
    "X_train_num": X_train_num,
    "X_valid_num": X_valid_num,
    "X_test_num": X_test_num,
}
for export_name, export_obj in csv_exports.items():
    conv_to_csv(export_obj, export_name)

In [50]:
# Save every split as a pickle: full features, targets, then numeric-only features.
pickle_exports = {
    "X_train": X_train,
    "X_valid": X_valid,
    "X_test": X_test,
    "y_train": y_train,
    "y_valid": y_valid,
    "y_test": y_test,
    "X_train_num": X_train_num,
    "X_valid_num": X_valid_num,
    "X_test_num": X_test_num,
}
for export_name, export_obj in pickle_exports.items():
    conv_to_pickle(export_obj, export_name)