In [1]:
import numpy as np
import matplotlib.pyplot as plt #for displaying plots
import pandas as pd
import seaborn as sns
import tensorflow as tf
import datetime
import random
import properscoring as ps
from scipy import stats


In [2]:
# Fix the seed of every random number generator in use (Python, NumPy,
# TensorFlow) so that results are reproducible and consistent between runs.
SEED = 16
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [3]:
# Load the enhanced bike-count data set from disk.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs as-is on the author's machine.
raw_csv_path = r"C:\Users\aisti\OneDrive\Dokumente\Uni\Bachelorarbeit\Daten\bike_data_enhanced_without_lockdown_data.csv"
df = pd.read_csv(raw_csv_path)

In [4]:
# Keep an independent (deep) copy of the data exactly as loaded, so the
# unmodified values stay available after the cleaning steps below.
df_raw = df.copy(deep=True)

In [5]:
# Parse the 'date' column into real datetime values so the .dt accessor
# can be used on it later on.
df = df.assign(date=lambda frame: pd.to_datetime(frame['date']))

# Preview the first rows.
df.head(5)

Unnamed: 0,date,bike_count,temperature,humidity,windspeed,wind_direction,visibility,precipitation,sun,windspeed_max,precip_indic,precip_type,school_holiday,school_holiday_indicator,public_holiday,public_holiday_indicator,holiday_indicator,is_workday,season
0,2012-04-25,4593.0,12.038889,65.611111,3.033333,183.333333,53611.111111,0.0,294,12.0,0.0,0.0,No Holiday,0,No Holiday,0,0,1,0
1,2012-04-26,5849.0,14.194444,66.111111,2.844444,249.444444,52555.555556,0.0,176,10.0,0.0,0.0,No Holiday,0,No Holiday,0,0,1,0
2,2012-04-27,5846.0,15.233333,68.666667,1.505556,235.0,45055.555556,0.1,373,5.0,0.0,0.0,No Holiday,0,No Holiday,0,0,1,0
3,2012-04-28,4261.0,22.2,53.444444,2.7,153.888889,41500.0,0.0,678,8.0,0.0,0.0,No Holiday,0,No Holiday,0,0,0,0
4,2012-04-29,1901.0,17.994444,62.222222,3.95,214.444444,52777.777778,3.7,319,16.0,0.0,0.0,No Holiday,0,No Holiday,0,0,0,0


# Dropping precip_type

Since the column precip_type contains some faulty values and is not expected to have a big impact on the prediction (any information it contains is also represented by the temperature and precipitation columns), I will drop it.

In [6]:
# Remove the unreliable precip_type column.
df = df.drop(columns=['precip_type'])

In [7]:
# Preview the first rows to confirm that precip_type is no longer among the columns.
df.head()

Unnamed: 0,date,bike_count,temperature,humidity,windspeed,wind_direction,visibility,precipitation,sun,windspeed_max,precip_indic,school_holiday,school_holiday_indicator,public_holiday,public_holiday_indicator,holiday_indicator,is_workday,season
0,2012-04-25,4593.0,12.038889,65.611111,3.033333,183.333333,53611.111111,0.0,294,12.0,0.0,No Holiday,0,No Holiday,0,0,1,0
1,2012-04-26,5849.0,14.194444,66.111111,2.844444,249.444444,52555.555556,0.0,176,10.0,0.0,No Holiday,0,No Holiday,0,0,1,0
2,2012-04-27,5846.0,15.233333,68.666667,1.505556,235.0,45055.555556,0.1,373,5.0,0.0,No Holiday,0,No Holiday,0,0,1,0
3,2012-04-28,4261.0,22.2,53.444444,2.7,153.888889,41500.0,0.0,678,8.0,0.0,No Holiday,0,No Holiday,0,0,0,0
4,2012-04-29,1901.0,17.994444,62.222222,3.95,214.444444,52777.777778,3.7,319,16.0,0.0,No Holiday,0,No Holiday,0,0,0,0


The precip_indic column indicates whether it rained on a given day. It also contains a few faulty values (caused by the aggregation of the precip_indic column from hourly to daily). 
If the precipitation on a day is greater than 0, the indicator should be 1, and 0 otherwise. 

In [8]:
# List the distinct indicator values; anything other than 0 and 1 is faulty.
df['precip_indic'].unique()

array([0.   , 1.   , 0.5  , 0.875, 0.75 , 0.625])

In [9]:
# Rebuild the rain indicator from the daily precipitation amount:
# 1 if any precipitation was recorded on that day, 0 otherwise.
# The comparison is vectorised instead of calling a Python lambda per row;
# int64 is requested explicitly so the column keeps the dtype it had before.
df['precip_indic'] = (df['precipitation'] > 0).astype('int64')

In [10]:
# Verify the fix: only 0 and 1 should remain.
df['precip_indic'].unique()

array([0, 1], dtype=int64)

In [11]:
# Add the weekday of each date as an integer feature (Monday = 0 ... Sunday = 6).
df['day_of_week'] = df['date'].dt.weekday

In [None]:
# Preview the first rows to check the rebuilt precip_indic and the new day_of_week column.
df.head()

Unnamed: 0,date,bike_count,temperature,humidity,windspeed,wind_direction,visibility,precipitation,sun,windspeed_max,precip_indic,school_holiday,school_holiday_indicator,public_holiday,public_holiday_indicator,holiday_indicator,is_workday,season,day_of_week
0,2012-04-25,4593.0,12.038889,65.611111,3.033333,183.333333,53611.111111,0.0,294,12.0,0,No Holiday,0,No Holiday,0,0,1,0,2
1,2012-04-26,5849.0,14.194444,66.111111,2.844444,249.444444,52555.555556,0.0,176,10.0,0,No Holiday,0,No Holiday,0,0,1,0,3
2,2012-04-27,5846.0,15.233333,68.666667,1.505556,235.0,45055.555556,0.1,373,5.0,1,No Holiday,0,No Holiday,0,0,1,0,4
3,2012-04-28,4261.0,22.2,53.444444,2.7,153.888889,41500.0,0.0,678,8.0,0,No Holiday,0,No Holiday,0,0,0,0,5
4,2012-04-29,1901.0,17.994444,62.222222,3.95,214.444444,52777.777778,3.7,319,16.0,1,No Holiday,0,No Holiday,0,0,0,0,6


# Splitting the data into test set and training set

The dataset contains data from April 2012 through January 2024. I want 2023 through January 2024 to be the test set.

In [None]:
# Show the last rows to see where the data ends.
df.tail()

Unnamed: 0,date,bike_count,temperature,humidity,windspeed,wind_direction,visibility,precipitation,sun,windspeed_max,precip_indic,school_holiday,school_holiday_indicator,public_holiday,public_holiday_indicator,holiday_indicator,is_workday,season,day_of_week
3910,2024-01-21,994.0,-1.5,87.666667,1.5875,187.5,14942.5,0.0,191,7.7,0,No Holiday,0,No Holiday,0,0,0,3,6
3911,2024-01-22,3398.0,7.479167,76.791667,7.075,223.75,47245.416667,5.4,0,18.0,1,No Holiday,0,No Holiday,0,0,1,3,0
3912,2024-01-23,4427.0,8.358333,74.541667,7.25,231.25,38257.916667,0.5,164,16.6,1,No Holiday,0,No Holiday,0,0,1,3,1
3913,2024-01-24,4124.0,11.945833,72.291667,8.35,234.583333,45765.833333,1.9,7,18.8,1,No Holiday,0,No Holiday,0,0,1,3,2
3914,2024-01-25,4594.0,9.366667,85.083333,3.345833,229.583333,23374.583333,1.0,220,7.5,1,No Holiday,0,No Holiday,0,0,1,3,3


In [14]:
# Columns to summarise: the bike count plus the continuous weather measurements.
summary_columns = [
    'bike_count',
    'temperature',
    'humidity',
    'windspeed',
    'visibility',
    'precipitation',
    'sun',
]
filtered_df = df[summary_columns]

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the selected columns.
summary = filtered_df.describe()
# Display the resulting table.
summary

Unnamed: 0,bike_count,temperature,humidity,windspeed,visibility,precipitation,sun
count,3915.0,3915.0,3915.0,3915.0,3915.0,3915.0,3915.0
mean,4700.835816,12.468751,73.532114,3.466377,33354.532863,1.570779,306.774457
std,1922.055154,7.65128,13.795854,1.814267,16075.483974,3.990632,278.571794
min,273.0,-7.827778,31.277778,0.6,207.777778,0.0,0.0
25%,3222.0,6.397222,63.361111,2.127778,20551.111111,0.0,31.0
50%,4862.0,12.411111,75.0,3.116667,34241.666667,0.0,246.0
75%,6182.0,18.669444,84.333333,4.338889,45731.666667,1.1,545.0
max,10200.0,31.794444,100.0,12.233333,71937.777778,58.3,909.0


In [16]:
# Render the summary table as LaTeX source.
# Styler.to_latex() does not escape special characters by itself. Without
# escaping, the row labels "25%", "50%" and "75%" would start a LaTeX comment
# (cutting off the rest of the row) and the underscore in "bike_count" would
# be invalid outside math mode. Both the row labels (axis=0) and the column
# labels (axis=1) are therefore escaped before rendering.
summary_latex = (
    summary.style
    .format_index(escape="latex", axis=0)
    .format_index(escape="latex", axis=1)
    .to_latex()
)

In [17]:
# Save the LaTeX source of the summary table to disk.
# The encoding is given explicitly so the file content does not depend on the
# platform's default text encoding.
# NOTE(review): absolute, machine-specific output path.
with open(r'C:\Users\aisti\OneDrive\Dokumente\Uni\Bachelorarbeit\Daten\summary_data.tex', 'w', encoding='utf-8') as file:
    file.write(summary_latex)

In [18]:
# Chronological split: every row dated 2023 or 2024 goes into the test set,
# all other rows form the training set. The mask is computed once and reused.
is_test_year = df['date'].dt.year.isin([2023, 2024])
df_test = df[is_test_year]
df_train = df[~is_test_year]

In [19]:
# First rows of the training set (should start at the beginning of the data).
df_train.head()

Unnamed: 0,date,bike_count,temperature,humidity,windspeed,wind_direction,visibility,precipitation,sun,windspeed_max,precip_indic,school_holiday,school_holiday_indicator,public_holiday,public_holiday_indicator,holiday_indicator,is_workday,season,day_of_week
0,2012-04-25,4593.0,12.038889,65.611111,3.033333,183.333333,53611.111111,0.0,294,12.0,0,No Holiday,0,No Holiday,0,0,1,0,2
1,2012-04-26,5849.0,14.194444,66.111111,2.844444,249.444444,52555.555556,0.0,176,10.0,0,No Holiday,0,No Holiday,0,0,1,0,3
2,2012-04-27,5846.0,15.233333,68.666667,1.505556,235.0,45055.555556,0.1,373,5.0,1,No Holiday,0,No Holiday,0,0,1,0,4
3,2012-04-28,4261.0,22.2,53.444444,2.7,153.888889,41500.0,0.0,678,8.0,0,No Holiday,0,No Holiday,0,0,0,0,5
4,2012-04-29,1901.0,17.994444,62.222222,3.95,214.444444,52777.777778,3.7,319,16.0,1,No Holiday,0,No Holiday,0,0,0,0,6


In [20]:
# Last rows of the training set (should end on the last day of 2022).
df_train.tail()

Unnamed: 0,date,bike_count,temperature,humidity,windspeed,wind_direction,visibility,precipitation,sun,windspeed_max,precip_indic,school_holiday,school_holiday_indicator,public_holiday,public_holiday_indicator,holiday_indicator,is_workday,season,day_of_week
3520,2022-12-27,2638.0,4.272222,81.777778,2.961111,212.222222,55572.777778,0.0,205,7.8,0,weihnachtsferien,1,No Holiday,0,1,1,3,1
3521,2022-12-28,2806.0,8.711111,67.166667,5.355556,221.666667,69854.444444,0.0,75,11.6,0,weihnachtsferien,1,No Holiday,0,1,1,3,2
3522,2022-12-29,2421.0,11.55,71.777778,7.472222,221.666667,63607.777778,2.1,50,16.2,1,weihnachtsferien,1,No Holiday,0,1,1,3,3
3523,2022-12-30,2980.0,7.1,89.388889,2.222222,196.111111,38045.0,2.2,162,16.2,1,weihnachtsferien,1,No Holiday,0,1,1,3,4
3524,2022-12-31,2011.0,16.444444,64.5,6.205556,218.888889,71937.777778,0.0,247,15.1,0,weihnachtsferien,1,No Holiday,0,1,0,3,5


In [21]:
# First rows of the test set (should start on the first day of 2023).
df_test.head()

Unnamed: 0,date,bike_count,temperature,humidity,windspeed,wind_direction,visibility,precipitation,sun,windspeed_max,precip_indic,school_holiday,school_holiday_indicator,public_holiday,public_holiday_indicator,holiday_indicator,is_workday,season,day_of_week
3525,2023-01-01,1160.0,13.527778,66.555556,3.161111,207.777778,57907.777778,0.0,196,10.0,0,weihnachtsferien,1,Neujahr,1,1,0,3,6
3526,2023-01-02,2996.0,10.316667,79.722222,3.072222,204.444444,50496.666667,1.6,99,12.6,1,weihnachtsferien,1,No Holiday,0,1,1,3,0
3527,2023-01-03,3635.0,6.027778,88.222222,1.622222,199.444444,45582.777778,0.0,347,5.5,0,weihnachtsferien,1,No Holiday,0,1,1,3,1
3528,2023-01-04,3265.0,8.911111,77.166667,8.944444,227.777778,46587.777778,0.3,0,17.2,1,weihnachtsferien,1,No Holiday,0,1,1,3,2
3529,2023-01-05,3751.0,10.977778,81.944444,6.066667,233.333333,40345.555556,0.3,22,14.3,1,weihnachtsferien,1,No Holiday,0,1,1,3,3


In [22]:
# Last rows of the test set (should match the end of the full data set).
df_test.tail()

Unnamed: 0,date,bike_count,temperature,humidity,windspeed,wind_direction,visibility,precipitation,sun,windspeed_max,precip_indic,school_holiday,school_holiday_indicator,public_holiday,public_holiday_indicator,holiday_indicator,is_workday,season,day_of_week
3910,2024-01-21,994.0,-1.5,87.666667,1.5875,187.5,14942.5,0.0,191,7.7,0,No Holiday,0,No Holiday,0,0,0,3,6
3911,2024-01-22,3398.0,7.479167,76.791667,7.075,223.75,47245.416667,5.4,0,18.0,1,No Holiday,0,No Holiday,0,0,1,3,0
3912,2024-01-23,4427.0,8.358333,74.541667,7.25,231.25,38257.916667,0.5,164,16.6,1,No Holiday,0,No Holiday,0,0,1,3,1
3913,2024-01-24,4124.0,11.945833,72.291667,8.35,234.583333,45765.833333,1.9,7,18.8,1,No Holiday,0,No Holiday,0,0,1,3,2
3914,2024-01-25,4594.0,9.366667,85.083333,3.345833,229.583333,23374.583333,1.0,220,7.5,1,No Holiday,0,No Holiday,0,0,1,3,3


In [23]:
# Number of rows (days) in the test set.
len(df_test)

390

In [24]:
# Number of rows (days) in the training set.
len(df_train)

3525

In [25]:
# Persist the training set as CSV; index=False keeps the row index out of the file.
# NOTE(review): absolute, machine-specific output path.
train_csv_path = r"C:\Users\aisti\OneDrive\Dokumente\Uni\Bachelorarbeit\Daten\df_train.csv"
df_train.to_csv(train_csv_path, index=False)

In [26]:
# Persist the test set as CSV; index=False keeps the row index out of the file.
# NOTE(review): absolute, machine-specific output path.
test_csv_path = r"C:\Users\aisti\OneDrive\Dokumente\Uni\Bachelorarbeit\Daten\df_test.csv"
df_test.to_csv(test_csv_path, index=False)

In [27]:
# Repeats the earlier test-set size check after the CSV export.
len(df_test)

390