In [9]:
import pandas as pd
import numpy as np
import os
import glob

import requests

from datetime import datetime, timedelta, date
import time
import pytz
from workalendar.europe import Netherlands

from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_error

import helpers_gvb_reworked_v2 as h

import importlib   # to reload helpers without restarting kernel: importlib.reload(h)

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
%matplotlib inline
from time import time

import warnings
import glob

In [10]:
# Stations to create predictions for. The order matters: later cells build
# one dataframe/model per entry and address them by position.
stations = [
    'Centraal Station',
    'Station Zuid',
]

# Week number used in the output file name and to select/highlight weeks in
# the report. It is set by hand and has to be changed every week.
week_no = 48

In [11]:
# Reference date for this run; also used further down for file names.
today = pd.to_datetime("today")

# Year-month-day without zero padding (e.g. 2022-1-6).
today_str = f"{today.year}-{today.month}-{today.day}"

# Stringency index from 2020-09-01 up to and including today.
covid_url = f"https://covidtrackerapi.bsg.ox.ac.uk/api/v2/stringency/date-range/2020-09-01/{today_str}"

# Main

### 1. Get data

In [12]:
print('Start loading raw data')

Start loading raw data


In [13]:
# Load the GVB origin (herkomst) and destination (bestemming) files and
# report how long the load took.
load_started = time()

herkomst_2020 = h.get_gvb_data('Datalab_Reis_Herkomst_Uur_')
bestemming_2020 = h.get_gvb_data('Datalab_Reis_Bestemming_Uur_')

print('Completed in %s sec.' % str(time() - load_started))

Completed in 52.69049334526062 sec.


In [14]:
# Load the 2021 GVB origin and destination files and report the elapsed time.
load_started = time()

herkomst_2021 = h.get_gvb_data('Datalab_Reis_Herkomst_Uur_2021')
bestemming_2021 = h.get_gvb_data('Datalab_Reis_Bestemming_Uur_2021')

print('Completed in %s sec.' % str(time() - load_started))

Completed in 0.5435638427734375 sec.


In [15]:
# Load the 2021 KNMI weather observations and report the elapsed time.
load_started = time()

knmi_obs = h.get_knmi_data('knmi/knmi-observations/2021/**/**/*')

print('Completed in %s sec.' % str(time() - load_started))

Completed in 2.298098564147949 sec.


In [16]:
# Load the 2021 KNMI forecast files (gzipped JSON) and report the elapsed time.
load_started = time()

knmi_preds = h.get_knmi_data('knmi/knmi/2021/**/**/*.json.gz')

print('Completed in %s sec.' % str(time() - load_started))

Completed in 47.235177755355835 sec.


In [17]:
# Download the stringency data. A timeout keeps an unresponsive API from
# hanging the notebook, and raise_for_status() surfaces HTTP errors directly
# instead of as a KeyError on the missing 'data' field.
covid_response = requests.get(url=covid_url, timeout=60)
covid_response.raise_for_status()
covid_df_raw = pd.DataFrame(covid_response.json()['data'])

In [18]:
# Public holidays from 2019 up to and including next year. The end year is
# derived from the run date so that the week being predicted is always
# covered (a fixed 2019-2021 list silently drops holidays once the notebook
# is run in a later year).
netherlands_calendar = Netherlands()
holiday_years = range(2019, date.today().year + 2)
holidays_data_raw = [
    holiday
    for year in holiday_years
    for holiday in netherlands_calendar.holidays(year)
]

In [19]:
vacations_df = h.get_vacations()

In [20]:
events = h.get_events()

                     Locatie      Datum Start show
0               paradisoadam 2021-11-28   17:00:00
1               paradisoadam 2021-01-24   12:00:00
2               paradisoadam 2021-10-15   10:00:00
3               paradisoadam 2021-11-09   14:00:00
4               paradisoadam 2021-06-21   13:00:00
..                       ...        ...        ...
339  beursvanberlageofficial 2021-12-30   18:00:00
340  beursvanberlageofficial 2021-11-09   15:00:00
341  beursvanberlageofficial 2020-12-14   09:00:00
342            concertgebouw 2021-04-18   10:30:00
343            concertgebouw 2020-08-24   10:00:00

[344 rows x 3 columns]


In [21]:
def read_csv_dir(dir):
    """Read one comma-separated file into a DataFrame.

    Parameters
    ----------
    dir : path of the CSV file to read (passed straight to ``pd.read_csv``).

    Returns
    -------
    pandas.DataFrame with the file's contents.
    """
    return pd.read_csv(dir, sep=',')

### 2. Prepare data

#### Pre-process data sources

In [22]:
print('Start pre-processing data')

Start pre-processing data


In [23]:
herkomst = pd.concat([herkomst_2020, herkomst_2021])
bestemming = pd.concat([bestemming_2020, bestemming_2021])

In [24]:
# Clean both trip tables in place: make 'AantalReizen' an integer so it can
# be summed, then drop rows that are exact duplicates.
for trips in (bestemming, herkomst):
    trips['AantalReizen'] = trips['AantalReizen'].astype(int)
    trips.drop_duplicates(inplace=True)

# Total number of trips per date, hour group and station, because the
# analysis is done per station.
arrival_keys = ['Datum', 'UurgroepOmschrijving (van aankomst)', 'AankomstHalteNaam']
departure_keys = ['Datum', 'UurgroepOmschrijving (van vertrek)', 'VertrekHalteNaam']

bestemming_grouped = bestemming.groupby(arrival_keys, as_index=False)['AantalReizen'].sum()
herkomst_grouped = herkomst.groupby(departure_keys, as_index=False)['AantalReizen'].sum()

In [25]:
bestemming_herkomst = h.merge_bestemming_herkomst(bestemming_grouped, herkomst_grouped)

In [26]:
# One modelling dataframe per station, in the same order as `stations`.
gvb_dfs = [
    h.preprocess_gvb_data_for_modelling(bestemming_herkomst, station)
    for station in stations
]

In [27]:
knmi_historical = h.preprocess_knmi_data_hour(knmi_obs)

In [28]:
knmi_forecast = h.preprocess_metpre_data(knmi_preds)

In [29]:
covid_df = h.preprocess_covid_data(covid_df_raw)

In [30]:
holiday_df = h.preprocess_holiday_data(holidays_data_raw)

#### Merge datasources

In [31]:
# Enrich every station dataframe with weather, covid, holiday, vacation and
# event data.
gvb_dfs_merged = [
    h.merge_gvb_with_datasources(station_df, knmi_historical, covid_df, holiday_df, vacations_df, events)
    for station_df in gvb_dfs
]

In [32]:
# for df in gvb_dfs_merged:
#     print(df['planned_event'].head(100))
#     df['planned_event'] = np.where(df['planned_event'] > 0, 1, 0)

In [33]:
# gvb_dfs_merged[0][gvb_dfs_merged[0]['planned_event'] != 0]

### 3. Clean data

In [34]:
print('Start cleaning data')

Start cleaning data


#### Interpolate missing data

In [35]:
# Fill in missing values for every station dataframe.
gvb_dfs_interpolated = [
    h.interpolate_missing_values(station_df)
    for station_df in gvb_dfs_merged
]

  checkins_interpolator.fit(X_train, y_train)
  checkouts_interpolator.fit(X_train, y_train)
  checkins_interpolator.fit(X_train, y_train)
  checkouts_interpolator.fit(X_train, y_train)


In [36]:
# gvb_dfs_interpolated[0][gvb_dfs_interpolated[0]['planned_event'] != 0]

In [37]:
count_columns = ['check-ins', 'check-outs']
week_ago_columns = ['check-ins_week_ago', 'check-outs_week_ago']

gvb_dfs_final = []

# The frames are modified in place, so gvb_dfs_final holds the same objects
# as gvb_dfs_interpolated.
for station_df in gvb_dfs_interpolated:

    for column in count_columns:
        station_df[column] = station_df[column].astype(int)

    # Row-wise lookup of the crowd one week earlier; the helper's two return
    # values are expanded into the two *_week_ago columns.
    crowd_week_ago = station_df.apply(
        lambda row: h.get_crowd_last_week(station_df, row),
        axis=1,
        result_type="expand",
    )
    station_df[week_ago_columns] = crowd_week_ago

    gvb_dfs_final.append(station_df)

In [38]:
# gvb_dfs_final[0][gvb_dfs_final[0]['planned_event'] != 0]

### 4. Create model dataframes

In [39]:
# Model inputs and outputs. These are currently identical for all stations.
features = [
    # calendar
    'year',
    'month',
    'weekday',
    'hour',
    'holiday',
    'vacation',
    'planned_event',
    # covid measures
    'stringency',
    # weather
    'temperature',
    'wind_speed',
    'precipitation_h',
    'global_radiation',
]

targets = [
    'check-ins',
    'check-outs',
]

In [40]:
# Columns needed for modelling: the timestamp plus all features and targets.
model_columns = ['datetime'] + features + targets

data_splits = []

for station_df in gvb_dfs_final:
    # Rows with any missing value are excluded before splitting.
    complete_rows = station_df[model_columns].dropna()

    train, validation, test = h.get_train_val_test_split(complete_rows)
    data_splits.append([train, validation, test])

In [41]:
# Every entry of data_splits is [train, validation, test]; split each part
# into its feature matrix (X) and target frame (y), one list entry per station.
X_train_splits = [train_part[features] for train_part, _, _ in data_splits]
y_train_splits = [train_part[targets] for train_part, _, _ in data_splits]

X_validation_splits = [validation_part[features] for _, validation_part, _ in data_splits]
y_validation_splits = [validation_part[targets] for _, validation_part, _ in data_splits]

X_test_splits = [test_part[features] for _, _, test_part in data_splits]
y_test_splits = [test_part[targets] for _, _, test_part in data_splits]

In [42]:
# def get_crowd_last_week(df, row):
#     week_ago = row['datetime'] - timedelta(weeks=1)
#     subset_with_hour = df[(df['datetime']==week_ago) & (df['hour']==row['hour'])]
#
#     # If crowd from last week is not available at exact date- and hour combination, then get average crowd of last week.
#     subset_week_ago = df[(df['year']==row['year']) & (df['week']==row['week']) & (df['hour']==row['hour'])]
#
#     checkins_week_ago = 0
#     checkouts_week_ago = 0
#
#     if len(subset_with_hour) > 0: # return crowd from week ago at the same day/time (hour)
#         checkins_week_ago = subset_with_hour['check-ins'].mean()
#         checkouts_week_ago = subset_with_hour['check-outs'].mean()
#     elif len(subset_week_ago) > 0: # return average crowd the hour group a week ago
#         checkins_week_ago = subset_week_ago['check-ins'].mean()
#         checkouts_week_ago = subset_week_ago['check-outs'].mean()
#
#     return [checkins_week_ago, checkouts_week_ago]

In [43]:
# Dataframes to predict check-ins and check-outs of next week.

# Most recent stringency value, which is used for the whole forecast period.
# .iloc[-1] selects the last row by position, so it works for any index type
# (the former `.tail(1)[...][0]` relied on integer keys falling back to
# positional access, which is deprecated and fails on an integer index).
latest_stringency = covid_df['stringency'].iloc[-1]

X_predict_dfs = [
    h.get_future_df(features, station_df, latest_stringency, holiday_df, vacations_df, knmi_forecast, events)
    for station_df in gvb_dfs_final
]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['peak_period'][df.hour.isin([7,8,17,18])] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['peak_period'][df.hour.isin([7,8,17,18])] = 1


### 5. Create model

In [44]:
print('Start modelling')

Start modelling


In [45]:
len(data_splits)

2

In [46]:
# Baseline model per station: fit on the training split, evaluate on the
# validation split, default hyperparameters (None).
basic_models = []

station_splits = zip(X_train_splits, y_train_splits, X_validation_splits, y_validation_splits)

for X_train, y_train, X_validation, y_validation in station_splits:
    model_basic, r_squared_basic, mae_basic, rmse_basic = h.train_random_forest_regressor(
        X_train, y_train,
        X_validation, y_validation,
        None,
    )
    basic_models.append([model_basic, r_squared_basic, mae_basic, rmse_basic])

In [47]:
#### Tune (hyper-)parameters (not done because models currently do not improve with hyperparameter tuning)

In [48]:
# Hyperparameters per station. They could be station-specific; for now every
# station uses the default settings (None).
centraal_station_hyperparameters = None
station_zuid_hyperparameters = None

# One entry per station, in the same order as `stations`. When a station is
# added (e.g. Bijlmer Arena), append its hyperparameters here as well.
hyperparameters = [
    centraal_station_hyperparameters,
    station_zuid_hyperparameters,
]

In [49]:
#tuned_models = []

#for x in range(0, len(data_splits)):
#    model_tuned, r_squared_tuned, mae_tuned, rmse_tuned = h.train_random_forest_regressor(X_train_splits[x], y_train_splits[x], 
#                                                                                          X_validation_splits[x], y_validation_splits[x], 
#                                                                                          hyperparameters[x])
#    tuned_models.append([model_tuned, r_squared_tuned, mae_tuned, rmse_tuned])

##### Improvements compared to basic model (negative is worse performance)

In [50]:
#for x in range(0, len(basic_models)):
#    print("R-squared difference", tuned_models[x][1]-basic_models[x][1])
#    print("MAE difference", tuned_models[x][2]-basic_models[x][2])
#    print("RMSE difference", tuned_models[x][3]-basic_models[x][3])

#### Train test model (including validation data)

In [51]:
# Per station: refit on train + validation data and evaluate on the test split.
test_models = []

for station_idx in range(len(data_splits)):
    X_fit = pd.concat([X_train_splits[station_idx], X_validation_splits[station_idx]])
    y_fit = pd.concat([y_train_splits[station_idx], y_validation_splits[station_idx]])

    model_test, r_squared_test, mae_test, rmse_test = h.train_random_forest_regressor(
        X_fit, y_fit,
        X_test_splits[station_idx], y_test_splits[station_idx],
        hyperparameters[station_idx],
    )
    test_models.append([model_test, r_squared_test, mae_test, rmse_test])

In [52]:
test_models

[[RandomForestRegressor(random_state=1),
  0.891301419803745,
  106.57626811594201,
  158.07635057309253],
 [RandomForestRegressor(random_state=1),
  0.8525079173189607,
  76.41847656249999,
  104.97004454210678]]

#### Check models on R-squared score

In [53]:
# Warn when a station's test-set R-squared drops below the expected minimum.
minimum_r_squared = 0.7

for station_idx, model_result in enumerate(test_models):
    station_name = stations[station_idx]
    r_squared = model_result[1]  # entries are [model, r_squared, mae, rmse]
    if not r_squared >= minimum_r_squared:
        warnings.warn("Model for " + station_name + " shows unexpected performance!")

#### Train final models (to make predictions)

In [54]:
# Final model per station, fitted on all available data (train + validation
# + test). The helper still needs an evaluation set; the test split is passed
# but, being part of the training data here, its metrics are discarded.
final_models = []

for station_idx in range(len(data_splits)):
    X_all = pd.concat([X_train_splits[station_idx], X_validation_splits[station_idx], X_test_splits[station_idx]])
    y_all = pd.concat([y_train_splits[station_idx], y_validation_splits[station_idx], y_test_splits[station_idx]])

    training_result = h.train_random_forest_regressor(
        X_all, y_all,
        X_test_splits[station_idx], y_test_splits[station_idx],
        hyperparameters[station_idx],
    )
    final_models.append(training_result[0])  # keep only the fitted model

In [55]:
print('Start preparing data')

Start preparing data


In [56]:
# One prediction frame per station. X_predict_dfs and final_models are both
# ordered like `stations`, so they are paired up with zip: each station's
# future dataframe is scored with that station's own model only. (Nesting the
# two loops would also score every station with the other stations' models.)
predictions = [
    h.predict(model, predict_df.dropna())
    for predict_df, model in zip(X_predict_dfs, final_models)
]

### 6. Save

In [57]:
#today = pd.to_datetime("today")
#today_str = str(today.year) + str(today.month) + str(today.day)

In [58]:
# Write the first prediction frame (first entry of `predictions`) to the
# weekly output file.
prediction_path = f'output/prediction_all_week_{week_no}.csv'
predictions[0].to_csv(prediction_path)

### 7. Make graphs

In [59]:
# Read four dated destination files: today's and the ones from 7, 14 and 28
# days ago. Each file name ends in the date formatted as YYYYMMDD.
# NOTE(review): the last offset is 28 days, not 21 - confirm this is intended.
bestemming_file_pattern = "data_bestemming/Datalab_Reis_Bestemming_Uur_{}.csv"

df_best_1, df_best_2, df_best_3, df_best_4 = [
    pd.read_csv(
        bestemming_file_pattern.format((today - timedelta(days=days_back)).strftime('%Y%m%d')),
        sep=";",
    )
    for days_back in (0, 7, 14, 28)
]

FileNotFoundError: [Errno 2] No such file or directory: 'data_bestemming/Datalab_Reis_Bestemming_Uur_20220106.csv'

In [None]:
df_best = h.preprocess_gvb_data(pd.concat([df_best_1, df_best_2, df_best_3, df_best_4]))

In [None]:
df_best_weekday = df_best[df_best['weekday'].isin([0,1,2,3,4])]
df_best_weekend = df_best[df_best['weekday'].isin([5,6])]

In [None]:
# Keep only the rows whose arrival stop is Centraal Station. The station name
# is hard-coded here rather than taken from `stations`.
df_best_weekday = df_best_weekday[df_best_weekday['arrival_stop_name'] == 'Centraal Station']
df_best_weekend = df_best_weekend[df_best_weekend['arrival_stop_name'] == 'Centraal Station']

In [None]:
# Weekly totals. numeric_only=True restricts the sum to numeric columns; with
# the pandas >= 2.0 default (False) the sum would also be attempted on the
# text/datetime columns, which concatenates strings or raises a TypeError.
df_best_weekday_grouped = df_best_weekday.groupby('week').sum(numeric_only=True).reset_index()
df_best_weekend_grouped = df_best_weekend.groupby('week').sum(numeric_only=True).reset_index()

In [None]:
# Keep only the four weeks directly before week_no.
# NOTE(review): this is plain subtraction, so for week_no <= 4 it produces
# week numbers <= 0 that match nothing - confirm whether wrap-around across
# the year boundary is needed.
previous_weeks = [week_no - weeks_back for weeks_back in range(1, 5)]

in_previous_weeks_weekday = df_best_weekday_grouped.week.isin(previous_weeks)
df_best_weekday_grouped = df_best_weekday_grouped[in_previous_weeks_weekday]

in_previous_weeks_weekend = df_best_weekend_grouped.week.isin(previous_weeks)
df_best_weekend_grouped = df_best_weekend_grouped[in_previous_weeks_weekend]

In [None]:
df_best_weekday_grouped = df_best_weekday_grouped[['week','count']]
df_best_weekend_grouped = df_best_weekend_grouped[['week','count']]

In [None]:
predictions[0]['week'] = predictions[0]['datetime'].dt.isocalendar().week  

In [None]:
pred_weekdays = predictions[0][predictions[0].weekday.isin([0,1,2,3,4])]
pred_weekends = predictions[0][predictions[0].weekday.isin([5,6])]

In [None]:
pred_weekdays = pred_weekdays[['week','check-outs_predicted']].groupby('week').sum().rename(columns={'check-outs_predicted':'count'}).reset_index()
pred_weekends = pred_weekends[['week','check-outs_predicted']].groupby('week').sum().rename(columns={'check-outs_predicted':'count'}).reset_index()

In [None]:
df_plot_weekday = pd.concat([df_best_weekday_grouped, pred_weekdays])
df_plot_weekend = pd.concat([df_best_weekend_grouped, pred_weekends])

In [None]:
# Turn the weekly totals into per-day averages: the weekday frame covers 5
# days (weekday 0-4) and the weekend frame 2 days (weekday 5-6).
# NOTE(review): assumes every week has data for all of its days - confirm.
df_plot_weekday['count'] = df_plot_weekday['count'] / 5
df_plot_weekend['count'] = df_plot_weekend['count'] / 2

In [None]:
# Bar colours: measured weeks are blue, the predicted week (week_no, changes
# each week) is light blue. The assignment goes through .loc in a single step;
# chained indexing (df["color"][mask] = ...) triggers SettingWithCopyWarning
# and does not modify the frame when pandas copy-on-write is enabled.
# The mask is passed as a plain array because the concatenated frames can
# contain repeated index labels.
df_plot_weekday["color"] = "blue"
is_predicted_weekday = (df_plot_weekday["week"] == week_no).to_numpy()
df_plot_weekday.loc[is_predicted_weekday, "color"] = "lightskyblue"

df_plot_weekend["color"] = "blue"
is_predicted_weekend = (df_plot_weekend["week"] == week_no).to_numpy()
df_plot_weekend.loc[is_predicted_weekend, "color"] = "lightskyblue"


In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

In [None]:
# Two bar charts next to each other (weekdays and weekend) that share the
# y-axis: average number of check-outs per day for each week.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(11, 4), dpi=100, frameon=False, sharey=True, constrained_layout=True)

panels = [
    (ax1, df_plot_weekday, 'Doordeweeks (ma-vr)'),
    (ax2, df_plot_weekend, 'Weekend (za-zo)'),
]

for ax, df_panel, panel_title in panels:
    ax.bar(x=df_panel['week'], height=df_panel['count'], color=df_panel['color'])
    ax.set_title(panel_title)
    ax.set_xlabel("Week")
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.grid(axis='y', color='lightgrey')
    # Draw the grid behind the bars. This is set per axes because changing
    # rcParams["axes.axisbelow"] after the axes have been created does not
    # affect them (it only took effect when the cell was run a second time).
    ax.set_axisbelow(True)

# The y-axis is shared, so only the left panel gets a label.
ax1.set_ylabel("Aantal Reis-uitchecks per dag \n")

# Legend to the right of the second panel: measured versus predicted weeks.
gemeten = mpatches.Patch(color='blue', label='Gemeten')
voorspelling = mpatches.Patch(color='lightskyblue', label='Voorspelling')
ax2.legend(handles=[gemeten, voorspelling], bbox_to_anchor=(1.2, 0.5, 0.5, 0.5))

fig.suptitle('Centraal Station (Metro)\n', fontsize=18)

my_filename = "output/weekly_report_gvb_prediction_" + today_str + ".png"

plt.savefig(my_filename, bbox_inches='tight')
plt.show()

### 8. Evaluations

In [None]:
#def read_csv_dir(dir):
    
    
#     fields = ['datetime', 'predict_xg_CMSA-GAKH-01', 'predict_xg_GACM-02', 'predict_xg_CMSA-GAWW-15', 'predict_xg_CMSA-GAWW-14']
#    dateparse = lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S%z')
#    read_csv_beta = pd.read_csv(dir,sep=','
                                #usecols=fields
#                               )
    
#    return read_csv_beta

In [None]:
#read csv files from the previous week
#df_pred = pd.concat(map(read_csv_dir, glob.glob("output/prediction_all_week_*.csv")))

In [None]:
#predicted counts
#df_pred['datetime'] = df_pred['hour'] + + df['datetime']
#df_pred['datetime'] = pd.to_datetime(df_pred['datetime'])
#df_pred = df_pred.set_index('datetime')
#df_pred['week'] = df_pred.index.isocalendar().week
#slices data from only the last 4 weeks
#df_pred = df_pred[df_pred['week'].isin([my_week-1, my_week-2, my_week-3])]
#df_pred = df_pred.groupby('week').sum()