The code in this notebook is used to generate a csv dataset from the information on the website https://dashboards.nsjaarverslag.nl/reizigersgedrag regarding NS station data.

# Load Libraries

In [3]:
import pandas as pd
import requests
import time
from bs4 import BeautifulSoup
from selenium import webdriver  
from selenium.common.exceptions import NoSuchElementException  
from selenium.webdriver.common.keys import Keys  

##### Function that reads a single webpage and returns the results as a dataframe
**Access to the computer's webdriver is required**

In [4]:
def read_ns_data(station: str, year: str, wait_seconds: float = 3) -> pd.DataFrame:
    """
    Scrape the KPI statistics of one NS station for one year.

    Input: NS station (as used in the dashboard URL) and year, both as strings.
           wait_seconds is how long to wait after requesting the page before
           reading its source (default 3, the value used originally).
    Method: Use Selenium which uses local browser to load a webpage
    Reason: NS page is dynamic (not static) therefore the page loads in several phases (Due to animations)
    Output: One-row DataFrame indexed by (Station, Year) with one column per KPI.
            A KPI that appears twice on the page is split into '<title>_voor'
            and '<title>_na'; a KPI that appears more than twice is split into
            numbered columns '<title>_1', '<title>_2', ...

    """
    url = "https://dashboards.nsjaarverslag.nl/reizigersgedrag/" + station + "?dtYear=" + year
    browser = webdriver.Safari()  # Uses Safari, Can be modified to use Chrome
    try:
        browser.get(url)
        # Give the animated page time to finish rendering before reading it
        time.sleep(wait_seconds)
        html_source = browser.page_source
    finally:
        # Always release the browser session, even when loading the page fails
        browser.quit()

    soup = BeautifulSoup(html_source, 'html.parser')

    # Group the scraped values by the KPI title that precedes them
    kpi_values = {}
    for container in soup.find_all(class_='db-kpi_value-container'):
        title_element = container.find_previous(class_='db-kpi_title')
        value_element = container.find('span', class_='db-kpi_value')
        if title_element is None or value_element is None:
            # Container without a recognisable title or value: nothing to record
            continue
        title = title_element.text.strip()
        kpi_values.setdefault(title, []).append(value_element.text.strip())

    if not kpi_values:
        import warnings
        warnings.warn("No KPI values found for station '" + station + "' in year " + year)

    # Flatten to exactly one scalar per column so the result is always a single row
    row = {}
    for title, values in kpi_values.items():
        if len(values) == 1:
            row[title] = values[0]
        elif len(values) == 2:
            row[title + '_voor'] = values[0]
            row[title + '_na'] = values[1]
        else:
            for position, value in enumerate(values, start=1):
                row[title + '_' + str(position)] = value

    index = pd.MultiIndex.from_tuples([(station, year)], names=['Station', 'Year'])
    # Passing the index explicitly lets pandas build the row from scalar values
    return pd.DataFrame(row, index=index)

#### Test function

In [5]:
# Smoke test: scrape a single station/year and display the resulting row
read_ns_data(station='delft', year='2019')

Unnamed: 0_level_0,Unnamed: 1_level_0,Reizigers per dag,Klantoordeel,In- en uitstappers,Overstappers,Ochtendspits,Avondspits,Daluren,Lopend_voor,Lopend_na,Fiets_voor,Fiets_na,Bus/tram/metro_voor,Bus/tram/metro_na,Auto (bestuurder)_voor,Auto (bestuurder)_na,Auto (passagier)_voor,Auto (passagier)_na,(Deel)taxi_voor,(Deel)taxi_na
Station,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
delft,2019,40.818,92%,40.435,383,22%,19%,59%,27%,38%,53%,37%,17%,21%,1%,1%,2%,3%,0%,0%


#### Create a list of all NS stations in South and North Holland and the list of years for which there is information

In [6]:
# Station names for South Holland, written the way read_ns_data puts them in the URL
ns_stations_south = [
    'alphen-aan-den-rijn',
    'barendrecht',
    'bodegraven',
    'boskoop', 'boskoop-snijdelwijk',
    'capelle-schollevaar',
    'de-vink',
    'delft', 'delft-campus',
    'den-haag-centraal', 'den-haag-hs', 'den-haag-laan-van-noi',
    'den-haag-mariahoeve', 'den-haag-moerwijk', 'den-haag-ypenburg',
    'dordrecht', 'dordrecht-zuid',
    'gouda', 'gouda-goverwelle',
    'hillegom',
    'lansingerland-zoetermeer',
    'leiden-centraal', 'leiden-lammenschans',
    'nieuwerkerk-a-d-ijssel',
    'rijswijk',
    'rotterdam-alexander', 'rotterdam-blaak', 'rotterdam-centraal',
    'rotterdam-lombardijen', 'rotterdam-noord', 'rotterdam-zuid',
    'sassenheim',
    'schiedam-centrum',
    'voorburg', 'voorhout', 'voorschoten',
    'waddinxveen', 'waddinxveen-noord', 'waddinxveen-triangel',
    'zoetermeer', 'zoetermeer-oost',
    'zwijndrecht',
]

# Station names for North Holland
ns_stations_north = [
    'alkmaar', 'alkmaar-noord',
    'amsterdam-amstel', 'amsterdam-bijlmer-arena', 'amsterdam-centraal',
    'amsterdam-holendrecht', 'amsterdam-lelylaan', 'amsterdam-muiderpoort',
    'amsterdam-rai', 'amsterdam-science-park', 'amsterdam-sloterdijk',
    'amsterdam-zuid',
    'anna-paulowna',
    'beverwijk',
    'bloemendaal',
    'bovenkarspel-flora', 'bovenkarspel-grootebroek',
    'bussum-zuid',
    'castricum',
    'den-helder', 'den-helder-zuid',
    'diemen', 'diemen-zuid',
    'driehuis',
    'duivendrecht',
    'enkhuizen',
    'haarlem', 'haarlem-spaarnwoude',
    'halfweg-zwanenburg',
    'heemskerk',
    'heemstede-aerdenhout',
    'heerhugowaard',
    'heiloo',
    'hilversum', 'hilversum-media-park', 'hilversum-sportpark',
    'hollandsche-rading',
    'hoofddorp',
    'hoogkarspel',
    'hoorn', 'hoorn-kersenboogerd',
    'koog-aan-de-zaan',
    'krommenie-assendelft',
    'naarden-bussum',
    'nieuw-vennep',
    'obdam',
    'overveen',
    'purmerend', 'purmerend-overwhere', 'purmerend-weidevenne',
    'santpoort-noord', 'santpoort-zuid',
    'schagen',
    'schiphol-airport',
    'uitgeest',
    'weesp',
    'wormerveer',
    'zaandam', 'zaandam-kogerveld',
    'zaandijk-zaanse-schans',
    'zandvoort-aan-zee',
]

# All stations to scrape: South Holland first, then North Holland
ns_stations = [*ns_stations_south, *ns_stations_north]

# Years requested from the dashboard, as strings because they are pasted into the URL
years = ['2019', '2020', '2021', '2022']

#### Create Dataframe for all stations/years

In [None]:
# Create Dataframe

# Scrape every station/year page and keep the one-row results in a list.
# Concatenating once at the end avoids re-copying the growing frame on every iteration.
station_year_frames = []
for station in ns_stations:
    for year in years:
        # Call the read_ns_data function
        station_year_frames.append(read_ns_data(station, year))

# pd.concat raises on an empty list, so fall back to an empty frame in that case
df_ns_data = pd.concat(station_year_frames) if station_year_frames else pd.DataFrame()
df_ns_data.to_csv('unfiltered_ns_data_Holland.csv') # save unfiltered dataset to csv

#### Filter dataframe

In [None]:
# Filter Dataframe

# Read the passenger-count columns as text; multiply_by_1000 below inspects them as strings
count_columns_as_text = dict.fromkeys(['Reizigers per dag', 'In- en uitstappers', 'Overstappers'], str)
df_ns = pd.read_csv('unfiltered_ns_data_Holland.csv', dtype=count_columns_as_text) # load the csv

# Account for passenger values over 1000 which are displayed with a dot
def multiply_by_1000(value):
    """Convert a dotted passenger count such as '40.818' to the integer 40818.

    The dot is treated as a thousands separator and removed, so the result is
    exact ('1.001' -> 1001) and values with several separators also work
    ('1.234.567' -> 1234567). Going through float and multiplying by 1000
    can land just below the intended integer and be truncated.

    Input: a cell value, normally a string.
    Output: an int when the string contains a dot; otherwise the value is
            returned unchanged (strings without a dot, and non-string cells
            such as NaN).
    Raises: ValueError when a dotted string is not numeric.
    """
    # Non-string cells (presumably NaN for missing data) cannot be searched for a dot
    if not isinstance(value, str) or '.' not in value:
        return value
    return int(value.strip().replace('.', ''))
# 'In- en uitstappers','Overstappers'
# Convert each passenger-count column with multiply_by_1000
for count_column in ('Reizigers per dag', 'In- en uitstappers', 'Overstappers'):
    df_ns[count_column] = df_ns[count_column].apply(multiply_by_1000)

df_ns.to_csv('df_ns_Holland.csv') #save dataframe


# df_ns dataframe is now ready to use

#### Test

In [None]:
# extract data for a specific station on a specific year
is_station = df_ns['Station'] == 'alphen-aan-den-rijn'
is_year = df_ns['Year'] == 2019
df_ns[is_station & is_year]

In [None]:
# extract data for a specific year (first rows only)
is_2019 = df_ns['Year'] == 2019
df_ns.loc[is_2019].head()