In [148]:
from bs4 import BeautifulSoup
import pandas as pd
import lxml


In [149]:
ids = pd.read_csv('../assets/data/2019-UT-ASD-Full/2019-07-01--2019-12-31-identifiers.csv')

In [199]:
ids.drop_duplicates(inplace=True)
ids['RideInstance'] = ids.Date + '#' + ids.RideId.astype(str)
print(ids.shape)
ids.head(3)

(48587, 4)


Unnamed: 0,Date,RideId,TrainId,RideInstance
0,2019-06-30,7393,7393,2019-06-30#7393
1,2019-06-30,2981,2981,2019-06-30#2981
2,2019-06-30,2985,2985,2019-06-30#2985


In [151]:
def safetext(tag):
    return tag.text.strip() if tag is not None else None

def safefindtext(tag):
    return tag.find(text=True).strip() if tag is not None else None
    
def get_platform(ride):
    try:
        columns = ride.find_all('div', recursive=False)
        return safetext(columns[3])
    except IndexError:
        return None

def get_materials(ride):
    return ';'.join([material.get('title') for material in ride.select('div.material-parts .material-part')])



In [198]:
dfresults = pd.read_csv('../assets/data/2019-UT-ASD-Full/2019-07-01--2019-12-31_scrapped.csv')
dfresults['RideInstance'] = dfresults.Date + '#' + dfresults.RideId.astype(str)
print(dfresults.shape)
dfresults.sample(3)


(998, 16)


Unnamed: 0,Date,RideId,TrainId,DepartureStation,DepartureTime,DepartureDelay,DestinationStation,ArrivalTime,ArrivalDelay,DeparturePlatform,ArrivalPlatform,DepartureMaterials,ArrivalMaterials,InbetweenStations,FaultMessages,RideInstance
73,2019-07-01,222,222,Utrecht Centraal,09:03,+2½,Amsterdam Centraal,09:28,,5,7b,ICE-3M 4651,,,,2019-07-01#222
41,2019-07-01,3016,3016,Utrecht Centraal,07:08,,Amsterdam Centraal,07:35,,5,8,VIRM-6 8633;VIRM-4 9512,VIRM-6 8633;VIRM-4 9512,Amsterdam Amstel,,2019-07-01#3016
617,2019-07-03,3018,3018,Utrecht Centraal,07:38,,Amsterdam Centraal,08:05,,5,8,VIRM-4 9592;VIRM-6 8648,VIRM-4 9592;VIRM-6 8648,Amsterdam Amstel,,2019-07-03#3018


In [200]:
from requests import get
data = {}
scrapecols = ['DepartureStation', 'DepartureTime', 'DepartureDelay', 'DestinationStation','ArrivalTime',  'ArrivalDelay', 
               'DeparturePlatform', 'ArrivalPlatform', 'DepartureMaterials', 'ArrivalMaterials', 'InbetweenStations', 'FaultMessages']
colnames = ['Date', 'RideId', 'TrainId', *scrapecols]

for column in colnames:
    data[column] = []


for day, rid, tid, instance in ids[['Date', 'RideId', 'TrainId', 'RideInstance']].loc[1000:5000].values:

    if instance in dfresults.RideInstance.values:
        continue

    search_url = f'https://www.rijdendetreinen.nl/en/train-archive/{day}/{rid}'
    result = get(search_url) 

    search_soup = BeautifulSoup(result.text, features='lxml')

    data['Date'].append(day)
    data['RideId'].append(rid)
    data['TrainId'].append(tid)

    if result.status_code == 404:
        for column in scrapecols:
            data[column].append(None)
        print('404 cannot find ride', search_url)
        continue

    origin = None
    destination = None
    inbetween = []
    rides = search_soup.find('div', { 'class': 'train-archive'}).select('div.row.service')

    if len(rides) == 0:
        for column in scrapecols:
            data[column].append(None)
        print('no entries found for ride', search_url)
        continue

    for ride in rides:

        station = safetext(ride.find('a'))
        if station not in ['Amsterdam Centraal', 'Utrecht Centraal']:
            if origin and not destination:
                inbetween.append(station)
            continue

        if origin is None:
            origin = station
            data['DepartureStation'].append(origin)

            data['DepartureTime'].append(safefindtext(ride.select_one('div.departure-time')))
            data['DepartureDelay'].append(safefindtext(ride.select_one('div.departure-time .label')))
            
            data['DeparturePlatform'].append(get_platform(ride))
            data['DepartureMaterials'].append(get_materials(ride))

            data['FaultMessages'].append(';'.join([message.text.strip() for message in ride.select('.text-danger')]))             
        else:
            destination = station
            data['DestinationStation'].append(destination)
            data['ArrivalTime'].append(safefindtext(ride.select_one('div.arrival-time')) )
            data['ArrivalDelay'].append(safefindtext(ride.select_one('div.arrival-time .label')))

            data['ArrivalPlatform'].append(get_platform(ride))
            data['ArrivalMaterials'].append(get_materials(ride))

            data['InbetweenStations'].append(';'.join(inbetween))
    if origin is None:
        for column in scrapecols:
            data[column].append(None)
        print('Could not find Amsterdam or Utrecht stations', search_url)
        continue


Could not find Amsterdam or Utrecht stations https://www.rijdendetreinen.nl/en/train-archive/2019-07-05/11409
Could not find Amsterdam or Utrecht stations https://www.rijdendetreinen.nl/en/train-archive/2019-07-05/11413
Could not find Amsterdam or Utrecht stations https://www.rijdendetreinen.nl/en/train-archive/2019-07-11/11409
Could not find Amsterdam or Utrecht stations https://www.rijdendetreinen.nl/en/train-archive/2019-07-11/11413
Could not find Amsterdam or Utrecht stations https://www.rijdendetreinen.nl/en/train-archive/2019-07-12/11409
Could not find Amsterdam or Utrecht stations https://www.rijdendetreinen.nl/en/train-archive/2019-07-12/11413
Could not find Amsterdam or Utrecht stations https://www.rijdendetreinen.nl/en/train-archive/2019-07-12/300844
Could not find Amsterdam or Utrecht stations https://www.rijdendetreinen.nl/en/train-archive/2019-07-18/11409
Could not find Amsterdam or Utrecht stations https://www.rijdendetreinen.nl/en/train-archive/2019-07-18/11413
Could not

In [201]:
for key in data:
    print(key, len(data[key]))

Date 4000
RideId 4000
TrainId 4000
DepartureStation 4000
DepartureTime 4000
DepartureDelay 4000
DestinationStation 3993
ArrivalTime 3993
ArrivalDelay 3993
DeparturePlatform 4000
ArrivalPlatform 3993
DepartureMaterials 4000
ArrivalMaterials 3993
InbetweenStations 3993
FaultMessages 4000


In [202]:
df = pd.concat([ dfresults, pd.DataFrame(data)])
df['RideInstance'] = df.Date + '#' + df.RideId.astype(str)
df

ValueError: arrays must all be same length

In [None]:
df.to_csv('gg.csv',index=None)
df.to_csv('../assets/data/2019-UT-ASD-Full/2019-07-01--2019-12-31_scrapped.csv',index=None)