In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import lxml


In [3]:
ids = pd.read_csv('../assets/data/2019-UT-ASD-Full/2019-01-01--2019-06-30-idetifiers.csv')

In [4]:
ids.drop_duplicates(inplace=True)
ids['RideInstance'] = ids.Date + '#' + ids.RideId.astype(str)
print(ids.shape)
ids.head(3)

(46399, 4)


Unnamed: 0,Date,RideId,TrainId,RideInstance
0,2018-12-31,1405,1405,2018-12-31#1405
1,2018-12-31,1402,1402,2018-12-31#1402
2,2019-01-01,1409,1409,2019-01-01#1409


In [5]:
def safetext(tag):
    return tag.text.strip() if tag is not None else None

def safefindtext(tag):
    return tag.find(text=True).strip() if tag is not None else None
    
def get_platform(ride):
    try:
        columns = ride.find_all('div', recursive=False)
        return safetext(columns[3])
    except IndexError:
        return None

def get_materials(ride):
    return ';'.join([material.get('title') for material in ride.select('div.material-parts .material-part')])

def blank_scrape(data):
    for column in scrapecols:
        data[column].append(None)


In [18]:
dfresults = pd.read_csv('../assets/data/2019-UT-ASD-Full/2019-07-01--2019-12-31_scrapped.csv')
dfresults['RideInstance'] = dfresults.Date + '#' + dfresults.RideId.astype(str)
print(dfresults.shape)


(1498, 16)


In [41]:
# Get all the rides we have scrapped to make sure we don't scrape them again

dfresults = pd.read_csv('../assets/data/2019-UT-ASD-Full/2019-01-01--2019-06-30_scrapped.csv')
dfresults['RideInstance'] = dfresults.Date + '#' + dfresults.RideId.astype(str)
print(dfresults.shape)


from requests import get
data = {}
scrapecols = ['DepartureStation', 'DepartureTime', 'DepartureDelay', 'DestinationStation','ArrivalTime',  'ArrivalDelay', 
               'DeparturePlatform', 'ArrivalPlatform', 'DepartureMaterials', 'ArrivalMaterials', 'InbetweenStations', 'FaultMessages']
colnames = ['Date', 'RideId', 'TrainId', *scrapecols]

for column in colnames:
    data[column] = []

#Change this .loc slice to scrape a different interval, but make sure you write the previous interval to a file
for day, rid, tid, instance in ids[['Date', 'RideId', 'TrainId', 'RideInstance']].loc[40001:46399].values:

    # Skip if we already scrapped this
    if instance in dfresults.RideInstance.values:
        continue

    search_url = f'https://www.rijdendetreinen.nl/en/train-archive/{day}/{rid}'
    result = get(search_url) 

    search_soup = BeautifulSoup(result.text, features='lxml')

    data['Date'].append(day)
    data['RideId'].append(rid)
    data['TrainId'].append(tid)

    # If the site returns 404 make everything blank
    if result.status_code == 404:
        blank_scrape(data)
        print('404 cannot find ride', search_url)
        continue

    origin = None
    destination = None
    inbetween = []

    # find every single stop of that specific tide
    stops = search_soup.find('div', { 'class': 'train-archive'}).select('div.row.service')

    if len(stops) == 0:
        blank_scrape(data)
        print('no entries found for ride', search_url)
        continue

    for ride in stops:

        station = safetext(ride.find('a'))

        # ignore everything before the first station that interests us and after the last
        if station not in ['Amsterdam Centraal', 'Utrecht Centraal']:
            if origin and not destination:
                inbetween.append(station)
            continue

        
        if origin is None:
            origin = station
            data['DepartureStation'].append(origin)

            data['DepartureTime'].append(safefindtext(ride.select_one('div.departure-time')))
            data['DepartureDelay'].append(safefindtext(ride.select_one('div.departure-time .label')))
            
            data['DeparturePlatform'].append(get_platform(ride))
            data['DepartureMaterials'].append(get_materials(ride))

            data['FaultMessages'].append(';'.join([message.text.strip() for message in ride.select('.text-danger')]))             
        else:
            destination = station
            data['DestinationStation'].append(destination)
            data['ArrivalTime'].append(safefindtext(ride.select_one('div.arrival-time')) )
            data['ArrivalDelay'].append(safefindtext(ride.select_one('div.arrival-time .label')))

            data['ArrivalPlatform'].append(get_platform(ride))
            data['ArrivalMaterials'].append(get_materials(ride))

            data['InbetweenStations'].append(';'.join(inbetween))
    
    # We couldn't find Amsterdam or Utrech in the list
    if origin is None:
        blank_scrape(data)
        print('Could not find Amsterdam or Utrecht stations', search_url)

    # We found Amsterdam or Utrech but did not find the other one, the destination is None
    elif destination is None:
        data['DestinationStation'].append(None)
        data['ArrivalTime'].append(None)
        data['ArrivalDelay'].append(None)

        data['ArrivalPlatform'].append(None)
        data['ArrivalMaterials'].append(None)

        data['InbetweenStations'].append(';'.join(inbetween))
        print('Found', origin,'but could not find the destination in', search_url)


(40001, 16)
Could not find Amsterdam or Utrecht stations https://www.rijdendetreinen.nl/en/train-archive/2019-06-06/11413
Could not find Amsterdam or Utrecht stations https://www.rijdendetreinen.nl/en/train-archive/2019-06-07/11413
Found Utrecht Centraal but could not find the destination in https://www.rijdendetreinen.nl/en/train-archive/2019-06-07/23403
Found Utrecht Centraal but could not find the destination in https://www.rijdendetreinen.nl/en/train-archive/2019-06-08/23403
Could not find Amsterdam or Utrecht stations https://www.rijdendetreinen.nl/en/train-archive/2019-06-14/11413
Found Utrecht Centraal but could not find the destination in https://www.rijdendetreinen.nl/en/train-archive/2019-06-14/23403
Found Utrecht Centraal but could not find the destination in https://www.rijdendetreinen.nl/en/train-archive/2019-06-15/23403
Could not find Amsterdam or Utrecht stations https://www.rijdendetreinen.nl/en/train-archive/2019-06-19/300824
Could not find Amsterdam or Utrecht station

In [42]:
for key in data:
    print(key, len(data[key]))

Date 6398
RideId 6398
TrainId 6398
DepartureStation 6398
DepartureTime 6398
DepartureDelay 6398
DestinationStation 6398
ArrivalTime 6398
ArrivalDelay 6398
DeparturePlatform 6398
ArrivalPlatform 6398
DepartureMaterials 6398
ArrivalMaterials 6398
InbetweenStations 6398
FaultMessages 6398


In [43]:
df = pd.concat([ dfresults, pd.DataFrame(data)])
df['RideInstance'] = df.Date + '#' + df.RideId.astype(str)
df

Unnamed: 0,Date,RideId,TrainId,DepartureStation,DepartureTime,DepartureDelay,DestinationStation,ArrivalTime,ArrivalDelay,DeparturePlatform,ArrivalPlatform,DepartureMaterials,ArrivalMaterials,InbetweenStations,FaultMessages,RideInstance
0,2018-12-31,1405,1405,Utrecht Centraal,01:01,+1,Amsterdam Centraal,01:29,+1½,7,2a,VIRM-4 9516;VIRM-6 8671,VIRM-4 9516;VIRM-6 8671,Amsterdam Bijlmer ArenA,,2018-12-31#1405
1,2018-12-31,1402,1402,Amsterdam Centraal,01:18,,Utrecht Centraal,01:53,,2,15,VIRM-6 8730;VIRM-6 8656,,Amsterdam Bijlmer ArenA,,2018-12-31#1402
2,2019-01-01,1409,1409,Utrecht Centraal,02:17,,Amsterdam Centraal,02:44,+1,15,2a,VIRM-6 8656;VIRM-6 8730,VIRM-6 8656;VIRM-6 8730,Amsterdam Bijlmer ArenA,,2019-01-01#1409
3,2018-12-31,1406,1406,Amsterdam Centraal,02:19,+1½,Utrecht Centraal,02:45,,2b,14,VIRM-6 8648;VIRM-4 9504,,,,2018-12-31#1406
4,2019-01-01,1413,1413,Utrecht Centraal,03:11,,Amsterdam Centraal,03:44,,14,2a,VIRM-4 9504;VIRM-6 8648,VIRM-4 9504;VIRM-6 8648,,,2019-01-01#1413
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6393,2019-06-30,3089,3089,Amsterdam Centraal,23:54,,Utrecht Centraal,00:22,,4b,15,VIRM-6 8730,,Amsterdam Amstel,,2019-06-30#3089
6394,2019-06-30,2989,2989,Amsterdam Centraal,23:40,,Utrecht Centraal,00:08,+3½,4b,18,VIRM-6 8666,VIRM-6 8666,Amsterdam Amstel;Amsterdam Bijlmer ArenA,,2019-06-30#2989
6395,2019-06-30,2986,2986,Utrecht Centraal,00:23,+2,Amsterdam Centraal,00:52,,7,10a,VIRM-6 8719,,Amsterdam Bijlmer ArenA;Amsterdam Amstel,,2019-06-30#2986
6396,2019-06-30,203091,203091,Amsterdam Centraal,00:24,,Utrecht Centraal,00:52,,4,19,VIRM-6 8633;VIRM-4 9512,VIRM-6 8633;VIRM-4 9512,Amsterdam Amstel,,2019-06-30#203091


In [44]:
df.to_csv('gg.csv',index=None)
df.to_csv('../assets/data/2019-UT-ASD-Full/2019-01-01--2019-06-30_scrapped.csv',index=None)