In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import lxml


In [2]:
ids = pd.read_csv('../assets/data/2019-UT-ASD-Full/2019-07-01--2019-12-31-identifiers.csv')

In [3]:
ids.drop_duplicates(inplace=True)
ids['RideInstance'] = ids.Date + '#' + ids.RideId.astype(str)
print(ids.shape)
ids.head(3)

(48587, 4)


Unnamed: 0,Date,RideId,TrainId,RideInstance
0,2019-06-30,7393,7393,2019-06-30#7393
1,2019-06-30,2981,2981,2019-06-30#2981
2,2019-06-30,2985,2985,2019-06-30#2985


In [6]:
def safetext(tag):
    return tag.text.strip() if tag is not None else None

def safefindtext(tag):
    return tag.find(text=True).strip() if tag is not None else None
    
def get_platform(ride):
    try:
        columns = ride.find_all('div', recursive=False)
        return safetext(columns[3])
    except IndexError:
        return None

def get_materials(ride):
    return ';'.join([material.get('title') for material in ride.select('div.material-parts .material-part')])

def blank_scrape(data):
    for column in scrapecols:
        data[column].append(None)


In [18]:
dfresults = pd.read_csv('../assets/data/2019-UT-ASD-Full/2019-07-01--2019-12-31_scrapped.csv')
dfresults['RideInstance'] = dfresults.Date + '#' + dfresults.RideId.astype(str)
print(dfresults.shape)


(1498, 16)


In [13]:
# Get all the rides we have scrapped to make sure we don't scrape them again

dfresults = pd.read_csv('../assets/data/2019-UT-ASD-Full/2019-07-01--2019-12-31_scrapped.csv')
dfresults['RideInstance'] = dfresults.Date + '#' + dfresults.RideId.astype(str)
print(dfresults.shape)


from requests import get
data = {}
scrapecols = ['DepartureStation', 'DepartureTime', 'DepartureDelay', 'DestinationStation','ArrivalTime',  'ArrivalDelay', 
               'DeparturePlatform', 'ArrivalPlatform', 'DepartureMaterials', 'ArrivalMaterials', 'InbetweenStations', 'FaultMessages']
colnames = ['Date', 'RideId', 'TrainId', *scrapecols]

for column in colnames:
    data[column] = []

#Change this .loc slice to scrape a different interval, but make sure you write the previous interval to a file
for day, rid, tid, instance in ids[['Date', 'RideId', 'TrainId', 'RideInstance']].loc[1500:2500].values:

    # Skip if we already scrapped this
    if instance in dfresults.RideInstance.values:
        continue

    search_url = f'https://www.rijdendetreinen.nl/en/train-archive/{day}/{rid}'
    result = get(search_url) 

    search_soup = BeautifulSoup(result.text, features='lxml')

    data['Date'].append(day)
    data['RideId'].append(rid)
    data['TrainId'].append(tid)

    # If the site returns 404 make everything blank
    if result.status_code == 404:
        blank_scrape(data)
        print('404 cannot find ride', search_url)
        continue

    origin = None
    destination = None
    inbetween = []

    # find every single stop of that specific tide
    stops = search_soup.find('div', { 'class': 'train-archive'}).select('div.row.service')

    if len(stops) == 0:
        blank_scrape(data)
        print('no entries found for ride', search_url)
        continue

    for ride in stops:

        station = safetext(ride.find('a'))

        # ignore everything before the first station that interests us and after the last
        if station not in ['Amsterdam Centraal', 'Utrecht Centraal']:
            if origin and not destination:
                inbetween.append(station)
            continue

        
        if origin is None:
            origin = station
            data['DepartureStation'].append(origin)

            data['DepartureTime'].append(safefindtext(ride.select_one('div.departure-time')))
            data['DepartureDelay'].append(safefindtext(ride.select_one('div.departure-time .label')))
            
            data['DeparturePlatform'].append(get_platform(ride))
            data['DepartureMaterials'].append(get_materials(ride))

            data['FaultMessages'].append(';'.join([message.text.strip() for message in ride.select('.text-danger')]))             
        else:
            destination = station
            data['DestinationStation'].append(destination)
            data['ArrivalTime'].append(safefindtext(ride.select_one('div.arrival-time')) )
            data['ArrivalDelay'].append(safefindtext(ride.select_one('div.arrival-time .label')))

            data['ArrivalPlatform'].append(get_platform(ride))
            data['ArrivalMaterials'].append(get_materials(ride))

            data['InbetweenStations'].append(';'.join(inbetween))
    
    # We couldn't find Amsterdam or Utrech in the list
    if origin is None:
        blank_scrape(data)
        print('Could not find Amsterdam or Utrecht stations', search_url)

    # We found Amsterdam or Utrech but did not find the other one, the destination is None
    elif destination is None:
        data['DestinationStation'].append(None)
        data['ArrivalTime'].append(None)
        data['ArrivalDelay'].append(None)

        data['ArrivalPlatform'].append(None)
        data['ArrivalMaterials'].append(None)

        data['InbetweenStations'].append(';'.join(inbetween))
        print('Found', origin,'but could not find the destination in', search_url)


(1498, 16)
Found Utrecht Centraal but could not find the destination in https://www.rijdendetreinen.nl/en/train-archive/2019-07-09/300839
Could not find Amsterdam or Utrecht stations https://www.rijdendetreinen.nl/en/train-archive/2019-07-11/11409
Could not find Amsterdam or Utrecht stations https://www.rijdendetreinen.nl/en/train-archive/2019-07-11/11413


In [14]:
for key in data:
    print(key, len(data[key]))

Date 1000
RideId 1000
TrainId 1000
DepartureStation 1000
DepartureTime 1000
DepartureDelay 1000
DestinationStation 1000
ArrivalTime 1000
ArrivalDelay 1000
DeparturePlatform 1000
ArrivalPlatform 1000
DepartureMaterials 1000
ArrivalMaterials 1000
InbetweenStations 1000
FaultMessages 1000


In [20]:
df = pd.concat([ dfresults, pd.DataFrame(data)])
df['RideInstance'] = df.Date + '#' + df.RideId.astype(str)
df

Unnamed: 0,Date,RideId,TrainId,DepartureStation,DepartureTime,DepartureDelay,DestinationStation,ArrivalTime,ArrivalDelay,DeparturePlatform,ArrivalPlatform,DepartureMaterials,ArrivalMaterials,InbetweenStations,FaultMessages,RideInstance
0,2019-06-30,7393,7393,Amsterdam Centraal,00:28,,Utrecht Centraal,01:10,,2b,14,SLT-4 2435;SLT-6 2648,,Amsterdam Muiderpoort;Amsterdam Amstel;Duivend...,,2019-06-30#7393
1,2019-06-30,2981,2981,Amsterdam Centraal,21:40,,Utrecht Centraal,22:07,,4b,18,VIRM-4 9480,VIRM-4 9480,Amsterdam Amstel,,2019-06-30#2981
2,2019-06-30,2985,2985,Amsterdam Centraal,22:40,,Utrecht Centraal,23:07,,4b,18,VIRM-6 8649,VIRM-6 8649,Amsterdam Amstel,,2019-06-30#2985
3,2019-06-30,3087,3087,Amsterdam Centraal,23:24,,Utrecht Centraal,23:52,,4a,19,VIRM-6 8621,VIRM-6 8621,Amsterdam Amstel,,2019-06-30#3087
4,2019-06-30,2980,2980,Utrecht Centraal,22:53,+1½,Amsterdam Centraal,23:21,,7,8a,VIRM-6 8676,VIRM-6 8676,Amsterdam Bijlmer ArenA;Amsterdam Amstel,,2019-06-30#2980
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2019-07-11,3953,3953,Amsterdam Centraal,14:45,,Utrecht Centraal,15:12,,4b,18,VIRM-4 9560,VIRM-4 9560,Amsterdam Amstel,,2019-07-11#3953
996,2019-07-11,3943,303943,Amsterdam Centraal,12:14,,Utrecht Centraal,12:40,,5b,18,VIRM-6 8717,VIRM-6 8717,Amsterdam Amstel,,2019-07-11#3943
997,2019-07-11,3948,3948,Utrecht Centraal,14:48,,Amsterdam Centraal,15:15,,7,8a,VIRM-4 9580,VIRM-4 9580,Amsterdam Amstel,,2019-07-11#3948
998,2019-07-11,7446,7446,Utrecht Centraal,14:37,,Amsterdam Centraal,15:20,,14,7b,SGMM-3 2994;SGMM-2 2142,SGMM-3 2994;SGMM-2 2142,Utrecht Zuilen;Maarssen;Breukelen;Abcoude;Amst...,,2019-07-11#7446


In [21]:
df.to_csv('gg.csv',index=None)
df.to_csv('../assets/data/2019-UT-ASD-Full/2019-07-01--2019-12-31_scrapped.csv',index=None)