In [2]:
import requests
from bs4 import BeautifulSoup, SoupStrainer
import re
import pandas as pd
import time
import pickle

Website with ambulance data for the Amsterdam-Amstelland region: http://www.112-nederland.nl/alerts/regiodetails.aspx?from=617475&vreg=5168&name=Amsterdam-Amstelland  
Each page lists 15 calls and the `from` value in the URL goes up in steps of 15, so look up the current last page number on the site again before you run the code.  


In [None]:
# List all pages with ambulance calls.
# The site pages through the calls with a `from` offset in the URL.
# Set LAST_PAGE_OFFSET manually to the offset of the last page before running.

CALLS_PER_PAGE = 15
LAST_PAGE_OFFSET = 617475
PAGE_URL_TEMPLATE = ('http://www.112-nederland.nl/alerts/regiodetails.aspx'
                     '?from={}&vreg=5168&name=Amsterdam-Amstelland')

# range() excludes its stop value, so the stop is LAST_PAGE_OFFSET + 1 to make
# sure the last page itself is part of the list.
to_scrape_list = [
    PAGE_URL_TEMPLATE.format(offset)
    for offset in range(0, LAST_PAGE_OFFSET + 1, CALLS_PER_PAGE)
]

In [None]:
def ambulancecalls_scraper(url_list, name_pickle, timeout=60):
    """Scrape ambulance calls from http://www.112-nederland.nl.

    Every url is downloaded and the list items inside the element with id
    'alert-ambulance' are turned into one record per call. After each page
    the records collected so far are written to ``name_pickle + '.p'`` so
    that a crash does not lose the work already done. Once all urls are
    processed the records are written to ``name_pickle + '.csv'``.

    Parameters:
        url_list: a list with all urls to scrape
        name_pickle: str with the name (without extension) of the pickle/csv
            to be saved
        timeout: seconds to wait for the server on each request
    Returns:
        None. The results are only written to disk: a pickled list of dicts
        with the keys date, time, address and descr, and a .csv file with
        the same four columns.
    Raises:
        requests.exceptions.RequestException: when a page cannot be
            downloaded in time or the server answers with an error status.
    """
    all_calls_list = []
    whitespace_run = re.compile(r'\s+')
    for url in url_list:
        # Without a timeout a stalled connection would block the scrape forever.
        page = requests.get(url, timeout=timeout)
        # Stop on an error page instead of silently recording zero calls for it.
        page.raise_for_status()
        strainer = SoupStrainer(id='alert-ambulance')
        soup = BeautifulSoup(page.content, 'lxml', parse_only=strainer)
        alerts_list = soup.find_all('li')
        # A page with fewer than four items has nothing at index 3 to remove.
        if len(alerts_list) > 3:
            del alerts_list[3]  # remove advertisement
        # The last list item is skipped; presumably it is not a call -- TODO confirm.
        for alert in alerts_list[:-1]:
            calls_dict = {}
            # Heading tokens: date, time (its last character is dropped),
            # the address tokens, and four trailing tokens that are not kept.
            heading = alert.find('h4', class_="media-heading").get_text().split()
            calls_dict['date'] = heading[0]
            calls_dict['time'] = heading[1][:-1]
            calls_dict['address'] = ' '.join(heading[2:-4])
            descr = alert.find('div', itemprop="description").get_text()
            # Collapse every run of whitespace into a single space.
            calls_dict['descr'] = whitespace_run.sub(' ', descr)
            all_calls_list.append(calls_dict)
        # Checkpoint once per page (in case crash occurs); the with-block
        # closes the file handle after every write.
        with open(name_pickle + '.p', 'wb') as checkpoint_file:
            pickle.dump(all_calls_list, checkpoint_file)
    all_calls_df = pd.DataFrame(all_calls_list, columns=['date','time','address','descr'])
    all_calls_df.to_csv(name_pickle + '.csv', index=False)
        

In [None]:
# Scrape every listed page and report how long the run took.

start = time.time()
ambulancecalls_scraper(to_scrape_list, 'scrape_results_page0-617475')
end = time.time()

elapsed_minutes = (end - start) / 60
print("Elapsed time in minutes: {}".format(elapsed_minutes))

In [None]:
# for checking last scraped page

# NOTE: pickle.load can execute arbitrary code, so only load checkpoint files
# that were written by this notebook itself.
# The with-block closes the file handle once the checkpoint is loaded.
with open('scrape_results_page0-617475.p', 'rb') as checkpoint_file:
    part1 = pickle.load(checkpoint_file)

In [16]:
# Get addresses of ambulance stands

url = 'https://ambulanceamsterdam.nl/regio-amsterdam-amstelland'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html5lib')
soup_contact_list = soup.find_all(class_='contactgegeven')

# 7 stands, named naam1-naam7
list_names = ['naam' + str(number) for number in range(1, 8)]

# Position of every contact element whose markup mentions one of the stand
# names (an element is listed once per name it contains).
indices = []
for position, element in enumerate(soup_contact_list):
    markup = str(element)
    for stand_name in list_names:
        if stand_name in markup:
            indices.append(position)


def _plain_text(element):
    """Return the element's text with non-breaking spaces replaced by spaces."""
    return element.get_text().replace(u'\xa0', u' ')


# The two elements that follow a stand's name element are read as its address lines.
ambulancestands_list = [
    {
        'name_stand': soup_contact_list[position].strong.get_text(),
        'address1': _plain_text(soup_contact_list[position + 1]),
        'address2': _plain_text(soup_contact_list[position + 2]),
    }
    for position in indices
]

ambulancestands_df = pd.DataFrame(ambulancestands_list)
ambulancestands_df.to_csv('.././Data/Address_ambulancestands.csv', index=False)