In [58]:
# Imports:
import pandas as pd  # DataFrames
import requests as rq  # Send HTTP requests to servers
import re  # Regular expressions
from datetime import date  # To search for images up to today
import time

# Upper bound for the photo search: today's date as an ISO "YYYY-MM-DD" string.
today = date.today().isoformat()
print(f"Today's date: {today}")

Today's date: 2020-12-22


In [55]:
# Load the list of bird species for which photos have to be retrieved.
# The first line of the file is the header row; the preview shows the columns
# Species, Scientific and ID.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
bd_path = r"C:\Users\emmav\Documents\AI Master year 1\TMM\Project\BirdsForClassification.txt"
bd = pd.read_csv(filepath_or_buffer=bd_path, header=0)
bd.head(5)


Unnamed: 0,Species,Scientific,ID
0,Dodaars,Tachybaptus ruficollis,2
1,Fuut,Podiceps cristatus,91
2,Roodhalsfuut,Podiceps grisegena,367
3,Geoorde Fuut,Podiceps nigricollis,3
4,Aalscholver,Phalacrocorax carbo,58


In [3]:
# Test searching for images of a single bird from the list.
species_id = bd.loc[89, 'ID']  # Roodborst

# Query-string parameters for the photo overview page.
parameters = {'after_date': '2016-01-01',
              'before_date': today,
              'species': species_id,
              'species_group': 1,
              'advanced': 'on',
              'is_validated': 'on',
              'type': 1,
             }

base_url = 'https://waarneming.nl/photos/'

# Prepare (without sending) a request, only to inspect the final URL:
test = rq.Request('GET', base_url, params=parameters).prepare()
print(test.url)

# Send the real request. A timeout prevents the cell from hanging forever on a
# stalled connection, and raise_for_status() fails loudly on an HTTP error
# instead of letting later cells parse an error page.
response = rq.get(base_url, params=parameters, timeout=30)
response.raise_for_status()

# Show only the start of the page; the full HTML would flood the notebook output.
print(response.text[:1000])


https://waarneming.nl/photos/?after_date=2016-01-01&before_date=2020-12-22&species=168&species_group=1&advanced=on&is_validated=on&type=1
<!DOCTYPE html>
<html lang="nl">
<head>
    <meta charset="utf-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    
        <meta name="viewport" content="width=device-width, initial-scale=1, user-scalable=no">
    

    <meta name="format-detection" content="telephone=no">
    <title>Foto's
        - Waarneming.nl</title>

    

<link rel="apple-touch-icon" sizes="180x180" href="/static/favicon/apple-touch-icon.png">
<link rel="icon" type="image/png" sizes="32x32" href="/static/favicon/favicon-32x32.png">
<link rel="icon" type="image/png" sizes="16x16" href="/static/favicon/favicon-16x16.png">
<link rel="manifest" href="/site.webmanifest">
<link rel="mask-icon" href="/static/favicon/safari-pinned-tab.svg" color="#0066b1">
<link rel="shortcut icon" href="/static/favicon/favicon.ico">
<meta name="msapplication-TileColor" content="#0066b1

In [18]:
# Now we have a response... Let's take the links to photos from it.
# The capture group is the part of the image URL between "fotonew/" and ".jpg"
# (e.g. '4/32669834').
# Raw string: in a normal string literal '\.' is an invalid escape sequence,
# which Python reports with a DeprecationWarning/SyntaxWarning. The pattern
# seen by the regex engine is unchanged.
regex = r'src="https://old.waarneming.nl/fotonew/([^\.]*)\.jpg'
print("Roodborst ID:", species_id)
sources = re.findall(regex, response.text)
print("Photos found:", len(sources))
print(sources)


Roodborst ID: 168
Photos found: 24
['4/32669834', '8/32669588', '6/32668576', '1/32667591', '8/32666978', '8/32666968', '6/32665986', '7/32665767', '5/32665425', '6/32661866', '0/32658260', '9/32658259', '8/32658258', '4/32657784', '6/32657556', '5/32657555', '4/32657554', '3/32657553', '8/32657048', '0/32656530', '5/32656445', '8/32656408', '0/32654880', '0/32654510']


In [43]:
# Start from an empty DataFrame (no rows, no columns) that will hold the photo IDs:
photo_ids = pd.DataFrame(data=None)


In [44]:
# Add the photo IDs to the DataFrame as column 'PhotoID', and store the
# scientific name of the species as the value of column 'Scientific' in every row.
column_name = bd.loc[89,'Scientific']  # Scientific name of the bird at index label 89 (despite the variable name, used as a cell value)
photo_ids.loc[:,'PhotoID'] = sources  # One row per photo ID found by the regex
photo_ids.loc[:,'Scientific'] = column_name  # Scalar, so the same name is written to all rows
photo_ids.head()


Unnamed: 0,PhotoID,Scientific
0,4/32669834,Erithacus rubecula
1,8/32669588,Erithacus rubecula
2,6/32668576,Erithacus rubecula
3,1/32667591,Erithacus rubecula
4,8/32666978,Erithacus rubecula


In [72]:
# Settings for crawling the photo overview page of every species in the list.
url = 'https://waarneming.nl/photos/'

parameters = {
    'after_date': '2016-01-01',
    'before_date': today,
    'species': 0,  # Placeholder; set to the actual species ID before each request
    'species_group': 1,  # Birds
    'advanced': 'on',
    'is_validated': 'on',
    'type': 1,  # Photos
}

# Result table and the number of rows it holds so far:
photo_df = pd.DataFrame()
i_counter = 0


In [73]:
# Crawl the photo overview page of every species and collect (photo ID,
# scientific name) pairs. The rows are gathered in a plain list and turned into
# a DataFrame once at the end: DataFrame.append in a loop is quadratic and no
# longer exists in pandas >= 2.0. Building the frame from scratch also makes
# this cell give the same result when it is re-run.

# Raw string: '\.' in a normal string literal is an invalid escape sequence.
regex = r'src="https://old.waarneming.nl/fotonew/([^\.]*)\.jpg'
photo_pattern = re.compile(regex)  # Compiled once instead of on every iteration
records = []

# Pair every ID with the scientific name from the same row, so the pairing does
# not depend on bd having a default 0..n-1 index.
for species_id, scientific in zip(bd.loc[:, 'ID'], bd.loc[:, 'Scientific']):
    parameters['species'] = species_id  # Search for the current species
    # Ask the website; the timeout keeps a stalled connection from blocking the crawl forever.
    response = rq.get(url, params=parameters, timeout=30)

    if response.ok:
        # Retrieve the photo ids and label each one with the species:
        photo_ids = photo_pattern.findall(response.text)
        records.extend((photo_id, scientific) for photo_id in photo_ids)
    else:
        # Do not parse an error page; report it so the gap in the data is visible.
        print(f"Skipping species {species_id} ({scientific}): HTTP {response.status_code}")

    # Wait to crawl kindly:
    time.sleep(4.95)

photo_df = pd.DataFrame(records, columns=['ID', 'Scientific'])
i_counter = len(photo_df)  # Kept in sync: total number of photo rows collected


In [74]:
# Write the collected photo IDs to a comma-separated text file in the current
# working directory, without the row index:
photo_df.to_csv(path_or_buf="PhotoIDs.txt", index=False)
