In [10]:
# Import relevant libraries.
import io
import json
import requests

import pandas as pd

Step 1: Scrape metadata about documents from the Tweede Kamer API

In [3]:
# Base endpoint of the OData API from which Document entities are requested.
api_root_url = 'https://gegevensmagazijn.tweedekamer.nl/OData/v4/2.0/Document'

# Number of batches to request from the API; one batch is 250 documents,
# so e.g. 10 batches covers the first 2500 documents.
# 2500 batches attempts to scrape all documents (n = 2500*250), but the run
# will likely be cut short by API security measures.
num_batches = 2500

# Empty DataFrame that will hold the retrieved document metadata.
df_documents = pd.DataFrame()

# Separate empty DataFrame that will hold the retrieved document texts.
df_documents_text = pd.DataFrame(columns=['id', 'text'])

In [4]:
# Retrieve Documents from the API, 250 at a time (API restriction).
#
# Batches are gathered in a list and concatenated once at the end, instead of
# re-copying the growing DataFrame on every iteration. The loop stops early
# when a request fails or a batch comes back empty; whatever was retrieved up
# to that point is kept. df_documents is rebuilt from scratch, so re-running
# this cell does not duplicate rows.
batch_size = 250
batches = []

for i in range(num_batches):
    # Define the query for retrieving the current batch of documents.
    api_query_batch = '?$skip=' + str(i * batch_size)

    try:
        # Send the request; the timeout prevents a stalled connection from
        # hanging the notebook indefinitely.
        response = requests.get(api_root_url + api_query_batch, timeout=60)
        response.raise_for_status()
        payload = response.json()
    except (requests.RequestException, ValueError) as error:
        # ValueError covers a response body that is not valid JSON.
        print(f'Stopped after {i} batches: {error}')
        break

    # A missing or empty 'value' list means there is nothing left to fetch.
    batch_values = payload.get('value') if isinstance(payload, dict) else None
    if not batch_values:
        print(f'No more documents after {i} batches.')
        break

    # Convert the JSON response to a DataFrame and keep it for the final concat.
    batches.append(pd.json_normalize(payload, 'value'))

# Combine all retrieved batches into a single collection.
df_documents = pd.concat(batches, ignore_index=True) if batches else pd.DataFrame()

In [None]:
# Add a 'date' column holding 'Datum' parsed into pandas datetimes, then
# order the documents chronologically on that column.
# NOTE(review): dayfirst=True only affects ambiguous day/month strings;
# confirm the format in which the API delivers 'Datum'.
df_documents = (
    df_documents
    .assign(date=pd.to_datetime(df_documents['Datum'], dayfirst=True))
    .sort_values('date')
)

In [None]:
# Write the scraped metadata to disk, then show (rows, columns) as cell output.
df_documents.to_csv(path_or_buf='api_documents.csv')
df_documents.shape

Step 2: Download and extract the text associated with each scraped metadata row.

In [None]:
# Use the content detection+extraction framework Apache Tika, ported to Python.
from tika import parser

In [None]:
# Prepare a new, empty DataFrame to hold the extracted textual data.
# 'date' is declared up front because every record added to this frame
# carries it; listing it last keeps the column order identical to what
# concatenation produces, and gives an empty result the full schema.
df_documents_text = pd.DataFrame(columns=['id', 'text', 'date'])

In [None]:
# Convert any files attached to documents in df_documents to text and
# collect the results in the df_documents_text DataFrame.
#
# Records are gathered in a list and turned into a DataFrame once, which
# avoids re-copying the frame for every document and yields a clean 0..n-1
# index. The DataFrame is built in a `finally` clause so that the texts
# extracted so far remain available if the loop is interrupted.
text_records = []

try:
    for doc_id, doc_date, content_type in zip(
        df_documents['Id'], df_documents['Datum'], df_documents['ContentType']
    ):
        # Documents without an attached file keep an empty text.
        document_text = ''

        if not pd.isnull(content_type):
            # Build the query based on the document ID and fetch the attachment.
            query = '(' + doc_id + ')/resource'
            response = requests.get(api_root_url + query, timeout=60)

            if response.ok:
                # Parse the downloaded file using Apache Tika; the extracted
                # content may be None, which is stored as an empty string.
                parsed_data = parser.from_buffer(response.content)
                document_text = parsed_data.get('content') or ''
            else:
                # Do not feed an HTTP error body to Tika as if it were the document.
                print(f'Skipping text for document {doc_id}: HTTP {response.status_code}')

        text_records.append({'id': doc_id, 'date': doc_date, 'text': document_text})
finally:
    # Column order matches what concatenating onto an ['id', 'text'] frame gives.
    df_documents_text = pd.DataFrame(text_records, columns=['id', 'text', 'date'])

In [None]:
# Write the extracted document texts to a CSV file on disk.
df_documents_text.to_csv(path_or_buf='kamers_text.csv')