<a href="https://colab.research.google.com/github/CalinRusu95/parltrack-data-fetcher/blob/main/fetch_parltrack_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install required libraries
!pip install requests zstandard



In [2]:
# Import necessary libraries
import json
import os
from urllib.parse import urljoin

import requests
import zstandard as zstd
from bs4 import BeautifulSoup
from google.colab import drive

# Mount Google Drive at /content/drive so that files written under that
# directory by later cells are stored in the user's Drive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Define Parltrack dumps URL and Drive folder path

# Index page that is scraped for links to the .zst dump files.
PARLTRACK_DUMPS_URL = 'https://parltrack.eu/dumps/'
# Destination folder for the decompressed JSON files (under the /content/drive mount point).
DRIVE_FOLDER_PATH = '/content/drive/My Drive/EU Tableau/ParltrackData/'
# Create the destination folder up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(DRIVE_FOLDER_PATH, exist_ok=True)

In [None]:
def get_parltrack_files():
    """
    Fetches the list of available files from the Parltrack dumps page.

    Returns:
        list[str]: Absolute URLs for every link on the page whose href ends
        with ``.zst``, in page order. An empty list is returned when the page
        cannot be fetched or parsed; the error is printed, not raised.
    """
    try:
        print(f"Fetching available files from {PARLTRACK_DUMPS_URL}...")
        # The timeout keeps the cell from hanging forever on a stalled
        # connection. raise_for_status() turns HTTP errors (404, 500, ...)
        # into exceptions instead of silently scraping an error page and
        # reporting "Found 0 files".
        response = requests.get(PARLTRACK_DUMPS_URL, timeout=60)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract all file links ending with .zst. urljoin resolves relative,
        # root-relative and fully qualified hrefs against the dumps URL,
        # whereas plain string concatenation is only correct for relative ones.
        file_links = [
            urljoin(PARLTRACK_DUMPS_URL, link['href'])
            for link in soup.find_all('a', href=True)
            if link['href'].endswith('.zst')
        ]
        print(f"Found {len(file_links)} files.")
        return file_links

    except Exception as e:
        # Best-effort by design: report the problem and let the caller
        # continue with an empty work list.
        print(f"Error fetching file list: {str(e)}")
        return []

def _json_file_name(file_url):
    """Derives the local ``.json`` file name for a ``.zst`` dump URL."""
    file_name = os.path.basename(file_url)
    # Strip only the trailing compression suffix. Replacing '.zst' with
    # '.json' would turn 'ep_votes.json.zst' into 'ep_votes.json.json'.
    if file_name.endswith('.zst'):
        file_name = file_name[:-len('.zst')]
    if not file_name.endswith('.json'):
        file_name += '.json'
    return file_name


def fetch_and_save_file(file_url):
    """
    Fetches, decompresses, and saves a single file from Parltrack to Google Drive.

    Args:
        file_url (str): URL of a Zstandard-compressed JSON dump.

    Returns:
        None. The parsed JSON is re-serialised into DRIVE_FOLDER_PATH under
        the dump's base name with the ``.zst`` suffix removed. Request, JSON
        and other errors are printed rather than raised, so one bad file does
        not stop a batch run.
    """
    try:
        print(f"Processing file: {file_url}")

        # Fetch the compressed data. The context manager releases the
        # streamed connection even when decompression or parsing fails.
        with requests.get(file_url, stream=True, timeout=60) as response:
            # Fail early on HTTP errors instead of feeding an error page to
            # the Zstandard decompressor.
            response.raise_for_status()

            # Decompress Zstandard data straight from the raw socket stream
            dctx = zstd.ZstdDecompressor()
            decompressed = dctx.stream_reader(response.raw)

            # Parse JSON (the whole document is materialised in memory)
            data = json.load(decompressed)

        # Save JSON file to Google Drive
        file_path = os.path.join(DRIVE_FOLDER_PATH, _json_file_name(file_url))
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(data, f)

        print(f"File saved successfully to {file_path}")
    except requests.exceptions.RequestException as req_error:
        print(f"Request error for {file_url}: {req_error}")
    except json.JSONDecodeError as json_error:
        print(f"JSON decode error for {file_url}: {json_error}")
    except Exception as e:
        print(f"Unexpected error for {file_url}: {str(e)}")

def process_all_files():
    """
    Downloads every dump listed on the Parltrack dumps page.

    Asks get_parltrack_files() for the available URLs and hands each one,
    in listing order, to fetch_and_save_file(). Returns None.
    """
    for dump_url in get_parltrack_files():
        fetch_and_save_file(dump_url)

# Run the process: list the available dumps, then fetch and save each one in turn.
process_all_files()

Fetching available files from https://parltrack.eu/dumps/...
Found 8 files.
Processing file: https://parltrack.eu/dumps/ep_amendments.json.zst
