<a href="https://colab.research.google.com/github/CalinRusu95/parltrack-data-fetcher/blob/main/fetch_parltrack_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required libraries
!pip install requests zstandard

In [2]:
import requests
import zstandard as zstd
import json
import os
from google.oauth2.service_account import Credentials
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload
from bs4 import BeautifulSoup

In [None]:
# Index page that is scanned for links to Parltrack dump files
PARLTRACK_DUMPS_URL = "https://parltrack.eu/dumps/"

# Path of the temporary service-account credentials file
CREDENTIALS_PATH = "/tmp/credentials.json"

# Destination Google Drive folder for uploads.
# Replace with your own Google Drive folder ID.
DRIVE_FOLDER_ID = "16QmR40u_BF8K_DCSttSEOxYidzsuEkTU"

In [None]:
def write_credentials(secret):
    """Write the credentials secret to ``CREDENTIALS_PATH``.

    The file holds a private key, so it is created with owner-only
    permissions (``0o600``) instead of whatever the process umask would
    otherwise allow.

    Args:
        secret: Text content of the credentials file.

    Raises:
        OSError: If the file cannot be created, re-permissioned or written.
    """
    flags = os.O_WRONLY | os.O_CREAT | os.O_TRUNC
    fd = os.open(CREDENTIALS_PATH, flags, 0o600)
    with os.fdopen(fd, "w", encoding="utf-8") as f:
        # The mode given to os.open only applies when the file is created;
        # tighten a pre-existing file before any secret bytes reach it.
        os.chmod(CREDENTIALS_PATH, 0o600)
        f.write(secret)

# Authenticate with Google Drive API
def authenticate_google_drive():
    """Build a Drive v3 API client from the credentials file.

    Reads the service-account file at ``CREDENTIALS_PATH`` and requests the
    ``https://www.googleapis.com/auth/drive`` scope.

    Returns:
        The client object returned by ``build("drive", "v3", ...)``.
    """
    drive_scopes = ["https://www.googleapis.com/auth/drive"]
    creds = Credentials.from_service_account_file(CREDENTIALS_PATH, scopes=drive_scopes)
    return build("drive", "v3", credentials=creds)

# Upload file to Google Drive
def upload_to_drive(service, file_path, file_name):
    """Upload a local file into the folder identified by ``DRIVE_FOLDER_ID``.

    Args:
        service: Drive API client, as returned by ``authenticate_google_drive``.
        file_path: Path of the local file to upload.
        file_name: Name the file is given in Drive.

    Prints the ID of the created Drive file; returns nothing.
    """
    metadata = dict(name=file_name, parents=[DRIVE_FOLDER_ID])
    payload = MediaFileUpload(file_path, resumable=True)
    create_request = service.files().create(
        body=metadata,
        media_body=payload,
        fields="id",
    )
    created = create_request.execute()
    print(f"Uploaded file ID: {created.get('id')}")

def _json_name_from_url(file_url):
    """Return the local ``.json`` file name for a ``.zst`` dump URL."""
    from urllib.parse import urlparse

    # Use only the URL path so a query string or fragment never leaks into
    # the file name.
    base = os.path.basename(urlparse(file_url).path)
    # Strip just the trailing compression suffix; str.replace would rewrite
    # every ".zst" occurrence and turn "x.json.zst" into "x.json.json".
    if base.endswith(".zst"):
        base = base[: -len(".zst")]
    return base if base.endswith(".json") else base + ".json"


# Fetch, decompress, and upload files
def fetch_and_upload_files():
    """Download every ``.zst`` dump, decompress it and upload it to Drive.

    Steps:
      1. Fetch the page at ``PARLTRACK_DUMPS_URL`` and collect all links
         whose href ends with ``.zst``.
      2. Authenticate against Google Drive.
      3. For each link: stream the download, decompress it, parse it as
         JSON, write it to ``/tmp``, upload it, then delete the local copy.

    Any exception stops processing and is reported with ``print``; it is not
    re-raised, so callers can still run their own cleanup afterwards.
    """
    # Local import keeps this block self-contained without touching the
    # notebook's import cell.
    from urllib.parse import urljoin

    # (connect, read) seconds. Without a timeout requests waits forever on a
    # stalled connection.
    timeout = (10, 120)

    try:
        # Fetch the list of available files
        listing = requests.get(PARLTRACK_DUMPS_URL, timeout=timeout)
        listing.raise_for_status()
        soup = BeautifulSoup(listing.text, "html.parser")

        # urljoin resolves relative, root-relative and absolute hrefs alike;
        # plain concatenation is only right for bare file names.
        file_urls = [
            urljoin(PARLTRACK_DUMPS_URL, link["href"])
            for link in soup.find_all("a", href=True)
            if link["href"].endswith(".zst")
        ]

        # Authenticate with Google Drive
        drive_service = authenticate_google_drive()

        # A decompressor holds no per-stream state here, so one is reused.
        dctx = zstd.ZstdDecompressor()

        for file_url in file_urls:
            print(f"Processing file: {file_url}")

            file_name = _json_name_from_url(file_url)
            file_path = os.path.join("/tmp", file_name)

            try:
                # The context manager releases the streamed connection even
                # when decompression or parsing fails.
                with requests.get(file_url, stream=True, timeout=timeout) as response:
                    response.raise_for_status()
                    decompressed = dctx.stream_reader(response.raw)
                    # NOTE: the whole document is held in memory while it is
                    # parsed and re-serialized.
                    data = json.load(decompressed)

                with open(file_path, "w") as f:
                    json.dump(data, f)
                # Drop the parsed document before the upload starts.
                del data

                upload_to_drive(drive_service, file_path, file_name)
            finally:
                # Remove the local copy on failure too, so a failed run does
                # not leave files behind in /tmp.
                if os.path.exists(file_path):
                    os.remove(file_path)

            print(f"Finished processing: {file_name}")

    except Exception as e:
        print(f"Error: {e}")

# Run the process
if __name__ == "__main__":
    # Read the secret from an environment variable (provided by GitHub Actions)
    secret = os.getenv("GOOGLE_CREDENTIALS")
    if not secret:
        raise ValueError("GOOGLE_CREDENTIALS secret is missing!")

    try:
        # Write the secret to a temporary file
        write_credentials(secret)

        # Fetch and upload files
        fetch_and_upload_files()
    finally:
        # Always remove the credentials, even when the run is interrupted or
        # fails, so the private key never lingers on disk. The existence
        # check covers a failure before the file was created.
        if os.path.exists(CREDENTIALS_PATH):
            os.remove(CREDENTIALS_PATH)