In [1]:
import requests
from bs4 import BeautifulSoup
import os
from concurrent.futures import ThreadPoolExecutor
import pandas as pd

In [None]:
### Download and save each file from the website
# Up to 8 downloads run in parallel (see the ThreadPoolExecutor further down) to improve speed.

index_url = "https://trein.fwrite.org/idx/dedup_OVFiets.html"
# The timeout keeps the cell from hanging forever on a stalled connection;
# raise_for_status() stops an HTTP error page from being silently parsed as
# an index that simply contains no links.
response = requests.get(index_url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

links = [a["href"] for a in soup.find_all("a", href=True) if a["href"].endswith(".csv.xz")]

os.makedirs("ovfiets_2023", exist_ok=True)

# Filter to only include files from a certain date onward
START_DATE = "20240315"  # "YYYYMMDD" -- change this for the start date

filtered_links = []
for link in links:
    basename = os.path.basename(link)
    # Expecting something like OVFiets_YYYY-MM-DD.csv.xz
    name_parts = basename.split("_")
    if len(name_parts) < 2:
        # No underscore, so there is no date part to compare against.
        continue
    date_str = name_parts[1].split(".")[0].replace("-", "")  # "YYYYMMDD"
    # Comparing strings is only a valid date comparison for 8-digit values;
    # without this check any non-date token (e.g. "foo") would sort above
    # START_DATE and slip through the filter.
    if len(date_str) == 8 and date_str.isdigit() and date_str >= START_DATE:
        filtered_links.append(link)

def download_file(link):
    """Download one archive from the index page into ``ovfiets_2023/``.

    Args:
        link: ``href`` value taken from the index page; either an absolute
            ``http(s)`` URL or a path that is resolved against the site root.

    Returns:
        The local path of the saved file, or ``None`` when the server did not
        answer with status 200 or the transfer failed (network or disk error).
        Failures are reported with ``print`` instead of being raised, so one
        bad file cannot abort the whole parallel batch.
    """
    if link.startswith("http"):
        file_url = link
    else:
        # str.lstrip takes a *set of characters*, not a prefix: this removes
        # every leading "." and "/" so "./x", "../x" and "/x" all end up
        # relative to the site root.
        file_url = "https://trein.fwrite.org/" + link.lstrip("./")

    filename = os.path.join("ovfiets_2023", os.path.basename(link))
    # Write to a temporary name first so an interrupted transfer never leaves
    # a truncated file that looks like a complete archive.
    partial_name = filename + ".part"

    print(f"Downloading {filename} ...")
    try:
        # stream=True avoids holding the whole archive in memory; the timeout
        # prevents a stalled connection from blocking a worker thread forever.
        with requests.get(file_url, stream=True, timeout=60) as r:
            if r.status_code != 200:
                print(f"⚠️ Skipped {file_url} (status {r.status_code})")
                return None
            with open(partial_name, "wb") as f:
                for chunk in r.iter_content(chunk_size=1 << 20):
                    f.write(chunk)
        os.replace(partial_name, filename)
        return filename
    except (requests.RequestException, OSError) as exc:
        print(f"⚠️ Failed {file_url} ({exc})")
        if os.path.exists(partial_name):
            os.remove(partial_name)
        return None

# Fan the downloads out over a thread pool; results keep the order of
# filtered_links. Tune max_workers to what your connection can sustain.
with ThreadPoolExecutor(max_workers=8) as executor:
    downloaded_files = [*executor.map(download_file, filtered_links)]



Downloading ovfiets_2023\OVFiets_2024-03-15.csv.xz ...
Downloading ovfiets_2023\OVFiets_2024-03-16.csv.xz ...
Downloading ovfiets_2023\OVFiets_2024-03-17.csv.xz ...
Downloading ovfiets_2023\OVFiets_2024-03-18.csv.xz ...
Downloading ovfiets_2023\OVFiets_2024-03-19.csv.xz ...
Downloading ovfiets_2023\OVFiets_2024-03-20.csv.xz ...
Downloading ovfiets_2023\OVFiets_2024-03-21.csv.xz ...
Downloading ovfiets_2023\OVFiets_2024-03-22.csv.xz ...
Downloading ovfiets_2023\OVFiets_2024-03-23.csv.xz ...
Downloading ovfiets_2023\OVFiets_2024-03-24.csv.xz ...
Downloading ovfiets_2023\OVFiets_2024-03-25.csv.xz ...
Downloading ovfiets_2023\OVFiets_2024-03-26.csv.xz ...
Downloading ovfiets_2023\OVFiets_2024-03-27.csv.xz ...
Downloading ovfiets_2023\OVFiets_2024-03-28.csv.xz ...
Downloading ovfiets_2023\OVFiets_2024-03-29.csv.xz ...
Downloading ovfiets_2023\OVFiets_2024-03-30.csv.xz ...
Downloading ovfiets_2023\OVFiets_2024-03-31.csv.xz ...
Downloading ovfiets_2023\OVFiets_2024-04-01.csv.xz ...
Downloadin