In [0]:
%sql
-- Switch the session to the hive_metastore catalog.
USE CATALOG hive_metastore;

-- Database used for download bookkeeping, stored at an explicit ADLS Gen2 location.
CREATE DATABASE IF NOT EXISTS file_history
  LOCATION 'abfss://datalaketaxi@taxistorag.dfs.core.windows.net/taxi_data/file_history/';
USE file_history;

-- One row per recorded download: the file name and the date associated with it.
CREATE TABLE IF NOT EXISTS file_history_downloads (
  file_name     STRING,
  downloaded_on DATE
);

-- One-off seed row, kept commented out:
-- insert into file_history_downloads(file_name, downloaded_on) SELECT "last_parquet", date("2024-07-04")

In [0]:
%sql
-- List every row of the download history table.
SELECT * FROM file_history_downloads;
-- Maintenance helper, kept commented out: removes the rows dated 2024-07-04.
-- DELETE FROM file_history_downloads where downloaded_on='2024-07-04';


file_name,downloaded_on
last_parquet,2024-07-04


In [0]:
import dateutil.relativedelta as relativedelta

# Read the most recent download date directly from the history table instead of
# relying on the implicit `_sqldf` result of whichever SQL cell happened to run last.
last_downloaded_on = (
    spark.table("file_history.file_history_downloads")
    .agg({"downloaded_on": "max"})
    .collect()[0][0]
)

# max() over an empty table yields None; fail with an explicit message rather than
# an opaque TypeError on the date arithmetic below.
if last_downloaded_on is None:
    raise RuntimeError(
        "La table file_history_downloads est vide : insérez une ligne initiale "
        "avant de calculer le prochain mois à télécharger."
    )

# The next file to fetch is the one for the month following the last recorded date.
date_next = last_downloaded_on + relativedelta.relativedelta(months=1)

# Show the year and month of the next planned download.
display(date_next.year, date_next.month)

2024

8

In [0]:
# Notebook-level placeholders, both initialised to the empty string.
file_name = download_on = ""

In [0]:
%pip install beautifulsoup4

Collecting beautifulsoup4
  Downloading beautifulsoup4-4.14.3-py3-none-any.whl.metadata (3.8 kB)
Collecting soupsieve>=1.6.1 (from beautifulsoup4)
  Downloading soupsieve-2.8-py3-none-any.whl.metadata (4.6 kB)
Downloading beautifulsoup4-4.14.3-py3-none-any.whl (107 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/107.7 kB[0m [31m?[0m eta [36m-:--:--[0m
[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m102.4/107.7 kB[0m [31m4.1 MB/s[0m eta [36m0:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.7/107.7 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading soupsieve-2.8-py3-none-any.whl (36 kB)
Installing collected packages: soupsieve, beautifulsoup4
Successfully installed beautifulsoup4-4.14.3 soupsieve-2.8
[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
import os
import io
import requests
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup

from azure.storage.filedatalake import DataLakeServiceClient
from azure.core.exceptions import AzureError

# ------------ CONFIGURATION (EDIT AS NEEDED) ------------

# Web page that is scraped for download links.
PAGE_URL = "https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page"  

# Name of the filesystem (container) in the ADLS Gen2 storage account
ADLS_FILESYSTEM_NAME = "datalaketaxi"              

# Logical folder inside the data lake (optional)
ADLS_PREFIX = "taxi_data/"                           # e.g. "pdfs/" or "" for the root

# The connection string is read from the Databricks secret scope "adls-scope";
# it is not hard-coded in the notebook.
# NOTE(review): the secret key is named "storage_account_key" but its value is passed
# to from_connection_string below — confirm it holds a full connection string.
ADLS_CONNECTION_STRING=dbutils.secrets.get("adls-scope", "storage_account_key")
# Hard-coded alternative, kept commented out:
# ADLS_CONNECTION_STRING = "<TA_CONNECTION_STRING_AZURE>"

# -------------------------------------------

# Fail fast when the secret lookup returned an empty value.
if not ADLS_CONNECTION_STRING:
    raise RuntimeError(
        "ADLS_CONNECTION_STRING n'est pas défini."
    )

# Create the ADLS service client from the connection string
datalake_service_client = DataLakeServiceClient.from_connection_string(
    ADLS_CONNECTION_STRING
)

# Client scoped to the configured filesystem; the upload helper obtains file clients from it.
file_system_client = datalake_service_client.get_file_system_client(
    file_system=ADLS_FILESYSTEM_NAME
)

def get_pdf_links(page_url: str, target_date=None):
    """Collect the yellow-taxi parquet link(s) for one month from an HTML page.

    Despite its name, this function looks for ``.parquet`` files: it keeps every
    ``<a href>`` that contains ``yellow_tripdata`` and whose lower-cased value
    ends with ``<YYYY>-<MM>.parquet`` for the requested month.

    Args:
        page_url: URL of the HTML page to scrape. Also used as the base for
            resolving relative links.
        target_date: Date-like object (anything with ``strftime``) whose year and
            month select the file. Defaults to the notebook-level ``date_next``,
            which must then have been defined by an earlier cell.

    Returns:
        A sorted list of unique absolute URLs (empty when nothing matches).

    Raises:
        requests.RequestException: if the page cannot be fetched or returns an
            HTTP error status.
        NameError: if ``target_date`` is omitted and ``date_next`` is not defined.
    """
    if target_date is None:
        # Backward-compatible fallback on the notebook global.
        target_date = date_next
    wanted_suffix = f"{target_date.strftime('%Y-%m')}.parquet"

    resp = requests.get(page_url, timeout=15)
    resp.raise_for_status()

    soup = BeautifulSoup(resp.text, "html.parser")
    links = set()

    for anchor in soup.find_all("a", href=True):
        # Stray whitespace around the attribute value would defeat endswith().
        href = anchor["href"].strip()
        if "yellow_tripdata" in href and href.lower().endswith(wanted_suffix):
            # urljoin leaves absolute URLs untouched and resolves relative ones.
            links.add(urljoin(page_url, href))

    # Sorted so that the processing order is deterministic.
    return sorted(links)

def get_filename_from_url(url: str) -> str:
    """Return the last path segment of ``url``.

    Query string and fragment are ignored. When the path has no final segment
    (for example it ends with ``/``), ``"document.parquet"`` is returned.
    """
    last_segment = os.path.basename(urlparse(url).path)
    if last_segment:
        return last_segment
    return "document.parquet"

def download_pdf_to_bytes(url: str) -> bytes:
    """Download the resource at ``url`` and return its whole content as bytes.

    The name mentions PDF, but nothing here is PDF-specific: the body is read
    as raw bytes in 8 KiB chunks.

    Args:
        url: Address of the file to download.

    Returns:
        The complete response body.

    Raises:
        requests.RequestException: on connection failure, timeout or HTTP
            error status.
    """
    # With stream=True the connection is only released once the body is fully
    # consumed or the response is closed; the context manager guarantees the
    # latter even if reading fails midway.
    with requests.get(url, stream=True, timeout=30) as resp:
        resp.raise_for_status()
        # Empty chunks carry no data and are skipped.
        return b"".join(
            chunk for chunk in resp.iter_content(chunk_size=8192) if chunk
        )

def upload_bytes_to_adls(data: bytes, path: str):
    """Upload binary content to ADLS Gen2, overwriting any existing file.

    Args:
        data: Raw bytes to write.
        path: Destination path inside the ``ADLS_FILESYSTEM_NAME`` filesystem.

    Returns:
        ``True`` when the upload succeeded, ``False`` when an ``AzureError``
        was raised (the error is printed, not propagated).
    """
    try:
        file_client = file_system_client.get_file_client(path)
        file_client.upload_data(data, overwrite=True)
    except AzureError as e:
        print(f"❌ Erreur upload {path} : {e}")
        return False
    print(f"✅ Upload vers adls://{ADLS_FILESYSTEM_NAME}/{path}")
    return True

def run():
    """Fetch the files for the target month, store them in ADLS and log them.

    For every link returned by ``get_pdf_links(PAGE_URL)`` the file is
    downloaded, uploaded under ``ADLS_PREFIX`` and, only if the upload
    succeeded, recorded in ``file_history.file_history_downloads`` with the
    notebook-level ``date_next`` as its date. A failed download or upload is
    reported and skipped, so the history never advances past a file that is
    not actually in the data lake.
    """
    print(f"🔎 Récupération des liens PDF depuis : {PAGE_URL}")
    pdf_links = get_pdf_links(PAGE_URL)

    if not pdf_links:
        print("⚠️ Aucun PDF trouvé sur la page.")
        return

    print(f"📄 PDFs trouvés : {len(pdf_links)}")

    for url in pdf_links:
        print(f"\n➡ Téléchargement : {url}")
        try:
            data = download_pdf_to_bytes(url)
        except requests.RequestException as e:
            print(f"❌ Erreur téléchargement {url} : {e}")
            continue

        filename = get_filename_from_url(url)
        adls_path = f"{ADLS_PREFIX}{filename}"

        if not upload_bytes_to_adls(data, adls_path):
            # Do not record a file that never reached the data lake.
            continue

        # Record the stored file name (the notebook-level `file_name` placeholder
        # is always empty). Going through a DataFrame avoids interpolating a
        # scraped value into a SQL string. insertInto is position-based: the
        # column order matches the table definition (file_name, downloaded_on).
        history_row = spark.createDataFrame(
            [(filename, date_next)],
            "file_name STRING, downloaded_on DATE",
        )
        history_row.write.insertInto("file_history.file_history_downloads")

# 👉 In a Databricks notebook, simply call the function:
run()

🔎 Récupération des liens PDF depuis : https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page
📄 PDFs trouvés : 1

➡ Téléchargement : https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-08.parquet
✅ Upload vers adls://datalaketaxi/taxi_data/yellow_tripdata_2024-08.parquet
