In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from selenium.common.exceptions import StaleElementReferenceException
from tqdm.notebook import tqdm
from bs4 import BeautifulSoup
import os
import requests

# Headless Chrome: the browser runs without opening a window.
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')

# The default path is a raw string: in a plain literal '\c' is an invalid escape
# sequence, which only happens to work and triggers a warning on recent Pythons.
# CHROMEDRIVER_PATH, when set, overrides the machine-specific default location.
chrome_service = Service(os.environ.get('CHROMEDRIVER_PATH', r'D:\chrome-win64\chromedriver.exe'))

# NOTE(review): nothing in the visible cells calls driver.quit(); consider doing so
# once scraping is finished so the browser process does not linger.
driver = webdriver.Chrome(service=chrome_service, options=chrome_options)


In [2]:
url = 'https://meteo.data.gouv.fr/datasets/donnees-climatologiques-de-base-quotidiennes/'

driver.get(url)

# WebDriverWait(...) on its own only builds a waiter; it blocks only when .until()
# is called. Wait (up to 10 s each) for the two elements the later cells rely on:
# the resource <article> cards and the pagination list.
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'article')))
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'fr-pagination__list')))

# Extra settle time kept from the original: the waits above only guarantee that a
# first matching element exists, not that the whole list has finished rendering.
time.sleep(3)

In [3]:

def get_name_link_map_for_current_page(driver):
    """Map each resource title on the current page to its download URL.

    Every ``<article>`` element found through ``driver`` is parsed with
    BeautifulSoup. The title is the text of the ``<h4>`` inside the second
    ``<div>`` nested under the article's first ``<div>``; the URL is the
    ``href`` of the element carrying the ``matomo_download`` class.

    Returns:
        dict: ``{title: href}``; a later article with the same title
        overwrites an earlier one.
    """

    def parse_article(article_element):
        # Work on a static snapshot of the article's markup.
        markup = BeautifulSoup(article_element.get_attribute('outerHTML'), 'html.parser')
        outer_div = markup.find_all('div')[0]
        title = outer_div.find_all('div')[1].find('h4').text
        href = markup.find(class_='matomo_download').attrs['href']
        return title, href

    return dict(
        parse_article(article_element)
        for article_element in driver.find_elements(By.TAG_NAME, 'article')
    )

In [6]:
def get_n_pages(driver):
    """Return the page count read from the pagination widget.

    Finds the element with class ``fr-pagination__list`` and converts the text
    of its third-from-last ``<li>`` to ``int``.
    """
    pagination = driver.find_element(By.CLASS_NAME, "fr-pagination__list")
    items = pagination.find_elements(By.TAG_NAME, "li")
    # presumably the last two items are navigation controls, so the highest
    # page number sits third from the end -- verify against the live markup
    last_page_item = items[-3]
    return int(last_page_item.text)

# Number of result pages, read once from the pagination list of the loaded page.
n_pages = get_n_pages(driver)

In [7]:
def go_to_next_page(driver):
    """Click the element carrying the ``fr-pagination__link--next`` class."""
    driver.find_element(By.CLASS_NAME, "fr-pagination__link--next").click()
    
def retrieve_all_name_link_maps(driver, n_pages, page_change_timeout=15.0, poll_interval=0.2):
    """Collect the name -> download-URL mapping across ``n_pages`` result pages.

    For each page the article list is scraped repeatedly until it is non-empty
    and differs from what the previous page yielded. The click on "next"
    returns before the list is necessarily refreshed, so scraping right away
    could read the old page again and silently skip the new one.

    Args:
        driver: Selenium WebDriver already positioned on the first page.
        n_pages: Number of pages to visit.
        page_change_timeout: Maximum seconds to wait for a page to show new
            content before giving up.
        poll_interval: Seconds to sleep between two scraping attempts.

    Returns:
        dict: Union of every page's ``{name: url}`` mapping.

    Raises:
        TimeoutError: If a page yields no new content within
            ``page_change_timeout`` seconds (previously this looped forever on
            stale elements, or accepted duplicated content).
    """
    name_links = {}
    previous_links = None
    for page_index in tqdm(range(n_pages), desc='Pages', total=n_pages):
        deadline = time.monotonic() + page_change_timeout
        while True:
            try:
                new_links = get_name_link_map_for_current_page(driver)
            except StaleElementReferenceException:
                # The DOM was replaced mid-scrape; retry on the next poll.
                new_links = None

            if new_links and new_links != previous_links:
                break
            if time.monotonic() >= deadline:
                raise TimeoutError(
                    f'Page {page_index + 1}/{n_pages} did not show new content '
                    f'within {page_change_timeout} s'
                )
            time.sleep(poll_interval)

        name_links.update(new_links)
        previous_links = new_links

        # There is no further page to open after the last one.
        if page_index + 1 < n_pages:
            go_to_next_page(driver)
    return name_links


# Visit every result page and gather the name -> download-URL mapping.
name_links = retrieve_all_name_link_maps(driver, n_pages=n_pages)
# Bare expression so the notebook displays how many distinct names were collected.
len(name_links)

Pages:   0%|          | 0/141 [00:00<?, ?it/s]

638

In [None]:
def filter_links_with_suffix(links, suffix):
    """Return the entries of ``links`` whose name ends with ``suffix``.

    Args:
        links: Mapping of name -> link.
        suffix: Value passed to ``str.endswith`` for each name.

    Returns:
        dict: New mapping with the matching entries, in their original order.
    """
    selected = {}
    for name, href in links.items():
        if not name.endswith(suffix):
            continue
        selected[name] = href
    return selected

def filter_links_metropole(links):
    """Keep only the links whose name refers to a metropolitan department.

    The department code is the token that follows ``_departement_`` in the
    name, up to the next underscore. Numeric codes are kept when they are
    ``<= 95``. The Corsican codes ``2A``/``2B`` are kept as well; they used to
    raise ``ValueError`` in ``int()``. Names without a department marker, or
    with any other non-numeric code, are left out instead of raising.

    Args:
        links: Mapping of name -> link.

    Returns:
        dict: New mapping restricted to metropolitan departments.
    """
    corsica_codes = {'2A', '2B'}

    def is_metropole(fname):
        _, marker, tail = fname.partition('_departement_')
        if not marker:
            # No department in the name: cannot be classified as metropolitan.
            return False
        code = tail.split('_')[0]
        if code.strip().upper() in corsica_codes:
            return True
        try:
            return int(code) <= 95
        except ValueError:
            return False

    return {fname: link for fname, link in links.items() if is_metropole(fname)}

# Keep the RR-T-Vent files of both periods (historical first, then recent),
# then restrict the merged mapping to metropolitan departments.
filtered_links = filter_links_metropole(
    filter_links_with_suffix(name_links, suffix='periode_1950-2023_RR-T-Vent')
    | filter_links_with_suffix(name_links, suffix='periode_2024-2025_RR-T-Vent')
)

# Bare expression so the notebook displays the number of files selected.
len(filtered_links)

243

In [15]:
def download_links(links, folder_path, timeout=60):
    """Download every link into ``folder_path`` as ``<name>.csv.gz``.

    Files that already exist are skipped, so the call can be re-run to resume.
    Each download is streamed to a temporary ``.part`` file and renamed only
    once complete, so an interrupted or failed transfer never leaves a
    truncated file that a later run would skip as "already downloaded".

    Args:
        links: Mapping of name -> download URL.
        folder_path: Destination directory; created if missing.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (previously the error page was saved as if it were the data).
        requests.RequestException: On connection problems or timeouts.
    """
    os.makedirs(folder_path, exist_ok=True)

    for fname, link in tqdm(links.items(), desc='Downloading files', total=len(links)):
        filename = os.path.join(folder_path, f'{fname}.csv.gz')
        if os.path.exists(filename):
            # Skip existing files.
            continue

        partial_path = f'{filename}.part'
        try:
            with requests.get(link, stream=True, timeout=timeout) as response:
                response.raise_for_status()
                with open(partial_path, 'wb') as file:
                    # Stream in 1 MiB chunks rather than buffering the whole body.
                    for chunk in response.iter_content(chunk_size=1 << 20):
                        file.write(chunk)
            # Rename only after the full body has been written.
            os.replace(partial_path, filename)
        except BaseException:
            # Includes KeyboardInterrupt (kernel interrupt): drop the partial file.
            if os.path.exists(partial_path):
                os.remove(partial_path)
            raise

# Destination directory; a relative path, so it resolves against the current working directory.
folder_path = '../data/rtt-vent'
download_links(filtered_links, folder_path)

Downloading files: 100%|██████████| 243/243 [01:24<00:00,  2.86it/s]
