<a href="https://colab.research.google.com/github/Desk1002/CreditDash/blob/main/Scraper_SHOM_Ultra_Fast.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🌊 Scraper SHOM Ultra-Fast (2013-2024)

Ce notebook permet de collecter massivement les données de marées du SHOM pour tous vos sites de baignade en Finistère.

### ✨ Points forts :
- **Vitesse** : Utilise l'API interne du SHOM (données JSON structurées).
- **Résilience** : Système de **Checkpoint** automatique. Si Colab s'arrête, relancez simplement la cellule pour reprendre là où ça s'est arrêté.
- **Précision** : Identifie automatiquement le port le plus proche pour chaque site.
- **Variables** : Coefficient, marnage, heures et hauteurs des pleines/basses mers.

In [1]:
# 1. Mount Google Drive (the input CSV, the output CSV and the checkpoints
#    all live under /content/drive).
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# 2. Install the dependencies (IPython shell magic; only valid in a notebook).
!pip install geopy tqdm pandas requests

Mounted at /content/drive


In [2]:
import pandas as pd
import requests
import json
import os
import time
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor, as_completed
from geopy.distance import geodesic
from tqdm.notebook import tqdm

# --- CONFIGURATION ---
# Inclusive year range: tides are requested from January 1st of START_YEAR
# through December 31st of END_YEAR.
START_YEAR = 2013
END_YEAR = 2024
INPUT_CSV = '/content/drive/MyDrive/scraper_marees_shom_complet/sites_baignade_finistere.csv' # bathing sites; must provide `latitude` and `longitude` columns
OUTPUT_CSV = '/content/drive/MyDrive/PrevisiBaignade/donnees_marees_finales.csv' # final table: one row per site and per day
# Directory holding the resumable progress file (master_progress.csv).
CHECKPOINT_DIR = '/content/drive/MyDrive/checkpoints_shom'

# Reference harbours. Each key is sent verbatim as the `harborName` query
# parameter; `lat`/`lon` are only used to pick the harbour closest to each
# bathing site (presumably decimal degrees, WGS84 -- TODO confirm).
PORTS_FINISTERE = {
    'BREST': {'lat': 48.383, 'lon': -4.495},
    'PORTSALL': {'lat': 48.567, 'lon': -4.717},
    'CONCARNEAU': {'lat': 47.871, 'lon': -3.917},
    'DOUARNENEZ': {'lat': 48.093, 'lon': -4.333},
    'LE_CONQUET': {'lat': 48.360, 'lon': -4.775},
    'CAMARET-SUR-MER': {'lat': 48.279, 'lon': -4.595},
    'MORGAT': {'lat': 48.233, 'lon': -4.500},
    'AUDIERNE': {'lat': 48.023, 'lon': -4.540},
    'PENMARCH': {'lat': 47.798, 'lon': -4.373},
    'BENODET': {'lat': 47.875, 'lon': -4.108},
    'LOCTUDY': {'lat': 47.833, 'lon': -4.167},
    'ROSCOFF': {'lat': 48.727, 'lon': -3.967},
    'MORLAIX': {'lat': 48.583, 'lon': -3.833}
}

class ShomFastScraper:
    """Small client for the SHOM tide JSON endpoint.

    It picks the reference harbour closest to a site, downloads tide
    predictions in 7-day batches, and flattens each returned day into one
    tabular row.
    """

    def __init__(self, checkpoint_dir):
        """Create the HTTP session and make sure ``checkpoint_dir`` exists.

        Args:
            checkpoint_dir: Directory used for progress files; created if
                it does not exist yet.
        """
        self.session = requests.Session()
        # Headers sent with every request.
        # NOTE(review): presumably mimics the maree.shom.fr web front-end -- confirm.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Referer': 'https://maree.shom.fr/',
            'Accept': 'application/json, text/plain, */*'
        }
        self.base_url = "https://services.data.shom.fr/b2q8lrcdl4s04cbabsj4nhcb/hdm/spm/hlt"
        self.checkpoint_dir = checkpoint_dir
        os.makedirs(checkpoint_dir, exist_ok=True)

    def find_nearest_port(self, lat, lon):
        """Return the key of the ``PORTS_FINISTERE`` harbour closest to (lat, lon).

        Distances are geodesic distances in kilometres. On a tie, the
        harbour declared first in ``PORTS_FINISTERE`` wins.
        """
        site_coords = (lat, lon)
        distances = {
            port: geodesic(site_coords, (data['lat'], data['lon'])).km
            for port, data in PORTS_FINISTERE.items()
        }
        return min(distances, key=distances.get)

    def fetch_batch(self, port, date_str):
        """Download 7 days of tide data for ``port`` starting at ``date_str``.

        Args:
            port: Harbour name, sent as the ``harborName`` query parameter.
            date_str: First day of the batch, sent as the ``date`` parameter.

        Returns:
            The decoded JSON body on HTTP 200, otherwise ``None``. Network
            errors, timeouts and non-JSON bodies also yield ``None`` so that
            one failed batch never aborts the whole collection.
        """
        params = {'harborName': port, 'duration': 7, 'date': date_str, 'utc': 'standard', 'correlation': 1}
        try:
            response = self.session.get(self.base_url, params=params, headers=self.headers, timeout=10)
            if response.status_code != 200:
                return None
            return response.json()
        except (requests.RequestException, ValueError):
            # RequestException: connection/timeout failures.
            # ValueError: the body could not be decoded as JSON.
            # Anything else (e.g. KeyboardInterrupt) is deliberately not caught.
            return None

    @staticmethod
    def _join_field(entries, index):
        """Join field ``index`` of every entry with ';' (``None`` becomes '')."""
        return ";".join("" if t[index] is None else str(t[index]) for t in entries)

    def process_json(self, json_data, port):
        """Flatten one API response into a list of per-day row dicts.

        ``json_data`` is expected to map a date string to a list of tide
        entries. Judging from the indexing, an entry looks like
        ``[kind, time, height, coefficient]`` with ``'---'`` standing for a
        missing value -- TODO confirm against the live API.

        Args:
            json_data: Decoded response; anything that is not a non-empty
                dict produces an empty result.
            port: Harbour name copied into every row as ``port_reference``.

        Returns:
            One dict per date whose value is a list. Malformed entries
            (not a list/tuple, or fewer than three fields) are ignored
            instead of raising.
        """
        rows = []
        if not json_data or not isinstance(json_data, dict):
            return rows

        for date_str, tides in json_data.items():
            if not isinstance(tides, list):
                continue

            # Only keep entries that carry at least kind, time and height, so
            # the positional accesses below cannot fail.
            entries = [t for t in tides if isinstance(t, (list, tuple)) and len(t) >= 3]

            # Daily range: highest minus lowest height. If any height cannot
            # be parsed, the range for that day is left undefined.
            try:
                heights = [float(t[2]) for t in entries if t[2] != '---']
                marnage = max(heights) - min(heights) if heights else None
            except (TypeError, ValueError):
                marnage = None

            # First coefficient of the day that parses as an integer; an
            # unparsable value is skipped rather than failing the whole batch.
            coef = None
            for t in entries:
                if len(t) < 4 or t[3] == '---':
                    continue
                try:
                    coef = int(t[3])
                except (TypeError, ValueError):
                    continue
                break

            hw = [t for t in entries if isinstance(t[0], str) and 'high' in t[0]]
            lw = [t for t in entries if isinstance(t[0], str) and 'low' in t[0]]

            rows.append({
                'date': date_str, 'port_reference': port, 'coef_maree': coef,
                # A zero spread (single usable height) carries no information
                # and is reported as None, like an undefined range.
                'marnage_jour': round(marnage, 2) if marnage else None,
                'heures_pleines_mers': self._join_field(hw, 1),
                'hauteurs_pleines_mers': self._join_field(hw, 2),
                'heures_basses_mers': self._join_field(lw, 1),
                'hauteurs_basses_mers': self._join_field(lw, 2),
                'nb_pleines_mers': len(hw), 'nb_basses_mers': len(lw)
            })
        return rows

def _build_tasks(ports, start_date, end_date, step_days=7):
    """Return one (port, 'YYYY-MM-DD') task per port and per ``step_days`` window."""
    tasks = []
    for port in ports:
        current = start_date
        while current <= end_date:
            tasks.append((port, current.strftime('%Y-%m-%d')))
            current += timedelta(days=step_days)
    return tasks


def _save_checkpoint(rows, path):
    """Write ``rows`` to ``path`` via a temp file so an interrupted write cannot corrupt it."""
    if not rows:
        return
    tmp_path = f"{path}.tmp"
    pd.DataFrame(rows).to_csv(tmp_path, index=False)
    # Swap the complete file in only once it has been fully written.
    os.replace(tmp_path, path)


def run():
    """Collect tide data for every bathing site and write the final CSV.

    Steps:
      1. Read the sites from ``INPUT_CSV`` and attach the nearest harbour.
      2. Build one download task per harbour and per 7-day window between
         ``START_YEAR`` and ``END_YEAR``.
      3. Skip the tasks already present in the checkpoint file, download
         the rest with two worker threads, and checkpoint every 50 batches
         as well as when the loop ends (normally or not).
      4. Cross-join sites with every calendar day, left-join the tide rows
         and write the result to ``OUTPUT_CSV``.

    Batches that return no data are not recorded in the checkpoint, so
    they are attempted again on the next run.
    """
    scraper = ShomFastScraper(CHECKPOINT_DIR)
    df_sites = pd.read_csv(INPUT_CSV)
    df_sites['nearest_port'] = df_sites.apply(lambda r: scraper.find_nearest_port(r.latitude, r.longitude), axis=1)
    unique_ports = df_sites['nearest_port'].unique()

    tasks = _build_tasks(unique_ports, datetime(START_YEAR, 1, 1), datetime(END_YEAR, 12, 31))

    results = []
    checkpoint_file = os.path.join(CHECKPOINT_DIR, "master_progress.csv")
    processed_tasks = set()
    if os.path.exists(checkpoint_file):
        try:
            df_cp = pd.read_csv(checkpoint_file)
            if not df_cp.empty:
                processed_tasks = set(df_cp['checkpoint_key'].unique())
                results = df_cp.to_dict('records')
                print(f"‚úì Reprise : {len(processed_tasks)} paquets d√©j√† r√©cup√©r√©s.")
        except (OSError, KeyError, ValueError) as exc:
            # pandas' EmptyDataError/ParserError derive from ValueError; a
            # missing 'checkpoint_key' column raises KeyError. Report the
            # problem instead of silently discarding the previous progress.
            processed_tasks = set()
            results = []
            print(f"\nCheckpoint illisible ({exc}) : la collecte repart du premier paquet.")

    tasks_to_do = [t for t in tasks if f"{t[0]}_{t[1]}" not in processed_tasks]

    if tasks_to_do:
        print(f"üöÄ Collecte ({len(tasks_to_do)} paquets restants)... (Max 2 workers pour stabilit√©)")
        count = 0
        try:
            with ThreadPoolExecutor(max_workers=2) as executor, tqdm(total=len(tasks_to_do)) as pbar:
                future_to_task = {executor.submit(scraper.fetch_batch, p, d): (p, d) for p, d in tasks_to_do}
                for future in as_completed(future_to_task):
                    port, date_str = future_to_task[future]
                    try:
                        data = future.result()
                        if data:
                            processed = scraper.process_json(data, port)
                            for item in processed:
                                item['checkpoint_key'] = f"{port}_{date_str}"
                            results.extend(processed)
                        count += 1
                        if count % 50 == 0:
                            _save_checkpoint(results, checkpoint_file)
                    except Exception as e:
                        # Best effort: one bad batch must not stop the run.
                        print(f"\nErreur {port} {date_str}: {e}")
                    pbar.update(1)
        finally:
            # Persist the batches gathered since the last periodic save,
            # including when the loop is interrupted.
            _save_checkpoint(results, checkpoint_file)

    print("\nüì¶ Finalisation...")
    if results:
        df_tides = pd.DataFrame(results).drop_duplicates(subset=['date', 'port_reference'])
        all_dates = pd.date_range(start=f"{START_YEAR}-01-01", end=f"{END_YEAR}-12-31").strftime('%Y-%m-%d')
        df_dates = pd.DataFrame({'date': all_dates})
        # One row per (site, day); a real cross join avoids the temporary
        # 'key' column that would clobber a same-named input column.
        df_final = df_sites.merge(df_dates, how='cross')
        df_final = df_final.merge(df_tides, left_on=['nearest_port', 'date'], right_on=['port_reference', 'date'], how='left')
        # Make sure the destination folder exists so the final write cannot
        # fail after the whole collection has completed.
        output_dir = os.path.dirname(OUTPUT_CSV)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        df_final.to_csv(OUTPUT_CSV, index=False)
        print(f"‚úÖ Sauvegard√© : {OUTPUT_CSV}")
    else:
        print("‚ùå Aucune donn√©e collect√©e.")

run()

üöÄ Collecte (8151 paquets restants)... (Max 2 workers pour stabilit√©)


  0%|          | 0/8151 [00:00<?, ?it/s]


üì¶ Finalisation...
‚úÖ Sauvegard√© : /content/drive/MyDrive/PrevisiBaignade/donnees_marees_finales.csv
