In [1]:
import requests, pandas as pd, math
from src.utils.file_utils import convert_time_to_seconds
from Levenshtein import ratio
import nest_asyncio, asyncio, pandas as pd
nest_asyncio.apply() 

from src.utils.ffa_fast import get_all_results_fast as get_all_athlete_results
from src.utils.http_utils import search_athletes            # FFA autocomplete

In [2]:
def find_most_similar_club(club_name, clubs_list):
    """Trouve le club le plus similaire dans une liste de clubs.
    
    Args:
        club_name (str): Le nom du club à chercher
        clubs_list (list): Liste des noms de clubs possibles
        
    Returns:
        str: Le club le plus similaire trouvé ou None si aucun match satisfaisant
    """
    # Nettoyage des noms de clubs
    club_name = club_name.strip().lower()
    
    best_match = None
    best_ratio = 0
    
    for club in clubs_list:
        if not club or len(club.strip()) == 0:
            continue
            
        # Nettoyage du club de la liste
        clean_club = club.strip().lower()
        # Suppression du tiret en début de ligne s'il existe
        if clean_club.startswith('- '):
            clean_club = clean_club[2:]
            
        # Calcul de la similarité
        similarity = ratio(club_name, clean_club)
        
        # Mise à jour du meilleur match si la similarité est meilleure
        if similarity > best_ratio:
            best_ratio = similarity
            best_match = club
            
    # On ne renvoie le match que si la similarité est suffisante (> 0.6)
    return best_match if best_ratio > 0.6 else None

In [11]:
# 1) paramètres
chOID    = "67e0519e45b2d8eaf995ae71"
groupOID = "67e0545d0ed16f20fdbf88f8"
url      = "https://athle.live/api/results"

params  = {"chOID": chOID, "groupOID": groupOID}
headers = {"Accept": "application/json",
           "User-Agent": "Mozilla/5.0 (python-requests)"}

# 2) requête – surtout **ne PAS** envoyer If-None-Match si tu veux la réponse complète
r = requests.get(url, params=params, headers=headers, timeout=10)
r.raise_for_status()
j = r.json()

# 3) extraction des engagés
rows = []
for c in j.get("commitments", []):
    perf_ms = None

    # ordre de priorité : perf engagée (pe), PB, SB, temps inscrit (tReg)
    if isinstance(c.get("pe"), dict) and c["pe"].get("u") == "ms":
        perf_ms = c["pe"]["v"]
    elif isinstance(c.get("pb"), dict) and c["pb"].get("u") == "ms":
        perf_ms = c["pb"]["v"]
    elif isinstance(c.get("sb"), dict) and c["sb"].get("u") == "ms":
        perf_ms = c["sb"]["v"]
    elif isinstance(c.get("tReg"), dict) and c["tReg"].get("u") == "ms":
        perf_ms = c["tReg"]["v"]

    # mise en forme mm:ss.cc (ex. 1:59.84) si on a bien une valeur
    if perf_ms is not None and math.isfinite(perf_ms):
        minutes, ms = divmod(int(perf_ms), 60000)
        seconds     = ms / 1000
        perf_str    = f"{minutes}:{seconds:05.2f}"
    else:
        perf_str    = ""

    rows.append({
        "Prénom" : c.get("first"),
        "Nom"    : c.get("last"),
        "Club"   : c.get("clubName") or c.get("club"),
        "Temps d'engagement" : perf_str
    })

df_athle_live = pd.DataFrame(rows)
df_athle_live.to_csv("engages_800m_tcm.csv", index=False, encoding="utf-8")

print(f"{len(df_athle_live)} athlètes exportés dans engages_800m_tcm.csv")
print(df_athle_live.head())


83 athlètes exportés dans engages_800m_tcm.csv
   Prénom         Nom                 Club Temps d'engagement
0  Jeremy  DACHICOURT   ATHLETIC CLUBS 92*            1:57.00
1   Lucas  CANTALOUBE   ATHLETIC CLUBS 92*            1:57.63
2    Hugo     DEL DIN  AC DU PAYS DE MEAUX            1:55.00
3  Valere     GRAFFIN         SAM PARIS 12            1:59.90
4     Max   FRADILLON         SAM PARIS 12                   


In [6]:
df_athle_live[df_athle_live.Prénom =="Paul"]

Unnamed: 0,Prénom,Nom,Club,Temps d'engagement
53,Paul,GULZINSKI,AMIENS UC,1:54.42


# TEST

In [None]:
# TEST
for i in range(len(df_athle_live)):
    
    ath = df_athle_live.iloc[i]['Prénom'] + ' ' + df_athle_live.iloc[i]['Nom']
    
    ath = ' '.join(ath.split())
    print(ath)
    
    results = search_athletes(ath)
    print(f"Results found: {len(results)}")
    
    if len(results) > 1:
        
        club = df_athle_live.iloc[i]['Club']
        
        clubs_candidat = []
        for i in range(len(results)):
            clubs_candidat.append(results[i]['club'])
        
        most_similar_club = find_most_similar_club(club, clubs_candidat)
        
        if most_similar_club is None:
            print(f"No similar club found for {club}. Skipping athlete.")
            continue
        
        index = clubs_candidat.index(most_similar_club)
        
        seq = results[index]['seq']
        
        df = get_all_athlete_results(results[index]['seq'])
        
        
        df = df[(df.Epreuve == '800m') | (df.Epreuve == '800m Piste Courte')]
        
        if df.empty:
            print(f"No results found for {ath}.")
            continue

        df = df[~df["Perf."].str.contains("|".join(["DNS", "DNF", "AB", "DQ"]), na=False)]
        df["time"] = df["Perf."].apply(convert_time_to_seconds)
        
        best_800_2025 = df[df.Annee == "2025"]['time'].min()
        
        best_800_all_time = df['time'].min()
        
        year_of_best_800 = df.loc[df['time'].idxmin()]['Annee']
        
        print(f"Best 800m time in 2025: {best_800_2025}")
        print(f"Best 800m time all time: {best_800_all_time}")
        print(f"Year of best 800m time: {year_of_best_800}")
        
    elif len(results) == 1:
        seq = results[0]['seq']
        
        df = get_all_athlete_results(seq)
        
        df = df[(df.Epreuve == '800m') | (df.Epreuve == '800m Piste Courte')]
        
        if df.empty:
            print(f"No results found for {ath}.")
            continue

        df = df[~df["Perf."].str.contains("|".join(["DNS", "DNF", "AB", "DQ"]), na=False)]
        df["time"] = df["Perf."].apply(convert_time_to_seconds)
        
        best_800_2025 = df[df.Annee == "2025"]['time'].min()
        
        best_800_all_time = df['time'].min()
        
        year_of_best_800 = df.loc[df['time'].idxmin()]['Annee']
        
        print(f"Best 800m time in 2025: {best_800_2025}")
        print(f"Best 800m time all time: {best_800_all_time}")
        print(f"Year of best 800m time: {year_of_best_800}")
        
        
    else:
        print(f"No results found for {ath}.")
        continue
    
                                
            
            
            
        
    

Benjamin LEROY
Results found: 9
Best 800m time in 2025: 121.97
Best 800m time all time: 119.84
Year of best 800m time: 2022
Emeric LASNIER
Results found: 1
Best 800m time in 2025: 128.35
Best 800m time all time: 125.24
Year of best 800m time: 2019
Emeric LACIRE
Results found: 0
No results found for Emeric LACIRE.
Mathias GAYRAUD
Results found: 1
Best 800m time in 2025: nan
Best 800m time all time: 127.07
Year of best 800m time: 2023
Lucas CANTALOUBE
Results found: 1
Best 800m time in 2025: 118.55
Best 800m time all time: 117.63
Year of best 800m time: 2024
Christophe DONNARD
Results found: 1
No results found for Christophe DONNARD.
Bensaada ANIS
Results found: 0
No results found for Bensaada ANIS.
Samy MACHWATE
Results found: 1
Best 800m time in 2025: 125.69
Best 800m time all time: 125.69
Year of best 800m time: 2025
Brice PANCHOT
Results found: 1
Best 800m time in 2025: nan
Best 800m time all time: 125.29
Year of best 800m time: 2013
Mathieu MAS
Results found: 15
Best 800m time in 20

# version lente

In [8]:
import pandas as pd
from tqdm import tqdm

# ── fonctions utilitaires ────────────────────────────────────────────────────
IGNORED_MARKS = ("DNS", "DNF", "AB", "DQ")

def clean_name(row: pd.Series) -> str:
    """Prénom + Nom normalisés, sans double-espaces."""
    return ' '.join(f"{row['Prénom']} {row['Nom']}".split())

def choose_match(results, target_club):
    """Heuristique club + premier résultat sinon."""
    if not results:
        return None
    if len(results) == 1:
        return results[0]

    clubs = [r["club"] for r in results]
    best = find_most_similar_club(target_club, clubs)
    return results[clubs.index(best)] if best else None

def extract_bests(df800):
    """Renvoie (best_all_time, best_2025, year_of_best_all_time) ou (None, None, None)."""
    if df800.empty:
        return None, None, None

    df800 = df800[~df800["Perf."].str.contains("|".join(IGNORED_MARKS), na=False)]
    df800["time"] = df800["Perf."].apply(convert_time_to_seconds)

    if df800.empty:
        return None, None, None

    best_all = df800["time"].min()
    best_2025 = df800.loc[df800.Annee == "2025", "time"].min()  # NaN si pas 2025
    year_best = df800.loc[df800["time"].idxmin(), "Annee"]
    return best_all, best_2025, year_best

# ── pipeline principal ───────────────────────────────────────────────────────
memo_seq = {}            #   nom complet  → seq  (évite les doubles hits API)
memo_results = {}        #   seq          → (best_all, best_2025, year_best)

best_all_col   = []
best_2025_col  = []
year_best_col  = []

for _, row in tqdm(df_athle_live.iterrows(), total=len(df_athle_live)):
    full_name = clean_name(row)

    # 1) Trouver ou récupérer le seq -----------------------------------------
    if full_name not in memo_seq:
        search = search_athletes(full_name)
        match  = choose_match(search, row["Club"])
        if not match:
            memo_seq[full_name] = None          # on retient l’échec pour ne pas recommencer
        else:
            memo_seq[full_name] = match["seq"]

    seq = memo_seq[full_name]
    if not seq:
        best_all_col.append(None); best_2025_col.append(None); year_best_col.append(None)
        continue

    # 2) Scraper / récupérer les perfs ---------------------------------------
    if seq not in memo_results:
        df_res   = get_all_athlete_results(seq)
        df_800   = df_res[df_res.Epreuve.isin(["800m", "800m Piste Courte"])]
        memo_results[seq] = extract_bests(df_800)

    best_all, best_2025, year_best = memo_results[seq]
    best_all_col.append(best_all)
    best_2025_col.append(best_2025)
    year_best_col.append(year_best)

# ── enrichissement du DataFrame ---------------------------------------------
df_athle_live = df_athle_live.assign(best_800_all_time=best_all_col,
                                     best_800_2025=best_2025_col,
                                     year_of_best_800=year_best_col)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df800["time"] = df800["Perf."].apply(convert_time_to_seconds)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df800["time"] = df800["Perf."].apply(convert_time_to_seconds)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df800["time"] = df800["Perf."].apply(convert_time_to_seconds)
A value is trying t

KeyboardInterrupt: 

#  version rapide

In [12]:
import pandas as pd
from tqdm import tqdm
import pickle
import os
import time
import asyncio
import aiohttp
from concurrent.futures import ThreadPoolExecutor
import threading
from queue import Queue

# ── Configuration ────────────────────────────────────────────────────────
IGNORED_MARKS = ("DNS", "DNF", "AB", "DQ")
CACHE_FILE = "athlete_cache.pkl"
MAX_CONCURRENT = 10  # Nombre de requêtes simultanées

# ── Fonctions utilitaires ────────────────────────────────────────────────
def clean_name(row: pd.Series) -> str:
    return ' '.join(f"{row['Prénom']} {row['Nom']}".split())

def choose_match(results, target_club):
    if not results:
        return None
    if len(results) == 1:
        return results[0]
    clubs = [r["club"] for r in results]
    best = find_most_similar_club(target_club, clubs)
    return results[clubs.index(best)] if best else None

def extract_bests(df800):
    if df800.empty:
        return None, None, None
    df800 = df800[~df800["Perf."].str.contains("|".join(IGNORED_MARKS), na=False)]
    df800["time"] = df800["Perf."].apply(convert_time_to_seconds)
    if df800.empty:
        return None, None, None
    best_all = df800["time"].min()
    best_2025 = df800.loc[df800.Annee == "2025", "time"].min()
    year_best = df800.loc[df800["time"].idxmin(), "Annee"]
    return best_all, best_2025, year_best

# ── Cache management ─────────────────────────────────────────────────────
def load_cache():
    if os.path.exists(CACHE_FILE):
        try:
            with open(CACHE_FILE, 'rb') as f:
                return pickle.load(f)
        except:
            pass
    return {"seq": {}, "results": {}}

def save_cache(cache):
    try:
        with open(CACHE_FILE, 'wb') as f:
            pickle.dump(cache, f)
    except Exception as e:
        print(f"Erreur sauvegarde: {e}")

# ── Approche par batch avec threading ────────────────────────────────────
def worker_search_athletes(work_queue, results_queue, cache, lock):
    """Worker thread pour la recherche d'athlètes."""
    while True:
        item = work_queue.get()
        if item is None:
            break
        
        idx, full_name, club = item
        try:
            search = search_athletes(full_name)
            match = choose_match(search, club)
            seq = match["seq"] if match else None
            
            with lock:
                cache["seq"][full_name] = seq
            
            results_queue.put((idx, seq))
            time.sleep(0.1)  # Petite pause
            
        except Exception as e:
            print(f"Erreur recherche {full_name}: {e}")
            with lock:
                cache["seq"][full_name] = None
            results_queue.put((idx, None))
            time.sleep(0.5)
        
        work_queue.task_done()

def worker_get_results(work_queue, results_queue, cache, lock):
    """Worker thread pour récupérer les résultats."""
    while True:
        item = work_queue.get()
        if item is None:
            break
        
        idx, seq = item
        try:
            df_res = get_all_athlete_results(seq)
            df_800 = df_res[df_res.Epreuve.isin(["800m", "800m Piste Courte"])]
            result = extract_bests(df_800)
            
            with lock:
                cache["results"][seq] = result
            
            results_queue.put((idx, result))
            time.sleep(0.1)
            
        except Exception as e:
            print(f"Erreur résultats seq {seq}: {e}")
            result = (None, None, None)
            with lock:
                cache["results"][seq] = result
            results_queue.put((idx, result))
            time.sleep(0.5)
        
        work_queue.task_done()

def process_batch_threaded(items, worker_func, cache, num_workers=5):
    """Traite un batch d'items avec des threads."""
    work_queue = Queue()
    results_queue = Queue()
    lock = threading.Lock()
    
    # Créer et démarrer les workers
    workers = []
    for _ in range(num_workers):
        worker = threading.Thread(target=worker_func, args=(work_queue, results_queue, cache, lock))
        worker.daemon = True
        worker.start()
        workers.append(worker)
    
    # Ajouter les tâches
    for item in items:
        work_queue.put(item)
    
    # Collecter les résultats
    results = {}
    for _ in tqdm(range(len(items)), desc="Traitement batch"):
        idx, result = results_queue.get()
        results[idx] = result
    
    # Arrêter les workers
    for _ in workers:
        work_queue.put(None)
    for worker in workers:
        worker.join()
    
    return results

# ── Version ultra-optimisée ──────────────────────────────────────────────
def process_athletes_ultra_fast(df_athle_live):
    """Version ultra-rapide avec traitement par batch et threading."""
    
    cache = load_cache()
    print(f"Cache initial: {len(cache['seq'])} noms, {len(cache['results'])} résultats")
    
    # Identifier les tâches à faire
    search_tasks = []
    result_tasks = []
    
    for idx, row in df_athle_live.iterrows():
        full_name = clean_name(row)
        
        if full_name not in cache["seq"]:
            search_tasks.append((idx, full_name, row["Club"]))
        elif cache["seq"][full_name] and cache["seq"][full_name] not in cache["results"]:
            result_tasks.append((idx, cache["seq"][full_name]))
    
    print(f"À traiter: {len(search_tasks)} recherches, {len(result_tasks)} résultats")
    
    # Phase 1: Recherche des athlètes par batch
    if search_tasks:
        print("Phase 1: Recherche des athlètes...")
        batch_size = 50
        for i in range(0, len(search_tasks), batch_size):
            batch = search_tasks[i:i+batch_size]
            process_batch_threaded(batch, worker_search_athletes, cache, num_workers=3)
            save_cache(cache)  # Sauvegarder après chaque batch
            time.sleep(1)  # Pause entre les batchs
    
    # Recalculer les tâches de résultats après la phase 1
    result_tasks = []
    for idx, row in df_athle_live.iterrows():
        full_name = clean_name(row)
        seq = cache["seq"].get(full_name)
        if seq and seq not in cache["results"]:
            result_tasks.append((idx, seq))
    
    # Phase 2: Récupération des résultats par batch
    if result_tasks:
        print("Phase 2: Récupération des résultats...")
        batch_size = 30
        for i in range(0, len(result_tasks), batch_size):
            batch = result_tasks[i:i+batch_size]
            process_batch_threaded(batch, worker_get_results, cache, num_workers=3)
            save_cache(cache)
            time.sleep(1)
    
    # Phase 3: Construction des résultats finaux
    print("Phase 3: Construction des colonnes...")
    best_all_col = []
    best_2025_col = []
    year_best_col = []
    
    for _, row in df_athle_live.iterrows():
        full_name = clean_name(row)
        seq = cache["seq"].get(full_name)
        
        if seq and seq in cache["results"]:
            best_all, best_2025, year_best = cache["results"][seq]
        else:
            best_all, best_2025, year_best = None, None, None
        
        best_all_col.append(best_all)
        best_2025_col.append(best_2025)
        year_best_col.append(year_best)
    
    save_cache(cache)
    return best_all_col, best_2025_col, year_best_col

# ── Version simple mais plus rapide ──────────────────────────────────────
def process_athletes_simple_fast(df_athle_live):
    """Version simple mais avec optimisations de base."""
    
    cache = load_cache()
    
    # Pré-calculer tous les noms pour éviter les recalculs
    names_map = {}
    for idx, row in df_athle_live.iterrows():
        names_map[idx] = clean_name(row)
    
    print("Phase 1: Recherche des athlètes manquants...")
    search_count = 0
    for idx, row in tqdm(df_athle_live.iterrows(), desc="Recherche"):
        full_name = names_map[idx]
        
        if full_name not in cache["seq"]:
            try:
                search = search_athletes(full_name)
                match = choose_match(search, row["Club"])
                cache["seq"][full_name] = match["seq"] if match else None
                search_count += 1
                
                # Sauvegarder tous les 25 nouveaux
                if search_count % 25 == 0:
                    save_cache(cache)
                
                time.sleep(0.2)  # Pause réduite
                
            except Exception as e:
                cache["seq"][full_name] = None
                time.sleep(1)
    
    save_cache(cache)
    
    print("Phase 2: Récupération des résultats manquants...")
    result_count = 0
    for idx, row in tqdm(df_athle_live.iterrows(), desc="Résultats"):
        full_name = names_map[idx]
        seq = cache["seq"].get(full_name)
        
        if seq and seq not in cache["results"]:
            try:
                df_res = get_all_athlete_results(seq)
                df_800 = df_res[df_res.Epreuve.isin(["800m", "800m Piste Courte"])]
                cache["results"][seq] = extract_bests(df_800)
                result_count += 1
                
                if result_count % 25 == 0:
                    save_cache(cache)
                
                time.sleep(0.2)
                
            except Exception as e:
                cache["results"][seq] = (None, None, None)
                time.sleep(1)
    
    save_cache(cache)
    
    # Construction finale
    print("Construction des colonnes finales...")
    best_all_col = []
    best_2025_col = []
    year_best_col = []
    
    for idx in df_athle_live.index:
        full_name = names_map[idx]
        seq = cache["seq"].get(full_name)
        
        if seq and seq in cache["results"]:
            best_all, best_2025, year_best = cache["results"][seq]
        else:
            best_all, best_2025, year_best = None, None, None
        
        best_all_col.append(best_all)
        best_2025_col.append(best_2025)
        year_best_col.append(year_best)
    
    return best_all_col, best_2025_col, year_best_col

# ── Utilisation ──────────────────────────────────────────────────────────
# Choisissez la version selon votre situation

# Version ultra-rapide (recommandée si le serveur peut gérer la charge)
# best_all_col, best_2025_col, year_best_col = process_athletes_ultra_fast(df_athle_live)

# Version simple mais optimisée (plus sûre)
best_all_col, best_2025_col, year_best_col = process_athletes_simple_fast(df_athle_live)

# Enrichissement du DataFrame
df_athle_live = df_athle_live.assign(
    best_800_all_time=best_all_col,
    best_800_2025=best_2025_col,
    year_of_best_800=year_best_col
)

print(f"Traitement terminé pour {len(df_athle_live)} athlètes")

Phase 1: Recherche des athlètes manquants...


Recherche: 83it [01:10,  1.18it/s]


Phase 2: Récupération des résultats manquants...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df800["time"] = df800["Perf."].apply(convert_time_to_seconds)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df800["time"] = df800["Perf."].apply(convert_time_to_seconds)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df800["time"] = df800["Perf."].apply(convert_time_to_seconds)
A value is trying t

Construction des colonnes finales...
Traitement terminé pour 83 athlètes





In [15]:
df_athle_live.head()

Unnamed: 0,Prénom,Nom,Club,Temps d'engagement,best_800_all_time,best_800_2025,year_of_best_800
0,Jeremy,DACHICOURT,ATHLETIC CLUBS 92*,1:57.00,1:58.06,1:58.06,2025.0
1,Lucas,CANTALOUBE,ATHLETIC CLUBS 92*,1:57.63,1:57.63,1:58.55,2024.0
2,Hugo,DEL DIN,AC DU PAYS DE MEAUX,1:55.00,1:59.58,,2024.0
3,Valere,GRAFFIN,SAM PARIS 12,1:59.90,1:58.42,,2015.0
4,Max,FRADILLON,SAM PARIS 12,,,,


In [14]:
def format_time(x):

    minutes = int(x) // 60
    seconds = int(x) % 60
    centiseconds = int((x * 100) % 100)
    return f"{minutes}:{seconds:02d}.{centiseconds:02d}"

df_athle_live.best_800_all_time = df_athle_live.best_800_all_time.apply(lambda x: format_time(x) if pd.notnull(x) else x)
df_athle_live.best_800_2025 = df_athle_live.best_800_2025.apply(lambda x: format_time(x) if pd.notnull(x) else x)

# df_athle_live.to_excel('800m_TCM_st_maur_4juin.xlsx', index=False)