In [24]:
# ...existing code...
import requests
import pandas as pd

def get_top_list_html(year, event_id, event_slug="800-metres", category_slug="middlelong", gender="women", age_category="senior"):
    """
    Scrape le tableau des bilans depuis la page HTML de World Athletics.
    
    L'URL est construite ainsi :
    /records/toplists/{category_slug}/{event_slug}/all/{gender}/{age_category}/{year}
    """
    
    # Construction de l'URL de base (partie chemin)
    base_url = f"https://worldathletics.org/records/toplists/{category_slug}/{event_slug}/all/{gender}/{age_category}/{year}"
    
    # Les paramètres d'URL (query string) que tu as identifiés
    params = {
        "regionType": "world",
        "timing": "electronic",
        "page": 1,
        "bestResultsOnly": "true",
        "maxResultsByCountry": "all",
        "eventId": event_id,
        "ageCategory": age_category
    }

    # Headers indispensables pour ne pas être bloqué (User-Agent)
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/144.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8"
    }
    
    try:
        # On fait une requête GET standard sur la page web
        response = requests.get(base_url, params=params, headers=headers)
        response.raise_for_status()
        
        # OPTIMISATION ICI :
        # 1. On utilise flavor='lxml' (C'est 10x à 50x plus rapide que le défaut)
        # 2. On cible la classe spécifique 'records-table' pour ne pas scanner toute la page
        dfs = pd.read_html(response.text, flavor='lxml', attrs={'class': 'records-table'})
        
        if dfs:
            return dfs[0]
        else:
            return "Aucun tableau trouvé sur la page."
            
    except Exception as e:
        return f"Erreur lors du scraping : {str(e)}"

# Test avec tes paramètres (800m Femmes Senior 2023)
# Note : Il faut connaitre le slug ("middlelong", "800-metres") qui est dans l'URL
df_results = get_top_list_html(
    year=2023, 
    event_id=10229501, 
    event_slug="800-metres", 
    category_slug="middlelong", # Attention: change selon l'épreuve (sprints, jumps, etc.)
    gender="men"
)




Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



In [25]:
df_results.head()

Unnamed: 0,Rank,Mark,WIND,Competitor,DOB,Unnamed: 5,Pos,Unnamed: 7,Venue,Date,Results Score
0,1,1:42.80,,Emmanuel WANYONYI,01 AUG 2004,KEN,1,,"Hayward Field, Eugene, OR (USA)",17 SEP 2023,1241
1,2,1:42.85,,Marco AROP,20 SEP 1998,CAN,2,,"Hayward Field, Eugene, OR (USA)",17 SEP 2023,1240
2,3,1:43.06,,Djamel SEDJATI,03 MAY 1999,ALG,3,,"Hayward Field, Eugene, OR (USA)",17 SEP 2023,1233
3,4,1:43.22,,Wyclife KINYAMAL,02 JUL 1997,KEN,1,,"Stade Louis II, Monaco (MON)",21 JUL 2023,1228
4,5,1:43.38,,Slimane MOULA,25 FEB 1999,ALG,3,,"Stade Charléty, Paris (FRA)",09 JUN 2023,1223


In [39]:
results = []

for i in range(2001, 2026):

    df_results = get_top_list_html(
        year=i, 
        event_id=10229501, 
        event_slug="800-metres", 
        category_slug="middlelong", 
        gender="men")
    
    results.append(df_results)

    final_df = pd.concat(results)

    print(final_df.shape)
    


Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



(100, 11)



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



(200, 11)



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



(300, 11)



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



(400, 11)



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



(500, 11)



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



(600, 11)



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



(700, 11)



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



(800, 11)



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



(900, 11)



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



(1000, 11)



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



(1100, 11)



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



(1200, 11)



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



(1300, 11)



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



(1400, 11)



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



(1500, 11)



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



(1600, 11)



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



(1700, 11)



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



(1800, 11)



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



(1900, 11)



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



(2000, 11)



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



(2100, 11)



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



(2200, 11)



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



(2300, 11)



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



(2400, 11)
(2500, 11)



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



In [40]:
final_df.shape

(2500, 11)

In [41]:
def convert_time_to_seconds(time_str):
    minutes, seconds, centiseconds = time_str.split(":")[0], time_str.split(".")[0].split(":")[1], time_str.split(".")[1]
    total_seconds = int(minutes) * 60 + int(seconds) + int(centiseconds) / 100
    return total_seconds

In [42]:
final_df['Mark_s'] = final_df['Mark'].apply(convert_time_to_seconds)
final_df['Date'] = pd.to_datetime(final_df['Date'], format='%d %b %Y', errors='coerce')
final_df['Year'] = final_df['Date'].dt.year

In [43]:
final_df

Unnamed: 0,Rank,Mark,WIND,Competitor,DOB,Unnamed: 5,Pos,Unnamed: 7,Venue,Date,Results Score,Mark_s,Year
0,1,1:42.47,,Yuriy BORZAKOVSKIY,12 APR 1981,RUS,1,,Bruxelles (BEL),2001-08-24,1252,102.47,2001
1,2,1:42.55,,André BUCHER,19 OCT 1976,SUI,1f1,,"Letzigrund, Zürich (SUI)",2001-08-17,1249,102.55,2001
2,3,1:42.81,,Jean-Patrick NDUWIMANA,09 MAR 1978,BDI,2f1,,"Letzigrund, Zürich (SUI)",2001-08-17,1241,102.81,2001
3,4,1:42.96,,Wilfred BUNGEI,24 JUL 1980,KEN,3f1,,"Letzigrund, Zürich (SUI)",2001-08-17,1236,102.96,2001
4,5,1:43.00,,William YIAMPOY,17 MAY 1974,KEN,4f1,,"Letzigrund, Zürich (SUI)",2001-08-17,1235,103.00,2001
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,96,1:44.88,,Tibo DE SMET,28 MAY 1999,BEL,2f3,,"TSV Stadion, Pfungstadt (GER)",2025-08-20,1177,104.88,2025
96,97,1:44.89,,Camden MARSHALL,01 MAY 2003,USA,8,,"Hayward Field, Eugene, OR (USA)",2025-08-03,1177,104.89,2025
97,98,1:44.91,,Amel TUKA,09 JAN 1991,BIH,3,,"Stadio G. Teghil, Lignano Sabbiadoro (ITA)",2025-07-13,1176,104.91,2025
98,99,1:44.92,,Wes FERGUSON,05 MAR 2001,USA,3,,"Ocean Breeze Athl. Complex, New York, NY (USA)...",2025-02-23,1234,104.92,2025


In [44]:
# ...existing code...
import plotly.graph_objects as go
import pandas as pd

base_date = pd.Timestamp("2024-01-01")
final_df['Time_Obj'] = base_date + pd.to_timedelta(final_df['Mark_s'], unit='s')

# 1. Préparation : On construit le texte complet qu'on veut voir au survol
# On combine le Nom, la Nation (si dispo) et le temps officiel exact
# Astuce: On utilise du HTML (<b>) pour mettre le nom en gras
final_df['Hover_Label'] = (
    "<b>" + final_df['Competitor'] + "</b> (" + final_df['Unnamed: 5'] + ")<br>" + 
    "Date: " + final_df['Date'].dt.strftime('%d/%m/%Y')
)

unique_years = sorted(final_df['Year'].dropna().unique())

fig = go.Figure()

for year in unique_years:
    # On filtre le DataFrame pour l'année (on garde toutes les colonnes cette fois)
    df_year = final_df[final_df['Year'] == year]
    
    if not df_year.empty:
        fig.add_trace(go.Box(
            y=df_year['Time_Obj'],
            
            # ICI : On passe la liste des noms correspondants
            text=df_year['Hover_Label'],
            
            name=str(int(year)),
            width=0.6,
            boxpoints='outliers' 
        ))

fig.update_layout(
    title='Evolution du top 100 mondial masculin sur 800m Outdoor (2001-2025)',
    xaxis_title='Année',
    yaxis_title='Temps',
    boxmode='group',
    showlegend=False,
    width=1000,
    height=600,
    yaxis=dict(tickformat="%M:%S.%L") # Pas de centièmes sur l'axe Y
)

# 2. Configuration de l'affichage au survol
# %{text} affiche notre colonne 'Hover_Label' (Nom + Nat)
# %{y} affiche le temps (formaté par le tickformat ou spécifique ici)
fig.update_traces(
    hovertemplate="%{text}<br>Temps: %{y|%M'%S''%L}"
)

fig.write_html('C:\\Users\\Lucas\\Documents\\DATA_SCIENCE\\GRAPH\\evol_top_100_mondial_masculin_800m_outdoor.html')
fig.show()

In [48]:
import requests
import pandas as pd
import plotly.graph_objects as go
import os
from datetime import datetime

# ==========================================
# 1. ANALYSIS PARAMETERS (ENGLISH)
# ==========================================

# --- Event Parameters ---
EVENT_ID = 10229501       # World Athletics ID (ex: 800m=10229512, 100m=10229501)
EVENT_SLUG = "800-metres" # URL Slug (ex: "800-metres", "100-metres")
CATEGORY_SLUG = "middlelong" # URL Category (ex: "sprints", "middlelong", "jumps")
GENDER = "men"            # "men" or "women"
AGE_CATEGORY = "u18"   # "senior", "u20", etc.

# --- Region & Venue ---
REGION_API = "world"      # API parameter value (ex: "world", "europe")
REGION_NAME = "world"     # For filename/title (ex: "world", "europe", "france")
VENUE = "outdoor"         # "outdoor" or "indoor"

# --- Time Period ---
YEAR_START = 2001
YEAR_END = 2025

# --- Output Parameters ---
OUTPUT_FOLDER = r"C:\Users\Lucas\Documents\DATA_SCIENCE\GRAPH"
# Title generated in English
GRAPH_TITLE = f"Top 100 Evolution - {REGION_NAME.capitalize()} {GENDER.capitalize()} {EVENT_SLUG} {AGE_CATEGORY} {VENUE.capitalize()} ({YEAR_START}-{YEAR_END})"


# ==========================================
# 2. UTILITY FUNCTIONS
# ==========================================

def get_top_list_html(year):
    """
    Scrapes the Top List table from World Athletics for a specific year.
    Injects global parameters (VENUE, REGION, ETC).
    """
    # URL construction with VENUE (indoor/outdoor)
    base_url = f"https://worldathletics.org/records/toplists/{CATEGORY_SLUG}/{EVENT_SLUG}/{VENUE}/{GENDER}/{AGE_CATEGORY}/{year}"
    
    params = {
        "regionType": REGION_API,
        "timing": "electronic",
        "page": 1,
        "bestResultsOnly": "true",
        "maxResultsByCountry": "all",
        "eventId": EVENT_ID,
        "ageCategory": AGE_CATEGORY
    }
    
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/144.0.0.0 Safari/537.36"
    }
    
    try:
        response = requests.get(base_url, params=params, headers=headers)
        response.raise_for_status()
        dfs = pd.read_html(response.text, flavor='lxml', attrs={'class': 'records-table'})
        return dfs[0] if dfs else None
    except Exception as e:
        return None

def convert_time_to_seconds(time_str):
    try:
        time_str = str(time_str).strip()
        if ':' in time_str:
            parts = time_str.split(':')
            if len(parts) == 2: # MM:SS.cc
                m, s = parts
                return int(m) * 60 + float(s)
            elif len(parts) == 3: # HH:MM:SS
                h, m, s = parts
                return int(h) * 3600 + int(m) * 60 + float(s)
        return float(time_str)
    except ValueError:
        return None

# ==========================================
# 3. EXECUTION & GENERATION
# ==========================================

print(f"--- {GRAPH_TITLE} ---")
results = []

for year in range(YEAR_START, YEAR_END + 1):
    df_year = get_top_list_html(year)
    
    if df_year is not None and not df_year.empty:
        # Date parsing
        df_year['Date'] = pd.to_datetime(df_year['Date'], format='%d %b %Y', errors='coerce')
        df_year['Year'] = year
        results.append(df_year)
        print(f"✅ {year}: {len(df_year)} performances found")
    else:
        print(f"❌ {year}: -")

if not results:
    print("No data retrieved. Please check parameters (e.g., does Indoor exist for this event/year?).")
else:
    # 1. Consolidation
    final_df = pd.concat(results, ignore_index=True)
    
    # 2. Cleaning & Conversion
    final_df['Mark_s'] = final_df['Mark'].apply(convert_time_to_seconds)
    final_df = final_df.dropna(subset=['Mark_s'])

    # 3. Create Date Object for Plotly Y-Axis scaling
    base_date_ref = pd.Timestamp("2024-01-01")
    final_df['Time_Obj'] = base_date_ref + pd.to_timedelta(final_df['Mark_s'], unit='s')

    # 4. Create Hover Text
    nat_col = 'Nat' if 'Nat' in final_df.columns else 'Country'
    col_nat_data = final_df[nat_col] if nat_col in final_df.columns else final_df.iloc[:, 5]

    final_df['Hover_Label'] = (
        "<b>" + final_df['Competitor'] + "</b> (" + col_nat_data.fillna('') + ")<br>" + 
        "Date: " + final_df['Date'].dt.strftime('%d/%m/%Y')
    )

    # 5. Building the Graph
    unique_years = sorted(final_df['Year'].unique())
    fig = go.Figure()

    for year in unique_years:
        df_display = final_df[final_df['Year'] == year]
        if not df_display.empty:
            fig.add_trace(go.Box(
                y=df_display['Time_Obj'],
                text=df_display['Hover_Label'],
                name=str(year),
                width=0.6,
                boxpoints='outliers'
            ))

    fig.update_layout(
        title=GRAPH_TITLE,
        xaxis_title='Year',    # ENGLISH
        yaxis_title='Time',    # ENGLISH
        boxmode='group',
        showlegend=False,
        width=1000,
        height=600,
        yaxis=dict(tickformat="%M:%S.%L") # Format: Minutes'Seconds''
    )

    # Hover template in English
    fig.update_traces(hovertemplate="%{text}<br>Time: %{y|%M'%S''%L}")

    # 6. Save File
    if not os.path.exists(OUTPUT_FOLDER):
        os.makedirs(OUTPUT_FOLDER)
    
    # Filename construction: evol_top_100_world_men_800-metres_outdoor.html
    filename = f"{GRAPH_TITLE}.html"
    full_path = os.path.join(OUTPUT_FOLDER, filename)
    
    fig.write_html(full_path)
    print(f"\nSuccessfully saved: {full_path}")
    fig.show()

--- Top 100 Evolution - World Men 800-metres u18 Outdoor (2001-2025) ---



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



✅ 2001: 6 performances found



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



✅ 2002: 7 performances found



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



✅ 2003: 10 performances found



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



✅ 2004: 6 performances found



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



✅ 2005: 10 performances found



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



✅ 2006: 11 performances found



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



✅ 2007: 6 performances found



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



✅ 2008: 8 performances found



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



✅ 2009: 100 performances found



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



✅ 2010: 100 performances found



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



✅ 2011: 100 performances found



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



✅ 2012: 100 performances found



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



✅ 2013: 100 performances found



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



✅ 2014: 100 performances found



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



✅ 2015: 100 performances found



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



✅ 2016: 100 performances found



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



✅ 2017: 100 performances found



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



✅ 2018: 100 performances found



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



✅ 2019: 100 performances found



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



✅ 2020: 100 performances found



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



✅ 2021: 100 performances found



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



✅ 2022: 100 performances found



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



✅ 2023: 100 performances found



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



✅ 2024: 100 performances found



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



✅ 2025: 100 performances found

Successfully saved: C:\Users\Lucas\Documents\DATA_SCIENCE\GRAPH\Top 100 Evolution - World Men 800-metres u18 Outdoor (2001-2025).html
