#### Get athletic records data
The code is adapted (updated) from [https://www.kaggle.com/code/jeannicolasduval/world-athletics-all-time-data-acquisition/notebook](https://www.kaggle.com/code/jeannicolasduval/world-athletics-all-time-data-acquisition/notebook).
<br>
A CSV file containing the all-time best performances for every track and field event is created.

In [6]:
import concurrent.futures
import io
import itertools
import json
import re
from collections import namedtuple

import numpy as np
import pandas as pd
import requests
from lxml import etree

from tqdm.notebook import tqdm
from IPython.display import display, HTML

In [3]:
# Root of the all-time toplists section and the path of one toplist page;
# together they form the URL that is requested first.
base_url = "https://worldathletics.org/records/all-time-toplists"
landing_page = "/sprints/100-metres/outdoor/women/senior"
url = base_url + landing_page

In [5]:
# Download the landing page and parse it into an lxml element tree.
# The timeout keeps the cell from hanging forever on a stalled connection and
# raise_for_status() reports an HTTP error here rather than as a confusing
# parsing failure further down.
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.text
tree = etree.HTML(html)

In [6]:
# Capture the text between "toplists.init(\n" and the next ",\n" inside the
# page's <script> tags; that text is decoded as JSON below.
# A raw string keeps the regex escapes intact (the previous non-raw "\(" is an
# invalid string escape) and the dot is escaped so it only matches a literal ".".
json_compile = re.compile(r"toplists\.init\(\n(.*?),\n", re.DOTALL)
element_text = "".join(
    e.text
    for e in tree.xpath(".//script")
    if e.text and "toplists.init" in e.text
)
matches = json_compile.search(element_text)
if matches is None:
    # Fail with a clear message instead of an AttributeError on None.
    raise ValueError("No 'toplists.init(...)' JSON payload found in the page scripts")
json_dicts = json.loads(matches.group(1).strip())

# Get list of cases from json_dicts where name is eventId
region_type = 'world'
category = 'senior'

event_filter = next(
    (json_dict for json_dict in json_dicts if json_dict["name"] == "eventId"),
    None,
)
if event_filter is None:
    raise ValueError("No entry named 'eventId' found in the toplists.init payload")

# Keep only the cases matching the selected region type and age category.
cases = [
    d
    for d in event_filter["cases"]
    if d["regionType"] == region_type and d["ageCategory"] == category
]

In [7]:
Ranking = namedtuple('Ranking', ['label', 'gender', 'age_category', 'disciplines'])
Discipline = namedtuple('Discipline', ['value', 'label', 'type_slug', 'name_slug'])

# Gender values converted to the form used in the toplist URLs.
# Any value without an entry here (e.g. "mixed") is kept unchanged.
GENDER_URL_SLUGS = {"male": "men", "female": "women"}

rankings = []
for case in cases:
    gender = GENDER_URL_SLUGS.get(case["gender"], case["gender"])
    age_category = case["ageCategory"]

    # One Discipline namedtuple per entry of the case's "values" list.
    disciplines = [
        Discipline(
            value=discipline_info['value'],
            label=discipline_info['label'],
            type_slug=discipline_info['typeNameUrlSlug'],
            name_slug=discipline_info['disciplineNameUrlSlug'],
        )
        for discipline_info in case["values"]
    ]

    # Bundle the disciplines with their gender / age category as a Ranking.
    rankings.append(
        Ranking(
            label=f"{age_category.capitalize()} - {gender.capitalize()}",
            gender=gender,
            age_category=age_category,
            disciplines=disciplines,
        )
    )

In [8]:
# Report the number of ranking categories, then one line per category with
# its label and its number of disciplines.
print(f"\nNumber Category: {len(rankings)}")
print("\nRankins Categories Info:\n--------")
category_lines = [
    f" - {ranking.label} (Nb of events: {len(ranking.disciplines)})"
    for ranking in rankings
]
print("\n".join(category_lines))


Number Category: 3

Rankins Categories Info:
--------
 - Senior - Men (Nb of events: 76)
 - Senior - Women (Nb of events: 74)
 - Senior - Mixed (Nb of events: 3)


In [9]:
def get_discipline_records(discipline: Discipline, ranking: Ranking, base_url: str = base_url, max_pages: int = 100) -> pd.DataFrame:
    """Collect the result pages of one discipline for one ranking.

    Pages are requested one after another, starting at page 1, until
    ``max_pages`` pages have been read or a page cannot be fetched or parsed.
    Such a failure is treated as "no more results" (best effort).

    Args:
        discipline: Discipline whose ``type_slug`` and ``name_slug`` are used
            in the URL.
        ranking: Ranking whose ``gender`` and ``age_category`` are used in the
            URL.
        base_url: Root URL of the toplists.
        max_pages: Maximum number of pages to request.

    Returns:
        The concatenation of all pages read, or an empty DataFrame when not a
        single page could be read. The per-page index is kept as is.
    """
    url = f"{base_url}/{discipline.type_slug}/{discipline.name_slug}/all/{ranking.gender}/{ranking.age_category}"
    page_frames = []
    for page_no in range(1, max_pages + 1):
        params = f"regionType=world&timing=all&windReading=regular&page={page_no}&bestResultsOnly=false&firstDay=1899-12-30"
        page_url = f"{url}?{params}"
        try:
            page_frames.append(get_page_records(page_url, discipline))
        except Exception:
            # Deliberately best effort: any failure ends the pagination.
            # Unlike a bare ``except:``, this still lets KeyboardInterrupt and
            # SystemExit propagate so the scrape can be interrupted.
            break
    if not page_frames:
        return pd.DataFrame()
    # Concatenate once instead of re-copying the growing frame on every page.
    return pd.concat(page_frames)

def get_page_records(page_url: str, discipline: Discipline, timeout: float = 30.0) -> pd.DataFrame:
    """Download one toplist page and return its records table as a DataFrame.

    Args:
        page_url: Full URL of the page, including the query string.
        discipline: Discipline the page belongs to; its ``label``,
            ``name_slug`` and ``type_slug`` are added as the ``event_label``,
            ``event_name`` and ``event_type`` columns.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The page's records with all-empty columns dropped. For
        "combined-events" an additional ``mark_details`` column is present.

    Raises:
        requests.RequestException: On network errors, timeouts or HTTP error
            statuses.
        IndexError: When the page has no ``records-table`` table.
    """
    response = requests.get(page_url, timeout=timeout)
    response.raise_for_status()
    tree = etree.HTML(response.text)
    records_table_el = tree.xpath('.//table[@class="records-table"]')[0]
    # Serialise to text and wrap it in StringIO: passing literal HTML to
    # pd.read_html is deprecated in recent pandas versions.
    records_table_html = etree.tostring(records_table_el, encoding="unicode")
    df = pd.read_html(io.StringIO(records_table_html))[0]
    if discipline.type_slug != "combined-events":
        page_records_df = df.dropna(axis=1, how="all")
    else:
        # Rows without a Rank carry only a Mark value (presumably the detail
        # of the preceding record; confirm against the site). They are
        # attached by position to the ranked rows as ``mark_details``.
        is_detail_row = df.Rank.isna()
        mark_details = df[is_detail_row].reset_index(drop=True).Mark
        page_records_df = (
            df.loc[~is_detail_row]
            .reset_index(drop=True)
            .dropna(axis=1, how="all")
            .join(mark_details, rsuffix="_x")
            .rename(columns={"Mark_x": "mark_details"})
        )
    page_records_df = page_records_df.assign(
        event_label=discipline.label,
        event_name=discipline.name_slug,
        event_type=discipline.type_slug
    )
    # ``Pos`` is dropped above when it is empty on the whole page; casting a
    # missing column would raise KeyError, which the caller would mistake for
    # the end of the results. Note that missing values become the string 'nan'.
    if "Pos" in page_records_df.columns:
        page_records_df = page_records_df.astype(dtype={'Pos': str})
    return page_records_df

def get_ranking_records(ranking: Ranking) -> pd.DataFrame:
    """Collect the records of every discipline of one ranking.

    The disciplines are downloaded concurrently in a thread pool, and each
    result is tagged with the ranking's label, age category and gender.

    Args:
        ranking: Ranking whose ``disciplines`` are downloaded.

    Returns:
        The concatenation of all discipline frames, or an empty DataFrame when
        the ranking has no disciplines.
    """
    discipline_frames = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        results = executor.map(get_discipline_records, ranking.disciplines, itertools.repeat(ranking))

        # Passing ``total`` lets the progress bar show real progress instead
        # of "0it"; the description is constant so it is set once.
        results_progress = tqdm(
            results,
            total=len(ranking.disciplines),
            desc=f'Processing {ranking.label}',
        )
        for discipline_df in results_progress:
            discipline_frames.append(
                discipline_df.assign(
                    ranking_label=ranking.label,
                    age_category=ranking.age_category,
                    gender=ranking.gender
                )
            )
    if not discipline_frames:
        return pd.DataFrame()
    # Concatenate once instead of re-copying the growing frame per discipline.
    return pd.concat(discipline_frames)


# Download all rankings concurrently (each ranking in turn downloads its
# disciplines in its own thread pool) and stack the results into one frame.
ranking_frames = []
with concurrent.futures.ThreadPoolExecutor() as executor:
    results = executor.map(get_ranking_records, rankings)

    # ``total`` gives the progress bar a known length.
    results_progress = tqdm(results, total=len(rankings), desc='Overall Progress')
    for ranking_df in results_progress:
        ranking_frames.append(ranking_df)

# Concatenate once; the per-page index values are kept, so the index is not unique.
overall_df = pd.concat(ranking_frames) if ranking_frames else pd.DataFrame()

overall_df

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

Unnamed: 0,Rank,Mark,Competitor,DOB,Nat,Pos,Venue,Date,Results Score,event_label,event_name,event_type,ranking_label,age_category,gender,WIND,mark_details
0,1.0,5.56,Donovan BAILEY,16 DEC 1967,CAN,1,"Reno, NV (USA) (i)",09 FEB 1996,1269.0,50 Metres,50-metres,sprints,Senior - Men,senior,men,,
1,1.0,5.56,Maurice GREENE,23 JUL 1974,USA,1,"Los Angeles, CA (USA) (i)",13 FEB 1999,1269.0,50 Metres,50-metres,sprints,Senior - Men,senior,men,,
2,3.0,5.60,Michael GREEN,07 NOV 1970,JAM,,Liévin (FRA) (i),16 FEB 1997,1241.0,50 Metres,50-metres,sprints,Senior - Men,senior,men,,
3,4.0,5.61,James SANFORD,27 DEC 1957,USA,1,"San Diego, CA (USA) (i)",20 FEB 1981,1234.0,50 Metres,50-metres,sprints,Senior - Men,senior,men,,
4,4.0,5.61,Deji ALIU,22 NOV 1975,NGR,,Liévin (FRA) (i),21 FEB 1999,1234.0,50 Metres,50-metres,sprints,Senior - Men,senior,men,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64,264.0,3:19.91,Belarus,,BLR,6,Minsk (BLR),26 JUN 2019,,4x400 Metres Relay,4x400-metres-relay,relays,Senior - Mixed,senior,mixed,,
65,266.0,3:19.93,India,,IND,8h2,"National Stadium, Tokyo (JPN)",30 JUL 2021,,4x400 Metres Relay,4x400-metres-relay,relays,Senior - Mixed,senior,mixed,,
66,267.0,3:19.98,Jamaica U20,,JAM,3,"Pascual Guerrero Stadium, Cali (COL)",02 AUG 2022,,4x400 Metres Relay,4x400-metres-relay,relays,Senior - Mixed,senior,mixed,,
67,268.0,3:19.99,Sri Lanka,,SRI,3,"Mahinda Rajapaksha Stadium, Diyagama (SRI)",30 JUL 2023,,4x400 Metres Relay,4x400-metres-relay,relays,Senior - Mixed,senior,mixed,,


In [10]:
# Save all scraped records to disk; index=False leaves the row index out of the file.
overall_df.to_csv('records.csv', index=False)

In [11]:
# Show the columns of the scraped data with their dtypes and non-null counts.
overall_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 442763 entries, 0 to 68
Data columns (total 17 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Rank           442763 non-null  float64
 1   Mark           442763 non-null  object 
 2   Competitor     442763 non-null  object 
 3   DOB            435150 non-null  object 
 4   Nat            442763 non-null  object 
 5   Pos            442763 non-null  object 
 6   Venue          442763 non-null  object 
 7   Date           442763 non-null  object 
 8   Results Score  442494 non-null  object 
 9   event_label    442763 non-null  object 
 10  event_name     442763 non-null  object 
 11  event_type     442763 non-null  object 
 12  ranking_label  442763 non-null  object 
 13  age_category   442763 non-null  object 
 14  gender         442763 non-null  object 
 15  WIND           73315 non-null   float64
 16  mark_details   3767 non-null    object 
dtypes: float64(2), object(15)
memory usage

#### Reduced version
> Keep only the first 10 records per event (one group per `event_label` value)

In [39]:
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids mixed-type columns (and the related
# warning) when loading this large CSV.
records = pd.read_csv('records.csv', low_memory=False)
sports = np.unique(records['event_label'])

# Keep the first 10 rows of every event label, ordered by label and, within a
# label, by their order in the file. The stable sort preserves that file order.
# NOTE(review): event_label does not include the gender, so a label shared by
# several rankings contributes only 10 rows in total - confirm this is intended.
df_reduced = (
    records
    .groupby('event_label', sort=False)
    .head(10)
    .sort_values('event_label', kind='stable')
    .reset_index(drop=True)
)

  records = pd.read_csv('records.csv')


In [40]:
# Show the columns of the reduced data with their dtypes and non-null counts.
df_reduced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 780 entries, 0 to 779
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Rank           780 non-null    float64
 1   Mark           780 non-null    object 
 2   Competitor     780 non-null    object 
 3   DOB            698 non-null    object 
 4   Nat            780 non-null    object 
 5   Pos            756 non-null    object 
 6   Venue          780 non-null    object 
 7   Date           780 non-null    object 
 8   Results Score  780 non-null    float64
 9   event_label    780 non-null    object 
 10  event_name     780 non-null    object 
 11  event_type     780 non-null    object 
 12  ranking_label  780 non-null    object 
 13  age_category   780 non-null    object 
 14  gender         780 non-null    object 
 15  WIND           58 non-null     float64
 16  mark_details   40 non-null     object 
dtypes: float64(3), object(14)
memory usage: 103.7+ KB


In [None]:
# Save the reduced records to disk; index=False leaves the row index out of the file.
df_reduced.to_csv('records_reduced.csv', index=False)