In [3]:
import pandas as pd
import yaml
import time
import os
import re
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup

In [9]:
# URL template for one year's entrants page; "{year}" is substituted per request.
BASE_URL = "https://www.wser.org/{year}-entrants-list/"

# Years to scrape: 2016 through 2025 (the upper bound of range() is exclusive).
YEARS = range(2016, 2026)

# Output schema: the columns kept for every year, in this order.
TARGET_COLS = [
    "First Name",
    "Last Name",
    "Gender",
    "Age",
    "City",
    "State",
    "Country",
    "Bib",
    "Entry Type",
    "WS Finishes",
]

# Columns discarded after renaming (2020-specific extras).
DROP_COLS = ["Rollover", "Awards"]


def normalize_columns(df):
    """
    Return a copy of `df` whose columns follow the TARGET_COLS schema.

    Steps, in order:
      1. Rename every column: surrounding whitespace is stripped, any
         case variant of "gender" becomes "Gender", any label starting
         with "ws finishes" (case-insensitive) becomes "WS Finishes",
         and every other label has each whitespace-separated word
         capitalized.
      2. Drop the columns listed in DROP_COLS when they are present.
      3. Add any TARGET_COLS column that is still missing, filled with pd.NA.
      4. Select TARGET_COLS, in that order.

    The frame passed in is not modified.
    """

    def _canonical(label):
        # Map one raw column label to its standardized name.
        text = str(label).strip()
        lowered = text.lower()
        if lowered == "gender":
            return "Gender"
        if lowered.startswith("ws finishes"):
            return "WS Finishes"
        return " ".join(word.capitalize() for word in text.split())

    renamed = df.rename(
        columns={original: _canonical(original) for original in df.columns}
    )

    # Only ask drop() for columns that actually exist after renaming.
    unwanted = [name for name in DROP_COLS if name in renamed.columns]
    trimmed = renamed.drop(columns=unwanted, errors="ignore")

    # Back-fill schema columns that this year's table did not provide.
    for expected in TARGET_COLS:
        if expected not in trimmed.columns:
            trimmed[expected] = pd.NA

    # Fixed column order for downstream concatenation.
    return trimmed[TARGET_COLS]


def scrape_wser_entrants(years=YEARS):
    """
    Scrape the WSER entrants pages and keep Golden Ticket / Top Ten rows.

    For each year the page at BASE_URL is parsed with pandas.read_html, the
    first table that has an "Entry Type" column is passed through
    normalize_columns, rows whose Entry Type is exactly "Golden Ticket" or
    "Top Ten" are kept, and a "Year" column is added.

    A year is reported on stdout and skipped when its page cannot be parsed,
    when no table has an "Entry Type" column, or when no row matches.

    Parameters
    ----------
    years : iterable of int
        Years to scrape. Defaults to YEARS.

    Returns
    -------
    pandas.DataFrame
        The kept rows of all years, re-indexed from 0, with the TARGET_COLS
        columns plus "Year". If no year yields any row, an empty frame with
        those same columns is returned.
    """
    # Entry types to keep; the comparison is exact (no case/whitespace folding).
    valid_types = ["Golden Ticket", "Top Ten"]
    all_years = []

    for year in years:
        url = BASE_URL.format(year=year)
        print(f"Scraping {year}: {url}")

        try:
            tables = pd.read_html(url)
        except Exception as e:
            # Deliberately broad: scraping is best-effort, so any failure for
            # one year is reported and the remaining years are still tried.
            print(f"  ! Could not load tables for {year} ({e})")
            continue

        # Select the first table containing an "Entry Type" column
        df_year = next(
            (
                t for t in tables
                if any(str(col).strip().lower() == "entry type" for col in t.columns)
            ),
            None,
        )

        if df_year is None:
            print(f"  ! No valid entrant table found for {year}")
            continue

        # Normalize column names (unify Gender, WS Finishes, drop extras)
        df_year = normalize_columns(df_year)

        # Filter only pure Golden Ticket + Top Ten
        df_filtered = df_year[df_year["Entry Type"].isin(valid_types)].copy()

        if df_filtered.empty:
            print(f"  ? No GT/Top Ten entrants found for {year}")
            continue

        df_filtered["Year"] = year
        all_years.append(df_filtered)

    if not all_years:
        # pd.concat raises ValueError on an empty list; hand back an empty
        # frame with the expected schema so callers can still inspect it.
        return pd.DataFrame(columns=[*TARGET_COLS, "Year"])

    return pd.concat(all_years, ignore_index=True)


# Driver: scrape all default years and print a short summary.
# The resulting frame is bound to the global name `df`, which later cells use.
if __name__ == "__main__":
    df = scrape_wser_entrants()
    print(df.head())
    print("Shape:", df.shape)

    # Disabled export; a later cell writes the frame to a different path.
    # df.to_csv("wser_entrants_2016_2025_GT_Top10_clean.csv", index=False)

Scraping 2016: https://www.wser.org/2016-entrants-list/
Scraping 2017: https://www.wser.org/2017-entrants-list/
Scraping 2018: https://www.wser.org/2018-entrants-list/
Scraping 2019: https://www.wser.org/2019-entrants-list/
Scraping 2020: https://www.wser.org/2020-entrants-list/
Scraping 2021: https://www.wser.org/2021-entrants-list/
Scraping 2022: https://www.wser.org/2022-entrants-list/
Scraping 2023: https://www.wser.org/2023-entrants-list/
Scraping 2024: https://www.wser.org/2024-entrants-list/
Scraping 2025: https://www.wser.org/2025-entrants-list/
  First Name    Last Name Gender  Age              City State        Country  \
0       Seth      Swanson      M   37          Missoula    MT  United States   
1     Thomas  Lorblanchet      M   36  Clermont Ferrand   FRA         France   
2        Ian      Sharman      M   35              Bend    OR  United States   
3      David        Laney      M   27          Portland    OR  United States   
4     Andrew       Tuckey      M   40   

In [10]:
# Show the combined Golden Ticket / Top Ten frame as the cell's rich output.
df

Unnamed: 0,First Name,Last Name,Gender,Age,City,State,Country,Bib,Entry Type,WS Finishes,Year
0,Seth,Swanson,M,37,Missoula,MT,United States,M2,Top Ten,2.0,2016
1,Thomas,Lorblanchet,M,36,Clermont Ferrand,FRA,France,M5,Top Ten,1.0,2016
2,Ian,Sharman,M,35,Bend,OR,United States,M7,Top Ten,6.0,2016
3,David,Laney,M,27,Portland,OR,United States,M8,Top Ten,2.0,2016
4,Andrew,Tuckey,M,40,Cardiff,GBR,Great Britain,M9,Top Ten,1.0,2016
...,...,...,...,...,...,...,...,...,...,...,...
331,Keely,Henninger,F,33,Portland,OR,USA,36,Golden Ticket,1.0,2025
332,Erin,Clark,F,30,Missoula,MT,USA,37,Golden Ticket,0.0,2025
333,Hans,Troyer,M,25,Newnan,GA,USA,38,Golden Ticket,0.0,2025
334,Hannes,Namberger,M,36,Ruhpolding,DEU,DEU,39,Golden Ticket,0.0,2025


In [11]:
# Persist the filtered entrants without the row index.
# The path is relative, so it resolves against the kernel's working directory.
df.to_csv('../../data/gt_top10_ws.csv', index = False)

In [16]:
!pip install geopy

Collecting geopy
  Downloading geopy-2.4.1-py3-none-any.whl.metadata (6.8 kB)
Collecting geographiclib<3,>=1.52 (from geopy)
  Downloading geographiclib-2.1-py3-none-any.whl.metadata (1.6 kB)
Downloading geopy-2.4.1-py3-none-any.whl (125 kB)
Downloading geographiclib-2.1-py3-none-any.whl (40 kB)
Installing collected packages: geographiclib, geopy

   -------------------- ------------------- 1/2 [geopy]
   -------------------- ------------------- 1/2 [geopy]
   -------------------- ------------------- 1/2 [geopy]
   ---------------------------------------- 2/2 [geopy]

Successfully installed geographiclib-2.1 geopy-2.4.1


In [18]:
!pip install folium

Collecting folium
  Downloading folium-0.20.0-py2.py3-none-any.whl.metadata (4.2 kB)
Collecting branca>=0.6.0 (from folium)
  Downloading branca-0.8.2-py3-none-any.whl.metadata (1.7 kB)
Downloading folium-0.20.0-py2.py3-none-any.whl (113 kB)
Downloading branca-0.8.2-py3-none-any.whl (26 kB)
Installing collected packages: branca, folium

   -------------------- ------------------- 1/2 [folium]
   -------------------- ------------------- 1/2 [folium]
   -------------------- ------------------- 1/2 [folium]
   ---------------------------------------- 2/2 [folium]

Successfully installed branca-0.8.2 folium-0.20.0


In [21]:
# Reload the entrants table from disk, rebinding the global `df`.
# NOTE(review): the geocoding cell that follows reads lowercase columns
# ('city', 'state', 'country', 'athlete_name'), while the frame saved earlier in
# this notebook has 'City', 'State', 'Country' and separate 'First Name' /
# 'Last Name' columns. Presumably this file was rewritten with renamed columns
# by a step not shown here — confirm, otherwise a Restart & Run All will hit a
# KeyError in get_coords.
df = pd.read_csv('../../data/gt_top10_ws.csv')

In [22]:
import pandas as pd
import time
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut, GeocoderUnavailable
import folium

# Initialize the Nominatim geocoder shared by get_coords.
# NOTE(review): "geoapi" is a generic user agent; Nominatim's usage policy
# presumably expects one that identifies this application — confirm and rename.
geolocator = Nominatim(user_agent="geoapi")

# Cache of geocoding results, keyed by the "city, state, country" query string
# and holding (latitude, longitude) tuples. It lives only as long as the kernel.
cache = {}

# Function to geocode with retries and caching
def get_coords(row):
    """
    Geocode one row's "city, state, country" string, with caching and retries.

    Parameters
    ----------
    row : mapping (e.g. a pandas Series or dict)
        Must provide the keys 'city', 'state' and 'country'.

    Returns
    -------
    tuple
        (latitude, longitude) when the geocoder finds the place, otherwise
        (None, None).

    Notes
    -----
    - A definitive answer from the geocoder, found or not found, is stored
      in the module-level `cache`, so a repeated location costs one request
      in total.
    - GeocoderTimedOut / GeocoderUnavailable are treated as transient: the
      request is retried up to 3 times with a 2-second pause between
      attempts, and the failure is NOT cached so a later call may try again.
    """
    key = f"{row['city']}, {row['state']}, {row['country']}"
    if key in cache:
        return cache[key]

    max_attempts = 3
    for attempt in range(max_attempts):
        try:
            location = geolocator.geocode(key, timeout=10)
        except (GeocoderTimedOut, GeocoderUnavailable):
            # Transient failure: pause before the next attempt, but do not
            # sleep after the last one since nothing follows it.
            if attempt < max_attempts - 1:
                time.sleep(2)
            continue

        # The service answered. A falsy result means "no match"; asking again
        # immediately would return the same thing and only add load.
        coords = (location.latitude, location.longitude) if location else (None, None)
        cache[key] = coords
        return coords

    # Every attempt hit a transient error; report "unknown" without caching.
    return (None, None)

# Geocode every row of `df`, in frame order, collecting one latitude and one
# longitude per row (None where get_coords found nothing).
latitudes = []
longitudes = []
for _, row in df.iterrows():
    lat, lon = get_coords(row)
    latitudes.append(lat)
    longitudes.append(lon)
    time.sleep(1)  # Pause after every row (cache hits included) to respect the Nominatim rate limit

# Attach the results as new columns; this mutates the shared `df` in place.
df['latitude'] = latitudes
df['longitude'] = longitudes

# Plain-text dump of the whole frame for a quick visual check.
print(df)

# Create a Folium map centred at latitude 20, longitude 0 with a low zoom level.
m = folium.Map(location=[20, 0], zoom_start=2)
# One marker per row that has both coordinates; rows missing either are skipped.
for _, row in df.dropna(subset=['latitude', 'longitude']).iterrows():
    folium.Marker(
        [row['latitude'], row['longitude']],
        # Popup text: athlete name followed by "(city, country)".
        popup=f"{row['athlete_name']} ({row['city']}, {row['country']})"
    ).add_to(m)

# Save the map as an HTML file; the path is relative to the kernel's working directory.
m.save('athletes_map.html')
print("Map saved as athletes_map.html")


           athlete_name athlete_gender  athlete_age              city state  \
0          seth swanson              M           37          Missoula    MT   
1    thomas lorblanchet              M           36  Clermont Ferrand   FRA   
2           ian sharman              M           35              Bend    OR   
3           david laney              M           27          Portland    OR   
4         andrew tuckey              M           40           Cardiff   GBR   
..                  ...            ...          ...               ...   ...   
331     keely henninger              F           33          Portland    OR   
332          erin clark              F           30          Missoula    MT   
333         hans troyer              M           25            Newnan    GA   
334    hannes namberger              M           36        Ruhpolding   DEU   
335     ryan montgomery              M           31           Hanover    NH   

    country bib     entry_type  num_ws_finish  ws_e

In [27]:
# Count rows per value of the 'state' column, most frequent first.
# The output shows this column mixing US/Canadian state codes with country codes and free text.
df['state'].value_counts()

state
CO                    55
CA                    38
OR                    32
AZ                    28
UT                    21
WA                    11
ID                     9
MT                     8
CHN                    8
GBR                    7
TX                     7
NE                     6
BC                     6
NZL                    6
QC                     5
ZWE                    5
AB                     5
FRA                    5
GA                     4
SWE                    4
OK                     3
PA                     3
DEU                    3
NY                     3
VT                     3
TN                     2
HUN                    2
ME                     2
WI                     2
AUS                    2
OH                     2
MI                     2
MA                     2
NC                     2
NOR                    2
VA                     1
CHE                    1
WY                     1
Leicestershire         1
Hong Kong Island   