In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
import requests 
from tqdm import tqdm

# Web-scrape Olympic medals by athlete (ESPN)

In [22]:
# Browser-like user agent sent with every request.
_MEDALS_HEADERS = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36"}
# Template for one page of the athlete medal table: season, year, page number.
_MEDALS_PAGE_URL = "https://www.espn.com/olympics/{}/{}/medals/_/view/athletes/sort/total/page/{}"
# Seconds to wait for the server before giving up instead of hanging the kernel.
_MEDALS_TIMEOUT = 30
# Column order of the scraped part of the result (Year/Season are appended after).
_MEDALS_COLUMNS = ["Athlete Name", "Country", "Gold_medals", "Silver_medals", "Bronze_medals", "Total_medals"]


def _fetch_soup(url):
    """Download ``url`` and return the parsed HTML; raises on an HTTP error status."""
    response = requests.get(url, headers=_MEDALS_HEADERS, timeout=_MEDALS_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')


def _count_pages(soup):
    """Return the page count read from the pagination widget, or 1 when there is none."""
    pagination = soup.find("ul", {"class": "pagination"})
    if pagination is None:
        # No pagination widget: treat the listing as a single page.
        return 1
    items = pagination.find_all("li")
    if len(items) < 2:
        return 1
    # The second-to-last entry carries the highest page number
    # (presumably the last entry is a "next" control -- confirm against the site).
    return int(items[-2].text)


def _parse_athlete_row(row):
    """Turn one <tr> of the medals table into a dict keyed by output column name."""
    cells = row.find_all("td")
    return {
        "Athlete Name": cells[0].text,
        # The country name is taken from the alt text of the row's first image.
        "Country": row.find("img")["alt"],
        "Gold_medals": cells[1].text,
        "Silver_medals": cells[2].text,
        "Bronze_medals": cells[3].text,
        "Total_medals": cells[4].text,
    }


def get_data(url, year, season):
    """
    Scrape the medals-by-athlete table for one Olympic Games.

    url : string, the landing page of the athlete medal table. It is only used
          to discover how many result pages exist; the individual page links
          are built from ``season`` and ``year``.
    year : string, the year that the Olympics took place
    season : string, either "winter" or "summer" (as it appears in the url)

    Returns a dataframe with one row per athlete and the columns
    Athlete Name, Country, Gold_medals, Silver_medals, Bronze_medals,
    Total_medals, Year and Season. Medal counts are kept as the text scraped
    from the page. The medal columns are present even when no rows are found.

    Raises requests.HTTPError when a page answers with an error status and
    ValueError when a page does not contain the expected medals table.
    """
    nr_of_pages = _count_pages(_fetch_soup(url))

    # One dictionary per athlete, f.ex ->
    # [{Athlete Name:name1,Country:USA,Gold_medals:1,Silver_medals:0,Bronze_medals:0,Total_medals:1}, ...]
    medals_by_athlete_list = []
    for page_number in range(1, nr_of_pages + 1):
        page_url = _MEDALS_PAGE_URL.format(season, year, page_number)
        soup = _fetch_soup(page_url)
        table = soup.find("table", {"class": "medals olympics has-team-logos"})
        table_body = table.find("tbody") if table is not None else None
        if table_body is None:
            # Fail loudly: silently skipping a page would drop athletes unnoticed.
            raise ValueError("No medals table found at {}".format(page_url))
        for row in table_body.find_all("tr"):
            medals_by_athlete_list.append(_parse_athlete_row(row))

    # Passing the columns explicitly keeps the schema stable for an empty result.
    medals_by_athlete_df = pd.DataFrame(medals_by_athlete_list, columns=_MEDALS_COLUMNS)
    # Scalars are broadcast to every row.
    medals_by_athlete_df["Year"] = year
    medals_by_athlete_df["Season"] = season
    return medals_by_athlete_df

In [23]:
# Landing pages of the ESPN athlete medal tables, newest Games first.
target_urls = [
    "https://www.espn.com/olympics/{}/{}/medals/_/view/athletes".format(season, year)
    for season, year in [
        ("summer", 2024),
        ("winter", 2022),
        ("summer", 2020),
        ("winter", 2018),
        ("summer", 2016),
        ("summer", 2012),
    ]
]

In [24]:
# Scrape each Games into its own dataframe; the year and season passed here
# must match the url at the same position in target_urls.
summer_2024 = get_data(url=target_urls[0], year="2024", season="summer")
winter_2022 = get_data(url=target_urls[1], year="2022", season="winter")
summer_2020 = get_data(url=target_urls[2], year="2020", season="summer")
winter_2018 = get_data(url=target_urls[3], year="2018", season="winter")
summer_2016 = get_data(url=target_urls[4], year="2016", season="summer")
summer_2012 = get_data(url=target_urls[5], year="2012", season="summer")

In [25]:
# Stack the per-Games frames, newest first, and renumber the rows from 0.
df_combined = pd.concat(
    objs=[
        summer_2024,
        winter_2022,
        summer_2020,
        winter_2018,
        summer_2016,
        summer_2012,
    ],
    ignore_index=True,
)

In [26]:
# Show the combined table as this cell's output.
df_combined

Unnamed: 0,Athlete Name,Country,Gold_medals,Silver_medals,Bronze_medals,Total_medals,Year,Season
0,Zhang Yufei,China,0,1,5,6,2024,summer
1,Léon Marchand,France,4,0,1,5,2024,summer
2,Torri Huske,United States,3,2,0,5,2024,summer
3,Mollie O'Callaghan,Australia,3,1,1,5,2024,summer
4,Regan Smith,United States,2,3,0,5,2024,summer
...,...,...,...,...,...,...,...,...
5666,Artur Aleksanyan,Armenia,0,0,1,1,2012,summer
5667,Hripsime Khurshudyan,Armenia,0,0,1,1,2012,summer
5668,Juan Martín Del Potro,Argentina,0,0,1,1,2012,summer
5669,Lucas Calabrese,Argentina,0,0,1,1,2012,summer


In [27]:
# Persist the combined table next to the notebook; the row index is not written.
df_combined.to_csv(
    path_or_buf='medals_by_athlete.csv',
    index=False,
)