In [36]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
import requests 
from tqdm import tqdm

# Web-scrape Olympic medal counts by country

In [37]:
def get_data(url, year, season, timeout=30):
    """
    Download one medal-table page and return its rows as a DataFrame.

    Parameters
    ----------
    url : str
        Address of the page to scrape.
    year : str
        Year the Olympics took place; copied into every row's "Year" column.
    season : str
        Either Winter or Summer Olympics; copied into every row's "Season" column.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        One row per table row with the columns Country, Gold_medals,
        Silver_medals, Bronze_medals, Total_medals, Year and Season.
        The first five columns hold the text of the table cells unchanged,
        so medal counts are strings and the country cell is kept verbatim
        (e.g. "USAUnited States").

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    ValueError
        If the page does not contain the expected medal table.
    """

    # A browser-like user agent is sent with the request.
    headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36"}

    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Locate the main table; each <tr> of its body is f.ex -> USA | 10 | 5 | 1 | 16
    table = soup.find("table", {"class": "medals olympics has-team-logos"})
    table_body = table.find("tbody") if table is not None else None
    if table_body is None:
        raise ValueError(f"No medal table found at {url}")

    medal_columns = ["Country", "Gold_medals", "Silver_medals", "Bronze_medals", "Total_medals"]

    # Build a list of dictionaries f.ex -> [{Country:USA,Gold_medals:10,...,Total_medals:16}, ...]
    list_medals_by_country = []
    for row in table_body.find_all("tr"):
        cells = row.find_all("td")  # parse the row once instead of once per column
        if len(cells) < len(medal_columns):
            # Not a country row (too few cells to hold a full record): skip it.
            continue
        list_medals_by_country.append(
            {column: cell.text for column, cell in zip(medal_columns, cells)}
        )

    # Passing the column list keeps the schema stable even when no rows were found.
    medals_by_country_df = pd.DataFrame(list_medals_by_country, columns=medal_columns)

    # Add Year and Season columns (scalars are broadcast to every row).
    medals_by_country_df["Year"] = year
    medals_by_country_df["Season"] = season
    return medals_by_country_df

In [38]:
# Medal-table pages to scrape, one per Olympic edition (newest first).
target_urls = [
    "https://www.espn.com/olympics/summer/2024/medals",
    "https://www.espn.com/olympics/winter/2022/medals",
    "https://www.espn.com/olympics/summer/2020/medals",
    "https://www.espn.com/olympics/winter/2018/medals",
    "https://www.espn.com/olympics/summer/2016/medals",
    "https://www.espn.com/olympics/summer/2012/medals",
]

In [39]:
# Scrape each edition; the year/season labels must match the URL at the same index.
summer_2024 = get_data(url=target_urls[0], year="2024", season="summer")
winter_2022 = get_data(url=target_urls[1], year="2022", season="winter")
summer_2020 = get_data(url=target_urls[2], year="2020", season="summer")
winter_2018 = get_data(url=target_urls[3], year="2018", season="winter")
summer_2016 = get_data(url=target_urls[4], year="2016", season="summer")
summer_2012 = get_data(url=target_urls[5], year="2012", season="summer")

In [40]:
# Stack the per-edition frames into one table with a fresh 0..n-1 index.
df_combined = pd.concat(
    [
        summer_2024,
        winter_2022,
        summer_2020,
        winter_2018,
        summer_2016,
        summer_2012,
    ],
    ignore_index=True,
)

In [41]:
# Preview the combined frame; the rendered output elides the middle rows.
df_combined

Unnamed: 0,Country,Gold_medals,Silver_medals,Bronze_medals,Total_medals,Year,Season
0,USAUnited States,40,44,42,126,2024,summer
1,CHNChina,40,27,24,91,2024,summer
2,GBRGreat Britain,14,22,29,65,2024,summer
3,FRAFrance,16,26,22,64,2024,summer
4,AUSAustralia,18,19,16,53,2024,summer
...,...,...,...,...,...,...,...
412,KSASaudi Arabia,0,0,1,1,2012,summer
413,AFGAfghanistan,0,0,1,1,2012,summer
414,TJKTajikistan,0,0,1,1,2012,summer
415,KUWKuwait,0,0,1,1,2012,summer


In [42]:
# Persist the combined table; index=False keeps the row index out of the file.
df_combined.to_csv(path_or_buf='medals_by_country.csv', index=False)