In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Location of the raw input file, relative to the notebook's working directory.
DATA_PATH = "data/raw/athlete_events.csv"

# Load the whole CSV into memory with pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

# Quick sanity check: (rows, columns) as text, then the first rows as the
# cell's rich output.
print(df.shape)
df.head(n=5)

In [None]:
# Column dtypes, non-null counts and memory footprint of the raw frame.
df.info()

# Share of missing values in each column (0.0 = complete, 1.0 = all missing),
# ordered so the sparsest columns come first.
missing = (
    df.isnull()
    .mean()
    .sort_values(ascending=False)
)
missing.head(15)

In [None]:
# Keep only Summer rows; take an explicit copy so adding a column below
# writes to an independent frame rather than a slice of `df`.
summer = df.loc[df["Season"].eq("Summer")].copy()

# 1 when the row has any value in `Medal`, 0 when `Medal` is missing.
summer["medal_flag"] = summer["Medal"].notnull().astype(int)

# Peek at the columns relevant to the flag to eyeball the result.
summer.loc[:, ["Year", "NOC", "Name", "Medal", "medal_flag"]].head()

In [None]:
# Athletes per country-year: number of distinct athlete IDs among all
# participants (not only medal winners).
athletes = (
    summer.groupby(["Year", "NOC"])["ID"]
    .nunique()
    .reset_index(name="athletes")
)

# Medals per country-year.
#
# NOTE(review): athlete_events.csv appears to hold one row per athlete per
# event - confirm against the dataset documentation. If so, a medal won in a
# team event shows up once for every team member, and simply counting medal
# rows would credit a country with e.g. a dozen golds for a single team title.
# To count medals the way a medal table does, each (Year, NOC, Event, Medal)
# combination is counted once.
#
# Assumes `summer` carries an `Event` column - TODO confirm.
# Caveat: two athletes from the same NOC sharing the same medal in the same
# individual event (a tie) are also collapsed to a single medal.
medal_events = summer.loc[summer["medal_flag"] == 1].drop_duplicates(
    subset=["Year", "NOC", "Event", "Medal"]
)
medals = (
    medal_events.groupby(["Year", "NOC"])
    .size()
    .reset_index(name="medals")
)

# Left join keeps every participating country-year; `validate` makes pandas
# raise if either side unexpectedly has duplicate (Year, NOC) keys, which
# would silently duplicate rows in the result.
country_year = athletes.merge(
    medals, on=["Year", "NOC"], how="left", validate="one_to_one"
)
# Country-years absent from `medals` won nothing: NaN -> 0.
country_year["medals"] = country_year["medals"].fillna(0).astype(int)

country_year.head()

In [None]:
# Order rows chronologically within each country so that "previous row"
# means "that country's previous appearance in this table".
country_year = (
    country_year
    .sort_values(by=["NOC", "Year"])
    .reset_index(drop=True)
)

# Medal count from the preceding row of the same NOC.
# Note: the first row of each NOC has no predecessor and is filled with 0,
# so it is indistinguishable from a country whose previous row had 0 medals.
lagged_medals = country_year.groupby("NOC")["medals"].shift(1)
country_year["prev_medals"] = lagged_medals.fillna(0).astype(int)

country_year.head(10)

In [None]:
# Histogram of the per-country-year medal counts, 30 equal-width bins.
fig, ax = plt.subplots()
ax.hist(country_year["medals"], bins=30)
ax.set_title("Distribution of medals per country-year")
ax.set_xlabel("Medals")
ax.set_ylabel("Count")
plt.show()

In [None]:
# One point per country-year: delegation size on x, medal count on y.
fig, ax = plt.subplots()
ax.scatter(country_year["athletes"], country_year["medals"])
ax.set_title("Medals vs Athletes (country-year)")
ax.set_xlabel("Athletes")
ax.set_ylabel("Medals")
plt.show()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the three
# numeric columns built above.
country_year.loc[:, ["athletes", "prev_medals", "medals"]].describe()