In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import hashlib
import plotly.express as px

In [None]:

# Load the athlete/event records. The path is relative, so it is resolved against the
# kernel's current working directory.
df = pd.read_csv(r"athlete_events.csv")

In [None]:
# Number of distinct NOC codes, followed by the codes themselves.
print(df['NOC'].nunique(), df['NOC'].unique(), sep="\n")

The different sports represented in the data:

In [None]:
# Distinct values of the "Sport" column (shown as the cell output).
pd.unique(df['Sport'])

Types of Medals:

In [None]:
# Distinct values of the "Medal" column, including the missing-value marker if any row has no medal.
pd.unique(df['Medal'])

Statistics over age of athletes:

In [None]:
# Summary statistics of athlete age; the minimum only considers strictly positive ages.
print(
    f"mean age: {df['Age'].mean()}",
    f"median age: {df['Age'].median()}",
    f"min age: {df.loc[df['Age'] > 0, 'Age'].min()}",
    f"max age: {df['Age'].max()}",
    f"standard deviation: {df['Age'].std()}",
    sep="\n",
)

Gender distribution:

In [None]:
# Row count per value of "Sex", shaped as a two-column frame (Sex, Count) for plotting.
sex_counts = (
    df['Sex']
    .value_counts()
    .rename_axis('Sex')
    .reset_index(name='Count')
)

# One bar per sex, with a fixed colour for each category.
fig_sex_counts = px.bar(
    sex_counts,
    x='Sex',
    y='Count',
    title="Distribution of Athletes by Gender",
    color='Sex',
    color_discrete_map={'M': 'blue', 'F': 'red'},
)

fig_sex_counts.show()

In [None]:
# Replace missing values in the numeric columns with the column median; presumably this is
# done so the integer casts below do not fail on NaN.
# NOTE(review): imputing with the median concentrates the filled rows at a single
# Age/Height/Weight value, which the histograms later in the notebook will reflect — confirm
# this is acceptable. It also makes later `Age > 0` filters select every row.
# NOTE(review): 'ID' is imputed as well; presumably it never has missing values — verify.
df = df.fillna({
    'Age': df['Age'].median(),
    'Height': df['Height'].median(),
    'Weight': df['Weight'].median(),
    'ID': df['ID'].median()
})

# Downcast to compact integer dtypes. Any fractional values (if present) lose their
# fractional part in the cast, and uint8 only holds 0..255.
df = df.astype({'Age': 'uint8', 'Height': 'uint8', 'Weight': 'uint8', 'ID': 'uint32' , 'Year': 'int16'})
# Print column dtypes and memory usage to confirm the casts took effect.
df.info()

In [None]:
# Rows whose NOC code is 'ITA'. Despite the "_anon" suffix, the "Name" column is still
# present at this point; it is only dropped further down in the notebook.
italydf_anon = df.loc[df['NOC'].eq('ITA')]
italydf_anon.head()

Italian medals per sport:

In [None]:
# Count medal-winning rows per sport for Italy, shaped as (Sport, Count).
medals_per_sport = (
    italydf_anon.loc[italydf_anon['Medal'].notnull(), 'Sport']
    .value_counts()
    .rename_axis('Sport')
    .reset_index(name='Count')
)

fig_medals_per_sport = px.bar(
    medals_per_sport,
    x='Sport',
    y='Count',
    labels={'Sport': 'Sport', 'Count': 'Number of medals'},
    title="Italian Medals per Sport",
)

fig_medals_per_sport.show()

Italian medals per Olympic games:

In [None]:
# Count medal-winning rows per Games edition for Italy, sorted by the "Games" label.
# The result is explicitly shaped as (Games, Count): without the rename the columns coming
# out of value_counts().reset_index() depend on the pandas version and never include
# 'Count', so the bar chart below could not find its y column.
medals_per_games = (
    italydf_anon.loc[italydf_anon['Medal'].notnull(), 'Games']
    .value_counts()
    .sort_index()
    .rename_axis('Games')
    .reset_index(name='Count')
)

# (A stray plt.subplots(...) call used to sit here; it only produced an empty matplotlib
# figure and bound the name of the cycling heatmap figure before that figure existed.)

fig_medals_per_games = px.bar(
    medals_per_games,
    x='Games',
    y='Count',
    title="Italian medals per OS",
    labels={'Games': 'Olympic Games', 'Count': 'Number of medals'}
)

fig_medals_per_games.show()

Italian age distribution:

In [None]:
# Youngest and oldest age among the Italian rows.
ita_age_min = italydf_anon['Age'].min()
ita_age_max = italydf_anon['Age'].max()

# One-year bins padded by five years on each side. The scalars are converted to Python
# ints *before* the padding is applied: 'Age' is cast to uint8 earlier in the notebook, and
# arithmetic on a uint8 scalar can wrap around (e.g. 3 - 5 -> 254) instead of going negative.
bins = list(range(int(ita_age_min) - 5, int(ita_age_max) + 5, 1))

# Only the number of bins is passed on; plotly chooses the actual bin edges.
fig_age_distribution_ita = px.histogram(
    italydf_anon,
    x='Age',
    nbins=len(bins),
    title="Age Distribution of Italian Athletes",
    labels={'Age': 'Age', 'count': 'Number of athletes'}
)

# Outline the bars and leave a small gap so neighbouring bins stay distinguishable.
fig_age_distribution_ita.update_traces(marker_line_color="black", marker_line_width=1)
fig_age_distribution_ita.update_layout(bargap=0.05)

fig_age_distribution_ita.show()


# Equestrianism

In [None]:
# Italian rows restricted to the sport "Equestrianism".
ita_df_equestrianism = italydf_anon.loc[italydf_anon['Sport'].eq('Equestrianism')]

Medal distribution between countries (NOC):

In [None]:
# All equestrianism rows, regardless of country.
df_equestrianism = df.loc[df['Sport'].eq('Equestrianism')]

# Medal-winning rows per NOC, shaped as (NOC, Count) and ordered by descending count.
medals_per_country = (
    df_equestrianism.loc[df_equestrianism['Medal'].notnull(), 'NOC']
    .value_counts()
    .rename_axis('NOC')
    .reset_index(name='Count')
)

fig_eq_NOC_medal_distribution = px.bar(
    medals_per_country,
    x='NOC',
    y='Count',
    labels={'NOC': 'Country', 'Count': 'Number of Medals'},
    title="Equestrianism medals per country",
)

fig_eq_NOC_medal_distribution.show()


Age distribution:

In [None]:
# Split the Italian rows into equestrianism and everything else, tagging each part with a
# "Group" label; assign() returns a new frame, so the source frame is left untouched.
equestrianism = italydf_anon[italydf_anon["Sport"] == "Equestrianism"].assign(Group="Equestrianism")
other_sports = italydf_anon[italydf_anon["Sport"] != "Equestrianism"].assign(Group="Other sports")

# Stack both parts into one frame so they can be faceted by group.
age_compare = pd.concat([equestrianism, other_sports], ignore_index=True)

# Mean age per group, rounded to one decimal.
mean_age = age_compare.groupby("Group", as_index=False)["Age"].mean().round(1)

# One histogram row per group, normalised to percent so the differently sized groups are comparable.
fig_age_distribution_eq_vs_other = px.histogram(
    age_compare,
    x="Age",
    facet_row="Group",
    histnorm="percent",
    nbins=30,
    labels={"Age": "Age", "Group": "Group"},
    title="Age Distribution - Equestrianism vs Other Italian Sports",
)
fig_age_distribution_eq_vs_other.update_layout(
    font=dict(size=14),
    height=700,
    margin=dict(t=80, b=40),
)

# Clip the x range to hide the sparse tail on the right, and give each facet its own y scale.
fig_age_distribution_eq_vs_other.update_xaxes(range=[10, 65])
fig_age_distribution_eq_vs_other.update_yaxes(matches=None)

fig_age_distribution_eq_vs_other.show()

Average activity span and age span for athletes:

In [None]:
# Add a SHA-256 digest of every athlete name to the main dataframe (column position 2).
# The guard keeps the cell re-runnable: DataFrame.insert raises ValueError when the column
# already exists.
# NOTE(review): an unsalted hash of a name can be reversed by hashing candidate names, so
# this is pseudonymisation rather than anonymisation — confirm that is sufficient.
if "Name_HASH" not in df.columns:
    df.insert(
        loc=2,
        column="Name_HASH",
        value=df["Name"].apply(lambda x: hashlib.sha256(x.encode()).hexdigest()),
    )

# Remove the clear-text name column.
df_anon = df.drop(["Name"], axis=1)

# Same for the Italian subset. `italydf` was never defined anywhere above, so on a fresh
# kernel this cell failed with NameError; derive it from the (already hashed) main frame.
italydf = df[df["NOC"] == "ITA"].copy()

italydf_anon = italydf.drop(["Name"], axis=1)

### Panagiotis: Cycling (you knew it)

The Olympic Games, like most organized sports, have historically excluded women.\
For this reason we'll compare cycling participation of each event over the years, split by gender.\
\
As we can see, women have only been allowed to compete since 1980, and even then it was only one discipline.\
Participation increased slowly, and in 1996 when the more modern MTB and BMX events were introduced women were ready at the start.\
Since 2012 the cycling events have been homogeneous: women and men compete in the same, albeit separate, disciplines.

In [None]:
# Independent copy of the cycling rows, so the columns added below do not touch df_anon.
# (The statement used to be written twice; once is enough.)
cycling_df = df_anon[df_anon["Sport"] == "Cycling"].copy()

# Give the women's team pursuit the same ", 4,000 metres" suffix as the text below expects
# for grouping. regex=False makes the literal (non-regex) substring replacement explicit
# instead of relying on the pandas-version-dependent default.
cycling_df["Event"] = cycling_df["Event"].str.replace(
    "Cycling Women's Team Pursuit",
    "Cycling Women's Team Pursuit, 4,000 metres",
    regex=False,
)

def cycling_base_event(cycling_event):
    """Return the event name with every "Cycling ", "Men's " and "Women's " marker removed.

    The markers are stripped in that order and wherever they occur in the string,
    leaving a gender-neutral event name.
    """
    for marker in ("Cycling ", "Men's ", "Women's "):
        cycling_event = cycling_event.replace(marker, "")
    return cycling_event

# Gender-neutral event name for every cycling row.
cycling_df.loc[:, "Base Event"] = cycling_df["Event"].apply(cycling_base_event)

unique_cycling_events = cycling_df["Base Event"].unique()

# Order the base events by the number of distinct years in which men competed in them
# (most years first); events without any men's rows count as 0.
men_event_amount = (cycling_df[cycling_df["Sex"] == "M"].groupby("Base Event")["Year"].nunique())
sorted_base_events = sorted(unique_cycling_events, key=lambda x: men_event_amount.get(x, 0), reverse=True)

# Label of the form "Men's <base event>" / "Women's <base event>". Built with vectorised
# string concatenation: the previous f-string nested double quotes inside a double-quoted
# f-string, which is a SyntaxError on Python versions before 3.12.
cycling_df["Grouped Event"] = (
    cycling_df["Sex"].map(lambda sex: "Men's" if sex == "M" else "Women's")
    + " "
    + cycling_df["Base Event"]
)

# Number of rows per (grouped event, year); the count ends up in the "ID" column.
cycling_heatmap_data = cycling_df.groupby(["Grouped Event", "Year"])["ID"].count().reset_index()
# Split the label back into gender prefix and base event to build the sort keys.
cycling_heatmap_data["Base Event"] = cycling_heatmap_data["Grouped Event"].apply(lambda x: x.split(" ", 1)[1])

cycling_heatmap_data["Base Event Order"] = cycling_heatmap_data["Base Event"].apply(lambda x: sorted_base_events.index(x))
cycling_heatmap_data["Gender"] = cycling_heatmap_data["Grouped Event"].apply(lambda x: x.split(" ", 1)[0])
cycling_heatmap_data = cycling_heatmap_data.sort_values(by=["Base Event Order", "Gender"])

# The helper columns were only needed for sorting; reverse the row order afterwards.
cycling_heatmap_data = cycling_heatmap_data.drop(columns=["Base Event", "Base Event Order", "Gender"])
cycling_heatmap_data = cycling_heatmap_data.iloc[::-1]

# Colour scale: exact zero stays white, anything above zero runs from salmon to blue.
cycling_color_scale = [
    [0.0, "white"],
    [0.001, "salmon"],
    [1.0, "blue"],
]

# Tick marks every eight years from 1896 up to the last year present in the cycling data.
cycling_heatmap_end_year = cycling_df["Year"].max()
cycling_heatmap_tick_vals = list(range(1896, cycling_heatmap_end_year + 1, 8))

# Heatmap of participant counts per year and event; the bin count along x is derived from
# the covered year span.
cycling_heatmap_fig = px.density_heatmap(
    cycling_heatmap_data,
    x="Year",
    y="Grouped Event",
    z="ID",
    title="Cycling through the Olympics",
    labels={"ID": "participants"},
    color_continuous_scale=cycling_color_scale,
    nbinsx=int((cycling_df["Year"].max()-cycling_df["Year"].min()+4)/2),
    text_auto=True,
    height=800,
)

cycling_heatmap_fig.update_layout(
    xaxis_title="Year",
    yaxis_title="Event",
    coloraxis_colorbar={"title": "Participants"},
    xaxis=dict(tickmode="array", tickvals=cycling_heatmap_tick_vals),
)

# Subtitle centred just above the plotting area.
cycling_heatmap_fig.add_annotation(
    text="Amount of cycling event participants by year and event",
    xref="paper",
    yref="paper",
    x=0.5,
    y=1.05,
    showarrow=False,
    font=dict(size=14),
)

cycling_heatmap_fig.show()


### Cycling medal distribution by country

Team duplicates are removed, counting only by unique event and year.\
Road cycling culture may be primarily associated with France and Italy, but the UK is definitely on a level with France, and the US & Australia also provide a solid supply of ambitious cyclists.

In [None]:
# One row per distinct (NOC, Year, Event, Medal) combination, so a team medal is counted once.
national_cycling_df = cycling_df.loc[:, ["NOC", "Year", "Event", "Medal"]].drop_duplicates()

# Medal-type counts per country (missing combinations become 0) ...
cycling_medal_distribution_noc = (
    national_cycling_df
    .groupby("NOC")["Medal"]
    .value_counts()
    .unstack()
    .fillna(0)
)
# ... plus a row total, keeping the 15 countries with the highest total.
cycling_medal_distribution_noc = (
    cycling_medal_distribution_noc
    .assign(Total=cycling_medal_distribution_noc.sum(axis=1))
    .sort_values(by="Total", ascending=False)
    .head(15)
)


national_cycling_fig = px.bar(
    cycling_medal_distribution_noc.reset_index(),
    x="NOC",
    y=["Bronze", "Silver", "Gold"],
    barmode="stack",
    color_discrete_map={"Bronze": "saddlebrown", "Silver": "silver", "Gold": "gold"},
    labels={"value": "Number of medals", "variable": "Medal Type", "NOC": "Country"},
    title="Cycling medal distribution by country",
)

national_cycling_fig.update_layout(xaxis_tickangle=-45)
national_cycling_fig.show()

### Medal distribution by age

How does aging affect the likelihood of winning a medal in Olympic cycling?\
While entering their 30s (and even 40s) may not end the chances\
of an Olympic medal for athletes in endurance sports,\
the likelihood drops sharply once they pass their mid-20s.

In [None]:
# Wide table: one row per age, one column per medal type, cells hold row counts.
cycling_medal_distribution = (
    cycling_df
    .groupby("Age")["Medal"]
    .value_counts()
    .unstack(fill_value=0)
    .reset_index()
)

# Long form (Age, Medal, Count) for a stacked bar chart.
cycling_medal_distribution_melted = cycling_medal_distribution.melt(
    id_vars="Age",
    value_vars=["Bronze", "Silver", "Gold"],
    value_name="Count",
    var_name="Medal",
)

cycling_medal_distribution_fig = px.bar(
    cycling_medal_distribution_melted,
    x="Age",
    y="Count",
    color="Medal",
    barmode="stack",
    color_discrete_map={"Bronze": "saddlebrown", "Silver": "silver", "Gold": "gold"},
    labels={"Age": "Athlete age", "Count": "Number of medals"},
    title="Cycling medal distribution by athlete age",
)

cycling_medal_distribution_fig.show()

In [None]:
# Distinct athlete IDs per age (rows) and NOC (columns).
cycling_participant_distribution = (
    cycling_df
    .groupby(["Age", "NOC"])["ID"]
    .nunique()
    .unstack(fill_value=0)
    .reset_index()
)

# Sum of every country column except Italy.
cycling_participant_distribution["Not_Italy"] = (
    cycling_participant_distribution
    .drop(columns=["Age", "ITA"], errors="ignore")
    .sum(axis=1)
)
# Keep only the three columns that are plotted.
cycling_participant_distribution = (
    cycling_participant_distribution.loc[:, ["Age", "ITA", "Not_Italy"]].fillna(0)
)

# Long form (Age, Group, Count) for a stacked bar chart.
participant_distribution_melted = cycling_participant_distribution.melt(
    id_vars="Age",
    value_vars=["Not_Italy", "ITA"],
    value_name="Count",
    var_name="Group",
)

cycling_participant_age_distribution_fig = px.bar(
    participant_distribution_melted,
    x="Age",
    y="Count",
    color="Group",
    barmode="stack",
    color_discrete_map={"ITA": "salmon", "Not_Italy": "blue"},
    labels={"Age": "Athlete age", "Count": "Number of participants"},
    title="Cyclist distribution by age",
)

# Replace the raw column names in the legend with readable labels.
cycling_participant_age_distribution_fig.update_layout(legend_title_text="Participants by")
cycling_participant_age_distribution_fig.for_each_trace(
    lambda trace: trace.update(name="Not Italy" if trace.name == "Not_Italy" else "Italy")
)
cycling_participant_age_distribution_fig.show()


### Italy's cycling medals over the years

Italy had a significant impact on the Olympic cycling scene during the 20th century.
While claiming few medals early on they start to climb after the first world war,\
and following a bump during the second they enjoy prominent success during the 50's and 60's.

However, the "Years of Lead" from the late 60's to the late 80's almost wipe out their medal proportion,\
and while they start recovering toward the 21st century, cycling also broadens during this time.

With other nations catching up to Italy in the traditional road & velodrome disciplines,\
the addition of the American BMX & MTB events also dilutes Italy's medal proportion during the modern Olympics.


In [None]:
# Medal-winning rows per year (rows) and NOC (columns).
# NOTE(review): this counts rows of cycling_df directly, so every member of a medal-winning
# team contributes one medal here, unlike the per-country chart, which de-duplicates on
# (NOC, Year, Event, Medal) — confirm which definition is intended.
cycling_medal_counts = (
    cycling_df
    .groupby(["Year", "NOC"])["Medal"]
    .count()
    .unstack(fill_value=0)
)

# Collapse all non-Italian columns into one and reshape to long form (Year, Group, Medals).
cycling_medal_proportion_plot = (
    cycling_medal_counts
    .assign(not_italy=lambda counts: counts.drop(columns=["ITA"]).sum(axis=1))
    .loc[:, ["not_italy", "ITA"]]
    .reset_index()
    .melt(id_vars="Year", var_name="Group", value_name="Medals")
)

cycling_proportion_medal_fig = px.bar(
    cycling_medal_proportion_plot,
    x="Year",
    y="Medals",
    color="Group",
    barmode="stack",
    color_discrete_map={"not_italy": "blue", "ITA": "salmon"},
    labels={"Medals": "Number of Medals", "Year": "Year"},
    title="Italy's historical medal proportion in Olympic cycling",
)

# Replace the raw column names in the legend with readable labels.
cycling_proportion_medal_fig.update_layout(legend_title_text='Medals by')
cycling_proportion_medal_fig.for_each_trace(
    lambda trace: trace.update(name="Not Italy" if trace.name == "not_italy" else "Italy")
)

cycling_proportion_medal_fig.show()

## Simning

### Medaljer

In [None]:
# First extract the Italian swimming rows.
ita_simn = italydf_anon.loc[italydf_anon["Sport"] == "Swimming"]

# Keep medal-winning rows only and drop repeated (Year, Medal, Event, ID) combinations.
# Because "ID" is part of the key, every member of a medal-winning relay keeps a row.
ita_simn_medaljer = (
    ita_simn[ita_simn["Medal"].notna()]
    .drop_duplicates(subset=["Year", "Medal", "Event", "ID"])
)

# Number of medal rows per year, as a (Year, Medal) frame.
medaljer_år = ita_simn_medaljer.groupby("Year", as_index=False)["Medal"].count()


In [None]:

# First plotly figure: medal count per Olympic year as a line with markers.
fig = px.line(medaljer_år, x="Year", y="Medal", markers=True, title="Antal Medaljer för Italien per OS")

fig.update_layout(yaxis_title="Antal Medaljer", xaxis_title="OS År")

fig.show()

In [None]:
# Break the Italian swimming medals down by medal type for every year.
medaljer_typ = (
    ita_simn_medaljer
    .groupby(["Year", "Medal"])
    .size()
    .reset_index(name="Count")
)

# Bar colours chosen to resemble the medals themselves.
color_map = dict(Gold="#C6A907", Silver="#C0C0C0", Bronze="#CD7F32")

# Side-by-side bars per year, one bar per medal type.
fig = px.bar(
    medaljer_typ,
    x="Year",
    y="Count",
    color="Medal",
    barmode="group",
    color_discrete_map=color_map,
    title="Medaljfördelning för Italien i Simning per OS-år",
)

fig.update_layout(yaxis_title="Antal Medaljer", xaxis_title="OS År")
fig.show()

### Ålder

In [None]:
# Age dataset: Italian swimming rows with a strictly positive age.
ita_swim_age = ita_simn.loc[ita_simn["Age"].gt(0)]

In [None]:
# Histogram of swimmer ages, with at most 20 bins.
fig = px.histogram(ita_swim_age, x="Age", title="Ålderfördelning för italienska simmare", nbins=20)
fig.update_layout(yaxis_title="Antal Atleter", xaxis_title="Simmarens Åldern")
fig.show()

In [None]:
# Age distribution per Olympic year, one box per year.
fig = px.box(ita_swim_age, y="Age", x="Year", title="Åldrar av italienska simmare per OS-år")

fig.update_layout(yaxis_title="Ålder", xaxis_title="OS-År")
fig.show()

### Medaljer vs. Ålder

In [None]:
# Density of medal-winning swimmers over year (x) and age (y).
# NOTE(review): nbinsx is set to the number of medal rows, not the number of distinct
# years — confirm that this is the intended bin count.
fig = px.density_heatmap(
    ita_simn_medaljer,
    x="Year",
    y="Age",
    nbinsx=len(ita_simn_medaljer['Year']),
    nbinsy=20,
    color_continuous_scale="plasma",
    title="Medaljvinnande Simmare: Ålder och År (Densitet)",
)
fig.update_layout(yaxis_title="Ålder", xaxis_title="OS År")
fig.show()

### Simning Vs Andra Länder - Hur gick det för Italien

In [None]:
# Compare Italy with the rest of the world.
# Start from every medal-winning swimming row of any country, without repeated
# (Year, Event, Medal, ID) combinations.
global_simn = (
    df_anon.loc[df_anon["Sport"].eq("Swimming") & df_anon["Medal"].notna()]
    .drop_duplicates(subset=["Year", "Event", "Medal", "ID"])
)

# Medal rows per country.
medaljer_land = global_simn.groupby("NOC")["Medal"].count().reset_index(name="Antal Medaljer")

# Colour column so that Italy stands out from every other country.
medaljer_land["Färg"] = medaljer_land["NOC"].map(lambda noc: "blue" if noc == "ITA" else "red")

# Keep the 20 countries with the most medal rows, highest first.
medaljer_land = medaljer_land.sort_values("Antal Medaljer", ascending=False).head(20)

In [None]:
# Bar chart of the top-20 table, coloured through the helper colour column.
fig = px.bar(
    medaljer_land,
    x="NOC",
    y="Antal Medaljer",
    title = "Top 20 Världens Simning Medaljer",
    color="Färg",
    color_discrete_map={"blue":"blue", "red":"red"},
)

# Pin the x categories to the table order (descending medal count) and hide the legend.
fig.update_layout(
    showlegend = False,
    xaxis={'categoryorder':'array', 'categoryarray': medaljer_land["NOC"]},
    xaxis_title="Land (NOC)",
    yaxis_title="Antal Medaljer",
)

fig.show()

### Italien Simning Åldersdistribution vs andra sporter

In [None]:
# Italian rows of every sport except swimming, with a strictly positive age.
ita_age_othersports = italydf_anon[
    (italydf_anon["Sport"] != "Swimming") & (italydf_anon["Age"] > 0)
]

# Tag both subsets with a group label and stack them into one frame.
age_compare = pd.concat(
    [
        ita_swim_age.assign(Group="Swimming"),
        ita_age_othersports.assign(Group="Other Sports"),
    ]
)

# One box per group, with every individual observation drawn next to it.
fig_age_swim_vs_other = px.box(
    age_compare,
    x="Group",
    y="Age",
    points="all",
    title="Age Distribution Swimming vs other Italian Sports",
)

# update_layout returns the figure; as the last expression it is what the cell displays.
fig_age_swim_vs_other.update_layout(yaxis_title="Age Distribution", xaxis_title="Sport")

In [None]:
# Sport and age of every Italian row with a strictly positive age.
ita_all_age = italydf_anon.loc[italydf_anon["Age"] > 0, ["Sport", "Age"]]

unique_sports = ita_all_age["Sport"].unique()

# Every sport is drawn in gray except swimming, which is highlighted in blue.
color_map = dict.fromkeys(unique_sports, "gray")
color_map["Swimming"] = "blue"

fig_age_all_sports = px.box(
    ita_all_age,
    x="Sport",
    y="Age",
    color="Sport",
    color_discrete_map=color_map,
    title="Age Distribution by Sport (Swimming Highlighted)",
)

# The legend is hidden because there are too many categories for it to be useful.
# update_layout returns the figure; as the last expression it is what the cell displays.
fig_age_all_sports.update_layout(showlegend=False, xaxis_title="Sport", yaxis_title="Age")
