In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import hashlib
import plotly.express as px

In [None]:
# Load the athlete/event records from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="athlete_events.csv")
# Preview the first five rows as the cell output.
df.head(n=5)

Number of distinct countries (NOC codes) represented in the data,
followed by the codes themselves:

In [None]:
# First how many distinct NOC codes exist, then the codes themselves.
noc_codes = df['NOC']
print(noc_codes.nunique())
print(noc_codes.unique())

The different sports represented in the data:

In [None]:
# Every distinct value of the Sport column, in order of first appearance.
sports_seen = df['Sport'].unique()
sports_seen

Types of Medals:

In [None]:
# Every distinct value of the Medal column (a missing medal shows up as NaN).
medal_types = df['Medal'].unique()
medal_types

Statistics over age of athletes:

In [None]:
# Summary statistics for the Age column.
ages = df['Age']
# The minimum is taken over strictly positive ages only.
positive_ages = ages[ages > 0]

print(f"mean age: {ages.mean()}")
print(f"median age: {ages.median()}")
print(f"min age: {positive_ages.min()}")
print(f"max age: {ages.max()}")
print(f"standard deviation: {ages.std()}")

Gender distribution:

In [None]:
# Rows per value of Sex, as a two-column frame ('Sex', 'Count') ready for plotting.
sex_counts = (
    df['Sex']
    .value_counts()
    .rename_axis('Sex')
    .reset_index(name='Count')
)

# One bar per sex, with a fixed colour for each category.
sex_palette = {'M': 'blue', 'F': 'red'}
fig_sex_counts = px.bar(
    sex_counts,
    x='Sex',
    y='Count',
    color='Sex',
    color_discrete_map=sex_palette,
    title="Distribution of Athletes by Gender",
)

fig_sex_counts.show()

In [None]:
# Row counts per (Season, Year) with one column per sex; combinations that do
# not occur are filled with 0. size() counts rows of df, so the numbers are
# entries in the data rather than de-duplicated individuals.
gender_year_season = (
    df.groupby(['Season', 'Year', 'Sex'])
    .size()
    .unstack(fill_value=0)
    .reset_index()
)


def _gender_trend(season):
    """Build the per-year gender trend for one season.

    Parameters
    ----------
    season : str
        Value of the 'Season' column to keep (e.g. 'Summer' or 'Winter').

    Returns
    -------
    tuple
        (long-format DataFrame with columns Year / Sex / Number of athletes,
        plotly line figure titled '<season> OS').
    """
    season_rows = gender_year_season[gender_year_season['Season'] == season]
    long_counts = season_rows.melt(
        id_vars=['Year'], value_vars=['F', 'M'], var_name='Sex', value_name='Number of athletes'
    )
    trend_fig = px.line(
        long_counts,
        x='Year',
        y='Number of athletes',
        color='Sex',
        markers=True,
        title=f'{season} OS'
    )
    return long_counts, trend_fig


# Same pipeline for both seasons; the original global names are kept.
summer_df, fig_summer = _gender_trend('Summer')
winter_df, fig_winter = _gender_trend('Winter')

# Show the figures
fig_summer.show()
fig_winter.show()


Top 10 countries medal distribution:

In [None]:

# Medal rows per NOC; count() ignores rows where Medal is missing.
medal_counts_by_noc = df.groupby('NOC')['Medal'].count()

# Keep the ten largest counts as a two-column frame ('NOC', 'Count').
top10_medals = (
    medal_counts_by_noc
    .sort_values(ascending=False)
    .head(10)
    .reset_index(name='Count')
)

top10_axis_labels = {'NOC': 'Country', 'Count': 'Number of medals'}
fig_top10_medals = px.bar(
    top10_medals,
    x='NOC',
    y='Count',
    title="Top 10 countries by total medals",
    labels=top10_axis_labels,
)

fig_top10_medals.show()

# Fencing

Anonymization of athlete names:

In [None]:
def _sha256_hex(name):
    """Return the hexadecimal SHA-256 digest of `name` (encoded with str.encode's default)."""
    return hashlib.sha256(name.encode()).hexdigest()


# Overwrite the Name column in place with the digest of each name.
# NOTE(review): the hash is unsalted, so a known name can be re-hashed and
# matched; and re-running this cell hashes the digests again.
df['Name'] = df['Name'].map(_sha256_hex)

Separating Italian statistics from the data:

In [None]:
# Rows whose NOC code is 'ITA'.
is_italy = df['NOC'].eq('ITA')
italydf_anon = df.loc[is_italy]
italydf_anon.head()

Italian medals per sport:

In [None]:
# Italian rows that carry a medal, counted per sport (most frequent first).
ita_medal_rows = italydf_anon.dropna(subset=['Medal'])
medals_per_sport = (
    ita_medal_rows['Sport']
    .value_counts()
    .rename_axis('Sport')
    .reset_index(name='Count')
)

sport_axis_labels = {'Sport': 'Sport', 'Count': 'Number of medals'}
fig_medals_per_sport = px.bar(
    medals_per_sport,
    x='Sport',
    y='Count',
    title="Italian Medals per Sport",
    labels=sport_axis_labels,
)

fig_medals_per_sport.show()

Italian medals per Olympic games:

In [None]:
# Italian medal rows counted per Games label, ordered by that label.
ita_medal_games = italydf_anon.loc[italydf_anon['Medal'].notna(), 'Games']
medals_per_games = (
    ita_medal_games
    .value_counts()
    .sort_index()
    .rename_axis('Games')
    .reset_index(name='Count')
)

games_axis_labels = {'Games': 'Olympic Games', 'Count': 'Number of medals'}
fig_medals_per_games = px.bar(
    medals_per_games,
    x='Games',
    y='Count',
    title="Italian medals per OS",
    labels=games_axis_labels,
)

fig_medals_per_games.show()

Italian age distribution:

In [None]:
# Observed age range of the Italian rows (min/max skip missing ages).
ita_age_min = italydf_anon['Age'].min()
ita_age_max = italydf_anon['Age'].max()

# One candidate bin per year of age, padded by 5 years on each side;
# only the length of this list is used below.
bins = list(range(int(ita_age_min - 5), int(ita_age_max + 5), 1))

fig_age_distribution_ita = px.histogram(
    italydf_anon,
    x='Age',
    nbins=len(bins),
    title="Age Distribution of Italian Athletes",
    labels={'Age': 'Age'}
)

fig_age_distribution_ita.update_traces(marker_line_color="black", marker_line_width=1)
# The y-axis title is set explicitly: a 'count' entry in `labels` does not
# appear to rename the histogram's generated count axis
# (NOTE(review): confirm against the installed plotly version).
fig_age_distribution_ita.update_layout(
    bargap=0.05,
    yaxis_title="Number of athletes"
)

fig_age_distribution_ita.show()


# Equestrianism

In [None]:
# Italian rows restricted to the sport 'Equestrianism'.
ita_is_equestrian = italydf_anon['Sport'].eq('Equestrianism')
ita_df_equestrianism = italydf_anon.loc[ita_is_equestrian]

Medal distribution between countries (NOC):

In [None]:
# All equestrianism rows, regardless of country.
df_equestrianism = df.loc[df['Sport'].eq('Equestrianism')]

# Medal-carrying rows counted per NOC (most frequent first).
eq_medal_rows = df_equestrianism.dropna(subset=['Medal'])
medals_per_country = (
    eq_medal_rows['NOC']
    .value_counts()
    .rename_axis('NOC')
    .reset_index(name='Count')
)

eq_axis_labels = {'NOC': 'Country', 'Count': 'Number of Medals'}
fig_eq_NOC_medal_distribution = px.bar(
    medals_per_country,
    x='NOC',
    y='Count',
    title="Equestrianism medals per country",
    labels=eq_axis_labels,
)

fig_eq_NOC_medal_distribution.show()


Age distribution:

In [None]:
# Split the Italian rows into equestrianism vs everything else and tag each
# part with a Group label; assign() returns a new frame, so italydf_anon
# itself is left untouched.
is_equestrian = italydf_anon["Sport"] == "Equestrianism"
equestrianism = italydf_anon[is_equestrian].assign(Group="Equestrianism")
other_sports = italydf_anon[~is_equestrian].assign(Group="Other sports")

# Equestrianism rows first, then the rest, with a fresh 0..n-1 index.
age_compare = pd.concat([equestrianism, other_sports], ignore_index=True)

# Mean age per group, rounded to one decimal (computed but not displayed here).
mean_age = (
    age_compare.groupby("Group")["Age"]
    .mean()
    .reset_index()
    .round(1)
)

# One histogram row per group, normalised to percentages within each group.
facet_hist_options = dict(
    x="Age",
    nbins=30,
    histnorm="percent",
    facet_row="Group",
    title="Age Distribution - Equestrianism vs Other Italian Sports",
    labels={"Age": "Age", "Group": "Group"},
)
fig_age_distribution_eq_vs_other = px.histogram(age_compare, **facet_hist_options)

fig_age_distribution_eq_vs_other.update_layout(
    height=700,
    margin={"t": 80, "b": 40},
    font={"size": 14},
)

# Clip the x axis to a fixed 10-65 window and let each facet scale its own y axis.
fig_age_distribution_eq_vs_other.update_xaxes(range=[10, 65])
fig_age_distribution_eq_vs_other.update_yaxes(matches=None)

fig_age_distribution_eq_vs_other.show()

Average activity span and age span for athletes:

In [None]:
# Per (hashed) name among Italian equestrianism rows: youngest and oldest
# recorded age, and the number of distinct years the name appears in.
age_span_per_person = (
    ita_df_equestrianism.groupby('Name')
    .agg(
        MinAge=('Age', 'min'),
        MaxAge=('Age', 'max'),
        ActiveYears=('Year', 'nunique'),
    )
    .reset_index()
)

longest_active = age_span_per_person['ActiveYears'].max()

# One candidate bin per count value, padded by 5; only the length is used below.
bins = list(range(0, int(longest_active + 5), 1))

fig_eq_activity_years = px.histogram(
    age_span_per_person,
    x='ActiveYears',
    nbins=len(bins),
    title="Amount of Active Years for Italian Athletes",
    labels={"ActiveYears": "Number of Games"}
)

fig_eq_activity_years.update_traces(marker_line_width=1, marker_line_color="black")
# The y-axis title is set explicitly: a 'count' entry in `labels` does not
# appear to rename the histogram's generated count axis
# (NOTE(review): confirm against the installed plotly version).
fig_eq_activity_years.update_layout(
    bargap=0.05,
    yaxis_title="Number of Athletes"
)

fig_eq_activity_years.show()

Gender distribution:

In [None]:
# Rows per (Year, Sex) for Italian equestrianism, pivoted to one column per
# sex; year/sex combinations that never occur become 0.
rows_per_year_sex = ita_df_equestrianism.groupby(['Year', 'Sex']).size()
counts = rows_per_year_sex.unstack().fillna(0)

# Back to long format (Year, Sex, Count) for a grouped bar chart.
df_counts = counts.reset_index().melt(
    id_vars='Year',
    var_name='Sex',
    value_name='Count',
)

fig_eq_gender_distribution = px.bar(
    df_counts,
    x='Year',
    y='Count',
    color='Sex',
    barmode='group',
    title="Gender distribution of Italian equestrianism athletes over the years",
)

fig_eq_gender_distribution.show()

Italian Equestrianism medals per year:

In [None]:
# Italian equestrianism rows with a medal, dropping rows that repeat the same
# (Year, Medal, Event, ID) combination.
dedupe_keys = ['Year', 'Medal', 'Event', 'ID']
ita_eq_medals = (
    ita_df_equestrianism
    .dropna(subset=['Medal'])
    .drop_duplicates(subset=dedupe_keys)
)

# Number of remaining rows per year and medal type.
medals_type = (
    ita_eq_medals
    .groupby(['Year', 'Medal'])
    .size()
    .reset_index(name="Count")
)

# Bar colours chosen to match the medal metal.
color_map = dict(Gold="#C6A907", Silver="#C0C0C0", Bronze="#CD7F32")

fig_eq_medals_type = px.bar(
    medals_type,
    x='Year',
    y='Count',
    color='Medal',
    color_discrete_map=color_map,
    title="Medal Distribution for Italy per year",
    barmode='group',
)

fig_eq_medals_type.update_layout(xaxis_title="Year", yaxis_title="Number of Medals")

fig_eq_medals_type.show()



## Simning

### Medaljer

In [None]:
# Extract the Italian swimming rows.
# (A bare `ita_simn.head()` used to sit here; in the middle of a cell it
# displays nothing, so it was removed.)
ita_simn = italydf_anon[italydf_anon["Sport"] == "Swimming"]

# Keep medal-carrying rows and drop rows repeating the same
# (Year, Medal, Event, ID) combination.
ita_simn_medaljer = (
    ita_simn
    .dropna(subset=["Medal"])
    .drop_duplicates(subset=["Year", "Medal", "Event", "ID"])
)

# Medal rows per year; the resulting count column keeps the name "Medal".
medaljer_år = ita_simn_medaljer.groupby("Year")["Medal"].count().reset_index()


In [None]:

# First plotly figure of the swimming section: medal rows per year as a line
# with markers ("Medal" holds the per-year count here).
medal_line_options = dict(
    x="Year",
    y="Medal",
    title="Antal Medaljer för Italien per OS",
    markers=True,
)
fig = px.line(medaljer_år, **medal_line_options)

fig.update_layout(xaxis_title="OS År", yaxis_title="Antal Medaljer")

fig.show()

In [None]:
# Break the Italian swimming medals down by medal type per year.
# (A bare `medaljer_typ.head()` used to follow; in the middle of a cell it
# displays nothing, so it was removed.)
medaljer_typ = (
    ita_simn_medaljer
    .groupby(["Year", "Medal"])
    .size()
    .reset_index(name="Count")
)

# Bar colours chosen to match the medal metal.
color_map = {
    "Gold": "#C6A907",
    "Silver": "#C0C0C0",
    "Bronze": "#CD7F32"
}

fig = px.bar(
    medaljer_typ,
    x="Year",
    y="Count",
    color="Medal",
    color_discrete_map=color_map,
    title="Medaljfördelning för Italien i Simning per OS-år",
    barmode="group"
)

fig.update_layout(
    xaxis_title="OS År",
    yaxis_title="Antal Medaljer"
)
fig.show()

### Ålder

In [None]:
# Age dataset: Italian swimming rows with a strictly positive age
# (rows with a missing age fail the comparison and are dropped).
has_positive_age = ita_simn["Age"].gt(0)
ita_swim_age = ita_simn.loc[has_positive_age]

In [None]:
# Histogram of the ages of Italian swimmers, with 20 requested bins.
swim_age_hist_options = dict(
    x="Age",
    nbins=20,
    title="Ålderfördelning för italienska simmare",
)
fig = px.histogram(ita_swim_age, **swim_age_hist_options)

fig.update_layout(xaxis_title="Simmarens Åldern", yaxis_title="Antal Atleter")
fig.show()

In [None]:
# Age distribution of Italian swimmers per year, one box per year.
swim_age_box_options = dict(
    x="Year",
    y="Age",
    title="Åldrar av italienska simmare per OS-år",
)
fig = px.box(ita_swim_age, **swim_age_box_options)

fig.update_layout(xaxis_title="OS-År", yaxis_title="Ålder")
fig.show()

### Medaljer vs. Ålder

In [None]:
# 2-D density of medal rows over (Year, Age).
# nbinsx is the number of medal rows, not the number of distinct years.
# NOTE(review): confirm that is the intended bin budget.
medal_row_count = len(ita_simn_medaljer)

fig = px.density_heatmap(
    ita_simn_medaljer,
    x="Year",
    y="Age",
    title="Medaljvinnande Simmare: Ålder och År (Densitet)",
    nbinsx=medal_row_count,
    nbinsy=20,
    color_continuous_scale="plasma",
)
fig.update_layout(xaxis_title="OS År", yaxis_title="Ålder")
fig.show()

### Simning Vs Andra Länder - Hur gick det för Italien

In [None]:
# Compare Italy with the rest of the world.
# All swimming rows that carry a medal, worldwide, without rows repeating the
# same (Year, Event, Medal, ID). `df` is used because no `df_anon` is ever
# defined above (the old reference raised NameError on a fresh kernel); the
# name hashing above overwrote df['Name'] in place, so `df` is the anonymized frame.
is_swimming_medal = (df["Sport"] == "Swimming") & df["Medal"].notna()
global_simn = df[is_swimming_medal].drop_duplicates(subset=["Year", "Event", "Medal", "ID"])

# Group the medal rows by country (NOC).
medaljer_land = (
    global_simn.groupby("NOC")["Medal"]
    .count()
    .reset_index(name="Antal Medaljer")
    )

# Colour column so that Italy gets its own colour.
medaljer_land["Färg"] = medaljer_land["NOC"].apply(lambda x: "blue" if x == "ITA" else "red")

# Finally sort descending and keep the 20 countries with the most medal rows.
medaljer_land = medaljer_land.sort_values("Antal Medaljer", ascending=False).head(20)

In [None]:
# Bar chart of the 20 countries; the "Färg" column picks the bar colour.
identity_palette = {"blue": "blue", "red": "red"}
fig = px.bar(
    medaljer_land,
    x="NOC",
    y="Antal Medaljer",
    color="Färg",
    color_discrete_map=identity_palette,
    title="Top 20 Världens Simning Medaljer",
)

fig.update_layout(
    xaxis_title="Land (NOC)",
    yaxis_title="Antal Medaljer",
    showlegend=False,
)
# Keep the bars in the sorted order of medaljer_land instead of the default
# category ordering.
fig.update_xaxes(categoryorder="array", categoryarray=medaljer_land["NOC"])

fig.show()

### Italien: åldersdistribution i simning vs andra sporter

In [None]:
# Italian rows outside swimming that have a strictly positive age.
not_swimming = italydf_anon["Sport"].ne("Swimming")
positive_age = italydf_anon["Age"].gt(0)
ita_age_othersports = italydf_anon[not_swimming & positive_age]

# Stack swimmers and everyone else, tagging each part with a Group label.
# Note: this reassigns the name `age_compare`.
swimming_part = ita_swim_age.assign(Group="Swimming")
other_part = ita_age_othersports.assign(Group="Other Sports")
age_compare = pd.concat([swimming_part, other_part])

# One box per group, with every individual point drawn alongside.
fig_age_swim_vs_other = px.box(
    age_compare,
    x="Group",
    y="Age",
    title="Age Distribution Swimming vs other Italian Sports",
    points="all",
)

# update_layout returns the figure; as the last expression it is the cell output.
fig_age_swim_vs_other.update_layout(xaxis_title="Sport", yaxis_title="Age Distribution")

In [None]:
# Sport and Age for every Italian row with a strictly positive age.
ita_all_age = italydf_anon.loc[italydf_anon["Age"] > 0, ["Sport", "Age"]]

unique_sports = ita_all_age["Sport"].unique()

# Gray for every sport, then override swimming so it stands out.
color_map = dict.fromkeys(unique_sports, "gray")
color_map["Swimming"] = "blue"

# One box per sport (individual points are not drawn).
fig_age_all_sports = px.box(
    ita_all_age,
    x="Sport",
    y="Age",
    color="Sport",
    title="Age Distribution by Sport (Swimming Highlighted)",
    color_discrete_map=color_map,
)

# Legend hidden: there are too many categories for it to be useful.
# update_layout returns the figure; as the last expression it is the cell output.
fig_age_all_sports.update_layout(xaxis_title="Sport", yaxis_title="Age", showlegend=False)
