In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import hashlib
import plotly
import plotly.express as px



In [None]:

# Load the raw table from the working directory.
# NOTE(review): presumably one row per athlete per event - confirm against the data source.
df = pd.read_csv(filepath_or_buffer=r"athlete_events.csv")


In [None]:
# Replace missing Age/Height/Weight values with that column's median.
measurement_cols = ["Age", "Height", "Weight"]
column_medians = {col: df[col].median() for col in measurement_cols}
df = df.fillna(column_medians)

# Shrink the numeric columns to compact integer dtypes.
# NOTE(review): uint8 only holds 0-255 and the cast drops any fractional part;
# confirm that every Age/Height/Weight value fits.
compact_dtypes = {
    'Age': 'uint8',
    'Height': 'uint8',
    'Weight': 'uint8',
    'ID': 'uint32',
    'Year': 'int16',
}
df = df.astype(compact_dtypes)
df.info()

In [None]:
# Print the median and then the mean of the Age column as it stands in `df`
# at this point in the notebook.
age_column = df['Age']
print(age_column.median())
print(age_column.mean())


# ITALIA

In [None]:
def _sha256_hex(value):
    """Return the hexadecimal SHA-256 digest of ``str(value)``."""
    return hashlib.sha256(str(value).encode()).hexdigest()


# Anonymise names with SHA-256.
# NOTE: df["Name"] is overwritten in place, so running this cell a second
# time hashes the digests again.
df["Name"] = df["Name"].apply(_sha256_hex)

# Fill empty medals with the literal string "None" (= no medal) so later
# cells can filter with a plain string comparison.
df["Medal"] = df["Medal"].fillna("None")

# Independent copy of the rows whose NOC code is "ITA".
is_italy = df["NOC"] == "ITA"
italydf_anon = df[is_italy].copy()

italydf_anon.head()


In [None]:
# Top 10 sports in which Italy has won the most medals.

# Keep a single row per (Games, Event, Medal) so that a team medal is counted
# once instead of once per row.
won_medal = italydf_anon["Medal"] != "None"
ita_medals_unique = italydf_anon[won_medal].drop_duplicates(
    subset=["Games", "Event", "Medal"]
)

# Medal count per sport, largest first.
medals_by_sport = (
    ita_medals_unique.groupby("Sport")["Medal"].count().sort_values(ascending=False)
)

top_ten_sports = medals_by_sport.head(10)
fig_medal_sport = px.bar(
    top_ten_sports,
    labels={"index": "Sport", "value": "Antal medaljer"},
    title="Top 10 sporter där Italien tagit flest medaljer",
)
fig_medal_sport.show()



In [None]:
def _season_medals(frame, season):
    """Return one row per medal won in ``season`` with ``Year`` parsed from ``Games``.

    Rows are filtered to medal winners of the given season and de-duplicated
    on (Games, Event, Medal), so each medal is counted once.
    """
    medal_rows = frame[(frame["Medal"] != "None") & (frame["Season"] == season)]
    return (
        medal_rows
        .drop_duplicates(subset=["Games", "Event", "Medal"])
        # assign() builds a new frame; writing the column with [] on the
        # filtered/de-duplicated result can raise SettingWithCopyWarning.
        .assign(Year=lambda d: d["Games"].str[:4].astype(int))
    )


def _medals_per_year(medal_rows, season_label):
    """Count medals per ``Year`` and tag every row with ``season_label``."""
    return (
        medal_rows
        .groupby("Year")["Medal"]
        .count()
        .reset_index()
        .assign(Season=season_label)
    )


# --- Summer Games ---
ita_summer_unique = _season_medals(italydf_anon, "Summer")
summer = _medals_per_year(ita_summer_unique, "Sommar")

# --- Winter Games ---
ita_winter_unique = _season_medals(italydf_anon, "Winter")
winter = _medals_per_year(ita_winter_unique, "Vinter")

# Long-format table (Year, Medal, Season) so plotly can colour by season.
combined = pd.concat([summer, winter], ignore_index=True)

fig_medal_year = px.line(
    combined,
    x="Year",
    y="Medal",
    color="Season",
    markers=True,
    title="Medaljer per år - Italien (Sommar och vinter)"
)

fig_medal_year.update_layout(
    legend_title_text="Säsong",
    yaxis_title="Antal medaljer"
)

fig_medal_year.show()


In [None]:
# Age histogram for ALL rows of the full dataset (`df`), not only Italy.
# The title previously claimed this chart showed Italian athletes; the
# Italy-only histogram is drawn separately from `italydf_anon`.
# NOTE: missing ages were replaced by the median earlier in the notebook,
# so the bin containing the median may be inflated.
figxx = px.histogram(
    df,
    x="Age",
    nbins=30,
    title="Åldersfördelning - Alla OS-idrottare",
    labels={"Age": "Ålder"},
)

figxx.show()


In [None]:
# Age histogram restricted to the Italian rows (`italydf_anon`).
italy_age_hist_options = dict(
    x="Age",
    nbins=30,
    title="Åldersfördelning - Italienska OS-idrottare",
    labels={"Age": "Ålder"},
)

fig = px.histogram(italydf_anon, **italy_age_hist_options)
fig.show()


In [None]:
# One row per athlete ID per Games, with Year taken from the first four
# characters of the Games text. assign() returns a new frame.
ita_unique_df = (
    italydf_anon
    .drop_duplicates(subset=["Games", "ID"])
    .assign(Year=lambda d: d["Games"].str[:4].astype(int))
)

# Number of participants per (Year, Season, Sex), sorted by those keys.
participant_keys = ["Year", "Season", "Sex"]
ita_participants_df = (
    ita_unique_df
    .groupby(participant_keys)["ID"]
    .count()
    .reset_index(name="Participants")
    .sort_values(participant_keys)
)

# Colour encodes sex, dash pattern encodes season.
participant_axis_labels = {
    "Participants": "Antal deltagare",
    "Season": "Säsong",
    "Sex": "Kön",
}
fig_ita_participants = px.line(
    ita_participants_df,
    x="Year",
    y="Participants",
    color="Sex",
    line_dash="Season",
    markers=True,
    title="Antal deltagare per år - Italien (Sommar/Vinter & kön)",
    labels=participant_axis_labels,
)

fig_ita_participants.show()


In [None]:
def _participants_per_games(frame, season):
    """Count distinct athlete IDs per Games for the given ``season``.

    Returns a frame with columns ``Games`` and ``ID``, where ``ID`` holds the
    number of participants (one row per athlete per Games is kept before
    counting).
    """
    return (
        frame[frame["Season"] == season]
        .drop_duplicates(subset=["Games", "ID"])     # one participant per Games
        .groupby("Games")["ID"]
        .count()
        .reset_index()
    )


def _participants_line(counts, title):
    """Build the participants-per-Games line chart shared by both seasons."""
    return px.line(
        counts,
        x="Games",
        y="ID",
        title=title,
        markers=True,
        labels={"ID": "Antal deltagare"}
    )


# Number of participants per Games - Summer
ita_summer_participants = _participants_per_games(italydf_anon, "Summer")
fig = _participants_line(
    ita_summer_participants,
    "Antal italienska deltagare per OS - Sommar",
)
fig.show()


# Number of participants per Games - Winter
ita_winter_participants = _participants_per_games(italydf_anon, "Winter")
fig = _participants_line(
    ita_winter_participants,
    "Antal italienska deltagare per OS - Vinter",
)
fig.show()



# Fencing

In [None]:
# Italian rows for the sport "Fencing", as an independent copy.
is_fencing_row = italydf_anon["Sport"] == "Fencing"
fencing = italydf_anon.loc[is_fencing_row].copy()

# Medal-winning fencing rows, reduced to one row per (Games, Event, Medal).
fencing_medal_rows = fencing.loc[fencing["Medal"] != "None"]
fencing_unique_medals = fencing_medal_rows.drop_duplicates(
    subset=["Games", "Event", "Medal"]
)


In [None]:
# Wide table: one row per Year, one column per medal type, cells hold the
# number of medals (ID is only used as something to count).
medals_by_type_fen = pd.pivot_table(
    fencing_unique_medals,
    index="Year",
    columns="Medal",
    values="ID",
    aggfunc="count",
    fill_value=0,
).reset_index()

# Total medals per year: sum whichever of the medal columns are present.
known_medal_types = ["Gold", "Silver", "Bronze"]
medals_type_cols = [
    column for column in medals_by_type_fen.columns if column in known_medal_types
]
medals_by_type_fen["Total"] = medals_by_type_fen[medals_type_cols].sum(axis=1)

fig_fenc_medal = px.bar(
    medals_by_type_fen,
    x="Year",
    y="Total",
    labels={"Total": "Total Medals"},
    title="Italy Fencing - Total Medals per Year",
)
fig_fenc_medal.show()


In [None]:
# Wide table: one row per Year, one column per medal type, cells hold the
# number of medals (ID is only used as something to count).
medals_by_type = pd.pivot_table(
    fencing_unique_medals,
    index="Year",
    columns="Medal",
    values="ID",
    aggfunc="count",
    fill_value=0,
).reset_index()

# Fixed colour per medal type for the stacked bars.
medal_palette = {
    "Gold": "#F6D411",
    "Silver": "#D7D4D4",
    "Bronze": "#CD7532",
}

fig1 = px.bar(
    medals_by_type,
    x="Year",
    y=["Gold", "Silver", "Bronze"],
    color_discrete_map=medal_palette,
    labels={"value": "Number of Medals", "variable": "Medal Type"},
    title="Italy Fencing - Medals per Year",
)
fig1.update_layout(barmode="stack")
fig1.show()


In [None]:
# NOTE(review): this cell rebuilds the same pivot table and redraws the same
# two charts (stacked medals per year, total medals per year) as the two
# preceding fencing cells.

# Wide table: one row per Year, one column per medal type, cells hold the
# number of medals (ID is only used as something to count).
medals_by_type = pd.pivot_table(
    fencing_unique_medals,
    index="Year",
    columns="Medal",
    values="ID",
    aggfunc="count",
    fill_value=0,
).reset_index()

# Fixed colour per medal type for the stacked bars.
stacked_medal_colors = {
    "Gold": "#F6D411",
    "Silver": "#D7D4D4",
    "Bronze": "#CD7532",
}

fig1 = px.bar(
    medals_by_type,
    x="Year",
    y=["Gold", "Silver", "Bronze"],
    color_discrete_map=stacked_medal_colors,
    labels={"value": "Number of Medals", "variable": "Medal Type"},
    title="Italy Fencing - Medals per Year",
)
fig1.update_layout(barmode="stack")
fig1.show()

# Total medals per year: sum whichever of the medal columns are present.
present_medal_types = ["Gold", "Silver", "Bronze"]
medals_type_cols = [
    column for column in medals_by_type.columns if column in present_medal_types
]
medals_by_type["Total"] = medals_by_type[medals_type_cols].sum(axis=1)

fig2 = px.bar(
    medals_by_type,
    x="Year",
    y="Total",
    labels={"Total": "Total Medals"},
    title="Italy Fencing - Total Medals per Year",
)
fig2.show()


In [None]:
# Fencing for all countries.
fencing_all = df[df["Sport"] == "Fencing"].copy()

# One row per (Games, Event, Medal, NOC). NOC is part of the key because this
# table spans every country: without it, when two countries hold the same
# medal in the same event, only the first row is kept and the other country
# loses a medal in the per-country count below.
fencing_all_unique_medals = (
    fencing_all[fencing_all["Medal"] != "None"]
    .drop_duplicates(subset=["Games", "Event", "Medal", "NOC"])
)

# Medal count per NOC code, largest first.
medals_country = (
    fencing_all_unique_medals
    .groupby("NOC")["Medal"]
    .count()
    .reset_index()
    .sort_values("Medal", ascending=False)
)

# Bar chart of the 20 countries with the most fencing medals.
px.bar(
    medals_country.head(20),
    x="NOC",
    y="Medal",
    title="Fencing  Medal Distribution by Country",
    labels={"NOC": "Country", "Medal": "Number of Medals"}
).show()


Italien ligger bland de allra främsta nationerna tillsammans med Frankrike, Ungern och Ryssland, vilket speglar ländernas långa fäktningstraditioner och välutvecklade tekniska skolor. Grafen visar tydligt hur fäktningen är starkt centrerad till några få, historiskt etablerade fäktningsnationer.

In [None]:

# Compare the age distribution of Italian fencers with that of all other
# Italian athletes. The source frame is `italydf_anon` (the Italian subset
# built earlier); the previously used name `ita` is not defined in any
# earlier cell, so this cell raised NameError on a fresh kernel.
is_fencing = italydf_anon["Sport"] == "Fencing"

fencing = italydf_anon[is_fencing].copy()
fencing["Group"] = "Fencing"

other_sports = italydf_anon[~is_fencing].copy()
other_sports["Group"] = "Other sports"

# Stack both groups so plotly can facet on the Group column.
age_compare = pd.concat([fencing, other_sports], ignore_index=True)

# Mean age per group, rounded to one decimal.
mean_age = (
    age_compare.groupby("Group")["Age"]
    .mean()
    .reset_index()
    .round(1)
)

# histnorm="percent" normalises each facet, so the two groups can be compared
# despite their different sizes.
fig = px.histogram(
    age_compare,
    x="Age",
    nbins=30,
    histnorm="percent",
    facet_row="Group",
    title="Age Distribution - Fencing vs Other Italian Sports",
    labels={"Age": "Age", "Group": "Group"}
)
fig.update_layout(
    height=700,
    margin=dict(t=80, b=40),
    font=dict(size=14),
)

fig.update_xaxes(range=[10, 50])
# Let each facet scale its own y-axis.
fig.update_yaxes(matches=None)
fig.show()


Fäktare har en större andel aktiva i högre åldrar jämfört med andra sporter. Det speglar att fäktning är en tekniskt krävande sport där erfarenhet, precision och taktiskt kunnande gör att många når sin topp senare i karriären.

In [None]:
# Medals per event for Italian fencing, largest first.
event_medal_counts = fencing_unique_medals.groupby("Event")["Medal"].count()
medals_by_event = event_medal_counts.sort_values(ascending=False).reset_index()

fig = px.bar(
    medals_by_event,
    x="Event",
    y="Medal",
    labels={"Medal": "Number of Medals", "Event": "Event"},
    title="Italy Fencing - Medals per Event",
)

# Tilt the event names on the x-axis by 45 degrees.
fig.update_xaxes(tickangle=45)
fig.show()
