In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import hashlib
import plotly.express as px

In [None]:

# Load the athlete events table from a CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="athlete_events.csv")


In [None]:
# Replace missing Age/Height/Weight values with the median of each column,
# then downcast the numeric columns to compact integer dtypes.
# NOTE(review): the uint8 cast would discard any fractional part in these
# columns — confirm that the values are whole numbers (or that this is intended).
df = (
    df
    .fillna({column: df[column].median() for column in ("Age", "Height", "Weight")})
    .astype({
        "ID": "uint32",
        "Year": "int16",
        "Age": "uint8",
        "Height": "uint8",
        "Weight": "uint8",
    })
)

# Print a summary of the columns, dtypes and memory usage.
df.info()

In [None]:
# Print the median and then the mean of the Age column.
age_column = df["Age"]
print(age_column.median())
print(age_column.mean())


# ITALIA

In [None]:
def _sha256_hex(value):
    """Return the SHA-256 hex digest of ``str(value)``."""
    return hashlib.sha256(str(value).encode()).hexdigest()


# Replace every athlete name with its SHA-256 digest.
# NOTE(review): the hash is unsalted, and re-running this cell hashes the
# digests again — confirm both are acceptable.
df["Name"] = df["Name"].map(_sha256_hex)

# Rows without a medal get the literal string "None" (= no medal).
df["Medal"] = df["Medal"].fillna("None")

# Independent copy of the rows whose NOC code is "ITA".
italydf_anon = df.loc[df["NOC"].eq("ITA")].copy()

italydf_anon.head()


In [None]:
# Fill any remaining gaps in Age/Height/Weight with the column medians and
# store those three columns as float32 (Year as int16).
# NOTE(review): an earlier cell already applies the same median fill, so the
# fill here may have nothing left to do — confirm this cell is still needed.
measurement_medians = df[["Age", "Height", "Weight"]].median()
df = df.fillna(measurement_medians)
df = df.astype({
    "Year": "int16",
    "Age": "float32",
    "Height": "float32",
    "Weight": "float32",
})
df.info()

In [None]:
# Histogram of ages for the Italian subset.
fig = px.histogram(
    data_frame=italydf_anon,
    x="Age",
    labels={"Age": "Ålder"},
    title="Åldersfördelning - Italienska OS-idrottare",
    nbins=30,
)
fig.show()


In [None]:
# Keep one row per (Games, athlete ID) and take the year from the first four
# characters of the Games label.
ita_unique_df = (
    italydf_anon
    .drop_duplicates(subset=["Games", "ID"])
    .assign(Year=lambda games_df: games_df["Games"].str.slice(stop=4).astype(int))
)

# Count the de-duplicated rows per year, season and sex, sorted by the same keys.
ita_participants_df = (
    ita_unique_df
    .groupby(["Year", "Season", "Sex"], as_index=False)
    .agg(Participants=("ID", "count"))
    .sort_values(["Year", "Season", "Sex"])
)

# One line per sex (colour) and season (dash pattern).
participant_labels = {
    "Participants": "Antal deltagare",
    "Season": "Säsong",
    "Sex": "Kön",
}
fig_ita_participants = px.line(
    data_frame=ita_participants_df,
    x="Year",
    y="Participants",
    color="Sex",
    line_dash="Season",
    markers=True,
    labels=participant_labels,
    title="Antal deltagare per år - Italien (Sommar/Vinter & kön)",
)
fig_ita_participants.show()


In [None]:
def _participants_per_games(athletes, season):
    """Count rows per Games for one season, keeping one row per (Games, ID).

    Returns a frame with the columns "Games" and "ID", where "ID" holds the count.
    """
    in_season = athletes[athletes["Season"] == season]
    one_row_per_athlete = in_season.drop_duplicates(subset=["Games", "ID"])
    return one_row_per_athlete.groupby("Games")["ID"].count().reset_index()


def _participants_line(counts, title):
    """Build a marker line chart of the participant count per Games."""
    return px.line(
        counts,
        x="Games",
        y="ID",
        markers=True,
        labels={"ID": "Antal deltagare"},
        title=title,
    )


# Participants per Olympic Games – Summer
ita_summer_participants = _participants_per_games(italydf_anon, "Summer")
fig = _participants_line(
    ita_summer_participants,
    "Antal italienska deltagare per OS - Sommar",
)
fig.show()

# Participants per Olympic Games – Winter
ita_winter_participants = _participants_per_games(italydf_anon, "Winter")
fig = _participants_line(
    ita_winter_participants,
    "Antal italienska deltagare per OS - Vinter",
)
fig.show()



# Fencing

In [None]:
# Independent copy of the Italian rows whose sport is Fencing.
fencing = italydf_anon.loc[italydf_anon["Sport"].eq("Fencing")].copy()

# Drop rows marked "None" (no medal) and keep a single row per
# (Games, Event, Medal) combination.
fencing_unique_medals = (
    fencing
    .loc[fencing["Medal"].ne("None")]
    .drop_duplicates(subset=["Games", "Event", "Medal"])
)


In [None]:
# Wide table: one row per year, one column per medal type, cells = row counts
# ("ID" is only used as the column to count).
medals_by_type_fen = pd.pivot_table(
    fencing_unique_medals,
    values="ID",
    index="Year",
    columns="Medal",
    aggfunc="count",
    fill_value=0,
).reset_index()

# Total medals per year: sum whichever of the three medal columns are present.
medals_type_cols = [
    column
    for column in medals_by_type_fen.columns
    if column in ("Gold", "Silver", "Bronze")
]
medals_by_type_fen["Total"] = medals_by_type_fen.loc[:, medals_type_cols].sum(axis="columns")

fig_fenc_medal = px.bar(
    data_frame=medals_by_type_fen,
    x="Year",
    y="Total",
    labels={"Total": "Total Medals"},
    title="Italy Fencing - Total Medals per Year",
)
fig_fenc_medal.show()


In [None]:
# Count fencing medal rows per year and medal type ("ID" is only the counted column).
pivot_options = {
    "index": "Year",
    "columns": "Medal",
    "values": "ID",
    "aggfunc": "count",
    "fill_value": 0,
}
medals_by_type = fencing_unique_medals.pivot_table(**pivot_options).reset_index()

# Stacked bars, one segment per medal type, each drawn in a matching colour.
medal_colours = {
    "Gold": "#F6D411",
    "Silver": "#D7D4D4",
    "Bronze": "#CD7532",
}
fig1 = px.bar(
    medals_by_type,
    x="Year",
    y=["Gold", "Silver", "Bronze"],
    color_discrete_map=medal_colours,
    labels={"value": "Number of Medals", "variable": "Medal Type"},
    title="Italy Fencing - Medals per Year",
)
fig1.update_layout(barmode="stack")
fig1.show()


In [None]:
# NOTE(review): this rebuilds the same table and figure as the preceding
# fencing cell — confirm whether the repetition is intentional.
# Year x medal-type table of row counts ("ID" is only the counted column).
medals_by_type = pd.pivot_table(
    data=fencing_unique_medals,
    index="Year",
    columns="Medal",
    values="ID",
    aggfunc="count",
    fill_value=0,
)
medals_by_type = medals_by_type.reset_index()

# Stacked bar chart with one colour per medal type.
fig1 = px.bar(
    data_frame=medals_by_type,
    x="Year",
    y=["Gold", "Silver", "Bronze"],
    title="Italy Fencing - Medals per Year",
    color_discrete_map={"Gold": "#F6D411", "Silver": "#D7D4D4", "Bronze": "#CD7532"},
    labels={"value": "Number of Medals", "variable": "Medal Type"},
).update_layout(barmode="stack")
fig1.show()

# Anonymous Italian frame without any name column.
# The original code read from `italydf`, which is never defined in this
# notebook (NameError on a fresh kernel); the Italian subset is `italydf_anon`.
# errors="ignore" keeps the cell re-runnable once the column is gone.
italydf_anon = italydf_anon.drop(columns=["Name"], errors="ignore")


def _hash_name_once(value):
    """Return a SHA-256 hex digest for ``value`` without hashing twice.

    ``df["Name"]`` is already replaced by SHA-256 digests in the earlier
    anonymisation cell. A value that already looks like such a digest
    (64 lowercase hex characters) is returned unchanged; anything else is
    converted with ``str`` (so non-string values do not raise) and hashed.
    """
    text = str(value)
    already_digest = len(text) == 64 and all(ch in "0123456789abcdef" for ch in text)
    if already_digest:
        return text
    return hashlib.sha256(text.encode()).hexdigest()


# Same idea for the global frame: add a Name_HASH column as the third column.
# DataFrame.insert raises ValueError if the column already exists, so only
# insert on the first run of this cell.
if "Name_HASH" not in df.columns:
    df.insert(loc=2, column="Name_HASH", value=df["Name"].map(_hash_name_once))

# Anonymous global frame: keeps Name_HASH, drops Name.
df_anon = df.drop(columns=["Name"])
df_anon.head()



## Simning

### Medaljer

In [None]:
# Extract the Italian swimming rows.
ita_simn = italydf_anon[italydf_anon["Sport"] == "Swimming"]

# Keep only rows that actually carry a medal.
# Missing medals were replaced by the string "None" in the anonymisation cell,
# so dropna(subset=["Medal"]) no longer removes anything and every non-medal
# entry was being counted as a medal. Filter on both NaN and "None" so the
# result is right whether or not that fill has run.
has_medal = ita_simn["Medal"].notna() & ita_simn["Medal"].ne("None")

# Remove duplicate rows per (Year, Medal, Event, ID).
# NOTE(review): because ID is part of the key, a medal shared by several
# athletes in one event is counted once per athlete — confirm this is intended.
ita_simn_medaljer = (
    ita_simn[has_medal]
    .drop_duplicates(subset=["Year", "Medal", "Event", "ID"])
)

# Number of medal rows per year.
medaljer_år = ita_simn_medaljer.groupby("Year")["Medal"].count().reset_index()


In [None]:

# Line chart of the medal count per year, with axis titles set afterwards.
fig = px.line(
    data_frame=medaljer_år,
    x="Year",
    y="Medal",
    markers=True,
    title="Antal Medaljer för Italien per OS",
).update_layout(
    xaxis_title="OS År",
    yaxis_title="Antal Medaljer",
)

fig.show()

In [None]:
# Break the medal rows down by medal type as well as by year.
medaljer_typ = (
    ita_simn_medaljer
    .groupby(["Year", "Medal"])
    .size()
    .reset_index(name="Count")
)
medaljer_typ.head()

# Bar colours chosen to match each medal.
color_map = dict(Gold="#C6A907", Silver="#C0C0C0", Bronze="#CD7F32")

# Grouped bars: one bar per medal type within each year.
fig = px.bar(
    data_frame=medaljer_typ,
    x="Year",
    y="Count",
    color="Medal",
    barmode="group",
    color_discrete_map=color_map,
    title="Medaljfördelning för Italien i Simning per OS-år",
)
fig.update_layout(xaxis_title="OS År", yaxis_title="Antal Medaljer")
fig.show()

### Ålder

In [None]:
#Först skapar vi ålder-dataset
# Age dataset: swimming rows with a positive age.
# NOTE(review): Age is median-imputed earlier in the notebook, so this filter
# may not exclude any rows — confirm.
ita_swim_age = ita_simn.loc[ita_simn["Age"].gt(0)]

In [None]:
# Histogram of the Italian swimmers' ages.
fig = px.histogram(
    data_frame=ita_swim_age,
    x="Age",
    title="Ålderfördelning för italienska simmare",
    nbins=20,
).update_layout(
    xaxis_title="Simmarens Åldern",
    yaxis_title="Antal Atleter",
)
fig.show()

In [None]:
# Age distribution of the swimmers for each year, as box plots.
fig = px.box(
    data_frame=ita_swim_age,
    x="Year",
    y="Age",
    title="Åldrar av italienska simmare per OS-år",
).update_layout(
    xaxis_title="OS-År",
    yaxis_title="Ålder",
)
fig.show()

### Medaljer vs. Ålder

In [None]:
# Density heatmap of the medal rows over year (x) and age (y).
# NOTE(review): nbinsx is set to the number of rows in the medal frame, not
# the number of distinct years — confirm this is the intended bin count.
year_bin_count = len(ita_simn_medaljer["Year"])
fig = px.density_heatmap(
    data_frame=ita_simn_medaljer,
    x="Year",
    y="Age",
    nbinsx=year_bin_count,
    nbinsy=20,
    color_continuous_scale="plasma",
    title="Medaljvinnande Simmare: Ålder och År (Densitet)",
)
fig.update_layout(xaxis_title="OS År", yaxis_title="Ålder")
fig.show()

### Simning Vs Andra Länder - Hur gick det för Italien

In [None]:
# Compare Italy with the rest of the world.
# Start with every swimming row, from any country, that actually carries a medal.
# Missing medals were replaced by the string "None" in the anonymisation cell,
# so notna() alone is always true and non-medal entries were being counted as
# medals. Exclude both NaN and "None".
is_swimming = df_anon["Sport"] == "Swimming"
has_medal = df_anon["Medal"].notna() & df_anon["Medal"].ne("None")

# NOTE(review): because ID is part of the de-duplication key, a medal shared
# by several athletes in one event is counted once per athlete — confirm this
# is the intended way to count medals per country.
global_simn = (
    df_anon[is_swimming & has_medal]
    .drop_duplicates(subset=["Year", "Event", "Medal", "ID"])
)

# Count the medal rows per country code (NOC).
medaljer_land = (
    global_simn.groupby("NOC")["Medal"]
    .count()
    .reset_index(name="Antal Medaljer")
)

# Colour column so that Italy gets its own colour in the chart.
medaljer_land["Färg"] = medaljer_land["NOC"].apply(lambda x: "blue" if x == "ITA" else "red")

# Sort by medal count and keep the 20 countries with the most medals.
medaljer_land = medaljer_land.sort_values("Antal Medaljer", ascending=False).head(20)

In [None]:
# Bar chart of the top-20 table; bars take their colour from the "Färg" column.
highlight_colours = {"blue": "blue", "red": "red"}
fig = px.bar(
    data_frame=medaljer_land,
    x="NOC",
    y="Antal Medaljer",
    color="Färg",
    color_discrete_map=highlight_colours,
    title="Top 20 Världens Simning Medaljer",
)

# Hide the legend and fix the x-axis category order to the row order of the
# table (sorted by medal count in the previous cell).
noc_order = {"categoryorder": "array", "categoryarray": medaljer_land["NOC"]}
fig.update_layout(
    xaxis_title="Land (NOC)",
    yaxis_title="Antal Medaljer",
    showlegend=False,
    xaxis=noc_order,
)

fig.show()

### Italien: åldersfördelning i simning jämfört med andra sporter

In [None]:
# Italian rows from every sport except swimming, with a positive age.
ita_age_othersports = italydf_anon.loc[
    italydf_anon["Sport"].ne("Swimming") & italydf_anon["Age"].gt(0)
]

# Stack both groups into one frame, tagging each row with its group label.
age_compare = pd.concat([
    group_frame.assign(Group=group_label)
    for group_label, group_frame in (
        ("Swimming", ita_swim_age),
        ("Other Sports", ita_age_othersports),
    )
])

# Box plot per group with every individual point drawn.
fig_age_swim_vs_other = px.box(
    data_frame=age_compare,
    x="Group",
    y="Age",
    points="all",
    title="Age Distribution Swimming vs other Italian Sports",
)

# Last expression of the cell: update_layout returns the figure, which the
# notebook then renders.
fig_age_swim_vs_other.update_layout(
    xaxis_title="Sport",
    yaxis_title="Age Distribution",
)

In [None]:
# Sport and age for every Italian row with a positive age.
ita_all_age = italydf_anon.loc[italydf_anon["Age"] > 0, ["Sport", "Age"]]

unique_sports = ita_all_age["Sport"].unique()

# Every sport is drawn in gray, except swimming, which is highlighted in blue.
color_map = dict.fromkeys(unique_sports, "gray")
color_map["Swimming"] = "blue"

fig_age_all_sports = px.box(
    data_frame=ita_all_age,
    x="Sport",
    y="Age",
    color="Sport",
    color_discrete_map=color_map,
    title="Age Distribution by Sport (Swimming Highlighted)",
)

# Last expression of the cell: update_layout returns the figure, which the
# notebook then renders. The legend is hidden because there are too many categories.
fig_age_all_sports.update_layout(
    xaxis_title="Sport",
    yaxis_title="Age",
    showlegend=False,
)
