In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import hashlib
import plotly
import plotly.express as px



Plotly version: 6.4.0


In [2]:

df = pd.read_csv(r"athlete_events.csv")


In [6]:
df = df.fillna({
    'Age': df['Age'].median(),
    'Height': df['Height'].median(),
    'Weight': df['Weight'].median(),
    'ID': df['ID'].median()
})

df = df.astype({'Age': 'uint8', 'Height': 'uint8', 'Weight': 'uint8', 'ID': 'uint32' , 'Year': 'int16'})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271116 entries, 0 to 271115
Data columns (total 15 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   ID      271116 non-null  uint32
 1   Name    271116 non-null  object
 2   Sex     271116 non-null  object
 3   Age     271116 non-null  uint8 
 4   Height  271116 non-null  uint8 
 5   Weight  271116 non-null  uint8 
 6   Team    271116 non-null  object
 7   NOC     271116 non-null  object
 8   Games   271116 non-null  object
 9   Year    271116 non-null  int16 
 10  Season  271116 non-null  object
 11  City    271116 non-null  object
 12  Sport   271116 non-null  object
 13  Event   271116 non-null  object
 14  Medal   39783 non-null   object
dtypes: int16(1), object(10), uint32(1), uint8(3)
memory usage: 23.0+ MB


# ITALIA

In [10]:
italydf = df[df['NOC'] == 'ITA']
italydf

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
158,62,Giovanni Abagnale,M,21,198,90,Italy,ITA,2016 Summer,2016,Summer,Rio de Janeiro,Rowing,Rowing Men's Coxless Pairs,Bronze
197,91,Emanuele Abate,M,27,190,80,Italy,ITA,2012 Summer,2012,Summer,London,Athletics,Athletics Men's 110 metres Hurdles,
198,92,Ignazio Abate,M,21,180,73,Italy,ITA,2008 Summer,2008,Summer,Beijing,Football,Football Men's Football,
214,103,Silvano Abba,M,25,175,70,Italy,ITA,1936 Summer,1936,Summer,Berlin,Modern Pentathlon,Modern Pentathlon Men's Individual,Bronze
218,106,Agostino Abbagnale,M,22,188,96,Italy,ITA,1988 Summer,1988,Summer,Seoul,Rowing,Rowing Men's Quadruple Sculls,Gold
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
270724,135391,Loredana Zugna,F,32,170,55,Italy,ITA,1984 Summer,1984,Summer,Los Angeles,Shooting,"Shooting Women's Sporting Pistol, 25 metres",
270760,135409,Mauro Carlo Zuliani,M,20,175,62,Italy,ITA,1980 Summer,1980,Summer,Moskva,Athletics,Athletics Men's 400 metres,
270761,135409,Mauro Carlo Zuliani,M,20,175,62,Italy,ITA,1980 Summer,1980,Summer,Moskva,Athletics,Athletics Men's 4 x 400 metres Relay,Bronze
270762,135409,Mauro Carlo Zuliani,M,25,175,62,Italy,ITA,1984 Summer,1984,Summer,Los Angeles,Athletics,Athletics Men's 4 x 400 metres Relay,


In [11]:

#Skapar en Name_HASH kolumn med den anonymiserade namn
italydf.insert(loc=2, column="Name_HASH", value = italydf["Name"].apply( lambda x:
                                             hashlib.sha256(x.encode()).hexdigest()
                                             ))

#Skapar en anonym version av df utan namn-kolumn
italydf_anon = italydf.drop(["Name"], axis=1)



In [None]:
italy_fencing = italydf_anon[italydf_anon["Sport"] == "Fencing"].copy()
medals = italy_fencing[italy_fencing["Medal"].notna()].copy()
unique_medals = medals.drop_duplicates(subset=["Year", "Event", "Medal"])

# pivoting created NaNs so had to fillna
medals_by_type = unique_medals.pivot_table(
    index="Year",
    columns="Medal",
    values="ID",
    aggfunc="count"
).fillna(0)


medals_total = medals_by_type.sum(axis=1)


fig1 = px.bar(
    medals_by_type.reset_index(),
    x="Year",
    y=["Gold", "Silver", "Bronze"],
    title="Italy Fencing - Medals per Year",
    labels={"value": "Number of Medals", "variable": "Medal Type"}
)
fig1.update_layout(barmode="stack")
fig1.show()


fig2 = px.line(
    medals_total.reset_index(),
    x="Year",
    y=0,
    markers=True,
    title="Italy Fencing - Total Medals per Year",
    labels={"0": "Total Medals"}
)
fig2.show()

np.float64(127.0)

# Fencing

In [None]:
fencing = df[df["Sport"] == "Fencing"].copy()

medals = fencing[fencing["Medal"].notna()].copy()

# 1 medal per team-event
medals_unique = medals.drop_duplicates(
    subset=["Year", "Event", "Medal"]
)


In [None]:
medals_country = (
    medals_unique.groupby("NOC")["Medal"]
    .count()
    .reset_index()
    .sort_values("Medal", ascending=False)
)

px.bar(
    medals_country.head(20),
    x="NOC",
    y="Medal",
    title="Fencing - Medal Distribution"
).show()


In [29]:
fig = px.histogram(
    fencing,
    x="Age",
    nbins=30,
    marginal="box",
    title="Age Distribution in Fencing"
)
fig.show()


In [30]:
medals_year = (
    medals_unique.groupby("Year")["Medal"]
    .count()
    .reset_index()
)

px.line(
    medals_year,
    x="Year",
    y="Medal",
    markers=True,
    title="Fencing - Medal Count Over Time"
).show()


In [31]:
fencing["Medalist"] = fencing["Medal"].notna()

fig = px.histogram(
    fencing,
    x="Age",
    color="Medalist",
    barmode="overlay",
    nbins=30,
    title="Age Distribution - Medalists vs Non-Medalists"
)
fig.show()
