# Assignment 1 - Statistics per country
#### Sports Statistics Sweden

In [42]:
import pandas as pd
import plotly.express as px
import hashlib as hl # For hashing 

#### Reading and exploring data

In [43]:
# Load the complete results table from the CSV file.
athlete_events = pd.read_csv("data/athlete_events.csv")

# Keep only the rows whose NOC code is "SWE".
# The explicit .copy() produces an independent frame, so the in-place
# anonymisation of the Name column done later in the notebook neither touches
# athlete_events nor triggers a SettingWithCopyWarning.
sweden_athletes = athlete_events.loc[athlete_events["NOC"] == "SWE"].copy()

sweden_athletes.head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
725,414,Arvid berg,M,26.0,,,Sweden,SWE,1912 Summer,1912,Summer,Stockholm,Athletics,Athletics Men's Hammer Throw,
726,415,Bjrn Olof Conny berg,M,23.0,181.0,76.0,Sweden,SWE,1992 Winter,1992,Winter,Albertville,Freestyle Skiing,Freestyle Skiing Men's Moguls,
727,416,Nils Georg berg,M,19.0,181.0,78.0,Sweden,SWE,1912 Summer,1912,Summer,Stockholm,Athletics,Athletics Men's Long Jump,Bronze
728,416,Nils Georg berg,M,19.0,181.0,78.0,Sweden,SWE,1912 Summer,1912,Summer,Stockholm,Athletics,Athletics Men's Triple Jump,Silver
729,417,Sara Helena berg,F,17.0,190.0,73.0,Sweden,SWE,1988 Summer,1988,Summer,Seoul,Swimming,Swimming Women's 50 metres Freestyle,


#### **a)** Börja med att anonymisera kolumnen med idrottarnas namn med hashfunktionen SHA-256.

In [44]:
# Anonymise the athletes' names with SHA-256 (hex digest of the UTF-8 encoded name).
# The digests are always computed from the untouched names in athlete_events,
# matched on the row index, so re-running this cell gives the same values
# instead of hashing the previous hashes a second time.
# Note: an unsalted hash of a publicly known name can be reproduced by anyone who
# hashes candidate names, so this is pseudonymisation rather than strong anonymity.
sweden_athletes["Name"] = athlete_events.loc[sweden_athletes.index, "Name"].map(
    lambda name: hl.sha256(name.encode("utf-8")).hexdigest(),
    na_action="ignore",  # keep a missing name missing instead of failing on float NaN
)
sweden_athletes.head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
725,414,29a708e9c2f5e49e3299680bdc89c707c8c258a226a4dc...,M,26.0,,,Sweden,SWE,1912 Summer,1912,Summer,Stockholm,Athletics,Athletics Men's Hammer Throw,
726,415,d712af0c4440a3d677c7b9d6a0fe816d08c40b51f5a00b...,M,23.0,181.0,76.0,Sweden,SWE,1992 Winter,1992,Winter,Albertville,Freestyle Skiing,Freestyle Skiing Men's Moguls,
727,416,a9d3f60fd1cd7e878f36d0bf1c82aea4b071cce2a43f04...,M,19.0,181.0,78.0,Sweden,SWE,1912 Summer,1912,Summer,Stockholm,Athletics,Athletics Men's Long Jump,Bronze
728,416,a9d3f60fd1cd7e878f36d0bf1c82aea4b071cce2a43f04...,M,19.0,181.0,78.0,Sweden,SWE,1912 Summer,1912,Summer,Stockholm,Athletics,Athletics Men's Triple Jump,Silver
729,417,9f1fe5f1cec1ce1cbb1265d84c10c686566e35efa7b36b...,F,17.0,190.0,73.0,Sweden,SWE,1988 Summer,1988,Summer,Seoul,Swimming,Swimming Women's 50 metres Freestyle,


#### Undersök därefter hur det gått för landet i OS genom tiderna. Visualisera exempelvis:
#### **b)** de sporter landet fått flest medaljer i

In [45]:
# Number of medal entries per sport; "count" ignores rows without a medal.
# NOTE(review): every row is one athlete in one event, so team events presumably
# contribute one medal per team member - confirm this is the intended counting.
medals_per_sport = sweden_athletes.groupby("Sport", as_index=False).agg(
    Medal=("Medal", "count")
)

# Rank the sports by medal count (highest first), renumber the rows and keep the top ten.
top_sports = medals_per_sport.sort_values(
    by='Medal', ascending=False, ignore_index=True
).head(10)

# Bar chart of the ten sports with the most medals.
fig = px.bar(
    top_sports,
    x='Sport',
    y='Medal',
    title="Top 10 Sports with the Most Medals for Swedish Athletes",
    labels={'Medal': 'Number of Medals'},
    barmode='overlay',
    template='plotly_white',
)
fig.update_xaxes(tickangle=45)  # slant the sport names so they do not overlap
fig.show()

#### **c)** antal medaljer per OS

In [46]:
# Rows in which a medal was actually won. Missing medals are real NaN values, not
# the string "NaN", so they must be detected with notna(); isin(["NaN"]) never matches.
df_medals_swe = sweden_athletes[sweden_athletes["Medal"].notna()]

# Medal entries per year. count() already skips NaN, and grouping the unfiltered
# frame keeps the years in which Sweden took part without winning a medal (count 0).
# NOTE(review): the grouping key is Year, so a Summer and a Winter edition held in
# the same year would be reported as one number - group by "Games" if per-edition
# figures are wanted.
medals_per_OS = sweden_athletes.groupby("Year", as_index=False)["Medal"].count()

print("Sweden has won the following medals in each olympic game:")
for year, medal_count in zip(medals_per_OS["Year"], medals_per_OS["Medal"]):
    print(f"Year: {year}, Medals: {medal_count}")


Sweden has won the following medals in each olympic game:
Year: 1896, Medals: 0
Year: 1900, Medals: 4
Year: 1906, Medals: 21
Year: 1908, Medals: 84
Year: 1912, Medals: 190
Year: 1920, Medals: 154
Year: 1924, Medals: 81
Year: 1928, Medals: 53
Year: 1932, Medals: 35
Year: 1936, Medals: 45
Year: 1948, Medals: 88
Year: 1952, Medals: 93
Year: 1956, Medals: 49
Year: 1960, Medals: 16
Year: 1964, Medals: 43
Year: 1968, Medals: 25
Year: 1972, Medals: 26
Year: 1976, Medals: 12
Year: 1980, Medals: 43
Year: 1984, Medals: 64
Year: 1988, Medals: 46
Year: 1992, Medals: 42
Year: 1994, Medals: 24
Year: 1996, Medals: 31
Year: 1998, Medals: 7
Year: 2000, Medals: 32
Year: 2002, Medals: 26
Year: 2004, Medals: 12
Year: 2006, Medals: 64
Year: 2008, Medals: 7
Year: 2010, Medals: 18
Year: 2012, Medals: 22
Year: 2014, Medals: 51
Year: 2016, Medals: 28


In [47]:
# Bar chart of the yearly medal counts computed in medals_per_OS.
fig = px.bar(
    medals_per_OS,
    x="Year",
    y="Medal",
    title="Medals for Sweden per Olympic Game",
)
# Put a tick on every second year, starting at 1896, so all years are labelled.
# The call returns the figure, which the notebook displays as the cell output.
fig.update_layout(xaxis_tick0=1896, xaxis_dtick=2)

#### **d)** histogram över åldrar

In [48]:
# Rows of Swedish athletes at the 2016 games.
# Taken from sweden_athletes (already restricted to NOC == "SWE" and anonymised)
# rather than from athlete_events, so this subset carries the hashed names
# instead of the raw ones.
sweden_athletes_2016 = sweden_athletes[sweden_athletes["Year"] == 2016]


In [49]:
# Age distribution of the Swedish athletes at the 2016 games.

# Mean age (one decimal); used both to centre the x-axis and for the marker line.
mean_age_2016 = round(sweden_athletes_2016["Age"].mean(), 1)

fig = px.histogram(
    sweden_athletes_2016,
    x="Age",
    nbins=40,
    title="Age distribution of athletes from SWE in 2016 olympics",
)

# Dashed green vertical line at the mean age.
fig.add_vline(x=mean_age_2016, line_color="green", line_dash="dash", line_width=3)

# Show 20 years on either side of the mean, and leave a thin gap between the bars.
fig.update_layout(
    xaxis_range=[mean_age_2016-20, mean_age_2016+20],
    bargap=0.01,
)

fig.show()

In [50]:
# Age distribution of the Swedish athletes over all Olympic games in the data.

# Mean age (one decimal); the x-axis is centred on it and it is marked in the plot.
mean_age_all = round(sweden_athletes["Age"].mean(), 1)

fig = px.histogram(
    sweden_athletes,
    x="Age",
    title="Age distribution of athletes from SWE in all olympic games",
)

# Restrict the visible x-range to the mean age plus/minus 20 years.
age_window = [mean_age_all-20, mean_age_all+20]
fig.update_xaxes(range=age_window)

# Dashed green vertical line at the mean age.
fig.add_vline(x=mean_age_all, line_color="green", line_dash="dash", line_width=3)

# Thin gap between neighbouring bars.
fig.update_layout(bargap=0.01)

fig.show()

#### **e)** Skapa fler plots för att visualisera flera aspekter kring ert land och dess sportprestationer i OS.

In [51]:
# Number of distinct Swedish athletes per year, split by sex.

# Unique athlete IDs for every (Year, Sex) combination. Despite its name this
# frame holds athlete counts, not medal counts.
medals_count_sex = sweden_athletes.groupby(["Year", "Sex"], as_index=False)["ID"].nunique()

# The frame above already has exactly one row per (Year, Sex), so summing it again
# over the same keys changes nothing; the name is only kept for compatibility.
grouped_medals_count_sex = medals_count_sex

# Grouped bar chart: one bar per sex for each year.
fig = px.bar(
    grouped_medals_count_sex,
    x="Year",
    y="ID",
    color="Sex",
    barmode="group",
    title="Number of Swedish athletes per Olympic year by sex",
    labels={"ID": "Count of athletes"},
)
# Label the y-axis with what is actually counted.
fig.update_yaxes(title_text="Count of athletes")
# Put a tick on every second year, starting at 1896, so all years are labelled.
fig.update_xaxes(tick0=1896, dtick=2)

# Fix the width of every bar to 0.8 x-axis units and keep the gap between groups small.
fig.update_traces(width=0.8)
fig.update_layout(bargap=0.01)

fig.show()

In [84]:
# Medal distribution at the 1912 games, for all countries.

# Number of medal entries per country (NOC), medal type and sex in 1912.
# groupby leaves out rows whose Medal is missing, so only medal rows are counted.
events_1912 = athlete_events[athlete_events["Year"] == 1912]
medal_data_1912 = (
    events_1912
    .groupby(['NOC', 'Medal', 'Sex'])
    .size()
    .rename("Amount of medals")
    .reset_index()
)

# Grouped bars per country, coloured by medal type, one facet row per sex.
# The y-axis is logarithmic.
fig = px.bar(
    medal_data_1912,
    x='NOC',
    y='Amount of medals',
    color='Medal',
    facet_row='Sex',
    barmode='group',
    log_y=True,
    hover_name='NOC',
    hover_data=[],
    labels={"NOC": 'Country'},
    title="Medal Distribution in The Olympic Games in Sweden 1912",
)
fig.show()
