# Sports Statistics

Below you'll find statistics for the following sports:
- Ice Hockey
- Athletics
- Shooting
- Cross Country Skiing

(These are the four sports in which Sweden has won the most medals.)

In [1]:
import pandas as pd
import plotly_express as px

In [2]:
# Load the athlete-level results table from the project's data folder.
# The path is relative, so the notebook must be run from the directory that contains "data/".
athlete_events = pd.read_csv("data/athlete_events.csv")

In [3]:
# The four sports analysed in this notebook.
selected_sports = ['Ice Hockey', 'Athletics', 'Shooting', 'Cross Country Skiing']

# One independent frame per sport: keep the athlete_events rows whose "Sport"
# column equals the sport's name.
# (The name df_cross_county_skiing is kept as originally spelled.)
df_icehockey = athlete_events.loc[athlete_events["Sport"] == "Ice Hockey"].copy()
df_athletics = athlete_events.loc[athlete_events["Sport"] == "Athletics"].copy()
df_shooting = athlete_events.loc[athlete_events["Sport"] == "Shooting"].copy()
df_cross_county_skiing = athlete_events.loc[athlete_events["Sport"] == "Cross Country Skiing"].copy()

# Combined frame holding the rows of all four selected sports.
df_all_selected_sports = athlete_events.loc[athlete_events["Sport"].isin(selected_sports)].copy()

# Show the row count of every frame; the "=" f-string specifier prints the
# expression text together with its value.
display(
    f"{len(df_icehockey)=}",
    f"{len(df_athletics)=}",
    f"{len(df_shooting)=}",
    f"{len(df_cross_county_skiing)=}",
    f"{len(df_all_selected_sports)=}",
)

'len(df_icehockey)=5516'

'len(df_athletics)=38624'

'len(df_shooting)=11448'

'len(df_cross_county_skiing)=9133'

'len(df_all_selected_sports)=64721'

In [4]:
# Preview the first five rows of the combined four-sport frame.
df_all_selected_sports.head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
10,6,Per Knut Aaland,M,31.0,188.0,75.0,United States,USA,1992 Winter,1992,Winter,Albertville,Cross Country Skiing,Cross Country Skiing Men's 10 kilometres,
11,6,Per Knut Aaland,M,31.0,188.0,75.0,United States,USA,1992 Winter,1992,Winter,Albertville,Cross Country Skiing,Cross Country Skiing Men's 50 kilometres,
12,6,Per Knut Aaland,M,31.0,188.0,75.0,United States,USA,1992 Winter,1992,Winter,Albertville,Cross Country Skiing,Cross Country Skiing Men's 10/15 kilometres Pu...,
13,6,Per Knut Aaland,M,31.0,188.0,75.0,United States,USA,1992 Winter,1992,Winter,Albertville,Cross Country Skiing,Cross Country Skiing Men's 4 x 10 kilometres R...,
14,6,Per Knut Aaland,M,33.0,188.0,75.0,United States,USA,1994 Winter,1994,Winter,Lillehammer,Cross Country Skiing,Cross Country Skiing Men's 10 kilometres,


#### Medal distribution between countries in the sports

In [5]:
# Count medals per country (NOC), sport and medal type.
#
# The source data has one row per athlete and event, so every member of a
# medal-winning team or relay carries their own "Medal" entry. Counting rows
# directly would count a team medal once per squad member (e.g. a whole ice
# hockey roster). Rows are therefore reduced to one per Games/Event/NOC/Medal
# before counting.
# NOTE(review): two athletes from the same NOC sharing the same medal in one
# individual event (a tie) are also collapsed to one — confirm this is acceptable.
medal_distribution = (
    df_all_selected_sports
    .dropna(subset=['Medal'])  # keep medal-winning entries only
    .drop_duplicates(subset=['Games', 'Event', 'NOC', 'Medal'])
    .groupby(['NOC', 'Sport', 'Medal'])
    .size()
    .reset_index(name="Number of Medals")
)

# Grouped bars of the medal counts per NOC, one colour per sport.
# px.histogram sums "Number of Medals" over the medal types for each NOC/sport;
# the y-axis is logarithmic.
fig = px.histogram(
    medal_distribution,
    x="NOC",
    y="Number of Medals",
    log_y=True,
    color='Sport',
    title="Medal Distribution in All Olympic Games",
    barmode='group',
)

# Making spaces between bars
fig.update_layout(bargap=0.01)

fig.show()

In [6]:
# Sunburst of the medal counts: sport as the first level, NOC as the second.
px.sunburst(
    medal_distribution,
    path=['Sport', 'NOC'],
    values='Number of Medals',
)

#### Age distribution in the sports

In [7]:
# Mean age over every row of the four selected sports, rounded to one decimal.
# It centres the x-axis window and positions the reference line below.
mean_age_all = round(df_all_selected_sports["Age"].mean(), 1)

# Number of rows (athlete/event entries) for each sport and age.
age_distribution = (
    df_all_selected_sports
    .groupby(['Sport', 'Age'])
    .size()
    .reset_index(name="Number of Athletes")
)

# Grouped bars of the age counts, one colour per sport, on a logarithmic y-axis.
# The x-axis shows a window of 20 years on either side of the overall mean age.
fig = px.histogram(
    age_distribution,
    x="Age",
    y="Number of Athletes",
    color='Sport',
    barmode='group',
    nbins=80,
    log_y=True,
    range_x=[mean_age_all-20, mean_age_all+20],
    title="Age distribution in All Olympic Games",
)

# Dashed green vertical line marking the overall mean age.
fig.add_vline(
    x=mean_age_all,
    line_width=3,
    line_dash="dash",
    line_color="green",
    annotation_text="Mean age all sports",
)

# Small gap between neighbouring bars.
fig.update_layout(bargap=0.01)

fig.show()

In [8]:
# Sunburst of the age counts: sport as the first level, age as the second.
px.sunburst(
    age_distribution,
    path=['Sport', 'Age'],
    values='Number of Athletes',
)

#### Create more plots to visualize different aspects of the sports.

#### Dataframe for countries, continents and coordinates

#### Countries

In [9]:
# NOC lookup table; its "region" column is renamed to "Country" so it can be
# joined with the coordinates table on that column.
noc_regions = pd.read_csv("data/noc_regions.csv").rename(columns={"region": "Country"})

# Country and continent coordinates used by the map plots.
country_continent_coordinates = pd.read_csv("data/country_continent_coordinates.csv")

In [10]:
# Attach continent and coordinates to every NOC via the shared "Country" column.
# This is an inner join (the merge default), so NOCs whose country has no match
# in the coordinates table are dropped.
merged_df = pd.merge(noc_regions, country_continent_coordinates, on="Country")

In [11]:
# Join the country/continent/coordinate columns onto every athlete row via "NOC".
# Inner join (the merge default): athlete rows whose NOC is absent from merged_df
# are not carried over.
base_final_df = pd.merge(merged_df, athlete_events, on="NOC")

# Preview the first five rows of the joined frame.
base_final_df.head()

Unnamed: 0,NOC,Country,notes,Continent,Country_latitude,Country_longitude,Continent_latitude,Continent_longitude,ID,Name,...,Height,Weight,Team,Games,Year,Season,City,Sport,Event,Medal
0,AFG,Afghanistan,,Asia,33.93911,67.709953,34.0479,100.6197,502,Ahmad Shah Abouwi,...,,,Afghanistan,1956 Summer,1956,Summer,Melbourne,Hockey,Hockey Men's Hockey,
1,AFG,Afghanistan,,Asia,33.93911,67.709953,34.0479,100.6197,1076,Jammal-ud-Din Affendi,...,,,Afghanistan,1936 Summer,1936,Summer,Berlin,Hockey,Hockey Men's Hockey,
2,AFG,Afghanistan,,Asia,33.93911,67.709953,34.0479,100.6197,1101,Mohammad Anwar Afzal,...,,,Afghanistan,1948 Summer,1948,Summer,London,Football,Football Men's Football,
3,AFG,Afghanistan,,Asia,33.93911,67.709953,34.0479,100.6197,1745,Mohammad Aktar,...,156.0,48.0,Afghanistan,1980 Summer,1980,Summer,Moskva,Wrestling,"Wrestling Men's Light-Flyweight, Freestyle",
4,AFG,Afghanistan,,Asia,33.93911,67.709953,34.0479,100.6197,4628,Mohammad Daoud Anwary,...,,,Afghanistan,1964 Summer,1964,Summer,Tokyo,Wrestling,"Wrestling Men's Bantamweight, Freestyle",


#### Dataframe for participants and medals per year

In [27]:
# Per Games (Year + Season) and country: number of distinct athlete names and
# number of non-null "Medal" entries. Continent and coordinate columns are part
# of the grouping key only so that they are carried into the result.
# NOTE(review): "Medal" counts medal-winning rows, so each member of a medal-winning
# team contributes one count.
participants_medals = (
    base_final_df
    .groupby(
        [
            'Year', 'Season', 'Country', 'Continent',
            'Country_latitude', 'Country_longitude',
            'Continent_latitude', 'Continent_longitude',
        ],
        as_index=False,
    )
    .agg(Name=('Name', 'nunique'), Medal=('Medal', 'count'))
)


In [28]:
# Preview the first five rows: "Name" holds the distinct-name count and "Medal" the medal-entry count.
participants_medals.head()

Unnamed: 0,Year,Season,Country,Continent,Country_latitude,Country_longitude,Continent_latitude,Continent_longitude,Name,Medal
0,1896,Summer,Australia,Oceania,-25.274398,133.775136,-14.235,141.2551,1,3
1,1896,Summer,Austria,Europe,47.516231,14.550072,54.526,15.2551,3,5
2,1896,Summer,Denmark,Europe,56.26392,9.501785,54.526,15.2551,3,6
3,1896,Summer,France,Europe,46.227638,2.213749,54.526,15.2551,12,11
4,1896,Summer,Germany,Europe,51.165691,10.451526,54.526,15.2551,19,32


In [38]:
# All three frames sum the per-Games "Name" and "Medal" counts of participants_medals,
# so an athlete is counted once for every Games they appear in.

# Totals per country over all years.
participants_medals_sum = (
    participants_medals
    .groupby(
        ['Country', 'Continent', 'Country_latitude', 'Country_longitude',
         'Continent_latitude', 'Continent_longitude'],
        as_index=False,
    )[['Name', 'Medal']]
    .sum()
)

# Totals per continent over all years.
participants_medals_sum_continent = (
    participants_medals
    .groupby(['Continent', 'Continent_latitude', 'Continent_longitude'], as_index=False)[['Name', 'Medal']]
    .sum()
)

# Totals per year and country (the seasons of one year are added together).
participants_medals_sum_year = (
    participants_medals
    .groupby(
        ['Year', 'Country', 'Continent', 'Country_latitude', 'Country_longitude',
         'Continent_latitude', 'Continent_longitude'],
        as_index=False,
    )[['Name', 'Medal']]
    .sum()
)



In [31]:
# Bubble map with one marker per country, totals over all years:
# marker size = summed participant count ("Name"), colour = summed medal count ("Medal").
fig = px.scatter_mapbox(
    participants_medals_sum,
    lat="Country_latitude",
    lon="Country_longitude",
    size="Name",
    color="Medal",
    hover_name="Country",
    height=800,
    width=900,
)

# The title names the aggregation level so the map can be told apart from the
# other maps in this notebook (the previous title ended in a stray "=").
fig.update_layout(
    mapbox_style="open-street-map",
    title="Participants and medals per country (size = participants, colour = medals)",
)
display(fig)

In [36]:
# Bubble map with one marker per continent, totals over all years:
# marker size = summed participant count ("Name"), colour = summed medal count ("Medal").
fig = px.scatter_mapbox(
    participants_medals_sum_continent,
    lat="Continent_latitude",
    lon="Continent_longitude",
    size="Name",
    color="Medal",
    hover_name="Continent",
    height=800,
    width=900,
)

# The title names the aggregation level so the map can be told apart from the
# other maps in this notebook (the previous title ended in a stray "=").
fig.update_layout(
    mapbox_style="open-street-map",
    title="Participants and medals per continent (size = participants, colour = medals)",
)
display(fig)

In [39]:
# Bubble map per country and year.
# participants_medals_sum_year has one row per (Year, Country); without an
# animation frame every year's marker is drawn at the same coordinates and the
# year dimension cannot be read. Each year therefore becomes one animation frame.
fig = px.scatter_mapbox(
    participants_medals_sum_year,
    lat="Country_latitude",
    lon="Country_longitude",
    size="Name",
    color="Medal",
    hover_name="Country",
    animation_frame="Year",
    # Fixed colour range so that colours are comparable between frames.
    range_color=[0, participants_medals_sum_year["Medal"].max()],
    height=800,
    width=900,
)

# The title names the aggregation level (the previous title ended in a stray "=").
fig.update_layout(
    mapbox_style="open-street-map",
    title="Participants and medals per country and year (size = participants, colour = medals)",
)
display(fig)

In [40]:
# Bubble map per continent and year.
# participants_medals_sum_year is at country level; plotting it at the continent
# coordinates stacks one marker per country and year on the same point. The rows
# are therefore summed to one row per (Year, Continent) first.
participants_medals_continent_year = (
    participants_medals_sum_year
    .groupby(['Year', 'Continent', 'Continent_latitude', 'Continent_longitude'], as_index=False)[['Name', 'Medal']]
    .sum()
)

fig = px.scatter_mapbox(
    participants_medals_continent_year,
    lat="Continent_latitude",
    lon="Continent_longitude",
    size="Name",
    color="Medal",
    hover_name="Continent",
    # One animation frame per year, so years are not drawn on top of each other.
    animation_frame="Year",
    # Fixed colour range so that colours are comparable between frames.
    range_color=[0, participants_medals_continent_year["Medal"].max()],
    height=800,
    width=900,
)

# The title names the aggregation level (the previous title ended in a stray "=").
fig.update_layout(
    mapbox_style="open-street-map",
    title="Participants and medals per continent and year (size = participants, colour = medals)",
)
display(fig)

#### Dataframe for sports per year and country

In [15]:
# Number of distinct sports each country has rows for, per Games (Year + Season).
# Continent and coordinate columns are grouping keys only so that they are kept
# in the result.
grouped_sports = (
    base_final_df
    .groupby(
        [
            'Year', 'Season', 'Country', 'Continent',
            'Country_latitude', 'Country_longitude',
            'Continent_latitude', 'Continent_longitude',
        ],
        as_index=False,
    )
    .agg(Sport=('Sport', 'nunique'))
)




In [16]:
# Preview the first five rows; "Sport" holds the number of distinct sports.
grouped_sports.head()

Unnamed: 0,Year,Season,Country,Continent,Country_latitude,Country_longitude,Continent_latitude,Continent_longitude,Sport
0,1896,Summer,Australia,Oceania,-25.274398,133.775136,-14.235,141.2551,2
1,1896,Summer,Austria,Europe,47.516231,14.550072,54.526,15.2551,3
2,1896,Summer,Denmark,Europe,56.26392,9.501785,54.526,15.2551,5
3,1896,Summer,France,Europe,46.227638,2.213749,54.526,15.2551,6
4,1896,Summer,Germany,Europe,51.165691,10.451526,54.526,15.2551,6


#### Grouped gender data per country

In [17]:
# Distinct athlete names per year, sex and country, stored directly in a
# "Count" column. Continent and coordinate columns are grouping keys only so
# that they are kept in the result.
grouped_gender_country = (
    base_final_df
    .groupby(
        [
            'Year', 'Sex', 'Country', 'Continent',
            'Country_latitude', 'Country_longitude',
            'Continent_latitude', 'Continent_longitude',
        ],
        as_index=False,
    )
    .agg(Count=('Name', 'nunique'))
)


In [18]:
# Preview the first five rows; "Count" is the number of distinct athlete names.
grouped_gender_country.head()

Unnamed: 0,Year,Sex,Country,Continent,Country_latitude,Country_longitude,Continent_latitude,Continent_longitude,Count
0,1896,M,Australia,Oceania,-25.274398,133.775136,-14.235,141.2551,1
1,1896,M,Austria,Europe,47.516231,14.550072,54.526,15.2551,3
2,1896,M,Denmark,Europe,56.26392,9.501785,54.526,15.2551,3
3,1896,M,France,Europe,46.227638,2.213749,54.526,15.2551,12
4,1896,M,Germany,Europe,51.165691,10.451526,54.526,15.2551,19


In [19]:
# Add a "Ratio" column: each sex's share of the country's athletes in that year,
# i.e. the row's Count divided by the summed Count of its (Year, Country) group.
# A group that contains only one sex therefore gets 1.0.
# TODO: add the source reference (the original comment only held the placeholder "Källa" = "source").
country_year_total = grouped_gender_country.groupby(['Year', 'Country'])['Count'].transform('sum')
grouped_gender_country['Ratio'] = grouped_gender_country['Count'] / country_year_total


#### Grouped gender data per continent

In [20]:
# Distinct athlete names per year, sex and continent, stored directly in a
# "Count" column. The continent coordinates are grouping keys only so that they
# are kept in the result.
grouped_gender_continent = (
    base_final_df
    .groupby(
        ['Year', 'Sex', 'Continent', 'Continent_latitude', 'Continent_longitude'],
        as_index=False,
    )
    .agg(Count=('Name', 'nunique'))
)


In [21]:
# Add a "Ratio" column: each sex's share of the continent's athletes in that year,
# i.e. the row's Count divided by the summed Count of its (Year, Continent) group.
continent_year_total = grouped_gender_continent.groupby(['Year', 'Continent'])['Count'].transform('sum')
grouped_gender_continent['Ratio'] = grouped_gender_continent['Count'] / continent_year_total

In [22]:
# Preview the first five rows, including the "Ratio" column.
grouped_gender_continent.head()

Unnamed: 0,Year,Sex,Continent,Continent_latitude,Continent_longitude,Count,Ratio
0,1896,M,Europe,54.526,15.2551,151,1.0
1,1896,M,North America,54.526,-105.2551,14,1.0
2,1896,M,Oceania,-14.235,141.2551,1,1.0
3,1900,F,Europe,54.526,15.2551,15,0.014691
4,1900,F,North America,54.526,-105.2551,7,0.081395


#### Grouped gender data total for all countries 

In [23]:
# Distinct athlete names per year and sex across all countries, stored directly
# in a "Count" column.
grouped_gender_total = (
    base_final_df
    .groupby(['Year', 'Sex'], as_index=False)
    .agg(Count=('Name', 'nunique'))
)

In [24]:
# Add a "Ratio" column: each sex's share of all athletes in that year,
# i.e. the row's Count divided by the summed Count of its Year group.
year_total = grouped_gender_total.groupby(['Year'])['Count'].transform('sum')
grouped_gender_total['Ratio'] = grouped_gender_total['Count'] / year_total

In [25]:
# Preview the first five rows, including the "Ratio" column.
grouped_gender_total.head()

Unnamed: 0,Year,Sex,Count,Ratio
0,1896,M,166,1.0
1,1900,F,22,0.019713
2,1900,M,1094,0.980287
3,1904,F,6,0.009317
4,1904,M,638,0.990683
