In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [11]:
# Load the cleaned Olympian records (file is expected next to the notebook)
# and print a summary of columns, dtypes and non-null counts.
olympiansCsv = 'OlympiansClean.csv'
df = pd.read_csv(olympiansCsv)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145500 entries, 0 to 145499
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   athlete_id    145500 non-null  int64  
 1   Used name     145500 non-null  object 
 2   Sex           145500 non-null  object 
 3   Born_date     143693 non-null  object 
 4   Born_city     103109 non-null  object 
 5   Born_country  103109 non-null  object 
 6   NOC           145499 non-null  object 
 7   height_cm     106651 non-null  float64
 8   weight_kg     102070 non-null  float64
 9   Died_date     33940 non-null   object 
dtypes: float64(2), int64(1), object(7)
memory usage: 11.1+ MB


In [None]:
# Check how old athletes were when they died and then group them into bins.
# Note: Born_date / Died_date on df are overwritten in place with datetime
# values; strings that cannot be parsed become NaT (errors='coerce').
df['Born_date'] = pd.to_datetime(df['Born_date'],errors='coerce')
df['Died_date'] = pd.to_datetime(df['Died_date'],errors='coerce')
nullValues = df['Born_date'].isnull() | df['Died_date'].isnull()
# Keep only the rows where both dates are known (~ negates the mask)
dfAge = df[~nullValues].copy()
# Age at death in completed calendar years: difference of the years, minus one
# when the death date falls before that year's birthday. Dividing the day count
# by 365 ignores leap days and drifts upward with age, which pushes athletes
# who died shortly before a birthday into the next age group.
born = dfAge['Born_date']
died = dfAge['Died_date']
beforeBirthday = (died.dt.month < born.dt.month) | (
    (died.dt.month == born.dt.month) & (died.dt.day < born.dt.day)
)
dfAge['Age'] = died.dt.year - born.dt.year - beforeBirthday.astype(int)
# Define bins and labels for graph. Bins are left-closed (right=False), so the
# first bin is [0, 30) and must be labelled '<30'. The last edge is infinity so
# that athletes who died aged 100 or more are counted in '60+' instead of being
# turned into NaN and dropped from the chart.
bins = [0,30,40,50,60,np.inf]
labels = ['<30','30-39','40-49','50-59','60+']
# Sort data into specific groups on a new column
dfAge['Age Group'] = pd.cut(dfAge['Age'],bins=bins,labels=labels,right=False, include_lowest=True)
# Graph the results as a bar graph (counts per age-at-death group, in bin order)
fig, ax = plt.subplots()
dfAge['Age Group'].value_counts().sort_index().plot(kind='bar', ax=ax)
ax.set_xlabel('Age Group')
ax.set_ylabel('Count')
ax.set_title('Olympians By Age Group')
plt.show()

In [None]:
# Find the average age (at death), height and weight for each gender.
# NOTE(review): the means are taken over dfAge, i.e. only athletes with both a
# birth and a death date. That is required for Age, but it also restricts the
# height/weight averages to deceased athletes - confirm this is intended.
dfGender = dfAge.groupby('Sex')[['Age','height_cm','weight_kg']].mean().round(2)
# Fix the row order so that the bar colours below always line up with
# Male / Female; a label missing from the data becomes a NaN row.
dfGenderStat = dfGender.reindex(['Male','Female'])

# One entry per subplot: (column, title, y-axis label, headroom above tallest bar)
panels = [
    ('Age', 'Average Age by Gender', 'Age', 10),
    ('height_cm', 'Average Height by Gender (cm)', 'Height', 25),
    ('weight_kg', 'Average Weight by Gender (kg)', 'Weight', 10),
]

fig, axs = plt.subplots(1,len(panels),figsize=(15,6))

for ax, (column, title, ylabel, headroom) in zip(axs, panels):
    ax.bar(dfGenderStat.index,dfGenderStat[column],color=['Blue','Pink'])
    ax.set_title(title)
    ax.set_xlabel('Gender')
    ax.set_ylabel(ylabel)
    # Series.max() skips NaN, unlike the builtin max(), so one missing group
    # cannot turn the axis limit into NaN.
    ax.set_ylim(0,dfGenderStat[column].max() + headroom)
plt.show()

In [46]:
# Average body mass index per country (NOC) and sex.
# Only rows with NOC, height, weight and sex all present are used.
bmiColumns = ['NOC','height_cm','weight_kg','Sex']
dfBMI = df.dropna(subset=bmiColumns)[bmiColumns].copy()
# BMI = weight [kg] / height [m]^2; the height column is in cm, so the ratio
# is scaled by 100^2 = 10000. Values are rounded to 3 decimals per athlete.
heightSquared = dfBMI['height_cm'] * dfBMI['height_cm']
bmiPerAthlete = dfBMI['weight_kg'] / heightSquared * 10000
dfBMI['BMI'] = bmiPerAthlete.round(3)
# Last expression of the cell: the mean BMI table, indexed by (NOC, Sex)
dfBMI.groupby(['NOC','Sex'])[['BMI']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,BMI
NOC,Sex,Unnamed: 2_level_1
Afghanistan,Female,20.877500
Afghanistan,Male,23.430886
Albania,Female,20.703909
Albania,Male,26.183471
Albania Australia,Male,23.588000
...,...,...
Yugoslavia,Male,23.752673
Zambia,Female,21.000300
Zambia,Male,21.891917
Zimbabwe,Female,21.461169
