In [65]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from plotly.subplots import make_subplots
import pprint

## Reading file
take a small sample to work on to avoid any performance issues

In [66]:
# data=pd.read_csv('C:/Users/COMPUMARTS/Downloads/repo/Data Analysis project/TWO_CENTURIES_OF_UM_RACES.csv')
# # data.head()
# df = data.sample(frac=0.1, random_state=42)
# df.to_csv("small_file.csv", index=False)
data=pd.read_csv('small_file.csv')
data.head()
data.shape




(746120, 13)

In [67]:
data.dtypes

Year of event                  int64
Event dates                   object
Event name                    object
Event distance/length         object
Event number of finishers      int64
Athlete performance           object
Athlete club                  object
Athlete country               object
Athlete year of birth        float64
Athlete gender                object
Athlete age category          object
Athlete average speed         object
Athlete ID                     int64
dtype: object

check null values

In [68]:
print(data.isna().sum())
#percentage of missing values
data.isna().mean() * 100

Year of event                     0
Event dates                       0
Event name                        0
Event distance/length           117
Event number of finishers         0
Athlete performance               0
Athlete club                 281998
Athlete country                   0
Athlete year of birth         58739
Athlete gender                    0
Athlete age category          58411
Athlete average speed            16
Athlete ID                        0
dtype: int64


Year of event                 0.000000
Event dates                   0.000000
Event name                    0.000000
Event distance/length         0.015681
Event number of finishers     0.000000
Athlete performance           0.000000
Athlete club                 37.795261
Athlete country               0.000000
Athlete year of birth         7.872594
Athlete gender                0.000000
Athlete age category          7.828633
Athlete average speed         0.002144
Athlete ID                    0.000000
dtype: float64

In [69]:
# replace spaces in columns with _
data.columns = data.columns.str.replace(' ', '_')
data.columns

Index(['Year_of_event', 'Event_dates', 'Event_name', 'Event_distance/length',
       'Event_number_of_finishers', 'Athlete_performance', 'Athlete_club',
       'Athlete_country', 'Athlete_year_of_birth', 'Athlete_gender',
       'Athlete_age_category', 'Athlete_average_speed', 'Athlete_ID'],
      dtype='object')

### Extracting age

In [70]:
# drop all nulls in athlete year of birth
data=data.dropna(subset=['Athlete_year_of_birth'])
data['Athlete_year_of_birth'].isna().mean() * 100

np.float64(0.0)

In [71]:
# to calculate the age in the race time not current time
data['Age']=data['Year_of_event']-data['Athlete_year_of_birth']
data.head()

Unnamed: 0,Year_of_event,Event_dates,Event_name,Event_distance/length,Event_number_of_finishers,Athlete_performance,Athlete_club,Athlete_country,Athlete_year_of_birth,Athlete_gender,Athlete_age_category,Athlete_average_speed,Athlete_ID,Age
0,2016,09.10.2016,Glacial Trail 50 km Race (USA),50km,127,4:56:39 h,*WI,USA,1975.0,M,M40,10.113,480555,41.0
1,2010,12.09.2010,Trail Cote d'Opale (FRA),58km,402,7:00:36 h,Veolia Eau,FRA,1957.0,M,M50,8.274,432328,53.0
2,2017,18.03.2017,Ultramaratón Montañas del Quetzal (GUA),85km,146,12:02:02 h,*Flores Peten,GUA,1976.0,M,M40,7.063,541408,41.0
3,2018,07.07.2018,Zermatt Ultra Marathon (SUI),45.6km,530,5:38:46 h,Smrun,SUI,1968.0,F,W45,8.076,243849,50.0
4,1985,31.05.1985,Comrades Marathon - Up Run (RSA),88km,8192,8:55:06 h,Amanzimtoti AC,RSA,1952.0,M,M23,9867.0,1578736,33.0


In [72]:
# here i'm testing some filtering
data[data['Event_name'].str.split('(').str.get(1).str.split(')').str.get(0)=='USA'].head(1)

Unnamed: 0,Year_of_event,Event_dates,Event_name,Event_distance/length,Event_number_of_finishers,Athlete_performance,Athlete_club,Athlete_country,Athlete_year_of_birth,Athlete_gender,Athlete_age_category,Athlete_average_speed,Athlete_ID,Age
0,2016,09.10.2016,Glacial Trail 50 km Race (USA),50km,127,4:56:39 h,*WI,USA,1975.0,M,M40,10.113,480555,41.0


## Standardizing gender and distance

In [73]:
# standardizing the gender 
data=data.drop(data[data['Athlete_gender']=='X'].index)
data['Athlete_gender'] = data['Athlete_gender'].replace({'F': 0, 'M': 1})
data['Athlete_gender']=data['Athlete_gender'].astype('int')


Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



In [74]:
data['Athlete_gender'].unique()
data['Event_distance/length'].unique()

array(['50km', '58km', '85km', ..., '126.8km', '100mi/4stages', '53.1mi'],
      shape=(1867,), dtype=object)

In [75]:
 #* multiply mile(mi) distance in 1,6 to be accurate 
 #  
data['Event_distance/length'] = np.where(
    data['Event_distance/length'].str.contains('mi', na=False),               
    data['Event_distance/length'].str.extract(r'(\d+)').astype(float)[0] * 1.6, 
    data['Event_distance/length']                                               
)

In [76]:
 #* Reviewing all unique values in data to take the right action for best accuracy
u=data['Event_distance/length'].unique().tolist()
# pprint.pprint(u)

In [77]:
 #* removed all rows contain units another km or mile
data = data[~data['Event_distance/length'].str.contains('h', na=False)]

In [78]:
data['Event_distance/length']=data['Event_distance/length'].str.split(r'[^0-9\.]').str.get(0)
data['Event_distance/length'].unique()

array(['50', '58', '85', ..., '283', '191', '126.8'],
      shape=(1085,), dtype=object)

In [79]:
data['Event_distance/length']=data['Event_distance/length'].astype('float')


In [80]:
# heat map to understand relations between each column
num_cols=data.select_dtypes('number').columns
corr=data[num_cols].corr()
fig=px.imshow(corr,text_auto=True,title='Correlation heatmap')
fig.update_layout(width=700,height=800)

### the data we have
- we have the normal age based on birth date and event year
- encoded gender to be --1 Male--  , --0 Female--
- removed the units form distance and replaces mile(mi) by it equivalent by kilometer(km)

## Splitting data into periods to analysis every period


In [81]:
d09_00=data[(data['Year_of_event']>=1990) & (data['Year_of_event']<=2000)]
d00_10=data[(data['Year_of_event']>=2000) & (data['Year_of_event']<=2010)]
d10_20=data[(data['Year_of_event']>=2010) & (data['Year_of_event']<=2020)]
d10_20.head(5)

Unnamed: 0,Year_of_event,Event_dates,Event_name,Event_distance/length,Event_number_of_finishers,Athlete_performance,Athlete_club,Athlete_country,Athlete_year_of_birth,Athlete_gender,Athlete_age_category,Athlete_average_speed,Athlete_ID,Age
0,2016,09.10.2016,Glacial Trail 50 km Race (USA),50.0,127,4:56:39 h,*WI,USA,1975.0,1,M40,10.113,480555,41.0
1,2010,12.09.2010,Trail Cote d'Opale (FRA),58.0,402,7:00:36 h,Veolia Eau,FRA,1957.0,1,M50,8.274,432328,53.0
2,2017,18.03.2017,Ultramaratón Montañas del Quetzal (GUA),85.0,146,12:02:02 h,*Flores Peten,GUA,1976.0,1,M40,7.063,541408,41.0
3,2018,07.07.2018,Zermatt Ultra Marathon (SUI),45.6,530,5:38:46 h,Smrun,SUI,1968.0,0,W45,8.076,243849,50.0
5,2018,09.-13.09.2018,Schwarzwaldlauf (GER),262.0,35,36:07:01 h,,GER,1963.0,1,M55,7.254,272763,55.0


In [82]:
# the mean of total distance in every decade
mean_90s=d09_00['Event_distance/length'].mean()
mean_00s=d00_10['Event_distance/length'].mean()
mean_10s=d10_20['Event_distance/length'].mean()

In [83]:
# add them to data frame
decades=['1990s','2000s','2010s']
means=[mean_90s,mean_00s,mean_10s]
mean_df=pd.DataFrame({'decades':decades,'means':means})
mean_df.head()

Unnamed: 0,decades,means
0,1990s,76.877356
1,2000s,76.437969
2,2010s,73.256316


In [84]:
px.pie(names=mean_df['decades'],
       values=mean_df['means'],
       title='<b>Percent of total distance in each decade',
       color=mean_df['decades'],
       color_discrete_map={'1990s':'#576fc7','2000s':'#ffb433','2010s':'#98d8ef'})

### Now analyzing based on gender

In [85]:
# getting the mean of males and females in each decade
count_male_90s = d09_00[d09_00['Athlete_gender']==1]['Athlete_gender'].count()
                            #shape will return (rows , columns)count [0] will return rows count
mean_male_90s=(count_male_90s/d09_00.shape[0])*100
mean_female_90s = 100 - mean_male_90s

        
count_male_00s = d00_10[d00_10['Athlete_gender'] == 1]['Athlete_gender'].count()
mean_male_00s=(count_male_00s/d00_10.shape[0])*100
mean_female_00s = 100 - mean_male_00s



count_male_10s = d10_20[d10_20['Athlete_gender'] == 1]['Athlete_gender'].count()
mean_male_10s=(count_male_10s/d10_20.shape[0])*100
mean_female_10s = 100 - mean_male_10s



In [86]:
male_means = [round(mean_male_90s,2), round(mean_male_00s,2), round(mean_male_10s,2)]
female_means = [round(mean_female_90s, 2), round(mean_female_00s,2),round(mean_female_10s,2)]

mean_gender = pd.DataFrame({
    #! decade of each gender
    'Decade': decades * 2,  
    #!gender for each decade
    'Gender': ['Male']*3 + ['Female']*3,
    'Mean': male_means + female_means
})
mean_gender.head(6)

Unnamed: 0,Decade,Gender,Mean
0,1990s,Male,86.55
1,2000s,Male,82.39
2,2010s,Male,80.15
3,1990s,Female,13.45
4,2000s,Female,17.61
5,2010s,Female,19.85


In [87]:
fig=px.histogram(mean_gender,
             x='Decade',
             y='Mean',
             color='Gender',
             barmode='group',
             text_auto=True,
             title='<b>present of each gender for each decade',
             color_discrete_sequence=px.colors.qualitative.Pastel,
             labels={'Mean':'Percent'}
             
             )
fig.update_layout(yaxis_title='Mean')

In [88]:
fig = px.line(
    mean_gender,
    x='Decade',
    y='Mean',
    color='Gender',
    markers=True,
    title='<b>Percent of genders each decade'
)
fig.show()


In [89]:
 #! for each year 

## Now analyzing distance for each gender

In [90]:
mean_distance_male_90s=((d09_00[d09_00['Athlete_gender']==1]['Event_distance/length'].sum())/d09_00.shape[0])
mean_distance_male_00s=((d00_10[d00_10['Athlete_gender']==1]['Event_distance/length'].sum())/d00_10.shape[0])
mean_distance_male_10s=((d10_20[d10_20['Athlete_gender']==1]['Event_distance/length'].sum())/d10_20.shape[0])

mean_distance_female_90s=100-mean_distance_male_90s
mean_distance_female_00s=100-mean_distance_male_00s
mean_distance_female_10s=100-mean_distance_male_10s


print(mean_distance_female_10s,mean_distance_male_10s)

46.38438390043032 53.61561609956968


In [91]:
males_dis=[round(mean_distance_male_90s,2),round(mean_distance_male_00s,2),round(mean_distance_male_10s,2)]
females_dis=[round(mean_distance_female_90s,2),round(mean_distance_female_00s,2),round(mean_distance_female_10s,2)]

mean_dis=pd.DataFrame({
    'Decades':decades*2,
    'Gender':['male']*3 + ['female']*3,
    'Mean distance':males_dis+females_dis
})
mean_dis.head()

Unnamed: 0,Decades,Gender,Mean distance
0,1990s,male,59.14
1,2000s,male,56.14
2,2010s,male,53.62
3,1990s,female,40.86
4,2000s,female,43.86


In [92]:
px.line(mean_dis,
        x='Decades',
        y='Mean distance',
        color='Gender',
        markers=True
        )

In [1]:
#