In [91]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

## Reading file
take a small sample to work on to avoid any performance issues

In [92]:
# data=pd.read_csv('C:/Users/COMPUMARTS/Downloads/repo/Data Analysis project/TWO_CENTURIES_OF_UM_RACES.csv')
# # data.head()
# df = data.sample(frac=0.1, random_state=42)
# df.to_csv("small_file.csv", index=False)
data=pd.read_csv('small_file.csv')
data.head()
data.shape

(746120, 13)

In [93]:
data.dtypes

Year of event                  int64
Event dates                   object
Event name                    object
Event distance/length         object
Event number of finishers      int64
Athlete performance           object
Athlete club                  object
Athlete country               object
Athlete year of birth        float64
Athlete gender                object
Athlete age category          object
Athlete average speed         object
Athlete ID                     int64
dtype: object

check null values

In [94]:
print(data.isna().sum())
#percentage of missing values
data.isna().mean() * 100

Year of event                     0
Event dates                       0
Event name                        0
Event distance/length           117
Event number of finishers         0
Athlete performance               0
Athlete club                 281998
Athlete country                   0
Athlete year of birth         58739
Athlete gender                    0
Athlete age category          58411
Athlete average speed            16
Athlete ID                        0
dtype: int64


Year of event                 0.000000
Event dates                   0.000000
Event name                    0.000000
Event distance/length         0.015681
Event number of finishers     0.000000
Athlete performance           0.000000
Athlete club                 37.795261
Athlete country               0.000000
Athlete year of birth         7.872594
Athlete gender                0.000000
Athlete age category          7.828633
Athlete average speed         0.002144
Athlete ID                    0.000000
dtype: float64

In [95]:
# replace spaces in columns with _
data.columns = data.columns.str.replace(' ', '_')
data.columns

Index(['Year_of_event', 'Event_dates', 'Event_name', 'Event_distance/length',
       'Event_number_of_finishers', 'Athlete_performance', 'Athlete_club',
       'Athlete_country', 'Athlete_year_of_birth', 'Athlete_gender',
       'Athlete_age_category', 'Athlete_average_speed', 'Athlete_ID'],
      dtype='object')

### Extracting age

In [96]:
# drop all nulls in athlete year of birth
data=data.dropna(subset=['Athlete_year_of_birth'])
data['Athlete_year_of_birth'].isna().mean() * 100

np.float64(0.0)

## Problem
some records have a year of birth bigger than the Year of event
### Actions 
- remove all outliers (make the min age 12 and max 75)
- if the -year of birth- bigger than -year of event- then replace it or remove it all

In [97]:
# to calculate the age in the race time not current time
data['Age'] = data['Year_of_event'] - data['Athlete_year_of_birth']
data['Age'].max()

np.float64(101.0)

In [98]:
 #* fixing problem
condition=data['Athlete_year_of_birth']>data['Year_of_event']
data.loc[condition,['Athlete_year_of_birth','Year_of_event']]=data.loc[condition,['Year_of_event','Athlete_year_of_birth']].values

In [99]:
#calculating age after fixing the problem
data['Age'] = data['Year_of_event'] - data['Athlete_year_of_birth']

In [100]:
#take a look in the new data
data.loc[data['Age'].idxmin(), ['Age', 'Year_of_event', 'Athlete_year_of_birth']]

Age                         1.0
Year_of_event              1982
Athlete_year_of_birth    1981.0
Name: 356168, dtype: object

now set a min and max age

In [101]:
data = data[(data['Age'] >= 12) & (data['Age'] <= 75)]
print(f'MIN Age : {data['Age'].min()}\nMax Age : {data['Age'].max()}')

MIN Age : 12.0
Max Age : 75.0


In [102]:
# here i'm testing some filtering
data[data['Event_name'].str.split('(').str.get(1).str.split(')').str.get(0)=='USA'].head(1)

Unnamed: 0,Year_of_event,Event_dates,Event_name,Event_distance/length,Event_number_of_finishers,Athlete_performance,Athlete_club,Athlete_country,Athlete_year_of_birth,Athlete_gender,Athlete_age_category,Athlete_average_speed,Athlete_ID,Age
0,2016,09.10.2016,Glacial Trail 50 km Race (USA),50km,127,4:56:39 h,*WI,USA,1975.0,M,M40,10.113,480555,41.0


## Standardizing gender and distance

In [103]:
'''standardizing the gender
1 for male 0 for female '''
data=data.drop(data[data['Athlete_gender']=='X'].index)
data['Athlete_gender'] = data['Athlete_gender'].replace({'F': 0, 'M': 1})
data['Athlete_gender']=data['Athlete_gender'].astype('int')

In [104]:
data['Athlete_gender'].unique()
data['Event_distance/length'].unique()

array(['50km', '58km', '85km', ..., '126.8km', '100mi/4stages', '53.1mi'],
      shape=(1867,), dtype=object)

In [105]:
 #* multiply mile(mi) distance in 1,6 to be accurate 
data['Event_distance/length'] = np.where(
    data['Event_distance/length'].str.contains('mi', na=False),               
    data['Event_distance/length'].str.extract(r'(\d+)').astype(float)[0] * 1.6, 
    data['Event_distance/length']                                               
)

In [106]:
 #* Reviewing all unique values in data to take the right action for best accuracy
u=data['Event_distance/length'].unique()

In [107]:
 #* removed all rows contain units another km or mile
data = data[~data['Event_distance/length'].str.contains('h', na=False)]

In [108]:
#* extract only the numeric part from the distance/length column
data['Event_distance/length']=data['Event_distance/length'].str.split(r'[^0-9\.]').str.get(0)
data['Event_distance/length'].unique()

array(['50', '58', '85', ..., '283', '191', '126.8'],
      shape=(1085,), dtype=object)

In [109]:
#change data type to float
data['Event_distance/length']=data['Event_distance/length'].astype('float')

In [110]:
# heat map to understand relations between each column
num_cols=data.select_dtypes('number').columns
corr=data[num_cols].corr()
fig=px.imshow(corr,text_auto=True,title='Correlation heatmap')
fig.update_layout(width=700,height=800)

### **The data we have**
- we have the normal age based on birth date and event year
- clean -Age- column only between **--12 and 75--**
- encoded gender to be **--1 Male--**  , **--0 Female--**
- removed the units form distance and replaces mile(mi) by it equivalent by kilometer(km)
---


## Splitting data into periods to analysis every period


In [111]:
print(f'Max year of event : {data['Year_of_event'].min()}\nMax year of event : {data['Year_of_event'].max()}')

Max year of event : 1875
Max year of event : 2022


In [112]:
# global period
p80_22=data[(data['Year_of_event']>=1980) & (data['Year_of_event']<=2022)]
# decades period
p80_90=data[(data['Year_of_event']>=1980) & (data['Year_of_event']<=1990)]
p90_00=data[(data['Year_of_event']>=1990) & (data['Year_of_event']<=2000)]
p00_10=data[(data['Year_of_event']>=2000) & (data['Year_of_event']<=2010)]
p10_20=data[(data['Year_of_event']>=2010) & (data['Year_of_event']<=2020)]

## Analyzing the mean distances for each decade


In [113]:
# the mean of total distance in every decade
mean_dis_80s=p80_90['Event_distance/length'].mean()
mean_dis_90s=p90_00['Event_distance/length'].mean()
mean_dis_00s=p00_10['Event_distance/length'].mean()
mean_dis_10s=p10_20['Event_distance/length'].mean()

In [114]:
# add them to data frame
decades=['1980s','1990s','2000s','2010s']
means=[mean_dis_80s,mean_dis_90s,mean_dis_00s,mean_dis_10s]
mean_df=pd.DataFrame({'decades':decades,'means':means})
mean_df.head()

Unnamed: 0,decades,means
0,1980s,84.191221
1,1990s,76.882544
2,2000s,76.44358
3,2010s,73.257242


In [115]:
fig=px.pie(names=mean_df['decades'],
       values=mean_df['means'],
       title='<b>Percent of total distance in each decade',
       color=mean_df['decades'],
       color_discrete_map={'1990s':'#576fc7','2000s':'#ffb433','2010s':'#98d8ef'},
       hover_name=mean_df['decades'])
fig.show()



### Now analyzing based on gender

In [116]:
# getting the mean of males and females in each decade
count_male_80s = p80_90[p80_90['Athlete_gender']==1]['Athlete_gender'].count()
                            #shape will return (rows , columns)count [0] will return rows count
mean_male_80s=(count_male_80s/p80_90.shape[0])*100
mean_female_80s = 100 - mean_male_80s


count_male_90s = p90_00[p90_00['Athlete_gender']==1]['Athlete_gender'].count()
                            #shape will return (rows , columns)count [0] will return rows count
mean_male_90s=(count_male_90s/p90_00.shape[0])*100
mean_female_90s = 100 - mean_male_90s

        
count_male_00s = p00_10[p00_10['Athlete_gender'] == 1]['Athlete_gender'].count()
mean_male_00s=(count_male_00s/p00_10.shape[0])*100
mean_female_00s = 100 - mean_male_00s


count_male_10s = p10_20[p10_20['Athlete_gender'] == 1]['Athlete_gender'].count()
mean_male_10s=(count_male_10s/p10_20.shape[0])*100
mean_female_10s = 100 - mean_male_10s



In [117]:
# add means to data frame for visualization
male_means = [round(mean_male_80s,2),round(mean_male_90s,2), round(mean_male_00s,2), round(mean_male_10s,2)]
female_means = [round(mean_female_80s,2),round(mean_female_90s, 2), round(mean_female_00s,2),round(mean_female_10s,2)]

mean_gender = pd.DataFrame({
    #! decade of each gender
    'Decade': decades * 2,  
    #!gender for each decade
    'Gender': ['Male']*4 + ['Female']*4,
    'Mean': male_means + female_means
})
mean_gender

Unnamed: 0,Decade,Gender,Mean
0,1980s,Male,93.85
1,1990s,Male,86.55
2,2000s,Male,82.38
3,2010s,Male,80.14
4,1980s,Female,6.15
5,1990s,Female,13.45
6,2000s,Female,17.62
7,2010s,Female,19.86


In [118]:
fig=px.histogram(mean_gender,
             x='Decade',
             y='Mean',
             color='Gender',
             barmode='group',
             text_auto=True,
             title='<b>Percent of Athletes gender for each decade (Histogram)',
             color_discrete_sequence=px.colors.qualitative.Pastel,
             labels={'Mean':'Percent'}
             
             )
fig.update_layout(yaxis_title='Mean')

In [119]:
fig = px.line(
    mean_gender,
    x='Decade',
    y='Mean',
    color='Gender',
    markers=True,
    title='<b>Percent of Athletes genders each decade (Line)'
)
fig.show()


**Line for each year**

In [120]:
  #! very important
# the data frame for athlete genders for every year
gender_per_year_DF=data[(data['Year_of_event']>=1980)]
gender_count_per_year = gender_per_year_DF.groupby(['Year_of_event', 'Athlete_gender']).size().unstack(fill_value=0).rename(columns={0:'Female_count', 1:'Male_count'}).reset_index()


df_long = gender_count_per_year.melt(id_vars='Year_of_event', 
                                     value_vars=['Male_count', 'Female_count'], 
                                     var_name='Gender', 
                                     value_name='Count')
df_long

Unnamed: 0,Year_of_event,Gender,Count
0,1980,Male_count,1469
1,1981,Male_count,1748
2,1982,Male_count,1928
3,1983,Male_count,2194
4,1984,Male_count,2499
...,...,...,...
81,2018,Female_count,12160
82,2019,Female_count,13748
83,2020,Female_count,4188
84,2021,Female_count,7077


In [121]:
 #! for each year
px.line(df_long
        ,x='Year_of_event'
        ,y='Count'
        ,color='Gender'
        ,title='<b>Athletes genders every year'
        ,markers=True
                )

**Make Gender count(not mean like the histogram cell 38) Years DataFrame for specific Decade**

In [122]:
gender_count_per_year_80s = p80_90.groupby(['Year_of_event', 'Athlete_gender']).size().unstack(fill_value=0).rename(columns={0:'Female_count', 1:'Male_count'}).reset_index()
gender_count_per_year_90s = p90_00.groupby(['Year_of_event', 'Athlete_gender']).size().unstack(fill_value=0).rename(columns={0:'Female_count', 1:'Male_count'}).reset_index()
gender_count_per_year_00s = p00_10.groupby(['Year_of_event', 'Athlete_gender']).size().unstack(fill_value=0).rename(columns={0:'Female_count', 1:'Male_count'}).reset_index()
gender_count_per_year_10s = p10_20.groupby(['Year_of_event', 'Athlete_gender']).size().unstack(fill_value=0).rename(columns={0:'Female_count', 1:'Male_count'}).reset_index()

df_long_80s = gender_count_per_year_80s.melt(id_vars='Year_of_event',value_vars=['Male_count', 'Female_count'],var_name='Gender',value_name='Count')
df_long_90s = gender_count_per_year_90s.melt(id_vars='Year_of_event',value_vars=['Male_count', 'Female_count'],var_name='Gender',value_name='Count')
df_long_00s = gender_count_per_year_00s.melt(id_vars='Year_of_event',value_vars=['Male_count', 'Female_count'],var_name='Gender',value_name='Count')
df_long_10s = gender_count_per_year_10s.melt(id_vars='Year_of_event',value_vars=['Male_count', 'Female_count'],var_name='Gender',value_name='Count')

In [123]:
 #* Choose any decade to display its analysis 

In [124]:
# the decade can change to display each decade we need
px.line(df_long_00s
        ,x='Year_of_event'
        ,y='Count'
        ,color='Gender'
        ,title='<b>Athletes genders every decade'
        ,markers=True
        )

## Now analyzing distance for each gender

In [125]:
 #* calculating mean distances for each gender in every period
mean_distance_male_80s=((p80_90[p80_90['Athlete_gender']==1]['Event_distance/length'].sum())/p80_90['Event_distance/length'].sum())*100
mean_distance_male_90s=((p90_00[p90_00['Athlete_gender']==1]['Event_distance/length'].sum())/p90_00['Event_distance/length'].sum())*100
mean_distance_male_00s=((p00_10[p00_10['Athlete_gender']==1]['Event_distance/length'].sum())/p00_10['Event_distance/length'].sum())*100
mean_distance_male_10s=((p10_20[p10_20['Athlete_gender']==1]['Event_distance/length'].sum())/p10_20['Event_distance/length'].sum())*100

mean_distance_female_80s=100-mean_distance_male_80s
mean_distance_female_90s=100-mean_distance_male_90s
mean_distance_female_00s=100-mean_distance_male_00s
mean_distance_female_10s=100-mean_distance_male_10s


print(mean_distance_female_10s,mean_distance_male_10s)

17.963843986173217 82.03615601382678


In [126]:
males_dis=[round(mean_distance_male_80s,2),round(mean_distance_male_90s,2),round(mean_distance_male_00s,2),round(mean_distance_male_10s,2)]
females_dis=[round(mean_distance_female_80s,2),round(mean_distance_female_90s,2),round(mean_distance_female_00s,2),round(mean_distance_female_10s,2)]

mean_dis=pd.DataFrame({
    'Decades':decades*2,
    'Gender':['male']*4 + ['female']*4,
    'Mean distance':males_dis+females_dis
})
mean_dis

Unnamed: 0,Decades,Gender,Mean distance
0,1980s,male,94.51
1,1990s,male,87.76
2,2000s,male,84.19
3,2010s,male,82.04
4,1980s,female,5.49
5,1990s,female,12.24
6,2000s,female,15.81
7,2010s,female,17.96


In [127]:
px.line(mean_dis,
        x='Decades',
        y='Mean distance',
        color='Gender',
        markers=True
        )

## AVG speed

In [128]:
 #! to delete any nonnumeric record from avg speed column
p80_22['Athlete_average_speed'] = pd.to_numeric(p80_22['Athlete_average_speed'], errors='coerce')

#change data type to get mean for every year
p80_22['Athlete_average_speed'] = p80_22['Athlete_average_speed'].astype('float')

## Problem
before 1995 the AVG speeds unit is deferent from the AVG speed after 1995
after some searching I found that before 1995 it was by (meter/hour) and after 1995 it was by (km/hour)
**solution**
- multiply the AVG Speeds before 1995 by 0.001

In [129]:
conversion_factor = 0.001

p80_22.loc[p80_22['Year_of_event'] <= 1995, 'Athlete_average_speed'] *= conversion_factor

In [130]:
# Data Frame for year and mean avg speed of it
avg_s_80_22 = p80_22.groupby('Year_of_event')['Athlete_average_speed'].mean().reset_index()

In [131]:
px.line(avg_s_80_22
        ,x='Year_of_event'
        ,y='Athlete_average_speed'
        ,title='<b>Average Speed for each year'
        ,labels={
                  'Year_of_event': 'Year',
                  'Athlete_average_speed': 'Average Speeds (km/h)'
              }
        ,markers=True
        ,color_discrete_sequence=["#0077FF"]
        )

In [132]:
p80_22.columns

Index(['Year_of_event', 'Event_dates', 'Event_name', 'Event_distance/length',
       'Event_number_of_finishers', 'Athlete_performance', 'Athlete_club',
       'Athlete_country', 'Athlete_year_of_birth', 'Athlete_gender',
       'Athlete_age_category', 'Athlete_average_speed', 'Athlete_ID', 'Age'],
      dtype='object')

In [133]:
num_of_finishers80_22=p80_22['Event_number_of_finishers'].sum()
num_of_finishers80_22

# p80_22.shape

np.int64(1030425524)

In [134]:
# Data Frame for year and mean avg speed of it
num_of_finishers80_22 = p80_22.groupby('Year_of_event')['Event_number_of_finishers'].sum().reset_index()
# num_of_finishers80_22

In [135]:
px.line(num_of_finishers80_22
        ,x='Year_of_event'
        ,y='Event_number_of_finishers')