In [1]:
# imports
import numpy as np
import pandas as pd

In [2]:
# Load the athletics dataset; the first column of the file becomes the row index.
data_athle = pd.read_csv(
    "data/data_athle.csv",
    header=0,
    sep="\t",  # explicit escape instead of an invisible literal tab character
    decimal=",",  # numbers in the file are parsed with a comma as decimal mark
    index_col=0,
)

In [4]:
# Preview the first five rows of the loaded table.
data_athle.head(n=5)

Unnamed: 0,100m,200m,400m,800m,1500m,5000m,10000m,SemiMarathon,Marathon
Australie,9.93,20.06,44.38,104.4,211.96,775.76,1649.73,3602,7671
Belgique,10.02,20.19,44.78,103.86,214.13,769.71,1612.3,3605,7640
Bresil,10.0,19.89,44.29,101.77,213.25,799.43,1648.12,3573,7565
RoyaumeUni,9.87,19.87,44.36,101.73,209.67,780.41,1638.14,3609,7633
Canada,9.84,20.17,44.44,103.68,211.71,793.96,1656.01,3650,7809


In [5]:
# Summarise the frame: index range, column dtypes, and non-null counts.
data_athle.info()

<class 'pandas.core.frame.DataFrame'>
Index: 27 entries, Australie to USA
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   100m          27 non-null     float64
 1   200m          27 non-null     float64
 2   400m          27 non-null     float64
 3   800m          27 non-null     float64
 4   1500m         27 non-null     float64
 5   5000m         27 non-null     float64
 6   10000m        27 non-null     float64
 7   SemiMarathon  27 non-null     int64  
 8   Marathon      27 non-null     int64  
dtypes: float64(7), int64(2)
memory usage: 2.1+ KB


In [6]:
# Group the event columns by race length; each list holds column names
# of `data_athle` and is used later to select a sub-table.

# sprint events
short_distances = [
    "100m",
    "200m",
    "400m",
]

# middle-distance events
medium_distances = [
    "800m",
    "1500m",
]

# long-distance and road events
long_distances = [
    "5000m",
    "10000m",
    "SemiMarathon",
    "Marathon",
]

In [40]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


def compare_countries(
    df: pd.DataFrame,
    nb_components: int = 1
    ):
    """Score the rows of ``df`` by projecting them onto principal components.

    The columns are standardised with ``StandardScaler`` and then reduced
    with PCA to ``nb_components`` dimensions.

    Parameters
    ----------
    df : pd.DataFrame
        Numeric table with one row per entity to score (its index is kept
        as the index of the result).
    nb_components : int, default 1
        Number of principal components to keep. It is passed straight to
        ``PCA(n_components=...)``.

    Returns
    -------
    list
        ``[scores, explained_variance_ratio]`` where ``scores`` is a
        DataFrame indexed like ``df`` with integer column labels
        ``0 .. nb_components - 1``, sorted by column ``0`` in descending
        order, and ``explained_variance_ratio`` is the array
        ``pca.explained_variance_ratio_``.

    Notes
    -----
    NOTE(review): the orientation of a principal component is a convention
    of the decomposition, so a higher score is not guaranteed to mean
    "better"; confirm the direction against ``pca.components_`` before
    interpreting the ranking.
    """
    # Standardise every column so that no single column dominates the
    # decomposition because of its scale.
    standardized = StandardScaler().fit_transform(df.values)

    # Project onto the requested number of principal components
    # (previously hard-coded to 1, which silently ignored `nb_components`).
    pca = PCA(n_components=nb_components)
    projected = pca.fit_transform(standardized)

    # Build the score table and rank rows by the first component,
    # highest score first.
    country_score_df = pd.DataFrame(
        projected,
        index=df.index
    ).sort_values(by=[0], ascending=False)

    # Return the ranked scores together with the explained-variance ratios.
    return [country_score_df, pca.explained_variance_ratio_]

In [41]:
# Rank countries on the short-distance events.

# keep only the short-distance columns
short_distances_df = data_athle.loc[:, short_distances]

# PCA-based score and the share of variance it explains
score_short, explained_var_ratio = compare_countries(short_distances_df)

In [49]:
# Bar chart of the short-distance scores, one bar per country.
import plotly.express as px

# `update_layout` returns the figure, so the chart is built and labelled in one chain.
fig = px.bar(score_short, y=0).update_layout(
    title=f"Score of countries for short distances (ex_var_ratio: {explained_var_ratio.round(2)})",
    yaxis_title="score",
    xaxis_title="countries",
)
fig.show()

In [50]:
# Rank countries on the medium-distance events.

# keep only the medium-distance columns
medium_distances_df = data_athle.loc[:, medium_distances]

# PCA-based score and the share of variance it explains
score_medium, explained_var_ratio = compare_countries(medium_distances_df)

In [51]:
# Bar chart of the medium-distance scores, one bar per country.
import plotly.express as px

# `update_layout` returns the figure, so the chart is built and labelled in one chain.
fig = px.bar(score_medium, y=0).update_layout(
    title=f"Score of countries for medium distances (ex_var_ratio: {explained_var_ratio.round(2)})",
    yaxis_title="score",
    xaxis_title="countries",
)
fig.show()

In [52]:
# Rank countries on the long-distance events.

# keep only the long-distance columns
long_distances_df = data_athle.loc[:, long_distances]

# PCA-based score and the share of variance it explains
score_long, explained_var_ratio = compare_countries(long_distances_df)

In [53]:
# Bar chart of the long-distance scores, one bar per country.
import plotly.express as px

fig = px.bar(
    score_long,
    y=0
)

# The title previously said "medium distances" (copy-paste slip) although
# this figure plots `score_long`.
fig.update_layout(
    title=f"Score of countries for long distances (ex_var_ratio: {explained_var_ratio.round(2)})",
    yaxis_title="score", 
    xaxis_title="countries"
)
fig.show()