In [3]:
import pandas as pd
import seaborn as sns
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

# Directory (relative to the notebook) holding the input CSV
base_file_path = '../data/'

# Athlete-level table read from the data directory
df = pd.read_csv(f"{base_file_path}polished3_with_gdp.csv")

# Split the rows on the value of the "Sex" column
male = df.loc[df["Sex"].eq("M")]
female = df.loc[df["Sex"].eq("F")]

# Numeric features that are summarised for every event
used_columns = ["Height", "BMI", "Age", 'GDP']

# One row per Event holding the median of each feature; reset_index() turns
# the group key back into an ordinary "Event" column.
bio_by_sport_male = (
    male
    .groupby(['Event'])[used_columns]
    .median()
    .reset_index()
)
bio_by_sport_female = (
    female
    .groupby(['Event'])[used_columns]
    .median()
    .reset_index()
)

In [4]:
from sklearn.preprocessing import StandardScaler

# Features to standardise (the same four columns listed in `used_columns`)
columns_to_normalize = ['Height', 'BMI', 'Age', 'GDP']

scaler = StandardScaler()

# fit_transform() re-fits the scaler on every call, so each frame is
# standardised with its own statistics. After this cell `scaler` holds
# only the fit from the female frame.
bio_by_sport_male[columns_to_normalize] = scaler.fit_transform(bio_by_sport_male[columns_to_normalize])
bio_by_sport_female[columns_to_normalize] = scaler.fit_transform(bio_by_sport_female[columns_to_normalize])

# A notebook cell renders only its final bare expression, so the male preview
# was previously computed and thrown away. Show both frames explicitly.
display(bio_by_sport_male.head(), bio_by_sport_female.head())

Unnamed: 0,Event,Height,BMI,Age,GDP
0,Archery Women's Individual,-0.064286,0.027448,0.027614,-0.661869
1,Archery Women's Team,-0.064286,0.151684,-0.218003,0.094738
2,"Athletics Women's 1,500 metres",-0.417599,-1.12235,0.273231,-0.91681
3,Athletics Women's 10 kilometres Walk,-0.770912,-0.923719,0.027614,0.052724
4,"Athletics Women's 10,000 metres",-0.947568,-1.422485,0.273231,0.079543


In [7]:
# Distinct event names found in the male per-event table
all_events_male = bio_by_sport_male.loc[:, "Event"].unique()
all_events_male

array(["Archery Men's Individual", "Archery Men's Team",
       "Athletics Men's 1,500 metres", "Athletics Men's 10,000 metres",
       "Athletics Men's 100 metres", "Athletics Men's 110 metres Hurdles",
       "Athletics Men's 20 kilometres Walk", "Athletics Men's 200 metres",
       "Athletics Men's 3,000 metres Steeplechase",
       "Athletics Men's 4 x 100 metres Relay",
       "Athletics Men's 4 x 400 metres Relay",
       "Athletics Men's 400 metres", "Athletics Men's 400 metres Hurdles",
       "Athletics Men's 5,000 metres",
       "Athletics Men's 50 kilometres Walk", "Athletics Men's 800 metres",
       "Athletics Men's Decathlon", "Athletics Men's Discus Throw",
       "Athletics Men's Hammer Throw", "Athletics Men's High Jump",
       "Athletics Men's Javelin Throw", "Athletics Men's Long Jump",
       "Athletics Men's Marathon", "Athletics Men's Pole Vault",
       "Athletics Men's Shot Put", "Athletics Men's Triple Jump",
       "Badminton Men's Doubles", "Badminton Men's

In [10]:
from scipy.spatial.distance import pdist, squareform

# Pairwise Euclidean distances between the per-event feature rows (male table).
# pdist() yields the condensed form; squareform() expands it to a square matrix.
male_features = bio_by_sport_male[used_columns]
condensed_distances = pdist(male_features, metric='euclidean')
dist_matrix = squareform(condensed_distances)
dist_matrix

array([[0.        , 1.22054293, 1.51170404, ..., 3.99430356, 1.21750478,
        1.05452783],
       [1.22054293, 0.        , 2.30725252, ..., 4.22814358, 2.27273139,
        2.16590967],
       [1.51170404, 2.30725252, 0.        , ..., 4.99773553, 1.63913532,
        1.73044052],
       ...,
       [3.99430356, 4.22814358, 4.99773553, ..., 0.        , 4.17193105,
        3.99693227],
       [1.21750478, 2.27273139, 1.63913532, ..., 4.17193105, 0.        ,
        0.34875468],
       [1.05452783, 2.16590967, 1.73044052, ..., 3.99693227, 0.34875468,
        0.        ]])

In [12]:
from sklearn.manifold import MDS

# Two-component MDS that takes the distance matrix as-is ('precomputed');
# random_state is pinned so repeated runs give the same embedding.
mds = MDS(
    dissimilarity='precomputed',
    n_components=2,
    random_state=42,
)

# Fit on the distance matrix, then read the fitted embedding off the estimator
coords = mds.fit(dist_matrix).embedding_

# Label the two embedding columns x / y and reuse the male table's row index
# so the coordinates line up with the events they belong to.
coords_df = pd.DataFrame(
    {'x': coords[:, 0], 'y': coords[:, 1]},
    index=bio_by_sport_male.index,
)

# Plain-text dump of the coordinates
print(coords_df)

            x         y
0    0.116074  0.134504
1    1.013881 -0.487889
2   -1.299037  0.382432
3   -0.355468  1.669601
4   -0.851491  0.181567
..        ...       ...
218 -0.154408  0.032258
219  0.319080 -3.880790
220  0.107654 -3.931509
221 -0.677333  0.900364
222 -0.334980  0.823443

[223 rows x 2 columns]


In [14]:
# Attach the x / y embedding columns to the per-event table. Both frames share
# the same row index, so an index join lines the rows up one-to-one.
bio_by_sport_male_with_coords = bio_by_sport_male.join(coords_df)
bio_by_sport_male_with_coords

Unnamed: 0,Event,Height,BMI,Age,GDP,x,y
0,Archery Men's Individual,-0.078387,-0.091115,0.024227,0.164335,0.116074,0.134504
1,Archery Men's Team,0.191579,-0.057313,-0.267811,1.317771,1.013881,-0.487889
2,"Athletics Men's 1,500 metres",0.056596,-1.212278,-0.559850,-0.653514,-1.299037,0.382432
3,"Athletics Men's 10,000 metres",-0.753301,-1.436511,0.316266,-0.673392,-0.355468,1.669601
4,Athletics Men's 100 metres,-0.078387,-0.278297,-0.559850,-0.690328,-0.851491,0.181567
...,...,...,...,...,...,...,...
218,"Wrestling Men's Middleweight, Greco-Roman",-0.213370,0.678663,0.024227,-0.574132,-0.154408,0.032258
219,"Wrestling Men's Super-Heavyweight, Freestyle",1.271442,3.560912,0.024227,-0.580762,0.319080,-3.880790
220,"Wrestling Men's Super-Heavyweight, Greco-Roman",1.406425,3.560912,0.024227,-0.477921,0.107654,-3.931509
221,"Wrestling Men's Welterweight, Freestyle",-0.888284,0.094689,-0.267811,-0.676245,-0.677333,0.900364


In [15]:
# Largest x coordinate in the embedding (quick look at the horizontal extent)
bio_by_sport_male_with_coords.loc[:, "x"].max()

4.819270420926478

In [16]:
# Save the table with coordinates as CSV; the relative file name resolves
# against the current working directory, and the row index is written too.
bio_by_sport_male_with_coords.to_csv(
    path_or_buf="bio_by_sport_male_with_coords.csv",
)