## Query similar cyclists
Draft version: given a rider, find the riders with the most similar attribute ratings using k-nearest neighbours (KNN).

In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors

In [2]:
# Load the rider table from the Excel workbook (relative path).
cyclists_xlsx = "../data/cyclists_2021.xlsx"
df_cyclists = pd.read_excel(cyclists_xlsx)

In [3]:
# Preview the first five rows of the raw table.
df_cyclists.head(n=5)

Unnamed: 0,gene_sz_lastname,gene_sz_firstname,country,gene_i_birthdate,gene_i_size,gene_i_weight,charac_i_plain,charac_i_mountain,charac_i_downhilling,charac_i_cobble,charac_i_timetrial,charac_i_prologue,charac_i_sprint,charac_i_acceleration,charac_i_endurance,charac_i_resistance,charac_i_recuperation,charac_i_hill,charac_i_baroudeur
0,Rebellin,Davide,Italy,19710809,171,63,65,66,65,57,57,57,62,67,66,66,63,68,65
1,Zeits,Andrey,Kazakhstan,19861214,189,73,69,74,68,58,69,67,61,66,73,70,74,71,69
2,Valverde,Alejandro,Spain,19800425,177,61,72,79,75,73,70,72,70,75,76,76,74,81,66
3,Madouas,Valentin,France,19960712,178,71,74,75,69,75,67,68,69,73,76,73,75,76,76
4,Seigle,Romain,France,19941011,169,63,74,71,69,63,71,69,74,73,69,69,68,73,69


In [4]:
# The leading columns hold rider metadata (name, country, birthdate, size, weight);
# copy them so that columns added later do not touch the raw frame.
n_meta_columns = 6
df_cyclists_meta = df_cyclists.iloc[:, :n_meta_columns].copy()

In [5]:
# Build the "<first name> <last name>" label used to look riders up by name.
first_names = df_cyclists_meta["gene_sz_firstname"]
last_names = df_cyclists_meta["gene_sz_lastname"]
df_cyclists_meta["fullname"] = first_names + " " + last_names

In [6]:
# Age in completed years, computed with calendar arithmetic.
# Dividing an elapsed day count by 365 ignores leap days and therefore reports
# a rider as one year older during the last few days before each birthday.
# NOTE: the result depends on the date on which the notebook is run.
today = pd.Timestamp.today().normalize()
birthdates = pd.to_datetime(df_cyclists_meta["gene_i_birthdate"], format="%Y%m%d")

# True for riders whose birthday has not yet occurred in the current year.
birthday_pending = (birthdates.dt.month > today.month) | (
    (birthdates.dt.month == today.month) & (birthdates.dt.day > today.day)
)

age = today.year - birthdates.dt.year - birthday_pending.astype(int)
df_cyclists_meta["age"] = age.astype(int)

In [7]:
# Preview the metadata frame, now including the derived "fullname" and "age" columns.
df_cyclists_meta.head(n=5)

Unnamed: 0,gene_sz_lastname,gene_sz_firstname,country,gene_i_birthdate,gene_i_size,gene_i_weight,fullname,age
0,Rebellin,Davide,Italy,19710809,171,63,Davide Rebellin,51
1,Zeits,Andrey,Kazakhstan,19861214,189,73,Andrey Zeits,35
2,Valverde,Alejandro,Spain,19800425,177,61,Alejandro Valverde,42
3,Madouas,Valentin,France,19960712,178,71,Valentin Madouas,26
4,Seigle,Romain,France,19941011,169,63,Romain Seigle,27


In [8]:
# Everything after the six metadata columns is a rider attribute (charac_i_*).
# Collect those columns into a single float32, C-contiguous matrix with one row per rider.
attribute_columns = df_cyclists.iloc[:, 6:]
cyclists_embeddings = np.ascontiguousarray(attribute_columns.values, dtype=np.float32)

In [9]:
# Inspect the embedding matrix (float32, one row per rider).
cyclists_embeddings

array([[65., 66., 65., ..., 63., 68., 65.],
       [69., 74., 68., ..., 74., 71., 69.],
       [72., 79., 75., ..., 74., 81., 66.],
       ...,
       [51., 50., 55., ..., 50., 50., 55.],
       [53., 51., 56., ..., 51., 52., 55.],
       [57., 52., 58., ..., 53., 55., 57.]], dtype=float32)

In [10]:
# Maximum age (inclusive) of the riders to search among; None disables the filter.
age_max = 30

# Select the candidate population as *positional* row numbers. The indices are
# used to slice a NumPy array and with DataFrame.iloc, both of which are
# position-based, so index labels must not be used here: they only coincide
# with positions while the frame has a default RangeIndex.
if age_max is not None:
    in_population = (df_cyclists_meta["age"] <= age_max).to_numpy()
else:
    in_population = np.ones(len(df_cyclists_meta), dtype=bool)

population_indices = pd.Index(np.flatnonzero(in_population))

# Embedding rows of the candidate population, in the same order as population_indices.
cyclists_embeddings_population = cyclists_embeddings[population_indices]

In [11]:
# Plain Python list of full names; list position equals the rider's row position.
all_riders = df_cyclists_meta.fullname.tolist()

In [12]:
def display_rider_info(idx, D=None, k_i=0, df_meta=None, arr_emb=None, n_just=25):
    """Print a one-line summary (name, age, attribute vector) for one rider.

    Parameters
    ----------
    idx : int
        Positional row of the rider in both ``df_meta`` and ``arr_emb``.
    D : sequence of float, optional
        Distances of the neighbours being listed. When given, ``D[k_i]``
        (rounded to 4 decimals) is appended to the printed line.
    k_i : int, default 0
        Position in ``D`` of the distance that belongs to this rider.
    df_meta : pandas.DataFrame, optional
        Frame with ``fullname`` and ``age`` columns. Defaults to the
        notebook-level ``df_cyclists_meta``.
    arr_emb : numpy.ndarray, optional
        Attribute matrix, row-aligned with ``df_meta``. Defaults to the
        notebook-level ``cyclists_embeddings``.
    n_just : int, default 25
        Width to which the rider name is right-justified.

    Returns
    -------
    None
        The summary is written to stdout.
    """
    # Resolve the defaults at call time. Binding the notebook globals in the
    # signature would freeze whichever objects existed when this cell was run,
    # so re-running an upstream cell would leave the function using stale data.
    if df_meta is None:
        df_meta = df_cyclists_meta
    if arr_emb is None:
        arr_emb = cyclists_embeddings

    name = df_meta.fullname.iloc[idx].rjust(n_just)
    line = f"{name} | age: {df_meta.age.iloc[idx]} | stats: {arr_emb[idx]}"
    if D is not None:
        line += " | distance = " + str(np.round(D[k_i], 4))

    print(line)

In [13]:
# Rider to query; must exactly match a "fullname" value ("<first name> <last name>").
rider = "Wout Van Aert"

In [14]:
# Stop early with a readable message when the requested name is unknown.
assert rider in all_riders, "Selected rider not in database, sorry!"

# Row position of the rider in the full (unfiltered) table.
rider_i = all_riders.index(rider)

# Index with a one-element list so the query keeps two dimensions (1 x n_attributes).
rider_embedding = cyclists_embeddings[[rider_i], :]

In [15]:
# Number of similar riders to retrieve.
k = 10

In [16]:
# Request one neighbour more than k: when the queried rider belongs to the
# population they are returned as their own closest match and take up a slot.
# Cap the request at the population size, because kneighbors raises a
# ValueError when asked for more neighbours than there are fitted samples
# (which happens when a strict age filter leaves only a few riders).
n_neighbors = min(k + 1, len(cyclists_embeddings_population))

neigh = NearestNeighbors(n_neighbors=n_neighbors)
neigh.fit(cyclists_embeddings_population)

# D: distances to the neighbours; I: their row positions within
# cyclists_embeddings_population. Both have shape (1, n_neighbors).
D, I = neigh.kneighbors(rider_embedding, return_distance=True)

In [17]:
# Reference line: the queried rider, looked up in the full table.
display_rider_info(rider_i)
print("-"*120)

# Metadata rows of the searched population, aligned with cyclists_embeddings_population.
df_meta_population = df_cyclists_meta.iloc[population_indices]

# Keep every hit except the queried rider, then take the k closest.
# Dropping the first hit unconditionally is only correct when the rider is part
# of the population; if the age filter excluded them, the first hit is a genuine
# neighbour and must be shown.
neighbour_slots = [
    slot for slot, i in enumerate(I[0]) if population_indices[i] != rider_i
][:k]

for slot in neighbour_slots:
    display_rider_info(I[0][slot], D[0], slot,
                       df_meta=df_meta_population,
                       arr_emb=cyclists_embeddings_population)

            Wout Van Aert | age: 27 | stats: [81. 75. 80. 80. 82. 82. 80. 76. 80. 77. 74. 79. 72.]
------------------------------------------------------------------------------------------------------------------------
           Kasper Asgreen | age: 27 | stats: [81. 68. 74. 82. 79. 77. 73. 72. 80. 74. 70. 74. 73.] | distance = 15.4596
     Søren Kragh Andersen | age: 28 | stats: [82. 71. 75. 73. 79. 82. 72. 70. 76. 78. 72. 75. 82.] | distance = 18.3576
          Alberto Bettiol | age: 28 | stats: [78. 73. 73. 76. 75. 77. 70. 73. 78. 72. 68. 75. 70.] | distance = 18.6011
             Marc Hirschi | age: 23 | stats: [75. 74. 80. 73. 72. 75. 72. 76. 77. 75. 73. 80. 78.] | distance = 18.7083
       Julian Alaphilippe | age: 30 | stats: [73. 75. 80. 76. 75. 74. 69. 79. 80. 78. 70. 83. 79.] | distance = 20.1246
    Maximilian Schachmann | age: 27 | stats: [76. 76. 71. 71. 77. 75. 73. 76. 76. 77. 65. 79. 76.] | distance = 20.5913
            Gianni Moscon | age: 28 | stats: [76. 73. 72. 76