## Query similar cyclists
Draft version (using the **faiss** library)

In [1]:
import faiss
import pandas as pd
import numpy as np

In [2]:
# Load the rider table from the Excel export (path is relative to the notebook's working directory).
df_cyclists = pd.read_excel("../data/cyclists_2021.xlsx")

In [3]:
# Preview the first rows to check the column layout (identity columns first, then the charac_i_* ratings).
df_cyclists.head()

Unnamed: 0,gene_sz_lastname,gene_sz_firstname,country,gene_i_birthdate,gene_i_size,gene_i_weight,charac_i_plain,charac_i_mountain,charac_i_downhilling,charac_i_cobble,charac_i_timetrial,charac_i_prologue,charac_i_sprint,charac_i_acceleration,charac_i_endurance,charac_i_resistance,charac_i_recuperation,charac_i_hill,charac_i_baroudeur
0,Rebellin,Davide,Italy,19710809,171,63,65,66,65,57,57,57,62,67,66,66,63,68,65
1,Zeits,Andrey,Kazakhstan,19861214,189,73,69,74,68,58,69,67,61,66,73,70,74,71,69
2,Valverde,Alejandro,Spain,19800425,177,61,72,79,75,73,70,72,70,75,76,76,74,81,66
3,Madouas,Valentin,France,19960712,178,71,74,75,69,75,67,68,69,73,76,73,75,76,76
4,Seigle,Romain,France,19941011,169,63,74,71,69,63,71,69,74,73,69,69,68,73,69


In [4]:
# The first six columns (last name, first name, country, birth date, size, weight) describe the rider;
# copy them so the derived columns added below do not touch df_cyclists.
df_cyclists_meta = df_cyclists.iloc[:, :6].copy()

In [5]:
# "First Last" display name; this is the key riders are looked up by further down.
df_cyclists_meta["fullname"] = df_cyclists_meta["gene_sz_firstname"] + " " + df_cyclists_meta["gene_sz_lastname"]

In [6]:
# Age in completed calendar years as of today.
# Dividing the elapsed days by 365 ignores leap days, so it reports a rider as one year
# older during the ~week before each birthday; compare (month, day) instead.
# NOTE: the result depends on the day the notebook is executed.
birthdates = pd.to_datetime(df_cyclists_meta["gene_i_birthdate"], format="%Y%m%d")
today = pd.Timestamp.today()
# True for riders whose birthday has not occurred yet in the current year.
birthday_pending = (birthdates.dt.month * 100 + birthdates.dt.day) > (today.month * 100 + today.day)
age = today.year - birthdates.dt.year - birthday_pending.astype(int)
df_cyclists_meta["age"] = age.astype(int)

In [7]:
# Check the derived fullname and age columns.
df_cyclists_meta.head()

Unnamed: 0,gene_sz_lastname,gene_sz_firstname,country,gene_i_birthdate,gene_i_size,gene_i_weight,fullname,age
0,Rebellin,Davide,Italy,19710809,171,63,Davide Rebellin,51
1,Zeits,Andrey,Kazakhstan,19861214,189,73,Andrey Zeits,35
2,Valverde,Alejandro,Spain,19800425,177,61,Alejandro Valverde,42
3,Madouas,Valentin,France,19960712,178,71,Valentin Madouas,26
4,Seigle,Romain,France,19941011,169,63,Romain Seigle,27


In [8]:
# Every column after the six metadata columns is a rider characteristic; together they
# form the rider's vector. faiss needs a float32, C-contiguous array, so convert and
# lay out the memory in a single step.
cyclists_embeddings = np.ascontiguousarray(
    df_cyclists.iloc[:, 6:].values,
    dtype=np.float32,
)

In [9]:
# Inspect the matrix: one row per rider, one column per characteristic.
cyclists_embeddings

array([[65., 66., 65., ..., 63., 68., 65.],
       [69., 74., 68., ..., 74., 71., 69.],
       [72., 79., 75., ..., 74., 81., 66.],
       ...,
       [51., 50., 55., ..., 50., 50., 55.],
       [53., 51., 56., ..., 51., 52., 55.],
       [57., 52., 58., ..., 53., 55., 57.]], dtype=float32)

In [10]:
# Dimensionality of a rider vector (number of characteristic columns).
d = cyclists_embeddings.shape[1]
# Flat L2 faiss index of that dimensionality; it is empty here, vectors are added in a later cell.
index = faiss.IndexFlatL2(d)

In [11]:
# Upper age bound (inclusive) for riders that may be returned; None disables the filter.
age_max = 30

# Row labels of the riders kept in the search population.
# NOTE: these labels are used as positions when indexing the numpy array below, which
# is only valid while df_cyclists_meta keeps its default 0..n-1 index.
population_indices = (
    df_cyclists_meta.index
    if age_max is None
    else df_cyclists_meta.loc[df_cyclists_meta["age"] <= age_max].index
)

cyclists_embeddings_population = cyclists_embeddings[population_indices]

In [12]:
# Empty the index first so that re-running this cell (e.g. after changing age_max)
# does not stack a second copy of the vectors on top of the previous ones.
index.reset()
index.add(cyclists_embeddings_population)
# Number of vectors now stored in the index.
index.ntotal

4620

In [13]:
# Full names of ALL riders (not only the filtered population), in table order,
# so a list position is also a row position in df_cyclists_meta / cyclists_embeddings.
all_riders = df_cyclists_meta["fullname"].tolist()
# all_riders

In [14]:
def display_rider_info(idx, D=None, k_i=0, df_meta=df_cyclists_meta, arr_emb=cyclists_embeddings, n_just=25):
    """Print a one-line summary (name, age, stats) for a single rider.

    Parameters
    ----------
    idx : int
        Positional row of the rider in both ``df_meta`` and ``arr_emb``.
    D : sequence of numbers, optional
        Distances to report; when given, ``D[k_i]`` is appended to the line,
        truncated to an integer.
    k_i : int
        Position inside ``D`` of the distance belonging to this rider.
    df_meta : pandas.DataFrame
        Table providing the ``fullname`` and ``age`` columns. The default is
        bound once, when the function is defined.
    arr_emb : numpy.ndarray
        Matrix of rider stats, row-aligned with ``df_meta``. The default is
        bound once, when the function is defined.
    n_just : int
        Width the name is right-justified to, so printed lines align.
    """
    padded_name = df_meta.fullname.iloc[idx].rjust(n_just)
    fields = [
        padded_name,
        f"age: {df_meta.age.iloc[idx]}",
        f"stats: {arr_emb[idx]}",
    ]
    if D is not None:
        fields.append("distance = " + str(int(D[k_i])))

    print(" | ".join(fields))

In [15]:
# Query rider, written exactly as in the fullname column ("First Last").
rider = "Jasper Philipsen"

In [16]:
# Validate the requested name explicitly: an `assert` is stripped when Python runs
# with -O, which would turn an unknown name into a less helpful error further down.
if rider not in all_riders:
    raise ValueError("Selected rider not in database, sorry!")
# list.index returns the first match, so duplicated full names resolve to the earliest row.
rider_i = all_riders.index(rider)
# Double brackets keep a 2-D array of shape (1, d): a batch containing one query vector.
rider_embedding = cyclists_embeddings[[rider_i]]

In [17]:
# Number of similar riders to report.
k = 10
# Ask for one extra hit: when the query rider is itself stored in the index it comes
# back as its own closest match. D holds the distances and I the matching row
# positions within the indexed population (one row per query vector).
# NOTE(review): "will be rider_i" only holds if the rider passed the age filter;
# otherwise the first hit is a genuine neighbour.
D, I = index.search(rider_embedding, k + 1)  # search k closest riders (drop first, will be rider_i)

In [18]:
display_rider_info(rider_i)
print("-"*120)

# Keep the k nearest hits that are not the query rider itself.
# Blindly dropping the first hit is wrong when the rider is not part of the indexed
# population (e.g. older than age_max) or ties at distance 0 with another rider:
# a real neighbour would be discarded. faiss also pads missing results with label -1,
# which would otherwise be displayed as the last row of the table.
neighbours = [
    (i, dist)
    for i, dist in zip(I[0], D[0])
    if i >= 0 and population_indices[i] != rider_i
][:k]
neighbour_distances = [dist for _, dist in neighbours]

# Metadata restricted to the indexed population, row-aligned with the faiss labels
# (built once instead of on every loop iteration).
df_population_meta = df_cyclists_meta.iloc[population_indices]

for k_i, (i, _) in enumerate(neighbours):
    display_rider_info(i, neighbour_distances, k_i,
                       df_meta=df_population_meta,
                       arr_emb=cyclists_embeddings_population)

         Jasper Philipsen | age: 24 | stats: [74. 62. 67. 72. 66. 73. 78. 77. 73. 74. 69. 67. 65.]
------------------------------------------------------------------------------------------------------------------------
           Mike Teunissen | age: 29 | stats: [75. 61. 72. 75. 66. 73. 77. 75. 75. 73. 68. 69. 67.] | distance = 55
             Jake Stewart | age: 22 | stats: [74. 59. 65. 72. 66. 73. 76. 75. 71. 70. 66. 71. 65.] | distance = 66
                 Cees Bol | age: 27 | stats: [72. 62. 69. 72. 68. 71. 78. 77. 71. 69. 66. 62. 64.] | distance = 80
            Arnaud Démare | age: 30 | stats: [73. 60. 70. 73. 71. 72. 81. 80. 74. 76. 71. 69. 62.] | distance = 81
         Fernando Gaviria | age: 28 | stats: [72. 63. 70. 70. 61. 71. 77. 79. 72. 70. 72. 68. 68.] | distance = 88
             David Dekker | age: 24 | stats: [73. 58. 68. 73. 65. 70. 75. 75. 72. 69. 65. 65. 69.] | distance = 104
             Marc Sarreau | age: 29 | stats: [70. 56. 68. 72. 68. 72. 74. 74. 72. 73. 66.