# Top 10 stage predictions for the Tour de France

## Imports

In [2]:
from procyclingstats import Race
from procyclingstats import RiderResults
from procyclingstats import Stage
from procyclingstats import RaceClimbs
from bs4 import BeautifulSoup
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.feature_selection import SelectKBest
from sklearn.neighbors import KNeighborsRegressor
import numpy as np
import pickle
import tensorflow as tf
from tensorflow.keras import layers
import lightgbm as lgb

2023-07-20 19:59:12.740652: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Get riders name

The first step in our study is to retrieve the names of the Tour de France 2023 riders. To do this, we went to [ProCyclingStats](https://www.procyclingstats.com) and downloaded the html page containing the names of the Tour riders.

In [19]:
with open('data/riders_tdf_2023.html', 'r') as file:
    html_content = file.read()
    
soup = BeautifulSoup(html_content, 'html.parser')

href_elements = soup.find_all(href=True)

In [20]:
list_riders = []
for element in href_elements:
    href_value = element['href']
    list_riders.append(href_value)

In [25]:
list_riders

['rider/jonas-abrahamsen',
 'rider/julian-alaphilippe',
 'rider/andrey-amador',
 'rider/alex-aranburu',
 'rider/nikias-arndt',
 'rider/kasper-asgreen',
 'rider/romain-bardet',
 'rider/warren-barguil',
 'rider/phil-bauhaus',
 'rider/tiesj-benoot',
 'rider/egan-bernal',
 'rider/clement-berthet',
 'rider/alberto-bettiol',
 'rider/jenthe-biermans',
 'rider/pello-bilbao',
 'rider/mikkel-bjerg',
 'rider/edvald-boasson-hagen',
 'rider/guillaume-boivin',
 'rider/cees-bol',
 'rider/emanuel-buchmann',
 'rider/mathieu-burgaudeau',
 'rider/lilian-calmejane',
 'rider/victor-campenaerts',
 'rider/richard-carapaz',
 'rider/jonathan-castroviejo',
 'rider/remi-cavagna',
 'rider/mark-cavendish',
 'rider/clement-champoussin',
 'rider/anthon-charmig',
 'rider/johan-esteban-chaves',
 'rider/giulio-ciccone',
 'rider/simon-clarke',
 'rider/bryan-coquard',
 'rider/magnus-cort-nielsen',
 'rider/benoit-cosnefroy',
 'rider/rui-costa',
 'rider/lawson-craddock',
 'rider/steff-cras',
 'rider/jasper-de-buyst',
 'rid

Now that we've retrieved the names of all the Tour de France riders, we're going to retrieve all the races of these riders available on the API and store them in a dataframe.

# Get rider's race history

In [40]:
# Get one rider's result to get the keys for all races
rider_results = RiderResults("{}/results".format(list_riders[0]))

In [43]:
data = {}
for var in list(rider_results.parse()['results'][0].keys()) :
    data[var] = []

In [44]:
data["rider"]=[]

In [45]:
for rider in list_riders :
    rider_results = RiderResults("{}/results".format(rider))
    for race in rider_results.parse()['results'] :
        if "race/tour-de-france/2023/" not in race["stage_url"] :
            data["rider"].append(rider.split("/")[1])
            for key, val in race.items() :
                data[key].append(val)

In [47]:
data = pd.DataFrame(data)

In [48]:
data

Unnamed: 0,date,rank,stage_url,stage_name,nationality,class,distance,pcs_points,uci_points,rider
0,2023-06-25,3.0,race/nc-norway/2023/result,National Championships Norway ME - Road Race,NO,NC,185.0,7,60.0,jonas-abrahamsen
1,2023-06-18,24.0,race/tour-of-belgium/2023/stage-5-points,Baloise Belgium Tour | Points classification,BE,2.Pro,,0,0.0,jonas-abrahamsen
2,2023-06-18,18.0,race/tour-of-belgium/2023/gc,Baloise Belgium Tour,BE,2.Pro,,12,5.0,jonas-abrahamsen
3,2023-06-18,49.0,race/tour-of-belgium/2023/stage-5,Baloise Belgium Tour | Stage 5,BE,2.Pro,194.8,0,0.0,jonas-abrahamsen
4,2023-06-17,9.0,race/tour-of-belgium/2023/stage-4,Baloise Belgium Tour | Stage 4,BE,2.Pro,172.6,0,0.0,jonas-abrahamsen
...,...,...,...,...,...,...,...,...,...,...
14572,2022-07-27,50.0,race/tour-de-wallonie/2022/stage-5,Ethias-Tour de Wallonie | Stage 5,BE,2.Pro,214.8,0,0.0,axel-zingle
14573,2022-07-26,47.0,race/tour-de-wallonie/2022/stage-4,Ethias-Tour de Wallonie | Stage 4,BE,2.Pro,200.8,0,0.0,axel-zingle
14574,2022-07-25,13.0,race/tour-de-wallonie/2022/stage-3,Ethias-Tour de Wallonie | Stage 3,BE,2.Pro,195.6,0,0.0,axel-zingle
14575,2022-07-24,64.0,race/tour-de-wallonie/2022/stage-2,Ethias-Tour de Wallonie | Stage 2,BE,2.Pro,176.8,0,0.0,axel-zingle


At present, we have a dataframe with all the races of each rider, but we don't have enough information on the races and stages. So we're going to use the API to look up the characteristics of the races, create a dataframe and then merge it with the previous dataframe.

# Get stages characteristics

In [15]:
list_stages = list(pd.unique(data["stage_url"]))

In [19]:
dico_stages = {}
dico_stages['race'] = []
for var in ['climbs', 'distance', 'pcs_points_scale', 'profile_icon', 'race_startlist_quality_score', 'stage_type', 'vertical_meters', 'won_how'] :
    dico_stages[var] = []

In [20]:
df_stages = pd.DataFrame(columns=['race', 'climbs', 'distance', 'pcs_points_scale', 'profile_icon', 'race_startlist_quality_score', 'stage_type', 'vertical_meters', 'won_how'])

In [21]:
for i in range(len(list_stages)) :
    try :
        Stage(list_stages[i]).parse()
    except :
        None
    else :
        stage = Stage(list_stages[i]).parse()
        df_stages.loc[i, 'race'] = list_stages[i]
        df_stages.loc[i, 'climbs'] = len(stage['climbs'])
        for var in ['distance', 'pcs_points_scale', 'profile_icon', 'race_startlist_quality_score', 'stage_type', 'vertical_meters', 'won_how'] :
            df_stages.loc[i, var] = stage[var]

0.0 %
0.07 %
0.15 %
0.22 %
0.29 %
0.37 %
0.44 %
0.51 %
0.59 %
0.66 %
0.73 %
0.81 %
0.88 %
0.95 %
1.02 %
1.1 %
1.17 %
1.24 %
1.32 %
1.39 %
1.46 %
1.54 %
1.61 %
1.68 %
1.76 %
1.83 %
1.9 %
1.98 %
2.05 %
2.12 %
2.2 %
2.27 %
2.34 %
2.42 %
2.49 %
2.56 %
2.64 %
2.71 %
2.78 %
2.86 %
2.93 %
3.0 %
3.07 %
3.15 %
3.22 %
3.29 %
3.37 %
3.44 %
3.51 %
3.59 %
3.66 %
3.73 %
3.81 %
3.88 %
3.95 %
4.03 %
4.1 %
4.17 %
4.25 %
4.32 %
4.39 %
4.47 %
4.54 %
4.61 %
4.69 %
4.76 %
4.83 %
4.9 %
4.98 %
5.05 %
5.12 %
5.2 %
5.27 %
5.34 %
5.42 %
5.49 %
5.56 %
5.64 %
5.71 %
5.78 %
5.86 %
5.93 %
6.0 %
6.08 %
6.15 %
6.22 %
6.3 %
6.37 %
6.44 %
6.52 %
6.59 %
6.66 %
6.73 %
6.81 %
6.88 %
6.95 %
7.03 %
7.1 %
7.17 %
7.25 %
7.32 %
7.39 %
7.47 %
7.54 %
7.61 %
7.69 %
7.76 %
7.83 %
7.91 %
7.98 %
8.05 %
8.13 %
8.2 %
8.27 %
8.35 %
8.42 %
8.49 %
8.57 %
8.64 %
8.71 %
8.78 %
8.86 %
8.93 %
9.0 %
9.08 %
9.15 %
9.22 %
9.3 %
9.37 %
9.44 %
9.52 %
9.59 %
9.66 %
9.74 %
9.81 %
9.88 %
9.96 %
10.03 %
10.1 %
10.18 %
10.25 %
10.32 %
10.4 %
10.47 %
1

77.23 %
77.31 %
77.38 %
77.45 %
77.53 %
77.6 %
77.67 %
77.75 %
77.82 %
77.89 %
77.96 %
78.04 %
78.11 %
78.18 %
78.26 %
78.33 %
78.4 %
78.48 %
78.55 %
78.62 %
78.7 %
78.77 %
78.84 %
78.92 %
78.99 %
79.06 %
79.14 %
79.21 %
79.28 %
79.36 %
79.43 %
79.5 %
79.58 %
79.65 %
79.72 %
79.8 %
79.87 %
79.94 %
80.01 %
80.09 %
80.16 %
80.23 %
80.31 %
80.38 %
80.45 %
80.53 %
80.6 %
80.67 %
80.75 %
80.82 %
80.89 %
80.97 %
81.04 %
81.11 %
81.19 %
81.26 %
81.33 %
81.41 %
81.48 %
81.55 %
81.63 %
81.7 %
81.77 %
81.84 %
81.92 %
81.99 %
82.06 %
82.14 %
82.21 %
82.28 %
82.36 %
82.43 %
82.5 %
82.58 %
82.65 %
82.72 %
82.8 %
82.87 %
82.94 %
83.02 %
83.09 %
83.16 %
83.24 %
83.31 %
83.38 %
83.46 %
83.53 %
83.6 %
83.67 %
83.75 %
83.82 %
83.89 %
83.97 %
84.04 %
84.11 %
84.19 %
84.26 %
84.33 %
84.41 %
84.48 %
84.55 %
84.63 %
84.7 %
84.77 %
84.85 %
84.92 %
84.99 %
85.07 %
85.14 %
85.21 %
85.29 %
85.36 %
85.43 %
85.51 %
85.58 %
85.65 %
85.72 %
85.8 %
85.87 %
85.94 %
86.02 %
86.09 %
86.16 %
86.24 %
86.31 %
86.38 %
86.4

In [22]:
df_stages

Unnamed: 0,race,climbs,distance,pcs_points_scale,profile_icon,race_startlist_quality_score,stage_type,vertical_meters,won_how
1,race/tour-of-belgium/2023/stage-5-points,0,194.8,2.PRO.Stage,p2,323,RR,1174,Sprint of large group
2,race/tour-of-belgium/2023/gc,0,194.8,2.PRO.Stage,p2,323,RR,1174,Sprint of large group
3,race/tour-of-belgium/2023/stage-5,0,194.8,2.PRO.Stage,p2,323,RR,1174,Sprint of large group
4,race/tour-of-belgium/2023/stage-4,0,172.6,2.PRO.Stage,p2,323,RR,3230,36.7 km solo
5,race/tour-of-belgium/2023/stage-3,0,15.2,2.PRO.Stage,p1,323,ITT,37,Time Trial
...,...,...,...,...,...,...,...,...,...
1361,race/settimana-ciclistica-italiana/2021/stage-...,0,170.2,2.1.Stage,p2,289,RR,1905,Sprint of large group
1362,race/settimana-ciclistica-italiana/2021/gc,0,170.2,2.1.Stage,p2,289,RR,1905,Sprint of large group
1363,race/settimana-ciclistica-italiana/2021/stage-5,0,170.2,2.1.Stage,p2,289,RR,1905,Sprint of large group
1364,race/settimana-ciclistica-italiana/2021/stage-4,0,168.0,2.1.Stage,p2,289,RR,2503,Sprint of small group


Now that we have our dataframe of every race contested by Tour de France riders over the last three years, we can merge it with the first dataframe created.

In [26]:
data = data.rename(columns={'stage_url': 'race'})

In [28]:
df_all = pd.merge(data, df_stages, on='race', how='left')

In [29]:
mots_exclus = ["/gc", "points", "-kom"]
df_all = df_all[~df_all['race'].str.contains('|'.join(mots_exclus))]

In [30]:
df_all

Unnamed: 0,date,rank,race,stage_name,nationality,class,distance_x,pcs_points,uci_points,rider,climbs,distance_y,pcs_points_scale,profile_icon,race_startlist_quality_score,stage_type,vertical_meters,won_how
0,2023-06-25,3.0,race/nc-norway/2023/result,National Championships Norway ME - Road Race,NO,NC,185.0,7,60.0,jonas-abrahamsen,,,,,,,,
3,2023-06-18,49.0,race/tour-of-belgium/2023/stage-5,Baloise Belgium Tour | Stage 5,BE,2.Pro,194.8,0,0.0,jonas-abrahamsen,0,194.8,2.PRO.Stage,p2,323,RR,1174,Sprint of large group
4,2023-06-17,9.0,race/tour-of-belgium/2023/stage-4,Baloise Belgium Tour | Stage 4,BE,2.Pro,172.6,0,0.0,jonas-abrahamsen,0,172.6,2.PRO.Stage,p2,323,RR,3230,36.7 km solo
5,2023-06-16,97.0,race/tour-of-belgium/2023/stage-3,Baloise Belgium Tour | Stage 3 (ITT),BE,2.Pro,15.2,0,0.0,jonas-abrahamsen,0,15.2,2.PRO.Stage,p1,323,ITT,37,Time Trial
6,2023-06-15,63.0,race/tour-of-belgium/2023/stage-2,Baloise Belgium Tour | Stage 2,BE,2.Pro,175.7,0,0.0,jonas-abrahamsen,0,175.7,2.PRO.Stage,p1,323,RR,457,Sprint of large group
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17595,2022-05-22,9.0,race/boucles-de-l-aulne/2022/result,Boucles de l'Aulne - Châteaulin,FR,1.1,175.2,16,25.0,axel-zingle,0,175.2,1.1,p2,133,RR,2848,19.7 km solo
17596,2022-05-21,58.0,race/tour-du-finistere/2022/result,Tour du Finistère,FR,1.1,193.3,0,0.0,axel-zingle,0,193.3,1.1,p3,137,RR,2860,Sprint à deux
17597,2022-05-04,,race/4-jours-de-dunkerque/2022/stage-2,4 Jours de Dunkerque / Grand Prix des Hauts de...,FR,2.Pro,181.5,0,0.0,axel-zingle,3,181.5,2.PRO.Stage,p1,159,RR,1089,Sprint of large group
17598,2022-05-03,22.0,race/4-jours-de-dunkerque/2022/stage-1,4 Jours de Dunkerque / Grand Prix des Hauts de...,FR,2.Pro,161.1,0,0.0,axel-zingle,3,161.1,2.PRO.Stage,p1,159,RR,713,Sprint of large group


In [36]:
with open('df_tdf.pkl', 'wb') as f:
    pickle.dump(df_all, f)

# Dictionnaire 1 dataframe pour chaque coureur

In [37]:
dico_data_rider = {}
for rider in list_riders :
    rider = rider.split("/")[1]
    dico_data_rider[rider] = df_all[df_all["rider"] == rider]
    dico_data_rider[rider].index = range(len(dico_data_rider[rider]))

In [245]:
dico_data_rider["tobias-halland-johannessen"]

Unnamed: 0,date,rank,race,stage_name,nationality,class,distance_x,pcs_points,uci_points,rider,climbs,distance_y,pcs_points_scale,profile_icon,race_startlist_quality_score,stage_type,vertical_meters,won_how
0,2023-06-25,10.0,race/nc-norway/2023/result,National Championships Norway ME - Road Race,NO,NC,185.0,0,3.0,tobias-halland-johannessen,,,,,,,,
1,2023-06-11,3.0,race/dauphine/2023/stage-8-youth,Critérium du Dauphiné | Youth classification,FR,2.UWT,,0,0.0,tobias-halland-johannessen,0,152.8,2.WT.Stage,p5,781,RR,4160,20 km solo
2,2023-06-11,24.0,race/dauphine/2023/stage-8,Critérium du Dauphiné | Stage 8,FR,2.UWT,152.8,0,0.0,tobias-halland-johannessen,6,152.8,2.WT.Stage,p5,781,RR,4160,20 km solo
3,2023-06-10,15.0,race/dauphine/2023/stage-7,Critérium du Dauphiné | Stage 7,FR,2.UWT,147.9,0,0.0,tobias-halland-johannessen,3,147.9,2.WT.Stage,p5,781,RR,4015,5.3 km solo
4,2023-06-09,14.0,race/dauphine/2023/stage-6,Critérium du Dauphiné | Stage 6,FR,2.UWT,170.2,0,0.0,tobias-halland-johannessen,4,170.2,2.WT.Stage,p5,781,RR,3406,Sprint à deux
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76,2021-10-08,11.0,race/circuit-des-ardennes-international/2021/s...,Circuit des Ardennes International | Stage 2,FR,2.2,186.9,0,0.0,tobias-halland-johannessen,4,186.9,2.2.Stage,p2,39,RR,2026,Sprint of large group
77,2021-10-07,100.0,race/circuit-des-ardennes-international/2021/s...,Circuit des Ardennes International | Stage 1,FR,2.2,115.3,0,0.0,tobias-halland-johannessen,4,115.3,2.2.Stage,p2,39,RR,1805,2 km solo
78,2021-10-03,,race/ronde-de-l-isard/2021/stage-5,Ronde de l'Isard | Stage 5,FR,2.2U,157.5,0,0.0,tobias-halland-johannessen,3,157.5,2.2.Stage,p2,9,RR,3048,45 km solo
79,2021-10-02,50.0,race/ronde-de-l-isard/2021/stage-4,Ronde de l'Isard | Stage 4,FR,2.2U,149.0,0,0.0,tobias-halland-johannessen,2,149.0,2.2.Stage,p5,9,RR,2915,? km solo


# Données tour de france 2023

In [185]:
stage = Stage("race/tour-de-france/2023/stage-21")

In [186]:
print(stage.parse())

{'arrival': 'Paris', 'climbs': [{'climb_name': 'Côte du Pavé des Gardes', 'climb_url': 'location/cote-du-pave-des-gardes'}], 'date': '2023-07-23', 'departure': 'Saint-Quentin-en-Yvelines', 'distance': 115.1, 'gc': [], 'is_one_day_race': False, 'kom': [], 'pcs_points_scale': 'GT.A.Stage', 'points': [], 'profile_icon': 'p1', 'profile_score': 14, 'race_startlist_quality_score': 1584, 'results': None, 'stage_type': 'RR', 'teams': [], 'uci_points_scale': 'UCI.WR.GT.A.Stage', 'vertical_meters': 577, 'winning_attack_length': None, 'won_how': '? - let us know!', 'youth': []}


In [55]:
df_tdf_stages_2023 = pd.DataFrame(columns=['race', 'climbs', 'distance', 'pcs_points_scale', 'profile_icon', 'race_startlist_quality_score', 'stage_type', 'vertical_meters' 'won_how'])

In [54]:
print(list(range(1,22)))

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]


In [58]:
for i in range(1,22) :
    stage = Stage("race/tour-de-france/2023/stage-{}".format(i)).parse()
    df_tdf_stages_2023.loc[i, 'race'] = "race/tour-de-france/2023/stage-{}".format(i)
    df_tdf_stages_2023.loc[i, 'climbs'] = len(stage['climbs'])
    for var in ['distance', 'pcs_points_scale', 'profile_icon', 'race_startlist_quality_score', 'stage_type', 'vertical_meters', 'won_how'] :
        df_tdf_stages_2023.loc[i, var] = stage[var]

In [59]:
df_tdf_stages_2023

Unnamed: 0,race,climbs,distance,pcs_points_scale,profile_icon,race_startlist_quality_score,stage_type,vertical_meterswon_how,vertical_meters,won_how
1,race/tour-de-france/2023/stage-1,5,182.0,GT.A.Stage,p3,1584,RR,,3221.0,? - let us know!
2,race/tour-de-france/2023/stage-2,5,208.9,GT.A.Stage,p2,1584,RR,,2949.0,? - let us know!
3,race/tour-de-france/2023/stage-3,4,193.5,GT.A.Stage,p2,1584,RR,,2667.0,? - let us know!
4,race/tour-de-france/2023/stage-4,1,181.8,GT.A.Stage,p1,1584,RR,,1427.0,? - let us know!
5,race/tour-de-france/2023/stage-5,3,162.7,GT.A.Stage,p4,1584,RR,,3652.0,? - let us know!
6,race/tour-de-france/2023/stage-6,4,144.9,GT.A.Stage,p5,1584,RR,,3894.0,? - let us know!
7,race/tour-de-france/2023/stage-7,1,169.9,GT.A.Stage,p1,1584,RR,,808.0,? - let us know!
8,race/tour-de-france/2023/stage-8,3,200.7,GT.A.Stage,p2,1584,RR,,1996.0,? - let us know!
9,race/tour-de-france/2023/stage-9,4,182.4,GT.A.Stage,p5,1584,RR,,3441.0,? - let us know!
10,race/tour-de-france/2023/stage-10,5,167.2,GT.A.Stage,p2,1584,RR,,3151.0,? - let us know!


# Modification du dataframe pour la prédiction

In [124]:
def transform_df(df) :
    
    # df rider
    data = df.drop(columns=["rider", "race", "stage_name" ,"date", "nationality", "uci_points", "won_how", "distance_y", "pcs_points", "class"])
    data = data.rename(columns={'distance_x': 'distance'})
    data = data.dropna()
    df_dummies = data[["pcs_points_scale", "profile_icon", "stage_type"]]
    df_dummies = pd.get_dummies(df_dummies, prefix='', prefix_sep='')
    df_new = pd.concat([data.drop(columns=["pcs_points_scale", "profile_icon", "stage_type"]), df_dummies], axis = 1)
    for col in df_new.columns :
        df_new[col] = df_new[col].astype(float)
        
    # df tdf 
    df_tdf = df_tdf_stages_2023
    df_tdf = df_tdf.drop(columns=["race", 'vertical_meterswon_how', 'won_how'])
    df_tdf_dummies = df_tdf[["pcs_points_scale", "profile_icon", "stage_type"]]
    df_tdf_dummies = pd.get_dummies(df_tdf_dummies, prefix='', prefix_sep='')
    df_tdf_new = pd.concat([df_tdf.drop(columns=["pcs_points_scale", "profile_icon", "stage_type"]), df_tdf_dummies], axis = 1)
    for col in df_tdf_new.columns :
        df_tdf_new[col] = df_tdf_new[col].astype(float)
        
    X = df_new.drop(columns=["rank"])
    Y = df_new["rank"]
    Y.index = range(len(Y))
    
    X_combined = pd.concat([df_new.drop(columns=["rank"]), df_tdf_new], axis = 0).fillna(0)
    X_combined.index = range(len(X_combined))
    
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(X_combined)
    df_scaled = pd.DataFrame(df_scaled, columns=X_combined.columns)
    
    # Train 
    train = df_scaled[0:-21]
    train = pd.concat([train, Y], axis = 1)
    # Test
    test = df_scaled[-21:]
    test.index = range(1,22)

    return {"train" : train, "test" : test}

In [190]:
transform_df(dico_data_rider["julian-alaphilippe"])["train"]

Unnamed: 0,distance,climbs,race_startlist_quality_score,vertical_meters,1.1,1.PRO,1.WT.A,1.WT.B,2.1.Stage,2.PRO.Stage,...,Worlds.RR,p1,p2,p3,p4,p5,ITT,RR,GT.A.Stage,rank
0,-0.328488,1.760800,-0.337685,1.353197,-0.209657,-0.303239,-0.235702,-0.209657,-0.180579,-0.361873,...,-0.146647,-0.380235,-0.871355,-0.548972,-0.259645,2.405351,-0.259645,0.259645,-0.532714,14.0
1,-0.420046,0.306226,-0.337685,1.231889,-0.209657,-0.303239,-0.235702,-0.209657,-0.180579,-0.361873,...,-0.146647,-0.380235,-0.871355,-0.548972,-0.259645,2.405351,-0.259645,0.259645,-0.532714,19.0
2,-0.003361,0.791084,-0.337685,0.722392,-0.209657,-0.303239,-0.235702,-0.209657,-0.180579,-0.361873,...,-0.146647,-0.380235,-0.871355,-0.548972,-0.259645,2.405351,-0.259645,0.259645,-0.532714,9.0
3,0.387164,0.306226,-0.337685,-0.438825,-0.209657,-0.303239,-0.235702,-0.209657,-0.180579,-0.361873,...,-0.146647,-0.380235,1.147638,-0.548972,-0.259645,-0.415740,-0.259645,0.259645,-0.532714,2.0
4,-2.602504,-1.148348,-0.337685,-1.785770,-0.209657,-0.303239,-0.235702,-0.209657,-0.180579,-0.361873,...,-0.146647,-0.380235,1.147638,-0.548972,-0.259645,-0.415740,3.851407,-3.851407,-0.532714,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69,0.048958,0.306226,-1.131146,-0.520813,-0.209657,-0.303239,-0.235702,-0.209657,-0.180579,2.763397,...,-0.146647,-0.380235,1.147638,-0.548972,-0.259645,-0.415740,-0.259645,0.259645,-0.532714,30.0
70,0.456300,0.306226,-1.131146,-0.218796,-0.209657,-0.303239,-0.235702,-0.209657,-0.180579,2.763397,...,-0.146647,-0.380235,1.147638,-0.548972,-0.259645,-0.415740,-0.259645,0.259645,-0.532714,12.0
71,0.516093,0.306226,-1.131146,0.171065,-0.209657,-0.303239,-0.235702,-0.209657,-0.180579,2.763397,...,-0.146647,-0.380235,-0.871355,1.821588,-0.259645,-0.415740,-0.259645,0.259645,-0.532714,3.0
72,-0.339699,0.306226,-1.131146,-0.865497,-0.209657,-0.303239,-0.235702,-0.209657,-0.180579,2.763397,...,-0.146647,-0.380235,1.147638,-0.548972,-0.259645,-0.415740,-0.259645,0.259645,-0.532714,8.0


# Prédiction

In [339]:
def pred(df) :
    
    X = (transform_df(df)['train']).drop(columns=['rank'])
    Y = transform_df(df)['train']["rank"]
    
        
    num_folds = 3
    
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)
    
    rmse_scores = []
    
    #for train_index, test_index in kf.split(X):
    #    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    #    y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]
        
    regressor = RandomForestRegressor()
    
    param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 4, 8]
    }
    
    grid_search = GridSearchCV(estimator=regressor, param_grid=param_grid, cv=kf, scoring='neg_mean_squared_error')

    grid_search.fit(X, Y)
        
    #regressor.fit(X_train, y_train)
        
    #y_pred = regressor.predict(X_test)
        
    rmse = np.sqrt(-grid_search.best_score_)
        
    rmse_scores.append(rmse)
        
    #for fold, rmse in enumerate(rmse_scores):
    #    print(f"Fold {fold+1}: RMSE = {rmse}")
    
    #mean_rmse = np.mean(rmse_scores)
    
    # Prediction of tdf stages 2023
    model = RandomForestRegressor(**grid_search.best_params_)
    model.fit(X,Y)
    y_pred = model.predict(transform_df(df)['test'])
    
    
    # Df result :
    df_result = df_tdf_stages_2023.drop(columns=["pcs_points_scale", "profile_icon", "race_startlist_quality_score", "stage_type", "vertical_meterswon_how", "won_how"])
    df_result['rank'] = y_pred
    return {"df" : df_result, "rmse" : rmse}

In [340]:
dico_result_riders_tdf_23 = {}
dico_rmse = {}
for rider in list_riders :
    print(rider.split("/")[1])
    dico_result_riders_tdf_23[rider.split("/")[1]] = pred(dico_data_rider[rider.split("/")[1]])["df"]
    dico_rmse[rider.split("/")[1]] = pred(dico_data_rider[rider.split("/")[1]])["rmse"]

jonas-abrahamsen
julian-alaphilippe
andrey-amador
alex-aranburu
nikias-arndt
kasper-asgreen
romain-bardet
warren-barguil
phil-bauhaus
tiesj-benoot
egan-bernal
clement-berthet
alberto-bettiol
jenthe-biermans
pello-bilbao
mikkel-bjerg
edvald-boasson-hagen
guillaume-boivin
cees-bol
emanuel-buchmann
mathieu-burgaudeau
lilian-calmejane
victor-campenaerts
richard-carapaz
jonathan-castroviejo
remi-cavagna
mark-cavendish
clement-champoussin
anthon-charmig
johan-esteban-chaves
giulio-ciccone
simon-clarke
bryan-coquard
magnus-cort-nielsen
benoit-cosnefroy
rui-costa
lawson-craddock
steff-cras
jasper-de-buyst
david-de-la-cruz
tim-declercq
john-degenkolb
anthony-delaplace
dries-devenyns
stan-dewulf
silvan-dillier
matthew-dinham
luke-durbridge
alexander-edmondson
nils-eekhoff
pascal-eenkhoorn
caleb-ewan
yevgeniy-fedorov
valentin-ferron
omar-fraile
frederik-frison
felix-gall
tony-gallopin
david-gaudu
kevin-geniets
simon-geschke
biniam-girmay
michael-gogl
jonas-gregaard
dylan-groenewegen
felix-grosssc

In [341]:
dico_result_riders_tdf_23

{'jonas-abrahamsen':                                  race climbs distance  vertical_meters  \
 1    race/tour-de-france/2023/stage-1      5    182.0           3221.0   
 2    race/tour-de-france/2023/stage-2      5    208.9           2949.0   
 3    race/tour-de-france/2023/stage-3      4    193.5           2667.0   
 4    race/tour-de-france/2023/stage-4      1    181.8           1427.0   
 5    race/tour-de-france/2023/stage-5      3    162.7           3652.0   
 6    race/tour-de-france/2023/stage-6      4    144.9           3894.0   
 7    race/tour-de-france/2023/stage-7      1    169.9            808.0   
 8    race/tour-de-france/2023/stage-8      3    200.7           1996.0   
 9    race/tour-de-france/2023/stage-9      4    182.4           3441.0   
 10  race/tour-de-france/2023/stage-10      5    167.2           3151.0   
 11  race/tour-de-france/2023/stage-11      3    179.8           1873.0   
 12  race/tour-de-france/2023/stage-12      5    168.8           3120.0   
 13  

In [342]:
rank_stages_tdf_23 = {}
for i in df_tdf_stages_2023["race"] :
    dico = {}
    for key, val in dico_result_riders_tdf_23.items() :
        dico[key] = float(val.loc[val['race']==i, 'rank'])
    dico = dict(sorted(dico.items(), key=lambda x: x[1]))
    rank_stages_tdf_23[i] = dico

In [343]:
rank_stages_tdf_23

{'race/tour-de-france/2023/stage-1': {'tadej-pogacar': 9.812874053040366,
  'jonas-vingegaard-rasmussen': 10.457449342745317,
  'enric-mas': 15.400543814299544,
  'jai-hindley': 15.453102156492045,
  'guillaume-martin': 16.135069250194253,
  'egan-bernal': 17.08529305890188,
  'adam-yates': 17.80911309523809,
  'thomas-pidcock': 18.106236166611158,
  'neilson-powless': 19.77,
  'mikel-landa': 19.815667715282657,
  'david-gaudu': 20.827946386946383,
  'romain-bardet': 22.242540848040854,
  'simon-yates': 23.278106615606617,
  'patrick-konrad': 24.40633602508603,
  'valentin-madouas': 24.637297792246066,
  'emanuel-buchmann': 25.020106107014367,
  'aleksey-lutsenko': 25.46300221931491,
  'carlos-rodriguez-cano': 26.12249279476207,
  'thibaut-pinot': 26.5025,
  'luis-leon-sanchez': 26.52,
  'bob-jungels': 27.396030808286994,
  'richard-carapaz': 27.98129190513612,
  'alex-aranburu': 28.68932126207126,
  'daniel-felipe-martinez': 29.6117748759304,
  'warren-barguil': 31.169035621592624,
  

In [344]:
def split_list(lst):
    n = len(lst)
    k = n // 4  # Taille de chaque sous-liste

    sublists = [lst[i:i+k] for i in range(0, n, k)]

    return sublists

rmse_ordered = dict(sorted(dico_rmse.items(), key=lambda x: x[1]))
rmse_ordered_split = split_list(list(rmse_ordered.values()))

In [345]:
dico_note_rmse = {}
for i, j in zip(["****","***","**","*"], rmse_ordered_split) :
    dico_note_rmse[i] = j

In [346]:
for key, val in rmse_ordered.items() :
    grade = ""
    for k, v in dico_note_rmse.items() :
        if val in v :
            grade = k
    rmse_ordered[key] = [val, grade]

In [347]:
rmse_ordered

{'carlos-rodriguez-cano': [13.046710121227115, '****'],
 'romain-bardet': [15.919566436209259, '****'],
 'johan-esteban-chaves': [17.721039806479844, '****'],
 'guillaume-martin': [20.31838281120732, '****'],
 'nelson-oliveira': [21.180466952827565, '****'],
 'enric-mas': [21.604998565119942, '****'],
 'tadej-pogacar': [21.999652193038287, '****'],
 'ben-o-connor': [22.106547219253503, '****'],
 'mattias-skjelmose-jensen': [22.147468739596686, '****'],
 'jai-hindley': [22.634137566089954, '****'],
 'aurelien-paret-peintre': [23.162870285951705, '****'],
 'gorka-izagirre': [23.371200660035996, '****'],
 'louis-meintjes': [23.395970905545273, '****'],
 'luis-leon-sanchez': [23.733742721263955, '****'],
 'jonas-vingegaard-rasmussen': [23.77697345228282, '****'],
 'stefan-kung': [23.987821431260567, '****'],
 'valentin-madouas': [24.08664679064402, '****'],
 'mikel-landa': [24.417571378666544, '****'],
 'dylan-teuns': [24.433328749937672, '****'],
 'jack-haig': [24.566523669810778, '****']

In [348]:
file = open("pred_results-tdf_23_rf.txt", "w")
for key, val in rank_stages_tdf_23.items() :
    stage = Stage(key).parse()
    file.write("--------- Etape {} --------- \n".format(key.split("-")[-1]))
    file.write("Départ : {} \n".format(stage['departure']))
    file.write("Arrivé : {} \n".format(stage['arrival']))
    file.write("Distance : {} \n".format(stage['distance']))
    file.write("Dénivelé : {} \n".format(stage['vertical_meters']))
    for i in range(1,11) :
        file.write("{} - {} {} \n".format(i, list(val.keys())[i-1], rmse_ordered[list(val.keys())[i-1]][1]))
    file.write("-------------------------------------------------\n")
file.write("Moyenne RMSE : {} \n".format(np.mean([i[0] for i in list(rmse_ordered.values())])))
for key, val in rmse_ordered.items() :
    file.write("{} : {} \n".format(key, val[0]))
file.close()