In [1]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
from multiprocessing import Pool
from bs4 import BeautifulSoup
import lightgbm as lgb
import pandas as pd
import numpy as np
import threading
import requests
import time
import re

In [2]:
sporza_races = [
    "omloop-het-nieuwsblad",
    "kuurne-brussel-kuurne",
    "gp-samyn",
    "strade-bianche",
    "nokere-koers",
    "bredene-koksijde-classic",
    "milano-sanremo",
    "oxyclean-classic-brugge-de-panne",
    "e3-harelbeke",
    "gent-wevelgem",
    "dwars-door-vlaanderen",
    "ronde-van-vlaanderen",
    "scheldeprijs",
    "paris-roubaix",
    "brabantse-pijl",
    "amstel-gold-race",
    "la-fleche-wallone",
    "liege-bastogne-liege"
]

Let's get the list of riders that start in each of the races this year. These will be fed to the model to be ranked later.

In [3]:
start_info = {}
for race in sporza_races:
    startlist = []
    race_link = f"https://www.procyclingstats.com/race/{race}/2024/startlist/startlist-quality"
    response = requests.get(race_link)
    soup = BeautifulSoup(response.content, "html.parser")
    table = soup.find("table", class_="basic")
    for rider in table.find_all("tr")[1:]:
        try:
            startlist.append(rider.find("a")["href"].split('/')[1])
        except:
            break
    start_info[race] = startlist

In [4]:
from collections import Counter

all_names = [name for names in start_info.values() for name in names]
name_counts = Counter(all_names)
name_counts = sorted(name_counts.items(), key=lambda x: x[1], reverse=True)
print("Riders with the most participations:")
for rank, (name, count) in enumerate(name_counts, 1):
    if count >= 9:
        print(f"{rank}. {name}: {count}")
        
num_unique_names = len(set(name for names in start_info.values() for name in names))
print("")
print(f"There are {num_unique_names}/945 Sporza riders participating so far.")

Riders with the most participations:
1. matej-mohoric: 11
2. edward-theuns: 11
3. jasper-philipsen: 10
4. arnaud-de-lie: 10
5. riley-sheehan: 10
6. christophe-laporte: 9
7. tim-wellens: 9
8. jordi-meeus: 9
9. jasper-stuyven: 9
10. antonio-morgado: 9
11. matevz-govekar: 9
12. sander-de-pestel: 9
13. ceriel-desal: 9
14. nils-eekhoff: 9
15. marc-hirschi: 9

There are 590/945 Sporza riders participating so far.


Let's get the top x riders from https://www.procyclingstats.com/rankings.php and create a dictionary to store their respective names (as the procyclingstats identifier of the form first_name-family_name), teams and PCS points.

In [5]:
def get_riders_teams_PCSpoints(n_riders, rider_data):
    for i in range (n_riders//100):
        PCS_ranking_url = f"""https://www.procyclingstats.com/rankings.php?
                          nation=&age=&zage=&page=smallerorequal&team=&offset={100*i}
                          &teamlevel=&filter=Filter"""
        response = requests.get(PCS_ranking_url)
        soup = BeautifulSoup(response.content, "html.parser")
        ranking_table = soup.find("table", class_="basic")
        for row in ranking_table.find_all("tr")[1:]:
            links = row.find_all("a", href=True)
            rider = links[0].get("href").split("/")[-1]
            team = links[1].get("href").split("/")[-1]
            points = links[2].text        
            rider_data[rider] = {"Team": team, "PCSpoints": points}

In [6]:
def get_riders_teams_PCSpoints_threaded(n_riders, rider_data):
    # Defines the functionality to add information for one rider
    def get_riders_teams_PCSpoints_single(url):
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")
        ranking_table = soup.find("table", class_="basic")
        for row in ranking_table.find_all("tr")[1:]:
            links = row.find_all("a", href=True)
            rider = links[0].get("href").split("/")[-1]
            team = links[1].get("href").split("/")[-1]
            points = links[2].text        
            rider_data[rider] = {"Team": team, "PCSpoints": points}
            
    # Get all the url and multithread the previously defined function
    rider_urls = []
    for i in range (n_riders//100):
        PCS_ranking_url = f"""https://www.procyclingstats.com/rankings.php?
                          nation=&age=&zage=&page=smallerorequal&team=&offset={100*i}
                          &teamlevel=&filter=Filter"""
        rider_urls.append(PCS_ranking_url)
    threads = []
    for url in rider_urls:
        thread = threading.Thread(target=get_riders_teams_PCSpoints_single, args=(url,))
        threads.append(thread)
        thread.start()
    for thread in threads:
        thread.join()

Let's use the rider identifiers to retrieve their age, weight, and nationality. Since the same function has to be run for each rider, multithreading will speed it up a lot.

In [7]:
def get_age_weight_threaded(rider_data):
    # Defines the functionality to add information for one rider
    def get_age_weight_single(rider):
        rider_profile_url = f"https://www.procyclingstats.com/rider/{rider}"
        response = requests.get(rider_profile_url)
        soup = BeautifulSoup(response.content, "html.parser")
        info = soup.find("div", class_ ="rdr-info-cont")
        nationality = info.find('a').text
        fields = info.find_all("b")
        try:
            age = re.search(r'\((.*?)\)', info.text).group(1)
        except(AttributeError):
            age = 27
        weight = 0
        height = 0
        for field in fields:
            label = field.get_text(strip=True)
            if label == "Weight:":
                weight = field.next_sibling.strip()
            if label == "Height:":
                height = field.next_sibling.strip()
        rider_data[rider].update({"Age": age, "Weight": weight,
                                  "Height": height, "Nationality": nationality})

    # Get all the url and multithread the previously defined function
    riders = list(rider_data.keys())
    threads = []
    for rider in riders:
        thread = threading.Thread(target=get_age_weight_single, args=(rider,))
        threads.append(thread)
        thread.start()
    for thread in threads:
        thread.join()

In [8]:
def get_sporza_results_threaded(years, rider_data, DN_values):
    # Defines the functionality to add information for one race
    def get_sporza_results_single(race_url):
        nonlocal count # Used to get a consistent counter across all threads
        try:
            response = requests.get(race_url)
            with count_lock:
                count += 1
        except:
            print(f"Page {race_url} didn't respond.")
        soup = BeautifulSoup(response.content, "html.parser")
        for row in soup.find("tbody").find_all("tr"):
            rider = row.find("input", class_="gotoH2H").get("data-seo")
            if rider in rider_data.keys():
                fields = row.find_all("td")
                placement = fields[0].get_text(strip=True)
                placement = placement_mapping.get(placement, placement)
                UCI_points = fields[7].get_text(strip=True)
                ptn = fields[8].get_text(strip=True)
                race_name = "/".join(race_url.split("/")[4:6])
                rider_data[rider][race_name] = int(placement)
                if UCI_points != "":
                    if "UCI_points" in rider_data.get(rider, {}):
                        rider_data[rider]["UCI_points"] += int(UCI_points)
                    else:
                         rider_data[rider][f"UCI_points"] = int(UCI_points)
                if ptn != "":
                    year = race_url[-11:-7]
                    if f"ptn/{year}" in rider_data.get(rider, {}):
                        rider_data[rider][f"ptn/{year}"] += int(ptn)
                    else:
                         rider_data[rider][f"ptn/{year}"] = int(ptn)
                            
    # Get all the url and multithread the previously defined function
    placement_mapping = {"DNS": DN_values[0], "DNF": DN_values[1], "OTL": DN_values[1], "DSQ": DN_values[2]}
    count_lock = threading.Lock()
    race_urls, count = [], 0
    for race in sporza_races:
        for year in years:
            race_name = f"{race}/{year}"
            race_urls.append(f"https://www.procyclingstats.com/race/{race_name}/result")
    threads = []
    for race_url in race_urls:
        thread = threading.Thread(target=get_sporza_results_single, args=(race_url, ))
        threads.append(thread)
        thread.start()
    for thread in threads:
        thread.join()
    print(f"""Successfully retrieved
              the results of {count}/{len(sporza_races)*len(years)} races.""")

Preprocessing of the retrieved data includes:
* Creating a dataframe of the retrieved data
* Resetting the index so the rider's names are in a separate column
* Renaming some columns
* Converting the team name using a one hot encoding
* Converting PCSpoints and Age from string to int and cut
* Converting the Weight and Heights strings to int and float respectively as well. If a rider has not gained points in a certain season, let's set it to zero instead of NaN.
* Setting missing Weight and Height values to the respective modal values
* Converting all race results to integers. x for DNS, y for DNF/OTL and z for DSQ.

In [9]:
def preprocess_rider_data(years, rider_data):
    pd.set_option('future.no_silent_downcasting', True) # Suggested by a warning
    # Make a dataframe and change the rider names from being the index to a separate column
    rider_data = pd.DataFrame(rider_data).T
    rider_data.reset_index(inplace=True)
    rider_data.rename(columns={"index": "Name", "Weight:": "Weight", "Height:": "Height"}, inplace=True)
    # One hot encoding of the team names
    rider_data = pd.get_dummies(rider_data, columns=["Team"])
    rider_data["PCSpoints"] = rider_data["PCSpoints"].astype(int)
    rider_data["Age"] = rider_data["Age"].fillna(27).astype(int)
    # Transforming Weight to int + filling missing values witht the modal weight
    rider_data["Weight"] = rider_data["Weight"].apply(
        lambda x: int(x[:2]) if isinstance(x, str) else x)
    rider_data["Weight"] = rider_data["Weight"].fillna(rider_data["Weight"].mode()[0])
    rider_data["Weight"] = rider_data["Weight"].infer_objects(copy=False)
    # Transforming Weight to float + filling missing values witht the modal height
    rider_data["Height"] = rider_data["Height"].apply(
        lambda x: float(x.split()[0]) if isinstance(x, str) else x)
    rider_data["Height"] = rider_data["Height"].fillna(rider_data["Height"].mode()[0])
    rider_data["Height"] = rider_data["Height"].infer_objects(copy=False)
    # Change rider points to 0 if they have None
    for year in years:
        rider_data[f"ptn/{year}"] = rider_data[f"ptn/{year}"].fillna(0).infer_objects(copy=False)
    rider_data['UCI_points'] = rider_data['UCI_points'].fillna(0).infer_objects(copy=False)
    # Change placement to 100 if a rider didn't participate
    rider_data = rider_data.fillna(100)
    
    # There is one rider with team = Team_ so let's drop that column
    rider_data = rider_data.drop("Team_", axis=1)
    # That rider is Pinot and he retired so let's remove him too.
    rider_data = rider_data[rider_data["Name"] != "thibaut-pinot"]
    return rider_data

In [10]:
n_riders = 200
DN_values = [100, 200, 300] # Placement if a rider DNS, DNF/OTL and DSQ respectively
#years = [2018, 2019, 2021, 2022, 2023]
years = [2022, 2023]
rider_data = {}

print("Retrieving rider names, teams and number of PCS points (multithreaded).")
start_time = time.time()
get_riders_teams_PCSpoints_threaded(n_riders, rider_data)
end_time = time.time()
print(f"Information retrieved in {end_time - start_time} seconds.")
print("----------------------------------------------------------")

print("Retrieving age and weight (multithreaded).")
start_time = time.time()
get_age_weight_threaded(rider_data)
end_time = time.time()
print(f"Information retrieved in {end_time - start_time} seconds.")
print("----------------------------------------------------------")

print("Retrieving race results (multithreaded).")
start_time = time.time()
get_sporza_results_threaded(years, rider_data, DN_values)
end_time = time.time()
print(f"Information retrieved in {end_time - start_time} seconds.")
print("----------------------------------------------------------")

print("Preprocessing data.")
start_time = time.time()
rider_data = preprocess_rider_data(years, rider_data)
end_time = time.time()
print(f"Data preprocessed in {end_time - start_time} seconds.")
print("----------------------------------------------------------")

Retrieving rider names, teams and number of PCS points (multithreaded).
Information retrieved in 1.0678610801696777 seconds.
----------------------------------------------------------
Retrieving age and weight (multithreaded).
Information retrieved in 18.856273651123047 seconds.
----------------------------------------------------------
Retrieving race results (multithreaded).
Successfully retrieved
              the results of 36/36 races.
Information retrieved in 10.853145122528076 seconds.
----------------------------------------------------------
Preprocessing data.
Data preprocessed in 0.04236865043640137 seconds.
----------------------------------------------------------


Let's get some order into the columns so they can be used to predict future results regardless of the number of years that is used.

In [11]:
# The first columns will always be the same, regardless of the number of years used.
order = ['Name',
         'PCSpoints',
         'Age',
         'Weight',
         'Height',
         'UCI_points',
         'Team_alpecin-deceuninck-2024',
         'Team_arkea-b-b-hotels-2024',
         'Team_astana-qazaqstan-team-2024',
         'Team_bahrain-victorious-2024',
         'Team_bora-hansgrohe-2024',
         'Team_burgos-bh-2024',
         'Team_caja-rural-seguros-rga-2024',
         'Team_cofidis-2024',
         'Team_decathlon-ag2r-la-mondiale-2024',
         'Team_ef-education-easypost-2024',
         'Team_green-project-bardiani-csf-faizane-2024',
         'Team_groupama-fdj-2024',
         'Team_ineos-grenadiers-2024',
         'Team_intermarche-circus-want-2024',
         'Team_israel-premier-tech-2024',
         'Team_lidl-trek-2024',
         'Team_lotto-dstny-2024',
         'Team_movistar-team-2024',
         'Team_q365-pro-cycing-2024',
         'Team_soudal-quick-step-2024',
         'Team_tarteletto-isorex-2024',
         'Team_team-dsm-firmenich-postnl-2024',
         'Team_team-jayco-alula-2024',
         'Team_team-medellin-2024',
         'Team_team-totalenergies-2024',
         'Team_team-visma-lease-a-bike-2024',
         'Team_tudor-pro-cycling-team-2024',
         'Team_uae-team-emirates-2024',
         'Team_uno-x-mobility-2024']
# The number of ptn/{year} categories will depend on the number of years used.
order.extend(f"ptn/{year}" for year in years)
order.extend(f"{race}/{year}" for year in years for race in sporza_races)
rider_data = rider_data[order]

In [12]:
pd.set_option('display.max_columns', None)
rider_data.head()

Unnamed: 0,Name,PCSpoints,Age,Weight,Height,UCI_points,Team_alpecin-deceuninck-2024,Team_arkea-b-b-hotels-2024,Team_astana-qazaqstan-team-2024,Team_bahrain-victorious-2024,Team_bora-hansgrohe-2024,Team_burgos-bh-2024,Team_caja-rural-seguros-rga-2024,Team_cofidis-2024,Team_decathlon-ag2r-la-mondiale-2024,Team_ef-education-easypost-2024,Team_green-project-bardiani-csf-faizane-2024,Team_groupama-fdj-2024,Team_ineos-grenadiers-2024,Team_intermarche-circus-want-2024,Team_israel-premier-tech-2024,Team_lidl-trek-2024,Team_lotto-dstny-2024,Team_movistar-team-2024,Team_q365-pro-cycing-2024,Team_soudal-quick-step-2024,Team_tarteletto-isorex-2024,Team_team-dsm-firmenich-postnl-2024,Team_team-jayco-alula-2024,Team_team-medellin-2024,Team_team-totalenergies-2024,Team_team-visma-lease-a-bike-2024,Team_tudor-pro-cycling-team-2024,Team_uae-team-emirates-2024,Team_uno-x-mobility-2024,ptn/2022,ptn/2023,omloop-het-nieuwsblad/2022,kuurne-brussel-kuurne/2022,gp-samyn/2022,strade-bianche/2022,nokere-koers/2022,bredene-koksijde-classic/2022,milano-sanremo/2022,oxyclean-classic-brugge-de-panne/2022,e3-harelbeke/2022,gent-wevelgem/2022,dwars-door-vlaanderen/2022,ronde-van-vlaanderen/2022,scheldeprijs/2022,paris-roubaix/2022,brabantse-pijl/2022,amstel-gold-race/2022,la-fleche-wallone/2022,liege-bastogne-liege/2022,omloop-het-nieuwsblad/2023,kuurne-brussel-kuurne/2023,gp-samyn/2023,strade-bianche/2023,nokere-koers/2023,bredene-koksijde-classic/2023,milano-sanremo/2023,oxyclean-classic-brugge-de-panne/2023,e3-harelbeke/2023,gent-wevelgem/2023,dwars-door-vlaanderen/2023,ronde-van-vlaanderen/2023,scheldeprijs/2023,paris-roubaix/2023,brabantse-pijl/2023,amstel-gold-race/2023,la-fleche-wallone/2023,liege-bastogne-liege/2023
0,jack-haig,545,30,67,1.9,88,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,56,0,100,100,100,100,100,100,100,100,100,100,100,100,100,100,200,31,41,11,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100
1,julian-alaphilippe,544,31,62,1.73,361,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,95,67,100,100,100,58,100,100,100,100,100,100,100,100,100,100,200,100,4,200,100,100,100,43,100,100,11,100,200,100,29,51,100,100,100,100,100,86
2,tao-geoghegan-hart,544,28,65,1.83,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,0,0,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100
3,cristian-rodriguez,543,29,56,1.78,8,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,5,0,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,45,200,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100
4,andreas-kron,539,25,63,1.77,358,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,16,137,100,100,100,24,100,100,100,100,100,100,100,100,100,100,52,83,100,100,100,100,100,10,100,100,100,100,100,100,100,100,100,100,35,4,101,200


The race results of 2023 will be the target variables used to train the model. Next riders will be fed with the first year removed and the 2023 results included and the model will be asked to predict the 2024 results based on the start list for each race. Note that the names of the riders are not used for training?

In [13]:
X = rider_data.iloc[:, 1:-len(sporza_races)] # Omit rider name, start from column 1 not 0
y = rider_data.iloc[:, -len(sporza_races):]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

To start model training, four model types using standard configuration are gauged: linear regression, random forest classification, KNN classification and multi-layer perceptron.

In [14]:
linear_regression_model = LinearRegression()
linear_regression_model.fit(X_train, y_train)

y_pred = linear_regression_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("MSE for linear regression:", mse)

random_forest = RandomForestRegressor()
random_forest.fit(X_train, y_train)

y_pred = random_forest.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("MSE for random forest classfier:", mse)

KNN_model = KNeighborsRegressor()
KNN_model.fit(X_train, y_train)

y_pred = KNN_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("MSE for KNN model:", mse)

NN_model = MLPRegressor()
NN_model.fit(X_train, y_train)

y_pred = NN_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("MSE for neural network model:", mse)

MSE for linear regression: 1299.86142864881
MSE for random forest classfier: 732.2126083333334
MSE for KNN model: 839.2110555555556
MSE for neural network model: 872.7223611598642




Linear regression performs the worst and additionally doesn't have many hyperparameters to configure. It makes sense to further vary the parameters of the other 3 models.

In [15]:
param_grid_rf = {
    "n_estimators": [100, 200, 300],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5, 10]
}

param_grid_knn = {
    "n_neighbors": [3, 5, 7],
    "metric": ['euclidean', 'manhattan']
}

param_grid_nn = {
    "hidden_layer_sizes": [(50,), (100,), (50, 50)],
    "activation": ["relu", "tanh"],
    "solver": ["adam", "sgd"],
    "learning_rate": ["constant", "adaptive"]
}

# Initialize models
models = {
    "Random Forest": RandomForestRegressor(),
    "KNN": KNeighborsRegressor(),
    "Neural Network": MLPRegressor()
}

# Perform GridSearchCV for each model
for name, model in models.items():
    if name == "Linear Regression":
        grid_search = GridSearchCV(model, param_grid_linear, cv=5)
    elif name == "Random Forest":
        grid_search = GridSearchCV(model, param_grid_rf, cv=5)
    elif name == "KNN":
        grid_search = GridSearchCV(model, param_grid_knn, cv=5)
    elif name == "Neural Network":
        grid_search = GridSearchCV(model, param_grid_nn, cv=5)
    
    grid_search.fit(X_train, y_train)
    print(f"Best parameters for {name}: {grid_search.best_params_}")


Best parameters for Random Forest: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 200}
Best parameters for KNN: {'metric': 'manhattan', 'n_neighbors': 7}


  ret = a @ b
  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)
  ret = a @ b
  (array - array_means[:, np.newaxis]) ** 2, axis=1, weights=weights


Best parameters for Neural Network: {'activation': 'tanh', 'hidden_layer_sizes': (100,), 'learning_rate': 'adaptive', 'solver': 'sgd'}




The outputs: best parameters for Random Forest: {'max_depth': 10, 'min_samples_split': 10,'n_estimators': 300} and best parameters for KNN: {'metric': 'manhattan', 'n_neighbors': 7} indicate the best configurations for each model. Let's look at the losses for these models.

In [16]:
random_forest = RandomForestRegressor(max_depth=10, min_samples_leaf=10, n_estimators=300)
random_forest.fit(X_train, y_train)

y_pred = random_forest.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("MSE for optimised random forest classfier:", mse)

MSE for optimised random forest classfier: 710.4814731002382


In [17]:
KNN_model = KNeighborsRegressor(metric="manhattan", n_neighbors=7)
KNN_model.fit(X_train, y_train)

y_pred = KNN_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("MSE for optimised KNN model:", mse)

MSE for optimised KNN model: 787.015589569161


The neural net never converged. Each of the configurations had only 200 epochs, so let's increase that to 2000 epochs and run the hyperparameter optimisation experiment again.

In [18]:
NN_model = MLPRegressor(max_iter=2000)

param_grid_nn = {
    "batch_size": [128, 256],
    "hidden_layer_sizes": [(50,), (100,), (50, 50)],
    "solver": ["adam", "sgd"],
    "learning_rate": ["invscaling"]
}

grid_search = GridSearchCV(NN_model, param_grid_nn, cv=5)
grid_search.fit(X_train, y_train)

print(f"Best parameters for neural network: {grid_search.best_params_}")

  ret = a @ b
  ret = a @ b
  (array - array_means[:, np.newaxis]) ** 2, axis=1, weights=weights


Best parameters for neural network: {'batch_size': 256, 'hidden_layer_sizes': (50,), 'learning_rate': 'invscaling', 'solver': 'adam'}




In [19]:
NN_model = MLPRegressor(max_iter=2000,
                       batch_size=128,
                       hidden_layer_sizes=(50,),
                       learning_rate="invscaling",
                       solver="adam")
NN_model.fit(X_train, y_train)

y_pred = NN_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("MSE for optimised neural network model:", mse)

MSE for optimised neural network model: 923.3424807802107


The random forest model performs the best. To generate the results for this year, we could feed the model with rider data having the first year remvoed and their results from 2023 added. To do this we will remove the rider's results from 2018, move the results of the riders 2023 -> 2022 ... 2019 -> 2018, and add the results from 2023.

In [20]:
cont_rider_data = rider_data.copy()
# Remove the results from the earliest year
start_index = cont_rider_data.columns.get_loc(f"omloop-het-nieuwsblad/{years[0]}")
# Keep everything before the earliest results
columns_to_keep = list(cont_rider_data.columns[:start_index])
# Keep everything after the earliest results
columns_to_keep.extend(cont_rider_data.columns[start_index+18:])
cont_rider_data = cont_rider_data[columns_to_keep]
# Move all kept columns 18 places to the front and rename them
rename_dict = dict(zip(cont_rider_data.columns, rider_data.columns[:-18]))
cont_rider_data.rename(columns=rename_dict, inplace=True)
cont_rider_data

Unnamed: 0,Name,PCSpoints,Age,Weight,Height,UCI_points,Team_alpecin-deceuninck-2024,Team_arkea-b-b-hotels-2024,Team_astana-qazaqstan-team-2024,Team_bahrain-victorious-2024,Team_bora-hansgrohe-2024,Team_burgos-bh-2024,Team_caja-rural-seguros-rga-2024,Team_cofidis-2024,Team_decathlon-ag2r-la-mondiale-2024,Team_ef-education-easypost-2024,Team_green-project-bardiani-csf-faizane-2024,Team_groupama-fdj-2024,Team_ineos-grenadiers-2024,Team_intermarche-circus-want-2024,Team_israel-premier-tech-2024,Team_lidl-trek-2024,Team_lotto-dstny-2024,Team_movistar-team-2024,Team_q365-pro-cycing-2024,Team_soudal-quick-step-2024,Team_tarteletto-isorex-2024,Team_team-dsm-firmenich-postnl-2024,Team_team-jayco-alula-2024,Team_team-medellin-2024,Team_team-totalenergies-2024,Team_team-visma-lease-a-bike-2024,Team_tudor-pro-cycling-team-2024,Team_uae-team-emirates-2024,Team_uno-x-mobility-2024,ptn/2022,ptn/2023,omloop-het-nieuwsblad/2022,kuurne-brussel-kuurne/2022,gp-samyn/2022,strade-bianche/2022,nokere-koers/2022,bredene-koksijde-classic/2022,milano-sanremo/2022,oxyclean-classic-brugge-de-panne/2022,e3-harelbeke/2022,gent-wevelgem/2022,dwars-door-vlaanderen/2022,ronde-van-vlaanderen/2022,scheldeprijs/2022,paris-roubaix/2022,brabantse-pijl/2022,amstel-gold-race/2022,la-fleche-wallone/2022,liege-bastogne-liege/2022
0,jack-haig,545,30,67,1.90,88,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,56,0,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100
1,julian-alaphilippe,544,31,62,1.73,361,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,95,67,100,100,100,43,100,100,11,100,200,100,29,51,100,100,100,100,100,86
2,tao-geoghegan-hart,544,28,65,1.83,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,0,0,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100
3,cristian-rodriguez,543,29,56,1.78,8,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,5,0,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100
4,andreas-kron,539,25,63,1.77,358,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,16,137,100,100,100,10,100,100,100,100,100,100,100,100,100,100,35,4,101,200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,nils-politt,552,29,80,1.92,477,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,136,191,7,54,100,100,100,100,21,100,13,20,10,20,100,35,100,200,100,100
196,milan-menten,547,27,68,1.74,265,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,24,135,100,100,1,100,3,100,100,100,100,100,100,100,100,100,100,100,100,100
197,vincenzo-albanese,547,27,70,1.75,82,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,56,0,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100
198,juan-sebastian-molano,546,29,72,1.80,120,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,5,81,100,100,100,100,100,7,100,9,100,100,100,100,100,100,100,100,100,100


Now we can used the shifted data to predict riders' results for this year.

In [21]:
this_years_results = random_forest.predict(cont_rider_data.iloc[:, 1:])
this_years_results = pd.DataFrame(this_years_results)
this_years_results.insert(0, "Name", rider_data.iloc[:, 0])
this_years_results

Unnamed: 0,Name,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,jack-haig,100.641616,99.107796,100.873824,98.446090,96.899338,96.387453,92.781705,104.172281,101.922888,100.532435,97.966084,102.064452,97.732384,100.572193,97.665477,98.857139,91.535877,98.582657
1,julian-alaphilippe,90.198175,87.932034,95.583123,73.601570,95.938799,98.951589,70.975320,97.941146,97.226092,87.503718,83.706222,89.126218,95.180648,89.373241,91.166486,78.557050,70.560975,76.264155
2,tao-geoghegan-hart,100.088002,101.314777,100.168508,100.080411,99.861772,99.803659,98.961764,100.054617,102.034175,102.351873,99.959099,100.157613,99.883865,102.499861,100.573974,109.180248,105.599927,101.708072
3,cristian-rodriguez,100.248896,101.969252,100.537519,100.987949,99.583092,99.487064,98.036182,100.137368,100.725378,102.558099,99.779861,100.410677,99.700969,101.409671,101.032180,106.254592,102.577624,102.892290
4,andreas-kron,92.576994,94.832932,97.060435,70.069222,97.147616,98.416711,76.408379,98.449555,93.591182,96.590953,90.273846,88.858837,97.425207,96.790678,81.762846,65.923411,62.280112,69.528448
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
194,jasper-de-buyst,69.333444,70.578740,89.549996,85.332921,88.451953,95.387513,55.333427,85.616121,59.610352,65.740471,62.362446,54.112659,93.347550,49.799338,104.504793,94.289577,88.830593,92.125929
195,nils-politt,94.221530,92.606895,90.385660,78.481774,91.204462,93.101806,84.741887,99.547755,94.561244,90.587934,87.606971,96.255231,90.075797,91.486384,87.595219,76.329281,80.880476,83.522047
196,milan-menten,102.155775,99.904284,102.014320,101.112821,95.010451,93.477272,93.971841,107.637829,104.214683,99.488801,96.332217,106.366419,94.294034,101.604702,97.188276,99.595577,95.695491,101.969619
197,vincenzo-albanese,96.883512,91.138800,93.954428,87.123308,90.569130,85.600915,90.983431,92.368496,103.079688,84.974743,91.994380,103.703751,85.593439,89.814649,91.841571,84.911582,92.115828,94.871570


Now we can sort the riders based on the results for each race. We can then filter these results based on the starting list for each race to get a realistic guess.

In [22]:
this_years_results.iloc[:, 1:] = this_years_results.iloc[:, 1:].rank(method="first")
this_years_results.sample(10)

Unnamed: 0,Name,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
192,diego-ulissi,130.0,144.0,111.0,123.0,178.0,189.0,188.0,98.0,185.0,127.0,191.0,112.0,172.0,184.0,146.0,194.0,196.0,174.0
65,lorenzo-fortunato,111.0,122.0,125.0,144.0,156.0,173.0,179.0,122.0,128.0,177.0,184.0,120.0,194.0,126.0,181.0,163.0,173.0,176.0
73,rigoberto-uran,134.0,135.0,155.0,171.0,125.0,132.0,145.0,143.0,98.0,167.0,162.0,130.0,171.0,118.0,184.0,131.0,140.0,141.0
37,matteo-moschetti,198.0,199.0,184.0,197.0,114.0,52.0,144.0,188.0,84.0,193.0,108.0,198.0,31.0,192.0,78.0,90.0,100.0,115.0
168,stephen-williams,173.0,157.0,165.0,115.0,146.0,113.0,103.0,117.0,94.0,185.0,126.0,169.0,119.0,146.0,71.0,114.0,43.0,62.0
20,mathieu-burgaudeau,159.0,176.0,108.0,183.0,199.0,167.0,108.0,160.0,58.0,197.0,121.0,163.0,114.0,144.0,54.0,36.0,36.0,34.0
35,alberto-dainese,178.0,161.0,47.0,151.0,117.0,49.0,122.0,184.0,116.0,109.0,97.0,185.0,35.0,77.0,53.0,55.0,66.0,56.0
196,milan-menten,180.0,109.0,187.0,141.0,67.0,38.0,120.0,177.0,192.0,96.0,119.0,184.0,73.0,150.0,85.0,124.0,130.0,163.0
96,oscar-sevilla,118.0,123.0,144.0,173.0,142.0,158.0,163.0,133.0,120.0,175.0,174.0,116.0,192.0,125.0,180.0,147.0,157.0,140.0
154,maxim-van-gils,114.0,138.0,112.0,117.0,194.0,197.0,197.0,72.0,170.0,105.0,196.0,94.0,184.0,155.0,127.0,195.0,198.0,197.0


I think better performance is still achievable by doing some feature engineering.