In [1]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
from multiprocessing import Pool
from bs4 import BeautifulSoup
import lightgbm as lgb
import pandas as pd
import numpy as np
import requests
import time
import re

In [2]:
sporza_races = [
    "omloop-het-nieuwsblad",
    "kuurne-brussel-kuurne",
    "gp-samyn",
    "strade-bianche",
    "nokere-koers",
    "bredene-koksijde-classic",
    "milano-sanremo",
    "oxyclean-classic-brugge-de-panne",
    "e3-harelbeke",
    "gent-wevelgem",
    "dwars-door-vlaanderen",
    "ronde-van-vlaanderen",
    "scheldeprijs",
    "paris-roubaix",
    "brabantse-pijl",
    "amstel-gold-race",
    "la-fleche-wallone",
    "liege-bastogne-liege"
]

Let's get the list of riders that start in each of the races this year. These will be fed to the model to be ranked later.

In [3]:
start_info = {}
for race in sporza_races:
    startlist = []
    race_link = f"https://www.procyclingstats.com/race/{race}/2024/startlist/startlist-quality"
    response = requests.get(race_link)
    soup = BeautifulSoup(response.content, "html.parser")
    table = soup.find("table", class_="basic")
    for rider in table.find_all("tr")[1:]:
        try:
            startlist.append(rider.find("a")["href"].split('/')[1])
        except:
            break
    start_info[race] = startlist

In [4]:
from collections import Counter

all_names = [name for names in start_info.values() for name in names]
name_counts = Counter(all_names)
name_counts = sorted(name_counts.items(), key=lambda x: x[1], reverse=True)
print("Riders with the most participations:")
for rank, (name, count) in enumerate(name_counts, 1):
    if count >= 9:
        print(f"{rank}. {name}: {count}")
        
num_unique_names = len(set(name for names in start_info.values() for name in names))
print("")
print(f"There are {num_unique_names}/945 Sporza riders participating so far.")

Riders with the most participations:
1. matej-mohoric: 11
2. edward-theuns: 11
3. jasper-philipsen: 10
4. arnaud-de-lie: 10
5. riley-sheehan: 10
6. matevz-govekar: 10
7. christophe-laporte: 9
8. tim-wellens: 9
9. jordi-meeus: 9
10. jasper-stuyven: 9
11. antonio-morgado: 9
12. sander-de-pestel: 9
13. nils-eekhoff: 9
14. marc-hirschi: 9

There are 462/945 Sporza riders participating so far.


Let's get the top x riders from https://www.procyclingstats.com/rankings.php and create a dictionary to store their respective names (as the procyclingstats identifier of the form first_name-family_name), teams and PCS points.

In [5]:
def get_riders_teams_PCSpoints(n_riders, rider_data):
    for i in range (n_riders//100):
        PCS_ranking_url = f"""https://www.procyclingstats.com/rankings.php?
                          nation=&age=&zage=&page=smallerorequal&team=&offset={100*i}
                          &teamlevel=&filter=Filter"""
        response = requests.get(PCS_ranking_url)
        soup = BeautifulSoup(response.content, "html.parser")
        ranking_table = soup.find("table", class_="basic")
        for row in ranking_table.find_all("tr")[1:]:
            links = row.find_all("a", href=True)
            rider = links[0].get("href").split("/")[-1]
            team = links[1].get("href").split("/")[-1]
            points = links[2].text        
            rider_data[rider] = {"Team": team, "PCSpoints": points}

Let's use the rider identifiers to retrieve their age and weight.

In [6]:
def get_age_weight(rider_data):
    for rider in rider_data.keys():
        rider_profile_url = f"https://www.procyclingstats.com/rider/{rider}"
        response = requests.get(rider_profile_url)
        soup = BeautifulSoup(response.content, "html.parser")
        info = soup.find("div", class_ ="rdr-info-cont")
        fields = info.find_all("b")
        try:
            rider_data[rider]["Age"] = re.search(r'\((.*?)\)', info.text).group(1)
        except(AttributeError):
            rider_data[rider]["Age"] = 27
        for field in fields:
            label = field.get_text(strip=True)
            if label not in ["", "Date of birth:", "Nationality:", "Place of birth:"]:
                value = field.next_sibling.strip()
                rider_data[rider][label] = value

Trying to parallellise the get_age_weight function because it consists of seuentially retrieving the same information of all rider profiles.

In [7]:
def get_age_weight_parallel(rider_data):
    riders = list(rider_data.keys())
    with Pool() as pool:
        results = pool.map(get_age_weight_single_rider, riders)
        for result in results:
            rider = result["rider"]
            del result["rider"]
            rider_data[rider].update(result)
    
def get_age_weight_single_rider(rider):
        rider_profile_url = f"https://www.procyclingstats.com/rider/{rider}"
        response = requests.get(rider_profile_url)
        soup = BeautifulSoup(response.content, "html.parser")
        info = soup.find("div", class_ ="rdr-info-cont")
        print(info)
        nationality = info.find('a').text
        fields = info.find_all("b")
        try:
            age = re.search(r'\((.*?)\)', info.text).group(1)
        except(AttributeError):
            age = 27
        for field in fields:
            label = field.get_text(strip=True)
            weight = 0
            if label not in ["", "Date of birth:", "Nationality:" "Place of birth:"]:
                value = field.next_sibling.strip()
                print(f"{label} is {value}")
                weight = value

        return {"rider": rider, "age": age, "weight": weight, "nationality": nationality}

In [8]:
def get_sporza_results(years, rider_data, DN_values):
    count = 0
    for race in sporza_races:
        for year in years:
            race_name = f"{race}/{year}"
            race_url = f"https://www.procyclingstats.com/race/{race_name}/result"
            try:
                response = requests.get(race_url)
                count += 1
            except:
                print(f"Page {race_url} didn't respond.")
            soup = BeautifulSoup(response.content, "html.parser")
            for row in soup.find("tbody").find_all("tr"):
                rider = row.find("input", class_="gotoH2H").get("data-seo")
                if rider in rider_data.keys():
                    fields = row.find_all("td")
                    placement = fields[0].get_text(strip=True)
                    if placement == "DNS":
                        placement = DN_values[0]
                    if placement == "DNF":
                        placement = DN_values[1]
                    if placement == "OTL":
                        placement = DN_values[1]
                    if placement == "DSQ":
                        placement = DN_values[2]
                    UCI_points = fields[7].get_text(strip=True)
                    ptn = fields[8].get_text(strip=True)
                    rider_data[rider][race_name] = int(placement)
                    if UCI_points != "":
                        if "UCI_points" in rider_data.get(rider, {}):
                            rider_data[rider]["UCI_points"] += int(UCI_points)
                        else:
                             rider_data[rider][f"UCI_points"] = int(UCI_points)
                    if ptn != "":
                        if f"ptn/{year}" in rider_data.get(rider, {}):
                            rider_data[rider][f"ptn/{year}"] += int(ptn)
                        else:
                             rider_data[rider][f"ptn/{year}"] = int(ptn)
    print(f"""Successfully retrieved the results of
          {count}/{len(sporza_races)*len(years)} races.""")

Preprocessing of the retrieved data includes:
* Creating a dataframe of the retrieved data
* Resetting the index so the rider's names are in a separate column
* Renaming some columns
* Converting the team name using a one hot encoding
* Converting PCSpoints and Age from string to int and cut
* Converting the Weight and Heights strings to int and float respectively as well. If a rider has not gained points in a certain season, let's set it to zero instead of NaN.
* Setting missing Weight and Height values to the respective modal values
* Converting all race results to integers. x for DNS, y for DNF/OTL and z for DSQ.

In [9]:
def preprocess_rider_data(years, rider_data):
    # Make a dataframe and change the rider names from being the index to a separate column
    rider_data = pd.DataFrame(rider_data).T
    rider_data.reset_index(inplace=True)
    rider_data.rename(columns={"index": "Name", "Weight:": "Weight", "Height:": "Height"}, inplace=True)
    # One hot encoding of the team names
    rider_data = pd.get_dummies(rider_data, columns=["Team"])
    rider_data["PCSpoints"] = rider_data["PCSpoints"].astype(int)
    rider_data["Age"] = rider_data["Age"].astype(int)
    # Transforming Weight to int + filling missing values witht the modal weight
    rider_data["Weight"] = rider_data["Weight"].apply(
        lambda x: int(x[:2]) if isinstance(x, str) else x)
    rider_data["Weight"].fillna(rider_data["Weight"].mode()[0], inplace=True)
    rider_data["Weight"] = rider_data["Weight"].infer_objects(copy=False)
    # Transforming Weight to float + filling missing values witht the modal height
    rider_data["Height"] = rider_data["Height"].apply(
        lambda x: float(x.split()[0]) if isinstance(x, str) else x)
    rider_data["Height"].fillna(rider_data["Height"].mode()[0], inplace=True)
    rider_data["Height"] = rider_data["Height"].infer_objects(copy=False)
    # Change rider points to 0 if they have None
    for year in years:
        rider_data[f"ptn/{year}"] = rider_data[f"ptn/{year}"].fillna(0).infer_objects(copy=False)
    rider_data['UCI_points'] = rider_data['UCI_points'].fillna(0).infer_objects(copy=False)
    # Change placement to 100 if a rider didn't participate
    rider_data = rider_data.fillna(100)
    
    # There is one rider with team = Team_ so let's drop that column
    rider_data = rider_data.drop("Team_", axis=1)
    # That rider is Pinot and he retired so let's remove him too.
    rider_data = rider_data[rider_data["Name"] != "thibaut-pinot"]
    return rider_data

In [10]:
n_riders = 200
DN_values = [100, 200, 300] # Placement if a rider DNS, DNF/OTL and DSQ respectively
#years = [2018, 2019, 2021, 2022, 2023]
years = [2022, 2023]
rider_data = {}

print("Retrieving rider names, teams and number of PCS points.")
start_time = time.time()
get_riders_teams_PCSpoints(n_riders, rider_data)
end_time = time.time()
print(f"Information retrieved in {end_time - start_time} seconds.")
print("----------------------------------------------------------")

print("Retrieving age and weight.")
start_time = time.time()
get_age_weight(rider_data)
end_time = time.time()
print(f"Information retrieved in {end_time - start_time} seconds.")
print("----------------------------------------------------------")

print("Retrieving race results.")
start_time = time.time()
get_sporza_results(years, rider_data, DN_values)
end_time = time.time()
print(f"Information retrieved in {end_time - start_time} seconds.")
print("----------------------------------------------------------")

print("Preprocessing data.")
rider_data = preprocess_rider_data(years, rider_data)

Retrieving rider names, teams and number of PCS points.
Information retrieved in 0.8614664077758789 seconds.
----------------------------------------------------------
Retrieving age and weight.
Information retrieved in 45.08698296546936 seconds.
----------------------------------------------------------
Retrieving race results.
Successfully retrieved the results of
          36/36 races.
Information retrieved in 15.011163711547852 seconds.
----------------------------------------------------------
Preprocessing data.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  rider_data["Weight"].fillna(rider_data["Weight"].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  rider_data["Height"].fillna(rider_data["Height"].mode()[0], inplace=True)
  rider_data[f"ptn/{year}"] = rider_data[f"ptn/{year}"].fillna(0).infer_objects(copy

Let's get some order into the columns so they can be used to predict future results regardless of the number of years that is used.

In [11]:
# The first columns will always be the same, regardless of the number of years used.
order = ['Name',
         'PCSpoints',
         'Age',
         'Weight',
         'Height',
         'UCI_points',
         'Team_alpecin-deceuninck-2024',
         'Team_arkea-b-b-hotels-2024',
         'Team_astana-qazaqstan-team-2024',
         'Team_bahrain-victorious-2024',
         'Team_bora-hansgrohe-2024',
         'Team_burgos-bh-2024',
         'Team_caja-rural-seguros-rga-2024',
         'Team_cofidis-2024',
         'Team_decathlon-ag2r-la-mondiale-2024',
         'Team_ef-education-easypost-2024',
         'Team_green-project-bardiani-csf-faizane-2024',
         'Team_groupama-fdj-2024',
         'Team_ineos-grenadiers-2024',
         'Team_intermarche-circus-want-2024',
         'Team_israel-premier-tech-2024',
         'Team_lidl-trek-2024',
         'Team_lotto-dstny-2024',
         'Team_movistar-team-2024',
         'Team_q365-pro-cycing-2024',
         'Team_soudal-quick-step-2024',
         'Team_tarteletto-isorex-2024',
         'Team_team-dsm-firmenich-postnl-2024',
         'Team_team-jayco-alula-2024',
         'Team_team-medellin-2024',
         'Team_team-totalenergies-2024',
         'Team_team-visma-lease-a-bike-2024',
         'Team_tudor-pro-cycling-team-2024',
         'Team_uae-team-emirates-2024',
         'Team_uno-x-mobility-2024']
# The number of ptn/{year} categories will depend on the number of years used.
order.extend(f"ptn/{year}" for year in years)
order.extend(f"{race}/{year}" for year in years for race in sporza_races)
rider_data = rider_data[order]

In [12]:
pd.set_option('display.max_columns', None)
rider_data.head()

Unnamed: 0,Name,PCSpoints,Age,Weight,Height,UCI_points,Team_alpecin-deceuninck-2024,Team_arkea-b-b-hotels-2024,Team_astana-qazaqstan-team-2024,Team_bahrain-victorious-2024,Team_bora-hansgrohe-2024,Team_burgos-bh-2024,Team_caja-rural-seguros-rga-2024,Team_cofidis-2024,Team_decathlon-ag2r-la-mondiale-2024,Team_ef-education-easypost-2024,Team_green-project-bardiani-csf-faizane-2024,Team_groupama-fdj-2024,Team_ineos-grenadiers-2024,Team_intermarche-circus-want-2024,Team_israel-premier-tech-2024,Team_lidl-trek-2024,Team_lotto-dstny-2024,Team_movistar-team-2024,Team_q365-pro-cycing-2024,Team_soudal-quick-step-2024,Team_tarteletto-isorex-2024,Team_team-dsm-firmenich-postnl-2024,Team_team-jayco-alula-2024,Team_team-medellin-2024,Team_team-totalenergies-2024,Team_team-visma-lease-a-bike-2024,Team_tudor-pro-cycling-team-2024,Team_uae-team-emirates-2024,Team_uno-x-mobility-2024,ptn/2022,ptn/2023,omloop-het-nieuwsblad/2022,kuurne-brussel-kuurne/2022,gp-samyn/2022,strade-bianche/2022,nokere-koers/2022,bredene-koksijde-classic/2022,milano-sanremo/2022,oxyclean-classic-brugge-de-panne/2022,e3-harelbeke/2022,gent-wevelgem/2022,dwars-door-vlaanderen/2022,ronde-van-vlaanderen/2022,scheldeprijs/2022,paris-roubaix/2022,brabantse-pijl/2022,amstel-gold-race/2022,la-fleche-wallone/2022,liege-bastogne-liege/2022,omloop-het-nieuwsblad/2023,kuurne-brussel-kuurne/2023,gp-samyn/2023,strade-bianche/2023,nokere-koers/2023,bredene-koksijde-classic/2023,milano-sanremo/2023,oxyclean-classic-brugge-de-panne/2023,e3-harelbeke/2023,gent-wevelgem/2023,dwars-door-vlaanderen/2023,ronde-van-vlaanderen/2023,scheldeprijs/2023,paris-roubaix/2023,brabantse-pijl/2023,amstel-gold-race/2023,la-fleche-wallone/2023,liege-bastogne-liege/2023
0,tadej-pogacar,2994,25,66.0,1.76,3298,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,521,955,100,100,100,1,100,100,5,100,100,100,10,4,100,100,100,100,12,100,100,100,100,100,100,100,4,100,3,100,100,1,100,100,100,1,1,200
1,remco-evenepoel,2794,24,61.0,1.71,1378,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,320,275,100,100,100,100,100,100,100,100,100,100,100,100,100,100,6,100,43,1,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,1
2,jonas-vingegaard,2715,27,60.0,1.75,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,0,0,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,200,200,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100
3,primoz-roglic,2526,34,65.0,1.77,30,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,26,0,100,100,100,100,100,100,17,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100
4,wout-van-aert,2402,29,78.0,1.9,3890,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,904,795,1,100,100,100,100,100,8,100,1,12,100,100,100,2,100,100,100,3,100,100,100,100,100,100,3,100,1,2,100,4,100,3,100,100,100,100


The race results of 2023 will be the target variables used to train the model. Next riders will be fed with the first year removed and the 2023 results included and the model will be asked to predict the 2024 results based on the start list for each race.

In [13]:
X = rider_data.iloc[:, 1:-len(sporza_races)]
y = rider_data.iloc[:, -len(sporza_races):]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Model training.

In [14]:
linear_regression_model = LinearRegression()
linear_regression_model.fit(X_train, y_train)

y_pred = linear_regression_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("MSE for linear regression:", mse)

MSE for linear regression: 1716.8914099719389


In [15]:
random_forest = RandomForestRegressor()
random_forest.fit(X_train, y_train)

y_pred = random_forest.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("MSE for random forest classfier:", mse)

MSE for random forest classfier: 1175.3259295833332


In [16]:
KNN_model = KNeighborsRegressor()
KNN_model.fit(X_train, y_train)

y_pred = KNN_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("MSE for KNN model:", mse)

MSE for KNN model: 1343.9654999999998


In [17]:
NN_model = MLPRegressor()
NN_model.fit(X_train, y_train)

y_pred = NN_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("MSE for neural network model:", mse)

MSE for neural network model: 1440.1951379603877




Linear regression performs the worst and additionally doesn't have many hyperparameters to configure. It makes sense to further vary the parameters of the other 3 models.

In [18]:
param_grid_rf = {
    "n_estimators": [100, 200, 300],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5, 10]
}

param_grid_knn = {
    "n_neighbors": [3, 5, 7],
    "metric": ['euclidean', 'manhattan']
}

param_grid_nn = {
    "hidden_layer_sizes": [(50,), (100,), (50, 50)],
    "activation": ["relu", "tanh"],
    "solver": ["adam", "sgd"],
    "learning_rate": ["constant", "adaptive"]
}

# Initialize models
models = {
    "Random Forest": RandomForestRegressor(),
    "KNN": KNeighborsRegressor(),
    "Neural Network": MLPRegressor()
}

# Perform GridSearchCV for each model
for name, model in models.items():
    if name == "Linear Regression":
        grid_search = GridSearchCV(model, param_grid_linear, cv=5)
    elif name == "Random Forest":
        grid_search = GridSearchCV(model, param_grid_rf, cv=5)
    elif name == "KNN":
        grid_search = GridSearchCV(model, param_grid_knn, cv=5)
    elif name == "Neural Network":
        grid_search = GridSearchCV(model, param_grid_nn, cv=5)
    
    grid_search.fit(X_train, y_train)
    print(f"Best parameters for {name}: {grid_search.best_params_}")


Best parameters for Random Forest: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 300}
Best parameters for KNN: {'metric': 'manhattan', 'n_neighbors': 7}


  return ((y_true - y_pred) ** 2).mean() / 2
  ret = a @ b
  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0, dtype=np.float64)
  return ((y_true - y_pred) ** 2).mean() / 2
  ret = a @ b
  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0, dtype=np.float64)
 -3.47687187e-01 -9.29417486e+07 -4.81567606e-01 -4.11904381e+14
 -3.63285699e-01            -inf -2.52825829e-01            -inf
 -1.38500556e+01 -1.16659192e-01 -1.39111883e+01 -4.72929497e-01
 -1.10377093e+01 -9.49211562e-02 -1.08635454e+01 -5.24691094e-02
 -1.36065619e+01 -2.06501747e-01 -1.35380329e+01 -4.06046527e-01]
  (array - array_means[:, np.newaxis]) ** 2, axis=1, weights=weights


Best parameters for Neural Network: {'activation': 'tanh', 'hidden_layer_sizes': (100,), 'learning_rate': 'adaptive', 'solver': 'sgd'}


The outputs: best parameters for Random Forest: {'max_depth': 10, 'min_samples_split': 10,'n_estimators': 300} and best parameters for KNN: {'metric': 'manhattan', 'n_neighbors': 7} indicate the best configurations for each model. Let's look at the losses for these models.

In [19]:
random_forest = RandomForestRegressor(max_depth=10, min_samples_leaf=10, n_estimators=300)
random_forest.fit(X_train, y_train)

y_pred = random_forest.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("MSE for optimised random forest classfier:", mse)

MSE for optimised random forest classfier: 1153.4501534103338


In [20]:
KNN_model = KNeighborsRegressor(metric="manhattan", n_neighbors=7)
KNN_model.fit(X_train, y_train)

y_pred = KNN_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("MSE for optimised KNN model:", mse)

MSE for optimised KNN model: 1272.2173469387753


The neural net never converged. Each of the configurations had only 200 epochs, so let's increase that tà 500 epochs and run the hyperparameter optimisation experiment again.

In [21]:
NN_model = MLPRegressor(max_iter=2000)

param_grid_nn = {
    "batch_size": [128, 256],
    "hidden_layer_sizes": [(50,), (100,), (50, 50)],
    "solver": ["adam", "sgd"],
    "learning_rate": ["invscaling"]
}

grid_search = GridSearchCV(NN_model, param_grid_nn, cv=5)
grid_search.fit(X_train, y_train)

print(f"Best parameters for neural network: {grid_search.best_params_}")

  ret = a @ b
  ret = a @ b
  (array - array_means[:, np.newaxis]) ** 2, axis=1, weights=weights


Best parameters for neural network: {'batch_size': 128, 'hidden_layer_sizes': (50,), 'learning_rate': 'invscaling', 'solver': 'adam'}


In [22]:
NN_model = MLPRegressor(max_iter=2000,
                       batch_size=128,
                       hidden_layer_sizes=(50,),
                       learning_rate="invscaling",
                       solver="adam")
NN_model.fit(X_train, y_train)

y_pred = NN_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("MSE for optimised neural network model:", mse)

MSE for optimised neural network model: 1349.331319459857


The random forest model performs the best. To generate the results for this year, we could feed the model with rider data having the first year remvoed and their results from 2023 added. To do this we will remove the rider's results from 2018, move the results of the riders 2023 -> 2022 ... 2019 -> 2018, and add the results from 2023.

Name needs to be in each row somehow, otherwise we won't be able to link predictions to riders.

In [23]:
cont_rider_data = rider_data.copy()
cont_rider_data.head()

Unnamed: 0,Name,PCSpoints,Age,Weight,Height,UCI_points,Team_alpecin-deceuninck-2024,Team_arkea-b-b-hotels-2024,Team_astana-qazaqstan-team-2024,Team_bahrain-victorious-2024,Team_bora-hansgrohe-2024,Team_burgos-bh-2024,Team_caja-rural-seguros-rga-2024,Team_cofidis-2024,Team_decathlon-ag2r-la-mondiale-2024,Team_ef-education-easypost-2024,Team_green-project-bardiani-csf-faizane-2024,Team_groupama-fdj-2024,Team_ineos-grenadiers-2024,Team_intermarche-circus-want-2024,Team_israel-premier-tech-2024,Team_lidl-trek-2024,Team_lotto-dstny-2024,Team_movistar-team-2024,Team_q365-pro-cycing-2024,Team_soudal-quick-step-2024,Team_tarteletto-isorex-2024,Team_team-dsm-firmenich-postnl-2024,Team_team-jayco-alula-2024,Team_team-medellin-2024,Team_team-totalenergies-2024,Team_team-visma-lease-a-bike-2024,Team_tudor-pro-cycling-team-2024,Team_uae-team-emirates-2024,Team_uno-x-mobility-2024,ptn/2022,ptn/2023,omloop-het-nieuwsblad/2022,kuurne-brussel-kuurne/2022,gp-samyn/2022,strade-bianche/2022,nokere-koers/2022,bredene-koksijde-classic/2022,milano-sanremo/2022,oxyclean-classic-brugge-de-panne/2022,e3-harelbeke/2022,gent-wevelgem/2022,dwars-door-vlaanderen/2022,ronde-van-vlaanderen/2022,scheldeprijs/2022,paris-roubaix/2022,brabantse-pijl/2022,amstel-gold-race/2022,la-fleche-wallone/2022,liege-bastogne-liege/2022,omloop-het-nieuwsblad/2023,kuurne-brussel-kuurne/2023,gp-samyn/2023,strade-bianche/2023,nokere-koers/2023,bredene-koksijde-classic/2023,milano-sanremo/2023,oxyclean-classic-brugge-de-panne/2023,e3-harelbeke/2023,gent-wevelgem/2023,dwars-door-vlaanderen/2023,ronde-van-vlaanderen/2023,scheldeprijs/2023,paris-roubaix/2023,brabantse-pijl/2023,amstel-gold-race/2023,la-fleche-wallone/2023,liege-bastogne-liege/2023
0,tadej-pogacar,2994,25,66.0,1.76,3298,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,521,955,100,100,100,1,100,100,5,100,100,100,10,4,100,100,100,100,12,100,100,100,100,100,100,100,4,100,3,100,100,1,100,100,100,1,1,200
1,remco-evenepoel,2794,24,61.0,1.71,1378,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,320,275,100,100,100,100,100,100,100,100,100,100,100,100,100,100,6,100,43,1,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,1
2,jonas-vingegaard,2715,27,60.0,1.75,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,0,0,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,200,200,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100
3,primoz-roglic,2526,34,65.0,1.77,30,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,26,0,100,100,100,100,100,100,17,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100
4,wout-van-aert,2402,29,78.0,1.9,3890,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,904,795,1,100,100,100,100,100,8,100,1,12,100,100,100,2,100,100,100,3,100,100,100,100,100,100,3,100,1,2,100,4,100,3,100,100,100,100


In [24]:
#n_races = len(sporza_races)*len(years)
#order = cont_rider_data.columns
#order[-n_races:-n_races+len(sporza_races)] = list(cont_rider_data.columns[-n_races+18:-n_races+36])
#cont_rider_data.columns[-n_races+18:-n_races+36]
#cont_rider_data

#cont_rider_data = cont_rider_data[order]

