In [1]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
from multiprocessing import Pool
from bs4 import BeautifulSoup
import lightgbm as lgb
import pandas as pd
import numpy as np
import threading
import requests
import time
import re

This next section will implement Sporza's point system for the Giro.

In [2]:
# Points table indexed by stage result, best result first (25 entries).
# NOTE(review): presumably positions 1-25 of a stage — confirm against Sporza's rules.
stage_points = [100, 80, 65, 55, 45, 35, 30, 25, 20, 17, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
km_points = [25, 20, 15, 10, 5] # The main man (presumably the "kopman") gets a bonus if he finishes in the top 5
teammate_bonus = 5 # Goes to all teammates of the winner
# NOTE(review): the bonuses below are presumably awarded per stage (combativity
# prize and the pink/purple/blue/white classifications) — confirm.
combative_bonus = 10
pink_bonus = 20
purple_bonus, blue_bonus = 10, 10
white_bonus = 5

# Final standings bonuses, indexed by final position (best first)
final_pink_bonus = [200, 150, 120, 110, 100, 90, 80, 70, 60, 50, 47, 45, 42, 40, 37, 35, 32, 30, 27,
                   25, 24, 23, 22, 21, 20, 19, 18, 17, 15, 12] # All other riders get 4 points
final_purple_bonus, final_blue_bonus = [80, 60, 40, 30, 25, 20, 15, 10, 5, 2], [80, 60, 40, 30, 25, 20, 15, 10, 5, 2]
final_white_bonus = [60, 48, 32, 24, 20, 16, 12, 8, 3, 1]

The Giro results will be presented as a 21 × (number of riders) table. A second table will provide the kopman, combative, pink, purple, blue and white riders for each race.

In [3]:
def calc_points_per_rider(giro_riders, giro_results):
    """Create the points tally for every rider, starting at zero.

    Parameters
    ----------
    giro_riders : iterable of hashable
        Rider names; each becomes a key of the returned tally.
    giro_results :
        Currently unused.
        TODO: apply the scoring tables to these results; until then every
        rider's total stays at 0.

    Returns
    -------
    dict
        Maps each rider name to its points total (0 for now). Previously the
        tally was built but never returned, so callers always got ``None``.
    """
    return {name: 0 for name in giro_riders}

In [4]:
# Race identifiers used to build procyclingstats.com result URLs of the form
# https://www.procyclingstats.com/race/<identifier>/<year>/result
# (see get_sporza_results_threaded). They must match the site's URL slugs
# exactly, so do not "correct" their spelling here.
sporza_races = [
    "omloop-het-nieuwsblad",
    "kuurne-brussel-kuurne",
    "gp-samyn",
    "strade-bianche",
    "nokere-koers",
    "bredene-koksijde-classic",
    "milano-sanremo",
    "oxyclean-classic-brugge-de-panne",
    "e3-harelbeke",
    "gent-wevelgem",
    "dwars-door-vlaanderen",
    "ronde-van-vlaanderen",
    "scheldeprijs",
    "paris-roubaix",
    "brabantse-pijl",
    "amstel-gold-race",
    "la-fleche-wallone",
    "liege-bastogne-liege"
]

In [5]:
# Scrape the Giro d'Italia 2024 "startlist quality" table and store, per rider
# identifier, the text of the third and fourth cells of the rider's row
# (kept under the keys "PCS_ranking" and "points").
rider_info = {}
giro_link = "https://www.procyclingstats.com/race/giro-d-italia/2024/startlist/startlist-quality"
response = requests.get(giro_link, timeout=30)
response.raise_for_status()  # fail with a clear HTTP error instead of a confusing parse error
soup = BeautifulSoup(response.content, "html.parser")
ranking_table = soup.find("table", class_="basic")
if ranking_table is None:
    raise RuntimeError(f"No ranking table found on {giro_link}")
for row in ranking_table.find_all("tr")[1:]:  # the first row is skipped (presumably the header)
    row_data = [td.get_text() for td in row.find_all('td')]
    rider_link = str(row.find("a", href=True))
    match = re.search(r'rider/(.+?)">', rider_link)
    # Rows without a rider link (or with too few cells) are skipped. Previously
    # such a row reused the rider name of the preceding row and overwrote that
    # rider's entry, or raised NameError when it was the first row.
    if match is None or len(row_data) < 4:
        continue
    rider_name = match.group(1)
    PCS_ranking = row_data[2]
    points = row_data[3]
    rider_info[rider_name] = {"PCS_ranking": PCS_ranking,
                              "points": points}

In [6]:
def get_riders_teams_PCSpoints_threaded(n_riders, rider_data):
    """Add team and PCS points of the top ``n_riders`` of the PCS ranking to ``rider_data``.

    The ranking is fetched in pages of 100 rows, one thread per page. For every
    ranking row the rider identifier and team identifier (last path segment of
    the first and second link) and the text of the third link are stored as
    ``rider_data[rider] = {"Team": team, "PCSpoints": points}``.

    Parameters
    ----------
    n_riders : int
        Number of ranking rows to read. Values that are not a multiple of 100
        are honoured: the last page is only read up to the requested row.
    rider_data : dict
        Filled in place; nothing is returned.
    """
    page_size = 100

    # Defines the functionality to add information for one ranking page
    def get_riders_teams_PCSpoints_single(url, max_rows):
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")
        ranking_table = soup.find("table", class_="basic")
        if ranking_table is None:
            # Without this guard the thread died with an AttributeError.
            print(f"Page {url} didn't contain a ranking table.")
            return
        # [1:] skips the first row (presumably the header)
        for row in ranking_table.find_all("tr")[1:1 + max_rows]:
            links = row.find_all("a", href=True)
            if len(links) < 3:
                # Unexpected row layout: skip it instead of losing the rest of the page.
                continue
            rider = links[0].get("href").split("/")[-1]
            team = links[1].get("href").split("/")[-1]
            points = links[2].text
            rider_data[rider] = {"Team": team, "PCSpoints": points}

    # Build one URL per page. The URL is assembled from adjacent string literals:
    # the former triple-quoted f-string embedded newlines and indentation in the
    # query string that was sent to the server.
    page_jobs = []
    n_pages = (n_riders + page_size - 1) // page_size  # round up so a partial page is fetched too
    for i in range(n_pages):
        offset = page_size * i
        PCS_ranking_url = ("https://www.procyclingstats.com/rankings.php?"
                           "nation=&age=&zage=&page=smallerorequal&team="
                           f"&offset={offset}&teamlevel=&filter=Filter")
        page_jobs.append((PCS_ranking_url, min(page_size, n_riders - offset)))

    threads = []
    for url, max_rows in page_jobs:
        thread = threading.Thread(target=get_riders_teams_PCSpoints_single, args=(url, max_rows))
        threads.append(thread)
        thread.start()
    for thread in threads:
        thread.join()

Let's use the rider identifiers to retrieve their age, weight, height, and nationality. Since the same function has to be run for each rider, multithreading will speed it up a lot.

In [7]:
def get_age_weight_threaded(rider_data):
    """Add age, weight, height and nationality to every rider in ``rider_data``.

    One thread per rider fetches ``https://www.procyclingstats.com/rider/<rider>``
    and updates ``rider_data[rider]`` in place with the keys "Age", "Weight",
    "Height" and "Nationality".

    * "Age" is the first parenthesised text of the info block, or 27 when
      there is none.
    * "Weight" / "Height" are the raw texts following the "Weight:" /
      "Height:" labels, or 0 when the label is absent.
    * "Nationality" is the text of the first link of the info block, or None
      when there is no link.

    A rider whose page cannot be fetched, or has no info block, is left
    untouched and a message is printed.

    Parameters
    ----------
    rider_data : dict
        Maps rider identifiers to dicts; updated in place, nothing is returned.
    """
    # Defines the functionality to add information for one rider
    def get_age_weight_single(rider):
        rider_profile_url = f"https://www.procyclingstats.com/rider/{rider}"
        try:
            # The timeout keeps one hanging request from blocking join() forever.
            response = requests.get(rider_profile_url, timeout=30)
        except requests.RequestException:
            print(f"Page {rider_profile_url} didn't respond.")
            return
        soup = BeautifulSoup(response.content, "html.parser")
        info = soup.find("div", class_ ="rdr-info-cont")
        if info is None:
            # Previously this case killed the thread with an AttributeError.
            print(f"Page {rider_profile_url} didn't contain rider information.")
            return

        nationality_link = info.find('a')
        nationality = nationality_link.text if nationality_link is not None else None

        age_match = re.search(r'\((.*?)\)', info.text)
        age = age_match.group(1) if age_match else 27

        weight = 0
        height = 0
        for field in info.find_all("b"):
            label = field.get_text(strip=True)
            value = field.next_sibling
            # Only plain text nodes carry the value; a tag or a missing sibling
            # would have raised when calling .strip() on it.
            if not isinstance(value, str):
                continue
            if label == "Weight:":
                weight = value.strip()
            if label == "Height:":
                height = value.strip()
        rider_data[rider].update({"Age": age, "Weight": weight,
                                  "Height": height, "Nationality": nationality})

    # Start one thread per rider and wait for all of them
    riders = list(rider_data.keys())
    threads = []
    for rider in riders:
        thread = threading.Thread(target=get_age_weight_single, args=(rider,))
        threads.append(thread)
        thread.start()
    for thread in threads:
        thread.join()

In [8]:
def get_sporza_results_threaded(years, rider_data, DN_values):
    """Add the results of all ``sporza_races`` for the given years to ``rider_data``.

    One thread per race edition fetches
    ``https://www.procyclingstats.com/race/<race>/<year>/result``. For every
    result row whose rider is a key of ``rider_data`` it stores, in place:

    * ``rider_data[rider]["<race>/<year>"]``: the placement as int,
    * ``rider_data[rider]["UCI_points"]``: running sum of the 8th cell,
    * ``rider_data[rider]["ptn/<year>"]``: running sum of the 9th cell per year.

    Parameters
    ----------
    years : iterable
        Editions to fetch for every race.
    rider_data : dict
        Maps rider identifiers to dicts; updated in place.
    DN_values : sequence of 3 ints
        Placement substituted for DNS, DNF (also used for OTL) and DSQ.

    Prints how many pages contained a results table; returns nothing.
    """
    # Defines the functionality to add information for one race
    def get_sporza_results_single(race_url):
        nonlocal count # Used to get a consistent counter across all threads
        try:
            response = requests.get(race_url)
        except requests.RequestException:
            print(f"Page {race_url} didn't respond.")
            # Stop here: previously execution continued and failed on the
            # unbound `response` variable.
            return
        soup = BeautifulSoup(response.content, "html.parser")
        results_body = soup.find("tbody")
        if results_body is None:
            print(f"Page {race_url} didn't contain a results table.")
            return
        with count_lock:
            count += 1

        url_parts = race_url.split("/")
        race_name = "/".join(url_parts[4:6])  # "<race>/<year>"
        year = url_parts[5]
        for row in results_body.find_all("tr"):
            rider_input = row.find("input", class_="gotoH2H")
            fields = row.find_all("td")
            if rider_input is None or len(fields) < 9:
                continue  # not a regular result row
            rider = rider_input.get("data-seo")
            if rider not in rider_data:
                continue
            placement = fields[0].get_text(strip=True)
            placement = placement_mapping.get(placement, placement)
            UCI_points = fields[7].get_text(strip=True)
            ptn = fields[8].get_text(strip=True)
            try:
                placement = int(placement)
                UCI_points = int(UCI_points) if UCI_points != "" else None
                ptn = int(ptn) if ptn != "" else None
            except ValueError:
                # One odd row must not abort the remaining rows of this page.
                print(f"Skipping unparseable result row for {rider} on {race_url}.")
                continue
            # The accumulators of one rider are shared by the threads of all
            # races, so the read-modify-write has to happen under a lock.
            with data_lock:
                results = rider_data[rider]
                results[race_name] = placement
                if UCI_points is not None:
                    results["UCI_points"] = results.get("UCI_points", 0) + UCI_points
                if ptn is not None:
                    results[f"ptn/{year}"] = results.get(f"ptn/{year}", 0) + ptn

    # Get all the url and multithread the previously defined function
    placement_mapping = {"DNS": DN_values[0], "DNF": DN_values[1], "OTL": DN_values[1], "DSQ": DN_values[2]}
    count_lock = threading.Lock()
    data_lock = threading.Lock()
    race_urls, count = [], 0
    for race in sporza_races:
        for year in years:
            race_name = f"{race}/{year}"
            race_urls.append(f"https://www.procyclingstats.com/race/{race_name}/result")
    threads = []
    for race_url in race_urls:
        thread = threading.Thread(target=get_sporza_results_single, args=(race_url, ))
        threads.append(thread)
        thread.start()
    for thread in threads:
        thread.join()
    print(f"""Successfully retrieved
              the results of {count}/{len(sporza_races)*len(years)} races.""")

Preprocessing of the retrieved data includes:
* Creating a dataframe of the retrieved data
* Resetting the index so the rider's names are in a separate column
* Renaming some columns
* Converting the team name using a one hot encoding
* Converting PCSpoints and Age from string to int and cut
* Converting the Weight and Height strings to int and float respectively. If a rider has not scored any points in a certain season, the value is set to zero instead of NaN.
* Setting missing Weight and Height values to the respective modal values
* Converting all race results to integers, where DNS, DNF/OTL and DSQ are replaced by the three configurable placeholder placements in `DN_values` (in that order).

In [9]:
def _leading_number(value):
    """Return the number at the start of a string such as "68 kg" or "1.80 m".

    Non-string values are passed through unchanged; strings that do not start
    with a number become NaN so that they are treated as missing.
    """
    if not isinstance(value, str):
        return value
    found = re.match(r"\s*(\d+(?:\.\d+)?)", value)
    return float(found.group(1)) if found else np.nan


def preprocess_rider_data(years, rider_data):
    """Turn the scraped ``rider_data`` dict into a numeric DataFrame.

    Steps:

    * one row per rider, the rider identifier in the column "Name";
    * one-hot encoding of "Team";
    * "PCSpoints" and "Age" as int (missing age -> 27);
    * "Weight" as int and "Height" as float, parsed from strings such as
      "68 kg" / "1.80 m"; missing values, including the scraper's ``0``
      placeholder, are replaced by the modal value of the column;
    * "ptn/<year>" for every year and "UCI_points" as int, 0 when missing;
    * every remaining missing value (e.g. races not ridden) becomes 100;
    * the "Team_" column (riders with an empty team) and the retired rider
      "thibaut-pinot" are removed.

    Parameters
    ----------
    years : iterable
        Years for which a "ptn/<year>" column must exist.
    rider_data : dict
        Maps rider identifiers to dicts of scraped values; not modified.

    Returns
    -------
    pandas.DataFrame
    """
    try:
        pd.set_option('future.no_silent_downcasting', True) # Suggested by a warning
    except KeyError:
        # pandas' OptionError subclasses KeyError; the option does not exist in
        # every pandas version and is only needed to silence that warning.
        pass

    # Make a dataframe and change the rider names from being the index to a separate column
    riders = pd.DataFrame(rider_data).T.reset_index()
    riders = riders.rename(columns={"index": "Name", "Weight:": "Weight", "Height:": "Height"})
    # One hot encoding of the team names
    riders = pd.get_dummies(riders, columns=["Team"])
    riders["PCSpoints"] = riders["PCSpoints"].astype(int)
    riders["Age"] = riders["Age"].fillna(27).astype(int)

    def modal_filled(column):
        """Parse a measurement column and fill unknown values with its mode."""
        values = pd.to_numeric(riders[column].map(_leading_number), errors="coerce")
        # The scraper stores 0 for an unknown measurement. Treat it as missing
        # so that it neither survives the fill nor becomes the mode itself.
        values = values.where(values > 0)
        modes = values.mode()
        # Without a single known value there is no mode; fall back to 0.
        return values.fillna(modes.iloc[0] if not modes.empty else 0)

    # Weight as whole kilograms. The full leading number is parsed; slicing the
    # first two characters turned e.g. "102 kg" into 10.
    riders["Weight"] = modal_filled("Weight").astype(int)
    # Height as float
    riders["Height"] = modal_filled("Height").astype(float)

    # Change rider points to 0 if they have None. The column is created when no
    # rider scored at all, which previously raised a KeyError.
    for column in [f"ptn/{year}" for year in years] + ["UCI_points"]:
        if column not in riders.columns:
            riders[column] = 0
        riders[column] = pd.to_numeric(riders[column], errors="coerce").fillna(0).astype(int)

    # Change placement to 100 if a rider didn't participate
    # (note: this fills every remaining missing value, whatever the column)
    riders = riders.fillna(100)

    # A rider with an empty team produces the column "Team_"; drop it when present
    riders = riders.drop(columns="Team_", errors="ignore")
    # That rider is Pinot and he retired so let's remove him too.
    riders = riders[riders["Name"] != "thibaut-pinot"]
    return riders

n_riders = 200
# Placement if a rider DNS, DNF/OTL and DSQ respectively
DN_values = [100, 200, 300]
# A longer history is available as well: [2018, 2019, 2021, 2022, 2023]
years = [2022, 2023]
rider_data = {}

_separator = "----------------------------------------------------------"

# The three scraping stages fill `rider_data` in place and are reported in the
# same way, so they are run from a table of (announcement, action) pairs.
_scraping_stages = (
    ("Retrieving rider names, teams and number of PCS points (multithreaded).",
     lambda: get_riders_teams_PCSpoints_threaded(n_riders, rider_data)),
    ("Retrieving age and weight (multithreaded).",
     lambda: get_age_weight_threaded(rider_data)),
    ("Retrieving race results (multithreaded).",
     lambda: get_sporza_results_threaded(years, rider_data, DN_values)),
)
for _announcement, _run_stage in _scraping_stages:
    print(_announcement)
    start_time = time.time()
    _run_stage()
    end_time = time.time()
    print(f"Information retrieved in {end_time - start_time} seconds.")
    print(_separator)

# Preprocessing replaces the dict by the resulting DataFrame
print("Preprocessing data.")
start_time = time.time()
rider_data = preprocess_rider_data(years, rider_data)
end_time = time.time()
print(f"Data preprocessed in {end_time - start_time} seconds.")
print(_separator)