In [None]:
# import the relevant .csv files
import pandas as pd
# Load the four source tables in one pass; paths are relative to the working directory.
# The notes list the columns that later cells of this notebook read from each table.
tdf_riders, tdf_riders_history, all_tdf_stages, all_riders = (
    pd.read_csv(csv_path)
    for csv_path in (
        'TDF_Riders_Info.csv',     # 'Rider', 'Tour de Frances participated in', points and physiology columns
        'TDF_Riders_History.csv',  # 'Year', 'Rider', 'TotalSeconds'
        'TDF_Stages_History.csv',  # 'Year', 'TotalTDFDistance', 'Stage Type', 'Distance (km)'
        'All_Riders_Info.csv',     # 'Rider' and the four points columns
    )
)

Data Cleaning

In [None]:
# Every cell of this column is a ", "-separated string of edition years for one rider.
all_tdf_participated_in = tdf_riders['Tour de Frances participated in']

# Gather each distinct edition once, then order them chronologically.
unique_editions = {
    int(edition_text)
    for participation_text in all_tdf_participated_in
    for edition_text in participation_text.split(", ")
}
tdf_editions = sorted(unique_editions)

# These are the editions the available riders have raced in; route information
# for all stages of each of them is needed further down.
print(f"Tour de Frances: {tdf_editions}")
print(f"Number of Tour de Frances: {len(tdf_editions)}")

In [None]:
def tdf_total_finish_time_column_name(tdf_edition):
    """Build the name of the column that stores total finish times for one edition.

    Args:
        tdf_edition: Edition identifier; the notebook passes the edition year as an int.

    Returns:
        str: ``'<tdf_edition> TDF Total Finish Time (s)'``.
    """
    return '{} TDF Total Finish Time (s)'.format(tdf_edition)

# Create one finish-time column per edition first. The empty string is the
# placeholder for "no time recorded": it stays in the cell when a rider did not
# take part in an edition or has no usable row in tdf_riders_history.
for tdf_edition in tdf_editions:
    tdf_riders[tdf_total_finish_time_column_name(tdf_edition)] = ''

# Index the history once as (year, rider name) -> total seconds instead of
# filtering the dataframe again for every rider/edition pair. When the history
# holds several rows for the same key, the last one wins.
finish_time_lookup = dict(zip(
    zip(tdf_riders_history['Year'].tolist(), tdf_riders_history['Rider'].tolist()),
    tdf_riders_history['TotalSeconds'].tolist(),
))

# (rider, edition) pairs a rider is listed for but that have no finish time in the history
riders_without_finish_time = []

# Snapshot the columns we read so that writing into tdf_riders below cannot
# interfere with the iteration.
rider_rows = zip(
    tdf_riders.index.tolist(),
    tdf_riders['Rider'].tolist(),
    tdf_riders['Tour de Frances participated in'].tolist(),
)

# add the total finish time of every edition a rider took part in to tdf_riders
for rider_label, rider, participation_text in rider_rows:
    for tdf_edition in (int(edition_text) for edition_text in participation_text.split(", ")):
        # the history is looked up with the upper-cased rider name
        finish_time = finish_time_lookup.get((tdf_edition, rider.upper()))

        if finish_time is None or pd.isna(finish_time):
            # Previously an unmatched rider ended up with the junk token ')' taken
            # from the text rendering of an empty Series; keep the '' placeholder instead.
            riders_without_finish_time.append((rider, tdf_edition))
            continue

        # Stored as text (as before) so the column stays homogeneous with its ''
        # placeholder; str() of the value avoids going through display formatting.
        # Use the real index label rather than a positional counter.
        tdf_riders.at[rider_label, tdf_total_finish_time_column_name(tdf_edition)] = str(finish_time)

if riders_without_finish_time:
    print(f"No finish time found for {len(riders_without_finish_time)} rider/edition pair(s): {riders_without_finish_time}")

Feature Engineering

In [None]:
# Numeric code assigned to each stage type; the codes are the weights of the
# distance-weighted average computed below.
tdf_edition_type_indices_reference = {
    'Individual Time Trial': 1,
    'Team Time Trial': 2,
    'Flat': 3,
    'Hilly': 4,
    'Mountain Time Trial': 5,
    'Mountain': 6,
}
tdf_edition_type_indices = {}
tdf_edition_total_distances = {}

# Keep only the stages of the editions raced by the available riders.
tdf_stages_information = all_tdf_stages.loc[all_tdf_stages['Year'].isin(tdf_editions)]

# Overall type of an edition = sum(stage distance * stage type code) / total edition distance.
for edition_year in tdf_editions:
    edition_stages = tdf_stages_information.loc[tdf_stages_information['Year'] == edition_year]
    # the total distance is read from the first stage row of the edition
    edition_distance = float(edition_stages.iloc[0]['TotalTDFDistance'])

    weighted_type_sum = 0
    for stage_type, stage_distance in zip(edition_stages['Stage Type'], edition_stages['Distance (km)']):
        weighted_type_sum += stage_distance * tdf_edition_type_indices_reference[stage_type]

    tdf_edition_type_indices[edition_year] = weighted_type_sum / edition_distance
    tdf_edition_total_distances[edition_year] = edition_distance

# Overall type index per edition: the higher the index, the more mountainous the edition.
print(tdf_edition_type_indices)

print(tdf_edition_total_distances)

In [None]:
time_trial = 'Points (Time Trial)'
sprint = 'Points (Sprint)'
gc = 'Points (GC)'
climb = 'Points (Climb)'
overall_type = 'Overall Type Index'
# weight of each points category in the overall type index
rider_type_indices_reference = {
    time_trial: 1,
    sprint: 2,
    gc: 3,
    climb: 4
}


def _overall_type_index(riders_dataframe):
    """Return the points-weighted average rider type for every row of a riders table.

    For each rider the index is
    ``sum(category weight * category points) / sum(category points)`` over the
    four points columns listed in ``rider_type_indices_reference``.

    Args:
        riders_dataframe: DataFrame containing the four points columns.

    Returns:
        pandas.Series aligned with ``riders_dataframe.index``. Riders whose points
        add up to zero get NaN instead of triggering a division by zero.
    """
    weighted_points = sum(
        riders_dataframe[points_column] * type_weight
        for points_column, type_weight in rider_type_indices_reference.items()
    )
    total_points = sum(
        riders_dataframe[points_column] for points_column in rider_type_indices_reference
    )
    # a rider without any points has no defined type
    return weighted_points / total_points.where(total_points != 0)


tdf_riders_list = tdf_riders['Rider'].to_list()

# determine the overall type of every available rider
all_riders[overall_type] = _overall_type_index(all_riders)
rider_type_indices = dict(zip(all_riders['Rider'], all_riders[overall_type]))

# same computation for the riders that took part in the Tour de France
tdf_riders[overall_type] = _overall_type_index(tdf_riders)
tdf_rider_type_indices = dict(zip(tdf_riders['Rider'], tdf_riders[overall_type]))

# the overall type of each rider. the higher the index, the better the rider is at mountainous stages. the lower the index, the better the rider is at time trial and flat stages
print(rider_type_indices)
all_riders.to_csv('All_Riders_Info_With_Type.csv', sep=',', encoding='utf-8', index=False)
tdf_riders.to_csv('TDF_Riders_With_Timing_Type.csv', sep=',', encoding='utf-8', index=False)

Features (independent variable) preparation

In [None]:
# rider features
ftp = 'FTP (w)'
weight = 'Weight (kg)'
power_to_weight_ratio = 'Power-to-Weight Ratio (w/kg)'
height = 'Height (m)'
tdfs_participated_in = 'Tour de Frances participated in' # to indicate which Tour de France edition to use when training the regression model
name = 'Rider' # to indicate which Tour de France total finish timing to use when training the regression model
rider_feature_columns = [ftp, weight, power_to_weight_ratio, height, overall_type]
features_riders = tdf_riders[[name] + rider_feature_columns + [tdfs_participated_in]].copy()

# Tour de France edition features
total_distance = 'Total Distance (km)'
edition_type = 'Edition Type Index'
year = 'Edition Year' # to indicate which rider to use the Tour de France edition features on
edition_feature_columns = [total_distance, edition_type]

# One feature row per (rider, edition raced): the rider's own features followed by
# the features of that edition. Rows are emitted rider by rider, editions in the
# order they are listed for the rider.
features_list = []

for _, feature_rider_information in features_riders.iterrows():
    # Use a separate name for the parsed years so the column-name constant
    # tdfs_participated_in is not overwritten by the loop.
    editions_participated_in = [
        int(tdf_edition_raw)
        for tdf_edition_raw in feature_rider_information[tdfs_participated_in].split(", ")
    ]
    rider_feature_values = [feature_rider_information[column] for column in rider_feature_columns]

    for tdf_participated_in in editions_participated_in:
        features_list.append(rider_feature_values + [
            tdf_edition_total_distances[tdf_participated_in],
            tdf_edition_type_indices[tdf_participated_in]
        ])

# The column order comes from the same lists used to build each row, so the two cannot drift apart.
features = pd.DataFrame(features_list, columns=rider_feature_columns + edition_feature_columns)
print(features)

Target (dependent variable) preparation

In [None]:
# One target value per (rider, edition raced), in the same rider-by-rider,
# edition-by-edition order as the rows of the features dataframe.
target_list = []

for _, tdf_rider_information in tdf_riders.iterrows():
    editions_participated_in = [
        int(tdf_edition_raw)
        for tdf_edition_raw in tdf_rider_information['Tour de Frances participated in'].split(", ")
    ]
    target_list.extend(
        tdf_rider_information[tdf_total_finish_time_column_name(tdf_participated_in)]
        for tdf_participated_in in editions_participated_in
    )

target_column = 'TDF Total Finish Time (s)'
target = pd.DataFrame(target_list, columns=[target_column])

# The finish-time columns of tdf_riders hold text, so convert to numbers before
# the values are used for regression. Any entry that is not a parseable number
# becomes NaN rather than aborting the notebook.
target[target_column] = pd.to_numeric(target[target_column], errors='coerce')

missing_target_count = int(target[target_column].isna().sum())
if missing_target_count:
    print(f"{missing_target_count} of {len(target)} target values are missing (NaN)")

print(target)

Training the regression model

In [None]:
# Open question: should the features and target dataframes be split into training and testing datasets here?