In [31]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import lightgbm as lgb
from surprise import SVD, Dataset, Reader

In [10]:
# Load the cleaned results table. low_memory=False turns off pandas' chunked
# parsing so each column's dtype is inferred from the whole file at once.
df = pd.read_csv(
    "../data/data_cleaned.csv",
    low_memory=False,
)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,Rank,Performance,"Surname, first name",Club,Nat.,YOB,M/F,Rank M/F,Cat,Cat. Rank,...,Total Finishers,Male Finishers,Female Finishers,Distance KM,Finish Percentage,Average Speed,Race Location,Gender,Race Count,Cumulative Distance KM
0,1,225.935 km,"Nielsen, Paul","*Wellington, CO",USA,1961.0,M,1,M60,1.0,...,19,14,5,,0.05,,USA,M,-1,0.0
1,2,224.245 km,"Garmire, Jeff","*Bozeman, MT",USA,1990.0,M,2,M23,1.0,...,19,14,5,,0.11,,USA,M,-1,0.0
2,3,198.947 km,"Hallsten, Eric","*Berkeley, CA",USA,2000.0,M,3,MU23,1.0,...,19,14,5,,0.16,,USA,M,-1,0.0
3,4,175.354 km,"Derstine, Liz","*Melrose, MA",USA,1986.0,F,1,W35,1.0,...,19,14,5,,0.21,,USA,F,-1,0.0
4,5,163.541 km,"Tadajewski, Anthony","*Litchfield Park, AZ",USA,1977.0,M,4,M45,1.0,...,19,14,5,,0.26,,USA,M,-1,0.0


In [13]:
# Keep only the rows whose 'Event Type' is 'Distance'. The explicit .copy()
# makes df_clean an independent frame, so adding columns to it later does not
# trigger SettingWithCopyWarning or depend on pandas' view-vs-copy behaviour.
df_clean = df.loc[df['Event Type'] == 'Distance'].copy()

In [15]:
# List the column labels available on the filtered frame.
df_clean.columns

Index(['Runner ID', 'First Name', 'Surname', 'Nat.', 'YOB', 'Gender',
       'Race Count', 'Cumulative Distance KM', 'Event ID', 'Event', 'Date',
       'Distance', 'Distance KM', 'Race Location', 'Elevation Gain', 'Terrain',
       'Event Type', 'Total Finishers', 'Male Finishers', 'Female Finishers',
       'Winner Time', 'Time Seconds Winner', 'Distance Winner', 'Rank',
       'Performance', 'Rank M/F', 'Distance/Time', 'Time Seconds Finish',
       'Distance Finish', 'Performance Ratio', 'Finish Percentage',
       'Average Speed'],
      dtype='object')

In [26]:
# Terrain multipliers used by calculate_race_difficulty. 'other' doubles as the
# fallback for missing or unrecognised terrain labels.
_TERRAIN_FACTORS = {'road': 0.6, 'track': 0.4, 'trail': 1.0, 'other': 0.5}


def calculate_race_difficulty(df):
    """Score the difficulty of one race from distance, terrain and field size.

    Despite the parameter name, this function is meant to be applied row-wise
    (``DataFrame.apply(calculate_race_difficulty, axis=1)``): ``df`` is a
    single result row, not a whole frame.

    Parameters
    ----------
    df : mapping-like row (e.g. ``pandas.Series`` or ``dict``)
        Must provide 'Distance KM', 'Terrain' and 'Total Finishers'.

    Returns
    -------
    float
        ``0.5 * distance + 0.2 * terrain + 0.3 * finishers`` factor.
        The result is NaN when 'Distance KM' or 'Total Finishers' is NaN.

    Raises
    ------
    KeyError
        If one of the three required fields is absent from ``df``.
    """
    # 160 km is the reference distance. The ratio is not clipped, so longer
    # races contribute a distance factor above 1.
    distance_factor = df['Distance KM'] / 160

    # Terrain labels are matched case-insensitively and ignoring surrounding
    # whitespace. Anything missing (NaN/None) or unlisted is scored as 'other'
    # instead of raising KeyError in the middle of a DataFrame.apply.
    terrain = df['Terrain']
    terrain_key = terrain.strip().lower() if isinstance(terrain, str) else 'other'
    terrain_factor = _TERRAIN_FACTORS.get(terrain_key, _TERRAIN_FACTORS['other'])

    # More finishers lowers the factor (1000 finishers is treated as a large
    # race), but it is floored at 0.5.
    finishers_factor = max(1 - (df['Total Finishers'] / 1000), 0.5)

    # Weighted sum; the three weights add up to 1.0.
    difficulty_score = (distance_factor * 0.5 +
                        terrain_factor * 0.2 +
                        finishers_factor * 0.3)

    return difficulty_score

def weighted_performance(df):
    """Scale a result's performance ratio by the difficulty of its race.

    ``df`` is anything indexable by column label that provides
    'Performance Ratio' and 'Race Difficulty Score'. The return value is
    simply the product of those two fields; no inversion or 0-1
    normalisation is applied.
    """
    ratio, difficulty = df['Performance Ratio'], df['Race Difficulty Score']
    return ratio * difficulty


In [27]:
# Add both derived columns with assign(), which returns a new frame instead of
# writing into what may be a slice of `df` (avoids SettingWithCopyWarning).
# assign() evaluates its entries in order, so 'Weighted Performance' can read
# the freshly created 'Race Difficulty Score'.
df_clean = df_clean.assign(**{
    # The difficulty score is computed per row.
    'Race Difficulty Score': lambda frame: frame.apply(calculate_race_difficulty, axis=1),
    # weighted_performance is a plain column product, so it is called once on
    # the whole frame (vectorised) rather than once per row.
    'Weighted Performance': weighted_performance,
})

In [28]:
# Preview the first five rows, including the two derived score columns.
df_clean.head(n=5)

Unnamed: 0,Runner ID,First Name,Surname,Nat.,YOB,Gender,Race Count,Cumulative Distance KM,Event ID,Event,...,Performance,Rank M/F,Distance/Time,Time Seconds Finish,Distance Finish,Performance Ratio,Finish Percentage,Average Speed,Race Difficulty Score,Weighted Performance
19,433246,Johan,Wander,NED,1960.0,M,-1,0.0,104536,1e Kennedymars Assen - Wildervank - Assen (NED),...,11:51:38 h,1,81km,42698.0,,1.0,0.11,527.135802,0.670425,0.670425
20,1675105,Wilko,Koster,NED,1967.0,M,-1,0.0,104536,1e Kennedymars Assen - Wildervank - Assen (NED),...,12:37:00 h,2,81km,45420.0,,0.94007,0.22,560.740741,0.670425,0.630247
21,9809,Marco,Hartman,NED,1974.0,M,-1,0.0,104536,1e Kennedymars Assen - Wildervank - Assen (NED),...,12:37:00 h,2,81km,45420.0,,0.94007,0.33,560.740741,0.670425,0.630247
22,989296,Ilka,Büsing,GER,1970.0,F,-1,0.0,104536,1e Kennedymars Assen - Wildervank - Assen (NED),...,14:30:00 h,1,81km,52200.0,,0.817969,0.44,644.444444,0.670425,0.548387
23,864829,Thomas,Behrens,GER,1966.0,M,-1,0.0,104536,1e Kennedymars Assen - Wildervank - Assen (NED),...,14:30:00 h,4,81km,52200.0,,0.817969,0.56,644.444444,0.670425,0.548387


### Random Forest

In [30]:
# Collapse the per-result rows into one feature row per 'Runner ID'.
# NOTE(review): in the sample rows displayed above, 'Average Speed' equals
# 'Time Seconds Finish' divided by the distance in km (i.e. seconds per km),
# so max_speed would be the *slowest* pace - confirm before interpreting.
features = (
    df_clean
    .groupby('Runner ID')
    .agg(
        # Finish-time statistics
        average_time=pd.NamedAgg(column='Time Seconds Finish', aggfunc='mean'),
        min_time=pd.NamedAgg(column='Time Seconds Finish', aggfunc='min'),
        max_time=pd.NamedAgg(column='Time Seconds Finish', aggfunc='max'),
        # 'count' counts non-null finish times only
        count_races=pd.NamedAgg(column='Time Seconds Finish', aggfunc='count'),
        # Finish-percentage statistics
        avg_percentile=pd.NamedAgg(column='Finish Percentage', aggfunc='mean'),
        max_percentile=pd.NamedAgg(column='Finish Percentage', aggfunc='max'),
        min_percentile=pd.NamedAgg(column='Finish Percentage', aggfunc='min'),
        # 'Average Speed' statistics
        avg_speed=pd.NamedAgg(column='Average Speed', aggfunc='mean'),
        max_speed=pd.NamedAgg(column='Average Speed', aggfunc='max'),
        min_speed=pd.NamedAgg(column='Average Speed', aggfunc='min'),
        # Mean of the derived score columns
        race_difficulty=pd.NamedAgg(column='Race Difficulty Score', aggfunc='mean'),
        weighted_performance=pd.NamedAgg(column='Weighted Performance', aggfunc='mean'),
    )
    .reset_index()
)

features.head()

Unnamed: 0,Runner ID,average_time,min_time,max_time,count_races,avg_percentile,max_percentile,min_percentile,avg_speed,max_speed,min_speed,race_difficulty,weighted_performance
0,1,26940.0,26940.0,26940.0,1,0.84,0.84,0.84,585.652174,585.652174,585.652174,0.53445,0.279723
1,33,54590.628571,22920.0,631680.0,35,0.668286,0.94,0.38,561.012006,874.555556,469.919355,0.722541,0.480819
2,34,25379.5,14962.0,35797.0,2,0.435,0.71,0.16,420.809394,542.378788,299.24,0.54525,0.36346
3,47,34743.0,23129.0,46357.0,2,0.26,0.31,0.21,463.075,463.57,462.58,0.678175,0.531617
4,69,32062.0,16772.0,45183.0,6,0.525,0.89,0.38,416.089314,664.455882,335.44,0.653817,0.43377
