# Feature Engineering

In [6]:
from os import path
import pandas as pd

# Relative location of the cleaned races table: ../dataset/races_cleaned.csv
datasetpath = path.join('..', 'dataset', 'races_cleaned.csv')

# Load the table and preview its first five rows.
races = pd.read_csv(filepath_or_buffer=datasetpath)
races.head(5)

Unnamed: 0,_url,name,points,length,climb_total,profile,startlist_quality,date,position,cyclist,cyclist_age,is_tarmac,cyclist_team,delta
0,tour-de-france/1978/stage-6,Tour de France,100.0,162000.0,1101.0,1.0,1241,1978-07-05 04:02:24,0,sean-kelly,22.0,True,vini-ricordi-pinarello-sidermec-1986,0.0
1,tour-de-france/1978/stage-6,Tour de France,100.0,162000.0,1101.0,1.0,1241,1978-07-05 04:02:24,1,gerrie-knetemann,27.0,True,norway-1987,0.0
2,tour-de-france/1978/stage-6,Tour de France,100.0,162000.0,1101.0,1.0,1241,1978-07-05 04:02:24,2,rene-bittinger,24.0,True,france-1978,0.0
3,tour-de-france/1978/stage-6,Tour de France,100.0,162000.0,1101.0,1.0,1241,1978-07-05 04:02:24,3,joseph-bruyere,30.0,True,navigare-blue-storm-1993,0.0
4,tour-de-france/1978/stage-6,Tour de France,100.0,162000.0,1101.0,1.0,1241,1978-07-05 04:02:24,4,sven-ake-nilsson,27.0,True,spain-1991,0.0


### Feature 1: Classification of the races using a scoring system

First, we apply min-max normalization to the three columns used by the scoring system (`profile`, `climb_total` and `length`), so that each one lies in the range $[0, 1]$:

$$ x_{\text{norm}} = \frac{x - x_{\text{min}}}{x_{\text{max}} - x_{\text{min}}} $$


In [7]:
# Min-max scale every column that feeds the difficulty score and store the
# result next to the original as `<column>_norm`.
for feature in ('profile', 'climb_total', 'length'):
    values = races[feature]
    lowest, highest = values.min(), values.max()
    races[f'{feature}_norm'] = (values - lowest) / (highest - lowest)

After that, we calculate the score for the difficulty and classify each race by the score as follows:
$$ difficulty\_score = profile\_norm + climb\_total\_norm + length\_norm $$

In [8]:
# Difficulty score = sum of the three normalized columns (added left to right).
races['difficulty_score'] = (
    races['profile_norm']
    .add(races['climb_total_norm'])
    .add(races['length_norm'])
)

def classify_difficulty(score):
    """Map a numeric difficulty score to a difficulty label.

    Thresholds (upper bounds are inclusive):
        score <= 1       -> "easy"
        1 < score <= 2   -> "moderate"
        2 < score <= 3   -> "hard"
        score > 3        -> "extreme"

    In this notebook the score is the sum of three min-max normalized
    columns, so it cannot exceed 3 and "extreme" is only a safety net.

    Parameters
    ----------
    score : float
        The difficulty score of a single row.

    Returns
    -------
    str or None
        The difficulty label, or None when the score is missing (NaN/None).
    """
    # NaN fails every comparison below, so without this guard a missing score
    # would fall through to the last branch and be mislabelled "extreme".
    if pd.isna(score):
        return None
    if score <= 1:
        return "easy"
    if score <= 2:
        return "moderate"
    if score <= 3:
        return "hard"
    return "extreme"

# Label every row by passing its difficulty score through classify_difficulty.
races['difficulty_level'] = races['difficulty_score'].map(classify_difficulty)

# The normalized helper columns were only needed to build the score;
# remove them from the frame in place.
races.drop(
    columns=['profile_norm', 'climb_total_norm', 'length_norm'],
    inplace=True,
)
races.head()


Unnamed: 0,_url,name,points,length,climb_total,profile,startlist_quality,date,position,cyclist,cyclist_age,is_tarmac,cyclist_team,delta,difficulty_score,difficulty_level
0,tour-de-france/1978/stage-6,Tour de France,100.0,162000.0,1101.0,1.0,1241,1978-07-05 04:02:24,0,sean-kelly,22.0,True,vini-ricordi-pinarello-sidermec-1986,0.0,0.635375,easy
1,tour-de-france/1978/stage-6,Tour de France,100.0,162000.0,1101.0,1.0,1241,1978-07-05 04:02:24,1,gerrie-knetemann,27.0,True,norway-1987,0.0,0.635375,easy
2,tour-de-france/1978/stage-6,Tour de France,100.0,162000.0,1101.0,1.0,1241,1978-07-05 04:02:24,2,rene-bittinger,24.0,True,france-1978,0.0,0.635375,easy
3,tour-de-france/1978/stage-6,Tour de France,100.0,162000.0,1101.0,1.0,1241,1978-07-05 04:02:24,3,joseph-bruyere,30.0,True,navigare-blue-storm-1993,0.0,0.635375,easy
4,tour-de-france/1978/stage-6,Tour de France,100.0,162000.0,1101.0,1.0,1241,1978-07-05 04:02:24,4,sven-ake-nilsson,27.0,True,spain-1991,0.0,0.635375,easy


### Feature 2: Interaction between the weight and the performance

(To be improved) This new feature combines the cyclist's `weight` (taken from the cyclists dataset) and their finishing `position` to calculate a performance index for each race result, i.e. for each row of the races dataset.
$$ performance\_index = (1- norm\_weight) * (1 - norm\_position) $$

In [9]:


# Load the cleaned cyclists table; only `weight` and the `_url` key are used below.
datasetpath = path.join('..', 'dataset', 'cyclists_cleaned.csv')
cyclists = pd.read_csv(datasetpath)

# Left-join each race row with the weight of its cyclist, then derive the
# min-max normalized weight/position and the performance index from them.
# `assign` evaluates its keyword arguments in order, so `performance_index`
# can read the two normalized columns created just before it.
merged_df = (
    races
    .merge(cyclists[['weight', '_url']], how='left', left_on='cyclist', right_on='_url')
    .assign(
        weight_norm=lambda df: (df['weight'] - df['weight'].min())
        / (df['weight'].max() - df['weight'].min()),
        position_norm=lambda df: (df['position'] - df['position'].min())
        / (df['position'].max() - df['position'].min()),
        performance_index=lambda df: (1 - df['position_norm']) * (1 - df['weight_norm']),
    )
)

# Pairwise correlation between the index and its two normalized inputs.
correlation = merged_df[['performance_index', 'weight_norm', 'position_norm']].corr()

print("Correlation between performance index, weight, and position:")
print(correlation)

# Preview the raw inputs next to the derived columns.
merged_df[['position', 'weight', 'weight_norm', 'position_norm', 'performance_index']].head()


Correlation between performance index, weight, and position:
                   performance_index  weight_norm  position_norm
performance_index           1.000000    -0.713476      -0.727131
weight_norm                -0.713476     1.000000       0.098442
position_norm              -0.727131     0.098442       1.000000


Unnamed: 0,position,weight,weight_norm,position_norm,performance_index
0,0,77.0,0.742857,0.0,0.257143
1,1,64.0,0.371429,0.004785,0.625564
2,2,69.0,0.514286,0.009569,0.481066
3,3,65.0,0.4,0.014354,0.591388
4,4,63.0,0.342857,0.019139,0.644566


### Feature 3: See the most advantageous races

Now we calculate for each row the ratio between the points and the difficulty score. This ratio will be used to see which are the most advantageous races for the riders over the years.

$$ gain\_ratio = \frac{points}{difficulty\_score} $$

In [10]:
# Points awarded per unit of difficulty, computed row by row.
races['gain_ratio'] = races['points'].div(races['difficulty_score'])

# Preview the distinct combinations of these four columns.
summary_columns = ['points', 'difficulty_score', 'difficulty_level', 'gain_ratio']
races[summary_columns].drop_duplicates().head()

Unnamed: 0,points,difficulty_score,difficulty_level,gain_ratio
0,100.0,0.635375,easy,157.387288
106,80.0,2.377975,hard,33.642066
271,100.0,0.488587,easy,204.671737
426,50.0,0.855043,easy,58.476618
545,100.0,1.606011,moderate,62.266071
