# Feature Engineering

In [20]:
import os

import pandas as pd

# Build the CSV location with os.path.join so that the name `path` is bound only
# to the resulting string and never rebinds (shadows) the `os.path` module.
path = os.path.join('..', 'dataset', 'races_cleaned.csv')
df = pd.read_csv(path)
# NOTE(review): the displayed frame contains an 'Unnamed: 0' column, presumably an
# index that was saved with the CSV — consider index_col=0 once it is confirmed
# that no later cell relies on that column.
df.head()

Unnamed: 0,_url,name,points,length,climb_total,profile,startlist_quality,date,position,cyclist,cyclist_age,is_tarmac,cyclist_team,delta
0,tour-de-france/1978/stage-6,Tour de France,100.0,162000.0,1101.0,1.0,1241,1978-07-05 04:02:24,0,sean-kelly,22.0,True,vini-ricordi-pinarello-sidermec-1986,0.0
1,tour-de-france/1978/stage-6,Tour de France,100.0,162000.0,1101.0,1.0,1241,1978-07-05 04:02:24,1,gerrie-knetemann,27.0,True,norway-1987,0.0
2,tour-de-france/1978/stage-6,Tour de France,100.0,162000.0,1101.0,1.0,1241,1978-07-05 04:02:24,2,rene-bittinger,24.0,True,france-1978,0.0
3,tour-de-france/1978/stage-6,Tour de France,100.0,162000.0,1101.0,1.0,1241,1978-07-05 04:02:24,3,joseph-bruyere,30.0,True,navigare-blue-storm-1993,0.0
4,tour-de-france/1978/stage-6,Tour de France,100.0,162000.0,1101.0,1.0,1241,1978-07-05 04:02:24,4,sven-ake-nilsson,27.0,True,spain-1991,0.0


### Feature 1: Classification of the races using a scoring system

First, we apply min-max normalization to the three features used by the scoring system (`profile`, `climb_total` and `length`), so that each of them lies in $[0, 1]$:

$$ x_{\text{norm}} = \frac{x - x_{\text{min}}}{x_{\text{max}} - x_{\text{min}}} $$


In [21]:
# Min-max scale each scoring feature into a new '<feature>_norm' column.
for source_col in ('profile', 'climb_total', 'length'):
    df[f'{source_col}_norm'] = (
        (df[source_col] - df[source_col].min())
        / (df[source_col].max() - df[source_col].min())
    )

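After that, we compute a difficulty score for each race as the sum of the three normalized features:
$$ difficulty\_score = profile\_norm + climb\_total\_norm + length\_norm $$
Each race is then classified by its score: a score of at most 1 is *easy*, a score in $(1, 2]$ is *moderate*, and a score in $(2, 3]$ is *hard*. Because every term lies in $[0, 1]$, the score cannot exceed 3, so the *extreme* label that the code reserves for scores above 3 is not reached by valid scores.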

In [22]:
# Difficulty score: sum of the three min-max normalized features.
df['difficulty_score'] = (
    df['profile_norm']
    .add(df['climb_total_norm'])
    .add(df['length_norm'])
)

def classify_difficulty(score):
    """Map a numeric difficulty score to a difficulty label.

    Args:
        score: Difficulty score of a race (a number, possibly missing).

    Returns:
        "easy" for score <= 1, "moderate" for 1 < score <= 2,
        "hard" for 2 < score <= 3, "extreme" for score > 3,
        or None when the score is missing (NaN / None).
    """
    # NaN fails every comparison below, so without this guard a missing score
    # would fall through to the last branch and be mislabelled "extreme".
    if pd.isna(score):
        return None
    # The checks run in ascending order, so each one only needs an upper bound.
    if score <= 1:
        return "easy"
    if score <= 2:
        return "moderate"
    if score <= 3:
        return "hard"
    return "extreme"

# Attach the categorical label derived from each row's difficulty score.
df['difficulty_level'] = df['difficulty_score'].apply(classify_difficulty)

# The normalized columns were only intermediates for the score; remove them in place.
df.drop(
    labels=['profile_norm', 'climb_total_norm', 'length_norm'],
    axis='columns',
    inplace=True,
)
df.head()


Unnamed: 0,_url,name,points,length,climb_total,profile,startlist_quality,date,position,cyclist,cyclist_age,is_tarmac,cyclist_team,delta,difficulty_score,difficulty_level
0,tour-de-france/1978/stage-6,Tour de France,100.0,162000.0,1101.0,1.0,1241,1978-07-05 04:02:24,0,sean-kelly,22.0,True,vini-ricordi-pinarello-sidermec-1986,0.0,0.635375,easy
1,tour-de-france/1978/stage-6,Tour de France,100.0,162000.0,1101.0,1.0,1241,1978-07-05 04:02:24,1,gerrie-knetemann,27.0,True,norway-1987,0.0,0.635375,easy
2,tour-de-france/1978/stage-6,Tour de France,100.0,162000.0,1101.0,1.0,1241,1978-07-05 04:02:24,2,rene-bittinger,24.0,True,france-1978,0.0,0.635375,easy
3,tour-de-france/1978/stage-6,Tour de France,100.0,162000.0,1101.0,1.0,1241,1978-07-05 04:02:24,3,joseph-bruyere,30.0,True,navigare-blue-storm-1993,0.0,0.635375,easy
4,tour-de-france/1978/stage-6,Tour de France,100.0,162000.0,1101.0,1.0,1241,1978-07-05 04:02:24,4,sven-ake-nilsson,27.0,True,spain-1991,0.0,0.635375,easy
