In [1]:
import pandas as pd
import numpy as np
import matplotlib
import sklearn
import featuretools as ft
import io
import requests
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from dateutil.parser import *

In [14]:
def encode_column(df, column):
    """Replace every distinct value in ``df[column]`` with an integer code, in place.

    Codes are assigned in order of first appearance: the first distinct value
    becomes 0, the next one 1, and so on. The frame is modified in place and
    nothing is returned.

    Args:
        df: DataFrame holding the column to encode.
        column: Name of the column whose values are replaced by their codes.
    """
    # Compute the distinct values a single time instead of once per lookup entry.
    codes = {value: code for code, value in enumerate(df[column].unique())}
    df[column] = df[column].apply(lambda value: codes[value])
    
def prepare_drivers(path="data/drivers.csv"):
    """Load the drivers table and prepare it for modelling.

    Args:
        path: Location of the drivers CSV. Defaults to ``data/drivers.csv``.

    Returns:
        DataFrame with the columns ``driverId``, ``dob`` and ``nationality``,
        without rows that miss a value in any of them. ``nationality`` holds
        the integer codes written by ``encode_column`` and ``dob`` the values
        returned by ``parse``.
    """
    drivers = pd.read_csv(path)
    # encode nationality for models; this happens before rows are dropped, so
    # codes are assigned over every row of the file
    encode_column(drivers, 'nationality')
    drivers = drivers[['driverId', 'dob', 'nationality']].dropna()
    # NOTE(review): parse() reads ambiguous dates such as 07/01/1985
    # month-first by default; confirm this matches the format used in the CSV.
    drivers['dob'] = drivers['dob'].apply(lambda x: parse(str(x)))
    return drivers

def prepare_races(path="data/races.csv"):
    """Load the races table and prepare it for modelling.

    Args:
        path: Location of the races CSV. Defaults to ``data/races.csv``.

    Returns:
        The races DataFrame with ``name`` cut down to its first
        space-separated word and then replaced by integer codes (via
        ``encode_column``), and ``date`` converted with ``parse``.
    """
    races = pd.read_csv(path)
    # keep only the first word of the race name before encoding it
    races['name'] = races['name'].apply(lambda x: x.split(" ")[0])
    races['date'] = races['date'].apply(lambda x: parse(x))
    encode_column(races, 'name')
    return races


def prepare_standing(top_x, path="data/driverStandings.csv"):
    """Load driver standings and label the rows placed within the top ``top_x``.

    Args:
        top_x: Highest position (inclusive) that still counts as a top result.
        path: Location of the standings CSV. Defaults to
            ``data/driverStandings.csv``.

    Returns:
        The standings DataFrame without the ``positionText`` column and with
        an integer ``target`` column: 1 where ``position <= top_x``, else 0.
    """
    standings = pd.read_csv(path).drop(columns=['positionText'])
    # Vectorised comparison; a missing position compares as False and gets 0.
    standings['target'] = (standings['position'] <= top_x).astype('int64')
    return standings

def prepare_synthetic_table(standings_df=None, races_df=None, drivers_df=None):
    """Join standings with race and driver attributes into one flat table.

    Args:
        standings_df: Standings frame with ``raceId`` and ``driverId`` columns.
            Defaults to the notebook-level ``standings``.
        races_df: Races frame keyed by ``raceId``. Defaults to the
            notebook-level ``races``.
        drivers_df: Drivers frame keyed by ``driverId``. Defaults to the
            notebook-level ``drivers``.

    Returns:
        The merge of the three frames (on ``raceId``, then on ``driverId``)
        with every row containing a missing value removed.
    """
    # Fall back to the notebook-level frames so the zero-argument call keeps
    # working; passing frames explicitly avoids relying on that hidden state.
    if standings_df is None:
        standings_df = standings
    if races_df is None:
        races_df = races
    if drivers_df is None:
        drivers_df = drivers
    return (
        pd.merge(standings_df, races_df, on='raceId')
        .merge(drivers_df, on='driverId')
        .dropna()
    )
    
# prepare raw data sets
TOP_X = 5  # a standing is labelled as "top" when position <= TOP_X
drivers = prepare_drivers()
races = prepare_races()
standings = prepare_standing(TOP_X)
# joins the three frames prepared above
first_model_input = prepare_synthetic_table()

### Pipeline

- data collection
- data preparation
- feature engineering
- feature selection
- model training
- model evaluation
- repeat


##### TODO: proper cool picture
![Pipeline](bear.png "What is a data pipeline?")


### Feature Engineering

Creating new input for a model from existing data

- Domain knowledge
- Automatic approach


### Manual feature engineering

- Timestamp + [holidays](https://pypi.org/project/holidays/)
- Double click -> senior visitor
- Domain knowledge hypothesis


### Featuretools

[Featuretools](https://github.com/Featuretools/featuretools)


In [15]:
# Preview the first rows of the prepared drivers table.
drivers.head()

Unnamed: 0,driverId,dob,nationality
0,1,1985-07-01,0
1,2,1977-10-05,1
2,3,1985-06-27,1
3,4,1981-07-29,2
4,5,1981-10-19,3


In [19]:
# Preview the first rows of the prepared races table.
races.head()

Unnamed: 0,raceId,year,round,circuitId,name,date
0,1,2009,1,1,0,2009-03-29
1,2,2009,2,2,1,2009-04-05
2,3,2009,3,17,2,2009-04-19
3,4,2009,4,3,3,2009-04-26
4,5,2009,5,4,4,2009-05-10


In [17]:
# Preview the first rows of the standings table, including the derived target.
standings.head()

Unnamed: 0,driverStandingsId,raceId,driverId,points,position,wins,target
0,1,18,1,10.0,1,1,1
1,2,18,2,8.0,2,0,1
2,3,18,3,6.0,3,0,1
3,4,18,4,5.0,4,0,1
4,5,18,5,4.0,5,0,1


### Predict whether a pilot will finish in the top X of a race (classify as top pilot)
Possible input for predictions:
- driverId
- date_of_birth
- nationality

- raceId
- name
- date

Features:
- driverId
- dob -> split into year and month (almost always a default transformation)
- nationality
- raceId
- date -> split into year and month (almost always a default transformation)
- name

In [37]:
# Join standings with driver and race attributes, keep only the model inputs
# and the target, and split both dates into year and month columns.
simple_model_df = (
    standings
    .merge(drivers, on='driverId')
    .merge(races, on='raceId')
    .drop(columns=['driverStandingsId', 'position', 'wins', 'year', 'round', 'circuitId', 'points'])
    .dropna()
    .assign(
        dob_year=lambda frame: frame['dob'].apply(lambda x: x.year),
        dob_month=lambda frame: frame['dob'].apply(lambda x: x.month),
        race_year=lambda frame: frame['date'].apply(lambda x: x.year),
        race_month=lambda frame: frame['date'].apply(lambda x: x.month),
    )
    # the raw date columns are no longer needed once they are split
    .drop(columns=['dob', 'date'])
)

In [38]:
# Preview the flattened input table for the simple model.
simple_model_df.head()

Unnamed: 0,raceId,driverId,target,nationality,name,dob_year,dob_month,race_year,race_month
0,18,1,1,0,0,1985,7,2008,3
1,18,2,1,1,0,1977,10,2008,3
2,18,3,1,1,0,1985,6,2008,3
3,18,4,1,2,0,1981,7,2008,3
4,18,5,1,3,0,1981,10,2008,3


In [41]:
# Sequential 75/25 split: the first three quarters of the rows train the
# model, the remaining rows are held out for evaluation.
split_at = int(simple_model_df.shape[0]*0.75)
train, test = simple_model_df.iloc[:split_at], simple_model_df.iloc[split_at:]
train_X, train_Y = train.drop(columns=['target']), train['target']
test_X, test_Y = test.drop(columns=['target']), test['target']

model_simple = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
model_simple.fit(train_X, train_Y)
simple_prediction = model_simple.predict(test_X)
# With 0/1 labels every squared error is 0 or 1, so this MSE is the share of
# misclassified held-out rows.
mse_simple = mean_squared_error(test_Y.to_list(), simple_prediction)
mse_simple

0.08696808510638297

### Can we improve by adding features?
- How old was the pilot when taking part in race X?
- Is pilot from host country
- Is pilot young?
...

Let's automate these questions

In [46]:
# Restrict both tables to ids plus the columns allowed as features, so that
# result columns such as points, position and wins cannot expose the target.
standings_simple = standings.loc[:, ['driverId', 'raceId', 'target', 'driverStandingsId']]
race_simple = races.loc[:, ['raceId', 'date', 'name']]

In [58]:
# Entity set
es = ft.EntitySet(id='top_formula_pilot')
# Entities: one spec per table, registered in the order drivers, standings, races
_entity_specs = (
    dict(entity_id="drivers",
         dataframe=drivers,
         index="driverId",
         variable_types={"nationality": ft.variable_types.Categorical,
                         "dob": ft.variable_types.DateOfBirth}),
    dict(entity_id="standings",
         dataframe=standings_simple,
         index="driverStandingsId",
         variable_types={"raceId": ft.variable_types.Id,
                         "driverId": ft.variable_types.Id,
                         "target": ft.variable_types.Categorical}),
    dict(entity_id="races",
         dataframe=race_simple,
         index="raceId",
         variable_types={"name": ft.variable_types.Categorical,
                         "date": ft.variable_types.Datetime}),
)
for _spec in _entity_specs:
    es = es.entity_from_dataframe(**_spec)



In [59]:
# Relationship: every standings row references one driver and one race
# (parent variable first, child variable second).
for _parent, _key in (("drivers", "driverId"), ("races", "raceId")):
    es = es.add_relationship(ft.Relationship(es[_parent][_key], es["standings"][_key]))

In [60]:
# Run deep feature synthesis on the entity set, generating features for the
# rows of the "standings" entity.
feature_matrix, feature_defs = ft.dfs(entityset=es, 
                                      target_entity="standings",
                                      max_depth = 4)

# Optional primitive selections, currently not passed to ft.dfs:
# agg_primitives=["sum", "count", "mean", "time_since_first"],
# trans_primitives=["cum_count", "days_since", "year"]

In [61]:
# List the names of the columns in the generated feature matrix.
feature_matrix.columns

Index(['id', 'raceId', 'driverId', 'target', 'drivers.nationality',
       'races.name', 'drivers.SUM(standings.id)', 'drivers.STD(standings.id)',
       'drivers.MAX(standings.id)', 'drivers.SKEW(standings.id)',
       'drivers.MIN(standings.id)', 'drivers.MEAN(standings.id)',
       'drivers.COUNT(standings)', 'drivers.NUM_UNIQUE(standings.raceId)',
       'drivers.NUM_UNIQUE(standings.target)',
       'drivers.MODE(standings.raceId)', 'drivers.MODE(standings.target)',
       'drivers.DAY(dob)', 'drivers.YEAR(dob)', 'drivers.MONTH(dob)',
       'drivers.WEEKDAY(dob)', 'races.SUM(standings.id)',
       'races.STD(standings.id)', 'races.MAX(standings.id)',
       'races.SKEW(standings.id)', 'races.MIN(standings.id)',
       'races.MEAN(standings.id)', 'races.COUNT(standings)',
       'races.NUM_UNIQUE(standings.driverId)',
       'races.NUM_UNIQUE(standings.target)', 'races.MODE(standings.driverId)',
       'races.MODE(standings.target)', 'races.DAY(date)', 'races.YEAR(date)',
       '

In [62]:
# Preview the first rows of the generated feature matrix.
feature_matrix.head()

Unnamed: 0_level_0,id,raceId,driverId,target,drivers.nationality,races.name,drivers.SUM(standings.id),drivers.STD(standings.id),drivers.MAX(standings.id),drivers.SKEW(standings.id),...,races.MEAN(standings.id),races.COUNT(standings),races.NUM_UNIQUE(standings.driverId),races.NUM_UNIQUE(standings.target),races.MODE(standings.driverId),races.MODE(standings.target),races.DAY(date),races.YEAR(date),races.MONTH(date),races.WEEKDAY(date)
driverStandingsId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,18,1,1,0.0,0.0,4891778.0,11006.88013,31702.0,-1.240795,...,3.5,8.0,8.0,2.0,1.0,1.0,16.0,2008.0,3.0,6.0
1,1,18,2,1,1.0,0.0,1112597.0,8937.100945,28894.0,2.033658,...,3.5,8.0,8.0,2.0,1.0,1.0,16.0,2008.0,3.0,6.0
2,2,18,3,1,1.0,0.0,4242350.0,12851.21192,31254.0,-0.751405,...,3.5,8.0,8.0,2.0,1.0,1.0,16.0,2008.0,3.0,6.0
3,3,18,4,1,2.0,0.0,4971545.0,13874.816481,31714.0,-0.139155,...,3.5,8.0,8.0,2.0,1.0,1.0,16.0,2008.0,3.0,6.0
4,4,18,5,1,3.0,0.0,1870864.0,13211.881979,29818.0,-0.273488,...,3.5,8.0,8.0,2.0,1.0,1.0,16.0,2008.0,3.0,6.0


In [63]:
# Remove rows with missing values, then drop some columns that make no sense
# or expose target. errors='ignore' keeps the cell re-runnable: on a second
# run the columns are already gone, which would otherwise raise a KeyError.
feature_matrix = feature_matrix.dropna().drop(
    columns=['id', 'drivers.SUM(standings.id)', 'drivers.STD(standings.id)',
             'drivers.MAX(standings.id)', 'drivers.SKEW(standings.id)',
             'drivers.MIN(standings.id)', 'drivers.MEAN(standings.id)',
             'drivers.NUM_UNIQUE(standings.target)', 'drivers.MODE(standings.target)',
             'races.SUM(standings.id)', 'races.STD(standings.id)', 'races.MAX(standings.id)',
             'races.SKEW(standings.id)', 'races.MIN(standings.id)',
             'races.MEAN(standings.id)', 'races.COUNT(standings)',
             'races.NUM_UNIQUE(standings.target)', 'races.MODE(standings.target)'
            ],
    errors='ignore',
)

# Sequential 75/25 split: the first three quarters of the rows train the
# model, the remaining rows are held out for evaluation.
split_at = int(feature_matrix.shape[0]*0.75)
train = feature_matrix.iloc[:split_at]
train_Y = train['target']
train_X = train.drop(columns=['target'])
test = feature_matrix.iloc[split_at:]
test_Y = test['target']
test_X = test.drop(columns=['target'])

# Same hyper-parameters as the simple model so the two scores are produced by
# identically configured forests.
model = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
model.fit(train_X, train_Y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [65]:
# Score the model trained on the generated features against the held-out
# rows. With 0/1 labels every squared error is 0 or 1, so this MSE is the
# share of misclassified rows.
complex_predictions = model.predict(test_X)
mse_extended = mean_squared_error(test_Y.to_list(), complex_predictions)
mse_extended

0.0867129135538954