In [63]:
import pandas as pd
import numpy as np
import matplotlib
import sklearn
import featuretools as ft
import io
import requests
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from dateutil.parser import *

In [64]:
def encode_column(df, column):
    """ replaces strings with according int """
    nationalities_set = {df[column].unique()[i]: i for i in range(len(df[column].unique()))}
    df[column] = df[column].apply(lambda x: nationalities_set[x])       

In [65]:
drivers = pd.read_csv("data/drivers.csv")
# encode nationality for models
encode_column(drivers, 'nationality')
drivers = drivers[['driverId', 'dob', 'nationality']]
drivers = drivers.dropna()
drivers['dob'] = drivers['dob'].apply(lambda x: parse(str(x)).timestamp())
drivers.head()

Unnamed: 0,driverId,dob,nationality
0,1,489016800.0,0
1,2,244854000.0,1
2,3,488671200.0,1
3,4,365205600.0,2
4,5,372294000.0,3


In [66]:
races = pd.read_csv("data/races.csv")
races['name'] = races['name'].apply(lambda x: x.split(" ")[0])
races['date'] = races['date'].apply(lambda x: parse(x).timestamp())
encode_column(races, 'name')
races.head()

Unnamed: 0,raceId,year,round,circuitId,name,date
0,1,2009,1,1,0,1238281000.0
1,2,2009,2,2,1,1238882000.0
2,3,2009,3,17,2,1240092000.0
3,4,2009,4,3,3,1240697000.0
4,5,2009,5,4,4,1241906000.0


In [67]:
standings = pd.read_csv("data/driverStandings.csv")
standings.drop(columns=['positionText'], inplace=True)
standings.head()

Unnamed: 0,driverStandingsId,raceId,driverId,points,position,wins
0,1,18,1,10.0,1,1
1,2,18,2,8.0,2,0
2,3,18,3,6.0,3,0
3,4,18,4,5.0,4,0
4,5,18,5,4.0,5,0


In [68]:
first_model_input = pd.merge(standings, races, on='raceId')
first_model_input = pd.merge(first_model_input, drivers, on='driverId')
first_model_input = first_model_input.dropna()
first_model_input.head()

Unnamed: 0,driverStandingsId,raceId,driverId,points,position,wins,year,round,circuitId,name,date,dob,nationality
0,1,18,1,10.0,1,1,2008,1,1,0,1205622000.0,489016800.0,0
1,9,19,1,14.0,1,1,2008,2,2,1,1206227000.0,489016800.0,0
2,27,20,1,14.0,3,1,2008,3,3,3,1207433000.0,489016800.0,0
3,48,21,1,20.0,2,1,2008,4,4,4,1209247000.0,489016800.0,0
4,69,22,1,28.0,3,1,2008,5,5,6,1210457000.0,489016800.0,0


In [69]:
feature_matrix_simple = first_model_input.copy()
feature_matrix_simple['target'] = feature_matrix_simple['position'].apply(lambda x: 1 if x <= 5 else 0)
feature_matrix_simple.drop(columns=['wins', 'position'], inplace=True)
train_simple = feature_matrix_simple[:int(feature_matrix_simple.shape[0]*0.75)]
train_simple_Y = train_simple['target']
train_simple_X = train_simple.drop(columns=['target'])
test_simple = feature_matrix_simple[int(feature_matrix_simple.shape[0]*0.75):]
test_simple_Y = test_simple['target']
test_simple_X = test_simple.drop(columns=['target'])

model_simple = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
model_simple.fit(train_simple_X, train_simple_Y)  
prediction = model_simple.predict(test_simple_X)
mse_simple = mean_squared_error(test_simple_Y.to_list(), prediction)
mse_simple

0.07234042553191489

In [76]:
standings_extended = pd.merge(standings, races, on='raceId')
standings_extended = standings_extended.dropna()
target_df = pd.DataFrame()
target_df['target'] = standings_extended['position'].apply(lambda x: 1 if x <= 5 else 0)
# 
standings_extended.drop(columns=['wins', 'position'], inplace=True)
standings_extended.head()

Unnamed: 0,driverStandingsId,raceId,driverId,points,year,round,circuitId,name,date
0,1,18,1,10.0,2008,1,1,0,1205622000.0
1,2,18,2,8.0,2008,1,1,0,1205622000.0
2,3,18,3,6.0,2008,1,1,0,1205622000.0
3,4,18,4,5.0,2008,1,1,0,1205622000.0
4,5,18,5,4.0,2008,1,1,0,1205622000.0


In [77]:

# Entity set
es = ft.EntitySet(id = 'top_formula_pilot')
# Entities
es = es.entity_from_dataframe(entity_id="drivers",
                              dataframe=drivers,
                              index="driverId",
                              variable_types={"nationality": ft.variable_types.Categorical,
                                             "dob": ft.variable_types.DateOfBirth})
es = es.entity_from_dataframe(entity_id="standings",
                              dataframe=standings_extended,
                              index="driverStandingsId",
                              variable_types={"raceId": ft.variable_types.Id,
                                              "driverId": ft.variable_types.Id,
                                             "points": ft.variable_types.Numeric,
                                              "year": ft.variable_types.Datetime,
                                             "round": ft.variable_types.Numeric,
                                             "circuitId": ft.variable_types.Numeric,
                                             "name": ft.variable_types.Categorical,
                                             "date": ft.variable_types.Datetime})
# es = es.entity_from_dataframe(entity_id="races",
#                               dataframe=races,
#                               index="raceId",
#                               variable_types={"year": ft.variable_types.Datetime,
#                                              "round": ft.variable_types.Numeric,
#                                              "circuitId": ft.variable_types.Numeric,
#                                              "name": ft.variable_types.Text,
#                                              "date": ft.variable_types.Datetime})

# Relationship
es = es.add_relationship(ft.Relationship(es["drivers"]["driverId"], es["standings"]["driverId"]))
# es = es.add_relationship(ft.Relationship(es["races"]["raceId"], es["standings"]["raceId"]))

In [78]:
# ft.list_primitives()

In [97]:
feature_matrix, feature_defs = ft.dfs(entityset=es, 
                                      target_entity="standings",
                                      max_depth = 4)
# feature_matrix = feature_matrix.dropna()

# agg_primitives=["sum", "count", "mean", "time_since_first"],
# trans_primitives=["cum_count", "days_since", "year"]

In [98]:
feature_matrix.columns

Index(['raceId', 'driverId', 'points', 'round', 'circuitId', 'name',
       'DAY(year)', 'DAY(date)', 'YEAR(year)', 'YEAR(date)', 'MONTH(year)',
       'MONTH(date)', 'WEEKDAY(year)', 'WEEKDAY(date)', 'drivers.nationality',
       'drivers.SUM(standings.points)', 'drivers.SUM(standings.round)',
       'drivers.SUM(standings.circuitId)', 'drivers.STD(standings.points)',
       'drivers.STD(standings.round)', 'drivers.STD(standings.circuitId)',
       'drivers.MAX(standings.points)', 'drivers.MAX(standings.round)',
       'drivers.MAX(standings.circuitId)', 'drivers.SKEW(standings.points)',
       'drivers.SKEW(standings.round)', 'drivers.SKEW(standings.circuitId)',
       'drivers.MIN(standings.points)', 'drivers.MIN(standings.round)',
       'drivers.MIN(standings.circuitId)', 'drivers.MEAN(standings.points)',
       'drivers.MEAN(standings.round)', 'drivers.MEAN(standings.circuitId)',
       'drivers.COUNT(standings)', 'drivers.NUM_UNIQUE(standings.raceId)',
       'drivers.NUM_UNIQUE

In [87]:
# feature_matrix.columns

In [99]:
feature_matrix['target'] = target_df['target']
feature_matrix = feature_matrix.dropna()
train = feature_matrix[:int(feature_matrix.shape[0]*0.75)]
train_Y = train['target']
train_X = train.drop(columns=['target'])
test = feature_matrix[int(feature_matrix.shape[0]*0.75):]
test_Y = test['target']
test_X = test.drop(columns=['target'])



model = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
model.fit(train_X, train_Y)  

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [100]:
predictions = model.predict(test_X)
mse_extended = mean_squared_error(test_Y.to_list(), predictions)
mse_extended

0.08550573514077164