In [1]:
import pandas as pd
import numpy as np
import matplotlib
import sklearn
import featuretools as ft
import io
import requests

In [107]:
drivers_full = pd.read_csv("data/drivers.csv")
drivers_full.head()

Unnamed: 0,driverId,driverRef,dob,nationality
0,1,hamilton,07/01/1985,British
1,2,heidfeld,10/05/1977,German
2,3,rosberg,27/06/1985,German
3,4,alonso,29/07/1981,Spanish
4,5,kovalainen,19/10/1981,Finnish


In [108]:
drivers = drivers_full[['driverId', 'dob', 'nationality']]

In [40]:
races = pd.read_csv("data/races.csv")
races['name'] = races['name'].apply(lambda x: x.split(" ")[0])
races.head()

Unnamed: 0,raceId,year,round,circuitId,name,date
0,1,2009,1,1,Australian,2009-03-29
1,2,2009,2,2,Malaysian,2009-04-05
2,3,2009,3,17,Chinese,2009-04-19
3,4,2009,4,3,Bahrain,2009-04-26
4,5,2009,5,4,Spanish,2009-05-10


In [55]:
standings = pd.read_csv("data/driverStandings.csv")
standings.head()

Unnamed: 0,driverStandingsId,raceId,driverId,points,position,positionText,wins
0,1,18,1,10.0,1,1,1
1,2,18,2,8.0,2,2,0
2,3,18,3,6.0,3,3,0
3,4,18,4,5.0,4,4,0
4,5,18,5,4.0,5,5,0


In [112]:
# Entity set
es = ft.EntitySet(id = 'top_formula_pilot')
# Entities
es = es.entity_from_dataframe(entity_id="drivers",
                              dataframe=drivers,
                              index="driverId",
                              variable_types={"nationality": ft.variable_types.Categorical,
                                             "dob": ft.variable_types.DateOfBirth})
es = es.entity_from_dataframe(entity_id="standings",
                              dataframe=standings,
                              index="driverStandingsId",
                              variable_types={"raceId": ft.variable_types.Id,
                                              "driverId": ft.variable_types.Id,
                                             "points": ft.variable_types.Numeric,
                                             "position": ft.variable_types.Numeric,
                                             "positionText": ft.variable_types.Text,
                                             "wins": ft.variable_types.Numeric})
# es = es.entity_from_dataframe(entity_id="races",
#                               dataframe=races,
#                               index="raceId",
#                               variable_types={"year": ft.variable_types.Datetime,
#                                              "round": ft.variable_types.Numeric,
#                                              "circuitId": ft.variable_types.Numeric,
#                                              "name": ft.variable_types.Text,
#                                              "date": ft.variable_types.Datetime})

# Relationship
es = es.add_relationship(ft.Relationship(es["drivers"]["driverId"], es["standings"]["driverId"]))
# es = es.add_relationship(ft.Relationship(es["races"]["raceId"], es["standings"]["raceId"]))

In [113]:
ft.list_primitives()

Unnamed: 0,name,type,description
0,mode,aggregation,Determines the most commonly repeated value.
1,percent_true,aggregation,Determines the percent of `True` values.
2,sum,aggregation,"Calculates the total addition, ignoring `NaN`."
3,avg_time_between,aggregation,Computes the average number of seconds between...
4,time_since_last,aggregation,Calculates the time elapsed since the last dat...
5,any,aggregation,Determines if any value is 'True' in a list.
6,std,aggregation,Computes the dispersion relative to the mean v...
7,median,aggregation,Determines the middlemost number in a list of ...
8,num_unique,aggregation,"Determines the number of distinct values, igno..."
9,trend,aggregation,Calculates the trend of a variable over time.


In [114]:
feature_matrix, feature_defs = ft.dfs(entityset=es, 
                                      target_entity="drivers",
                                      agg_primitives=["Count", "mean", "sum"],
                                      max_depth = 2)

# agg_primitives=["sum", "count", "mean", "time_since_first"],
# trans_primitives=["cum_count", "days_since", "year"]

In [115]:
feature_matrix.columns

Index(['nationality', 'COUNT(standings)', 'MEAN(standings.points)',
       'MEAN(standings.position)', 'MEAN(standings.wins)',
       'SUM(standings.points)', 'SUM(standings.position)',
       'SUM(standings.wins)', 'DAY(dob)', 'YEAR(dob)', 'MONTH(dob)',
       'WEEKDAY(dob)', 'MEAN(standings.NUM_WORDS(positionText))',
       'MEAN(standings.NUM_CHARACTERS(positionText))',
       'SUM(standings.NUM_WORDS(positionText))',
       'SUM(standings.NUM_CHARACTERS(positionText))'],
      dtype='object')

In [116]:
# example = feature_matrix[['driverId', 'raceId', 'drivers.SUM(standings.points)', 'races.YEAR(year)', 'races.COUNT(standings.wins)']]
feature_matrix.head()

Unnamed: 0_level_0,nationality,COUNT(standings),MEAN(standings.points),MEAN(standings.position),MEAN(standings.wins),SUM(standings.points),SUM(standings.position),SUM(standings.wins),DAY(dob),YEAR(dob),MONTH(dob),WEEKDAY(dob),MEAN(standings.NUM_WORDS(positionText)),MEAN(standings.NUM_CHARACTERS(positionText)),SUM(standings.NUM_WORDS(positionText)),SUM(standings.NUM_CHARACTERS(positionText))
driverId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,British,207.0,127.864734,2.84058,2.864734,26468.0,588.0,593.0,1.0,1985.0,7.0,0.0,1.0,1.028986,207.0,213.0
2,German,193.0,14.663212,10.357513,0.0,2830.0,1999.0,0.0,5.0,1977.0,10.0,2.0,1.0,1.471503,193.0,284.0
3,German,206.0,82.087379,6.76699,1.305825,16910.0,1394.0,269.0,27.0,1985.0,6.0,3.0,1.0,1.23301,206.0,254.0
4,Spanish,295.0,61.681356,7.054237,1.172881,18196.0,2081.0,346.0,29.0,1981.0,7.0,2.0,1.0,1.274576,295.0,376.0
5,Finnish,111.0,8.585586,14.810811,0.072072,953.0,1644.0,8.0,19.0,1981.0,10.0,0.0,1.0,1.711712,111.0,190.0
