## Importing dependencies

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import psycopg2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as ns
from pathlib import Path
from collections import Counter
from sqlalchemy import create_engine

In [3]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression

## Connecting to the database

In [4]:
#connect to pgadmin database

#https://blog.panoply.io/connecting-jupyter-notebook-with-postgresql-for-python-data-analysis
#from config import password

addy = "localhost"
port = "5432"
username = "postgres"
pswd = ""
dbname = "Formula_1"

#string that contains the postgres login info
postgres_str = ('postgresql://{username}:{password}@{ipaddress}:{port}/{dbname}'
    .format(username=username,
    password= pswd,
    ipaddress=addy,
    port=port,
    dbname=dbname))

#connection
connection = create_engine(postgres_str)

In [5]:
#confirming the data connect and loading data to pandas dataframe

#we'll use the table with all our modeling info
df = pd.read_sql_query('''SELECT * FROM sim_data ;''', connection)
#df.count()
df.head()

Unnamed: 0,race_name,date,full_name,finish_position,grid_position,avg_humidity,avg_air_pressure,rainfall,avg_airtemp,downforce_level,start_tyre,end_tyre,num_stops,q1,q2,q3
0,Australian Grand Prix,2019-03-17,Valtteri Bottas,1.0,2,70.453279,1015.334426,False,23.477869,High,SOFT,MEDIUM,2.0,0 days 00:01:22.400000,0 days 00:01:21.200000,0 days 00:01:20.600000
1,Australian Grand Prix,2019-03-17,Lewis Hamilton,2.0,1,70.453279,1015.334426,False,23.477869,High,SOFT,MEDIUM,2.0,0 days 00:01:22,0 days 00:01:21,0 days 00:01:20.500000
2,Australian Grand Prix,2019-03-17,Max Verstappen,3.0,4,70.453279,1015.334426,False,23.477869,High,SOFT,MEDIUM,2.0,0 days 00:01:22.900000,0 days 00:01:21.700000,0 days 00:01:21.300000
3,Australian Grand Prix,2019-03-17,Sebastian Vettel,4.0,3,70.453279,1015.334426,False,23.477869,High,SOFT,MEDIUM,2.0,0 days 00:01:22.900000,0 days 00:01:21.900000,0 days 00:01:21.200000
4,Australian Grand Prix,2019-03-17,Charles Leclerc,5.0,5,70.453279,1015.334426,False,23.477869,High,SOFT,HARD,2.0,0 days 00:01:22,0 days 00:01:21.700000,0 days 00:01:21.400000


## Data cleanup/prep

In [6]:
##checking types
df.dtypes

race_name                    object
date                         object
full_name                    object
finish_position             float64
grid_position                 int64
avg_humidity                float64
avg_air_pressure            float64
rainfall                       bool
avg_airtemp                 float64
downforce_level              object
start_tyre                   object
end_tyre                     object
num_stops                   float64
q1                  timedelta64[ns]
q2                  timedelta64[ns]
q3                  timedelta64[ns]
dtype: object

In [7]:
#checking nulls
df.isnull().sum()

race_name             0
date                  0
full_name             0
finish_position     147
grid_position         0
avg_humidity          0
avg_air_pressure      0
rainfall              0
avg_airtemp           0
downforce_level     420
start_tyre           19
end_tyre             19
num_stops            19
q1                   16
q2                  295
q3                  581
dtype: int64

In [8]:
#dropping unneeded columns
clean_df = df.drop(['race_name','date','full_name','downforce_level'],axis=1)
clean_df.head()

Unnamed: 0,finish_position,grid_position,avg_humidity,avg_air_pressure,rainfall,avg_airtemp,start_tyre,end_tyre,num_stops,q1,q2,q3
0,1.0,2,70.453279,1015.334426,False,23.477869,SOFT,MEDIUM,2.0,0 days 00:01:22.400000,0 days 00:01:21.200000,0 days 00:01:20.600000
1,2.0,1,70.453279,1015.334426,False,23.477869,SOFT,MEDIUM,2.0,0 days 00:01:22,0 days 00:01:21,0 days 00:01:20.500000
2,3.0,4,70.453279,1015.334426,False,23.477869,SOFT,MEDIUM,2.0,0 days 00:01:22.900000,0 days 00:01:21.700000,0 days 00:01:21.300000
3,4.0,3,70.453279,1015.334426,False,23.477869,SOFT,MEDIUM,2.0,0 days 00:01:22.900000,0 days 00:01:21.900000,0 days 00:01:21.200000
4,5.0,5,70.453279,1015.334426,False,23.477869,SOFT,HARD,2.0,0 days 00:01:22,0 days 00:01:21.700000,0 days 00:01:21.400000


In [9]:
#replacing null finish position with last place identifier
clean_df['finish_position'] = clean_df['finish_position'].fillna(20)
clean_df.isnull().sum()

finish_position       0
grid_position         0
avg_humidity          0
avg_air_pressure      0
rainfall              0
avg_airtemp           0
start_tyre           19
end_tyre             19
num_stops            19
q1                   16
q2                  295
q3                  581
dtype: int64

In [10]:
#finding min of q1, q2, q3 for minimum qualifying round times, storing as a new column
clean_df['qt_min'] = clean_df[['q1', 'q2', 'q3']].min(axis=1)
clean_df.head()

Unnamed: 0,finish_position,grid_position,avg_humidity,avg_air_pressure,rainfall,avg_airtemp,start_tyre,end_tyre,num_stops,q1,q2,q3,qt_min
0,1.0,2,70.453279,1015.334426,False,23.477869,SOFT,MEDIUM,2.0,0 days 00:01:22.400000,0 days 00:01:21.200000,0 days 00:01:20.600000,0 days 00:01:20.600000
1,2.0,1,70.453279,1015.334426,False,23.477869,SOFT,MEDIUM,2.0,0 days 00:01:22,0 days 00:01:21,0 days 00:01:20.500000,0 days 00:01:20.500000
2,3.0,4,70.453279,1015.334426,False,23.477869,SOFT,MEDIUM,2.0,0 days 00:01:22.900000,0 days 00:01:21.700000,0 days 00:01:21.300000,0 days 00:01:21.300000
3,4.0,3,70.453279,1015.334426,False,23.477869,SOFT,MEDIUM,2.0,0 days 00:01:22.900000,0 days 00:01:21.900000,0 days 00:01:21.200000,0 days 00:01:21.200000
4,5.0,5,70.453279,1015.334426,False,23.477869,SOFT,HARD,2.0,0 days 00:01:22,0 days 00:01:21.700000,0 days 00:01:21.400000,0 days 00:01:21.400000


In [11]:
#dropping unneeded qualification times prior to dropping all nulls to maintain as much data as possible
clean_df = clean_df.drop(['q1','q2','q3'],axis =1)
clean_df

Unnamed: 0,finish_position,grid_position,avg_humidity,avg_air_pressure,rainfall,avg_airtemp,start_tyre,end_tyre,num_stops,qt_min
0,1.0,2,70.453279,1015.334426,False,23.477869,SOFT,MEDIUM,2.0,0 days 00:01:20.600000
1,2.0,1,70.453279,1015.334426,False,23.477869,SOFT,MEDIUM,2.0,0 days 00:01:20.500000
2,3.0,4,70.453279,1015.334426,False,23.477869,SOFT,MEDIUM,2.0,0 days 00:01:21.300000
3,4.0,3,70.453279,1015.334426,False,23.477869,SOFT,MEDIUM,2.0,0 days 00:01:21.200000
4,5.0,5,70.453279,1015.334426,False,23.477869,SOFT,HARD,2.0,0 days 00:01:21.400000
...,...,...,...,...,...,...,...,...,...,...
1131,20.0,20,76.203109,1013.797409,False,28.816062,MEDIUM,HARD,2.0,0 days 00:01:30.500000
1132,20.0,5,76.203109,1013.797409,False,28.816062,MEDIUM,MEDIUM,3.0,0 days 00:01:27.900000
1133,20.0,14,76.203109,1013.797409,False,28.816062,MEDIUM,HARD,3.0,0 days 00:01:28.900000
1134,20.0,17,76.203109,1013.797409,False,28.816062,HARD,MEDIUM,3.0,0 days 00:01:29.200000


In [12]:
#rechecking nulls
clean_df.isnull().sum()

finish_position      0
grid_position        0
avg_humidity         0
avg_air_pressure     0
rainfall             0
avg_airtemp          0
start_tyre          19
end_tyre            19
num_stops           19
qt_min              16
dtype: int64

In [13]:
#drop all remaining nulls
clean_df = clean_df.dropna()
clean_df.isnull().sum()

finish_position     0
grid_position       0
avg_humidity        0
avg_air_pressure    0
rainfall            0
avg_airtemp         0
start_tyre          0
end_tyre            0
num_stops           0
qt_min              0
dtype: int64

In [14]:
#converting qt_min to nanoseconds and int64 to be able to run modeler, otherwise it throws an error
clean_df['qt_min_ns'] = clean_df['qt_min'].apply(lambda x: x.delta)
clean_df.dtypes

finish_position             float64
grid_position                 int64
avg_humidity                float64
avg_air_pressure            float64
rainfall                       bool
avg_airtemp                 float64
start_tyre                   object
end_tyre                     object
num_stops                   float64
qt_min              timedelta64[ns]
qt_min_ns                     int64
dtype: object

In [15]:
#converting num_stops to integer

clean_df['num_stops'] = clean_df['num_stops'].astype(int)
clean_df

Unnamed: 0,finish_position,grid_position,avg_humidity,avg_air_pressure,rainfall,avg_airtemp,start_tyre,end_tyre,num_stops,qt_min,qt_min_ns
0,1.0,2,70.453279,1015.334426,False,23.477869,SOFT,MEDIUM,2,0 days 00:01:20.600000,80600000000
1,2.0,1,70.453279,1015.334426,False,23.477869,SOFT,MEDIUM,2,0 days 00:01:20.500000,80500000000
2,3.0,4,70.453279,1015.334426,False,23.477869,SOFT,MEDIUM,2,0 days 00:01:21.300000,81300000000
3,4.0,3,70.453279,1015.334426,False,23.477869,SOFT,MEDIUM,2,0 days 00:01:21.200000,81200000000
4,5.0,5,70.453279,1015.334426,False,23.477869,SOFT,HARD,2,0 days 00:01:21.400000,81400000000
...,...,...,...,...,...,...,...,...,...,...,...
1131,20.0,20,76.203109,1013.797409,False,28.816062,MEDIUM,HARD,2,0 days 00:01:30.500000,90500000000
1132,20.0,5,76.203109,1013.797409,False,28.816062,MEDIUM,MEDIUM,3,0 days 00:01:27.900000,87900000000
1133,20.0,14,76.203109,1013.797409,False,28.816062,MEDIUM,HARD,3,0 days 00:01:28.900000,88900000000
1134,20.0,17,76.203109,1013.797409,False,28.816062,HARD,MEDIUM,3,0 days 00:01:29.200000,89200000000


In [16]:
#converting rainfall to integers
clean_df['rainfall'] = clean_df['rainfall'].astype(int)
clean_df

Unnamed: 0,finish_position,grid_position,avg_humidity,avg_air_pressure,rainfall,avg_airtemp,start_tyre,end_tyre,num_stops,qt_min,qt_min_ns
0,1.0,2,70.453279,1015.334426,0,23.477869,SOFT,MEDIUM,2,0 days 00:01:20.600000,80600000000
1,2.0,1,70.453279,1015.334426,0,23.477869,SOFT,MEDIUM,2,0 days 00:01:20.500000,80500000000
2,3.0,4,70.453279,1015.334426,0,23.477869,SOFT,MEDIUM,2,0 days 00:01:21.300000,81300000000
3,4.0,3,70.453279,1015.334426,0,23.477869,SOFT,MEDIUM,2,0 days 00:01:21.200000,81200000000
4,5.0,5,70.453279,1015.334426,0,23.477869,SOFT,HARD,2,0 days 00:01:21.400000,81400000000
...,...,...,...,...,...,...,...,...,...,...,...
1131,20.0,20,76.203109,1013.797409,0,28.816062,MEDIUM,HARD,2,0 days 00:01:30.500000,90500000000
1132,20.0,5,76.203109,1013.797409,0,28.816062,MEDIUM,MEDIUM,3,0 days 00:01:27.900000,87900000000
1133,20.0,14,76.203109,1013.797409,0,28.816062,MEDIUM,HARD,3,0 days 00:01:28.900000,88900000000
1134,20.0,17,76.203109,1013.797409,0,28.816062,HARD,MEDIUM,3,0 days 00:01:29.200000,89200000000


In [17]:
#converting start_tyre and end_tyre to integers
tire_cats = {np.nan:0, 'SOFT':1,'MEDIUM':2,'HARD':3, 'INTERMEDIATE':4, 'WET':5}

clean_df['start_tyre'] = clean_df['start_tyre'].map(tire_cats)
clean_df['end_tyre'] = clean_df['end_tyre'].map(tire_cats)
clean_df.head()

Unnamed: 0,finish_position,grid_position,avg_humidity,avg_air_pressure,rainfall,avg_airtemp,start_tyre,end_tyre,num_stops,qt_min,qt_min_ns
0,1.0,2,70.453279,1015.334426,0,23.477869,1,2,2,0 days 00:01:20.600000,80600000000
1,2.0,1,70.453279,1015.334426,0,23.477869,1,2,2,0 days 00:01:20.500000,80500000000
2,3.0,4,70.453279,1015.334426,0,23.477869,1,2,2,0 days 00:01:21.300000,81300000000
3,4.0,3,70.453279,1015.334426,0,23.477869,1,2,2,0 days 00:01:21.200000,81200000000
4,5.0,5,70.453279,1015.334426,0,23.477869,1,3,2,0 days 00:01:21.400000,81400000000


In [18]:
#creating Y value for model - if racers are in the top ten they get points
clean_df['top_ten'] = np.where(clean_df['finish_position'] <= 10, True,False)
clean_df = clean_df.drop(['qt_min'],axis=1)
clean_df.head(25)

Unnamed: 0,finish_position,grid_position,avg_humidity,avg_air_pressure,rainfall,avg_airtemp,start_tyre,end_tyre,num_stops,qt_min_ns,top_ten
0,1.0,2,70.453279,1015.334426,0,23.477869,1,2,2,80600000000,True
1,2.0,1,70.453279,1015.334426,0,23.477869,1,2,2,80500000000,True
2,3.0,4,70.453279,1015.334426,0,23.477869,1,2,2,81300000000,True
3,4.0,3,70.453279,1015.334426,0,23.477869,1,2,2,81200000000,True
4,5.0,5,70.453279,1015.334426,0,23.477869,1,3,2,81400000000,True
5,6.0,7,70.453279,1015.334426,0,23.477869,1,2,2,82100000000,True
6,7.0,11,70.453279,1015.334426,0,23.477869,1,3,2,82500000000,True
7,8.0,9,70.453279,1015.334426,0,23.477869,1,2,2,82300000000,True
8,9.0,16,70.453279,1015.334426,0,23.477869,2,3,2,83000000000,True
9,10.0,15,70.453279,1015.334426,0,23.477869,2,3,2,82500000000,True


## Checking data prior to loading into model

In [19]:
#confirming no nulls
clean_df.isnull().sum()


finish_position     0
grid_position       0
avg_humidity        0
avg_air_pressure    0
rainfall            0
avg_airtemp         0
start_tyre          0
end_tyre            0
num_stops           0
qt_min_ns           0
top_ten             0
dtype: int64

In [20]:
#confirming counts
clean_df.count()

finish_position     1101
grid_position       1101
avg_humidity        1101
avg_air_pressure    1101
rainfall            1101
avg_airtemp         1101
start_tyre          1101
end_tyre            1101
num_stops           1101
qt_min_ns           1101
top_ten             1101
dtype: int64

In [21]:
#confirming data types
clean_df.dtypes

finish_position     float64
grid_position         int64
avg_humidity        float64
avg_air_pressure    float64
rainfall              int32
avg_airtemp         float64
start_tyre            int64
end_tyre              int64
num_stops             int32
qt_min_ns             int64
top_ten                bool
dtype: object

## Training the model

In [60]:
#creating features, using all available columns other than predetermined Y - top ten

# Create our features
X = clean_df.drop(columns = 'top_ten')
X = pd.get_dummies(X)
#X.head()
#Create our target
y = clean_df['top_ten']


In [78]:
#splitting data into training and testing, using a test size of 33%
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .75, random_state=1)

In [79]:
#preparing model
classifier = LogisticRegression(solver='lbfgs', random_state=1)
classifier

LogisticRegression(random_state=1)

In [80]:
#loading model
classifier.fit(X_train, y_train)

LogisticRegression(random_state=1)

In [81]:
#putting predictions into a dataframe
y_pred = classifier.predict(X_test)
pd.DataFrame({"Prediction": y_pred, "Actual": y_test})

Unnamed: 0,Prediction,Actual
556,True,False
1011,True,False
375,True,False
604,True,True
879,True,True
...,...,...
51,True,False
1043,True,True
652,True,False
806,True,True


In [82]:
#checking accuracy
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.49878934624697335

In [83]:
#checking confusion matrix
predictions = classifier.predict(X_test)
confusion_matrix(y_test, predictions)

array([[  0, 414],
       [  0, 412]], dtype=int64)

In [77]:
#determining the importance of features
importance = classifier.coef_
importance

array([[-4.31363650e-21, -2.49705990e-21,  3.04631230e-22,
         5.62353578e-21,  1.75814157e-25,  1.31668769e-22,
        -6.39824526e-23,  7.03322477e-23,  8.68652460e-23,
        -4.14106138e-14]])