In [175]:
import warnings
warnings.filterwarnings('ignore')

In [176]:
import psycopg2
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sqlalchemy import create_engine
from matplotlib import pyplot as plt

In [177]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [187]:
#connect to pgadmin database

#https://blog.panoply.io/connecting-jupyter-notebook-with-postgresql-for-python-data-analysis
from config import password

addy = "localhost"
port = "5432"
username = "postgres"
pswd = ""
dbname = "Formula_1"

#string that contains the postgres login info
postgres_str = ('postgresql://{username}:{password}@{ipaddress}:{port}/{dbname}'
    .format(username=username,
    password= pswd,
    ipaddress=addy,
    port=port,
    dbname=dbname))

#connection
connection = create_engine(postgres_str)

In [188]:
#loading data to pandas dataframe


#we'll use the table with all our modeling info
df = pd.read_sql_query('''SELECT * FROM modeling_data ;''', connection)
#df.count()
df.head()

Unnamed: 0,full_name,driverId,race_name,raceId,date,grid_position,finish_position,avg_humidity,avg_air_pressure,rainfall,avg_airtemp,safety_car_laps,redflag,total_lap_time,downforce_level,longest_flat_out,first_stop,start_tyre,end_tyre,num_stops
0,Valtteri Bottas,822,Australian Grand Prix,1010,2019-03-17,2,1.0,70.453279,1015.334426,False,23.477869,0,0,5127.325,High,843.0,23.0,SOFT,MEDIUM,2.0
1,Lewis Hamilton,1,Australian Grand Prix,1010,2019-03-17,1,2.0,70.453279,1015.334426,False,23.477869,0,0,5148.211,High,843.0,15.0,SOFT,MEDIUM,2.0
2,Max Verstappen,830,Australian Grand Prix,1010,2019-03-17,4,3.0,70.453279,1015.334426,False,23.477869,0,0,5149.845,High,843.0,25.0,SOFT,MEDIUM,2.0
3,Sebastian Vettel,20,Australian Grand Prix,1010,2019-03-17,3,4.0,70.453279,1015.334426,False,23.477869,0,0,5184.434,High,843.0,14.0,SOFT,MEDIUM,2.0
4,Charles Leclerc,844,Australian Grand Prix,1010,2019-03-17,5,5.0,70.453279,1015.334426,False,23.477869,0,0,5185.555,High,843.0,28.0,SOFT,HARD,2.0


In [152]:
#data cleanup/prep

#drop null columns
clean_df = df.dropna(axis='columns',how='all')
#clean_df.count()

#drop null rows
clean_df = clean_df.dropna()
#clean_df.count()

#converting non-numerical data into numbers
clean_df['rainfall'] = np.where(clean_df['rainfall'] == 'False', 0,1)

#converting categorical variables to numerical
#test_df['downforce_level'] = test_df['downforce_level'].replace(["Low","Low/medium","Medium","High","Maximum"],[1,2,3,4,5],inplace=True)

#test_df
#'finish_position','avg_humidity','avg_air_pressure','avg_airtemp'
#dropping unneeded columns
#test_df = clean_df.drop(['driverId','raceId','full_name','race_name','date','downforce_level','start_tyre','end_tyre'], axis=1)

#converting finish position to int
#test_df['finish_position'] = test_df['finish_position'].astype(int)

#test_df.dtypes
clean_df.dtypes

full_name            object
driverId              int64
race_name            object
raceId                int64
date                 object
grid_position         int64
finish_position     float64
avg_humidity        float64
avg_air_pressure    float64
rainfall              int32
avg_airtemp         float64
safety_car_laps       int64
redflag               int64
total_lap_time      float64
downforce_level      object
longest_flat_out    float64
first_stop          float64
start_tyre           object
end_tyre             object
num_stops           float64
dtype: object

In [136]:
#split data into training and testing

# Create our features
X = clean_df.drop(columns = 'finish_position')
X = pd.get_dummies(X)
#X.head()
#Create our target
y = clean_df['finish_position']


In [137]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [138]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs', random_state=1)
classifier

LogisticRegression(random_state=1)

In [139]:
classifier.fit(X_train, y_train)

LogisticRegression(random_state=1)

In [140]:
predictions = classifier.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
1072,7.0,14.0
162,7.0,3.0
207,7.0,8.0
247,6.0,8.0
1028,7.0,10.0
...,...,...
880,7.0,1.0
462,7.0,4.0
460,7.0,2.0
393,6.0,14.0


In [143]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, predictions)

0.028169014084507043

In [142]:
print(accuracy_score(y_test, y_pred))

0.1056338028169014


In [144]:
#balanced random forest classifers
from imblearn.ensemble import BalancedRandomForestClassifier
brfc = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
brfc.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=1)

In [145]:
# Calculated the balanced accuracy score
from sklearn.metrics import confusion_matrix
y_pred = brfc.predict(X_test)


from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

0.07464285714285715

In [146]:
#confusion matrix
confusion_matrix(y_test, y_pred)

array([[5, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [2, 3, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 3, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [2, 0, 1, 2, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 1, 1, 1, 0, 1, 2, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 2, 0, 1, 2, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0],
       [1, 0, 0, 2, 4, 0, 1, 2, 0, 0, 0, 1, 0, 2, 1, 0, 0, 0, 1, 0],
       [0, 0, 1, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 1, 2, 0, 1, 0, 1, 0],
       [0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 0, 3, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 3,

In [147]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        1.0       0.38      0.62      0.94      0.48      0.77      0.57         8
        2.0       0.23      0.60      0.93      0.33      0.75      0.54         5
        3.0       0.00      0.00      0.97      0.00      0.00      0.00         8
        4.0       0.09      0.12      0.93      0.11      0.34      0.11         8
        5.0       0.00      0.00      0.94      0.00      0.00      0.00         9
        6.0       0.00      0.00      0.96      0.00      0.00      0.00         9
        7.0       0.00      0.00      0.94      0.00      0.00      0.00         1
        8.0       0.00      0.00      0.92      0.00      0.00      0.00         8
        9.0       0.00      0.00      0.87      0.00      0.00      0.00        15
       10.0       0.00      0.00      0.97      0.00      0.00      0.00         9
       11.0       0.00      0.00      0.96      0.00      0.00      0.00         7
   

In [148]:
#EasyEnsembleClassifier

from imblearn.ensemble import EasyEnsembleClassifier
eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train, y_train)

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [149]:
# Calculated the balanced accuracy score
y_pred = eec.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.100239898989899

In [150]:

# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[1, 2, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 2, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 3, 1, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 2, 0, 0, 3, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0],
       [0, 1, 0, 1, 0, 1, 4, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 2, 0, 1, 1, 0, 0, 0, 0, 2, 0],
       [0, 1, 0, 1, 0, 2, 2, 1, 1, 1, 2, 0, 0, 0, 1, 0, 0, 1, 2, 0],
       [0, 1, 1, 0, 0, 0, 0, 0, 2, 0, 2, 0, 1, 0, 1, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 3, 1, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 2, 0, 0, 1, 1, 0, 0, 1, 2, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0,

In [151]:

# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        1.0       0.50      0.12      0.99      0.20      0.35      0.11         8
        2.0       0.13      0.40      0.91      0.20      0.60      0.34         5
        3.0       0.36      0.62      0.93      0.45      0.76      0.57         8
        4.0       0.11      0.12      0.94      0.12      0.34      0.11         8
        5.0       0.00      0.00      0.99      0.00      0.00      0.00         9
        6.0       0.12      0.11      0.95      0.12      0.32      0.10         9
        7.0       0.00      0.00      0.91      0.00      0.00      0.00         1
        8.0       0.00      0.00      0.96      0.00      0.00      0.00         8
        9.0       0.17      0.07      0.96      0.10      0.25      0.06        15
       10.0       0.00      0.00      0.97      0.00      0.00      0.00         9
       11.0       0.00      0.00      0.92      0.00      0.00      0.00         7
   