In [20]:
# importing required libraries 

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 
import numpy as np
from sklearn.model_selection import train_test_split
import warnings
warnings.simplefilter("ignore")

In [21]:
# Load the Formula 1 CSV tables used by the rest of the notebook.
# The dataset directory is kept in one place so it can be re-pointed easily.
DATA_DIR = '../input/formula-1-world-championship-1950-2020'

result_df = pd.read_csv(f'{DATA_DIR}/results.csv')
stats_df = pd.read_csv(f'{DATA_DIR}/status.csv')
drivers_df = pd.read_csv(f'{DATA_DIR}/drivers.csv')
races_df = pd.read_csv(f'{DATA_DIR}/races.csv')
constructor_df = pd.read_csv(f'{DATA_DIR}/constructors.csv')
driver_standings_df = pd.read_csv(f'{DATA_DIR}/driver_standings.csv')

# Show every column when a wide frame is displayed.
# (pd.get_option only *reads* a setting; set_option is what actually changes it.)
pd.set_option("display.max_columns", None)

In [22]:
# Preview the first rows of the table loaded from results.csv.
result_df.head() 

In [23]:
# Preview the first rows of the table loaded from status.csv.
stats_df.head() 

In [24]:
# Preview the first rows of the table loaded from drivers.csv.
drivers_df.head() 

In [25]:
# Preview the first rows of the table loaded from races.csv.
races_df.head() 

In [26]:
# Preview the first rows of the table loaded from constructors.csv.
constructor_df.head() 

In [27]:
# Preview the first rows of the table loaded from driver_standings.csv.
driver_standings_df.head() 

In [28]:
# Merge the separate tables into a single DataFrame, df.
#
# Columns that exist on both sides of a merge get pandas' default suffixes
# (_x = left table, _y = right table); the drop/rename cells further down rely
# on those suffixed names, so they are preserved here.

con1 = pd.merge(result_df, races_df, on='raceId')
con2 = pd.merge(con1, drivers_df, on='driverId')

# driver_standings carries its own raceId (it surfaces below as raceId_y).
# Joining on driverId alone pairs every result of a driver with every standings
# row that driver has, which multiplies the row count and attaches standings
# from unrelated races; the resulting near-duplicate rows later end up on both
# sides of the train/test split. Match on race AND driver instead.
# The race key is renamed on each side so the output still exposes
# raceId_x / raceId_y exactly as the downstream cells expect.
# NOTE(review): this is an inner join, so results without a standings row for
# the same race are dropped -- confirm that is acceptable for the analysis.
con3 = pd.merge(
    con2.rename(columns={'raceId': 'raceId_x'}),
    driver_standings_df.rename(columns={'raceId': 'raceId_y'}),
    left_on=['raceId_x', 'driverId'],
    right_on=['raceId_y', 'driverId'],
)
con4 = pd.merge(con3, constructor_df, on='constructorId')
df = pd.merge(con4, stats_df, on='statusId')

# set_option (not get_option) is what actually widens the display.
pd.set_option("display.max_columns", None)
df.head()

In [29]:
# Count the missing (NaN) values in each column of the merged frame.

df.isna().sum()

In [30]:
# Print the column names, non-null counts and dtypes of the merged frame.
df.info()

In [31]:
# Summary statistics, transposed so that each column of df becomes one row.
df.describe().T  

In [32]:
# List the merged column names (including the _x / _y suffixes added by the merges).
df.columns

In [33]:
# Drop columns that are not used in the rest of the analysis.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas deprecated in 1.3 and removed in 2.0.

unwanted_cols = ['url', 'url_x', 'position_x', 'fastestLapTime', 'positionText_x',
                 'time_x', 'time_y', 'driverRef', 'constructorRef', 'nationality_y',
                 'url_y', 'positionText_y', 'raceId_y', 'points_y']
df = df.drop(columns=unwanted_cols)

In [34]:
# Replace the merge-suffixed / terse column names with more descriptive ones.
# NOTE(review): 'raceId_x' is mapped to 'racerId' although it originates from the
# raceId column -- the name is kept as-is here; confirm whether 'raceId' was meant.

col_name = {
    'number_x': 'number',
    'milliseconds': 'timetaken_in_millisec',
    'fastestLapSpeed': 'max_speed',
    'name_x': 'grand_prix',
    'number_y': 'driver_num',
    'code': 'driver_code',
    'nationality_x': 'nationality',
    'name_y': 'company',
    'raceId_x': 'racerId',
    'points_x': 'points',
    'position_y': 'position',
}

df = df.rename(columns=col_name)
df.head()

In [35]:
# Build a single driver_name column as "<forename> <surname>".

df['driver_name'] = df['forename'].str.cat(df['surname'], sep=' ')

In [36]:
# forename / surname are now redundant with driver_name.
# `columns=` replaces the positional axis argument that pandas 2.0 removed.
df = df.drop(columns=['forename', 'surname'])

In [37]:
# Preview of parsing the string `date` column as datetimes.
# The result is only displayed here, not stored; df is left unchanged by this cell.

pd.to_datetime(df.date)

In [38]:
# Parse the date-of-birth and race-date columns into datetime64 values.
for date_col in ('dob', 'date'):
    df[date_col] = pd.to_datetime(df[date_col])

In [39]:
from datetime import datetime

In [40]:
# Driver's age in years, measured at the date of the race each row describes.
# Measuring from the race date instead of datetime.today() ties the value to the
# event in the row and makes the notebook produce the same numbers whenever it is
# re-run. Both columns were converted to datetime in the cell above.

dates = df['date'] - df['dob']
# 365.25 accounts for leap years when turning days into years.
age = dates.dt.days / 365.25

In [41]:
# Store the age rounded to the nearest whole number.
df['age'] = age.round()

In [42]:
# Lift the column display limit so the wide merged frame is shown in full.
pd.set_option('display.max_columns', None)
df.head()

In [43]:
# Convert the columns below to numeric dtypes; entries that cannot be parsed
# become NaN (errors='coerce').

l = ['number', 'timetaken_in_millisec', 'fastestLap', 'rank', 'max_speed', 'driver_num']
df[l] = df[l].apply(pd.to_numeric, errors='coerce')

In [44]:
# driver_num is not used further.
# Re-assign instead of positional-axis + inplace: the positional axis argument
# was removed in pandas 2.0, and re-assignment keeps the cell safe to re-read.
df = df.drop(columns='driver_num')

In [45]:
# Separate the column names into categorical (object dtype) and everything else.
# Note that `num` therefore also receives the datetime columns present at this point.

cat = [col for col in df.columns if df[col].dtypes == 'O']
num = [col for col in df.columns if df[col].dtypes != 'O']

In [46]:
# Preview the object-dtype (categorical) columns.
df[cat].head()

In [47]:
# Preview the non-object columns.
df[num].head()

In [48]:
# Check the dtype of every column after the numeric / datetime conversions.
df.dtypes

In [49]:
# Preview the frame after the type conversions.
df.head()

In [50]:
# Percentage of missing values per column, before imputation.
df.isnull().sum() / len(df) * 100

In [51]:
# Mean of max_speed; the imputation cell below fills missing max_speed with this mean.
df['max_speed'].mean()

In [52]:
# Impute missing values:
#   rank, fastestLap, number      -> 0
#   timetaken_in_millisec, max_speed -> that column's mean
fill_values = {
    'rank': 0,
    'fastestLap': 0,
    'timetaken_in_millisec': df['timetaken_in_millisec'].mean(),
    'max_speed': df['max_speed'].mean(),
    'number': 0,
}
df = df.fillna(value=fill_values)

In [53]:
# Percentage of missing values per column, re-checked after imputation.
df.isnull().sum() / len(df) * 100

In [54]:
# Preview the frame after imputation.
df.head()

In [55]:
# Summary statistics after imputation, one row per column.
df.describe().T

In [56]:
# Load the circuits table; its lat / lng / name columns are used by the map cell below.
circuit_df = pd.read_csv('../input/formula-1-world-championship-1950-2020/circuits.csv')
circuit_df.head()

In [57]:
# Plot every F1 circuit on a world map using its latitude / longitude.

import folium

# [lat, lng] pair per circuit, in the same row order as circuit_df.
coordinates = circuit_df[['lat', 'lng']].values.tolist()

# 'Stamen Watercolor' is not a built-in tile set in current folium releases
# (custom tiles need an explicit URL and attribution), so a built-in one is used.
maps = folium.Map(zoom_start=2, tiles='cartodbpositron')
for i, j in zip(coordinates, circuit_df['name']):
    marker = folium.Marker(
        location=i,
        icon=folium.Icon(icon="star", color='cadetblue'),
        popup="<strong>{0}</strong>".format(j))  # <strong> renders the circuit name in bold
    marker.add_to(maps)
maps

In [58]:
# Distinct values of rank; 0 stands for entries that were missing before imputation.
df['rank'].unique()

In [59]:
# Keep only the rows whose status is 'Finished', then look at the last few.
finished_mask = df['status'].eq('Finished')
df_fin = df.loc[finished_mask]
df_fin.tail()

In [60]:
# Both thresholds are averaged over all rows of df, not only the finished ones.
mean = df['max_speed'].mean()
mean2 = df['fastestLap'].mean()

# From here on df holds only finished rows with above-average max_speed.
above_avg_speed = df_fin['max_speed'].gt(mean)
df = df_fin.loc[above_avg_speed]
df.head()

In [61]:
# Display the rows whose fastestLap exceeds mean2.
# The result is not assigned, so df itself is not filtered by this cell.
df[df['fastestLap']>mean2]

In [62]:
# Distinct season years remaining after the filters above.
df.year.unique()

In [63]:
# Keep rows where the driver's age is below the mean age AND the year is after 2012.

is_younger = df['age'] < df['age'].mean()
is_recent = df['year'] > 2012
df = df[is_younger & is_recent]
df

In [64]:
# Drop columns that are no longer needed.
# df is the result of boolean filtering at this point, so an in-place drop is a
# chained-assignment hazard; re-assigning avoids that. `columns=` also replaces
# the positional axis argument that pandas 2.0 removed.

df = df.drop(columns=['date', 'dob', 'statusId'])

In [65]:
# Skewness of the numeric columns (before outlier removal).
# df still contains object columns here; without numeric_only=True pandas >= 2.0
# raises instead of silently skipping them.
df.skew(numeric_only=True)

In [66]:
# Outlier removal with the 1.5 * IQR rule.
# A row is dropped if ANY of its numeric columns lies outside
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
# The quantiles and comparisons are restricted to numeric columns: df still holds
# object columns at this point, and pandas >= 2.0 no longer skips them implicitly.

numeric_cols = df.select_dtypes(include='number').columns
Q1 = df[numeric_cols].quantile(0.25)
Q3 = df[numeric_cols].quantile(0.75)
IQR = Q3 - Q1
is_outlier = (
    (df[numeric_cols] < (Q1 - 1.5 * IQR)) | (df[numeric_cols] > (Q3 + 1.5 * IQR))
).any(axis=1)
df = df[~is_outlier]
df.head()

In [68]:
# Heatmap of the pairwise correlations between the numeric columns.
# Object columns are excluded explicitly; pandas >= 2.0 raises if corr() is
# called on a frame that still contains them.

plt.figure(figsize=(14, 10))
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True)
plt.show()

In [69]:
# Remove the columns that were dropped from df from the numeric-column list.
# Filtering (instead of list.remove) keeps the cell idempotent: running it a
# second time no longer raises ValueError.
num = [col for col in num if col not in ('date', 'dob', 'statusId')]

In [73]:
# KDE plot of every column in `num`, to inspect the shape of each distribution.

plt.figure(figsize=(18, 52))
# Two plots per row. Keep at least the original 14 rows, but grow the grid when
# `num` has more than 28 entries (a fixed 14x2 grid would raise on the 29th).
n_rows = max(14, (len(num) + 1) // 2)
for j, i in enumerate(num, start=1):
    plt.subplot(n_rows, 2, j)
    # `fill` is the current name of the deprecated `shade` argument.
    sns.kdeplot(df[i], fill=True, color='darkblue')
plt.show()

In [71]:
# Skewness of the numeric columns, re-checked after outlier removal.
# numeric_only=True is required on pandas >= 2.0 because the categorical columns
# have not been encoded yet.
df.skew(numeric_only=True)

In [75]:
from sklearn.preprocessing import LabelEncoder
# Label encoder for the categorical-column encoding cell below.
le = LabelEncoder()

In [76]:
# Encode every categorical (object) column as integer labels.

# df is the result of boolean filtering, so assigning columns into it is treated
# by pandas as chained assignment; work on an explicit copy instead.
df = df.copy()

# Keep one fitted encoder per column. Re-fitting a single shared encoder leaves
# it holding only the last column's classes, so no other column (including the
# driver_name target) could be mapped back with inverse_transform.
encoders = {}
for i in cat:
    le = LabelEncoder()
    df[i] = le.fit_transform(df[i])
    encoders[i] = le
df.head()

In [77]:
# Features: every column except the target. Target: the encoded driver_name.
# `columns=` replaces the positional axis argument that pandas 2.0 removed.
x = df.drop(columns='driver_name')
y = df['driver_name']

In [79]:
# Hold out 25% of the rows for evaluation; the fixed seed makes the split reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

# The model-comparison and scaling cells refer to the split as
# xtrain / xtest / ytrain / ytest. Those names are not assigned anywhere in the
# notebook, so on a fresh kernel they fail with NameError. Bind them to the same
# objects so both spellings work.
xtrain, xtest, ytrain, ytest = X_train, X_test, y_train, y_test

In [80]:
# importing ML libraries 

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn import tree

In [82]:
# Fit a depth-limited decision tree (seeded) on the training split.
clf = DecisionTreeClassifier(random_state=1234, max_depth=5).fit(X_train, y_train)
clf

In [83]:
# Text rendering of the fitted tree's decision rules.
# export_text returns one string with embedded newlines: print it so it is laid
# out as a tree rather than echoed as a single escaped repr, and pass the feature
# column names so the rules show real names instead of generic feature_<n> labels.
print(tree.export_text(clf, feature_names=list(x.columns)))

In [84]:
# Feature names for the tree plot: every column of df except the target.
fn = [col for col in df.columns if col != 'driver_name']

In [85]:
# Draw the fitted decision tree; nodes are colour-filled (filled=True).

fig = plt.figure(figsize=(40, 40))
_ = tree.plot_tree(decision_tree=clf, feature_names=fn, filled=True)

In [86]:
# Classifiers to compare.
# The estimators with internal randomness (sag solver, tree, forest, SGD) are
# seeded so that the reported accuracies are reproducible across runs.

SEED = 42

lr = LogisticRegression(solver='sag', random_state=SEED)
dt = DecisionTreeClassifier(random_state=SEED)
rn = RandomForestClassifier(random_state=SEED)
knn = KNeighborsClassifier()
gb = GaussianNB()
sgd = SGDClassifier(random_state=SEED)

In [87]:
# Fit every classifier on the unscaled features and record its test accuracy (%).
# Uses the X_train / X_test / y_train / y_test names produced by the train/test
# split cell.
li = [lr, sgd, knn, gb, rn, dt]
d = {}
for i in li:
    i.fit(X_train, y_train)
    ypred = i.predict(X_test)
    # accuracy_score expects (y_true, y_pred). For a classifier this equals
    # i.score(X_test, y_test), so it is computed once and reused.
    acc = accuracy_score(y_test, ypred) * 100
    print(i, ":", acc)
    d[str(i)] = acc

In [91]:
# Accuracy of each classifier on the unscaled features.
plt.figure(figsize=(14, 7))
plt.title("Algorithm vs Accuracy", fontweight='bold')
plt.xlabel("Algorithms")
plt.ylabel("Accuracy")  # was misspelled "Accuracys"
# Materialise the dict views as lists so plot() receives plain sequences.
plt.plot(list(d.keys()), list(d.values()), marker='o', color='plum', linewidth=4, markersize=13,
         markerfacecolor='gold', markeredgecolor='slategray')
plt.show()

# Scaling the Data for better Algorithmic Performance

In [92]:
from sklearn.preprocessing import MinMaxScaler
# Fit the scaler on the training split only, then apply the same transform to
# both splits. Uses the X_train / X_test names produced by the train/test split cell.
norm = MinMaxScaler().fit(X_train)
# transform training data
X_train_norm = norm.transform(X_train)
# transform testing data
X_test_norm = norm.transform(X_test)

In [93]:
# Re-fit every classifier on the min-max-scaled features and record its test accuracy (%).
# Uses the y_train / y_test names produced by the train/test split cell.
li = [lr, sgd, rn, knn, gb, dt]
di = {}
for i in li:
    i.fit(X_train_norm, y_train)
    ypred = i.predict(X_test_norm)
    # accuracy_score expects (y_true, y_pred); the value equals i.score(...),
    # so it is computed once and reused.
    acc = accuracy_score(y_test, ypred) * 100
    print(i, ":", acc)
    di[str(i)] = acc

In [97]:
# Accuracy of each classifier on the min-max-scaled features.
fig, ax = plt.subplots(figsize=(14, 7))
ax.set_title("Algorithm vs Accuracy", fontweight='bold')
ax.set_xlabel("Algorithm")
ax.set_ylabel("Accuracy")
ax.plot(
    list(di.keys()), list(di.values()),
    marker='o', color='black', linewidth=4, markersize=13,
    markerfacecolor='gold', markeredgecolor='black',
)
plt.show()

In [98]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split only, then apply the same transform to
# both splits. Uses the X_train / X_test names produced by the train/test split cell.
std = StandardScaler().fit(X_train)
# transform train data
x_train_std = std.transform(X_train)
# transform test data
x_test_std = std.transform(X_test)

In [99]:
# Re-fit every classifier on the standardised features and record its test accuracy (%).
# Uses the y_train / y_test names produced by the train/test split cell.
li = [lr, sgd, rn, knn, gb, dt]
dic = {}
for i in li:
    i.fit(x_train_std, y_train)
    ypred = i.predict(x_test_std)
    # accuracy_score expects (y_true, y_pred); the value equals i.score(...),
    # so it is computed once and reused.
    acc = accuracy_score(y_test, ypred) * 100
    print(i, ":", acc)
    dic[str(i)] = acc

In [100]:
# Accuracy of each classifier on the standardised features.
fig, ax = plt.subplots(figsize=(14, 7))
ax.set_title("Algorithm vs Accuracy", fontweight='bold')
ax.set_xlabel("Algorithm")
ax.set_ylabel("Accuracy")
ax.plot(
    list(dic.keys()), list(dic.values()),
    marker='o', color='purple', linewidth=4, markersize=13,
    markerfacecolor='gold', markeredgecolor='black',
)
plt.show()

In [101]:
from sklearn.preprocessing import RobustScaler
# Fit the scaler on the training split only, then apply the same transform to
# both splits. Uses the X_train / X_test names produced by the train/test split cell.
scaler = RobustScaler().fit(X_train)
# transform train data
xtrain_scaled = scaler.transform(X_train)
# transform test data
xtest_scaled = scaler.transform(X_test)

In [102]:
# Re-fit every classifier on the robust-scaled features and record its test accuracy (%).
# Uses the y_train / y_test names produced by the train/test split cell.
li = [lr, sgd, rn, knn, gb, dt]
dics = {}
for i in li:
    i.fit(xtrain_scaled, y_train)
    ypred = i.predict(xtest_scaled)
    # accuracy_score expects (y_true, y_pred); the value equals i.score(...),
    # so it is computed once and reused.
    acc = accuracy_score(y_test, ypred) * 100
    print(i, ":", acc)
    dics[str(i)] = acc

In [103]:
# Accuracy of each classifier on the robust-scaled features.
fig, ax = plt.subplots(figsize=(14, 7))
ax.set_title("Algorithm vs Accuracy", fontweight='bold')
ax.set_xlabel("Algorithm")
ax.set_ylabel("Accuracy")
ax.plot(
    list(dics.keys()), list(dics.values()),
    marker='o', color='brown', linewidth=4, markersize=13,
    markerfacecolor='gold', markeredgecolor='black',
)
plt.show()