In [24]:
import numpy as np
import pandas as pd
import sklearn as sk
import pickle

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, plot_roc_curve

In [2]:
data = pd.read_csv('data/data.csv', sep=',')

In [3]:
data2 = data[['stop_time', 'driver_age', 'driver_gender',  'driver_race', 'violation', 'search_conducted', 'drugs_related_stop', 'is_arrested']]

In [4]:
display(data2)

Unnamed: 0,stop_time,driver_age,driver_gender,driver_race,violation,search_conducted,drugs_related_stop,is_arrested
0,1:55,20.0,M,White,Speeding,False,False,False
1,8:15,40.0,M,White,Speeding,False,False,False
2,23:15,33.0,M,White,Speeding,False,False,False
3,17:15,19.0,M,White,Other,False,False,True
4,10:00,21.0,F,White,Speeding,False,False,False
...,...,...,...,...,...,...,...,...
52961,1:55,,,,,False,False,
52962,6:43,25.0,M,White,Speeding,False,False,False
52963,6:49,,,,,False,False,
52964,7:19,25.0,F,White,Speeding,False,False,False


In [5]:
data2.isna().any()

stop_time             True
driver_age            True
driver_gender         True
driver_race           True
violation             True
search_conducted      True
drugs_related_stop    True
is_arrested           True
dtype: bool

In [6]:
data2 = data2.dropna()

In [7]:
driver_races = data2['driver_race'].unique()
driver_race_dict = {
    x: i
    for i, x in enumerate(driver_races)
}
driver_race_dict

{'White': 0, 'Black': 1, 'Asian': 2, 'Hispanic': 3, 'Other': 4}

In [8]:
driver_genders = data2['driver_gender'].unique()
driver_gender_dict = {
    x: i
    for i, x in enumerate(driver_genders)
}
driver_gender_dict

{'M': 0, 'F': 1}

In [9]:
violations = data2['violation'].unique()
violation_dict = {
    x: i
    for i, x in enumerate(violations) 
}
violation_dict

{'Speeding': 0,
 'Other': 1,
 'Equipment': 2,
 'Moving violation': 3,
 'Registration/plates': 4}

In [10]:
drugs_related_stops = data2['drugs_related_stop'].unique()
drugs_related_stop_dict = {
    False: 0,
    True: 1 
}
drugs_related_stop_dict

{False: 0, True: 1}

In [11]:
is_arresteds = data2['is_arrested'].unique()
is_arrested_dict = {
    False: 0,
    True: 1
}
is_arrested_dict

{False: 0, True: 1}

In [12]:
search_conducteds = data2['search_conducted'].unique()
search_conducted_dict = {
    False: 0,
    True: 1
}
search_conducted_dict

{False: 0, True: 1}

In [13]:
def time_to_int(x):
    tmp = str(x).split(':')
    res = (int(tmp[0]) if tmp[0] != '00' else 0) * 60 + (int(tmp[1]) if tmp[1] != '00' else 0)
    return res

def float_to_int(x):
    return int(x)

data2['stop_time'] = data2['stop_time'].apply(time_to_int)
data2['driver_age'] = data2['driver_age'].apply(float_to_int)
data2['driver_race'] = data2['driver_race'].map(driver_race_dict)
data2['driver_gender'] = data2['driver_gender'].map(driver_gender_dict)
data2['violation'] = data2['violation'].map(violation_dict)
data2['drugs_related_stop'] = data2['drugs_related_stop'].map(drugs_related_stop_dict)
data2['is_arrested'] = data2['is_arrested'].map(is_arrested_dict)
data2['search_conducted'] = data2['search_conducted'].map(search_conducted_dict)

In [14]:
data2

Unnamed: 0,stop_time,driver_age,driver_gender,driver_race,violation,search_conducted,drugs_related_stop,is_arrested
0,115,20,0,0,0,0,0,0
1,495,40,0,0,0,0,0,0
2,1395,33,0,0,0,0,0,0
3,1035,19,0,0,1,0,0,1
4,600,21,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...
52956,1342,32,0,0,0,0,0,1
52958,1356,44,1,3,0,0,0,0
52959,1405,20,1,1,0,0,0,0
52962,403,25,0,0,0,0,0,0


In [17]:
X = data2.drop('is_arrested',axis=1)
y = data2['is_arrested']
X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.8,random_state=100)
X_train.head()

Unnamed: 0,stop_time,driver_age,driver_gender,driver_race,violation,search_conducted,drugs_related_stop
7327,152,23,0,1,0,0,0
26440,716,52,0,0,3,0,0
48145,485,25,1,0,0,0,0
32915,948,30,0,0,0,0,0
40592,774,30,0,0,3,0,0


In [21]:
Model = RandomForestClassifier(random_state=100,n_jobs=-1,class_weight='balanced')

params = {'n_estimators':[200],
          'min_samples_leaf':[40,60,100,150,200],
          'max_depth':[3,5,10,15,20],
          'max_features':[0.05,0.1,0.15,0.2,0.25]}

grid_search = GridSearchCV(estimator=Model,param_grid=params,verbose=1,n_jobs=-1,scoring='accuracy')
grid_search.fit(X_train,y_train)

Fitting 5 folds for each of 125 candidates, totalling 625 fits


In [22]:
Model_best = grid_search.best_estimator_
y_train_pred = Model_best.predict(X_train)
y_test_pred = Model_best.predict(X_test)
print('Train Accuracy :',accuracy_score(y_train,y_train_pred))
print('Train Recall :',recall_score(y_train,y_train_pred))

Train Accuracy : 0.8428506282934739
Train Recall : 0.8260577568838147


In [26]:
filename = 'finalized_model.sav'
pickle.dump(Model_best, open(filename, 'wb'))

In [None]:
# loaded_model = pickle.load(open(filename, 'rb'))
# result = loaded_model.score(X_test, Y_test)