In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
import os
import pickle
import json
import joblib
import numpy as np

In [2]:
positive_outcome_labels = [
    'Community resolution',
    'Khat or Cannabis warning',
    'Caution (simple or conditional)',
    'Arrest',
    'Penalty Notice for Disorder',
    'Summons / charged by post',
    'Suspect arrested',
    'Suspect summoned to court',
]


In [3]:
df = pd.read_csv('data/train.csv')

In [4]:
# df['Outcome linked to object of search'] = df['Outcome linked to object of search'].fillna(False)

In [5]:
df['positive_outcome'] = df['Outcome'].apply(lambda x: True if x in positive_outcome_labels else False)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 856610 entries, 0 to 856609
Data columns (total 17 columns):
 #   Column                                    Non-Null Count   Dtype  
---  ------                                    --------------   -----  
 0   observation_id                            856610 non-null  object 
 1   Type                                      856610 non-null  object 
 2   Date                                      856610 non-null  object 
 3   Part of a policing operation              646620 non-null  object 
 4   Latitude                                  700308 non-null  float64
 5   Longitude                                 700308 non-null  float64
 6   Gender                                    856610 non-null  object 
 7   Age range                                 856610 non-null  object 
 8   Self-defined ethnicity                    841427 non-null  object 
 9   Officer-defined ethnicity                 856610 non-null  object 
 10  Legislation         

In [7]:
df[['positive_outcome', 'Outcome linked to object of search', ]].value_counts().sort_index()

positive_outcome  Outcome linked to object of search
False             False                                 110662
                  True                                   45153
True              False                                  28461
                  True                                   70407
dtype: int64

In [8]:
df['label'] = df[['positive_outcome', 'Outcome linked to object of search', ]].apply(lambda x: x['positive_outcome'] and x['Outcome linked to object of search'], axis=1)


In [9]:
df

Unnamed: 0,observation_id,Type,Date,Part of a policing operation,Latitude,Longitude,Gender,Age range,Self-defined ethnicity,Officer-defined ethnicity,Legislation,Object of search,Outcome,Outcome linked to object of search,Removal of more than just outer clothing,station,positive_outcome,label
0,2e4d0094-c30b-471b-a211-72a9790feca2,Person search,2020-12-01T01:10:00+00:00,,50.798824,-1.089471,Male,25-34,Other ethnic group - Not stated,White,Police and Criminal Evidence Act 1984 (section 1),Article for use in theft,Community resolution,False,False,hampshire,True,False
1,4779fbe8-6e05-4534-85fd-db32952ee309,Person search,2020-12-01T02:00:00+00:00,,50.785099,-1.091540,Male,over 34,White - Any other White background,Other,Misuse of Drugs Act 1971 (section 23),Controlled drugs,A no further action disposal,False,False,hampshire,False,False
2,cb5c685d-acac-42e2-914d-75e6ff73b0a8,Person search,2020-12-01T09:15:00+00:00,,50.952006,-1.403341,Male,over 34,White - English/Welsh/Scottish/Northern Irish/...,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,A no further action disposal,False,True,hampshire,False,False
3,f486e116-5b1e-45db-9931-a7f070c5c478,Person search,2020-12-01T10:20:00+00:00,,50.806383,-1.079844,Male,10-17,Other ethnic group - Not stated,White,Police and Criminal Evidence Act 1984 (section 1),Stolen goods,A no further action disposal,False,False,hampshire,False,False
4,78f4020e-12cc-4889-bf1a-2f2c29b2f662,Person search,2020-12-01T10:24:00+00:00,,50.806670,-1.081982,Male,10-17,Other ethnic group - Not stated,Asian,Police and Criminal Evidence Act 1984 (section 1),Offensive weapons,A no further action disposal,False,False,hampshire,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
856605,ee337b9a-12ad-45fd-8c60-49091a0f4ab8,Person and Vehicle search,2020-04-30T15:10:00+00:00,,54.965502,-1.604609,Male,18-24,White - Any other White background,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,A no further action disposal,True,False,northumbria,False,False
856606,5973a004-e579-4dd2-bc26-71ab5717f87a,Person and Vehicle search,2020-04-30T15:10:00+00:00,,54.965502,-1.604609,Male,25-34,White - Any other White background,Other,Misuse of Drugs Act 1971 (section 23),Controlled drugs,A no further action disposal,True,True,northumbria,False,False
856607,ad053a34-364e-4d24-8f5c-9734ab5fdbe0,Person and Vehicle search,2020-04-30T17:00:00+00:00,,54.966266,-1.453704,Male,18-24,White - English/Welsh/Scottish/Northern Irish/...,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,Khat or Cannabis warning,True,False,northumbria,True,True
856608,8736e5ec-7ca2-420b-ad56-4dd88d27fe6e,Person search,2020-04-30T17:35:00+00:00,,54.971596,-1.636589,Male,25-34,White - English/Welsh/Scottish/Northern Irish/...,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,Arrest,True,False,northumbria,True,True


In [10]:
df.Outcome.value_counts()

A no further action disposal       627474
Arrest                             111748
Community resolution                68280
Penalty Notice for Disorder         17319
Summons / charged by post           16834
Caution (simple or conditional)      3259
Name: Outcome, dtype: int64

In [11]:
df[["Object of search"]].value_counts().sort_index()

Object of search                              
Anything to threaten or harm anyone                15535
Article for use in theft                           38706
Articles for use in criminal damage                16696
Controlled drugs                                  558100
Crossbows                                             54
Detailed object of search unavailable                309
Evidence of hunting any wild mammal with a dog        12
Evidence of offences under the Act                 23080
Evidence of wildlife offences                         46
Firearms                                            5420
Fireworks                                           5072
Game or poaching equipment                           321
Goods on which duty has not been paid etc.            97
Offensive weapons                                 112652
Psychoactive substances                             4805
Seals or hunting equipment                             7
Stolen goods                             

In [12]:
df[["Object of search","Outcome linked to object of search", "Outcome"]].value_counts().sort_index()

Object of search                     Outcome linked to object of search  Outcome                        
Anything to threaten or harm anyone  False                               A no further action disposal       4851
                                                                         Arrest                              446
                                                                         Caution (simple or conditional)       4
                                                                         Community resolution                102
                                                                                                            ... 
Stolen goods                         True                                Caution (simple or conditional)      53
                                                                         Community resolution                845
                                                                         Penalty Notice for Disorder    

In [13]:
df[["Outcome linked to object of search"]].value_counts().sort_index()

Outcome linked to object of search
False                                 139123
True                                  115560
dtype: int64

In [14]:
df[["Part of a policing operation"]].value_counts()

Part of a policing operation
False                           628148
True                             18472
dtype: int64

In [16]:
df_clean  = df[[
    'observation_id',
    'Type',
    'Date',
    'Part of a policing operation',
    'Latitude',
    'Longitude',
    'Gender',
    'Age range',
    'Officer-defined ethnicity',
    'Legislation',
    'Object of search',
    'station',
    'label',
    'Outcome linked to object of search',
]].dropna(subset=[
    'Type',
    'Gender',
    'Age range',
    'Officer-defined ethnicity',
    'Object of search',
    'station',
    'label',
    'Part of a policing operation',
    'Outcome linked to object of search',
 ])

In [17]:
X = df_clean[[
    'observation_id',
    'Type',
    'Date',
    'Part of a policing operation',
    'Latitude',
    'Longitude',
    'Gender',
    'Age range',
    'Officer-defined ethnicity',
    'Legislation',
    'Object of search',
    'station',
]]

In [18]:
y = df_clean[["label"]]

In [19]:
X

Unnamed: 0,observation_id,Type,Date,Part of a policing operation,Latitude,Longitude,Gender,Age range,Officer-defined ethnicity,Legislation,Object of search,station
940,282dc259-9d99-432a-a733-fe0c4850bf49,Person and Vehicle search,2020-12-01T00:00:00+00:00,False,50.854422,0.578369,Male,18-24,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,sussex
941,e30f5a54-2ca5-481f-aa9b-053ee132d432,Person search,2020-12-01T00:00:00+00:00,False,51.116846,-0.159758,Male,over 34,White,Police and Criminal Evidence Act 1984 (section 1),Article for use in theft,sussex
942,85a73606-583a-4eed-8915-96060c9a7da7,Person search,2020-12-01T00:00:00+00:00,False,51.013698,-0.452762,Male,10-17,White,Police and Criminal Evidence Act 1984 (section 1),Article for use in theft,sussex
943,79673ff4-10bc-4ee3-bb41-b4806fa5f2fb,Person search,2020-12-01T00:00:00+00:00,False,51.013698,-0.452762,Male,10-17,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,sussex
944,879982f7-6ee8-44b8-a7c6-d74367a1520e,Person and Vehicle search,2020-12-01T00:00:00+00:00,False,51.066980,-0.328613,Male,25-34,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,sussex
...,...,...,...,...,...,...,...,...,...,...,...,...
854513,ae55d5fa-abc0-4d59-81bf-81583932ed37,Person search,2020-04-30T14:42:00+00:00,False,51.370400,-0.361494,Male,10-17,White,,Controlled drugs,surrey
854514,34257d94-a2a1-4032-bd49-69b34a49af91,Person search,2020-04-30T16:38:00+00:00,False,51.239837,-0.570065,Male,18-24,White,,Anything to threaten or harm anyone,surrey
854515,d5d34ede-a7e8-4fa9-a5e8-54d12401c51a,Person search,2020-04-30T18:20:00+00:00,False,51.419366,-0.498645,Male,25-34,Black,,Article for use in theft,surrey
854516,adeefd40-4732-4088-a839-9b382777aa78,Person search,2020-04-30T22:30:00+00:00,False,51.187296,-0.162019,Male,25-34,White,,Controlled drugs,surrey


In [20]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 157982 entries, 940 to 854517
Data columns (total 12 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   observation_id                157982 non-null  object 
 1   Type                          157982 non-null  object 
 2   Date                          157982 non-null  object 
 3   Part of a policing operation  157982 non-null  object 
 4   Latitude                      116607 non-null  float64
 5   Longitude                     116607 non-null  float64
 6   Gender                        157982 non-null  object 
 7   Age range                     157982 non-null  object 
 8   Officer-defined ethnicity     157982 non-null  object 
 9   Legislation                   140058 non-null  object 
 10  Object of search              157982 non-null  object 
 11  station                       157982 non-null  object 
dtypes: float64(2), object(10)
memory usage: 15

In [21]:
X.columns.tolist()

['observation_id',
 'Type',
 'Date',
 'Part of a policing operation',
 'Latitude',
 'Longitude',
 'Gender',
 'Age range',
 'Officer-defined ethnicity',
 'Legislation',
 'Object of search',
 'station']

In [31]:
y.value_counts()/len(y)

label
False    0.757675
True     0.242325
dtype: float64

In [22]:
selected_columns = [
 'Type',
 'Gender',
 'Age range',
 'Officer-defined ethnicity',
 'Object of search',
 'station',
]

In [23]:
from sklearn.base import TransformerMixin, BaseEstimator
import pandas as pd

class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X[self.columns]

In [24]:
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate

In [25]:
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


In [26]:
models = [
    # ('linear_Regression', LinearRegression()),
    # ('Logistic Regression', LogisticRegression()),
    # ('SVM', SVC()),
    ('Decision Tree', DecisionTreeClassifier()),
    ('Random Forest', RandomForestClassifier()),
]

In [27]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [28]:
pipe = Pipeline([
    ('column_selector', ColumnTransformer([("selector", "passthrough", selected_columns)], remainder="drop")),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ('classifier', None),
])


In [29]:
model_scores = []
for model_name, model in models:
    pipe.set_params(classifier=model)
    scores = cross_validate(pipe, X, y.astype('int').values.reshape(-1), scoring='accuracy', cv=5, verbose=1, n_jobs=4)
    
    model_scores.append({model_name: scores})
    
    print(f'{model_name}:')
    print(scores)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    4.4s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Decision Tree:
{'fit_time': array([1.30948067, 1.1498673 , 1.13290334, 1.10577297, 0.98703551]), 'score_time': array([0.03498125, 0.03510857, 0.03441024, 0.03640056, 0.03136325]), 'test_score': array([0.81169098, 0.81314682, 0.80734903, 0.80032283, 0.80212685])}
Random Forest:
{'fit_time': array([56.02470469, 55.63275599, 56.86808348, 55.68814826, 42.66782117]), 'score_time': array([0.42067838, 0.40988755, 0.38083482, 0.40910554, 0.3684392 ]), 'test_score': array([0.81191252, 0.81264044, 0.80690594, 0.79991138, 0.8020952 ])}


[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:  1.7min finished


In [23]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer



pipeline = Pipeline(steps=[
    ('column_selector', ColumnTransformer([("selector", "passthrough", selected_columns)], remainder="drop")),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ('model', RandomForestClassifier(n_estimators=100, random_state=42))
    ])          

In [24]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 646620 entries, 940 to 854517
Data columns (total 12 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   observation_id                646620 non-null  object 
 1   Type                          646620 non-null  object 
 2   Date                          646620 non-null  object 
 3   Part of a policing operation  646620 non-null  object 
 4   Latitude                      527277 non-null  float64
 5   Longitude                     527277 non-null  float64
 6   Gender                        646620 non-null  object 
 7   Age range                     646620 non-null  object 
 8   Officer-defined ethnicity     646620 non-null  object 
 9   Legislation                   608719 non-null  object 
 10  Object of search              646620 non-null  object 
 11  station                       646620 non-null  object 
dtypes: float64(2), object(10)
memory usage: 64

In [25]:
y.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 646620 entries, 940 to 854517
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   label   646620 non-null  bool 
dtypes: bool(1)
memory usage: 5.5 MB


In [26]:
pipeline

Pipeline(steps=[('column_selector',
                 ColumnTransformer(transformers=[('selector', 'passthrough',
                                                  ['Type', 'Gender',
                                                   'Age range',
                                                   'Officer-defined ethnicity',
                                                   'Object of search',
                                                   'station'])])),
                ('onehot', OneHotEncoder(handle_unknown='ignore')),
                ('model', RandomForestClassifier(random_state=42))])

In [27]:
pipeline.fit(X,y.astype('int'))

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


Pipeline(steps=[('column_selector',
                 ColumnTransformer(transformers=[('selector', 'passthrough',
                                                  ['Type', 'Gender',
                                                   'Age range',
                                                   'Officer-defined ethnicity',
                                                   'Object of search',
                                                   'station'])])),
                ('onehot', OneHotEncoder(handle_unknown='ignore')),
                ('model', RandomForestClassifier(random_state=42))])

In [28]:
os.getcwd()

'/home/xjrr/ldsa-capstone-project'

In [29]:
TMP_DIR = 'save-model-forest-2'

In [30]:
if not os.path.exists(TMP_DIR):
   # Create a new directory because it does not exist
   os.makedirs(TMP_DIR)

In [31]:
with open(os.path.join(os.getcwd(), TMP_DIR, 'columns.json'), 'w') as file_save:
    json.dump(X.columns.tolist(), file_save)
    
# pipeline

joblib.dump(pipeline, os.path.join(os.getcwd(), TMP_DIR, 'pipeline.pickle'))

# dtypes

with open(os.path.join(os.getcwd(), TMP_DIR, 'dtypes.pickle'), 'wb') as file_save:
    pickle.dump(X.dtypes, file_save)