In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
import os
import pickle
import json
import joblib

In [2]:
# Values of the 'Outcome' column that count as a positive outcome
# (used to derive the boolean `positive_outcome` column).
positive_outcome_labels = [
    "Community resolution",
    "Khat or Cannabis warning",
    "Caution (simple or conditional)",
    "Arrest",
    "Penalty Notice for Disorder",
    "Summons / charged by post",
    "Suspect arrested",
    "Suspect summoned to court",
]


In [3]:
# Read data/train.csv (path relative to the working directory) into `df`.
df = pd.read_csv('data/train.csv')

In [4]:
# Replace missing values in this column with False so it holds no NaN afterwards.
df['Outcome linked to object of search'] = df['Outcome linked to object of search'].fillna(False)

In [5]:
# Flag rows whose 'Outcome' is one of `positive_outcome_labels`.
# `isin` is the vectorised membership test: it yields the same True/False per row
# as the per-element lambda (a missing outcome is False) without a Python-level loop.
df['positive_outcome'] = df['Outcome'].isin(positive_outcome_labels)

In [6]:
# Print a summary of the frame: row count, and per column the non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 856610 entries, 0 to 856609
Data columns (total 17 columns):
 #   Column                                    Non-Null Count   Dtype  
---  ------                                    --------------   -----  
 0   observation_id                            856610 non-null  object 
 1   Type                                      856610 non-null  object 
 2   Date                                      856610 non-null  object 
 3   Part of a policing operation              646620 non-null  object 
 4   Latitude                                  700308 non-null  float64
 5   Longitude                                 700308 non-null  float64
 6   Gender                                    856610 non-null  object 
 7   Age range                                 856610 non-null  object 
 8   Self-defined ethnicity                    841427 non-null  object 
 9   Officer-defined ethnicity                 856610 non-null  object 
 10  Legislation         

In [7]:
# Row count for each combination of the two boolean columns, ordered by the combination.
df[['positive_outcome', 'Outcome linked to object of search', ]].value_counts().sort_index()

positive_outcome  Outcome linked to object of search
False             False                                 582321
                  True                                   45153
True              False                                 158729
                  True                                   70407
dtype: int64

In [11]:
# A row is labelled True only when the outcome was positive AND the outcome was
# linked to the object of the search.
# Element-wise `&` replaces the row-by-row `apply(axis=1)`, which runs a Python
# callback per row and is very slow on a frame of this size (~856k rows).
# `.astype(bool)` covers the linked column still being object dtype; it relies on
# the earlier `fillna(False)` having removed NaN from that column.
# NOTE(review): the later cells visible in this notebook train on
# 'Outcome linked to object of search' and never read `label` - confirm which
# target is intended.
df['label'] = df['positive_outcome'] & df['Outcome linked to object of search'].astype(bool)


In [12]:
# Display the frame, which now includes the derived `positive_outcome` and `label` columns.
df

Unnamed: 0,observation_id,Type,Date,Part of a policing operation,Latitude,Longitude,Gender,Age range,Self-defined ethnicity,Officer-defined ethnicity,Legislation,Object of search,Outcome,Outcome linked to object of search,Removal of more than just outer clothing,station,positive_outcome,label
0,2e4d0094-c30b-471b-a211-72a9790feca2,Person search,2020-12-01T01:10:00+00:00,,50.798824,-1.089471,Male,25-34,Other ethnic group - Not stated,White,Police and Criminal Evidence Act 1984 (section 1),Article for use in theft,Community resolution,False,False,hampshire,True,False
1,4779fbe8-6e05-4534-85fd-db32952ee309,Person search,2020-12-01T02:00:00+00:00,,50.785099,-1.091540,Male,over 34,White - Any other White background,Other,Misuse of Drugs Act 1971 (section 23),Controlled drugs,A no further action disposal,False,False,hampshire,False,False
2,cb5c685d-acac-42e2-914d-75e6ff73b0a8,Person search,2020-12-01T09:15:00+00:00,,50.952006,-1.403341,Male,over 34,White - English/Welsh/Scottish/Northern Irish/...,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,A no further action disposal,False,True,hampshire,False,False
3,f486e116-5b1e-45db-9931-a7f070c5c478,Person search,2020-12-01T10:20:00+00:00,,50.806383,-1.079844,Male,10-17,Other ethnic group - Not stated,White,Police and Criminal Evidence Act 1984 (section 1),Stolen goods,A no further action disposal,False,False,hampshire,False,False
4,78f4020e-12cc-4889-bf1a-2f2c29b2f662,Person search,2020-12-01T10:24:00+00:00,,50.806670,-1.081982,Male,10-17,Other ethnic group - Not stated,Asian,Police and Criminal Evidence Act 1984 (section 1),Offensive weapons,A no further action disposal,False,False,hampshire,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
856605,ee337b9a-12ad-45fd-8c60-49091a0f4ab8,Person and Vehicle search,2020-04-30T15:10:00+00:00,,54.965502,-1.604609,Male,18-24,White - Any other White background,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,A no further action disposal,True,False,northumbria,False,False
856606,5973a004-e579-4dd2-bc26-71ab5717f87a,Person and Vehicle search,2020-04-30T15:10:00+00:00,,54.965502,-1.604609,Male,25-34,White - Any other White background,Other,Misuse of Drugs Act 1971 (section 23),Controlled drugs,A no further action disposal,True,True,northumbria,False,False
856607,ad053a34-364e-4d24-8f5c-9734ab5fdbe0,Person and Vehicle search,2020-04-30T17:00:00+00:00,,54.966266,-1.453704,Male,18-24,White - English/Welsh/Scottish/Northern Irish/...,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,Khat or Cannabis warning,True,False,northumbria,True,True
856608,8736e5ec-7ca2-420b-ad56-4dd88d27fe6e,Person search,2020-04-30T17:35:00+00:00,,54.971596,-1.636589,Male,25-34,White - English/Welsh/Scottish/Northern Irish/...,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,Arrest,True,False,northumbria,True,True


In [None]:
# Row count per distinct 'Outcome' value.
df.Outcome.value_counts()

In [None]:
# Row count per 'Object of search' value, ordered by value rather than by frequency.
df[["Object of search"]].value_counts().sort_index()

In [None]:
# Row count for every observed combination of the three columns, ordered by the combination.
df[["Object of search","Outcome linked to object of search", "Outcome"]].value_counts().sort_index()

In [None]:
# Row count per value of 'Outcome linked to object of search' (the column later used as target).
df[["Outcome linked to object of search"]].value_counts().sort_index()

In [None]:
# Row count per value of 'Part of a policing operation'.
# NOTE(review): rows where this column is missing are presumably left out of the
# counts (pandas default) - verify against the installed pandas version.
df[["Part of a policing operation"]].value_counts()

In [None]:
# Columns carried forward: the candidate feature columns plus the target column.
kept_columns = [
    "observation_id",
    "Type",
    "Date",
    "Part of a policing operation",
    "Latitude",
    "Longitude",
    "Gender",
    "Age range",
    "Officer-defined ethnicity",
    "Legislation",
    "Object of search",
    "station",
    "Outcome linked to object of search",
]

# A row is discarded when any of these is missing; the remaining kept columns
# (e.g. Latitude / Longitude) are allowed to stay NaN.
required_columns = [
    'Type',
    'Gender',
    'Age range',
    'Officer-defined ethnicity',
    'Object of search',
    'station',
    'Outcome linked to object of search',
]

df_clean = df.loc[:, kept_columns].dropna(subset=required_columns)

In [None]:
# Feature frame: the columns of df_clean other than the target column.
feature_columns = [
    "observation_id",
    "Type",
    "Date",
    "Part of a policing operation",
    "Latitude",
    "Longitude",
    "Gender",
    "Age range",
    "Officer-defined ethnicity",
    "Legislation",
    "Object of search",
    "station",
]
X = df_clean.loc[:, feature_columns]

In [None]:
# Target. The list-style selection keeps `y` as a one-column DataFrame, not a Series.
target_column = "Outcome linked to object of search"
y = df_clean[[target_column]]

In [None]:
# Display the feature frame.
X

In [None]:
# Print per-column non-null counts and dtypes of the feature frame.
X.info()

In [None]:
# Column names of the feature frame as a plain Python list.
X.columns.tolist()

In [None]:
# Columns the pipeline's selector step passes on to the one-hot encoder;
# every other column of the input frame is dropped by that step.
selected_columns = [
    "Type",
    "Gender",
    "Age range",
    "Officer-defined ethnicity",
    "Object of search",
    "station",
]

In [None]:
from sklearn.base import TransformerMixin, BaseEstimator
import pandas as pd

# NOTE(review): the pipeline cell visible in this notebook selects columns with
# ColumnTransformer and does not reference this class - confirm it is still needed.
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Transformer that returns only a fixed selection of columns.

    Parameters
    ----------
    columns : label or list of labels
        Stored as-is and used to index the input in :meth:`transform`.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the transformer can be chained."""
        return self

    def transform(self, X, y=None):
        """Return ``X[self.columns]``; ``y`` is accepted but ignored."""
        selected = X[self.columns]
        return selected

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer



# Step 1: keep only `selected_columns` (passed through unchanged); all other
# input columns are dropped.
column_selector = ColumnTransformer(
    [("selector", "passthrough", selected_columns)],
    remainder="drop",
)

# Step 2: one-hot encode every selected column. Categories not seen during fit
# are ignored at transform time instead of raising an error.
onehot_encoder = OneHotEncoder(handle_unknown='ignore')

# Step 3: random forest classifier with 100 trees and a fixed seed.
forest_model = RandomForestClassifier(n_estimators=100, random_state=42)

pipeline = Pipeline(steps=[
    ('column_selector', column_selector),
    ('onehot', onehot_encoder),
    ('model', forest_model),
])

In [None]:
# Re-check the feature frame (non-null counts, dtypes) before fitting.
X.info()

In [None]:
# Same summary for the target; `y` is a one-column DataFrame.
y.info()

In [None]:
# Display the pipeline object to review its steps and their parameters.
pipeline

In [None]:
# Fit on the target cast to integers. `y` is a one-column DataFrame, so it is
# flattened to a 1-D array first: scikit-learn classifiers expect y of shape
# (n_samples,) and emit a DataConversionWarning when handed a column vector.
pipeline.fit(X, y.astype('int').to_numpy().ravel())

In [None]:
# Show the current working directory; the save paths further down are built from it.
os.getcwd()

In [None]:
# Name of the sub-directory (under the working directory) that receives the saved artefacts.
TMP_DIR = "saves"

In [None]:
# Resolve the output directory once and create it if it is missing, so the
# writes below do not fail with FileNotFoundError on a fresh checkout.
save_dir = os.path.join(os.getcwd(), TMP_DIR)
os.makedirs(save_dir, exist_ok=True)

# column names of X, in order

with open(os.path.join(save_dir, 'columns.json'), 'w') as file_save:
    json.dump(X.columns.tolist(), file_save)

# pipeline (serialised with joblib, despite the .pickle extension)

joblib.dump(pipeline, os.path.join(save_dir, 'pipeline.pickle'))

# dtypes of X's columns

with open(os.path.join(save_dir, 'dtypes.pickle'), 'wb') as file_save:
    pickle.dump(X.dtypes, file_save)