In [102]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
import os
import pickle
import json
import joblib

In [3]:
# Load the training CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('data/train.csv')

In [5]:
# Per-column non-null counts and dtypes of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 856610 entries, 0 to 856609
Data columns (total 16 columns):
 #   Column                                    Non-Null Count   Dtype  
---  ------                                    --------------   -----  
 0   observation_id                            856610 non-null  object 
 1   Type                                      856610 non-null  object 
 2   Date                                      856610 non-null  object 
 3   Part of a policing operation              646620 non-null  object 
 4   Latitude                                  700308 non-null  float64
 5   Longitude                                 700308 non-null  float64
 6   Gender                                    856610 non-null  object 
 7   Age range                                 856610 non-null  object 
 8   Self-defined ethnicity                    841427 non-null  object 
 9   Officer-defined ethnicity                 856610 non-null  object 
 10  Legislation         

In [6]:
# Frequency of each value in the "Outcome" column.
df["Outcome"].value_counts()

A no further action disposal       627474
Arrest                             111748
Community resolution                68280
Penalty Notice for Disorder         17319
Summons / charged by post           16834
Caution (simple or conditional)      3259
Name: Outcome, dtype: int64

In [14]:
# Row count per "Object of search" value, ordered by label rather than by frequency.
df.value_counts(subset=["Object of search"]).sort_index()

Object of search                              
Anything to threaten or harm anyone                15535
Article for use in theft                           38706
Articles for use in criminal damage                16696
Controlled drugs                                  558100
Crossbows                                             54
Detailed object of search unavailable                309
Evidence of hunting any wild mammal with a dog        12
Evidence of offences under the Act                 23080
Evidence of wildlife offences                         46
Firearms                                            5420
Fireworks                                           5072
Game or poaching equipment                           321
Goods on which duty has not been paid etc.            97
Offensive weapons                                 112652
Psychoactive substances                             4805
Seals or hunting equipment                             7
Stolen goods                             

In [50]:
# Row count for every observed (object, linked-flag, outcome) combination, ordered by label.
df.value_counts(
    subset=["Object of search", "Outcome linked to object of search", "Outcome"]
).sort_index()

Object of search                     Outcome linked to object of search  Outcome                        
Anything to threaten or harm anyone  False                               A no further action disposal       4851
                                                                         Arrest                              446
                                                                         Caution (simple or conditional)       4
                                                                         Community resolution                102
                                                                                                            ... 
Stolen goods                         True                                Caution (simple or conditional)      53
                                                                         Community resolution                845
                                                                         Penalty Notice for Disorder    

In [51]:
# Class balance of the column later used as the prediction target (missing values are not counted).
df.value_counts(subset=["Outcome linked to object of search"]).sort_index()

Outcome linked to object of search
False                                 139123
True                                  115560
dtype: int64

In [19]:
# Row count per "Part of a policing operation" value, most frequent first.
df.value_counts(subset=["Part of a policing operation"])

Part of a policing operation
False                           628148
True                             18472
dtype: int64

In [81]:
# Keep the candidate feature columns plus the target, then discard every row
# that has a missing value in any of the columns listed under `subset`.
df_clean = (
    df.loc[:, [
        "observation_id", "Type", "Date", "Part of a policing operation",
        "Latitude", "Longitude", "Gender", "Age range",
        "Officer-defined ethnicity", "Legislation", "Object of search",
        "station", "Outcome linked to object of search",
    ]]
    .dropna(subset=[
        'Type', 'Gender', 'Age range', 'Officer-defined ethnicity',
        'Object of search', 'station', 'Outcome linked to object of search',
    ])
)

In [82]:
# Feature frame: every column of df_clean except the target, in the same order.
X = df_clean.drop(columns=["Outcome linked to object of search"])

In [83]:
# Target. The double brackets make this a one-column DataFrame (2-D), not a Series.
y = df_clean[["Outcome linked to object of search"]]

In [84]:
# Display the feature frame (pandas truncates the middle rows).
X

Unnamed: 0,observation_id,Type,Date,Part of a policing operation,Latitude,Longitude,Gender,Age range,Officer-defined ethnicity,Legislation,Object of search,station
0,2e4d0094-c30b-471b-a211-72a9790feca2,Person search,2020-12-01T01:10:00+00:00,,50.798824,-1.089471,Male,25-34,White,Police and Criminal Evidence Act 1984 (section 1),Article for use in theft,hampshire
1,4779fbe8-6e05-4534-85fd-db32952ee309,Person search,2020-12-01T02:00:00+00:00,,50.785099,-1.091540,Male,over 34,Other,Misuse of Drugs Act 1971 (section 23),Controlled drugs,hampshire
2,cb5c685d-acac-42e2-914d-75e6ff73b0a8,Person search,2020-12-01T09:15:00+00:00,,50.952006,-1.403341,Male,over 34,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,hampshire
3,f486e116-5b1e-45db-9931-a7f070c5c478,Person search,2020-12-01T10:20:00+00:00,,50.806383,-1.079844,Male,10-17,White,Police and Criminal Evidence Act 1984 (section 1),Stolen goods,hampshire
4,78f4020e-12cc-4889-bf1a-2f2c29b2f662,Person search,2020-12-01T10:24:00+00:00,,50.806670,-1.081982,Male,10-17,Asian,Police and Criminal Evidence Act 1984 (section 1),Offensive weapons,hampshire
...,...,...,...,...,...,...,...,...,...,...,...,...
856605,ee337b9a-12ad-45fd-8c60-49091a0f4ab8,Person and Vehicle search,2020-04-30T15:10:00+00:00,,54.965502,-1.604609,Male,18-24,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,northumbria
856606,5973a004-e579-4dd2-bc26-71ab5717f87a,Person and Vehicle search,2020-04-30T15:10:00+00:00,,54.965502,-1.604609,Male,25-34,Other,Misuse of Drugs Act 1971 (section 23),Controlled drugs,northumbria
856607,ad053a34-364e-4d24-8f5c-9734ab5fdbe0,Person and Vehicle search,2020-04-30T17:00:00+00:00,,54.966266,-1.453704,Male,18-24,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,northumbria
856608,8736e5ec-7ca2-420b-ad56-4dd88d27fe6e,Person search,2020-04-30T17:35:00+00:00,,54.971596,-1.636589,Male,25-34,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,northumbria


In [85]:
# Non-null counts and dtypes of the feature frame after the row filtering.
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 254683 entries, 0 to 856609
Data columns (total 12 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   observation_id                254683 non-null  object 
 1   Type                          254683 non-null  object 
 2   Date                          254683 non-null  object 
 3   Part of a policing operation  157982 non-null  object 
 4   Latitude                      203942 non-null  float64
 5   Longitude                     203942 non-null  float64
 6   Gender                        254683 non-null  object 
 7   Age range                     254683 non-null  object 
 8   Officer-defined ethnicity     254683 non-null  object 
 9   Legislation                   236164 non-null  object 
 10  Object of search              254683 non-null  object 
 11  station                       254683 non-null  object 
dtypes: float64(2), object(10)
memory usage: 25.3

In [86]:
# Feature column names as a plain Python list.
list(X.columns)

['observation_id',
 'Type',
 'Date',
 'Part of a policing operation',
 'Latitude',
 'Longitude',
 'Gender',
 'Age range',
 'Officer-defined ethnicity',
 'Legislation',
 'Object of search',
 'station']

In [87]:
# Columns that the modelling pipeline actually consumes.
selected_columns = [
    'Type', 'Gender', 'Age range',
    'Officer-defined ethnicity', 'Object of search', 'station',
]

In [88]:
from sklearn.base import TransformerMixin, BaseEstimator
import pandas as pd

class ColumnSelector(BaseEstimator, TransformerMixin):
    """Transformer that narrows its input down to a fixed set of columns.

    Parameters
    ----------
    columns
        Column label(s) used to index ``X`` in :meth:`transform`.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the step can be chained."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` indexed by the configured columns; ``y`` is ignored."""
        wanted = self.columns
        return X[wanted]

In [94]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer



# Three stages: keep only `selected_columns`, one-hot encode them
# (categories unseen at fit time are ignored), then fit a random forest.
pipeline = Pipeline(
    steps=[
        (
            'column_selector',
            ColumnTransformer(
                transformers=[("selector", "passthrough", selected_columns)],
                remainder="drop",
            ),
        ),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
        ('model', RandomForestClassifier(n_estimators=100, random_state=42)),
    ]
)

In [95]:
# Check the feature frame's columns, dtypes and non-null counts before fitting.
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 254683 entries, 0 to 856609
Data columns (total 12 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   observation_id                254683 non-null  object 
 1   Type                          254683 non-null  object 
 2   Date                          254683 non-null  object 
 3   Part of a policing operation  157982 non-null  object 
 4   Latitude                      203942 non-null  float64
 5   Longitude                     203942 non-null  float64
 6   Gender                        254683 non-null  object 
 7   Age range                     254683 non-null  object 
 8   Officer-defined ethnicity     254683 non-null  object 
 9   Legislation                   236164 non-null  object 
 10  Object of search              254683 non-null  object 
 11  station                       254683 non-null  object 
dtypes: float64(2), object(10)
memory usage: 25.3

In [96]:
# Check the target frame: one object-dtype column with no missing values.
y.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 254683 entries, 0 to 856609
Data columns (total 1 columns):
 #   Column                              Non-Null Count   Dtype 
---  ------                              --------------   ----- 
 0   Outcome linked to object of search  254683 non-null  object
dtypes: object(1)
memory usage: 3.9+ MB


In [97]:
# Show the configured (not yet fitted) pipeline.
pipeline

Pipeline(steps=[('column_selector',
                 ColumnTransformer(transformers=[('selector', 'passthrough',
                                                  ['Type', 'Gender',
                                                   'Age range',
                                                   'Officer-defined ethnicity',
                                                   'Object of search',
                                                   'station'])])),
                ('onehot', OneHotEncoder(handle_unknown='ignore')),
                ('model', RandomForestClassifier(random_state=42))])

In [98]:
# Fit on the full cleaned data set. `y` is a one-column DataFrame; passing it
# as-is makes scikit-learn emit a DataConversionWarning ("A column-vector y was
# passed when a 1d array was expected"), so hand over the 1-D column instead.
# The object-dtype True/False values are cast to 1/0 as before.
pipeline.fit(X, y["Outcome linked to object of search"].astype('int'))

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


Pipeline(steps=[('column_selector',
                 ColumnTransformer(transformers=[('selector', 'passthrough',
                                                  ['Type', 'Gender',
                                                   'Age range',
                                                   'Officer-defined ethnicity',
                                                   'Object of search',
                                                   'station'])])),
                ('onehot', OneHotEncoder(handle_unknown='ignore')),
                ('model', RandomForestClassifier(random_state=42))])

In [107]:
# Show the working directory; the save paths further down are built from it.
os.getcwd()

'/home/xjrr/ldsa-capstone-project'

In [104]:
# Sub-directory (relative to the working directory) that receives the saved artefacts.
TMP_DIR = 'saves'

In [109]:
# Persist the feature column order, the fitted pipeline and the feature dtypes.
save_dir = os.path.join(os.getcwd(), TMP_DIR)

# Create the target directory up front: open() and joblib.dump() raise
# FileNotFoundError when it does not exist (e.g. on a fresh checkout).
os.makedirs(save_dir, exist_ok=True)

# columns

with open(os.path.join(save_dir, 'columns.json'), 'w') as file_save:
    json.dump(X.columns.tolist(), file_save)

# pipeline

joblib.dump(pipeline, os.path.join(save_dir, 'pipeline.pickle'))

# dtypes

with open(os.path.join(save_dir, 'dtypes.pickle'), 'wb') as file_save:
    pickle.dump(X.dtypes, file_save)