In [3]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
import os
import pickle
import json
import joblib
import numpy as np

In [4]:
positive_outcome_labels = [
    'Community resolution',
    'Khat or Cannabis warning',
    'Caution (simple or conditional)',
    'Arrest',
    'Penalty Notice for Disorder',
    'Summons / charged by post',
    'Suspect arrested',
    'Suspect summoned to court',
]


In [5]:
df = pd.read_csv('data/train.csv')

In [6]:
df = df.loc[df['station']!='metropolitan']

In [7]:
# df['Outcome linked to object of search'] = df['Outcome linked to object of search'].fillna(False)

In [8]:
df['positive_outcome'] = df['Outcome'].apply(lambda x: True if x in positive_outcome_labels else False)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 419743 entries, 0 to 856609
Data columns (total 17 columns):
 #   Column                                    Non-Null Count   Dtype  
---  ------                                    --------------   -----  
 0   observation_id                            419743 non-null  object 
 1   Type                                      419743 non-null  object 
 2   Date                                      419743 non-null  object 
 3   Part of a policing operation              209753 non-null  object 
 4   Latitude                                  309527 non-null  float64
 5   Longitude                                 309527 non-null  float64
 6   Gender                                    419743 non-null  object 
 7   Age range                                 419743 non-null  object 
 8   Self-defined ethnicity                    404608 non-null  object 
 9   Officer-defined ethnicity                 419743 non-null  object 
 10  Legislation         

In [10]:
df[['positive_outcome', 'Outcome linked to object of search', ]].value_counts().sort_index()

positive_outcome  Outcome linked to object of search
False             False                                 110662
                  True                                   45153
True              False                                  28461
                  True                                   70407
dtype: int64

In [11]:
df['label'] = df[['positive_outcome', 'Outcome linked to object of search', ]].apply(lambda x: x['positive_outcome'] and x['Outcome linked to object of search'], axis=1)


In [12]:
df

Unnamed: 0,observation_id,Type,Date,Part of a policing operation,Latitude,Longitude,Gender,Age range,Self-defined ethnicity,Officer-defined ethnicity,Legislation,Object of search,Outcome,Outcome linked to object of search,Removal of more than just outer clothing,station,positive_outcome,label
0,2e4d0094-c30b-471b-a211-72a9790feca2,Person search,2020-12-01T01:10:00+00:00,,50.798824,-1.089471,Male,25-34,Other ethnic group - Not stated,White,Police and Criminal Evidence Act 1984 (section 1),Article for use in theft,Community resolution,False,False,hampshire,True,False
1,4779fbe8-6e05-4534-85fd-db32952ee309,Person search,2020-12-01T02:00:00+00:00,,50.785099,-1.091540,Male,over 34,White - Any other White background,Other,Misuse of Drugs Act 1971 (section 23),Controlled drugs,A no further action disposal,False,False,hampshire,False,False
2,cb5c685d-acac-42e2-914d-75e6ff73b0a8,Person search,2020-12-01T09:15:00+00:00,,50.952006,-1.403341,Male,over 34,White - English/Welsh/Scottish/Northern Irish/...,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,A no further action disposal,False,True,hampshire,False,False
3,f486e116-5b1e-45db-9931-a7f070c5c478,Person search,2020-12-01T10:20:00+00:00,,50.806383,-1.079844,Male,10-17,Other ethnic group - Not stated,White,Police and Criminal Evidence Act 1984 (section 1),Stolen goods,A no further action disposal,False,False,hampshire,False,False
4,78f4020e-12cc-4889-bf1a-2f2c29b2f662,Person search,2020-12-01T10:24:00+00:00,,50.806670,-1.081982,Male,10-17,Other ethnic group - Not stated,Asian,Police and Criminal Evidence Act 1984 (section 1),Offensive weapons,A no further action disposal,False,False,hampshire,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
856605,ee337b9a-12ad-45fd-8c60-49091a0f4ab8,Person and Vehicle search,2020-04-30T15:10:00+00:00,,54.965502,-1.604609,Male,18-24,White - Any other White background,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,A no further action disposal,True,False,northumbria,False,False
856606,5973a004-e579-4dd2-bc26-71ab5717f87a,Person and Vehicle search,2020-04-30T15:10:00+00:00,,54.965502,-1.604609,Male,25-34,White - Any other White background,Other,Misuse of Drugs Act 1971 (section 23),Controlled drugs,A no further action disposal,True,True,northumbria,False,False
856607,ad053a34-364e-4d24-8f5c-9734ab5fdbe0,Person and Vehicle search,2020-04-30T17:00:00+00:00,,54.966266,-1.453704,Male,18-24,White - English/Welsh/Scottish/Northern Irish/...,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,Khat or Cannabis warning,True,False,northumbria,True,True
856608,8736e5ec-7ca2-420b-ad56-4dd88d27fe6e,Person search,2020-04-30T17:35:00+00:00,,54.971596,-1.636589,Male,25-34,White - English/Welsh/Scottish/Northern Irish/...,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,Arrest,True,False,northumbria,True,True


In [13]:
df.Outcome.value_counts()

A no further action disposal       307822
Arrest                              56013
Community resolution                28604
Summons / charged by post           10405
Caution (simple or conditional)      2908
Penalty Notice for Disorder          2295
Name: Outcome, dtype: int64

In [14]:
df[["Object of search"]].value_counts().sort_index()

Object of search                              
Anything to threaten or harm anyone                 9349
Article for use in theft                           38706
Articles for use in criminal damage                14567
Controlled drugs                                  267946
Crossbows                                             54
Detailed object of search unavailable                309
Evidence of hunting any wild mammal with a dog        12
Evidence of offences under the Act                  3304
Evidence of wildlife offences                         46
Firearms                                            3300
Fireworks                                           3331
Game or poaching equipment                           321
Goods on which duty has not been paid etc.            97
Offensive weapons                                  45455
Psychoactive substances                             4805
Seals or hunting equipment                             7
Stolen goods                             

In [15]:
df[["Object of search","Outcome linked to object of search", "Outcome"]].value_counts().sort_index()

Object of search                     Outcome linked to object of search  Outcome                        
Anything to threaten or harm anyone  False                               A no further action disposal       4851
                                                                         Arrest                              446
                                                                         Caution (simple or conditional)       4
                                                                         Community resolution                102
                                                                                                            ... 
Stolen goods                         True                                Caution (simple or conditional)      53
                                                                         Community resolution                845
                                                                         Penalty Notice for Disorder    

In [16]:
df[["Outcome linked to object of search"]].value_counts().sort_index()

Outcome linked to object of search
False                                 139123
True                                  115560
dtype: int64

In [17]:
df[["Part of a policing operation"]].value_counts()

Part of a policing operation
False                           191281
True                             18472
dtype: int64

In [18]:
df_clean  = df[[
    'observation_id',
    'Type',
    'Date',
    'Part of a policing operation',
    'Latitude',
    'Longitude',
    'Gender',
    'Age range',
    'Officer-defined ethnicity',
    'Legislation',
    'Object of search',
    'station',
    'label',
    'Outcome linked to object of search',
]].dropna(subset=[
    'Type',
    'Gender',
    'Age range',
    'Officer-defined ethnicity',
    'Object of search',
    'station',
    'label',
    'Part of a policing operation',
    'Outcome linked to object of search',
 ])

In [19]:
X, X_test, y, y_test = train_test_split(df_clean[[
                                                        'observation_id',
                                                        'Type',
                                                        'Date',
                                                        'Part of a policing operation',
                                                        'Latitude',
                                                        'Longitude',
                                                        'Gender',
                                                        'Age range',
                                                        'Officer-defined ethnicity',
                                                        'Legislation',
                                                        'Object of search',
                                                        'station',
                                                    ]], df_clean[["label"]], test_size=0.33, random_state=42)

In [20]:
# X = df_clean[[
#     'observation_id',
#     'Type',
#     'Date',
#     'Part of a policing operation',
#     'Latitude',
#     'Longitude',
#     'Gender',
#     'Age range',
#     'Officer-defined ethnicity',
#     'Legislation',
#     'Object of search',
#     'station',
# ]]

In [21]:
# y = df_clean[["label"]]

In [22]:
X

Unnamed: 0,observation_id,Type,Date,Part of a policing operation,Latitude,Longitude,Gender,Age range,Officer-defined ethnicity,Legislation,Object of search,station
458720,639dd366-bb14-4da5-8a7f-60f7cb005191,Person search,2021-07-22T23:45:00+00:00,False,52.926522,0.496395,Male,25-34,Asian,Misuse of Drugs Act 1971 (section 23),Controlled drugs,norfolk
595850,f93a2ccd-f12b-4e3c-b537-8e3312aa52df,Person and Vehicle search,2021-05-03T13:36:29+00:00,False,52.350592,-1.270390,Male,18-24,White,Police and Criminal Evidence Act 1984 (section 1),Stolen goods,warwickshire
296544,b816b66f-8135-4d18-bcab-b6f1599b81db,Person search,2020-06-11T03:01:00+00:00,False,51.846788,1.268630,Male,18-24,Black,Police and Criminal Evidence Act 1984 (section 1),Offensive weapons,essex
280638,7e2e77c9-ab19-4b47-bf8d-a58b7aef3e38,Person search,2020-06-13T18:15:00+00:00,False,53.776089,-1.541984,Male,25-34,White,Police and Criminal Evidence Act 1984 (section 1),Article for use in theft,west-yorkshire
647721,09f2e251-dd5a-4546-9872-c591526715ce,Person search,2020-05-30T00:01:00+00:00,False,51.604211,0.084091,Male,25-34,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,essex
...,...,...,...,...,...,...,...,...,...,...,...,...
688949,a00bd29b-8603-4c8b-b4ba-d23969360d75,Person search,2020-05-13T14:03:00+00:00,False,51.860974,-2.241702,Male,25-34,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,gloucestershire
569043,5e5f548a-e346-47b2-b9e2-c6d893bbf201,Person search,2021-05-06T21:26:24+00:00,False,52.371855,-2.244513,Male,over 34,White,Police and Criminal Evidence Act 1984 (section 1),Stolen goods,west-mercia
731011,3f8865b1-2f70-4044-b56f-92b0bdb988ba,Person and Vehicle search,2021-09-12T23:50:00+00:00,False,,,Male,over 34,Black,,Controlled drugs,thames-valley
801642,f760da22-43a5-4ae4-b4b1-83fb0089f16a,Person search,2020-08-12T03:00:00+00:00,False,,,Male,18-24,White,Police and Criminal Evidence Act 1984 (section 1),Article for use in theft,south-yorkshire


In [23]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 105847 entries, 458720 to 692954
Data columns (total 12 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   observation_id                105847 non-null  object 
 1   Type                          105847 non-null  object 
 2   Date                          105847 non-null  object 
 3   Part of a policing operation  105847 non-null  object 
 4   Latitude                      78254 non-null   float64
 5   Longitude                     78254 non-null   float64
 6   Gender                        105847 non-null  object 
 7   Age range                     105847 non-null  object 
 8   Officer-defined ethnicity     105847 non-null  object 
 9   Legislation                   93832 non-null   object 
 10  Object of search              105847 non-null  object 
 11  station                       105847 non-null  object 
dtypes: float64(2), object(10)
memory usage:

In [24]:
X.columns.tolist()

['observation_id',
 'Type',
 'Date',
 'Part of a policing operation',
 'Latitude',
 'Longitude',
 'Gender',
 'Age range',
 'Officer-defined ethnicity',
 'Legislation',
 'Object of search',
 'station']

In [25]:
y.value_counts()/len(y)

label
False    0.7568
True     0.2432
dtype: float64

In [26]:
selected_columns = [
 'Type',
 'Gender',
 'Age range',
 'Officer-defined ethnicity',
 'Object of search',
 'station',
]

In [27]:
from sklearn.base import TransformerMixin, BaseEstimator
import pandas as pd

class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X[self.columns]

In [28]:
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate

In [29]:
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier



In [30]:
models = [
    # ('linear_Regression', LinearRegression()),
    ('Logistic Regression', LogisticRegression()),
    # ('SVM', SVC()),
    ('Decision Tree', DecisionTreeClassifier()),
    ('Random Forest', RandomForestClassifier()),
    ('AdaBoost', AdaBoostClassifier()),
]

In [31]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [32]:
pipe = Pipeline([
    ('column_selector', ColumnTransformer([("selector", "passthrough", selected_columns)], remainder="drop")),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ('classifier', None),
])


In [33]:
import copy

In [34]:
model_pred = {}
model_pipe = {}
for model_name, model in models:
    pipe.set_params(classifier=model)
    pipe.fit(X,y.astype('int').values.reshape(-1))
    predictions = pipe.predict(X_test)
    
    model_pred[model_name]= predictions
    model_pipe[model_name]= copy.deepcopy(pipe)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [35]:
model_pred

{'Logistic Regression': array([1, 0, 0, ..., 0, 0, 0]),
 'Decision Tree': array([1, 0, 0, ..., 0, 0, 0]),
 'Random Forest': array([1, 0, 0, ..., 0, 0, 0]),
 'AdaBoost': array([1, 0, 0, ..., 0, 0, 0])}

In [36]:
model_pipe

{'Logistic Regression': Pipeline(steps=[('column_selector',
                  ColumnTransformer(transformers=[('selector', 'passthrough',
                                                   ['Type', 'Gender',
                                                    'Age range',
                                                    'Officer-defined ethnicity',
                                                    'Object of search',
                                                    'station'])])),
                 ('onehot', OneHotEncoder(handle_unknown='ignore')),
                 ('classifier', LogisticRegression())]),
 'Decision Tree': Pipeline(steps=[('column_selector',
                  ColumnTransformer(transformers=[('selector', 'passthrough',
                                                   ['Type', 'Gender',
                                                    'Age range',
                                                    'Officer-defined ethnicity',
                                

In [37]:
def verify_no_discrimination(X_test, y_true, y_pred, sensitive_column='SubjectRaceCode', max_diff=0.05):
    """
    Verifies that no subdeparment has discrimination in between protected races
    
    
    """
    
    departments = X_test['station'].unique()
    sensitive_classes = X_test[sensitive_column].unique()
    
    is_satisfied = True
    problematic_departments = []
    good_deparments = []
    for department in departments:
        precisions = {}
        for sensitive_class in sensitive_classes:
            mask = (X_test[sensitive_column] == sensitive_class) & (X_test['station'] == department)
            if mask.sum():
                precisions[sensitive_class] = precision_score(y_true[mask], y_pred[mask], pos_label=1, zero_division=0)
                
        diff = np.max(list(precisions.values())) - np.min(list(precisions.values()))
        if diff > max_diff:
            is_satisfied = False
            problematic_departments.append((department, diff, precisions))
        else:
            good_deparments.append((department, diff, precisions))

    return is_satisfied, problematic_departments, good_deparments

In [38]:
y_test.astype(int)

Unnamed: 0,label
290896,1
647423,0
775583,1
469178,0
247583,0
...,...
568944,0
646154,0
694232,0
316641,0


In [39]:
model_pred["Logistic Regression"]

array([1, 0, 0, ..., 0, 0, 0])

In [40]:
pd.DataFrame(model_pred["Logistic Regression"], index=y_test.index, columns=y_test.columns)

Unnamed: 0,label
290896,1
647423,0
775583,0
469178,0
247583,0
...,...
568944,0
646154,0
694232,0
316641,0


In [41]:
def verify_success_rate_above(y_true, y_pred, min_success_rate=0.1):
    """
    Verifies the success rate on a test set is above a provided minimum
    
    
    """
    
    precision = precision_score(y_true, y_pred, pos_label=True)
    is_satisfied = (precision >= min_success_rate)
    
    return is_satisfied, precision

In [42]:
def verify_across_stations(X_test, y_true, y_pred, max_diff=0.1, min_success_rate=0.1):
    """
    Verifies that no station has discrimination in sensitive_column
    
    
    """
    
    departments = X_test['station'].unique()

    is_satisfied = True
    problematic_departments = []
    good_deparments = []
    precisions = []
    for department in departments:
        mask = (X_test['station'] == department)
        if mask.sum():
            station_precision = precision_score(y_true[mask], y_pred[mask], pos_label=1, zero_division=0)
            precisions.append(station_precision)

        if station_precision < min_success_rate:
            problematic_departments.append((department, station_precision))
        else:
            good_deparments.append((department, station_precision))

    diff = np.max(precisions) - np.min(precisions)
    if diff > max_diff:
            is_satisfied = False

    return is_satisfied, diff, problematic_departments, good_deparments

In [45]:
for model_predictions in model_pred:
    print("\n")
    print(model_predictions)
    is_verified, success_rate = verify_success_rate_above(y_test.astype(int), pd.DataFrame(model_pred[model_predictions], index=y_test.index, columns=y_test.columns))

    print(is_verified, success_rate)
    print("\nGender")
    is_satisfied, problematic_departments, good_deparments = verify_no_discrimination(X_test, y_test.astype(int), pd.DataFrame(model_pred[model_predictions], index=y_test.index, columns=y_test.columns), sensitive_column='Gender', max_diff=0.05)

    if not is_satisfied:
        print("Requirement failed 😢")
        print("Num problematic departments: {}".format(len(problematic_departments)))
        print("Num good departments: {}".format(len(good_deparments)))

        print("avg diff:", np.mean([p[1] for p in problematic_departments]))
            
            
    print("\nEthnicity")

    is_satisfied, problematic_departments, good_deparments = verify_no_discrimination(X_test, y_test.astype(int), pd.DataFrame(model_pred[model_predictions], index=y_test.index, columns=y_test.columns), sensitive_column='Officer-defined ethnicity', max_diff=0.05)

    if not is_satisfied:
        print("Requirement failed 😢")
        print("Num problematic departments: {}".format(len(problematic_departments)))
        print("Num good departments: {}".format(len(good_deparments)))

        print("avg diff:", np.mean([p[1] for p in problematic_departments]))
    else:
        print("Requirement satisfied! 🚀")

    print("Departments analysed: {}".format(len(problematic_departments) + len(good_deparments)))

    across_stations_is_satisfied, across_stations_diff, across_stations_problematic_departments, across_stations_good_deparments = verify_across_stations(X_test, y_test.astype(int), pd.DataFrame(model_pred[model_predictions], index=y_test.index, columns=y_test.columns))

    print(f"\nAcross stations is satisfied: {across_stations_is_satisfied}")
    print(f"Across stations is diff: {across_stations_diff}")
    print(f"Across stations problematic departments: {across_stations_problematic_departments}")    


    print("Recall score: ", recall_score(y_test.astype(int), pd.DataFrame(model_pred[model_predictions], index=y_test.index, columns=y_test.columns)))
    print("---------------------------------------------------------------------------------------------------------------------------------------------")



Logistic Regression
True 0.7254901960784313

Gender
Requirement failed 😢
Num problematic departments: 3
Num good departments: 18
avg diff: 0.10745444102104677

Ethnicity
Requirement failed 😢
Num problematic departments: 6
Num good departments: 15
avg diff: 0.34563279054349066
Departments analysed: 21

Across stations is satisfied: False
Across stations is diff: 0.8571428571428571
Across stations problematic departments: [('essex', 0.0), ('kent', 0.0), ('west-yorkshire', 0.0), ('south-yorkshire', 0.0), ('northamptonshire', 0.0), ('surrey', 0.0), ('norfolk', 0.0), ('cumbria', 0.0), ('west-mercia', 0.0), ('gloucestershire', 0.0), ('sussex', 0.0), ('warwickshire', 0.0), ('suffolk', 0.0), ('derbyshire', 0.0)]
Recall score:  0.3215852005422215
---------------------------------------------------------------------------------------------------------------------------------------------


Decision Tree
True 0.7619441571871768

Gender
Requirement failed 😢
Num problematic departments: 13
Num goo

In [59]:
model_pipe

{'Logistic Regression': Pipeline(steps=[('column_selector',
                  ColumnTransformer(transformers=[('selector', 'passthrough',
                                                   ['Type', 'Gender',
                                                    'Age range',
                                                    'Officer-defined ethnicity',
                                                    'Object of search',
                                                    'station'])])),
                 ('onehot', OneHotEncoder(handle_unknown='ignore')),
                 ('classifier', LogisticRegression())]),
 'Decision Tree': Pipeline(steps=[('column_selector',
                  ColumnTransformer(transformers=[('selector', 'passthrough',
                                                   ['Type', 'Gender',
                                                    'Age range',
                                                    'Officer-defined ethnicity',
                                

In [60]:
best_model = model_pipe["Logistic Regression"]

In [63]:
best_model.fit(df_clean[[
    'observation_id',
    'Type',
    'Date',
    'Part of a policing operation',
    'Latitude',
    'Longitude',
    'Gender',
    'Age range',
    'Officer-defined ethnicity',
    'Legislation',
    'Object of search',
    'station',
]], df_clean["label"].astype(int))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pipeline(steps=[('column_selector',
                 ColumnTransformer(transformers=[('selector', 'passthrough',
                                                  ['Type', 'Gender',
                                                   'Age range',
                                                   'Officer-defined ethnicity',
                                                   'Object of search',
                                                   'station'])])),
                ('onehot', OneHotEncoder(handle_unknown='ignore')),
                ('classifier', LogisticRegression())])

In [23]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer



pipeline = Pipeline(steps=[
    ('column_selector', ColumnTransformer([("selector", "passthrough", selected_columns)], remainder="drop")),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ('model', RandomForestClassifier(n_estimators=100, random_state=42))
    ])          

In [24]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 646620 entries, 940 to 854517
Data columns (total 12 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   observation_id                646620 non-null  object 
 1   Type                          646620 non-null  object 
 2   Date                          646620 non-null  object 
 3   Part of a policing operation  646620 non-null  object 
 4   Latitude                      527277 non-null  float64
 5   Longitude                     527277 non-null  float64
 6   Gender                        646620 non-null  object 
 7   Age range                     646620 non-null  object 
 8   Officer-defined ethnicity     646620 non-null  object 
 9   Legislation                   608719 non-null  object 
 10  Object of search              646620 non-null  object 
 11  station                       646620 non-null  object 
dtypes: float64(2), object(10)
memory usage: 64

In [25]:
y.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 646620 entries, 940 to 854517
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   label   646620 non-null  bool 
dtypes: bool(1)
memory usage: 5.5 MB


In [26]:
pipeline

Pipeline(steps=[('column_selector',
                 ColumnTransformer(transformers=[('selector', 'passthrough',
                                                  ['Type', 'Gender',
                                                   'Age range',
                                                   'Officer-defined ethnicity',
                                                   'Object of search',
                                                   'station'])])),
                ('onehot', OneHotEncoder(handle_unknown='ignore')),
                ('model', RandomForestClassifier(random_state=42))])

In [27]:
pipeline.fit(X,y.astype('int'))

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


Pipeline(steps=[('column_selector',
                 ColumnTransformer(transformers=[('selector', 'passthrough',
                                                  ['Type', 'Gender',
                                                   'Age range',
                                                   'Officer-defined ethnicity',
                                                   'Object of search',
                                                   'station'])])),
                ('onehot', OneHotEncoder(handle_unknown='ignore')),
                ('model', RandomForestClassifier(random_state=42))])

In [28]:
os.getcwd()

'/home/xjrr/ldsa-capstone-project'

In [29]:
TMP_DIR = 'save-model-forest-2'

In [30]:
if not os.path.exists(TMP_DIR):
   # Create a new directory because it does not exist
   os.makedirs(TMP_DIR)

In [31]:
with open(os.path.join(os.getcwd(), TMP_DIR, 'columns.json'), 'w') as file_save:
    json.dump(X.columns.tolist(), file_save)
    
# pipeline

joblib.dump(pipeline, os.path.join(os.getcwd(), TMP_DIR, 'pipeline.pickle'))

# dtypes

with open(os.path.join(os.getcwd(), TMP_DIR, 'dtypes.pickle'), 'wb') as file_save:
    pickle.dump(X.dtypes, file_save)

In [64]:
TMP_DIR = 'save-LR'
if not os.path.exists(TMP_DIR):
   # Create a new directory because it does not exist
   os.makedirs(TMP_DIR)

joblib.dump(best_model, os.path.join(os.getcwd(), TMP_DIR, 'pipeline.pickle'))


['/home/jrr/ldsa-capstone-project/save-LR/pipeline.pickle']