In [11]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, recall_score, precision_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import json
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
#from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier


In [12]:
def load_data(file, data_dir="data"):
    """Read a CSV file from the data directory into a DataFrame.

    Parameters
    ----------
    file : str
        Name of the CSV file, relative to ``data_dir``.
    data_dir : str, default "data"
        Directory that contains the CSV files. The default keeps the
        original behaviour of reading from ``data/<file>`` relative to the
        current working directory.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    return pd.read_csv(os.path.join(data_dir, file))

# Raw training data, read from train.csv through load_data().
df = load_data("train.csv")

In [13]:
# Second dataset (test_data.csv); it is concatenated with the cleaned training data further down.
new_df = load_data("test_data.csv")


In [14]:
# Give the label column the same name ("success") that clean() uses for its target column.
new_df = new_df.rename({"Outcome": "success"}, axis="columns")


In [15]:
def clean(df):
    """Filter the raw stop-and-search frame and derive the ``success`` target.

    Steps:
      1. Remove rows recorded by the excluded stations.
      2. Fill missing ``Part of a policing operation`` with ``False`` (cast to
         bool) and missing ``Legislation`` with ``'unknown'``.
      3. Set ``success`` to ``True`` only when ``Outcome`` is one of the
         successful outcomes AND ``Outcome linked to object of search`` is
         ``True`` (missing values count as ``False``).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. Must contain the columns returned below (except
        ``success``) plus ``Outcome`` and
        ``Outcome linked to object of search``. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        Filtered rows restricted to the modelling columns plus the boolean
        ``success`` column.
    """
    excluded_stations = ["leicestershire", 'humberside', 'lancashire',
                         'metropolitan', 'west-midlands']

    success_outcomes = ['Community resolution', 'Khat or Cannabis warning', 'Caution (simple or conditional)',
                        'Arrest', 'Penalty Notice for Disorder', 'Summons / charged by post',
                        'Suspect arrested', 'Suspect summoned to court']

    output_columns = ['observation_id', 'Type', 'Date', 'Part of a policing operation',
                      'Latitude', 'Longitude', 'Gender', 'Age range',
                      'Officer-defined ethnicity', 'Legislation', 'Object of search',
                      'station', "success"]

    # Filter with a boolean mask instead of df.drop(<index labels>): dropping by
    # label also removes unrelated rows that share an index label with an
    # excluded row when the index is not unique. The copy keeps the caller's
    # frame untouched by the assignments below.
    df = df.loc[~df['station'].isin(excluded_stations)].copy()

    df['Part of a policing operation'] = (
        df['Part of a policing operation'].fillna(False).astype(bool)
    )
    df['Legislation'] = df['Legislation'].fillna('unknown')

    # Missing "linked" flags are treated as not linked.
    linked = df['Outcome linked to object of search'].fillna(False)

    # Boolean target, computed column-wise (no row-by-row apply, which is slow
    # and misbehaves on an empty frame). 'A no further action disposal' is not
    # a successful outcome, so those rows are False without special-casing.
    df['success'] = df['Outcome'].isin(success_outcomes) & linked.eq(True)

    return df[output_columns]

# Cleaned training frame carrying the boolean `success` target.
# NOTE: `df_new` (cleaned train.csv) is easy to confuse with `new_df` (test_data.csv).
df_new = clean(df)

In [16]:
# Remove the 'Predicted outcome' column before the frames are concatenated.
new_df = new_df.drop(columns=['Predicted outcome'])

In [17]:
# Stack the test_data.csv rows on top of the cleaned training rows and renumber the index from 0.
combined_df = pd.concat([new_df, df_new], ignore_index=True)

In [18]:
class DateTransformer(BaseEstimator, TransformerMixin):
    """Expand a ``Date`` column into calendar features.

    Stateless transformer: ``fit`` learns nothing. ``transform`` parses the
    ``Date`` column with ``pd.to_datetime`` and returns a frame holding only
    the derived ``Hour``, ``Month``, ``Day`` and ``DayOfWeek`` columns.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing to learn; return ``self`` so the transformer can be chained."""
        return self

    def transform(self, X):
        """Return the calendar features derived from ``X['Date']``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame with a ``Date`` column that ``pd.to_datetime`` can parse.
            The input is not modified.

        Returns
        -------
        pandas.DataFrame
            Columns ``Hour``, ``Month``, ``Day``, ``DayOfWeek`` (in that
            order), indexed like ``X``.
        """
        dates = pd.to_datetime(X['Date'])
        return pd.DataFrame({
            'Hour': dates.dt.hour,
            'Month': dates.dt.month,
            'Day': dates.dt.day,
            'DayOfWeek': dates.dt.weekday,
        })

    def get_feature_names_out(self, input_features=None):
        """Return the names of the columns produced by ``transform``.

        ``input_features`` is accepted (and ignored) because scikit-learn
        containers such as ``ColumnTransformer`` call
        ``get_feature_names_out(input_features)``; without the parameter that
        call raises ``TypeError``.

        NOTE(review): scikit-learn's convention is an array of strings; the
        original ``(source, feature)`` tuples are kept so existing callers see
        the same values.
        """
        return [('Date', 'Hour'), ('Date', 'Month'), ('Date', 'Day'),('Date', 'DayOfWeek')]


In [19]:

features = ['Part of a policing operation','Legislation', 'Object of search','Date','station','Gender', 'Age range','Officer-defined ethnicity']
target = 'success'

# No hold-out split is made here: the model is fitted on every row of combined_df.
X_train = combined_df[features]
y_train = combined_df[target]

# 'Date' is intentionally NOT listed as a categorical column. One-hot encoding
# raw timestamps creates one near-unique column per row, which carries no
# signal for unseen data (unknown categories are ignored at predict time) and
# inflates the feature matrix. The date is used only through DateTransformer.
categorical_columns = ['Legislation', 'Object of search','Part of a policing operation',"Age range","Gender",'Officer-defined ethnicity','station']
feat_columns = ['Date']

# Categorical features: fill gaps with a 'missing' category, then one-hot
# encode; categories unseen during fit are encoded as all zeros.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])

preprocessor = ColumnTransformer(
    transformers=[
       ('date_transformer', DateTransformer(), feat_columns),
        ('categorical_transformers', categorical_transformer, categorical_columns),
    ])

# The preprocessor is not fitted separately: pipeline.fit() below fits it.
# The step name "classifer" (sic) is kept so code that looks the step up by
# name keeps working.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           # Alternative tried:
                           #("classifer",    LGBMClassifier(n_estimators=100, max_depth = 3,learning_rate=0.1,class_weight="balanced", random_state=42, n_jobs=-1))
                           # max_iter is raised from the default because the solver stopped at the
                           # iteration limit ("STOP: TOTAL NO. of ITERATIONS REACHED LIMIT").
                           ("classifer",LogisticRegression(C=1, class_weight='balanced', max_iter=1000, n_jobs=-1, random_state=42))
                        ])

# Fit model on training data
pipeline.fit(X_train, y_train)




In [20]:
import pickle
import gzip

# Persist what is needed to rebuild model inputs and score them later:
# the feature column order, the training dtypes and the fitted pipeline.
# All three files are written to the current working directory.
columns = list(X_train.columns)
dtypes = X_train.dtypes

# Column order as JSON.
with open('columns.json', 'w') as fh:
    json.dump(columns, fh)

# dtypes and the fitted pipeline as pickles.
for filepath, artifact in (('dtypes.pickle', dtypes), ('pipeline.pickle', pipeline)):
    with open(filepath, 'wb') as fh:
        pickle.dump(artifact, fh)
    


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Test run against the deployed API

In [21]:
import pandas as pd

# Parse the two request batches, one DataFrame per JSON file.
df1, df2 = map(pd.read_json, ('newdata/subset_moment_1.json', 'newdata/subset_moment_2.json'))

In [22]:
# Flatten the records held in each frame's 'data' column into one column per field.
new_df1, new_df2 = (pd.json_normalize(batch['data']) for batch in (df1, df2))


In [None]:
# Set the base URL of your API
import requests
import pandas as pd
import time

APP_NAME = 'railcap-production.up.railway.app'
base_url = "https://{}/should_search/".format(APP_NAME)
# Test locally
#base_url = f"http://localhost:5000/should_search/"

MAX_REQUESTS = 1000     # number of observations to replay
REQUEST_TIMEOUT = 10    # seconds; without a timeout a stalled server blocks the cell forever
REQUEST_PAUSE = 0.1     # seconds to wait between requests

# One Session reuses the HTTPS connection across all requests.
with requests.Session() as session:
    # head() caps the run at MAX_REQUESTS rows (replaces the manual counter + break).
    for i, row in new_df1.head(MAX_REQUESTS).iterrows():
        # Fields that are NaN for this observation are left out of the payload.
        row_dict = row.dropna().to_dict()
        print(row_dict)

        # Send a POST request to the API with the row data
        response = session.post(base_url, json=row_dict, timeout=REQUEST_TIMEOUT)

        # Print the response status code and content for debugging purposes
        print(response.status_code)
        print(response.content)

        time.sleep(REQUEST_PAUSE)



{'observation_id': 'ae3e0d41-7c19-4ecc-8d32-767e3bfdb52a', 'Type': 'Person search', 'Date': '2022-01-01T00:35:00+00:00', 'Part of a policing operation': False, 'Gender': 'Male', 'Age range': '18-24', 'Officer-defined ethnicity': 'Black', 'Legislation': 'Criminal Justice Act 1988 (section 139B)', 'Object of search': 'Offensive weapons', 'station': 'nottinghamshire'}
200
b'{\n  "outcome": true\n}\n'
{'observation_id': '1b553034-0b2b-435b-b614-ce0a6c8d5f83', 'Type': 'Person search', 'Date': '2022-01-01T01:05:06+00:00', 'Part of a policing operation': False, 'Latitude': 52.574809, 'Longitude': -0.241137, 'Gender': 'Male', 'Age range': '10-17', 'Officer-defined ethnicity': 'Asian', 'Legislation': 'Misuse of Drugs Act 1971 (section 23)', 'Object of search': 'Controlled drugs', 'station': 'cambridgeshire'}
405
b'{\n  "error": "Observation ID: \\"1b553034-0b2b-435b-b614-ce0a6c8d5f83\\" already exists",\n  "outcome": true\n}\n'
{'observation_id': '36ec1461-5190-496c-b82f-95c7ae9ff59f', 'Type': 

405
b'{\n  "error": "Observation ID: \\"9e8783a9-8377-448f-8882-d4b2f352187a\\" already exists",\n  "outcome": false\n}\n'
{'observation_id': '5f10c933-9c8b-4f9e-872c-c807b41a75c2', 'Type': 'Person search', 'Date': '2022-01-01T05:21:31+00:00', 'Latitude': 52.536892, 'Longitude': -0.26474000000000003, 'Gender': 'Male', 'Age range': '18-24', 'Officer-defined ethnicity': 'Other', 'Legislation': 'Misuse of Drugs Act 1971 (section 23)', 'Object of search': 'Controlled drugs', 'station': 'cambridgeshire'}
405
b'{\n  "error": "Observation ID: \\"5f10c933-9c8b-4f9e-872c-c807b41a75c2\\" already exists",\n  "outcome": true\n}\n'
{'observation_id': '1e06ef0a-35e0-48af-b524-1eeeb5029b72', 'Type': 'Person search', 'Date': '2022-01-01T05:41:27+00:00', 'Latitude': 51.518091, 'Longitude': -0.078308, 'Gender': 'Male', 'Age range': '18-24', 'Officer-defined ethnicity': 'White', 'Legislation': 'Police and Criminal Evidence Act 1984 (section 1)', 'Object of search': 'Offensive weapons', 'station': 'city-o

405
b'{\n  "error": "Observation ID: \\"c948d19f-00fc-443c-8bc8-004692b969c6\\" already exists",\n  "outcome": true\n}\n'
{'observation_id': '66b0867a-d116-4309-95a3-b4a994325c8a', 'Type': 'Person search', 'Date': '2022-01-01T23:54:00+00:00', 'Part of a policing operation': False, 'Latitude': 51.879133, 'Longitude': -0.423014, 'Gender': 'Male', 'Age range': 'over 34', 'Officer-defined ethnicity': 'Asian', 'Legislation': 'Police and Criminal Evidence Act 1984 (section 1)', 'Object of search': 'Offensive weapons', 'station': 'bedfordshire'}
405
b'{\n  "error": "Observation ID: \\"66b0867a-d116-4309-95a3-b4a994325c8a\\" already exists",\n  "outcome": false\n}\n'
{'observation_id': '6b3370a4-0c58-4130-a951-c25b6a6d54a8', 'Type': 'Person and Vehicle search', 'Date': '2022-01-02T01:10:00+00:00', 'Part of a policing operation': False, 'Latitude': 50.128305, 'Longitude': -5.541647, 'Gender': 'Male', 'Age range': 'over 34', 'Officer-defined ethnicity': 'White', 'Legislation': 'Misuse of Drugs A

200
b'{\n  "outcome": true\n}\n'
{'observation_id': 'f7d59b8e-85e0-4f8c-b844-6a092ed12796', 'Type': 'Person and Vehicle search', 'Date': '2022-01-02T11:21:56+00:00', 'Latitude': 51.512108, 'Longitude': -0.09797299999999999, 'Gender': 'Female', 'Age range': '18-24', 'Officer-defined ethnicity': 'White', 'Legislation': 'Misuse of Drugs Act 1971 (section 23)', 'Object of search': 'Controlled drugs', 'station': 'city-of-london'}
200
b'{\n  "outcome": true\n}\n'
{'observation_id': '5070589d-1109-4ace-8739-b0b7d1630a2a', 'Type': 'Person search', 'Date': '2022-01-02T11:31:14+00:00', 'Latitude': 51.516814, 'Longitude': -0.08162000000000001, 'Gender': 'Male', 'Age range': '18-24', 'Officer-defined ethnicity': 'Black', 'Legislation': 'Misuse of Drugs Act 1971 (section 23)', 'Object of search': 'Controlled drugs', 'station': 'city-of-london'}
200
b'{\n  "outcome": true\n}\n'
{'observation_id': '23ec32ae-4336-4d54-ab82-56220edff2a3', 'Type': 'Person search', 'Date': '2022-01-02T12:37:00+00:00', 'P

200
b'{\n  "outcome": true\n}\n'
{'observation_id': 'f1e83622-4a1a-484d-ba4a-4976cb9d1be8', 'Type': 'Person search', 'Date': '2022-01-02T17:17:00+00:00', 'Part of a policing operation': True, 'Gender': 'Male', 'Age range': '10-17', 'Officer-defined ethnicity': 'White', 'Legislation': 'Misuse of Drugs Act 1971 (section 23)', 'Object of search': 'Controlled drugs', 'station': 'nottinghamshire'}
200
b'{\n  "outcome": true\n}\n'
{'observation_id': '6a6a3100-1434-478b-88bc-48f315ef4bd8', 'Type': 'Person search', 'Date': '2022-01-02T17:20:00+00:00', 'Part of a policing operation': False, 'Gender': 'Male', 'Age range': '25-34', 'Officer-defined ethnicity': 'White', 'Legislation': 'Police and Criminal Evidence Act 1984 (section 1)', 'Object of search': 'Article for use in theft', 'station': 'nottinghamshire'}
200
b'{\n  "outcome": true\n}\n'
{'observation_id': '7b80a649-7f61-45ee-b1d9-70e809f03867', 'Type': 'Person and Vehicle search', 'Date': '2022-01-02T17:49:00+00:00', 'Part of a policing o

200
b'{\n  "outcome": true\n}\n'
{'observation_id': 'e9ae94b2-d3f8-40ff-bd94-f994b687081b', 'Type': 'Person and Vehicle search', 'Date': '2022-01-03T01:15:00+00:00', 'Latitude': 54.521018, 'Longitude': -1.551211, 'Gender': 'Male', 'Age range': '18-24', 'Officer-defined ethnicity': 'White', 'Legislation': 'Police and Criminal Evidence Act 1984 (section 1)', 'Object of search': 'Article for use in theft', 'station': 'durham'}
200
b'{\n  "outcome": true\n}\n'
{'observation_id': 'b2c0c0d3-ca87-471d-8d22-da78286bf56a', 'Type': 'Person search', 'Date': '2022-01-03T01:24:00+00:00', 'Latitude': 54.521018, 'Longitude': -1.551211, 'Gender': 'Male', 'Age range': '18-24', 'Officer-defined ethnicity': 'White', 'Legislation': 'Police and Criminal Evidence Act 1984 (section 1)', 'Object of search': 'Article for use in theft', 'station': 'durham'}
200
b'{\n  "outcome": true\n}\n'
{'observation_id': '14dd2769-d267-47b3-b441-18b56e1aa52b', 'Type': 'Person search', 'Date': '2022-01-03T01:27:00+00:00', 'L

200
b'{\n  "outcome": false\n}\n'
{'observation_id': '9ebb4d68-0133-4204-a0b3-615de7574ff2', 'Type': 'Person and Vehicle search', 'Date': '2022-01-03T21:00:00+00:00', 'Part of a policing operation': False, 'Latitude': 51.91952, 'Longitude': -0.436911, 'Gender': 'Female', 'Age range': '18-24', 'Officer-defined ethnicity': 'Black', 'Legislation': 'Misuse of Drugs Act 1971 (section 23)', 'Object of search': 'Controlled drugs', 'station': 'bedfordshire'}
200
b'{\n  "outcome": true\n}\n'
{'observation_id': 'ef803a01-3308-4332-a5dd-5c4337e76983', 'Type': 'Person search', 'Date': '2022-01-03T21:00:00+00:00', 'Part of a policing operation': False, 'Latitude': 51.91952, 'Longitude': -0.436911, 'Gender': 'Male', 'Age range': '18-24', 'Officer-defined ethnicity': 'Black', 'Legislation': 'Misuse of Drugs Act 1971 (section 23)', 'Object of search': 'Controlled drugs', 'station': 'bedfordshire'}
200
b'{\n  "outcome": true\n}\n'
{'observation_id': '1ad32956-1e26-405a-b25d-a412b72c7f75', 'Type': 'Pers

200
b'{\n  "outcome": true\n}\n'
{'observation_id': '6f2d2f32-8e78-442b-ad08-0ace30be0544', 'Type': 'Person search', 'Date': '2022-01-04T12:55:00+00:00', 'Latitude': 54.712354, 'Longitude': -1.698667, 'Gender': 'Male', 'Age range': '10-17', 'Officer-defined ethnicity': 'White', 'Legislation': 'Police and Criminal Evidence Act 1984 (section 1)', 'Object of search': 'Article for use in theft', 'station': 'durham'}
200
b'{\n  "outcome": true\n}\n'
{'observation_id': '8f3ea15e-f916-48ca-805a-69309ce609df', 'Type': 'Person search', 'Date': '2022-01-04T13:31:00+00:00', 'Part of a policing operation': False, 'Latitude': 50.371812, 'Longitude': -4.147043, 'Gender': 'Male', 'Age range': '25-34', 'Officer-defined ethnicity': 'White', 'Legislation': 'Misuse of Drugs Act 1971 (section 23)', 'Object of search': 'Controlled drugs', 'station': 'devon-and-cornwall'}
200
b'{\n  "outcome": true\n}\n'
{'observation_id': 'cdf7b6cb-a068-4fca-a77e-0c1a0550aedf', 'Type': 'Person search', 'Date': '2022-01-04T

200
b'{\n  "outcome": true\n}\n'
{'observation_id': '2246e59a-4159-48a8-bfe0-51dc40dba6b8', 'Type': 'Person and Vehicle search', 'Date': '2022-01-05T00:19:00+00:00', 'Part of a policing operation': False, 'Gender': 'Male', 'Age range': 'over 34', 'Officer-defined ethnicity': 'White', 'Legislation': 'Police and Criminal Evidence Act 1984 (section 1)', 'Object of search': 'Article for use in theft', 'station': 'bedfordshire'}
200
b'{\n  "outcome": false\n}\n'
{'observation_id': '5d667333-9954-47e0-8a8c-07e34f8780a4', 'Type': 'Person search', 'Date': '2022-01-05T01:25:00+00:00', 'Part of a policing operation': False, 'Gender': 'Female', 'Age range': '25-34', 'Officer-defined ethnicity': 'White', 'Legislation': 'Misuse of Drugs Act 1971 (section 23)', 'Object of search': 'Controlled drugs', 'station': 'nottinghamshire'}
200
b'{\n  "outcome": true\n}\n'
{'observation_id': '212841a8-7b7c-45e2-8d6c-84dd47a37c7f', 'Type': 'Person search', 'Date': '2022-01-05T01:47:00+00:00', 'Part of a policin

200
b'{\n  "outcome": true\n}\n'
{'observation_id': '6e3c1f4f-cb0c-4cf1-a380-94084d9f5c19', 'Type': 'Person search', 'Date': '2022-01-05T12:12:00+00:00', 'Part of a policing operation': False, 'Gender': 'Male', 'Age range': 'over 34', 'Officer-defined ethnicity': 'White', 'Legislation': 'Police and Criminal Evidence Act 1984 (section 1)', 'Object of search': 'Stolen goods', 'station': 'nottinghamshire'}
200
b'{\n  "outcome": true\n}\n'
{'observation_id': '0971d652-25f2-4e3b-9a31-31c9439452f6', 'Type': 'Person and Vehicle search', 'Date': '2022-01-05T12:35:00+00:00', 'Latitude': 54.748727, 'Longitude': -1.289953, 'Gender': 'Male', 'Age range': '10-17', 'Officer-defined ethnicity': 'White', 'Legislation': 'Misuse of Drugs Act 1971 (section 23)', 'Object of search': 'Controlled drugs', 'station': 'durham'}
200
b'{\n  "outcome": true\n}\n'
{'observation_id': '39316bf8-aa91-4962-b6fd-a33e5c5e43da', 'Type': 'Person and Vehicle search', 'Date': '2022-01-05T13:12:00+00:00', 'Latitude': 54.8246

200
b'{\n  "outcome": true\n}\n'
{'observation_id': 'd0123bd6-14de-473f-8a9b-8f654260a8ef', 'Type': 'Person search', 'Date': '2022-01-05T19:55:00+00:00', 'Part of a policing operation': False, 'Latitude': 50.209689, 'Longitude': -5.300219, 'Gender': 'Male', 'Age range': '18-24', 'Officer-defined ethnicity': 'White', 'Legislation': 'Misuse of Drugs Act 1971 (section 23)', 'Object of search': 'Controlled drugs', 'station': 'devon-and-cornwall'}
200
b'{\n  "outcome": true\n}\n'
{'observation_id': 'cc9783a0-83f4-424f-872e-de633b1e584d', 'Type': 'Person search', 'Date': '2022-01-05T19:59:00+00:00', 'Part of a policing operation': False, 'Latitude': 51.066028, 'Longitude': -4.080262, 'Gender': 'Female', 'Age range': '18-24', 'Officer-defined ethnicity': 'White', 'Legislation': 'Misuse of Drugs Act 1971 (section 23)', 'Object of search': 'Controlled drugs', 'station': 'devon-and-cornwall'}
200
b'{\n  "outcome": true\n}\n'
{'observation_id': 'bc349f7c-443f-4962-a26f-5439590ebc16', 'Type': 'Per

200
b'{\n  "outcome": true\n}\n'
{'observation_id': 'f01bc84f-260f-4cc3-a077-fb5191e253d3', 'Type': 'Person search', 'Date': '2022-01-06T04:43:00+00:00', 'Latitude': 54.748187, 'Longitude': -2.017632, 'Gender': 'Male', 'Age range': '18-24', 'Officer-defined ethnicity': 'White', 'Legislation': 'Police and Criminal Evidence Act 1984 (section 1)', 'Object of search': 'Stolen goods', 'station': 'durham'}
200
b'{\n  "outcome": true\n}\n'
{'observation_id': 'f787b261-488f-4353-9697-987bf643d5b8', 'Type': 'Person search', 'Date': '2022-01-06T05:36:25+00:00', 'Latitude': 51.508257, 'Longitude': -0.104503, 'Gender': 'Male', 'Age range': '18-24', 'Officer-defined ethnicity': 'Black', 'Legislation': 'Police and Criminal Evidence Act 1984 (section 1)', 'Object of search': 'Offensive weapons', 'station': 'city-of-london'}
200
b'{\n  "outcome": true\n}\n'
{'observation_id': 'ec391408-d4c1-4600-8d7c-0bc74306c1b5', 'Type': 'Person search', 'Date': '2022-01-06T06:36:14+00:00', 'Latitude': 51.512598, 'L

In [46]:
# Set the base URL of your API
import requests
import pandas as pd
import time

APP_NAME = 'railcap-production.up.railway.app'
base_url = "https://{}/should_search/".format(APP_NAME)

MAX_REQUESTS = 1        # smoke test: send only the first observation
REQUEST_TIMEOUT = 10    # seconds; without a timeout a stalled server blocks the cell forever
REQUEST_PAUSE = 0.1     # seconds to wait between requests

# head() caps the run at MAX_REQUESTS rows (replaces the manual counter + break).
for i, row in new_df1.head(MAX_REQUESTS).iterrows():
    # Fields that are NaN for this observation are left out of the payload.
    row_dict = row.dropna().to_dict()

    # Send a POST request to the API with the row data
    response = requests.post(base_url, json=row_dict, timeout=REQUEST_TIMEOUT)

    # Print the response status code and content for debugging purposes
    print(response.status_code)
    print(response.content)

    time.sleep(REQUEST_PAUSE)

503
b'<!DOCTYPE html>\n<html>\n  <head>\n    <meta name="viewport" content="width=device-width, initial-scale=1" />\n    <meta charset="utf-8" />\n    <title>Server Error</title>\n    <style media="screen">\n      :root {\n        --bg: hsl(250, 24%, 9%);\n        --fg: hsl(0, 0%, 100%);\n        --pink: hsl(270, 60%, 52%);\n      }\n\n      html,\n      body {\n        margin: 0;\n        padding: 0;\n        height: 100%;\n        overflow: hidden;\n        background-color: var(--bg);\n        color: var(--fg);\n        font-family: -apple-system, BlinkMacSystemFont, Segoe UI, Roboto,\n          Oxygen-Sans, Ubuntu, Cantarell, Helvetica Neue, sans-serif,\n          Apple Color Emoji, Segoe UI Emoji, Segoe UI Symbol;\n      }\n\n      main {\n        display: flex;\n        flex-direction: column;\n        align-items: center;\n        justify-content: center;\n        padding: 16px;\n        height: 100%;\n      }\n\n      .logo {\n        width: 160px;\n        height: 160px;\n    