In [11]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, recall_score, precision_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import json
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


In [12]:
def load_data(file):
    df = pd.read_csv(os.path.join("data", file))
    return df

df = load_data("train.csv")

In [13]:
df_train = load_data("test_data.csv")


In [14]:
def clean(df):


    df = df.drop(df[df['station'].isin(["leicestershire",'humberside', 'lancashire','metropolitan','west-midlands'])].index)
    
    df['Part of a policing operation'] = df['Part of a policing operation'].fillna(False)
    df['Part of a policing operation'] = df['Part of a policing operation'].astype(bool)

    df['Outcome linked to object of search'] = df['Outcome linked to object of search'].fillna(False)

    df['Legislation'] = df['Legislation'].fillna('unknown')
    
    df.loc[df['Outcome'] == 'A no further action disposal', 'Outcome linked to object of search'] = False
    
    success_outcomes = ['Community resolution', 'Khat or Cannabis warning', 'Caution (simple or conditional)', 
                    'Arrest', 'Penalty Notice for Disorder', 'Summons / charged by post', 
                    'Suspect arrested', 'Suspect summoned to court']

    # create a new column called "success" with 1 if the outcome is in the list of successful outcomes, 0 otherwise
    df['success'] = df.apply(lambda x: True if x['Outcome'] in success_outcomes and x['Outcome linked to object of search'] == True else False, axis=1)


    #df=df.dropna()
    return df

df_new = clean(df)

In [15]:
class DateTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        X['Date'] = pd.to_datetime(X['Date'])
        X['Hour'] = X['Date'].dt.hour
        X['Month'] = X['Date'].dt.month
        X['Day'] = X['Date'].dt.day
        X['DayOfWeek'] = X['Date'].dt.weekday
        X=X.drop(columns = "Date", axis=1)
        
        return X[['Hour','Month', 'Day', 'DayOfWeek']]

    def get_feature_names_out(self):
        return [('Date', 'Hour'), ('Date', 'Month'), ('Date', 'Day'),('Date', 'DayOfWeek')]


In [16]:
from lightgbm import LGBMClassifier

features = ["observation_id",'Type','Part of a policing operation', 'Latitude', 'Longitude','Legislation', 'Object of search','Date',"Age range","Gender",'station','Officer-defined ethnicity']
target = 'success'

X_train = df_new[features]
y_train =  df_new[target]



categorical_columns = ['Legislation', 'Object of search','Part of a policing operation','Date',"Age range","Gender",'station']
feat_columns = ['Date']

categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])

preprocessor = ColumnTransformer(
    transformers=[
       ('date_transformer', DateTransformer(), feat_columns),
        ('categorical_transformers', categorical_transformer, categorical_columns),
    ])


preprocessor.fit(X_train)

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ("classifer",LogisticRegression(C=1, class_weight='balanced', n_jobs=-1, random_state=42))
                           #("classifer",GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3,random_state=42))
                           #("Classifier", LogisticRegression(n_jobs=-1))
                           #("classifier", RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_leaf=5, class_weight="balanced", random_state=42, n_jobs=-1))
                            #("classifier", LGBMClassifier(n_estimators=100, max_depth = 7,learning_rate=0.1,class_weight="balanced", random_state=42, n_jobs=-1))
                           
                           
                
                            ])

# Fit model on training data
pipeline.fit(X_train, y_train)



In [17]:
X_test = df_train[features]
y_test = df_train['Outcome']


In [19]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score

def verify_no_discrimination(X_test, y_true, y_pred, sensitive_column='Age range', max_diff=0.05):
    """
    Verifies that no department has discrimination in between protected age ranges
    """
    
    departments = X_test['station'].unique()
    sensitive_classes = X_test[sensitive_column].unique()
    
    is_satisfied = True
    problematic_departments = []
    good_departments = []
    for department in departments:
        precisions = {}
        for sensitive_class in sensitive_classes:
            mask = (X_test[sensitive_column] == sensitive_class) & (X_test['station'] == department)
            if mask.sum():
                precisions[sensitive_class] = precision_score(y_true[mask], y_pred[mask], pos_label=1, zero_division=0)
                
        diff = np.max(list(precisions.values())) - np.min(list(precisions.values()))
        if diff > max_diff:
            is_satisfied = False
            problematic_departments.append((department, diff, precisions))
        else:
            good_departments.append((department, diff, precisions))

    return is_satisfied, problematic_departments, good_departments

classifiers = [
    LogisticRegression(C=1, class_weight='balanced', n_jobs=-1, random_state=42),
    LGBMClassifier(n_estimators=100, max_depth = 3,learning_rate=0.1,class_weight="balanced", random_state=42, n_jobs=-1)

]

i=0
for classifier in classifiers:
    print("\n\new")
    print(classifier)
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', classifier)])

    # Fit model on training data
    pipeline.fit(X_train, y_train)

    y_pred_proba = pipeline.predict_proba(X_test)[:, 1] # Get predicted probabilities for positive class
    

    l =[0.95]
    
    for s in l:
    
    
        best_threshold = 0
        best_false_false_rate = 0
        recall_range = (s, 1)

        for threshold in range(0, 100):
            threshold /= 100
            y_pred = (y_pred_proba > threshold).astype(int)
            tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
            false_false_rate = tn / (tn + fp)
            recall = tp / (tp + fn)
            if false_false_rate > best_false_false_rate and recall_range[0] <= recall <= recall_range[1]:
                best_threshold = threshold
                best_false_false_rate = false_false_rate

        threshold = best_threshold 
        #threshold =0.38
        #thresholds = [0.35,0.38]
        
        #threshold=thresholds[i]
        #i+=1
        
        print("threshold",threshold)
        y_pred = (y_pred_proba > threshold).astype(int)


        print('F1 score:', f1_score(y_test, y_pred))
        print('Recall score:', recall_score(y_test, y_pred))
        print('Precision score:', precision_score(y_test, y_pred))
        print("ROC score:", roc_auc_score(y_test, y_pred))

        print("\n\nGender")
        is_satisfied, problematic_departments, good_deparments = verify_no_discrimination(
            X_test, y_test, y_pred, sensitive_column='Gender')

        if not is_satisfied:
            print("Requirement failed 😢")
            print("Num problematic departments: {}".format(len(problematic_departments)))
            print("Num good departments: {}".format(len(good_deparments)))
            g_a= np.mean([p[1] for p in problematic_departments])
            print("avg diff:", np.mean([p[1] for p in problematic_departments]))


        print("\nEthnicity")
        is_satisfied, problematic_departments, good_deparments = verify_no_discrimination(
            X_test, y_test, y_pred, sensitive_column='Officer-defined ethnicity')

        if not is_satisfied:
            print("Requirement failed 😢")
            print("Num problematic departments: {}".format(len(problematic_departments)))
            print("Num good departments: {}".format(len(good_deparments)))
            e_a = np.mean([p[1] for p in problematic_departments])
            print("avg diff:", np.mean([p[1] for p in problematic_departments]))
        else:
            print("Requirement satisfied! 🚀")


        print("\n age ")
        is_satisfied, problematic_departments, good_deparments = verify_no_discrimination(
            X_test, y_test, y_pred)

        if not is_satisfied:
            print("Requirement failed 😢")
            print("Num problematic departments: {}".format(len(problematic_departments)))
            print("Num good departments: {}".format(len(good_deparments)))
            a_a = np.mean([p[1] for p in problematic_departments])
            print("avg diff:", np.mean([p[1] for p in problematic_departments]))
        else:
            print("Requirement satisfied! 🚀")

        cm = confusion_matrix(y_test, y_pred, labels=pipeline.classes_, normalize='true')
        print("\n")
        print(cm)
        print((a_a+g_a+e_a)/3)

        print("Departments analysed: {}".format(len(problematic_departments) + len(good_deparments)))
        print('--------------------------------------------------------------------------------')
    print('--------------------------------------------------------------------------------')



ew
LogisticRegression(C=1, class_weight='balanced', n_jobs=-1, random_state=42)
threshold 0.33
F1 score: 0.37464104263309034
Recall score: 0.9592760180995475
Precision score: 0.23277518528685148
ROC score: 0.5340066314987534


Gender
Requirement failed 😢
Num problematic departments: 3
Num good departments: 3
avg diff: 0.10442316965681592

Ethnicity
Requirement failed 😢
Num problematic departments: 6
Num good departments: 0
avg diff: 0.23957401520848187

 age 
Requirement failed 😢
Num problematic departments: 6
Num good departments: 0
avg diff: 0.1919675841812527


[[0.10873724 0.89126276]
 [0.04072398 0.95927602]]
0.17865492301551686
Departments analysed: 6
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------


ew
LGBMClassifier(class_weight='balanced', max_depth=3, random_state=42)
threshold 0.36
F1 score: 0.3752503894947697
Recall score: 0.9536199095022625
Precision score: 0

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt