In [None]:
# Import the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.metrics import (
    roc_curve,
    confusion_matrix,
    accuracy_score,
    classification_report,
    roc_auc_score,
    f1_score,
    precision_score,
    recall_score,
)
from plot_metric.functions import BinaryClassification
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.preprocessing import PowerTransformer

# Use seaborn's white-grid theme for every figure drawn in this notebook
sns.set_style("whitegrid")

# Allow pandas to render up to 200 rows before truncating a displayed frame
pd.set_option("display.max_rows", 200)

def Find_Optimal_Cutoff(target, predicted):
    '''
    Pick a data-driven classification cut-off by maximising Youden's index.

    Youden's index is sensitivity + specificity - 1, which equals the true
    positive rate minus the false positive rate. The ROC curve is computed for
    the supplied scores and the threshold with the largest index is returned.

    Parameters
    ----------
    target : array, shape = [n_samples]
        True binary labels.

    predicted : array, shape = [n_samples]
        Target scores, can either be probability estimates of the positive class,
        confidence values, or non-thresholded measure of decisions (as returned by
        "decision_function" on some classifiers).

    Returns
    -------
    threshold : scalar
        The entry of the ROC thresholds at which tpr - fpr is largest. When
        several thresholds tie, the first one in the ROC threshold array is used.

    References
    ----------
    Ewald, B. (2006). Post hoc choice of cut points introduced bias to diagnostic research.
    Journal of clinical epidemiology, 59(8), 798-801.

    Steyerberg, E.W., Van Calster, B., & Pencina, M.J. (2011). Performance measures for
    prediction models and markers: evaluation of predictions and classifications.
    Revista Espanola de Cardiologia (English Edition), 64(9), 788-794.

    Jiménez-Valverde, A., & Lobo, J.M. (2007). Threshold criteria for conversion of probability
    of species presence to either–or presence–absence. Acta oecologica, 31(3), 361-369.
    '''
    false_pos_rate, true_pos_rate, cutoffs = roc_curve(target, predicted)

    # Youden's J for every candidate cut-off on the ROC curve
    youden_j = true_pos_rate - false_pos_rate

    best_position = youden_j.argmax()
    return cutoffs[best_position]

def train_model(data_final):
    '''
    Fit a logistic regression on a fixed feature list and evaluate it on a hold-out split.

    The data are split 90/10 (random_state=0), a statsmodels Logit model with an
    intercept is fitted on the training part, and the test part is scored. A
    cut-off is chosen with Find_Optimal_Cutoff (Youden's index) and used to turn
    the predicted probabilities into 0/1 predictions.

    Parameters
    ----------
    data_final : pandas.DataFrame
        Must contain every column named in the feature list below and the
        outcome column 'police_interaction_followup'.

    Returns
    -------
    tuple
        (df_results, X_train, X_test, y_train, y_test, prediction, log_reg,
        feature_list_for_training, y) where

        - df_results is a one-row DataFrame whose columns 0..3 hold, in order,
          F1, ROC AUC, sensitivity (recall) and precision, rounded to 2 decimals;
        - X_train / X_test / y_train / y_test are the split design matrices
          (including the constant column) and labels;
        - prediction is the 0/1 Series obtained by thresholding the test-set
          probabilities;
        - log_reg is the fitted statsmodels results object;
        - feature_list_for_training is the list of feature names used;
        - y is the full outcome column.

    Raises
    ------
    KeyError
        If a required column is missing from data_final.
    '''
    # List of features to use for training.
    # NOTE(review): 'age_30-39' is not listed - presumably it is the reference
    # category of the age dummies; confirm against the preprocessing step.
    feature_list_for_training = [
        'sex_M',
        'age_18-29',  'age_40-49', 'age_50-59', 'age_60+',
        'substance', 'mood', 'anxiety', 'psychotic', 'cognitive', 'otherpsych', 'selfharm',
        'visit_emr_MH_non_elect', 'visit_emr_NonMH', 'visit_emr_visit',
        'visit_hosp_visit', 'visit_hospitalized_MH', 'visit_hospitalized_NonMH',
        'visit_family_gp', 'visit_im', 'visit_neurology', 'visit_other', 'visit_pharmacy', 'visit_psychiatry',
        'EX_CHF', 'EX_Arrhy', 'EX_VD', 'EX_PCD', 'EX_PVD', 'EX_HPTN_UC',
        'EX_HPTN_C', 'EX_Para', 'Ex_OthND', 'Ex_COPD', 'Ex_Diab_UC',
        'Ex_Diab_C', 'Ex_Hptothy', 'Ex_RF', 'Ex_LD', 'Ex_PUD_NB', 'Ex_HIV',
        'Ex_Lymp', 'Ex_METS', 'Ex_Tumor', 'Ex_Rheum_A', 'Ex_Coag', 'Ex_Obesity',
        'Ex_WL', 'Ex_Fluid', 'Ex_BLA', 'Ex_DA', 'Ex_Alcohol', 'Ex_Drug',
        'Ex_Psycho', 'Ex_Dep', 'Ex_Stroke', 'Ex_Dyslipid', 'Ex_Sleep', 'Ex_IHD',
        'EX_Fall', 'EX_Urinary', 'EX_Visual', 'EX_Hearing', 'EX_Tobacco',
        'EX_Delirium', 'Ex_MS', 'EX_parkinsons',
    ]

    # Separate the features for training
    X = data_final[feature_list_for_training]

    # Separate the labels for training
    y = data_final['police_interaction_followup']

    # statsmodels does not add an intercept on its own, so add the constant column
    X = sm.add_constant(X)

    # Split the dataset into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

    # Fit the logistic regression model
    log_reg = sm.Logit(y_train, X_train).fit()

    # Predict probabilities for the test set
    yhat = log_reg.predict(X_test)

    # Find optimal probability threshold by using yhat as a continuous measure.
    # NOTE(review): the cut-off is tuned on the same test split it is evaluated
    # on, so the thresholded metrics below are optimistic (cf. Ewald, 2006).
    threshold = Find_Optimal_Cutoff(y_test, yhat)

    # ROC thresholds are defined for "score >= threshold", so the comparison must
    # be inclusive; a strict ">" would drop the observation sitting exactly on the
    # chosen cut-off and evaluate a different operating point than the one selected.
    prediction = pd.Series(yhat).ge(threshold).astype(int)

    # Calculate evaluation metrics
    f1 = f1_score(y_test, prediction)
    # AUC is a ranking metric: it has to be computed from the continuous
    # probabilities, not from the already-thresholded 0/1 predictions.
    roc_auc = roc_auc_score(y_test, yhat)
    sensitivity = recall_score(y_test, prediction)
    precision = precision_score(y_test, prediction)

    # One-row DataFrame of the metrics; columns 0..3 = F1, AUC, sensitivity, precision
    df_results = pd.DataFrame(
        [[round(f1, 2), round(roc_auc, 2), round(sensitivity, 2), round(precision, 2)]]
    )

    return df_results, X_train, X_test, y_train, y_test, prediction, log_reg, feature_list_for_training, y



In [None]:

# Read the data
data_final = pd.read_hdf('data/df_subjects_retro_policing_preproc.h5', 'df_subjects_retro_policing_preproc')

print("Display unique values of the 'police_interaction_followup' column")
print(data_final['police_interaction_followup'].unique())

# Convert 'age' column to integer
data_final['age'] = data_final['age'].map(int)

# Call the train_model function and store the results
df_results, X_train, X_test, y_train, y_test, prediction, log_reg, feature_list_for_training, y = train_model(data_final)

# Predicted probabilities for the test set. Ranking-based outputs (AUC, ROC curve)
# need these continuous scores; the 0/1 `prediction` is only for threshold-based metrics.
yhat_test = log_reg.predict(X_test)

# Confusion matrix for the test dataset
cm = confusion_matrix(y_test, prediction)
print("Confusion Matrix:\n", cm)

# Accuracy score of the model
print("Test accuracy =", accuracy_score(y_test, prediction))

# Classification report
print(classification_report(y_test, prediction))

# F1 score
print("F1 Score: {}".format(f1_score(y_test, prediction)))

# ROC AUC score, computed from probabilities. AUC is a fraction in [0, 1],
# so it is printed without a percent sign.
roc_auc = roc_auc_score(y_test, yhat_test)
print("AUC: %.2f" % roc_auc)

# Visualisation with plot_metric. It expects scores rather than hard labels;
# passing 0/1 predictions would collapse the ROC curve to a single corner point.
# The Youden cut-off is passed so the marked operating point matches the one used.
bc = BinaryClassification(
    y_test,
    yhat_test,
    labels=["Class 1", "Class 2"],
    threshold=Find_Optimal_Cutoff(y_test, yhat_test),
)

# Plot ROC curve
plt.figure(figsize=(5, 5))
bc.plot_roc_curve()
plt.show()


# Odds ratios

In [None]:
result = log_reg

# Odds ratios are the exponentiated logit coefficients. Exponentiating the
# coefficient confidence bounds (conf_int() default, i.e. the 2.5% and 97.5%
# limits) gives the matching interval on the odds-ratio scale.
df_results = np.round(np.exp(result.conf_int()), 2)
df_results.columns = ['2.5%', '97.5%']

# Assign the Series directly (aligned on the parameter names) instead of
# wrapping it in a one-column DataFrame.
df_results['OR'] = np.round(np.exp(result.params), 2)

# Calculate the p-values.
# NOTE(review): rounding to 2 decimals shows any p < 0.005 as 0.0.
df_results['P_value'] = np.round(result.pvalues.values, 2)

# Both confidence bounds are already rounded to two decimals above and are kept
# numeric, so the two columns share a dtype and sort/compare as numbers
# (previously only '97.5%' was converted to formatted strings).
df_results[['2.5%', '97.5%']] = df_results[['2.5%', '97.5%']].astype(float)



In [None]:
# Show the first 30 rows of the odds-ratio table (in model order), then rank
# those rows from the largest to the smallest odds ratio.
(
    df_results
    .loc[:, ['OR', '2.5%', '97.5%', 'P_value']]
    .head(30)
    .sort_values(by="OR", ascending=False)
)