In [None]:
# Import the libraries
import pandas as pd
import numpy as np
# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

from datetime import datetime, timedelta

from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.metrics import roc_curve, confusion_matrix, accuracy_score, classification_report, roc_auc_score, f1_score, precision_score, recall_score
from plot_metric.functions import BinaryClassification
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.preprocessing import PowerTransformer

pd.options.display.max_rows = 200

def Find_Optimal_Cutoff(target, predicted):
    """Return the score cut-off that maximises Youden's index.

    Youden's index is sensitivity + specificity - 1, which equals
    TPR - FPR at every point of the ROC curve. The ROC curve is computed
    with ``roc_curve`` and the threshold belonging to the point with the
    largest index is returned.

    Parameters
    ----------
    target : array, shape = [n_samples]
        True binary labels.

    predicted : array, shape = [n_samples]
        Target scores: probability estimates of the positive class,
        confidence values, or a non-thresholded decision measure (such as
        the output of "decision_function" on some classifiers).

    Returns
    -------
    The element of the ``roc_curve`` thresholds array at which TPR - FPR is
    largest (the first such element when there are ties).

    References
    ----------
    Ewald, B. (2006). Post hoc choice of cut points introduced bias to diagnostic research.
    Journal of clinical epidemiology, 59(8), 798-801.

    Steyerberg, E.W., Van Calster, B., & Pencina, M.J. (2011). Performance measures for
    prediction models and markers: evaluation of predictions and classifications.
    Revista Espanola de Cardiologia (English Edition), 64(9), 788-794.

    Jiménez-Valverde, A., & Lobo, J.M. (2007). Threshold criteria for conversion of probability
    of species presence to either–or presence–absence. Acta oecologica, 31(3), 361-369.
    """
    false_pos_rate, true_pos_rate, cutoffs = roc_curve(target, predicted)
    # Youden's J statistic at each candidate cut-off.
    youden_j = true_pos_rate - false_pos_rate
    best_position = youden_j.argmax()
    return cutoffs[best_position]


def train_model(data_final):
    """Fit a logistic regression on the policing data and evaluate it on a hold-out set.

    Pipeline: select the feature columns, split 90/10 into train and test,
    fit a ``PowerTransformer`` on the training rows only and apply it to both
    parts, add an intercept column, randomly oversample the minority class in
    the training part, fit ``statsmodels`` ``Logit``, choose a probability
    cut-off with ``Find_Optimal_Cutoff`` and compute evaluation metrics on
    the test part.

    Parameters
    ----------
    data_final : pandas.DataFrame
        Must contain every column named in ``feature_list_for_training`` and
        the binary label column ``'police_interaction_followup'``.

    Returns
    -------
    tuple
        ``(df_results, X_train, X_test, y_train, y_test, prediction, log_reg,
        feature_list_for_training, y)`` where

        * ``df_results`` is a one-row DataFrame holding, in column order
          0..3, the rounded F1, ROC AUC, sensitivity (recall) and precision;
        * ``X_train`` / ``y_train`` are the transformed, oversampled training
          data (``X_train`` includes the ``const`` column);
        * ``X_test`` / ``y_test`` are the transformed hold-out data;
        * ``prediction`` is the 0/1 Series obtained by thresholding the
          predicted probabilities for ``X_test``;
        * ``log_reg`` is the fitted Logit results object;
        * ``y`` is the full, unsplit label Series.
    """
    feature_list_for_training = [
        'sex_M',
        'age',
        'substance',
        'mood',
        'anxiety',
        'psychotic',
        'cognitive',
        'otherpsych',
        'selfharm',
        'visit_emr_MH_non_elect',
        'visit_emr_NonMH',
        'visit_emr_visit',
        'visit_hosp_visit',
        'visit_hospitalized_MH',
        'visit_hospitalized_NonMH',
        'visit_family_gp',
        'visit_im',
        'visit_neurology',
        'visit_other',
        'visit_pharmacy',
        'visit_psychiatry',
        'EX_CHF',
        'EX_Arrhy',
        'EX_VD',
        'EX_PCD',
        'EX_PVD',
        'EX_HPTN_UC',
        'EX_HPTN_C',
        'EX_Para',
        'Ex_OthND',
        'Ex_COPD',
        'Ex_Diab_UC',
        'Ex_Diab_C',
        'Ex_Hptothy',
        'Ex_RF',
        'Ex_LD',
        'Ex_PUD_NB',
        'Ex_HIV',
        'Ex_Lymp',
        'Ex_METS',
        'Ex_Tumor',
        'Ex_Rheum_A',
        'Ex_Coag',
        'Ex_Obesity',
        'Ex_WL',
        'Ex_Fluid',
        'Ex_BLA',
        'Ex_DA',
        'Ex_Alcohol',
        'Ex_Drug',
        'Ex_Psycho',
        'Ex_Dep',
        'Ex_Stroke',
        'Ex_Dyslipid',
        'Ex_Sleep',
        'Ex_IHD',
        'EX_Fall',
        'EX_Urinary',
        'EX_Visual',
        'EX_Hearing',
        'EX_Tobacco',
        'EX_Delirium',
        'Ex_MS',
        'EX_parkinsons',
    ]

    # Separate the features and the labels for training
    X = data_final[feature_list_for_training]
    y = data_final['police_interaction_followup']

    # Split BEFORE fitting any preprocessing so that no information from the
    # test rows leaks into the transformer. The partition depends only on the
    # number of rows and random_state, so the same rows end up in the test set
    # as when splitting after the transform.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

    # Fit the power transform on the training rows only, then apply it to both
    # parts. The original index is kept so features and labels stay aligned.
    scaler_PT = PowerTransformer()
    X_train = pd.DataFrame(
        scaler_PT.fit_transform(X_train), columns=X_train.columns, index=X_train.index
    )
    X_test = pd.DataFrame(
        scaler_PT.transform(X_test), columns=X_test.columns, index=X_test.index
    )

    # Add the constant to the X features. has_constant='add' forces the column
    # even if some feature happens to be constant within one of the parts;
    # otherwise train and test could end up with different column sets.
    X_train = sm.add_constant(X_train, has_constant='add')
    X_test = sm.add_constant(X_test, has_constant='add')

    # Oversample the minority class to address class imbalance (training data
    # only). SMOTE(random_state=42) can be used here as an alternative sampler.
    over_sampler = RandomOverSampler(random_state=42)
    X_train, y_train = over_sampler.fit_resample(X_train, y_train)

    print("### Classes percentage in the training as there are no longer any imbalances:")
    print(y_train.value_counts() * 100 / len(y_train))

    # Fit the logistic regression model
    log_reg = sm.Logit(y_train, X_train).fit()

    # yhat are the predicted probabilities of the positive class on the test set
    yhat = log_reg.predict(X_test)

    # Find the probability threshold that maximises Youden's index.
    # NOTE(review): the cut-off is tuned on the same test data that the metrics
    # below are reported on, which makes those metrics optimistic. Consider
    # choosing it on training or validation data instead.
    threshold = Find_Optimal_Cutoff(y_test, yhat)

    # roc_curve thresholds mean "predict positive when score >= threshold", so
    # the comparison must be inclusive to reproduce the selected ROC point.
    prediction = (pd.Series(yhat) >= threshold).astype(int)

    # Calculate evaluation metrics. ROC AUC is a ranking metric and therefore
    # uses the probabilities; the other metrics use the thresholded labels.
    f1 = f1_score(y_test, prediction)
    roc_auc = roc_auc_score(y_test, yhat)
    sensitivity = recall_score(y_test, prediction)
    precision = precision_score(y_test, prediction)

    # One-row dataframe with columns 0..3: F1, ROC AUC, sensitivity, precision
    df_results = pd.DataFrame(
        [[round(f1, 2), round(roc_auc, 2), round(sensitivity, 2), round(precision, 2)]]
    )

    return (df_results, X_train, X_test, y_train, y_test, prediction, log_reg, feature_list_for_training, y)


In [None]:
# Read the data from the HDF file
data_final = pd.read_hdf('data/df_subjects_retro_policing.h5', key='df_subjects_retro_policing')

# Convert 'police_interaction_followup' to a binary label:
# missing or zero -> 0, anything else -> 1
data_final['police_interaction_followup'] = (
    data_final['police_interaction_followup'].fillna(0).ne(0).astype(int)
)

# Create dummy variables for the 'sex' column
data_final = data_final.join(pd.get_dummies(data_final['sex'], prefix='sex'))


def _age_bucket(age):
    """Return the age-band label for *age*; '' when it is below 18 or not comparable (e.g. NaN)."""
    if 18 <= age < 30:
        return '18-29'
    if 30 <= age < 40:
        return '30-39'
    if 40 <= age < 50:
        return '40-49'
    if 50 <= age < 60:
        return '50-59'
    if age >= 60:
        return '60+'
    return ''


# Create age categories and create dummy variables for the 'age_categorical' column
data_final['age_categorical'] = data_final['age'].map(_age_bucket)
data_final = data_final.join(pd.get_dummies(data_final['age_categorical'], prefix='age'))

print("unique values of 'police_interaction_followup' column")
print(data_final['police_interaction_followup'].unique())

# Convert 'age' column to integer
data_final['age'] = data_final['age'].astype(int)

# Train the model and obtain the results
df_results, X_train, X_test, y_train, y_test, prediction, log_reg, feature_list_for_training, y = train_model(data_final)

# Predicted probabilities on the test set. Ranking-based evaluation (AUC, ROC
# curve) needs the continuous scores, not the thresholded 0/1 labels.
yhat = log_reg.predict(X_test)

# Confusion matrix for the test dataset
cm = confusion_matrix(y_test, prediction)
print("Confusion Matrix: \n", cm)

# Accuracy score of the model
print('Test accuracy = ', accuracy_score(y_test, prediction))

# Classification report
print(classification_report(y_test, prediction))

# F1 score
print("F1 Score: {}".format(f1_score(y_test, prediction)))

# ROC AUC score, computed on probabilities. It is a value in [0, 1], not a
# percentage, so no '%' sign is printed.
roc_auc = roc_auc_score(y_test, yhat)
print('AUC: %.2f' % roc_auc)

# Visualisation with plot_metric: pass the probabilities together with the
# Youden cut-off (the same rule train_model uses) so that the full ROC curve is
# drawn and the marked operating point matches the classifier.
threshold = Find_Optimal_Cutoff(y_test, yhat)
bc = BinaryClassification(y_test, yhat, labels=["Class 1", "Class 2"], threshold=threshold)

# Plot ROC curve
plt.figure(figsize=(5, 5))
bc.plot_roc_curve()
plt.show()


In [None]:
# Save the final data to an HDF file.
# `key` is passed by keyword: positional use is deprecated in recent pandas.
# NOTE(review): this writes under 'retro_ARC_Policing_training/data/' while the
# input was read from 'data/' — confirm the intended working directory.
data_final.to_hdf('retro_ARC_Policing_training/data/retro_data_final_policing.h5', key='retro_data_final_policing')
