In [None]:
# Import the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.metrics import roc_curve, confusion_matrix, accuracy_score, classification_report, roc_auc_score, f1_score, precision_score, recall_score
from plot_metric.functions import BinaryClassification
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.preprocessing import PowerTransformer

pd.options.display.max_rows = 200

def Find_Optimal_Cutoff(target, predicted):
    """Return the score threshold that maximises Youden's index.

    Youden's index is sensitivity + specificity - 1, i.e. the true positive
    rate minus the false positive rate at a given threshold. The candidate
    thresholds are the ones produced by ``roc_curve``; if several of them
    reach the maximum, the first one in ``roc_curve``'s output order wins.

    Parameters
    ----------
    target : array, shape = [n_samples]
        True binary labels.

    predicted : array, shape = [n_samples]
        Target scores: probability estimates of the positive class,
        confidence values, or a non-thresholded decision measure (such as
        the output of "decision_function" on some classifiers).

    Returns
    -------
    The entry of the thresholds array returned by ``roc_curve`` at which
    (true positive rate - false positive rate) is largest.

    References
    ----------
    Ewald, B. (2006). Post hoc choice of cut points introduced bias to diagnostic research.
    Journal of clinical epidemiology, 59(8), 798-801.

    Steyerberg, E.W., Van Calster, B., & Pencina, M.J. (2011). Performance measures for
    prediction models and markers: evaluation of predictions and classifications.
    Revista Espanola de Cardiologia (English Edition), 64(9), 788-794.

    Jiménez-Valverde, A., & Lobo, J.M. (2007). Threshold criteria for conversion of probability
    of species presence to either-or presence-absence. Acta oecologica, 31(3), 361-369.
    """
    false_positive_rate, true_positive_rate, cutoffs = roc_curve(target, predicted)
    # Specificity is 1 - FPR, so sensitivity + specificity - 1 reduces to TPR - FPR.
    youden_index = true_positive_rate - false_positive_rate
    best_position = np.argmax(youden_index)
    return cutoffs[best_position]

# Load the preprocessed table from the HDF5 store (the key is the file stem).
data_final = pd.read_hdf('data/df_sub_police_interaction_preproc.h5', 'df_sub_police_interaction_preproc')

# Convert the age column to integer with a single vectorised cast.
# Calling `.astype` on each element via `apply` only works when every element
# is a NumPy object; `Series.astype` works for any numeric column and raises
# if a value cannot be represented as an integer (e.g. a missing value).
data_final['age'] = data_final['age'].astype(int)

# Columns used as model inputs, in the order they will appear in the design
# matrix. The names are used verbatim as DataFrame column keys, so the
# inconsistent `EX_` / `Ex_` casing below must be kept exactly as spelled.
feature_list_for_training = (
    # Demographic columns.
    ['sex_M', 'age']
    # Diagnosis-category columns (presumably mental-health diagnosis flags;
    # confirm against the preprocessing step).
    + [
        'substance', 'mood', 'anxiety', 'psychotic',
        'cognitive', 'otherpsych', 'selfharm',
    ]
    # `visit_*` columns (presumably health-service utilisation; confirm).
    + [
        'visit_emr_MH_non_elect', 'visit_emr_NonMH', 'visit_emr_visit',
        'visit_hosp_visit', 'visit_hospitalized_MH', 'visit_hospitalized_NonMH',
        'visit_family_gp', 'visit_im', 'visit_neurology',
        'visit_other', 'visit_pharmacy', 'visit_psychiatry',
    ]
    # `EX_*` / `Ex_*` columns (presumably comorbidity indicators; confirm).
    + [
        'EX_CHF', 'EX_Arrhy', 'EX_VD', 'EX_PCD', 'EX_PVD',
        'EX_HPTN_UC', 'EX_HPTN_C', 'EX_Para',
        'Ex_OthND', 'Ex_COPD', 'Ex_Diab_UC', 'Ex_Diab_C', 'Ex_Hptothy',
        'Ex_RF', 'Ex_LD', 'Ex_PUD_NB', 'Ex_HIV', 'Ex_Lymp',
        'Ex_METS', 'Ex_Tumor', 'Ex_Rheum_A', 'Ex_Coag', 'Ex_Obesity',
        'Ex_WL', 'Ex_Fluid', 'Ex_BLA', 'Ex_DA', 'Ex_Alcohol',
        'Ex_Drug', 'Ex_Psycho', 'Ex_Dep', 'Ex_Stroke', 'Ex_Dyslipid',
        'Ex_Sleep', 'Ex_IHD',
        'EX_Fall', 'EX_Urinary', 'EX_Visual', 'EX_Hearing',
        'EX_Tobacco', 'EX_Delirium',
        'Ex_MS', 'EX_parkinsons',
    ]
)

# Target: the police-interaction label column.
y = data_final['police_interaction']

# Design matrix: the selected feature columns with a constant column added
# via statsmodels' add_constant.
X = sm.add_constant(data_final[feature_list_for_training])

# Hold out 10% of the rows for testing; the seed is fixed so the split is
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=0,
)

######################################

# Show the class distribution (in percent) of the training labels so the
# imbalance is visible.
print("### Classes percentage in the training set as there is imbalance:")
print(y_train.value_counts() * 100 / len(y_train))

# Fit the logistic regression on the training split.
log_reg = sm.Logit(y_train, X_train).fit()

# Score the held-out split; yhat holds the predicted probabilities.
yhat = log_reg.predict(X_test)

# Pick the probability cut-off that maximises Youden's index, using yhat as a
# continuous score.
# NOTE(review): the cut-off is tuned on the same test split that is evaluated
# below, so the reported metrics are optimistic. Consider choosing it on the
# training split or on a separate validation split.
threshold = Find_Optimal_Cutoff(y_test, yhat)
print(threshold)

# Convert probabilities to class labels. roc_curve counts a score equal to
# the threshold as positive (score >= threshold), so `>=` is required to land
# on the operating point that was actually selected. To use the conventional
# cut-off instead, set `threshold = 0.5` before this line.
prediction = (pd.Series(yhat) >= threshold).astype(int)

# Confusion matrix for the test dataset
cm = confusion_matrix(y_test, prediction)
print("Confusion Matrix:\n", cm)

# Accuracy score of the model
print("Test accuracy =", accuracy_score(y_test, prediction))

# Classification report
print(classification_report(y_test, prediction))

# F1 Score
print("F1 Score:", f1_score(y_test, prediction))

# AUC is a ranking metric, so it is computed from the predicted probabilities
# rather than from the thresholded labels. It is a fraction in [0, 1], hence
# no percent sign in the output.
roc_auc = roc_auc_score(y_test, yhat)
print("AUC: %.2f" % roc_auc)

# Visualisation with plot_metric: pass the probabilities (not the hard
# labels) so the ROC curve has more than one operating point, together with
# the cut-off chosen above.
bc = BinaryClassification(y_test, yhat, labels=["Class 1", "Class 2"], threshold=threshold)

# Figures
plt.figure(figsize=(5, 5))
bc.plot_roc_curve()
plt.show()


# Balance the classes with oversampling

In [None]:
# This cell balances the classes only; no normalisation is applied.
# Rebuild the design matrix (selected features plus a constant column).
# `y` is the label series defined in the previous cell.
X = sm.add_constant(data_final[feature_list_for_training])

# Same 90/10 split and seed as before:
#   X_train / y_train - features and labels used to fit the model
#   X_test  / y_test  - features and labels used to evaluate it
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=0,
)

# NOTE(review): this transformer is created but not used in the code visible
# here; confirm whether a later cell relies on it.
scaler_PT = PowerTransformer()

# Oversample the training split only, so the test split keeps its original
# class distribution. To try SMOTE instead, swap in:
#   over_sampler = SMOTE(random_state=42)
over_sampler = RandomOverSampler(random_state=42)
X_train, y_train = over_sampler.fit_resample(X_train, y_train)

######################################

# Show the class distribution (in percent) of the training labels after
# oversampling.
print("### Classes percentage in the training set after balancing:")
print(y_train.value_counts() * 100 / len(y_train))

# Fit the logistic regression on the (resampled) training split.
log_reg = sm.Logit(y_train, X_train).fit()

# Score the held-out split; yhat holds the predicted probabilities.
yhat = log_reg.predict(X_test)

# Pick the probability cut-off that maximises Youden's index, using yhat as a
# continuous score.
# NOTE(review): the cut-off is tuned on the same test split that is evaluated
# below, so the reported metrics are optimistic. Consider choosing it on the
# training split or on a separate validation split.
threshold = Find_Optimal_Cutoff(y_test, yhat)
print(threshold)

# Convert probabilities to class labels. roc_curve counts a score equal to
# the threshold as positive (score >= threshold), so `>=` is required to land
# on the operating point that was actually selected.
prediction = (pd.Series(yhat) >= threshold).astype(int)

# Confusion matrix
cm = confusion_matrix(y_test, prediction)
print("Confusion Matrix:\n", cm)

# Accuracy score of the model
print("Test accuracy =", accuracy_score(y_test, prediction))

# Classification report
print(classification_report(y_test, prediction))

# F1 Score
print("F1 Score:", f1_score(y_test, prediction))

# AUC is a ranking metric, so it is computed from the predicted probabilities
# rather than from the thresholded labels. It is a fraction in [0, 1], hence
# no percent sign in the output.
roc_auc = roc_auc_score(y_test, yhat)
print("AUC: %.2f" % roc_auc)

# Visualisation with plot_metric: pass the probabilities (not the hard
# labels) so the ROC curve has more than one operating point, together with
# the cut-off chosen above.
bc = BinaryClassification(y_test, yhat, labels=["Class 1", "Class 2"], threshold=threshold)

# Figures
plt.figure(figsize=(5, 5))
bc.plot_roc_curve()
plt.show()
