In [None]:
# Import libraries

import os
from collections import defaultdict

import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split

## Functions

### Calibration Slope

In [None]:
def logit(p):
    """Return the log-odds ``log(p / (1 - p))`` of ``p``.

    Uses NumPy arithmetic, so ``p`` may be a scalar or an array
    (evaluated element-wise).
    """
    odds = p / (1 - p)
    return np.log(odds)

def calibration_slope(ground_truth, probabilities):
    """Estimate the calibration slope of predicted probabilities.

    Fits an unpenalised logistic regression (with intercept) of the observed
    outcome on the logit of the predicted probabilities and returns the
    fitted coefficient of that single regressor.

    Parameters
    ----------
    ground_truth : array-like
        Observed outcomes, one per prediction. A single-column DataFrame or
        ``(n, 1)`` array is accepted and flattened.
    probabilities : array-like
        Predicted probabilities.

    Returns
    -------
    float
        Coefficient on the logit-transformed probabilities.
    """
    probabilities = np.asarray(probabilities, dtype=float)
    # A probability of exactly 0 or 1 has an infinite logit, which cannot be
    # used as a regressor; nudge such values just inside the interval (0, 1).
    eps = np.finfo(float).eps
    probabilities = np.clip(probabilities, eps, 1 - eps)
    logit_probabilities = logit(probabilities).reshape(-1, 1)

    # Flatten the target so a (n, 1) frame is passed as a plain 1-D vector.
    outcomes = np.asarray(ground_truth).ravel()

    # penalty=None is the supported spelling of "no regularisation"; the
    # string 'none' was removed in scikit-learn 1.4.
    lr = LogisticRegression(penalty=None, fit_intercept=True).fit(logit_probabilities, outcomes)
    return lr.coef_.item()

### Logistic Regression Model

In [None]:
def model_logistic(data_train_X, data_train_y, variable_name):
    """Fit a statsmodels logistic regression and choose a classification threshold.

    The model is fitted on the ``variable_name`` columns of ``data_train_X``
    plus a constant term, and its summary is printed. The threshold is the
    point of the training ROC curve that maximises ``tpr - fpr``; both the
    threshold and that maximum are printed.

    Returns
    -------
    tuple
        ``(fitted results, threshold)``, both derived from the training set.
    """
    # Design matrix: selected predictors with a constant column added
    design = sm.add_constant(data_train_X[variable_name])

    fitted = sm.Logit(data_train_y, design).fit()
    print(fitted.summary())

    # Predicted probabilities on the training data
    train_scores = fitted.predict(design)

    # Threshold at which tpr - fpr is largest on the training ROC curve
    fpr, tpr, cutoffs = roc_curve(data_train_y, train_scores)
    separation = tpr - fpr
    best = np.argmax(separation)
    best_cutoff = cutoffs[best]
    print('threshold is:', best_cutoff)
    print('max tpr-fpr is:', separation[best])

    return fitted, best_cutoff
    
    

### Model Evaluation

In [None]:
def _report_group_metrics(title, X_group, y_group, results_11, thresholds1, variable_name):
    """Print the metrics of one group of test rows and return ``(TPR, FPR, AUC, CL)``."""
    # has_constant='add' forces an intercept column; the default would skip it
    # whenever a predictor happens to be constant within the group.
    design = sm.add_constant(X_group[variable_name], has_constant='add')
    scores = results_11.predict(design)               # predicted probabilities
    predicted = (scores > thresholds1).astype(int)    # predicted labels

    print()
    print(title)
    # labels=[0, 1] keeps the matrix 2x2 even when a class is absent from the group,
    # so the [row, col] lookups below never go out of bounds.
    cm = confusion_matrix(y_group, predicted, labels=[0, 1])
    print(cm)

    # A class with no samples gives 0/0; report nan instead of emitting a warning.
    with np.errstate(divide='ignore', invalid='ignore'):
        tpr = cm[1, 1] / (cm[1, 1] + cm[1, 0])
        fpr = cm[0, 1] / (cm[0, 1] + cm[0, 0])
        accuracy = np.trace(cm) / cm.sum()
    print('TPR is:', tpr)
    print('FPR is:', fpr)
    print('Accuracy is:', accuracy)

    # AUC and calibration slope are undefined when only one class is present.
    if np.unique(np.asarray(y_group)).size < 2:
        auc = slope = float('nan')
    else:
        auc = roc_auc_score(y_group, scores)
        slope = calibration_slope(y_group, scores)
    print('AUC is:', auc)
    print('CL is:', slope)

    return tpr, fpr, auc, slope


def _report_disparity(title, first, second):
    """Print ``first - second`` for each of the TPR, FPR, AUC and CL metrics."""
    print()
    print(title)
    for metric, a, b in zip(('TPR', 'FPR', 'AUC', 'CL'), first, second):
        print('disparity ' + metric + ' is:', a - b)


def model_evaluation(data_test_X_site, data_test_y_site, results_11, thresholds1, variable_name):
    """Print evaluation metrics of a fitted model overall and per subgroup.

    For all rows, and for the subgroups ``race_2 == 1``, ``race_2 == 0``,
    ``nSES == 0`` and ``nSES == 1``, prints the confusion matrix, TPR, FPR,
    accuracy, AUC and calibration slope. It also prints the between-group
    differences (``race_2 == 1`` minus ``race_2 == 0``, and ``nSES == 0``
    minus ``nSES == 1``).

    Parameters
    ----------
    data_test_X_site : DataFrame
        Test features; must contain ``race_2``, ``nSES`` and every column
        listed in ``variable_name``.
    data_test_y_site : DataFrame or Series
        Outcomes, row-aligned with ``data_test_X_site``.
    results_11 : fitted model
        Object whose ``predict`` returns probabilities for a design matrix
        made of a constant plus the ``variable_name`` columns.
    thresholds1 : float
        Probabilities strictly above this value are classified as 1.
    variable_name : list of str
        Predictor columns the model was fitted on.

    Returns
    -------
    None
        Results are printed only. Metrics that are undefined for a group
        (for example AUC when only one class is present) are printed as nan.
    """
    # Subgroup membership is read from the full feature frame, because the
    # grouping columns are not necessarily among the model's predictors.
    is_black = data_test_X_site['race_2'] == 1
    is_other = data_test_X_site['race_2'] == 0
    is_low = data_test_X_site['nSES'] == 0
    is_high = data_test_X_site['nSES'] == 1

    def evaluate(title, mask=None):
        if mask is None:
            X_group, y_group = data_test_X_site, data_test_y_site
        else:
            X_group, y_group = data_test_X_site[mask], data_test_y_site[mask]
        return _report_group_metrics(title, X_group, y_group, results_11, thresholds1, variable_name)

    # evaluation metrics calculation
    evaluate("All Samples")

    black = evaluate("Black Racial Group", is_black)
    other = evaluate("Others Racial Group", is_other)
    _report_disparity("disparity of race", black, other)

    low = evaluate("low nSES Group", is_low)
    high = evaluate("high nSES Group", is_high)
    _report_disparity("disparity of nSES Group", low, high)

      

## Data Subgroups

In [None]:
# Load the preprocessed train and test splits.
data_train = pd.read_csv("../../code/thesis_code/mesa_preprocess_train.csv")
data_test = pd.read_csv("../../code/thesis_code/mesa_preprocess_test.csv")

# site1c goes int -> str so that get_dummies one-hot encodes it as site1c_<code>.
data_train = data_train.astype({'site1c': 'int'}).astype({'site1c': 'str'})
data_test = data_test.astype({'site1c': 'int'}).astype({'site1c': 'str'})

# dtype=int keeps the indicator columns numeric. pandas >= 2.0 returns bool
# indicators by default, and a frame mixing bool and float columns becomes an
# object-dtype array when handed to statsmodels.
# site1c_3 is dropped, which leaves site 3 as the reference category.
train_set_1 = pd.get_dummies(data_train, dtype=int).drop(["site1c_3"], axis=1)
test_set_1 = pd.get_dummies(data_test, dtype=int).drop(["site1c_3"], axis=1)


### Overall Train and Test Set

In [None]:
# Overall train and test sets with dummy variables: the same predictor columns
# are taken from both frames, and each outcome stays a one-column DataFrame.
data_train_X, data_test_X = (
    frame[[
        'gender1',
        'site1c_4', 'site1c_5', 'site1c_6', 'site1c_7', 'site1c_8',
        'race_2', 'cig1c', 'diabet1', 'nSES',
        'age1c', 'chol1', 'hdl1', 'sbp1c',
        'F3_PC2', 'gender_race',
    ]]
    for frame in (train_set_1, test_set_1)
)
data_train_y, data_test_y = (frame[['y']] for frame in (train_set_1, test_set_1))


### Test Set - Site Subgroups

In [None]:
def data_site_groups(train_set_1, test_set_1, site_group):
    """Return the test-set features and outcome for a single site.

    Parameters
    ----------
    train_set_1 : DataFrame
        Not used; kept so that existing calls keep working.
    test_set_1 : DataFrame
        Dummy-coded test set containing the site indicator columns
        ``site1c_4`` ... ``site1c_8``, the predictor columns and ``y``.
    site_group : sequence
        Required values of ``site1c_4, site1c_5, site1c_6, site1c_7,
        site1c_8``, in that order.

    Returns
    -------
    tuple of DataFrame
        ``(X, y)``: the predictor columns and the one-column outcome frame
        for the rows whose site indicators all equal ``site_group``.
    """
    site_columns = ['site1c_4', 'site1c_5', 'site1c_6', 'site1c_7', 'site1c_8']
    feature_columns = [
        'gender1',
        'site1c_4', 'site1c_5', 'site1c_6', 'site1c_7', 'site1c_8',
        'race_2', 'cig1c', 'diabet1', 'nSES',
        'age1c', 'chol1', 'hdl1', 'sbp1c',
        'F3_PC2', 'gender_race',
    ]

    # A row belongs to the site only if every indicator matches its target value.
    in_site = pd.Series(True, index=test_set_1.index)
    for position, column in enumerate(site_columns):
        in_site &= test_set_1[column] == site_group[position]

    site_rows = test_set_1.loc[in_site]
    return site_rows[feature_columns], site_rows[['y']]
      

## Logistic Regression Model Results

* **Model 1 - No Interaction - No nSES** 
    * Model Training: Based on the overall train set
    * Overall Samples Evaluation: Based on the overall test set
    * Site 3 Evaluation: Based on the site 3 subgroups in the test set
    * Site 4 Evaluation: Based on the site 4 subgroups in the test set
    * Site 5 Evaluation: Based on the site 5 subgroups in the test set
    * Site 7 Evaluation: Based on the site 7 subgroups in the test set
    * Site 8 Evaluation: Based on the site 8 subgroups in the test set
    
* **Model 2 - No Interaction - With nSES**
    * Model Training
    * Overall Samples Evaluation
    * Site 3 Evaluation
    * Site 4 Evaluation
    * Site 5 Evaluation
    * Site 7 Evaluation
    * Site 8 Evaluation  
    
* **Model 3 - With Interaction - No nSES**
    * Model Training
    * Overall Samples Evaluation
    * Site 3 Evaluation
    * Site 4 Evaluation
    * Site 5 Evaluation
    * Site 7 Evaluation
    * Site 8 Evaluation  
    
* **Model 4 - With Interaction - With nSES**
    * Model Training
    * Overall Samples Evaluation
    * Site 3 Evaluation
    * Site 4 Evaluation
    * Site 5 Evaluation
    * Site 7 Evaluation
    * Site 8 Evaluation  
    

### Model 1 - No Interaction - No nSES

#### Model 1 Model Training

In [None]:
# Model 1 predictors: no gender-race interaction, no nSES
variable_name = [
    'gender1',
    'site1c_4', 'site1c_5', 'site1c_6', 'site1c_7', 'site1c_8',
    'race_2', 'cig1c', 'diabet1',
    'age1c', 'chol1', 'hdl1', 'sbp1c',
]
# model holds the (fitted results, threshold) pair returned by model_logistic
model = model_logistic(data_train_X, data_train_y, variable_name)


#### Model 1: Overall dataset evaluation: 

In [None]:
# Evaluate the fitted model and its threshold on the full test set
model_evaluation(data_test_X, data_test_y, *model, variable_name)

#### Model 1: Site 3 Evaluation: 

In [None]:
# Site 3: all five site indicators (site1c_4 ... site1c_8) are 0
site_group = [0, 0, 0, 0, 0]
data_site_X, data_site_y = data_site_groups(train_set_1, test_set_1, site_group)
model_evaluation(data_site_X, data_site_y, *model, variable_name)

#### Model 1: Site 4 Evaluation: 

In [None]:
# Site 4: only the site1c_4 indicator is 1
site_group = [1, 0, 0, 0, 0]
data_site_X, data_site_y = data_site_groups(train_set_1, test_set_1, site_group)
model_evaluation(data_site_X, data_site_y, *model, variable_name)

#### Model 1: Site 5 Evaluation: 

In [None]:
# Site 5: only the site1c_5 indicator is 1
site_group = [0, 1, 0, 0, 0]
data_site_X, data_site_y = data_site_groups(train_set_1, test_set_1, site_group)
model_evaluation(data_site_X, data_site_y, *model, variable_name)

#### Model 1: Site 7 Evaluation: 

In [None]:
# Site 7: only the site1c_7 indicator is 1
site_group = [0, 0, 0, 1, 0]
data_site_X, data_site_y = data_site_groups(train_set_1, test_set_1, site_group)
model_evaluation(data_site_X, data_site_y, *model, variable_name)

#### Model 1: Site 8 Evaluation: 

In [None]:
# Site 8: only the site1c_8 indicator is 1
site_group = [0, 0, 0, 0, 1]
data_site_X, data_site_y = data_site_groups(train_set_1, test_set_1, site_group)
model_evaluation(data_site_X, data_site_y, *model, variable_name)

### Model 2 - No Interaction - With nSES

#### Model 2 Model Training: 

In [None]:
# Model 2 predictors: no gender-race interaction, with nSES
variable_name = [
    'gender1',
    'site1c_4', 'site1c_5', 'site1c_6', 'site1c_7', 'site1c_8',
    'race_2', 'nSES', 'cig1c', 'diabet1',
    'age1c', 'chol1', 'hdl1', 'sbp1c',
]
# model holds the (fitted results, threshold) pair returned by model_logistic
model = model_logistic(data_train_X, data_train_y, variable_name)


#### Model 2: Overall dataset evaluation:  

In [None]:
# Evaluate the fitted model and its threshold on the full test set
model_evaluation(data_test_X, data_test_y, *model, variable_name)

#### Model 2: Site 3 Evaluation: 

In [None]:
# Site 3: all five site indicators (site1c_4 ... site1c_8) are 0
site_group = [0, 0, 0, 0, 0]
data_site_X, data_site_y = data_site_groups(train_set_1, test_set_1, site_group)
model_evaluation(data_site_X, data_site_y, *model, variable_name)

#### Model 2: Site 4 Evaluation: 

In [None]:
# Site 4: only the site1c_4 indicator is 1
site_group = [1, 0, 0, 0, 0]
data_site_X, data_site_y = data_site_groups(train_set_1, test_set_1, site_group)
model_evaluation(data_site_X, data_site_y, *model, variable_name)

#### Model 2: Site 5 Evaluation: 

In [None]:
# Site 5: only the site1c_5 indicator is 1
site_group = [0, 1, 0, 0, 0]
data_site_X, data_site_y = data_site_groups(train_set_1, test_set_1, site_group)
model_evaluation(data_site_X, data_site_y, *model, variable_name)

#### Model 2: Site 7 Evaluation: 

In [None]:
# Site 7: only the site1c_7 indicator is 1
site_group = [0, 0, 0, 1, 0]
data_site_X, data_site_y = data_site_groups(train_set_1, test_set_1, site_group)
model_evaluation(data_site_X, data_site_y, *model, variable_name)

#### Model 2: Site 8 Evaluation: 

In [None]:
# Site 8: only the site1c_8 indicator is 1
site_group = [0, 0, 0, 0, 1]
data_site_X, data_site_y = data_site_groups(train_set_1, test_set_1, site_group)
model_evaluation(data_site_X, data_site_y, *model, variable_name)

### Model 3 - With Interaction - No nSES

#### Model 3 Model Training

In [None]:
# Model 3 predictors: with gender-race interaction, no nSES
variable_name = [
    'gender1',
    'site1c_4', 'site1c_5', 'site1c_6', 'site1c_7', 'site1c_8',
    'race_2', 'cig1c', 'diabet1',
    'age1c', 'chol1', 'hdl1', 'sbp1c',
    'gender_race',
]
# model holds the (fitted results, threshold) pair returned by model_logistic
model = model_logistic(data_train_X, data_train_y, variable_name)

#### Model 3: Overall dataset evaluation: 

In [None]:
# Evaluate the fitted model and its threshold on the full test set
model_evaluation(data_test_X, data_test_y, *model, variable_name)

#### Model 3: Site 3 Evaluation: 

In [None]:
# Site 3: all five site indicators (site1c_4 ... site1c_8) are 0
site_group = [0, 0, 0, 0, 0]
data_site_X, data_site_y = data_site_groups(train_set_1, test_set_1, site_group)
model_evaluation(data_site_X, data_site_y, *model, variable_name)

#### Model 3: Site 4 Evaluation: 

In [None]:
# Site 4: only the site1c_4 indicator is 1
site_group = [1, 0, 0, 0, 0]
data_site_X, data_site_y = data_site_groups(train_set_1, test_set_1, site_group)
model_evaluation(data_site_X, data_site_y, *model, variable_name)

#### Model 3: Site 5 Evaluation: 

In [None]:
# Site 5: only the site1c_5 indicator is 1
site_group = [0, 1, 0, 0, 0]
data_site_X, data_site_y = data_site_groups(train_set_1, test_set_1, site_group)
model_evaluation(data_site_X, data_site_y, *model, variable_name)

#### Model 3: Site 7 Evaluation: 

In [None]:
# Site 7: only the site1c_7 indicator is 1
site_group = [0, 0, 0, 1, 0]
data_site_X, data_site_y = data_site_groups(train_set_1, test_set_1, site_group)
model_evaluation(data_site_X, data_site_y, *model, variable_name)

#### Model 3: Site 8 Evaluation: 

In [None]:
# Site 8: only the site1c_8 indicator is 1
site_group = [0, 0, 0, 0, 1]
data_site_X, data_site_y = data_site_groups(train_set_1, test_set_1, site_group)
model_evaluation(data_site_X, data_site_y, *model, variable_name)

### Model 4 - With Interaction - With nSES

In [None]:
# Model 4 predictors: with gender-race interaction, with nSES
variable_name = [
    'gender1',
    'site1c_4', 'site1c_5', 'site1c_6', 'site1c_7', 'site1c_8',
    'race_2', 'nSES', 'cig1c', 'diabet1',
    'age1c', 'chol1', 'hdl1', 'sbp1c',
    'gender_race',
]
# model holds the (fitted results, threshold) pair returned by model_logistic
model = model_logistic(data_train_X, data_train_y, variable_name)

#### Model 4: Overall dataset evaluation: 

In [None]:
# Evaluate the fitted model and its threshold on the full test set
model_evaluation(data_test_X, data_test_y, *model, variable_name)

#### Model 4: Site 3 Evaluation: 

In [None]:
# Site 3: all five site indicators (site1c_4 ... site1c_8) are 0
site_group = [0, 0, 0, 0, 0]
data_site_X, data_site_y = data_site_groups(train_set_1, test_set_1, site_group)
model_evaluation(data_site_X, data_site_y, *model, variable_name)

#### Model 4: Site 4 Evaluation: 

In [None]:
# Site 4: only the site1c_4 indicator is 1
site_group = [1, 0, 0, 0, 0]
data_site_X, data_site_y = data_site_groups(train_set_1, test_set_1, site_group)
model_evaluation(data_site_X, data_site_y, *model, variable_name)

#### Model 4: Site 5 Evaluation: 

In [None]:
# Site 5: only the site1c_5 indicator is 1
site_group = [0, 1, 0, 0, 0]
data_site_X, data_site_y = data_site_groups(train_set_1, test_set_1, site_group)
model_evaluation(data_site_X, data_site_y, *model, variable_name)

#### Model 4: Site 7 Evaluation: 

In [None]:
# Site 7: only the site1c_7 indicator is 1
site_group = [0, 0, 0, 1, 0]
data_site_X, data_site_y = data_site_groups(train_set_1, test_set_1, site_group)
model_evaluation(data_site_X, data_site_y, *model, variable_name)

#### Model 4: Site 8 Evaluation: 

In [None]:
# Site 8: only the site1c_8 indicator is 1
site_group = [0, 0, 0, 0, 1]
data_site_X, data_site_y = data_site_groups(train_set_1, test_set_1, site_group)
model_evaluation(data_site_X, data_site_y, *model, variable_name)