In [None]:
import pandas as pd
import numpy as np

import random
import matplotlib.pyplot as plt
import statistics
from sklearn import metrics
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import export_text

import warnings
# Silence all warnings for cleaner notebook output.
# NOTE(review): this also hides scikit-learn fit/convergence warnings.
warnings.filterwarnings('ignore')
# Use fully-qualified option names: the abbreviated patterns 'max_columns' /
# 'max_rows' can match more than one registered option in newer pandas
# releases, which makes set_option raise OptionError.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

from sklearn import tree 
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn import metrics

In [None]:
def logit(p):
  """Return the log-odds ``log(p / (1 - p))`` of ``p``, computed with NumPy.

  ``p`` may be a scalar or a NumPy array; no clipping is applied, so the
  result is not finite when ``p`` is exactly 0 or 1.
  """
  odds = p / (1 - p)
  return np.log(odds)

def calibration_slope(ground_truth, probabilities, eps=1e-6):
  """Estimate the calibration slope of predicted probabilities.

  The slope is the coefficient of an unpenalised logistic regression of the
  true labels on the logit of the predicted probabilities.

  Parameters
  ----------
  ground_truth : array-like
      True binary labels; flattened to 1-D before fitting (callers in this
      notebook pass a one-column DataFrame).
  probabilities : array-like
      Predicted probabilities of the positive class.
  eps : float, optional
      Probabilities are clipped to ``[eps, 1 - eps]`` before the logit so that
      values of exactly 0 or 1 do not produce infinite logits, which the
      logistic-regression fit cannot handle.

  Returns
  -------
  float
      The fitted slope coefficient.
  """
  probabilities = np.clip(np.asarray(probabilities, dtype=float), eps, 1 - eps)
  logit_probabilities = logit(probabilities).reshape(-1, 1)
  labels = np.asarray(ground_truth).ravel()
  # NOTE(review): recent scikit-learn releases spell "no penalty" as
  # penalty=None instead of the string 'none' -- confirm against the
  # installed version before changing.
  lr = LogisticRegression(penalty='none', fit_intercept=True).fit(logit_probabilities, labels)
  return lr.coef_.item()

# threshold calculation
class LogisticRegressionWithThreshold(LogisticRegression):
    """LogisticRegression whose ``predict`` accepts an optional decision threshold."""

    def predict(self, X, threshold=None):
        """Predict labels for ``X``.

        Parameters
        ----------
        X : array-like
            Feature matrix passed to the base estimator.
        threshold : float or None, optional
            When ``None`` the base-class ``predict`` is used unchanged.
            Otherwise the second ``predict_proba`` column is compared with
            ``threshold`` and 0/1 integers are returned.

        Returns
        -------
        numpy.ndarray
            Base-class predictions, or an integer 0/1 array when a threshold
            is given.
        """
        # Identity comparison: `== None` is elementwise (and ambiguous) for
        # array-like thresholds.
        if threshold is None:
            return LogisticRegression.predict(self, X)

        y_scores = LogisticRegression.predict_proba(self, X)[:, 1]
        return (y_scores >= threshold).astype(int)

    def threshold_from_optimal_tpr_minus_fpr(self, X, y):
        """Return the ROC threshold that maximises ``tpr - fpr`` on ``(X, y)``.

        Returns
        -------
        tuple
            ``(threshold, tpr - fpr)`` evaluated at the maximising ROC point.
        """
        y_scores = LogisticRegression.predict_proba(self, X)[:, 1]
        fpr, tpr, thresholds = roc_curve(y, y_scores)

        optimal_idx = np.argmax(tpr - fpr)

        return thresholds[optimal_idx], tpr[optimal_idx] - fpr[optimal_idx]
    
# roc curve
def PlotROC(y_test,  y_pred_proba, AUCvalue):
    """Draw the ROC curve for ``y_pred_proba`` against ``y_test`` and show it.

    ``AUCvalue`` must already be a string; it is appended to ``'AUC ='`` and
    written onto the figure as an annotation.
    """
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(y_test, y_pred_proba)
    auc_caption = 'AUC =' + AUCvalue
    plt.text(0.3, 0, auc_caption, fontsize=15)
    plt.plot(false_pos_rate, true_pos_rate)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()
    

# frequency plot
def PlotFreq(y_pred_proba):
    """Show a 20-bin histogram of the predicted probabilities."""
    proba_series = pd.Series(y_pred_proba)
    proba_series.plot.hist(bins=20, rwidth=0.9, color='#607c8e', grid=True)
    # Restyle the horizontal grid lines after pandas has drawn the histogram.
    plt.grid(axis='y', alpha=0.75)
    plt.xlabel('Prob')
    plt.ylabel('Frequency')
    plt.show()

In [None]:
class TreeFit:
    """Fit a grid-searched decision tree and report overall and subgroup metrics.

    On construction the train/test frames are split into features and the
    ``y`` label, subgroup frames are built (by outcome, by ``nSES``, by
    ``gender`` and by ``gender`` x ``nSES``), ``nSES`` is optionally dropped
    from every feature frame, and then :meth:`fit` and :meth:`metrics` run.

    Parameters
    ----------
    tr_dat, te_dat : pandas.DataFrame
        Training / test data. Both must contain the columns ``y``, ``nSES``
        and ``gender``; subgroups are formed by comparing ``y``, ``nSES`` and
        ``gender`` with 0 and 1.
    params : dict
        Passed to ``GridSearchCV`` as ``param_grid``.
    nSESInclude : int
        ``0`` removes the ``nSES`` column from all feature frames before
        fitting; any other value keeps it as a predictor.
    """

    # Every feature-frame attribute that must lose the nSES column when
    # nSESInclude == 0.
    _FEATURE_FRAMES = (
        "X_tr", "X_te",
        "X_tr_0", "X_tr_1", "X_te_0", "X_te_1",
        "X_tr_low", "X_tr_high", "X_te_low", "X_te_high",
        "x_te_f", "x_te_m",
        "x_te_f_low", "x_te_f_high", "x_te_m_low", "x_te_m_high",
    )

    ## input
    def __init__(self, tr_dat, te_dat, params, nSESInclude):
        self.tr_dat = tr_dat
        self.te_dat = te_dat
        self.params = params
        self.nSESInclude = nSESInclude

        ### separate x, y
        self.X_tr = self.tr_dat.loc[:, self.tr_dat.columns != 'y']
        self.X_te = self.te_dat.loc[:, self.te_dat.columns != 'y']
        self.y_tr = self.tr_dat[['y']]
        self.y_te = self.te_dat[['y']]

        ### subgrouping
        # by outcome
        self.X_tr_0 = self.X_tr[self.y_tr["y"] == 0]
        self.X_tr_1 = self.X_tr[self.y_tr["y"] == 1]
        self.X_te_0 = self.X_te[self.y_te["y"] == 0]
        self.X_te_1 = self.X_te[self.y_te["y"] == 1]

        # by nSES (training)
        self.X_tr_low = self.X_tr[self.X_tr["nSES"] == 0]
        self.X_tr_high = self.X_tr[self.X_tr["nSES"] == 1]

        # Boolean masks over the test set, computed once while nSES is still
        # present in X_te.
        te_low = self.X_te["nSES"] == 0
        te_high = self.X_te["nSES"] == 1
        te_f = self.X_te["gender"] == 0
        te_m = self.X_te["gender"] == 1

        # by nSES (test)
        self.X_te_low = self.X_te[te_low]
        self.y_te_low = self.y_te[te_low]
        self.X_te_high = self.X_te[te_high]
        self.y_te_high = self.y_te[te_high]

        # by gender (test)
        self.x_te_f = self.X_te[te_f]
        self.y_te_f = self.y_te[te_f]
        self.x_te_m = self.X_te[te_m]
        self.y_te_m = self.y_te[te_m]

        # subgroups by both gender and nses; each comparison is evaluated
        # before the masks are combined with `&`.
        self.x_te_f_low = self.X_te[te_f & te_low]
        self.x_te_f_high = self.X_te[te_f & te_high]
        self.x_te_m_low = self.X_te[te_m & te_low]
        self.x_te_m_high = self.X_te[te_m & te_high]

        self.y_te_f_low = self.y_te[te_f & te_low]
        self.y_te_f_high = self.y_te[te_f & te_high]
        self.y_te_m_low = self.y_te[te_m & te_low]
        self.y_te_m_high = self.y_te[te_m & te_high]

        print("Subgrouping")

        # whether or not include nSES as predictor
        if nSESInclude == 0:
            # Drop the column from each frame individually so that every
            # subgroup keeps its own rows.
            for attr in self._FEATURE_FRAMES:
                setattr(self, attr, self._without_nses(getattr(self, attr)))

        # model fitting and metrics
        print("\n>>>Model Results")
        self.fit()
        print("\n>>>Model performance")
        self.metrics()

    @staticmethod
    def _without_nses(frame):
        """Return ``frame`` without its ``nSES`` column (unchanged if absent)."""
        return frame.loc[:, frame.columns != 'nSES']

    @staticmethod
    def _threshold_from_optimal_tpr_minus_fpr(model, X, y):
        """Return ``(threshold, tpr - fpr)`` at the ROC point maximising ``tpr - fpr``."""
        pred_prob = model.predict_proba(X)[:, 1]
        fpr, tpr, thresholds = roc_curve(y, pred_prob)
        optimal_idx = np.argmax(tpr - fpr)
        return thresholds[optimal_idx], tpr[optimal_idx] - fpr[optimal_idx]

    def fit(self):
        """Grid-search a class-balanced decision tree and pick a decision threshold.

        Sets ``self.model`` to the best estimator found by ``GridSearchCV`` and
        ``self.threshold`` to the training-set ROC threshold that maximises
        ``tpr - fpr``.
        """
        # fit model
        clf = tree.DecisionTreeClassifier(random_state=0, class_weight="balanced")
        gcv = GridSearchCV(estimator=clf, param_grid=self.params)
        gcv.fit(self.X_tr, self.y_tr)
        # GridSearchCV refits the best configuration on the full training data
        # by default, so best_estimator_ is already fitted.
        self.model = gcv.best_estimator_
        print("\n>>>Model fitted")

        # threshold calculation
        self.threshold, _ = self._threshold_from_optimal_tpr_minus_fpr(
            self.model, self.X_tr, self.y_tr)
        print("\n>>>threshold=", self.threshold)

    def metrics(self):
        """Print accuracy, AUC and TPR/FPR, draw the plots and print the tree.

        Stores positive-class probabilities and thresholded labels for the
        whole test set and every subgroup as instance attributes.
        """

        def positive_proba(X):
            # Second predict_proba column = probability of the positive class.
            return self.model.predict_proba(X)[:, 1]

        def to_label(proba):
            # Binary label using the threshold chosen in fit().
            return (proba >= self.threshold).astype(int)

        # pred
        ## prob
        self.tr_low_pred = positive_proba(self.X_tr_low)
        self.tr_high_pred = positive_proba(self.X_tr_high)

        self.tr_0_pred = positive_proba(self.X_tr_0)
        self.tr_1_pred = positive_proba(self.X_tr_1)
        self.te_0_pred = positive_proba(self.X_te_0)
        self.te_1_pred = positive_proba(self.X_te_1)

        self.y_pred_proba = positive_proba(self.X_te)
        self.y_pred_proba_low = positive_proba(self.X_te_low)
        self.y_pred_proba_high = positive_proba(self.X_te_high)
        self.y_pred_proba_f = positive_proba(self.x_te_f)
        self.y_pred_proba_m = positive_proba(self.x_te_m)
        self.y_pred_proba_f_low = positive_proba(self.x_te_f_low)
        self.y_pred_proba_f_high = positive_proba(self.x_te_f_high)
        self.y_pred_proba_m_low = positive_proba(self.x_te_m_low)
        self.y_pred_proba_m_high = positive_proba(self.x_te_m_high)

        ## label binary prediction with threshold
        self.pred_lable = to_label(self.y_pred_proba)
        self.pred_lable_low = to_label(self.y_pred_proba_low)
        self.pred_lable_high = to_label(self.y_pred_proba_high)
        self.pred_lable_f = to_label(self.y_pred_proba_f)
        self.pred_lable_m = to_label(self.y_pred_proba_m)
        self.pred_lable_f_low = to_label(self.y_pred_proba_f_low)
        self.pred_lable_f_high = to_label(self.y_pred_proba_f_high)
        self.pred_lable_m_low = to_label(self.y_pred_proba_m_low)
        self.pred_lable_m_high = to_label(self.y_pred_proba_m_high)

        # accuracy
        # (`metrics` below is the sklearn module: method names are not in
        # scope inside a method body.)
        print("\n>>>>total accuracy=", metrics.accuracy_score(self.y_te, self.pred_lable))

        # AUC
        print("\n>>>>AUC")
        auc = metrics.roc_auc_score(self.y_te, self.y_pred_proba)
        print("AUC=", auc)
        auc_low = metrics.roc_auc_score(self.y_te_low, self.y_pred_proba_low)
        auc_high = metrics.roc_auc_score(self.y_te_high, self.y_pred_proba_high)
        print("Disparity in AUC =", abs(auc_low - auc_high))
        auc_f_low = metrics.roc_auc_score(self.y_te_f_low, self.y_pred_proba_f_low)
        auc_f_high = metrics.roc_auc_score(self.y_te_f_high, self.y_pred_proba_f_high)
        auc_m_low = metrics.roc_auc_score(self.y_te_m_low, self.y_pred_proba_m_low)
        auc_m_high = metrics.roc_auc_score(self.y_te_m_high, self.y_pred_proba_m_high)
        print("AUC in f_low group = ", auc_f_low,
              "\n AUC in f_high group = ", auc_f_high,
              "\n AUC in m_low group = ", auc_m_low,
              "\n AUC in m_high group = ", auc_m_high)

        # TPR&FPR
        print("\n>>>>TPR&FPR")

        def TprFpr(TrueLabel, PredLable, subgroup):
            # labels=[0, 1] keeps the matrix 2x2 even when a subgroup contains
            # (or is predicted as) a single class.
            CM = confusion_matrix(TrueLabel, PredLable, labels=[0, 1])
            tpr = CM[1, 1] / (CM[1, 1] + CM[1, 0])
            fpr = CM[0, 1] / (CM[0, 1] + CM[0, 0])
            print(subgroup, ": \n",
                  "".join(['TPR=', str(tpr), ',', 'FPR=', str(fpr)]))

        TprFpr(self.y_te, self.pred_lable, "whole")
        TprFpr(self.y_te_f_low, self.pred_lable_f_low, "f_low")
        TprFpr(self.y_te_f_high, self.pred_lable_f_high, "f_high")
        TprFpr(self.y_te_m_low, self.pred_lable_m_low, "m_low")
        TprFpr(self.y_te_m_high, self.pred_lable_m_high, "m_high")

        # roc curve
        PlotROC(self.y_te, self.y_pred_proba, str(auc))

        # frequency plot
        print("\n>>>>Tr Low nSES")
        PlotFreq(self.tr_low_pred)
        print("\n>>>>Tr High nSES")
        PlotFreq(self.tr_high_pred)
        print("\n>>>>Te Low nSES")
        PlotFreq(self.y_pred_proba_low)
        print("\n>>>>Te High nSES")
        PlotFreq(self.y_pred_proba_high)

        print("\n>>>>Tr no CVD")
        PlotFreq(self.tr_0_pred)
        print("\n>>>>Tr CVD")
        PlotFreq(self.tr_1_pred)
        print("\n>>>>Te no CVD")
        PlotFreq(self.te_0_pred)
        print("\n>>>>Te CVD")
        PlotFreq(self.te_1_pred)

        # tree
        r = export_text(decision_tree=self.model, feature_names=self.X_tr.columns.values.tolist())
        print(r)

In [None]:
### temporarily removed from the class
# The two reports below were taken out of TreeFit.metrics(). They are kept as
# stand-alone helpers that take a constructed TreeFit instance, so this cell
# runs on a fresh kernel and the reports can still be produced on demand.

# calibration slope
def report_calibration_slope(fitted):
    """Print overall and subgroup calibration slopes for a TreeFit instance.

    Reads the test labels and predicted probabilities that
    ``TreeFit.metrics`` stores on ``fitted``.
    """
    print("\n>>>>Calibration Slope")
    print("calibration slope =", calibration_slope(fitted.y_te, fitted.y_pred_proba))
    dis_cs = abs(calibration_slope(fitted.y_te_low, fitted.y_pred_proba_low)
                 - calibration_slope(fitted.y_te_high, fitted.y_pred_proba_high))
    print("Disparity in calibration slope =", dis_cs)
    print("CS in f_low group = ", calibration_slope(fitted.y_te_f_low, fitted.y_pred_proba_f_low),
          "\n CS in f_high group = ", calibration_slope(fitted.y_te_f_high, fitted.y_pred_proba_f_high),
          "\n CS in m_low group = ", calibration_slope(fitted.y_te_m_low, fitted.y_pred_proba_m_low),
          "\n CS in m_high group = ", calibration_slope(fitted.y_te_m_high, fitted.y_pred_proba_m_high))


# visualize the tree
def plot_fitted_tree(fitted):
    """Draw the decision tree stored in ``fitted.model`` on a 20x20 figure."""
    plt.figure(figsize=(20, 20))
    # A plain list of names, matching what TreeFit passes to export_text.
    features = fitted.X_tr.columns.values.tolist()
    classes = ['No CVD', 'CVD']
    tree.plot_tree(fitted.model, feature_names=features, class_names=classes, filled=True)
    plt.show()

In [None]:
### JHS (no SMOTE) ###

In [None]:
# Load the JHS training and test splits.
jhs_tr, jhs_te = map(pd.read_csv, ('data/jhs_tr.csv', 'data/jhs_te.csv'))

In [None]:
### M1 no nses

In [None]:
# M1: without nSES

# Hyper-parameter grid for the decision tree. min_samples_split must be an
# integer >= 2 (or a fraction) for DecisionTreeClassifier, so the former
# candidate value 1 was invalid and is no longer searched.
params = {'max_depth': [2,3,4,5,6,7,8,9,10],
         'min_samples_split': [2],
         'min_samples_leaf': [1,2]}

# Drop identifiers, the continuous nSES score and the component outcomes, then
# expose the total outcome under the column name TreeFit expects ("y").
jhs_tr_m1, jhs_te_m1 = (
    frame.loc[:, ~frame.columns.isin(['subjid', 'nbSESpc2score', 'y1', 'y2', 'y3'])]
         .rename(columns={"y_tot": "y"})
    for frame in (jhs_tr, jhs_te)
)

jhs_m1 = TreeFit(jhs_tr_m1, jhs_te_m1,params, nSESInclude = 0)


In [None]:
# M2: with binary nSES
# Reuses the M1 frames and the parameter grid defined in the M1 cell; the only
# difference is that the nSES column is kept as a predictor.

jhs_m2 = TreeFit(
    tr_dat=jhs_tr_m1,
    te_dat=jhs_te_m1,
    params=params,
    nSESInclude=1,
)

In [None]:
# M3: nSES score
# Identifier and component-outcome columns are removed but the nSES score
# column is left in; the binary nSES column is then dropped inside TreeFit
# (nSESInclude = 0).

jhs_tr_m3, jhs_te_m3 = (
    frame.loc[:, ~frame.columns.isin(['subjid', 'y1', 'y2', 'y3'])]
         .rename(columns={"y_tot": "y"})
    for frame in (jhs_tr, jhs_te)
)

jhs_m3 = TreeFit(tr_dat=jhs_tr_m3, te_dat=jhs_te_m3, params=params, nSESInclude=0)

In [None]:
### Merged (no SMOTE) ###

In [None]:
# Load the merged (no SMOTE) training and test splits.
merged_tr, merged_te = map(
    pd.read_csv,
    ('data/merged_tr_nosmt.csv', 'data/merged_te_nosmt.csv'),
)

In [None]:
### M1 no nses

In [None]:
# M1: without nSES

# Hyper-parameter grid for the decision tree. min_samples_split must be an
# integer >= 2 (or a fraction) for DecisionTreeClassifier, so the former
# candidate value 1 was invalid and is no longer searched.
params = {'max_depth': [2,3,4,5,6,7,8,9,10],
         'min_samples_split': [2],
         'min_samples_leaf': [1,2]}

# Drop identifiers, the continuous nSES score, the component outcomes and the
# 'dat' column, then expose the total outcome under the name TreeFit expects.
merged_tr_m1, merged_te_m1 = (
    frame.loc[:, ~frame.columns.isin(['subjid', 'nSESscore', 'y1', 'y2', 'y3', 'dat'])]
         .rename(columns={"y_tot": "y"})
    for frame in (merged_tr, merged_te)
)

merged_m1 = TreeFit(merged_tr_m1, merged_te_m1,params, nSESInclude = 0)


In [None]:
# M2: with binary nSES
# Reuses the merged M1 frames and the parameter grid from the M1 cell; the
# nSES column is kept as a predictor.

merged_m2 = TreeFit(
    tr_dat=merged_tr_m1,
    te_dat=merged_te_m1,
    params=params,
    nSESInclude=1,
)

In [None]:
# M3: nSES score
# Identifier, component-outcome and 'dat' columns are removed but the nSES
# score column is left in; the binary nSES column is then dropped inside
# TreeFit (nSESInclude = 0).

merged_tr_m3, merged_te_m3 = (
    frame.loc[:, ~frame.columns.isin(['subjid', 'y1', 'y2', 'y3', 'dat'])]
         .rename(columns={"y_tot": "y"})
    for frame in (merged_tr, merged_te)
)

merged_m3 = TreeFit(tr_dat=merged_tr_m3, te_dat=merged_te_m3, params=params, nSESInclude=0)