In [1]:
import logging
import time
from os import path

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, average_precision_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay,balanced_accuracy_score

from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold

from definitions import DATA_PATH
from definitions import LOGS_PATH
import src.framingham.aha_frs_cvd as aha_frs_cvd
import src.lib.optimal_threhold_related as thres
import src.lib.fairness_tests as fair

In [2]:
time_stamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime())

log_files = path.join(LOGS_PATH, 'log_benchmark.txt')
logging.basicConfig(filename=log_files+str(time_stamp), level=logging.DEBUG,
                    format='%(asctime)s - %(levelname)s - %(message)s')
logging.debug('This is a log message.')

In [3]:
#Please identify the framingham features
#framing_features= ['Gender','AGE','MEDIAN_VALUE_Chol','MEDIAN_VALUE_HDL-C','MEDIAN_SYSTOLIC','Smoking','T2DM_CNT','HTN_DRUG_CNT','Race']
framing_features = ['Gender', 'AGE', 'RECENT_VALUE_Chol', 'RECENT_VALUE_HDL-C', 'RECENT_SYSTOLIC', 'Smoking',
                    'T2DM_CNT', 'HTN_DRUG_CNT', 'Race']

In [4]:
def compute_ind_frs(df):
    score_list = []
    for index, row in df.iterrows():
        X = row[framing_features].values
        score = aha_frs_cvd.frs(*X)
        score_list.append(score)
    df['frs'] = pd.Series(score_list)
    return df

In [5]:
def predict_by_framingham(df: object, reset_index: object = True) -> object:
    # if the input df is cross validation split, need to reset index
    if reset_index:
        df.reset_index(drop=True)
    y = df.Class.values
    df = compute_ind_frs(df)
    print("after compute", df.shape)
    x = df[(df['Gender'] == 'F') & (df['Race'] == 'W')]['frs']

    mean_frs_women_w = -29.18
    mean_frs_women_b = 86.61
    mean_frs_men_w = 61.18
    mean_frs_men_b = 19.54
    risk_list = []
    for index, row in df.iterrows():
        gender = row['Gender']
        race = row['Race']
        risk = 0

        if gender == 'F':
            if race == 'W':
                risk = aha_frs_cvd.estimiate_risk(ind_frs=row['frs'], mean_frs=mean_frs_women_w, gender='F', race='W')
            elif race == 'B':
                risk = aha_frs_cvd.estimiate_risk(ind_frs=row['frs'], mean_frs=mean_frs_women_b, gender='F', race='B')
            else:
                print('1',race)
        elif gender == 'M':
            if race == 'W':
                risk = aha_frs_cvd.estimiate_risk(ind_frs=row['frs'], mean_frs=mean_frs_men_w, gender='M', race='W')
            elif race == 'B':
                risk = aha_frs_cvd.estimiate_risk(ind_frs=row['frs'], mean_frs=mean_frs_men_b, gender='M', race='B')
            else:
                print('2', race)
        else:
            print('3', gender)
        # if np.isnan(risk):
        #     print(index)

        risk_list.append(risk)
    df['risk'] = pd.Series(risk_list)
    print(df.risk.unique())
    print(len(risk_list))
    df.loc[df['risk'] > 0.075, 'predict'] = 1
    df.loc[df['risk'] <= 0.075, 'predict'] = 0
        
    cm = confusion_matrix(y,df['predict'].values)
    cm_display = ConfusionMatrixDisplay(cm).plot()
    
    print(df.predict.unique())
    #save the interim output
    DATA_PATH='/Users/lifuchen/Desktop/research'
    df.to_csv(path.join(DATA_PATH, 'framingham_result.csv'))

    ba = balanced_accuracy_score(y, df['predict'].values)
    print('accuracy', accuracy_score(y, df['predict'].values))
    print('roc AUC', roc_auc_score(y, df['risk'].values))
    print('precision', precision_score(y, df['predict'].values))
    print('recall', recall_score(y, df['predict'].values))
    print("ap", average_precision_score(y, df['risk'].values))
    
    return ba, accuracy_score(y, df['predict'].values), roc_auc_score(y, df['risk'].values), \
           precision_score(y, df['predict'].values), recall_score(y, df['predict'].values), \
           average_precision_score(y, df['risk'].values)

In [7]:
def framingham_result(df, records):
    auroc = roc_auc_score(df.Class.values, df['risk'].values)
    ba = balanced_accuracy_score(df.Class.values, df['predict'].values)
    grouped = df.groupby(df.Race)
    df2_white = grouped.get_group("W")
    df2_black = grouped.get_group("B")
    ba_white = balanced_accuracy_score(df2_white['Class'].values,df2_white['predict'].values)
    ba_black = balanced_accuracy_score(df2_black['Class'].values,df2_black['predict'].values)
    eod_race = fair.get_EOD(df2_white['Class'].values,df2_white['predict'].values, 1, df2_black['Class'].values,df2_black['predict'].values, 1)
    di_race = fair.get_SP(df2_white['Class'].values,df2_white['predict'].values, 1, df2_black['Class'].values,df2_black['predict'].values, 1)

    grouped_2 = df.groupby(df.Gender)
    df2_male = grouped_2.get_group("M")
    df2_female = grouped_2.get_group("F")
    ba_male = balanced_accuracy_score(df2_male['Class'].values,df2_male['predict'].values)
    ba_female = balanced_accuracy_score(df2_female['Class'].values,df2_female['predict'].values)
    eod_gender = fair.get_EOD(df2_male['Class'].values,df2_male['predict'].values, 1, df2_female['Class'].values,df2_female['predict'].values, 1)
    di_gender = fair.get_SP(df2_male['Class'].values,df2_male['predict'].values, 1, df2_female['Class'].values,df2_female['predict'].values, 1)

    records.append({
        'auroc': auroc,
        'ba': ba,
        'ba_white': ba_white,
        'ba_black': ba_black,
        'eod_race': eod_race,
        'di_race': di_race,
        'ba_male': ba_male,
        'ba_female': ba_female,
        'eod_gender': eod_gender,
        'di_gender': di_gender,
    })

In [8]:
data_path='/Users/lifuchen/Desktop/research'
# df = pd.read_csv(path.join(data_path, 'framingham_data.csv'))
# ba, acc, roc, precision, recall, ap = predict_by_framingham(df)

df2 = pd.read_csv(path.join(data_path, 'framingham_result.csv'))

y = df2.predict.values
X = df2.drop(['GRID','predict'], axis=1)

records = []
for random_state in range(10):
    df_train_val, df_test, y_train_val, y_test = train_test_split(X, y, test_size = 0.2, random_state = random_state)
    df_test['predict'] = y_test
    framingham_result(df_test, records)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['predict'] = y_test


True positive rate of class 1 is  0.514
True positive rate of class 2 is  0.551
Positive prediction rate of class 1 is  0.246
Positive prediction rate of class 2 is  0.276
True positive rate of class 1 is  0.651
True positive rate of class 2 is  0.419
Positive prediction rate of class 1 is  0.385
Positive prediction rate of class 2 is  0.177


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['predict'] = y_test


True positive rate of class 1 is  0.508
True positive rate of class 2 is  0.523
Positive prediction rate of class 1 is  0.239
Positive prediction rate of class 2 is  0.256
True positive rate of class 1 is  0.622
True positive rate of class 2 is  0.426
Positive prediction rate of class 1 is  0.37
Positive prediction rate of class 2 is  0.17


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['predict'] = y_test


True positive rate of class 1 is  0.535
True positive rate of class 2 is  0.566
Positive prediction rate of class 1 is  0.241
Positive prediction rate of class 2 is  0.275
True positive rate of class 1 is  0.666
True positive rate of class 2 is  0.441
Positive prediction rate of class 1 is  0.374
Positive prediction rate of class 2 is  0.176
True positive rate of class 1 is  0.525


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['predict'] = y_test


True positive rate of class 2 is  0.61
Positive prediction rate of class 1 is  0.242
Positive prediction rate of class 2 is  0.255
True positive rate of class 1 is  0.677
True positive rate of class 2 is  0.427
Positive prediction rate of class 1 is  0.377
Positive prediction rate of class 2 is  0.17
True positive rate of class 1 is  0.514


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['predict'] = y_test


True positive rate of class 2 is  0.58
Positive prediction rate of class 1 is  0.241
Positive prediction rate of class 2 is  0.269
True positive rate of class 1 is  0.658
True positive rate of class 2 is  0.422
Positive prediction rate of class 1 is  0.371
Positive prediction rate of class 2 is  0.176
True positive rate of class 1 is  0.515


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['predict'] = y_test


True positive rate of class 2 is  0.581
Positive prediction rate of class 1 is  0.239
Positive prediction rate of class 2 is  0.264
True positive rate of class 1 is  0.649
True positive rate of class 2 is  0.429
Positive prediction rate of class 1 is  0.376
Positive prediction rate of class 2 is  0.169
True positive rate of class 1 is  0.517


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['predict'] = y_test


True positive rate of class 2 is  0.577
Positive prediction rate of class 1 is  0.241
Positive prediction rate of class 2 is  0.272
True positive rate of class 1 is  0.674
True positive rate of class 2 is  0.407
Positive prediction rate of class 1 is  0.374
Positive prediction rate of class 2 is  0.173


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['predict'] = y_test


True positive rate of class 1 is  0.519
True positive rate of class 2 is  0.552
Positive prediction rate of class 1 is  0.24
Positive prediction rate of class 2 is  0.272
True positive rate of class 1 is  0.674
True positive rate of class 2 is  0.404
Positive prediction rate of class 1 is  0.374
Positive prediction rate of class 2 is  0.174


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['predict'] = y_test


True positive rate of class 1 is  0.512
True positive rate of class 2 is  0.588
Positive prediction rate of class 1 is  0.239
Positive prediction rate of class 2 is  0.274
True positive rate of class 1 is  0.646
True positive rate of class 2 is  0.427
Positive prediction rate of class 1 is  0.372
Positive prediction rate of class 2 is  0.172
True positive rate of class 1 is  0.518


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['predict'] = y_test


True positive rate of class 2 is  0.55
Positive prediction rate of class 1 is  0.242
Positive prediction rate of class 2 is  0.271
True positive rate of class 1 is  0.646
True positive rate of class 2 is  0.429
Positive prediction rate of class 1 is  0.375
Positive prediction rate of class 2 is  0.174


In [9]:
result_table = pd.DataFrame(records)
records.append({
    'auroc': result_table["auroc"].mean(),
    'ba': result_table["ba"].mean(),
    'ba_white': result_table["ba_white"].mean(),
    'ba_black': result_table["ba_black"].mean(),
    'eod_race': result_table["eod_race"].mean(),
    'di_race': result_table["di_race"].mean(),
    'ba_male': result_table["ba_male"].mean(),
    'ba_female': result_table["ba_female"].mean(),
    'eod_gender': result_table["eod_gender"].mean(),
    'di_gender': result_table["di_gender"].mean(),
    })
records.append({
    'auroc': result_table["auroc"].std(),
    'ba': result_table["ba"].std(),
    'ba_white': result_table["ba_white"].std(),
    'ba_black': result_table["ba_black"].std(),
    'eod_race': result_table["eod_race"].std(),
    'di_race': result_table["di_race"].std(),
    'ba_male': result_table["ba_male"].std(),
    'ba_female': result_table["ba_female"].std(),
    'eod_gender': result_table["eod_gender"].std(),
    'di_gender': result_table["di_gender"].std(),  
    })

In [10]:
result_table = pd.DataFrame(records)
result_table.to_csv(path.join(data_path,'framingham_result_summary.csv'), index=False)