In [5]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from utils.config_utils import get_config
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix

In [12]:


def get_data_set_original(dataset, config):
    """Randomly partition ``dataset`` into a train part and a test part.

    Args:
        dataset: The data to split; passed straight to ``train_test_split``.
        config: Mapping that provides ``'test_size'`` and ``'random_state'``,
            forwarded to ``train_test_split`` under the same names.

    Returns:
        A ``(train_set, test_set)`` tuple.
    """
    split_options = {
        'test_size': config['test_size'],
        'random_state': config['random_state'],
    }
    train_part, test_part = train_test_split(dataset, **split_options)
    return train_part, test_part

def get_data_set_with_certain_group_excluded(sensitive_variable_name, sensitive_variable_map, original_dataset):
    """Build one train/test split per group of a sensitive variable.

    For every ``label -> code`` entry in ``sensitive_variable_map`` the rows
    whose ``sensitive_variable_name`` column equals ``code`` become the
    training set, and all remaining rows become the test set. The group is
    therefore *excluded from the test set* (not from training).

    Args:
        sensitive_variable_name: Column of ``original_dataset`` to split on.
        sensitive_variable_map: Mapping from a group label to the value that
            encodes that group in the column.
        original_dataset: DataFrame holding the full data.

    Returns:
        ``{label: {'train_set': DataFrame, 'test_set': DataFrame}}`` with one
        entry per label, in the iteration order of the map.
    """
    return {
        group_label: {
            # rows that belong to this group
            'train_set': original_dataset[original_dataset[sensitive_variable_name] == group_code],
            # every row that does not belong to this group
            'test_set': original_dataset[original_dataset[sensitive_variable_name] != group_code],
        }
        for group_label, group_code in sensitive_variable_map.items()
    }

def get_model(name, params):
    """Instantiate the classifier registered under a short name.

    Args:
        name: One of ``'lr'`` (LogisticRegression), ``'svm'`` (SVC),
            ``'rf'`` (RandomForestClassifier), ``'gb'``
            (GradientBoostingClassifier) or ``'nn'`` (MLPClassifier).
        params: Keyword arguments passed to the classifier constructor.

    Returns:
        A new, unfitted classifier instance.

    Raises:
        ValueError: If ``name`` is not one of the names listed above.
    """
    # Guard first so an unknown name is rejected before anything is built.
    if name not in ('lr', 'svm', 'rf', 'gb', 'nn'):
        raise ValueError('No such model')

    constructors = {
        'lr': LogisticRegression,
        'svm': SVC,
        'rf': RandomForestClassifier,
        'gb': GradientBoostingClassifier,
        'nn': MLPClassifier,
    }
    return constructors[name](**params)

def preprocess_data(features, config):
    """Turn a raw feature frame into a purely numeric, normalised frame.

    Steps, in order:
      1. drop the columns listed in ``config['drop_columns']``;
      2. drop columns whose share of missing values is above
         ``config['missing_values_threshold']``;
      3. fill the remaining missing values with 0;
      4. drop every ``object``-dtype (string) column;
      5. one-hot encode whatever ``pd.get_dummies`` still treats as categorical;
      6. normalise each column with ``config['normalize_method']``;
      7. drop columns that contain NaN after normalisation.

    The input frame is not modified; every step produces a new frame.

    Args:
        features: DataFrame of raw features.
        config: Mapping with the keys ``'drop_columns'``,
            ``'missing_values_threshold'`` and ``'normalize_method'``
            (``'min_max'`` or ``'z_score'``).

    Returns:
        A new DataFrame containing the surviving, normalised columns.

    Raises:
        ValueError: If ``config['normalize_method']`` is neither ``'min_max'``
            nor ``'z_score'``.
        KeyError: If a column in ``config['drop_columns']`` is not present
            (raised by ``DataFrame.drop``).
    """
    # drop the configured columns (returns a new frame, the input is untouched)
    features = features.drop(columns=config['drop_columns'])

    # retain only the columns whose missing-value ratio is within the threshold
    missing_ratio = features.isnull().mean()
    retained_columns = features.columns[missing_ratio <= config['missing_values_threshold']]
    features = features[retained_columns]

    # The simplest way to fill missing numerical values is to fill them with 0
    features = features.fillna(0)

    # drop the string columns
    string_columns = features.select_dtypes(include='object').columns
    features = features.drop(columns=string_columns)

    # One-Hot Encoding
    features = pd.get_dummies(features)

    # normalize
    normalize_method = config['normalize_method']
    if normalize_method == 'min_max':
        features = (features - features.min()) / (features.max() - features.min())
    elif normalize_method == 'z_score':
        features = (features - features.mean()) / features.std()
    else:
        raise ValueError('No such normalize method')

    # A constant column has max == min (or std == 0), so normalising it divides
    # 0 by 0 and yields NaN. Such columns carry no information; remove them.
    columns_with_null = features.columns[features.isnull().any()]
    return features.drop(columns=columns_with_null)

def train_and_evaluate(model, train_set, test_set, config):
    """Fit ``model`` on ``train_set`` and predict on ``test_set``.

    The target column (``config['target_variable']``) is collapsed to a binary
    label with the map ``{0: 0, 1: 1, 8: 0}``. The remaining columns of each set
    are run through ``preprocess_data`` with ``config['preprocessing']``, and
    only the columns that survive preprocessing in *both* sets are used.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        train_set: DataFrame with features and the target column.
        test_set: DataFrame with features and the target column.
        config: Mapping with the keys ``'target_variable'`` and
            ``'preprocessing'``.

    Returns:
        ``(y_pred, y_test, model)``: the output of ``model.predict`` on the
        test features, the binary test labels, and the fitted model.

    Raises:
        ValueError: If the target column of either set contains a value that
            is not 0, 1 or 8 (including missing values). Without this check
            such values turn into NaN labels and only fail later, far from the
            cause.
    """
    target_variable = config['target_variable']
    # Raw target code -> binary label; code 8 is folded into the negative class.
    # NOTE(review): presumably 8 is a "not applicable" code of the data set -
    # confirm against the data dictionary.
    label_map = {0: 0, 1: 1, 8: 0}

    binary_labels = {}
    for set_name, data_set in (('train_set', train_set), ('test_set', test_set)):
        raw_labels = data_set[target_variable]
        mapped_labels = raw_labels.map(label_map)
        unmapped = mapped_labels.isnull()
        if unmapped.any():
            unexpected = raw_labels[unmapped].unique().tolist()
            raise ValueError(
                f'Unexpected values {unexpected} in target column '
                f'{target_variable!r} of {set_name}; expected only {list(label_map)}'
            )
        binary_labels[set_name] = mapped_labels
    y_train = binary_labels['train_set']
    y_test = binary_labels['test_set']

    X_train = train_set.drop(columns=[target_variable])
    X_test = test_set.drop(columns=[target_variable])

    # NOTE(review): the test features are preprocessed independently of the
    # training features, so the test set is normalised with its own statistics
    # rather than the ones seen during training - confirm this is intended.
    X_train = preprocess_data(X_train, config['preprocessing'])
    X_test = preprocess_data(X_test, config['preprocessing'])

    # Each set may lose different columns during preprocessing; keep the overlap
    # so both matrices have the same columns in the same order.
    common_columns = X_train.columns.intersection(X_test.columns)
    X_train_values = X_train[common_columns].values
    X_test_values = X_test[common_columns].values

    model.fit(X_train_values, y_train)
    # NOTE(review): ``predict`` returns whatever the estimator's predict gives
    # (class labels for the classifiers built by get_model), while the caller
    # passes the result on as ``y_pred_prob`` - confirm whether scores from
    # ``predict_proba`` were intended.
    y_pred = model.predict(X_test_values)
    return y_pred, y_test, model

def calculate_classification_metrics(model_name, y_true, y_pred_prob, metrics):
    """Compute binary-classification metrics and store them under ``model_name``.

    Scores are thresholded at 0.5 to obtain hard predictions. Accuracy,
    sensitivity, specificity, F1, the confusion matrix and the false
    positive / false negative rates are computed from those hard predictions;
    the ROC curve and its AUC are computed from the raw scores.

    The false positive / false negative rates are taken from the thresholded
    confusion matrix. Reading them from position 1 of the ROC curve (as was
    done before) only gives the 0.5 operating point when the scores are
    already hard 0/1 labels and both classes are predicted.

    Args:
        model_name: Key under which the results are stored in ``metrics``.
        y_true: True binary labels (0 / 1).
        y_pred_prob: Array-like of scores or hard 0/1 predictions.
        metrics: Dict that is updated in place.

    Returns:
        The same ``metrics`` dict, with ``metrics[model_name]`` set.
    """
    scores = np.asarray(y_pred_prob)
    # Threshold 0.5 by default
    y_pred = np.where(scores >= 0.5, 1, 0)

    accuracy = accuracy_score(y_true, y_pred)
    # Recall of the positive class
    sensitivity = recall_score(y_true, y_pred)
    # Recall of the negative class
    specificity = recall_score(y_true, y_pred, pos_label=0)
    f1 = f1_score(y_true, y_pred)

    # ROC AUC
    fpr, tpr, _ = roc_curve(y_true, scores)
    roc_auc = auc(fpr, tpr)

    # FPR / FNR at the 0.5 operating point. ``labels=[0, 1]`` forces a 2x2
    # matrix even when one class never occurs, so the unpacking is always valid.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    false_positive_rate = fp / (fp + tn) if (fp + tn) > 0 else float('nan')
    false_negative_rate = fn / (fn + tp) if (fn + tp) > 0 else float('nan')

    # confusion_matrix as stored for the caller (labels inferred from the data)
    conf_matrix = confusion_matrix(y_true, y_pred)

    metrics[model_name] = {
        "fpr_tpr": (fpr, tpr),
        "Accuracy": accuracy,
        "Sensitivity (Recall)": sensitivity,
        "Specificity": specificity,
        "F1 Score": f1,
        "ROC AUC": roc_auc,
        "False Positive Rate": false_positive_rate,
        "False Negative Rate": false_negative_rate,
        "Confusion Matrix": conf_matrix
    }
    return metrics

def metrics_to_df(metrics):
    """Tabulate the scalar metrics, one row per entry of ``metrics``.

    Args:
        metrics: Mapping ``name -> dict of metric values`` as filled in by
            ``calculate_classification_metrics``.

    Returns:
        A DataFrame indexed by the keys of ``metrics`` that keeps only the
        scalar columns listed below (ROC points and the confusion matrix are
        left out).
    """
    report_columns = [
        'Accuracy',
        'Sensitivity (Recall)',
        'Specificity',
        'F1 Score',
        'ROC AUC',
        'False Positive Rate',
        'False Negative Rate',
    ]
    full_table = pd.DataFrame.from_dict(metrics, orient='index')
    return full_table[report_columns]

def plot_auc(metrics, tag):
    """Draw the ROC curve of every entry in ``metrics`` and save the figure.

    The figure is written to ``pics/roc_curve{tag}.png``; the ``pics``
    directory is created when it does not exist yet. The figure is left open
    so that it is still rendered by the notebook.

    Args:
        metrics: Mapping ``name -> metric dict``; each metric dict must provide
            ``'fpr_tpr'`` (a ``(fpr, tpr)`` pair) and ``'ROC AUC'``.
        tag: Text appended to the plot title and to the output file name.
    """
    plt.figure(figsize=(8, 6))
    for model_name, metric in metrics.items():
        fpr, tpr = metric['fpr_tpr']
        roc_auc = metric['ROC AUC']
        plt.plot(fpr, tpr, label=f'{model_name} (area = {roc_auc:.2f})')
    # diagonal = performance of a random classifier
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'Receiver Operating Characteristic {tag}')
    plt.legend(loc='lower right')
    # savefig does not create missing directories, so make sure it exists
    os.makedirs("pics", exist_ok=True)
    path = os.path.join("pics", f'roc_curve{tag}.png')
    plt.savefig(path)




In [20]:
# Baseline experiment: train every configured model on a random split of the
# full data set, then save the ROC plot and the metrics table.

# The path of NACC Dataset
NACC_DATASET_PATH = os.path.join("data", "NACC.csv")
# The path of the Config file 
DATA_CONFIG_PATH = os.path.join("config", "config_default.yaml")
print(NACC_DATASET_PATH)
print(DATA_CONFIG_PATH)
config = get_config(DATA_CONFIG_PATH)
NACC_dataset = pd.read_csv(NACC_DATASET_PATH, low_memory=False)

models_dict = config['model_dict']
protected_attributes = config['protected_attributes']
print(config)
#original 

metrics = {}
per_class_metrics = {}

# The split does not depend on the model, so it is computed once and shared;
# this also guarantees that all models are compared on the same rows.
train_set, test_set = get_data_set_original(NACC_dataset, config)

for model_name, model_params in models_dict.items():
    per_class_metrics[model_name]= {}
    model = get_model(model_name, model_params)
    y_pred, y_test, model = train_and_evaluate(model=model, train_set= train_set, test_set=test_set, config=config)
    calculate_classification_metrics(model_name=model_name, y_pred_prob=y_pred, y_true=y_test, metrics=metrics)
    # TODO: per-group evaluation is currently disabled; per_class_metrics is
    # only initialised with empty dicts.
    # for protected_attribute in protected_attributes:
    #     per_class_metrics[model_name][protected_attribute] = {}
    #     sensitive_variable_name = protected_attributes[protected_attribute]['name']
    #     sensitive_variable_map = protected_attributes[protected_attribute]['map']
    #     for k, v in sensitive_variable_map.items():
    #         per_class_y_pred_prob = y_pred[test_set[protected_attribute] == v]
    #         per_class_y_true = y_test[test_set[protected_attribute] == v]
    #         per_class_y_pred_prob = np.where(per_class_y_pred_prob >= 0.5, 1, 0)
    
    #         # Accuracy
    #         accuracy = accuracy_score(per_class_y_true, per_class_y_pred_prob)
    #         # ROC AUC
    #         fpr, tpr, thresholds = roc_curve(per_class_y_true, per_class_y_pred_prob)
    #         roc_auc = auc(fpr, tpr)
    #         print(f'{protected_attribute} {k} acc:{accuracy} roc-auc:{roc_auc}')
plot_auc(metrics, tag='original')
df = metrics_to_df(metrics)
#save
df.to_csv('original.csv')


data/NACC.csv
config/config_default.yaml
{'target_variable': 'NACCALZD', 'test_size': 0.2, 'random_state': 42, 'model_dict': {'lr': {'C': 100, 'solver': 'saga'}, 'gb': {'n_estimators': 200, 'learning_rate': 0.05, 'max_depth': 6, 'min_samples_leaf': 50, 'max_features': 0.3}, 'rf': {'n_estimators': 200, 'max_features': 'auto', 'max_depth': 6, 'criterion': 'entropy'}, 'svm': {'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}, 'nn': {'hidden_layer_sizes': [128, 32], 'activation': 'tanh', 'solver': 'adam', 'alpha': 0.05, 'learning_rate': 'adaptive'}}, 'preprocessing': {'drop_columns': ['NACCETPR', 'NACCALZP', 'PROBAD', 'PROBADIF', 'POSSAD', 'POSSADIF', 'NACCADMD'], 'missing_values_threshold': 0.2, 'normalize_method': 'min_max'}, 'protected_attributes': {'RACE': {'name': 'RACE', 'map': {'White': 1, 'black_or_African_American': 2, 'American_Indian_or_Alaska_Native': 3, 'Asian': 5}}, 'SEX': {'name': 'SEX', 'map': {'Male': 1, 'Female': 2}}}}




RACE White acc:0.9498404012767898 roc-auc:0.9453005660262073
RACE black_or_African_American acc:0.9516806722689075 roc-auc:0.9464351173020528
RACE American_Indian_or_Alaska_Native acc:0.9302325581395349 roc-auc:0.8934659090909092
RACE Asian acc:0.8928571428571429 roc-auc:0.8959083469721767
SEX Male acc:0.9347399411187438 roc-auc:0.9330342252505888
SEX Female acc:0.9575098814229249 roc-auc:0.9515149623458508
RACE White acc:0.9740082079343365 roc-auc:0.9684804223810872
RACE black_or_African_American acc:0.9726890756302521 roc-auc:0.968475073313783
RACE American_Indian_or_Alaska_Native acc:0.9534883720930233 roc-auc:0.9389204545454546
RACE Asian acc:0.9642857142857143 roc-auc:0.9662847790507364
SEX Male acc:0.9627085377821394 roc-auc:0.9579694173728706
SEX Female acc:0.980566534914361 roc-auc:0.9761164299212867


  warn(


RACE White acc:0.9131326949384405 roc-auc:0.9236128469618445
RACE black_or_African_American acc:0.9201680672268907 roc-auc:0.9199046920821115
RACE American_Indian_or_Alaska_Native acc:0.9302325581395349 roc-auc:0.953125
RACE Asian acc:0.8839285714285714 roc-auc:0.8970540098199672
SEX Male acc:0.879784102060844 roc-auc:0.89459861695883
SEX Female acc:0.9357707509881423 roc-auc:0.9400286023982883
RACE White acc:0.9612403100775194 roc-auc:0.9607399167902593
RACE black_or_African_American acc:0.9621848739495799 roc-auc:0.9613728005865102
RACE American_Indian_or_Alaska_Native acc:0.9069767441860465 roc-auc:0.9076704545454546
RACE Asian acc:0.8928571428571429 roc-auc:0.8988543371522096
SEX Male acc:0.94946025515211 roc-auc:0.9496860733776233
SEX Female acc:0.9667325428194994 roc-auc:0.9665368965262842
RACE White acc:0.9594163246694026 roc-auc:0.9588714470607187
RACE black_or_African_American acc:0.9432773109243697 roc-auc:0.9433651026392963
RACE American_Indian_or_Alaska_Native acc:0.9302325

In [None]:
# Group experiment: for every protected attribute and every group of it, train
# each model on that group only and evaluate it on all other rows.

# The path of NACC Dataset
NACC_DATASET_PATH = os.path.join("data", "NACC.csv")
# The path of the Config file 
DATA_CONFIG_PATH = os.path.join("config", "config_default.yaml")
print(NACC_DATASET_PATH)
print(DATA_CONFIG_PATH)
config = get_config(DATA_CONFIG_PATH)
NACC_dataset = pd.read_csv(NACC_DATASET_PATH, low_memory=False)

models_dict = config['model_dict']
protected_attributes = config['protected_attributes']
metrics = {}
for protected_attribute in protected_attributes:
    metrics[protected_attribute] = {}
    # The helper only filters rows, so the data set is passed as is instead of
    # handing over a full copy.
    dataset_dic = get_data_set_with_certain_group_excluded(protected_attributes[protected_attribute]['name'],
                                                      protected_attributes[protected_attribute]['map'],
                                                      NACC_dataset)
    for k, v in dataset_dic.items():
        train_group = k
        # train_and_evaluate does not modify its inputs, so the same two frames
        # are shared by all models instead of being copied once per model.
        train_set = v['train_set']
        test_set = v['test_set']
        for model_name, model_params in models_dict.items():
            model = get_model(model_name, model_params)
            y_pred, y_test, model= train_and_evaluate(model=model, train_set= train_set, test_set=test_set, config=config)
            calculate_classification_metrics(model_name=model_name, y_pred_prob=y_pred, y_true=y_test, metrics=metrics[protected_attribute])
        # metrics[protected_attribute] is keyed by model name only, so it is
        # overwritten for every group; the per-group results are kept in the
        # plot and CSV written here.
        plot_auc(metrics[protected_attribute], f'{protected_attribute}  {train_group}')
        df = metrics_to_df(metrics[protected_attribute])
        #save
        df.to_csv(f'{train_group}.csv')
    
        