In [12]:
import pandas as pd
import torch
import numpy as np

Utilities: label binarization and min-max feature scaling helpers

In [13]:
def frust_class(x):
    """Binarize a frustration score.

    Args:
        x: Any value accepted by ``int()``.

    Returns:
        0 when ``int(x)`` is at most 3, otherwise 1.
    """
    return 1 if int(x) > 3 else 0

def normalize_column(df, columns):
    """Min-max scale the listed columns of ``df`` to the [0, 1] range.

    The frame is modified in place and also returned, so both
    ``normalize_column(df, cols)`` and ``df = normalize_column(df, cols)`` work.

    Args:
        df: DataFrame containing the columns to scale.
        columns: Iterable of column names to rescale.

    Returns:
        The same ``df`` object, with each listed column replaced by
        ``(x - min) / (max - min)``. A column whose values are all equal has
        zero range and is mapped to 0.0 instead of being filled with NaN.
    """
    for column in columns:
        data = df[column]
        min_val = data.min()
        value_range = data.max() - min_val
        if value_range == 0:
            # Constant column: dividing by the zero range would yield NaN
            # everywhere, so pin every (non-missing) entry to 0.0 instead.
            df[column] = (data - min_val).astype(float)
        else:
            df[column] = (data - min_val) / value_range
    return df

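Models: each function takes (x_train, x_test, y_train, y_test, func_var) and returns predictions for x_test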

In [14]:
def baseline(x_train, x_test, y_train, y_test, func_var):
    """Majority-class baseline: predict the most frequent training label everywhere.

    Args:
        x_train: Unused; present so all models share one signature.
        x_test: Unused.
        y_train: 1-D tensor (or array-like) of training labels.
        y_test: Test labels; only its length is used.
        func_var: Unused.

    Returns:
        A 1-D tensor of length ``len(y_test)`` filled with the most common
        value in ``y_train``. When several labels are equally common, the
        smallest one is chosen so the result is deterministic.

    Raises:
        ValueError: If ``y_train`` contains no labels.
    """
    labels = torch.as_tensor(y_train).flatten()
    if labels.numel() == 0:
        raise ValueError("baseline() needs at least one training label")

    # torch.unique returns the distinct labels in ascending order, and argmax
    # returns the first maximal index, so ties resolve to the smallest label.
    classes, counts = torch.unique(labels, return_counts=True)
    most_common_class = classes[counts.argmax()].item()
    return torch.full((len(y_test),), most_common_class)

In [15]:
from sklearn import tree
from matplotlib.image import imread
from sklearn.metrics import f1_score, accuracy_score

def ClassTree(x_train, x_test, y_train, y_test, func_var):
    """Fit a decision-tree classifier and predict labels for ``x_test``.

    Args:
        x_train: Training features.
        x_test: Features to predict on.
        y_train: Training labels.
        y_test: Unused; present so all models share one signature.
        func_var: Split criterion passed to ``DecisionTreeClassifier``.

    Returns:
        The fitted tree's predictions for ``x_test``.
    """
    classifier = tree.DecisionTreeClassifier(
        criterion=func_var,
        min_samples_split=100,
    )
    # fit() returns the estimator itself, so training and prediction chain.
    return classifier.fit(x_train, y_train).predict(x_test)

In [16]:
from sklearn.neighbors import KNeighborsClassifier 

def KNN(x_train, x_test, y_train, y_test, func_var):
    """Fit a k-nearest-neighbours classifier and predict labels for ``x_test``.

    Args:
        x_train: Training features.
        x_test: Features to predict on.
        y_train: Training labels.
        y_test: Unused; present so all models share one signature.
        func_var: Number of neighbours (``n_neighbors``).

    Returns:
        The fitted classifier's predictions for ``x_test``.
    """
    # Minkowski metric with p=1, with no extra metric parameters.
    settings = dict(
        n_neighbors=func_var,
        p=1,
        metric='minkowski',
        metric_params={},
    )
    neighbours_model = KNeighborsClassifier(**settings)
    neighbours_model.fit(x_train, y_train)
    return neighbours_model.predict(x_test)

In [17]:
from sklearn.linear_model import LogisticRegression

def LogReg(x_train, x_test, y_train, y_test, func_var):
    """Fit a logistic-regression classifier and predict labels for ``x_test``.

    Args:
        x_train: Training features.
        x_test: Features to predict on.
        y_train: Training labels.
        y_test: Unused; present so all models share one signature.
        func_var: Regularization strength; the model receives ``C = 1 / func_var``.

    Returns:
        The transposed prediction array for ``x_test``.
    """
    inverse_strength = 1 / func_var
    classifier = LogisticRegression(C=inverse_strength, max_iter=5000)
    fitted = classifier.fit(x_train, y_train)
    return fitted.predict(x_test).T

In [23]:
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

class SimpleNN(nn.Module):
    """Feed-forward network with one hidden layer.

    Architecture: Linear(input_size, h) -> ReLU -> Linear(h, output_size) -> Sigmoid,
    so every output lies strictly between 0 and 1.
    """

    def __init__(self, input_size, h, output_size) -> None:
        """Build the layer stack.

        Args:
            input_size: Number of input features.
            h: Number of hidden units.
            output_size: Number of output units.
        """
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_size, h),
            nn.ReLU(),
            nn.Linear(h, output_size),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Run ``x`` through the layer stack and return the sigmoid outputs."""
        # Call the container itself rather than its .forward() so that any
        # hooks registered on ``self.layers`` are executed.
        return self.layers(x)

def get_predictions(test_x, model):
    """Run ``model`` on ``test_x`` in evaluation mode without tracking gradients.

    Args:
        test_x: Input passed directly to ``model``.
        model: Module to evaluate; it is left in eval mode afterwards.

    Returns:
        Whatever ``model(test_x)`` returns, computed under ``torch.no_grad()``.
    """
    model.eval()
    with torch.no_grad():
        outputs = model(test_x)
    return outputs

def train_model(model, criterion, optimizer, train_loader, epochs):
    """Train ``model`` for a fixed number of epochs.

    Args:
        model: Module to train; switched to training mode before the loop.
        criterion: Loss function called as ``criterion(outputs, targets)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        train_loader: Iterable yielding ``(inputs, labels)`` batches.
        epochs: Number of full passes over ``train_loader``.

    Returns:
        List with the loss value (Python float) of every batch, in training order.
    """
    losses = []
    model.train()
    for _ in tqdm(range(epochs)):
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            # Flatten predictions and targets to matching 1-D shapes; this
            # also works when the model already returns a 1-D output.
            outputs = model(inputs).flatten()
            targets = labels.flatten().float()
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
    # Return the loss history so callers can inspect convergence.
    return losses

def ann(x_train, x_test, y_train, y_test, func_var):
    """Train a one-hidden-layer binary classifier and predict labels for ``x_test``.

    Args:
        x_train: 2-D float tensor of training features (samples x features).
        x_test: 2-D float tensor of features to predict on.
        y_train: Tensor of binary training labels.
        y_test: Unused; present so all models share one signature.
        func_var: Number of hidden units.

    Returns:
        1-D int64 tensor of 0/1 predictions, one per row of ``x_test``.
    """
    # Infer the feature count from the data instead of hard-coding it.
    input_size = x_train.shape[1]
    # Single sigmoid unit: probability of the positive class.
    output_size = 1
    learning_rate = 1e-5
    epochs = 10000
    batch_size = 32
    h = func_var

    model = SimpleNN(input_size, h, output_size)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    train_dataset = TensorDataset(x_train, y_train)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    train_model(model, criterion, optimizer, train_loader, epochs)
    probabilities = get_predictions(x_test, model)
    # The network emits one probability per sample, so argmax over that single
    # column would always be 0. Threshold at 0.5 to get the class label.
    predictions = (probabilities.flatten() > 0.5).long()
    return predictions

In [24]:
# Heart-rate feature columns used as model inputs.
FEATURE_COLUMNS = ["HR_Mean", "HR_Median", "HR_std", "HR_Min", "HR_Max", "HR_AUC"]

data = pd.read_csv("HR_data.csv")
# Binary target derived from the "Frustrated" score via frust_class.
data["HighlyFrustrated"] = data["Frustrated"].apply(frust_class)
# NOTE(review): scaling uses min/max over all rows, before the cohort split
# below, so test-cohort statistics influence the training features. Consider
# fitting the scaling on the training cohort only.
data = normalize_column(data, FEATURE_COLUMNS)
test_df = data[data["Cohort"] == "D1_2"]
train_df = data[data["Cohort"] == "D1_1"]

# Convert to float32 tensors. Pass the dtype itself (np.float32), not an
# instantiated scalar (np.float32()).
x_test = torch.tensor(test_df[FEATURE_COLUMNS].values.astype(np.float32))
x_train = torch.tensor(train_df[FEATURE_COLUMNS].values.astype(np.float32))
y_test = torch.tensor(test_df["HighlyFrustrated"].values.astype(np.float32))
y_train = torch.tensor(train_df["HighlyFrustrated"].values.astype(np.float32))

In [25]:
# Every model takes the same four data arguments followed by its own
# hyper-parameter, so bundle the shared part once.
split = (x_train, x_test, y_train, y_test)

baseline_pred = baseline(*split, None)
tree_pred = ClassTree(*split, "gini")  # split criterion
knn_pred = KNN(*split, 8)  # number of neighbours
logred_pred = LogReg(*split, 0.1)  # regularization strength (C = 1 / 0.1)
ann_pred = ann(*split, 256)  # hidden units

# Empty frame that collects the predictions in the following cell.
df = pd.DataFrame()

100%|██████████| 10000/10000 [00:18<00:00, 531.93it/s]


In [26]:
# One column per model plus the ground truth, added in display order.
prediction_columns = {
    "Baseline": baseline_pred,
    "Tree": tree_pred,
    "KNN": knn_pred,
    "LogReg": logred_pred,
    "ANN": ann_pred,
    "True Values": y_test,
}
for column_name, column_values in prediction_columns.items():
    df[column_name] = column_values

# Persist the per-sample predictions for later analysis.
df.to_csv("protected_group_predictions.csv")

In [22]:
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score
# accuracy_score expects (y_true, y_pred). Accuracy itself is symmetric, but
# keeping the documented order avoids silently wrong numbers if these calls
# are later switched to precision/recall/F1.
baseline_acc = accuracy_score(y_test, baseline_pred)
tree_acc = accuracy_score(y_test, tree_pred)
knn_acc = accuracy_score(y_test, knn_pred)
logreg_acc = accuracy_score(y_test, logred_pred)
ann_acc = accuracy_score(y_test, ann_pred)

baseline_acc, tree_acc, knn_acc, logreg_acc, ann_acc

(0.8055555555555556,
 0.8055555555555556,
 0.7361111111111112,
 0.8055555555555556,
 0.8055555555555556)