In [44]:
import pandas as pd
import os
import numpy as np
from sklearn.metrics import precision_score, confusion_matrix, recall_score, roc_auc_score, average_precision_score, f1_score
import re
import json

In [45]:
def compute_metrics(model, file, hyperparameters, threshold):
    """Score one model's saved predictions and return a single summary row.

    Parameters
    ----------
    model : str
        Model/architecture name; copied into the output row unchanged.
    file : str
        Path to a CSV (read with ``pd.read_csv``) that has ``y_true`` and
        ``y_pred`` columns. The code treats ``y_true`` as 0/1 labels and
        ``y_pred`` as a score that is rounded to obtain the predicted class
        (presumably a probability in [0, 1] -- confirm against the producer
        of the results file).
    hyperparameters : dict
        Must contain the keys ``'lr'`` and ``'weight_decay'``.
    threshold : float
        Stricter cut-off; a prediction counts as positive only when
        ``y_pred > threshold``.

    Returns
    -------
    list
        ``[model, model_num, precision, recall, f1, num_TP, num_FP, pr_auc,
        roc_auc, correct_pred_confidence, new_num_TP, new_num_FP, lr,
        weight_decay]``. ``correct_pred_confidence`` is a ``"mean ± std"``
        string; ``model_num`` is a string, or ``None`` when the path holds
        no number at all.
    """
    # Prefer the number attached to the closest "model_<n>" path component;
    # taking the first number anywhere in the path picks up unrelated digits
    # from parent directories (e.g. a user name or a dated folder).
    path_text = str(file)
    tagged_numbers = re.findall(r'model_(\d+\.?\d*)', path_text)
    if tagged_numbers:
        model_num = tagged_numbers[-1]
    else:
        loose_numbers = re.findall(r'\d+\.?\d*', path_text)
        model_num = loose_numbers[0] if loose_numbers else None

    df = pd.read_csv(file)

    y_true = df['y_true']
    y_pred = df['y_pred']
    # Integer class labels keep the label type consistent with labels=[0, 1].
    y_pred_round = np.around(y_pred).astype(int)

    precision = precision_score(y_true, y_pred_round, zero_division=0)
    recall = recall_score(y_true, y_pred_round, zero_division=0)
    f1 = f1_score(y_true, y_pred_round, zero_division=0)

    # labels=[0, 1] forces a 2x2 matrix; without it a file in which only one
    # class occurs yields a 1x1 matrix and the [1, 1] lookup raises IndexError.
    cm = confusion_matrix(y_true.to_numpy(), y_pred_round.to_numpy(), labels=[0, 1])
    num_TP = cm[1, 1]
    num_FP = cm[0, 1]

    pr_auc = average_precision_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_pred)

    # "Confidence" of a correct prediction = 1 - |label - score|.
    is_correct = (y_pred_round == y_true)
    confidence = 1 - (y_true[is_correct] - y_pred[is_correct]).abs()
    correct_pred_confidence = str(confidence.mean()) + " ± " + str(confidence.std())

    # Re-count TP/FP under the stricter, caller-supplied threshold.
    new_y_pred = (y_pred > threshold).astype(int)
    new_cm = confusion_matrix(y_true.to_numpy(), new_y_pred.to_numpy(), labels=[0, 1])
    new_num_TP = new_cm[1, 1]
    new_num_FP = new_cm[0, 1]

    return [model, model_num, precision, recall, f1, num_TP, num_FP, pr_auc, roc_auc,
            correct_pred_confidence, new_num_TP, new_num_FP,
            hyperparameters['lr'], hyperparameters['weight_decay']]

In [46]:
def analyze_data(model, threshold, base_dir='/Users/gabrielgreenstein/Downloads/', min_precision=0.85):
    """Collect metrics for every ``model_*`` run of one architecture.

    Scans ``<base_dir>/<model>/hyperparameter_optimization`` for sub-folders
    whose name starts with ``model_``. For each one that has both a
    ``configure.json`` and ``val_model_on_infer_set/results.txt`` it calls
    ``compute_metrics`` and keeps the returned row.

    Parameters
    ----------
    model : str
        Architecture name; also the folder name under ``base_dir``.
    threshold : float
        Forwarded to ``compute_metrics``.
    base_dir : str, optional
        Root folder that contains one sub-folder per architecture. Defaults
        to the location the notebook was originally written against.
    min_precision : float, optional
        Rows with precision below this value are dropped (default 0.85).

    Returns
    -------
    pandas.DataFrame
        One row per retained run, sorted by precision (highest first), with a
        fresh 0..n-1 index and missing values replaced by 0.
    """
    directory = os.path.join(base_dir, model, 'hyperparameter_optimization')

    columns = ['model', 'model_num', 'precision', 'recall', 'f1', 'num_TP', 'num_FP', 'pr_auc', 'roc_auc',
               'correct_pred_confidence', 'new_num_TP', 'new_num_FP', 'learning_rate', 'weight_decay']
    rows = []

    # sorted(): os.listdir order is arbitrary, so sort for reproducible runs.
    for model_folder in sorted(os.listdir(directory)):
        model_folder_path = os.path.join(directory, model_folder)
        if not (os.path.isdir(model_folder_path) and model_folder.startswith('model_')):
            continue

        val_model_folder = os.path.join(model_folder_path, 'val_model_on_infer_set')
        if not os.path.isdir(val_model_folder):
            print(f"'val_model_on_infer_set' folder not found in {model_folder_path}")
            continue

        results_file = os.path.join(val_model_folder, 'results.txt')
        if not os.path.isfile(results_file):
            print(f"'results.txt' not found in {val_model_folder}")
            continue

        # Without its own config a run must be skipped: otherwise the loop
        # would either hit an undefined name or silently reuse the previous
        # run's hyperparameters.
        config_file = os.path.join(model_folder_path, 'configure.json')
        if not os.path.isfile(config_file):
            print(f"'configure.json' not found in {model_folder_path}")
            continue

        with open(config_file) as config_handle:
            hyperparameters = json.load(config_handle)

        rows.append(compute_metrics(model, results_file, hyperparameters, threshold))

    all_runs = pd.DataFrame(rows, columns=columns)
    # fillna returns a new frame, so it has to be part of the chain to have
    # any effect.
    return (
        all_runs[all_runs['precision'] >= min_precision]
        .sort_values(by=['precision'], ascending=False, kind='stable')
        .reset_index(drop=True)
        .fillna(0)
    )

In [None]:
# Architectures to evaluate; swap in the full list to compare all of them.
# models = ['AttentiveFP', 'Weave', 'MPNN', 'GAT']
models = ['MPNN']
# Stricter decision threshold used for the new_num_TP / new_num_FP columns.
threshold = 0.99

# Gather one frame per architecture and concatenate once: growing a frame
# inside the loop re-copies it every iteration, and concatenating onto an
# empty DataFrame is deprecated in recent pandas.
per_model_frames = []
for model in models:
    result = analyze_data(model, threshold)
    per_model_frames.append(result)

# pd.concat rejects an empty list, so fall back to an empty frame.
df = pd.concat(per_model_frames, ignore_index=True) if per_model_frames else pd.DataFrame()

In [43]:
# Show the 25 rows with the highest PR-AUC, best first (df itself is not modified).
df.sort_values('pr_auc', ascending=False).iloc[:25]

Unnamed: 0,model,model_num,precision,recall,f1,num_TP,num_FP,pr_auc,roc_auc,correct_pred_confidence,new_num_TP,new_num_FP,learning_rate,weight_decay


In [68]:
# Write the collected results to disk; index=False omits the row index column.
df.to_csv(path_or_buf='best_models.csv', index=False)