In [1]:
def get_ontology_indices(ont_rem_file, num_obs):
    """Split the index range ``0 .. num_obs*3 - 1`` into backup and ontology parts.

    Parameters
    ----------
    ont_rem_file : path or file-like
        Space-delimited numeric file, read with ``np.genfromtxt``. Its values are
        the indices of the data deferred to the neural-network backup.
    num_obs : int
        Number of observations; the full index range has ``num_obs * 3`` entries.

    Returns
    -------
    tuple
        ``(indices_backup, indices_ont)``: the values read from the file, and the
        sorted values of the full range that do not occur among them (the indices
        of data predicted by the ontology).
    """
    # Indices deferred to the neural-network backup, exactly as stored in the file.
    backup_idx = np.genfromtxt(ont_rem_file, delimiter=' ')

    # Whatever is left of the full range is predicted by the ontology.
    ontology_idx = np.setdiff1d(np.arange(3 * num_obs), backup_idx)

    return backup_idx, ontology_idx

In [2]:
import numpy as np

# For each dataset (2016, then 2015) load the indices deferred to the neural-network
# backup (rem_*) and the complementary indices predicted by the ontology (ont_*).
# The second argument is the number of observations; get_ontology_indices builds the
# full index range as num_obs * 3.
rem_2016, ont_2016 = get_ontology_indices('data/rem2016.csv', 650)
rem_2015, ont_2015 = get_ontology_indices('data/rem2015.csv', 597)

In [3]:
# Get prediction-truth pairs 
def get_pred(file_path):
    """Read the first two tab-separated fields of every line of an eval file.

    According to the surrounding notebook comment these two fields form a
    prediction-truth pair.

    Parameters
    ----------
    file_path : str
        Path of a text file with one tab-separated record per line.

    Returns
    -------
    numpy.ndarray
        String array of shape ``(n_records, 2)``; shape ``(0, 2)`` when the file
        holds no records.

    Raises
    ------
    ValueError
        If a non-empty line has fewer than two tab-separated fields.

    Notes
    -----
    Empty lines (for example a trailing newline at the end of the file) are
    skipped and therefore do not count as rows.
    """
    pairs = []
    with open(file_path) as f:
        for line_no, line in enumerate(f, start=1):
            # Drop only the line terminator: in a two-column file the newline would
            # otherwise stay attached to the second field and break comparisons.
            record = line.rstrip('\r\n')
            if not record:
                continue
            fields = record.split('\t')
            if len(fields) < 2:
                raise ValueError(
                    f'{file_path}: line {line_no} has fewer than two tab-separated fields'
                )
            # Slice per row so lines with differing numbers of extra columns
            # still produce a rectangular array.
            pairs.append(fields[:2])

    if not pairs:
        return np.empty((0, 2), dtype=str)
    return np.array(pairs)
    
# Filter out examples that have already been predicted by the ontology
def mask_predictions(pred_array, rem_array):
    """Keep only the rows of ``pred_array`` whose index is listed in ``rem_array``.

    Row ``i`` is kept when the value ``i * 3`` occurs in ``rem_array``; all other
    rows (per the notebook comment: examples already predicted by the ontology)
    are dropped.

    Parameters
    ----------
    pred_array : array-like
        One row per example.
    rem_array : array-like
        Index values of the examples left to the backup model.

    Returns
    -------
    numpy.ndarray
        The selected rows in their original order. When nothing matches, the
        result keeps the trailing dimensions of ``pred_array`` (e.g. ``(0, 2)``),
        so column indexing on it still works.
    """
    pred_array = np.asarray(pred_array)
    # One vectorised membership test instead of scanning rem_array once per row.
    keep = np.isin(np.arange(len(pred_array)) * 3, rem_array)
    return pred_array[keep]

# Get prediction accuracy
def get_acc(pred_array):
    """Return the fraction of rows whose first two columns are equal.

    Parameters
    ----------
    pred_array : numpy.ndarray
        2-D array; column 0 is compared element-wise with column 1.

    Returns
    -------
    Number of matching rows divided by the total number of rows.
    """
    first_col = pred_array[:, 0]
    second_col = pred_array[:, 1]
    n_matches = (first_col == second_col).sum()
    return n_matches / pred_array.shape[0]

def print_result(acc, backup_acc, combined_acc):
    """Print the three accuracies as percentages with two decimals.

    Each argument is a fraction; it is multiplied by 100 before formatting.
    The report consists of three labelled lines followed by blank lines.
    """
    report = (
        f'Neural network accuracy (full sample): {100 * acc:.2f}%\n'
        f'Neural network accuracy (as backup): {100 * backup_acc:.2f}%\n'
        f'Combined ontology and nn accuracy: {100 * combined_acc:.2f}%\n\n'
    )
    print(report)

In [4]:
# Extracting eval files

In [5]:
import os
import glob


def get_eval_paths(directory_path):
    """Collect, per model sub-directory, the eval file of the highest epoch.

    Every immediate sub-directory of ``directory_path`` is searched for files
    named ``epoch_<n>_eval.txt``. The file with the largest integer ``<n>`` is
    recorded under the first four characters of the sub-directory name (the year).

    Parameters
    ----------
    directory_path : str
        Directory whose sub-directories hold the eval files.

    Returns
    -------
    dict
        Maps the year prefix to a list of file paths. The keys ``'2015'`` and
        ``'2016'`` are always present; any other prefix gets a key only when a
        matching sub-directory contains eval files. Sub-directories are visited
        in sorted order, so the lists are deterministic.

    Notes
    -----
    The returned path is the one found on disk, so zero-padded names such as
    ``epoch_03_eval.txt`` are preserved. Files whose epoch token is not an
    integer are ignored.
    """
    file_paths = {'2015': [], '2016': []}

    for subdir in sorted(os.listdir(directory_path)):
        subdir_path = os.path.join(directory_path, subdir)
        if not os.path.isdir(subdir_path):
            continue

        # Escape the directory part so characters like '[' in a model name are
        # matched literally instead of being read as glob syntax.
        pattern = os.path.join(glob.escape(subdir_path), 'epoch_*_eval.txt')

        best_epoch, best_file = None, None
        for txt_file in glob.glob(pattern):
            stem = os.path.splitext(os.path.basename(txt_file))[0]
            try:
                epoch = int(stem.split('_')[1])
            except ValueError:
                # e.g. 'epoch_final_eval.txt': matches the pattern but has no epoch number.
                continue
            if best_epoch is None or epoch > best_epoch:
                best_epoch, best_file = epoch, txt_file

        if best_file is not None:
            file_paths.setdefault(subdir[:4], []).append(best_file)

    return file_paths

In [6]:
# NOTE(review): presumably the root folder of the trained models; it is not
# referenced by any other visible cell.
directory = 'test_models'

# Ontology accuracies per dataset year:
#   accuracyOnt<year>       accuracy if the ontology is used with the backup model
#   accuracyOnt<year>_full  accuracy if only the ontology is used
accuracyOnt2015 = 0.827703
accuracyOnt2015_full = 0.658291
accuracyOnt2016 = 0.868159
accuracyOnt2016_full = 0.783077

# Year-keyed views of the same numbers. eval_file weights accuracyOnt[year] by the
# number of ontology predictions when computing the combined accuracy.
accuracyOnt = {'2015': accuracyOnt2015, '2016': accuracyOnt2016}
accuracyOnt_full = {'2015': accuracyOnt2015_full, '2016': accuracyOnt2016_full}  # accuracy if only the ontology is used

def eval_file(f, year):
    """Evaluate one eval file on its own and combined with the ontology.

    Parameters
    ----------
    f : str
        Path of the eval file, read with ``get_pred``.
    year : str
        ``'2015'`` or ``'2016'``; selects the remaining-index array and the
        number of observations of that dataset.

    Returns
    -------
    tuple
        ``(acc, backup_acc, combined_acc)``:

        * ``acc`` is the accuracy over every row of the file;
        * ``backup_acc`` is the accuracy over the rows kept by ``mask_predictions``;
        * ``combined_acc`` is the average of ``accuracyOnt[year]`` and
          ``backup_acc``, weighted by the ontology and backup prediction counts.

    Raises
    ------
    ValueError
        If ``year`` is not one of the supported values.
    """
    if year == '2015':
        rem, num_obs = rem_2015, 597
    elif year == '2016':
        rem, num_obs = rem_2016, 650
    else:
        # Without this branch an unknown year would surface later as an
        # UnboundLocalError on `rem`, which hides the real cause.
        raise ValueError(f"Unsupported year {year!r}; expected '2015' or '2016'")

    # rem is the remaining-position vector; the code counts three entries per observation.
    num_backup_predictions = len(rem) // 3

    # Total number of observations minus those deferred to the backup model.
    num_ont_predictions = num_obs - num_backup_predictions

    # Get predictions
    pred_arr = get_pred(f)
    backup_pred_arr = mask_predictions(pred_arr, rem)

    # Get accuracy
    acc = get_acc(pred_arr)
    backup_acc = get_acc(backup_pred_arr)

    combined_acc = (num_ont_predictions * accuracyOnt[year] + num_backup_predictions * backup_acc) / num_obs

    return acc, backup_acc, combined_acc

def f1_file(f):
    """Load the prediction pairs of ``f`` (unfinished stub).

    NOTE(review): despite its name, this function computes no F1 score. It only
    calls ``get_pred(f)``, discards the result and implicitly returns ``None``.
    Either finish the implementation or remove the function.
    """
    pred_arr = get_pred(f)


def eval_files_from_directory(directory):
    """Evaluate every ``.txt`` eval file that lies directly inside ``directory``.

    The first four characters of a file name are taken as its year. Files whose
    year is not ``'2015'`` or ``'2016'``, files without the ``.txt`` suffix and
    sub-directories are skipped. Each remaining file is evaluated with
    ``eval_file``, so both functions use the same formula (including the
    division of the combined accuracy by the number of observations).

    Parameters
    ----------
    directory : str
        Directory containing the eval files.

    Returns
    -------
    dict
        Maps each evaluated file name to the ``(acc, backup_acc, combined_acc)``
        tuple returned by ``eval_file``, in sorted file-name order. Empty when
        no file qualifies.
    """
    supported_years = ('2015', '2016')
    results = {}

    for f in sorted(os.listdir(directory)):
        year = f[:4]
        if not f.endswith('.txt') or year not in supported_years:
            continue

        file_path = os.path.join(directory, f)
        if not os.path.isfile(file_path):
            continue

        # Collect every file instead of returning after the first one.
        results[f] = eval_file(file_path, year)

    return results
        

            

In [7]:
# NOTE(review): the variable is called "best_2016_f", but the path points at a model
# directory whose name starts with "2015" and it is evaluated with year '2015' below.
# Confirm which year is intended and rename accordingly.
best_2016_f = 'test_models/2015tri_gcn+concat_seed65_reg0.027059715881067578_drop0.2285714285714286_cdrop0.2285714285714286_lr1.1122448979591838e-05_tgcnTrue_semgcnTrue_lexgcnTrue_knogcnTrue_epochs15_adam/epoch_1_eval.txt'
# Last expression of the cell: shows the (acc, backup_acc, combined_acc) tuple from eval_file.
eval_file(best_2016_f, '2015')

(0.8207705192629816, 0.8239202657807309, 0.8257957922948074)