In [1]:
import json
import glob
import numpy as np
import os
import pandas as pd
import sys


In [2]:
label1 = 'runway'
label2 = 'street'
spurious = 'airplane'
main_dir = './2-Models/Models/{}-{}/{}'.format(label1, label2, spurious)

# Maps each training-mode directory name to the label shown in the tables.
# The key order also fixes the row order of the final table.
MODES = {
    'initial-tune': 'Baseline',
    'combined-transfer-ptune': 'SPIRE',
    'fs-tune-ptune': 'FS',
    'added-transfer-ptune': 'Adding Only',
    'removed-transfer-ptune': 'Removing Only',
}

# Per-trial files that are merged: Accuracy results and Search (counterfactual) results
RESULT_FILES = ['results.json', 'counterfactual.json']


def _collect_mode_metrics(mode_dir):
    """Gather the raw metric values of every trial under one mode directory.

    Args:
        mode_dir: path of a training-mode directory containing `trial*` sub-directories.

    Returns:
        dict mapping each metric name to the list of values found for it
        across all trial files that could be read.
    """
    values = {}
    for trial_dir in glob.glob('{}/trial*'.format(mode_dir)):
        for file in RESULT_FILES:
            path = os.path.join(trial_dir, file)
            try:
                with open(path, 'r') as f:
                    trial_metrics = json.load(f)
            except (FileNotFoundError, NotADirectoryError):
                # Best effort: a trial is allowed to lack one of the result files
                continue
            except (OSError, ValueError) as err:
                # Unreadable or malformed JSON (JSONDecodeError is a ValueError):
                # skip the file, but do not hide the problem
                print('Skipping {}: {}'.format(path, err), file=sys.stderr)
                continue
            if not isinstance(trial_metrics, dict):
                print('Skipping {}: expected a JSON object'.format(path), file=sys.stderr)
                continue
            for key, value in trial_metrics.items():
                values.setdefault(key, []).append(value)
    return values


# Collect the data for each training mode, summarised as 'mean (std)' across the trials
data = {}
for mode_dir in glob.glob('{}/*'.format(main_dir)):
    # basename copes with both '/' and OS-native separators returned by glob
    mode = os.path.basename(os.path.normpath(mode_dir))
    data[mode] = {
        key: '{} ({})'.format(np.round(np.mean(vals), 3), np.round(np.std(vals), 3))
        for key, vals in _collect_mode_metrics(mode_dir).items()
    }

if not data:
    print('No training-mode directories found under {}'.format(main_dir), file=sys.stderr)

# Convert the nested dictionary into a table
modes = sorted(data)

# Union of the metrics reported by any mode, in first-seen order, so that one
# mode with missing results neither hides columns nor raises a KeyError
metrics = []
for mode in modes:
    for metric in data[mode]:
        if metric not in metrics:
            metrics.append(metric)

df = pd.DataFrame(
    {metric: [data[mode].get(metric, np.nan) for mode in modes] for metric in metrics},
    index=pd.Index(modes, name='Mode'),
)

# Keep only the known modes, in the order of MODES (absent modes become NaN rows),
# then swap the directory names for their display labels in the 'Mode' column only
df = df.reindex(pd.Index(list(MODES), name='Mode')).reset_index()
df['Mode'] = df['Mode'].map(MODES)


In [3]:

# Each report section: its heading and the metric columns shown next to 'Mode'
report_sections = [
    ('Accuracy on Original data', ['1s', '0ns']),
    ('Accuracy on External data', ['c-1s', 'c-1ns', 'c-0s', 'c-0ns']),
    ('Metrics on External data', ['c-r-gap', 'c-h-gap', 'c-p']),
    ('Counterfactual Evaluation', ['1s-spurious/box', '1s-spurious/pixel-paint', '0ns+spurious']),
]

for position, (heading, metric_cols) in enumerate(report_sections):
    # Two blank lines separate consecutive sections; nothing follows the last one
    if position > 0:
        print()
        print()
    print(heading)
    print(df[['Mode'] + metric_cols].to_string(index = False))

Accuracy on Original data
          Mode             1s            0ns
      Baseline  0.956 (0.036)  0.998 (0.002)
         SPIRE  0.805 (0.045)  0.998 (0.002)
            FS  0.926 (0.047)  0.998 (0.001)
   Adding Only  0.796 (0.057)  0.999 (0.001)
 Removing Only  0.956 (0.023)  0.997 (0.001)


Accuracy on External data
          Mode           c-1s          c-1ns           c-0s      c-0ns
      Baseline  0.908 (0.065)  0.483 (0.145)   0.752 (0.08)  1.0 (0.0)
         SPIRE  0.723 (0.048)  0.756 (0.105)  0.958 (0.036)  1.0 (0.0)
            FS  0.871 (0.056)  0.502 (0.139)  0.758 (0.077)  1.0 (0.0)
   Adding Only  0.675 (0.036)  0.469 (0.108)  0.956 (0.033)  1.0 (0.0)
 Removing Only  0.935 (0.045)  0.833 (0.068)  0.712 (0.071)  1.0 (0.0)


Metrics on External data
          Mode         c-r-gap        c-h-gap            c-p
      Baseline   0.425 (0.138)   0.248 (0.08)  0.286 (0.062)
         SPIRE  -0.033 (0.083)  0.042 (0.036)  0.746 (0.175)
            FS   0.369 (0.125)  0.242 (0