In [1]:

from scipy.stats import pearsonr
import json
import glob
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import sys

from Misc import get_sc_strength, get_d_gap, get_h_gap


In [2]:
main_dir = './Models'

# Collect the data for each training mode.  When this loop finishes, `data` maps
#   mode name -> {metric name -> 'mean (std)' string aggregated over the trials}
data = {}
for mode_dir in glob.glob('{}/*'.format(main_dir)):
    # The glob also matches stray files sitting next to the mode folders
    if not os.path.isdir(mode_dir):
        continue
    # basename (instead of splitting on '/') also copes with Windows separators
    mode = os.path.basename(os.path.normpath(mode_dir))

    # Aggregate the data for that mode across the trials
    data_mode = {}
    for trial_dir in glob.glob('{}/trial*'.format(mode_dir)):
        if not os.path.isdir(trial_dir):
            continue
        # Include both Accuracy and Search results
        for file in ['results.json', 'search.json']:
            with open(os.path.join(trial_dir, file), 'r') as f:
                data_tmp = json.load(f)
            for key in data_tmp:
                data_mode.setdefault(key, []).append(data_tmp[key])

    # We want the average: replace each list of per-trial values with a
    # 'mean (std)' string rounded to 3 decimals
    for key in data_mode:
        data_tmp = data_mode[key]
        data_mode[key] = '{} ({})'.format(np.round(np.mean(data_tmp), 3), np.round(np.std(data_tmp), 3))

    data[mode] = data_mode

# Convert the nested dictionary into a csv
modes = sorted(data)
if not modes:
    # Fail with a readable message instead of an IndexError on modes[0]
    raise RuntimeError('No training modes were collected; `data` is empty')
# The alphabetically first mode defines which metrics get tabulated
metrics = list(data[modes[0]])

# Group the results by pair
with open('./FindSCs.json', 'r') as f:
    pairs = json.load(f)

# metric_groups maps a group name (one csv per group) to a list of
# (column name, metric key) tuples
metric_groups = {}
metric_groups['avg'] = [('MAP', 'MAP'), ('MAR', 'MAR')]

for pair in pairs:
    n = len(pair)
    # Pair keys are split on '-' into the main and the spurious object name
    parts = pair.split('-')
    main = parts[0]
    spurious = parts[1]
    n_main = len(main)
    tmp = []
    for metric in metrics:
        if metric.startswith(pair):
            # Drop the pair prefix and the single separator character after it
            name = metric[n+1:]
            # Strip a leading main-object name, but only when something follows
            # it; a suffix that is exactly the main name is kept and becomes
            # 'main' below (indexing name[0] on the empty remainder used to
            # raise IndexError)
            if name.startswith(main) and len(name) > n_main:
                name = name[n_main:]
                # Keep a leading '+', otherwise drop the separator character
                if name[0] != '+':
                    name = name[1:]
            name = name.replace(main, 'main')
            name = name.replace(spurious, 'spurious')
            tmp.append((name, metric))
    metric_groups[pair] = tmp

# Save the results: one csv per metric group, one row per mode
# Create the output folder up front so to_csv does not fail on a fresh checkout
os.makedirs('./Analysis', exist_ok=True)
for group in metric_groups:

    df = pd.DataFrame()
    df['Mode'] = modes
    for name, metric in metric_groups[group]:
        # Column `name` holds the 'mean (std)' string of `metric` for each mode,
        # in the same order as the 'Mode' column
        df[name] = [data[mode][metric] for mode in modes]

    df.to_csv('./Analysis/{}.csv'.format(group), index = False)

In [3]:
# Show the results for each object pair and then the aggregated differences

def remove_var_info(df):
    """Return a copy of `df` where every cell keeps only its first token.

    Each cell is converted to text and cut at the first space, so a value such
    as '0.741 (0.012)' becomes '0.741'.  Cells without a space are returned
    unchanged (as strings).  The input frame is not modified.
    """
    def remove(x):
        # str() so cells that pandas parsed as numbers do not break on .split
        return str(x).split(' ')[0]
    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map.  Test the class, not the instance, so that a column that
    # happens to be called 'map' cannot shadow the method on older versions.
    if hasattr(pd.DataFrame, 'map'):
        return df.map(remove)
    return df.applymap(remove)

# Object pairs to report; each inner list shares the same second object
pair_groups = [
    ["surfboard-person", "tennis+racket-person", "skateboard-person", "baseball+glove-person", "baseball+bat-person", "sports+ball-person", "skis-person", "tie-person", "knife-person", "toothbrush-person"],
    ["fork-dining+table", "knife-dining+table", "cup-dining+table", "broccoli-dining+table"],
    ["person-airplane"],
]

# The two training modes being compared
modes = ['aug-tp-transfer', 'initial-tune']

# Columns selected from each csv; 'Mode' identifies the row, the rest are metrics
cols_acc = ['Mode', 'both', 'just_main', 'just_spurious', 'neither', 'b-precision', 'b-recall', 'b-f1']
cols_search = ['Mode', 'main-pixel-paint', 'spurious-pixel-paint', '+spurious']

# One independent, initially empty list of differences per metric column
diffs_acc = {metric: [] for metric in cols_acc[1:]}
diffs_search = {metric: [] for metric in cols_search[1:]}
        
def _append_mode_diffs(df_tmp, diffs, modes):
    """Append `modes[0] - modes[1]` to `diffs[metric]` for every metric in `diffs`.

    df_tmp: frame with a 'Mode' column and one column of numeric strings per
            metric (the output of remove_var_info).
    diffs:  dict mapping metric name -> list of differences; updated in place.
    modes:  the two mode names to compare, minuend first.

    Rows are looked up by mode name, so the sign of the difference does not
    depend on the row order of the csv.  A metric is skipped when either mode
    reports -1, which appears to mark a value that was not computed.
    """
    by_mode = df_tmp.set_index('Mode')
    if any(mode not in by_mode.index for mode in modes[:2]):
        print('Skipping differences, missing one of the modes: {}'.format(modes[:2]))
        return
    for metric in diffs:
        values = [float(by_mode.loc[mode, metric]) for mode in modes[:2]]
        # Comparing the plain list to -1 (`values == -1`) is always False, so
        # the placeholder has to be checked element by element
        if any(v == -1 for v in values):
            continue
        diffs[metric].append(values[0] - values[1])


for group in pair_groups:
    print()
    print()
    for pair in group:

        pair_info = pairs[pair]

        print(pair)
        print('SC Strength: ', np.round(get_sc_strength(pair_info), 3))
        print('Detection Gap: ', np.round(get_d_gap(pair_info), 3))
        print('Hallucination Gap: ', np.round(get_h_gap(pair_info), 3))

        df = pd.read_csv('./Analysis/{}.csv'.format(pair))

        print('Acc Comparison')
        df_tmp = remove_var_info(df.loc[df['Mode'].isin(modes), cols_acc].copy())
        print(df_tmp.to_string(index = False))
        _append_mode_diffs(df_tmp, diffs_acc, modes)

        print('Search Comparison')
        df_tmp = remove_var_info(df.loc[df['Mode'].isin(modes), cols_search].copy())
        print(df_tmp.to_string(index = False))
        _append_mode_diffs(df_tmp, diffs_search, modes)

        print()

print()
print()
print()
print()
print('Average Differences:')

# Mean of (modes[0] - modes[1]) over all pairs that had a valid value
for diffs in (diffs_acc, diffs_search):
    for key in diffs:
        # A metric with no valid pair has no mean; report nan rather than
        # letting np.mean warn on an empty list
        mean_diff = np.mean(diffs[key]) if diffs[key] else np.nan
        print(key, np.round(mean_diff, 3))



surfboard-person
SC Strength:  0.436
Detection Gap:  0.467
Hallucination Gap:  0.006
Acc Comparison
            Mode   both just_main just_spurious neither b-precision b-recall   b-f1
 aug-tp-transfer  0.741     0.569         0.997   0.998       0.893    0.655  0.754
    initial-tune  0.801     0.539         0.994   0.998       0.844     0.67  0.747
Search Comparison
            Mode main-pixel-paint spurious-pixel-paint +spurious
 aug-tp-transfer            0.127                0.144      -1.0
    initial-tune            0.093                0.141      -1.0

tennis+racket-person
SC Strength:  0.461
Detection Gap:  0.509
Hallucination Gap:  0.007
Acc Comparison
            Mode   both just_main just_spurious neither b-precision b-recall   b-f1
 aug-tp-transfer  0.738     0.485         0.998   0.999       0.935    0.612   0.74
    initial-tune  0.866     0.412         0.995     1.0       0.907    0.639  0.749
Search Comparison
            Mode main-pixel-paint spurious-pixel-paint +sp