In [1]:
import os, random, copy
import seaborn as sns
import pandas as pd
import numpy as np
from os.path import join as opj
import matplotlib.pyplot as plt
from utils import ci_95, calc_p, calc_diff_p
from scipy.io import loadmat
%matplotlib inline
# Seed Python's built-in `random` module so any draws made through it are repeatable.
# NOTE(review): this does not seed NumPy's generator; nothing in the visible cells
# calls `random` directly, so confirm whether `utils` relies on this seed.
random.seed(9)
%autosave 5

# --- Paths -----------------------------------------------------------------
# Everything is resolved relative to the directory the notebook is run from.
CUR_DIR = os.getcwd()
DATA_DIR = opj(CUR_DIR, 'data')
BOOTSTRAP_DIR = opj(DATA_DIR, 'parsed_questions_bootstrap')

# --- Model identifiers -----------------------------------------------------
# MODELS is the full list; the two shorter lists are derived from it so the
# three can never drift apart.
MODELS = ['human', 'woven', 'dnn', 'wovenab', 'wovenab2', 'dnn2']
# Every entry except the human reference.
MODELS_WITHOUT_HUMAN = [name for name in MODELS if name != 'human']
# The models that get compared against 'woven'.
BASELINE_MODELS = [name for name in MODELS_WITHOUT_HUMAN if name != 'woven']

Autosaving every 5 seconds


# 1. Matching Stiffness

In [2]:
# Task analysed by the cells that follow.
COND = 'stiff'  # mass|stiff
# Load the table stored in parsed_questions_<COND>.csv under DATA_DIR.
cond_csv_path = opj(DATA_DIR, f'parsed_questions_{COND}.csv')
COND_F = pd.read_csv(cond_csv_path)

## 1.0) Compare average accuracy

In [3]:
# Compare the humans' mean accuracy with each model's mean accuracy for the
# current task and print one p-value per model.

# Report labels, keyed by the model prefix of the '<model>_acc' columns.
model_labels = {
    'woven': 'Woven',
    'dnn': 'DNN',
    'wovenab': 'Woven-ablation',
    'wovenab2': 'Ablation+',
    'dnn2': 'best-corr. DNN',
}

print("@@ Comparing the average accuracy between humans and the models (human acc - model acc).")
print("   The significance level is determined using 1000 bootstrap samples.")
print(f"   Task: {COND}")

# One row per 'iter_sub' value holding the mean of every numeric column.
# numeric_only=True keeps this working on pandas >= 2.0, where averaging a
# non-numeric column raises instead of silently dropping it.
all_acc = COND_F.groupby(['iter_sub']).mean(numeric_only=True).reset_index()
human_acc = all_acc['human_acc']

# NOTE(review): the saved output of this cell reports values above 1
# (e.g. 1.084), which is not a valid probability. calc_p comes from utils and
# is not visible here -- verify its formula.
for _model in MODELS_WITHOUT_HUMAN:
    model_acc = all_acc[f'{_model}_acc']
    p_value = calc_p(human_acc - model_acc)
    # Look the label up instead of overwriting the loop variable; unknown
    # identifiers are printed as-is.
    print(f"p(human vs. {model_labels.get(_model, _model)}) = {p_value}")


@@ Comparing the average accuracy between humans and the models (human acc - model acc).
   The significance level is determined using 1000 bootstrap samples.
   Task: stiff
p(human vs. Woven) = 1.084
p(human vs. DNN) = 1.022
p(human vs. Woven-ablation) = 0.78
p(human vs. Abaltion+) = 0.354
p(human vs. best-corr. DNN) = 0.016


## 1.1) Comparing the correlation of accuracy levels between models and humans.

In [4]:
# Test whether the Woven-vs-human correlation differs from each baseline
# model's correlation with humans, using the values stored in cor_<COND>.csv.

# Report labels, keyed by the model suffix of the 'cor_<model>' columns.
model_labels = {
    'dnn': 'DNN',
    'wovenab': 'Woven-ablation',
    'wovenab2': 'Ablation+',
    'dnn2': 'best-corr. DNN',
}

print("@@ Correlation of accuracy levels between human and models")
print(f"   Task: {COND}")

conf_f = pd.read_csv(opj(BOOTSTRAP_DIR, f'cor_{COND}.csv'))

# The Woven column is the same for every comparison, so read it once.
cor_woven = conf_f['cor_woven']

for _model in BASELINE_MODELS:
    cor_model = conf_f[f'cor_{_model}']
    p_value = calc_diff_p(cor_woven, cor_model)
    # Look the label up instead of overwriting the loop variable; unknown
    # identifiers are printed as-is.
    print(f"p(cor(Woven, human)-cor({model_labels.get(_model, _model)}, human)) = {p_value}")


@@ Correlation of accuracy levels between human and models
   Task: stiff
p(cor(Woven, human)-cor(DNN, human)) = 0.0
p(cor(Woven, human)-cor(Woven-ablation, human)) = 0.058
p(cor(Woven, human)-cor(Abaltion+, human)) = 0.0
p(cor(Woven, human)-cor(best-corr. DNN, human)) = 0.0


## 1.2) Compare the correlation of models with humans for each scene

In [5]:
# For every value of 'exclude_scene', test whether the Woven correlation
# differs from each baseline model's correlation.

# Report labels, keyed by the model suffix of the 'cor_<model>' columns.
model_labels = {
    'dnn': 'DNN',
    'wovenab': 'Woven-ablation',
    'wovenab2': 'Ablation+',
    'dnn2': 'best-corr. DNN',
}

print("@@ Correlation between human and model accuracy levels by scene configurations")
print(f"   Task: {COND}")

# Reshape from one column per model to long form: (exclude_scene, model, cor).
each_scene_cor_conf_f = pd.read_csv(opj(BOOTSTRAP_DIR, f'cor_each_scene_{COND}.csv'))
each_scene_cor_conf_f = each_scene_cor_conf_f.melt(id_vars=['exclude_scene'],
                                                   value_vars=[f'cor_{i}' for i in MODELS_WITHOUT_HUMAN],
                                                   var_name='model', value_name='cor')

# .unique() yields the scenes in first-appearance order, so the report is
# printed in the same order on every run (iterating a set of strings is not
# stable across kernel restarts).
for _scene in each_scene_cor_conf_f['exclude_scene'].unique():
    print(f"------ Exclude scene = {_scene}")
    df = each_scene_cor_conf_f[each_scene_cor_conf_f['exclude_scene'] == _scene].reset_index(drop=True)
    # The Woven values are shared by every comparison within this scene.
    woven = df[df['model'] == 'cor_woven'].reset_index(drop=True)['cor']
    for i in BASELINE_MODELS:
        other_model = df[df['model'] == f'cor_{i}'].reset_index(drop=True)['cor']
        p_value = calc_diff_p(woven, other_model)
        # Look the label up instead of overwriting the loop variable.
        print(f"p(Woven-{model_labels.get(i, i)}) = {p_value}")

@@ Correlation between human and model accuracy levels by scene configurations
   Task: stiff
------ Exclude scene = wind
p(Woven-DNN) = 0.0
p(Woven-Woven-ablation) = 0.02
p(Woven-Abaltion+) = 0.0
p(Woven-best-corr. DNN) = 0.202
------ Exclude scene = rotate
p(Woven-DNN) = 0.0
p(Woven-Woven-ablation) = 0.866
p(Woven-Abaltion+) = 0.06
p(Woven-best-corr. DNN) = 0.326
------ Exclude scene = drape
p(Woven-DNN) = 0.014
p(Woven-Woven-ablation) = 0.312
p(Woven-Abaltion+) = 0.002
p(Woven-best-corr. DNN) = 0.0
------ Exclude scene = ball
p(Woven-DNN) = 0.0
p(Woven-Woven-ablation) = 0.612
p(Woven-Abaltion+) = 0.008
p(Woven-best-corr. DNN) = 0.272


## 1.3) NMDS correlation

In [6]:
# Load the NMDS correlation arrays for the current task, print a 95% CI per
# model, then test Woven against every other model.
in_file = opj(DATA_DIR, 'nmds', f'{COND}.mat')
nmds_data = loadmat(in_file)

# Report labels, keyed by the variable names stored in the .mat file.
nmds_labels = {
    'cor_gen': 'Woven',
    'cor_cnn': 'DNN',
    'cor_genab': 'Woven-ablation',
    'cor_genab2': 'Ablation+',
    'cor_cnn2': 'best-corr. DNN',
}

models = ['cor_cnn', 'cor_gen', 'cor_genab', 'cor_genab2', 'cor_cnn2']
print("@@ 95% confidence interval for the correlation between the MDS space of the models and that of the humans.")
print(f"   Task: {COND}\n")
for _model in models:
    cur_ci = ci_95(nmds_data[_model])
    # Look the label up instead of overwriting the loop variable; unknown
    # keys are printed as-is.
    print(f"   95% CI for r(human, {nmds_labels.get(_model, _model)}) : {cur_ci}")

#####
## Significance test
print("\n@@ Statistical comparison of the correlation between Woven and humans against the correlation between other models and humans.")
print(f"   Task: {COND}\n")
models = ['cor_cnn', 'cor_genab', 'cor_genab2', 'cor_cnn2']
# [0] selects the first row of the stored array -- presumably the arrays are
# 1 x N as loaded from MATLAB; TODO confirm. The Woven row is reused for
# every comparison, so take it once.
woven_cor = nmds_data['cor_gen'][0]
for _model in models:
    p_value = calc_diff_p(woven_cor, nmds_data[_model][0])
    print(f"   cor(Woven, human)-cor({nmds_labels.get(_model, _model)}, human) : p={p_value}")
    

@@ 95% confidence interval for the correlation between the MDS space of the models and that of the humans.
   Task: stiff

   95% CI for r(human, DNN) : [0.22943982 0.26404008]
   95% CI for r(human, Woven) : [0.41050466 0.77594403]
   95% CI for r(human, Woven-ablation) : [0.29873198 0.53521945]
   95% CI for r(human, Abaltion+) : [0.32608489 0.48563307]
   95% CI for r(human, best-corr. DNN) : [0.38401005 0.55263503]

@@ Statistical comparison of the correlation between Woven and humans against the correlation between other models and humans.
   Task: stiff

   cor(Woven, human)-cor(DNN, human) : p=0.0
   cor(Woven, human)-cor(Woven-ablation, human) : p=0.096
   cor(Woven, human)-cor(Abaltion+, human) : p=0.028
   cor(Woven, human)-cor(best-corr. DNN, human) : p=0.092


# 2. Matching Mass

In [7]:
# Task analysed by the cells that follow.
COND = 'mass'  # mass|stiff
# Load the table stored in parsed_questions_<COND>.csv under DATA_DIR.
cond_csv_path = opj(DATA_DIR, f'parsed_questions_{COND}.csv')
COND_F = pd.read_csv(cond_csv_path)

## 2.0) Compare average accuracy

In [8]:
# Compare the humans' mean accuracy with each model's mean accuracy for the
# current task and print one p-value per model.

# Report labels, keyed by the model prefix of the '<model>_acc' columns.
model_labels = {
    'woven': 'Woven',
    'dnn': 'DNN',
    'wovenab': 'Woven-ablation',
    'wovenab2': 'Ablation+',
    'dnn2': 'best-corr. DNN',
}

print("@@ Comparing the average accuracy between humans and the models (human acc - model acc).")
print("   The significance level is determined using 1000 bootstrap samples.")
print(f"   Task: {COND}")

# One row per 'iter_sub' value holding the mean of every numeric column.
# numeric_only=True keeps this working on pandas >= 2.0, where averaging a
# non-numeric column raises instead of silently dropping it.
all_acc = COND_F.groupby(['iter_sub']).mean(numeric_only=True).reset_index()
human_acc = all_acc['human_acc']

# NOTE(review): calc_p comes from utils and is not visible here. The saved
# output of the stiffness run of this same analysis contains values above 1,
# so confirm that calc_p returns a valid p-value.
for _model in MODELS_WITHOUT_HUMAN:
    model_acc = all_acc[f'{_model}_acc']
    p_value = calc_p(human_acc - model_acc)
    # Look the label up instead of overwriting the loop variable; unknown
    # identifiers are printed as-is.
    print(f"p(human vs. {model_labels.get(_model, _model)}) = {p_value}")


@@ Comparing the average accuracy between humans and the models (human acc - model acc).
   The significance level is determined using 1000 bootstrap samples.
   Task: mass
p(human vs. Woven) = 0.84
p(human vs. DNN) = 0.184
p(human vs. Woven-ablation) = 0.882
p(human vs. Abaltion+) = 0.996
p(human vs. best-corr. DNN) = 0.032


## 2.1) Comparing the correlation of accuracy levels between models and humans.

In [9]:
# Test whether the Woven-vs-human correlation differs from each baseline
# model's correlation with humans, using the values stored in cor_<COND>.csv.

# Report labels, keyed by the model suffix of the 'cor_<model>' columns.
model_labels = {
    'dnn': 'DNN',
    'wovenab': 'Woven-ablation',
    'wovenab2': 'Ablation+',
    'dnn2': 'best-corr. DNN',
}

print("@@ Correlation of accuracy levels between human and models")
print(f"   Task: {COND}")

conf_f = pd.read_csv(opj(BOOTSTRAP_DIR, f'cor_{COND}.csv'))

# The Woven column is the same for every comparison, so read it once.
cor_woven = conf_f['cor_woven']

for _model in BASELINE_MODELS:
    cor_model = conf_f[f'cor_{_model}']
    p_value = calc_diff_p(cor_woven, cor_model)
    # Look the label up instead of overwriting the loop variable; unknown
    # identifiers are printed as-is.
    print(f"p(cor(Woven, human)-cor({model_labels.get(_model, _model)}, human)) = {p_value}")


@@ Correlation of accuracy levels between human and models
   Task: mass
p(cor(Woven, human)-cor(DNN, human)) = 0.0
p(cor(Woven, human)-cor(Woven-ablation, human)) = 0.0
p(cor(Woven, human)-cor(Abaltion+, human)) = 0.866
p(cor(Woven, human)-cor(best-corr. DNN, human)) = 0.0


## 2.2) Compare the correlation of models with humans for each scene

In [10]:
# For every value of 'exclude_scene', test whether the Woven correlation
# differs from each baseline model's correlation.

# Report labels, keyed by the model suffix of the 'cor_<model>' columns.
model_labels = {
    'dnn': 'DNN',
    'wovenab': 'Woven-ablation',
    'wovenab2': 'Ablation+',
    'dnn2': 'best-corr. DNN',
}

print("@@ Correlation between human and model accuracy levels by scene configurations")
print(f"   Task: {COND}")

# Reshape from one column per model to long form: (exclude_scene, model, cor).
each_scene_cor_conf_f = pd.read_csv(opj(BOOTSTRAP_DIR, f'cor_each_scene_{COND}.csv'))
each_scene_cor_conf_f = each_scene_cor_conf_f.melt(id_vars=['exclude_scene'],
                                                   value_vars=[f'cor_{i}' for i in MODELS_WITHOUT_HUMAN],
                                                   var_name='model', value_name='cor')

# .unique() yields the scenes in first-appearance order, so the report is
# printed in the same order on every run (iterating a set of strings is not
# stable across kernel restarts).
for _scene in each_scene_cor_conf_f['exclude_scene'].unique():
    print(f"------ Exclude scene = {_scene}")
    df = each_scene_cor_conf_f[each_scene_cor_conf_f['exclude_scene'] == _scene].reset_index(drop=True)
    # The Woven values are shared by every comparison within this scene.
    woven = df[df['model'] == 'cor_woven'].reset_index(drop=True)['cor']
    for i in BASELINE_MODELS:
        other_model = df[df['model'] == f'cor_{i}'].reset_index(drop=True)['cor']
        p_value = calc_diff_p(woven, other_model)
        # Look the label up instead of overwriting the loop variable.
        print(f"p(Woven-{model_labels.get(i, i)}) = {p_value}")

@@ Correlation between human and model accuracy levels by scene configurations
   Task: mass
------ Exclude scene = wind
p(Woven-DNN) = 0.0
p(Woven-Woven-ablation) = 0.0
p(Woven-Abaltion+) = 0.006
p(Woven-best-corr. DNN) = 0.0
------ Exclude scene = rotate
p(Woven-DNN) = 0.0
p(Woven-Woven-ablation) = 0.0
p(Woven-Abaltion+) = 0.644
p(Woven-best-corr. DNN) = 0.0
------ Exclude scene = drape
p(Woven-DNN) = 0.0
p(Woven-Woven-ablation) = 0.0
p(Woven-Abaltion+) = 0.006
p(Woven-best-corr. DNN) = 0.0
------ Exclude scene = ball
p(Woven-DNN) = 0.0
p(Woven-Woven-ablation) = 0.0
p(Woven-Abaltion+) = 0.762
p(Woven-best-corr. DNN) = 0.0


## 2.3) NMDS correlation

In [11]:
# Load the NMDS correlation arrays for the current task, print a 95% CI per
# model, then test Woven against every other model.
in_file = opj(DATA_DIR, 'nmds', f'{COND}.mat')
nmds_data = loadmat(in_file)

# Report labels, keyed by the variable names stored in the .mat file.
nmds_labels = {
    'cor_gen': 'Woven',
    'cor_cnn': 'DNN',
    'cor_genab': 'Woven-ablation',
    'cor_genab2': 'Ablation+',
    'cor_cnn2': 'best-corr. DNN',
}

models = ['cor_cnn', 'cor_gen', 'cor_genab', 'cor_genab2', 'cor_cnn2']
print("@@ 95% confidence interval for the correlation between the MDS space of the models and that of the humans.")
print(f"   Task: {COND}\n")
for _model in models:
    cur_ci = ci_95(nmds_data[_model])
    # Look the label up instead of overwriting the loop variable; unknown
    # keys are printed as-is.
    print(f"   95% CI for r(human, {nmds_labels.get(_model, _model)}) : {cur_ci}")

#####
## Significance test
print("\n@@ Statistical comparison of the correlation between Woven and humans against the correlation between other models and humans.")
print(f"   Task: {COND}\n")
models = ['cor_cnn', 'cor_genab', 'cor_genab2', 'cor_cnn2']
# [0] selects the first row of the stored array -- presumably the arrays are
# 1 x N as loaded from MATLAB; TODO confirm. The Woven row is reused for
# every comparison, so take it once.
woven_cor = nmds_data['cor_gen'][0]
for _model in models:
    p_value = calc_diff_p(woven_cor, nmds_data[_model][0])
    print(f"   cor(Woven, human)-cor({nmds_labels.get(_model, _model)}, human) : p={p_value}")

@@ 95% confidence interval for the correlation between the MDS space of the models and that of the humans.
   Task: mass

   95% CI for r(human, DNN) : [0.00083565 0.06162289]
   95% CI for r(human, Woven) : [0.5069238  0.72760374]
   95% CI for r(human, Woven-ablation) : [0.10025667 0.22660985]
   95% CI for r(human, Abaltion+) : [0.34419473 0.43212359]
   95% CI for r(human, best-corr. DNN) : [0.19701044 0.2658229 ]

@@ Statistical comparison of the correlation between Woven and humans against the correlation between other models and humans.
   Task: mass

   cor(Woven, human)-cor(DNN, human) : p=0.0
   cor(Woven, human)-cor(Woven-ablation, human) : p=0.0
   cor(Woven, human)-cor(Abaltion+, human) : p=0.0
   cor(Woven, human)-cor(best-corr. DNN, human) : p=0.0
