Performs correlation analysis for a set of objects across 3 stages:
- Recognition (objects)
- Reasoning    (color, size)
- Retention    (color, size)

> Given CapBERT's performance, what can we expect from VLMs?

// img_color objs: (1354, 711, 4929)

In [1]:
import torch
import pandas as pd
from utils import read_json

In [2]:
# Model tag; it is interpolated into the `../temp/{model}_*.csv` paths of the
# prediction files loaded in the following cells.
model = 'vilt'

In [3]:
# Color test set (`obj_l`) and the model's prediction table for it (`df_l`).
# NOTE(review): the `_l` suffix presumably marks the text-only (LM) setting,
# as opposed to `_vl` below — confirm.
obj_l = read_json('../dataset/color/test.json')
df_l = pd.read_csv(f'../temp/{model}_col.csv')

In [4]:
# Attach an object name to every prediction row. `list(obj_l)` must yield
# exactly one entry per CSV row, in the same order as the CSV.
# NOTE(review): if `obj_l` is a dict this takes its keys — confirm against the
# JSON schema.
df_l['object'] = list(obj_l)

In [5]:
# Object name of every record in the image-color test set, in file order.
obj_vl = [
    record['object']
    for record in read_json('../dataset/img_color/test.json')
]

# The model's prediction table for the image-color test set.
df_vl = pd.read_csv(f'../temp/{model}_imcol.csv')

In [6]:
# Attach the object name to every image-color prediction row; relies on
# `obj_vl` having one entry per CSV row, in the same order.
df_vl['object'] = obj_vl

In [7]:
# Preview the first five rows of `df_l`.
df_l.head(5)

Unnamed: 0,true,pred,prob,object
0,37810,3,"0.05636003240942955,0.021840402856469154,0.020...",pig
1,110,8,"0.08919460326433182,0.021954400464892387,0.057...",indicator
2,3810,10,"0.05169696360826492,0.01805987022817135,0.0358...",chariot
3,34,8,"0.05501304939389229,0.02240145020186901,0.0683...",ground cover
4,23810,3,"0.05686449259519577,0.022163238376379013,0.060...",bed race


In [8]:
# Preview rows 0, 10, 20, 30 and 40 of `df_vl` (every 10th of the first 50).
# NOTE(review): presumably each object owns 10 consecutive rows, so this shows
# one row per object — confirm.
df_vl.iloc[0:50:10, :]

Unnamed: 0,true,pred,prob,object
0,7,3,"0.004302120301872492,0.014215422794222832,0.01...",pig
10,0,0,"0.39208489656448364,0.09383445233106613,0.2085...",indicator
20,0,0,"0.8400886654853821,0.00579852145165205,0.00142...",christmas ball
30,4,4,"0.04573044180870056,0.004021757282316685,0.014...",shirt guy
40,8,0,"0.6997007727622986,0.0026546271983534098,0.015...",banner flag


In [9]:
# Drop extra objects in `df_l`: keep only rows whose object also occurs in the
# image-color set. A set gives O(1) membership checks.
obj_vl = set(obj_vl)

# Vectorized membership test instead of a row-wise `apply`. `.copy()` makes the
# filtered frame independent of the original, so the column assignments made
# on `df_l` further down do not raise `SettingWithCopyWarning`.
df_l = df_l[df_l['object'].isin(obj_vl)].copy()

In [10]:
def _confidence(x):
    true = x['true'].split(',')
    true = [int(_) for _ in true]

    prob = x['prob'].split(',')
    prob = [float(_) for _ in prob]
    prob = torch.tensor(prob)

    return prob[true].sum().item()

# Conf: per-row value computed by `_confidence`, stored for both tables.
df_l['conf'] = df_l.apply(_confidence, axis=1)
df_vl['conf'] = df_vl.apply(_confidence, axis=1)

def _is_correct(x):
    return str(x['pred']) in x['true']

# R-Acc: per-row correctness flag from `_is_correct`, stored for both tables.
df_l['correct'] = df_l.apply(_is_correct, axis=1)
df_vl['correct'] = df_vl.apply(_is_correct, axis=1)

In [11]:
# Preview `df_l` with the new `conf` and `correct` columns.
df_l.head()

Unnamed: 0,true,pred,prob,object,conf,correct
0,37810,3,"0.05636003240942955,0.021840402856469154,0.020...",pig,0.723446,True
1,110,8,"0.08919460326433182,0.021954400464892387,0.057...",indicator,0.39245,False
5,0,8,"0.11402490735054016,0.025141188874840736,0.089...",christmas ball,0.114025,False
6,5810,8,"0.05009888857603073,0.015372240915894508,0.042...",shirt guy,0.693679,True
7,58,8,"0.21041236817836761,0.01790141686797142,0.0672...",banner flag,0.581485,True


In [12]:
# Group `conf` by objects
df = df_vl.groupby('object', sort=False)['conf'].agg('mean').reset_index()
df_ = df_vl.groupby('object', sort=False)['correct'].agg('mean').reset_index()

In [13]:
# Preview the per-object mean confidence.
df.head(5)

Unnamed: 0,object,conf
0,pig,0.548698
1,indicator,0.285958
2,christmas ball,0.731677
3,shirt guy,0.578882
4,banner flag,0.601756


In [14]:
# Preview the per-object mean of `correct` (fraction of correct rows).
df_.head(5)



Unnamed: 0,object,correct
0,pig,0.7
1,indicator,0.4
2,christmas ball,0.9
3,shirt guy,0.8
4,banner flag,0.7


#### Conf

In [15]:
# Ensure `df_l` and `df` list the same objects in the same order; later cells
# copy columns from `df_l` by position. Comparing the full lists also catches
# a length mismatch, which `zip` would silently truncate away.
df_l['object'].tolist() == df['object'].tolist()

True

In [16]:
# Mark the grouped confidence as coming from `df_vl`.
df = df.rename(columns={'conf': 'conf_vl'})

# Copy `df_l`'s confidence in by position (`.tolist()` discards the index), so
# row i of `df` and row i of `df_l` must describe the same object.
df['conf_l'] = df_l['conf'].tolist()

In [17]:
# Preview both confidence columns side by side.
df.head(10)


Unnamed: 0,object,conf_vl,conf_l
0,pig,0.548698,0.723446
1,indicator,0.285958,0.39245
2,christmas ball,0.731677,0.114025
3,shirt guy,0.578882,0.693679
4,banner flag,0.601756,0.581485
5,parsley,0.81523,0.172975
6,dome top,0.300533,0.685503
7,kitchen stool,0.422884,0.247449
8,ad sign,0.563916,0.567506
9,ginger hair,0.333206,0.460599


In [18]:
# Pearson correlation between `conf_l` and `conf_vl` across objects.
df['conf_l'].corr(df['conf_vl'], method='pearson')

-0.07765679769188608

In [19]:
# Spearman rank correlation between `conf_l` and `conf_vl` across objects.
df['conf_l'].corr(df['conf_vl'], method='spearman')

-0.06694292325572693

In [20]:
# Kendall tau correlation between `conf_l` and `conf_vl` across objects.
df['conf_l'].corr(df['conf_vl'], method='kendall')


-0.04516331769747783

#### R-Acc

VLM score | conditioned on LM's success/failure

In [21]:
# Mark the grouped accuracy as coming from `df_vl`.
df_ = df_.rename(columns={'correct': 'correct_vl'})

# Copy `df_l`'s per-row correctness flag in by position (`.tolist()` discards
# the index), so row i of `df_` and row i of `df_l` must describe the same
# object.
df_['acc_l'] = df_l['correct'].tolist()

In [22]:
# Preview `correct_vl` next to the `acc_l` flag.
df_.head(10)

Unnamed: 0,object,correct_vl,acc_l
0,pig,0.7,True
1,indicator,0.4,False
2,christmas ball,0.9,False
3,shirt guy,0.8,True
4,banner flag,0.7,True
5,parsley,0.9,False
6,dome top,0.3,True
7,kitchen stool,0.7,False
8,ad sign,0.7,True
9,ginger hair,0.4,True


In [23]:
# Mean `correct_vl` over the objects where `acc_l` is True, then over those
# where it is False, shown as a (True-case, False-case) pair.
(
    df_.loc[df_['acc_l'] == True, 'correct_vl'].mean(),
    df_.loc[df_['acc_l'] == False, 'correct_vl'].mean(),
)

(0.6652042360060514, 0.6552339901477832)

In [24]:
# Unconditional mean of `correct_vl` over all objects, for comparison with the
# two conditional means above.
df_['correct_vl'].mean()

0.6619192533982553