## Transitive Size Relations

**Goal**: Evaluation set comprising transitive (*indirect*) object co-occurrences.

**Models**: *BERT, CapBERT, ViLT, FLAVA*

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from utils import read_json, sort_dict, save_csv

In [2]:
df = pd.read_csv('../results/size.csv')
jsn = read_json('../data/obj_size_k_5.json')

In [3]:
df['freq'] = df['<'] + df['>']

In [4]:
df.head(2)

Unnamed: 0,o1,o2,>,<,typical,freq
0,guy,clock,303,60,>,363
1,clock,guy,60,303,<,363


In [5]:
jsn[0]['regions'][0]


{'name': 'clock', 'bbox': [421, 57, 82, 139], 'cluster': 0}

<br>

#### Statistics


In [6]:
df['freq'].describe()


count    43096.000000
mean       579.939623
std       1641.455501
min        100.000000
25%        139.000000
50%        219.000000
75%        452.000000
max      63248.000000
Name: freq, dtype: float64

In [7]:
df['typical'].value_counts()


>    21548
<    21548
Name: typical, dtype: int64

In [8]:
# Verify: threshold
sum(df.apply(lambda x: x['freq'] < 50, axis=1))


0

<br>

### Slice "<" relations

Common objects

In [9]:
df = df[df['typical'] == '<']

df = df.drop(columns=['<', '>'])

In [10]:
df['o1'].nunique(), df['o2'].nunique()

(1321, 1093)

In [11]:
objects = set(df['o1']) | set((df['o2']))

f'# objects: {len(objects)}  || # rels: {len(df)}'

'# objects: 1682  || # rels: 21548'

In [12]:
df = df.sort_values(by=['freq'], ascending=False)


In [13]:
df.head(10)

Unnamed: 0,o1,o2,typical,freq
621,window,building,<,63248
518,window,tree,<,50302
486,window,car,<,46952
7983,letter,sign,<,42145
57,car,tree,<,42003
13579,spot,giraffe,<,41561
3808,leaf,tree,<,41121
479,sign,tree,<,40891
2679,window,train,<,34126
2948,window,bus,<,33721


In [None]:
# Frequent Small
obj2freq = df.groupby(by='o1')['freq'].sum().to_dict()
obj2freq = sort_dict(obj2freq, by='v', reverse=True)
list(obj2freq.items())[:20]

In [None]:
# Frequent Large
obj2freq = df.groupby(by='o2')['freq'].sum().to_dict()
obj2freq = sort_dict(obj2freq, by='v', reverse=True)
list(obj2freq.items())[:20]


<br>

### Derive Transitive Co-occurrences

(a,b) --> (b,b') --> (a,b')

In [14]:
data = df.to_dict('records')
pairs = {(_['o1'], _['o2']) for _ in data}

In [15]:
eval_set = []

for d in tqdm(data):
    # Get all transitive samples
    new = [_ for _ in data if d['o2'] == _['o1']]
    for n in new:
        if (d['o1'], n['o2']) not in pairs:
            d_new = {'o1': d['o1'], 'o2': n['o2'],
                     '>': 100, '<': 100 , 'typical': '<'}
            eval_set += [d_new]

save_csv(eval_set, path='../results/size_trans.csv', index=False)

100%|██████████| 21548/21548 [00:30<00:00, 704.38it/s]


In [37]:
eval_set = list(np.random.choice(eval_set, size=20000, replace=False))

save_csv(eval_set, path='../dataset/size_trans/test.csv', index=False)

In [None]:
# Co-occurrence (cluster-membership) variance
# obj2cluster = {}
# for d in tqdm(jsn):
#     for r in d['regions']:
#         o = r['name']
#         c = r['cluster']
#         if o not in obj2cluster:
#             obj2cluster[o] = []
#
#         obj2cluster[o] += [c]
#
# def _cluster_var(_o: str) -> float:
#     var = np.std(obj2cluster[_o])**2
#     var = var.round(2)
#     return float(var)
#
# obj2var = {o: _cluster_var(o) for o in tqdm(objects)}
# obj2var = sort_dict(obj2var, by='v', reverse=True)
# list(obj2var.items())[:20], list(obj2var.items())[-20:]