In [3]:
import pandas as pd
from pathlib import Path
import sys
sys.path.append("../")
from  src.protocols.protocols import OpenSetProtocol

In [6]:
info_path = Path("/local/scratch/datasets/ImageNet/ILSVRC2012/robustness")
root_dir = Path("/local/scratch/datasets/ImageNet/ILSVRC2012")
out_dir = Path("/local/scratch/palechor/openset-imagenet/data")

def print_data(prt):
    print('--------------- Protocol ' + str(prt.protocol) + ' ------------')
    print('kn classes:', len(prt.kn_classes))
    print('kn_unk classes:', len(prt.neg_classes))
    print('unk_unk classes:', len(prt.unk_classes))
    print('train size:', len(prt.data['train']))
    print('val size:', len(prt.data['val']))
    print('test size:', len(prt.data['test']))


p1 = OpenSetProtocol(im_root_dir=root_dir, info_path=info_path, protocol=1)
p1.create_dataset(random_state=4242)
print_data(p1)

p2 = OpenSetProtocol(im_root_dir=root_dir, info_path=info_path, protocol=2)
p2.create_dataset(random_state=4242)
print_data(p2)

p3 = OpenSetProtocol(im_root_dir=root_dir, info_path=info_path, protocol=3)
p3.create_dataset(random_state=4242)
print_data(p3)

--------------- Protocol 1 ------------
kn classes: 116
kn_unk classes: 67
unk_unk classes: 166
train size: 185898
val size: 46475
test size: 17450
--------------- Protocol 2 ------------
kn classes: 30
kn_unk classes: 31
unk_unk classes: 55
train size: 60689
val size: 15173
test size: 5800
--------------- Protocol 3 ------------
kn classes: 151
kn_unk classes: 97
unk_unk classes: 164
train size: 252724
val size: 63182
test size: 20600


In [11]:
def check_datasets(d1, d2):
    d1 = d1.copy()
    d2 = d2.copy()
    print('d1 shape:', d1.shape)
    print('d2 shape:', d2.shape)

    # create folder column
    d1['folder'] = d1.path.apply(lambda x: x.split('/')[1])
    d1 = (d1.groupby('folder').count())
    d1 = d1.drop(columns=['path'])
    d2['folder'] = d2.path.apply(lambda x: x.split('/')[1])
    d2 = (d2.groupby('folder').count())
    d2 = d2.drop(columns=['path'])
    # check if they have the same classes
    print('same classes:', set(d1.index) == set(d2.index))
    df = pd.merge(d1, d2, on='folder', how='outer')
    print(df[abs(df.label_x - df.label_y) > 0])

df1 = pd.DataFrame(p1.data['train'], columns=['path', 'label'])
df2 = pd.read_csv('../data/v1/p1_train.csv', names=['path', 'label'])
print('========== Protocol - Train ==========')
check_datasets(df1, df2)

d1 shape: (185898, 2)
d2 shape: (185892, 2)
same classes: True
           label_x  label_y
folder                     
n02085782      618      617
n02088632      909      908
n02089078      586      585
n02095314      782      781
n02095889      749      748
n02096437      925      924


In [13]:
df1 = pd.DataFrame(p2.data['train'], columns=['path', 'label'])
df2 = pd.read_csv('../data/v1/p2_train.csv', names=['path', 'label'])
print('========== Protocol - Train ==========')
check_datasets(df1, df2)

d1 shape: (60689, 2)
d2 shape: (60684, 2)
same classes: True
           label_x  label_y
folder                     
n02088632      909      908
n02089078      586      585
n02095314      782      781
n02095889      749      748
n02096437      925      924


In [14]:
df1 = pd.DataFrame(p3.data['train'], columns=['path', 'label'])
df2 = pd.read_csv('../data/v1/p3_train.csv', names=['path', 'label'])
print('========== Protocol - Train ==========')
check_datasets(df1, df2)

d1 shape: (252724, 2)
d2 shape: (255022, 2)
same classes: False
           label_x  label_y
folder                     
n01855032    913.0    912.0
n02085782    618.0    617.0
n02088632    909.0    908.0
n02096437    925.0    924.0
n04429376    781.0    780.0
n04612504    965.0    964.0
