In [1]:
import pandas as pd
from pathlib import Path
import sys
sys.path.append("../")
from  src.protocols.protocols import OpenSetProtocol

In [2]:
# Absolute locations used by the cells below. info_path is the "robustness"
# sub-directory of root_dir; out_dir is not referenced in the visible cells.
root_dir = Path("/local/scratch/datasets/ImageNet/ILSVRC2012")
info_path = root_dir / "robustness"
out_dir = Path("/local/scratch/palechor/openset-imagenet/data")

def print_data(prt):
    """Print a short summary of a protocol object.

    The summary contains the value of ``prt.protocol``, the lengths of
    ``prt.kn_classes``, ``prt.neg_classes`` and ``prt.unk_classes``, and the
    lengths of the ``'train'``, ``'val'`` and ``'test'`` entries of
    ``prt.data``.
    """
    print(f'--------------- Protocol {prt.protocol!s} ------------')

    # (caption, attribute name) pairs; attributes are looked up lazily so the
    # lines are produced one at a time, in this order.
    class_captions = (
        ('kn classes:', 'kn_classes'),
        ('kn_unk classes:', 'neg_classes'),
        ('unk_unk classes:', 'unk_classes'),
    )
    for caption, attribute in class_captions:
        print(caption, len(getattr(prt, attribute)))

    for split in ('train', 'val', 'test'):
        print(f'{split} size:', len(prt.data[split]))


def _build_protocol(protocol, random_state=4242):
    """Create one OpenSetProtocol, generate its dataset and print a summary.

    Returns the protocol object so it can be bound to a notebook-level name.
    """
    prt = OpenSetProtocol(im_root_dir=root_dir, info_path=info_path, protocol=protocol)
    prt.create_dataset(random_state=random_state)
    print_data(prt)
    return prt


# NOTE(review): the output recorded for this cell is
# "TypeError: __init__() got an unexpected keyword argument 'im_root_dir'",
# so the keyword names used above presumably no longer match the
# OpenSetProtocol constructor -- confirm against src/protocols/protocols.py.
p1 = _build_protocol(1)
p2 = _build_protocol(2)
p3 = _build_protocol(3)

TypeError: __init__() got an unexpected keyword argument 'im_root_dir'

In [21]:
def check_datasets(d1, d2):
    """Print the per-class differences in sample counts between two dataframes.

    Both dataframes must have a ``path`` column whose values are strings with
    the class folder as the second ``/``-separated component, plus exactly one
    further column (``label`` in the callers of this notebook).

    Printed, in order:
      * the shapes of both dataframes,
      * whether both contain the same set of class folders,
      * a table of the folders whose sample counts differ.

    A folder that is present in only one of the dataframes is reported with a
    count of 0 on the side where it is missing.

    Args:
        d1: first dataframe (not modified).
        d2: second dataframe (not modified).

    Returns:
        None. The function only prints.
    """
    def samples_per_folder(frame):
        # Work on a copy so the caller's dataframe does not get a new column.
        counted = frame.copy()
        counted['folder'] = counted['path'].map(lambda p: p.split('/')[1])
        # count() yields one count column per original column; the 'path'
        # count is redundant, so only the remaining column is kept.
        return counted.groupby('folder').count().drop(columns=['path'])

    print('d1 shape:', d1.shape, 'd2 shape:', d2.shape)
    counts_1 = samples_per_folder(d1)
    counts_2 = samples_per_folder(d2)

    # check if they have the same classes
    print('same classes:', set(counts_1.index) == set(counts_2.index))

    df = pd.merge(counts_1, counts_2, on='folder', how='outer')
    df.columns = ['samples_df1', 'samples_df2']
    # The outer merge leaves NaN where a folder exists on one side only.
    # NaN never compares as different, so such folders used to vanish from
    # the report; treat a missing folder as having zero samples instead.
    df = df.fillna(0).astype(int)
    print(df[df.samples_df1 != df.samples_df2])

# Compare each split generated for protocol 1 with the CSV file of the same
# split stored under ../data/v1. df1/df2 keep the last ('test') split after
# the loop.
for split in ('train', 'val', 'test'):
    df1 = pd.DataFrame(p1.data[split], columns=['path', 'label'])
    df2 = pd.read_csv(f'../data/v1/p1_{split}.csv', names=['path', 'label'])
    print(f'========== Protocol 1- {split.capitalize()} ==========')
    check_datasets(df1, df2)

d1 shape: (185898, 2) d2 shape: (185892, 2)
same classes: True
           samples_df1  samples_df2
folder                             
n02085782          618          617
n02088632          909          908
n02089078          586          585
n02095314          782          781
n02095889          749          748
n02096437          925          924
d1 shape: (46475, 2) d2 shape: (46481, 2)
same classes: True
           samples_df1  samples_df2
folder                             
n02085782          154          155
n02088632          227          228
n02089078          146          147
n02095314          195          196
n02095889          187          188
n02096437          231          232
d1 shape: (17450, 2) d2 shape: (17450, 2)
same classes: True
Empty DataFrame
Columns: [samples_df1, samples_df2]
Index: []


In [22]:
# Compare each split generated for protocol 2 with the CSV file of the same
# split stored under ../data/v1. df1/df2 keep the last ('test') split after
# the loop.
for split in ('train', 'val', 'test'):
    df1 = pd.DataFrame(p2.data[split], columns=['path', 'label'])
    df2 = pd.read_csv(f'../data/v1/p2_{split}.csv', names=['path', 'label'])
    print(f'========== Protocol 2- {split.capitalize()} ==========')
    check_datasets(df1, df2)

d1 shape: (60689, 2) d2 shape: (60684, 2)
same classes: True
           samples_df1  samples_df2
folder                             
n02088632          909          908
n02089078          586          585
n02095314          782          781
n02095889          749          748
n02096437          925          924
d1 shape: (15173, 2) d2 shape: (15178, 2)
same classes: True
           samples_df1  samples_df2
folder                             
n02088632          227          228
n02089078          146          147
n02095314          195          196
n02095889          187          188
n02096437          231          232
d1 shape: (5800, 2) d2 shape: (5800, 2)
same classes: True
Empty DataFrame
Columns: [samples_df1, samples_df2]
Index: []


In [33]:
df1 = pd.DataFrame(p3.data['train'], columns=['path', 'label'])
df2 = pd.read_csv('../data/v1/p3_train.csv', names=['path', 'label'])
print('========== Protocol 3- Train ==========')

# check_datasets is not called here: the same per-folder counting is done
# inline so that the class folders found on only one side can be printed.
d1 = (
    df1.assign(folder=df1['path'].map(lambda p: p.split('/')[1]))
    .groupby('folder')
    .count()
    .drop(columns=['path'])
)
d2 = (
    df2.assign(folder=df2['path'].map(lambda p: p.split('/')[1]))
    .groupby('folder')
    .count()
    .drop(columns=['path'])
)

# Class folders present in one dataframe but not in the other.
c1, c2 = set(d1.index), set(d2.index)
print(c1 - c2)
print(c2 - c1)

# Per-folder sample counts side by side (built but not printed in this cell).
df = pd.merge(d1, d2, on='folder', how='outer')
df.columns = ['samples_df1', 'samples_df2']

{'n01622779', 'n04344873', 'n01496331', 'n02088364', 'n02033041', 'n03977966', 'n02165105', 'n02107574', 'n02099601', 'n02089973', 'n02493793', 'n02110341', 'n03180011', 'n02190166', 'n02096051', 'n02127052', 'n02088238', 'n13037406', 'n02006656', 'n02007558', 'n02095314', 'n02113023', 'n04037443', 'n02092002', 'n02093859', 'n02870880', 'n02606052', 'n01847000', 'n02095889', 'n02100877', 'n02018207', 'n02093647', 'n02268853', 'n02492035', 'n02281787', 'n02125311', 'n01534433', 'n02009912', 'n03770679', 'n02089078'}
{'n02168699', 'n04285008', 'n01843383', 'n01560419', 'n02096177', 'n02100236', 'n02002724', 'n12267677', 'n02492660', 'n03100240', 'n02094258', 'n01614925', 'n02089867', 'n01820546', 'n02655020', 'n02128757', 'n02058221', 'n02107908', 'n02097209', 'n02101556', 'n04099969', 'n03016953', 'n02088094', 'n13040303', 'n02098413', 'n02110627', 'n03832673', 'n01530575', 'n02487347', 'n02109047', 'n02111889', 'n02276258', 'n02105505', 'n02027492', 'n02106550', 'n03662601', 'n02236044