In [None]:
import os, json

import matplotlib.pyplot as plt

# Root directory holding one sub-directory of JSON result files per split.
output_dir = 'outputs/token_vs_contour_recall'

# Tokenizers expected to have a result in every split, grouped by name prefix.
# The order here is the order in which points are drawn and listed in legends.
tokenizers = [
    # directsam_large_*
    'directsam_large_sa1b_2ep', 'directsam_large_gen1_1008',
    'directsam_large_gen2_1014', 'directsam_large_gen3_1023',

    # directsam_tiny_*
    'directsam_tiny_sa1b_2ep', 'directsam_tiny_dsa_50ep', 'directsam_tiny_dsa_75ep',

    # panoptic_mask2former_* / panoptic_oneformer_*
    'panoptic_mask2former_base', 'panoptic_mask2former_large',
    'panoptic_mask2former_small', 'panoptic_mask2former_tiny',
    'panoptic_oneformer_large', 'panoptic_oneformer_tiny',

    # patch_<N>_per_side_raster
    'patch_2_per_side_raster', 'patch_4_per_side_raster',
    'patch_8_per_side_raster', 'patch_16_per_side_raster',

    # sam_vit_*
    'sam_vit_b', 'sam_vit_l', 'sam_vit_h',
    # Currently excluded from the comparison:
    # 'sam_vit_h_48points',
    # 'sam_vit_h_64points',
    # 'sam_vit_h_64points_1layer',
    # 'sam_vit_h_64points_2layer',

    # superpixel baseline
    'superpixel_slic',
]

# Build summary[split][tokenizer] = [mean_tokens, mean_recall] from the JSON
# result files found under output_dir/<split>/, then report what is missing.
all_tokenizers = []
summary = {}
# Keep only sub-directories: a stray file in output_dir (e.g. .DS_Store) would
# make os.listdir(split_dir) raise NotADirectoryError. Sorted so the printed
# report does not depend on the arbitrary order returned by os.listdir.
splits = sorted(
    entry for entry in os.listdir(output_dir)
    if os.path.isdir(os.path.join(output_dir, entry))
)
for split in splits:
    split_dir = os.path.join(output_dir, split)
    summary[split] = {}
    # Sorted order means that when several files map to the same tokenizer
    # name, the lexicographically last file overwrites the earlier ones.
    files = sorted(os.listdir(split_dir))
    for f in files:
        if not f.endswith('.json'):
            continue
        # Close the handle deterministically instead of relying on garbage
        # collection; JSON files are read as UTF-8 regardless of locale.
        with open(os.path.join(split_dir, f), encoding='utf-8') as fh:
            results = json.load(fh)
        # Tokenizer name is everything before the first '_202' in the file
        # name (presumably a '_202x...' timestamp suffix -- TODO confirm).
        tokenizer = f.split('_202')[0]
        all_tokenizers.append(tokenizer)
        summary[split][tokenizer] = [results['mean_tokens'], results['mean_recall']]

    # Report expected tokenizers that have no result file in this split.
    missing = set(tokenizers) - set(summary[split].keys())
    if missing:
        print(f"Missing tokenizers for {split}: {missing}")

print(summary)
# Every tokenizer seen in any split, de-duplicated and sorted.
all_tokenizers = sorted(set(all_tokenizers))
for tokenizer in all_tokenizers:
    print(tokenizer)
    # List the splits that lack a result for this tokenizer.
    for split in splits:
        if tokenizer not in summary[split]:
            print(f" - Missing {split} for {tokenizer}")

In [None]:

# Split (dataset directory) names used for averaging. The strings must match
# the keys of `summary`, so they are kept exactly as spelled on disk.
ALL_LEVELS = [
    'tcd',
    'PhenoBench',
    'EgoHOS',
    'LoveDA',
    'SA1B',
    'DUTS_TE',
    'LVIS',
    'COCONut_relabeld_COCO_val',
    'FoodSeg103',
    'plantorgans',
    'SUIM',
    'LIP',
    'MyFood',
    'DIS5K_DIS_VD',
    'WireFrame',
    'EntitySeg',
    'Fashionpedia',
    'SOBA',
    'ISAID',
    'MapillaryMetropolis',
    'PACO',
    'PascalPanopticParts',
    'ADE20k',
    'SeginW',
    'CIHP',
    'NYUDepthv2',
    'cityscapes',
    'DRAM',
    'PartImageNetPP',
    'SPIN',
]

# Subset of ALL_LEVELS plotted under the 'Object' heading.
OBJECT_LEVEL = [
    'EgoHOS',
    'DUTS_TE',
    'LVIS',
    'FoodSeg103',
    'SUIM',
    'MyFood',
    'DIS5K_DIS_VD',
    'EntitySeg',
    'NYUDepthv2',
    'cityscapes',
    'DRAM',
]

# Subset of ALL_LEVELS plotted under the 'Subobject' heading.
SUBOBJECT_LEVEL = [
    'LIP',
    'PACO',
    'PascalPanopticParts',
    'ADE20k',
    'CIHP',
    'PartImageNetPP',
    'SPIN',
    'Fashionpedia',
]

# Figure title suffix -> list of splits averaged for that figure.
classification = dict(
    All=ALL_LEVELS,
    Object=OBJECT_LEVEL,
    Subobject=SUBOBJECT_LEVEL,
)

# One scatter figure per dataset group: each tokenizer becomes a single point at
# (mean tokens, mean recall), both averaged unweighted over the group's splits.
for name, SPLITS in classification.items():
    fig, ax = plt.subplots(figsize=(20, 10))
    n_splits = len(SPLITS)

    for tokenizer in tokenizers:
        avg_tokens = 0
        avg_recall = 0
        for split in SPLITS:
            split_results = summary[split]
            # Abort rather than average over an incomplete set of splits.
            if tokenizer not in split_results:
                raise ValueError(f"Missing {tokenizer} for {split}")
            tokens, recall = split_results[tokenizer]
            avg_tokens += tokens
            avg_recall += recall
        avg_tokens /= n_splits
        avg_recall /= n_splits

        ax.scatter(avg_tokens, avg_recall, label=tokenizer)
        # Annotation: tokenizer name, truncated token count, recall * 100 (1 decimal).
        point_label = f"{tokenizer}\n{int(avg_tokens)}:{avg_recall*100:.1f}"
        ax.text(avg_tokens, avg_recall, point_label, fontsize=12)

    ax.set_xlabel('Average Tokens')
    ax.set_ylabel('Average Recall')
    # Title carries the group name rather than the full list of splits.
    ax.set_title(f'Token vs Contour Recall ({name})')
    ax.set_xlim(0, 256)
    ax.set_ylim(0, 1)
    ax.legend()
    plt.show()