In [1]:
import pandas as pd
import os, sys, re
from glob import glob
from copy import copy
from tqdm.auto import tqdm

In [2]:
# Add ../deepdive to the import search path (presumably where model_options
# lives — confirm).
sys.path.append('../deepdive')
# NOTE(review): the star import hides which names come from model_options;
# `get_model_options`, called further down, is presumably one of them.
# Consider importing the needed names explicitly.
from model_options import *

### Metadata Setup

In [3]:
# Load the model typology table and prepend a 'model_string' key column
# built as '<model>_<train_type>'; fully duplicated rows are dropped.
raw_metadata = pd.read_csv('../model_opts/model_typology.csv')
og_metadata_cols = raw_metadata.columns.to_list()
og_metadata = (
    raw_metadata
    .assign(model_string=raw_metadata['model'] + '_' + raw_metadata['train_type'])
    [['model_string'] + og_metadata_cols]
    .drop_duplicates()
)

In [4]:
# Per-model metadata keyed by model_string: {model_string: {column: value}}.
metadata_by_model_string = og_metadata.set_index('model_string')
original_model_info = metadata_by_model_string.to_dict(orient='index')

# Model identifiers returned by get_model_options; converted names are
# checked against this list in the conversion cell.
deepdive_model_uids = get_model_options(output='list')

In [17]:
# Inspection helper: the trailing ';' suppresses the cell output.
list(original_model_info);

### Target Benchmark

In [19]:
# Benchmark sub-path; used below to build both the source and output directories.
benchmark = 'regression/oasis'

In [20]:
# Directory holding the incoming result files for the chosen benchmark,
# and the names of everything inside it.
source_dir = f'incoming/{benchmark}'
file_list = os.listdir(source_dir)

### Conversions

In [18]:
def _filename_to_model_uid(filename, model_info_lookup):
    """Translate a legacy result filename into a candidate model uid.

    The rules below are independent ``if`` checks applied in order, so when
    a filename matches more than one rule the later rule's uid wins.

    Args:
        filename: Name of a result file, e.g. 'alexnet_imagenet.parquet'.
        model_info_lookup: Mapping from model_string to a metadata dict
            that contains a 'model_source' entry.

    Returns:
        The candidate uid as a string, or None when no rule applies (or the
        metadata required by a rule is missing). The caller is responsible
        for checking the uid against the list of known uids.
    """
    model_uid = None

    # Everything before the first '.' identifies the model + training type.
    model_string = filename.split('.')[0]

    if '_imagenet' in filename:
        # The metadata table keys these models as '<model>_classification'.
        # model_string intentionally stays rewritten for the rules below.
        model_string = model_string.replace('_imagenet', '_classification')
        if model_string in model_info_lookup:
            model_info = model_info_lookup[model_string]
            model_name = model_string.replace('_classification', '')

            if model_info['model_source'] == 'torchvision':
                model_uid = 'torchvision_' + model_name + '_imagenet1k_v1'

            if model_info['model_source'] == 'timm':
                model_uid = 'timm_' + model_name

    if '_taskonomy' in filename:
        model_name = model_string.replace('_taskonomy', '')
        model_uid = 'taskonomy_' + model_name

    if '_selfsupervised' in filename:
        # Guarded lookup: a file without metadata is left unconverted (and
        # therefore reported as unaccounted) instead of raising KeyError and
        # aborting the whole conversion, matching the '_imagenet' rule.
        model_info = model_info_lookup.get(model_string)
        if model_info is not None:
            model_name = model_string.replace('_selfsupervised', '')

            if model_info['model_source'] == 'vissl':
                if 'JigSaw' in model_name:
                    model_name = model_name.replace('Saw', 'saw')
                    model_name = '-'.join(model_name.split('-')[:3])
                else:
                    model_name = '-'.join(model_name.split('-')[:2])

            # Default to the vissl naming; dino models keep their bare name.
            model_uid = 'vissl_' + model_name.replace('-', '_').lower()

            if model_info['model_source'] == 'dino':
                model_uid = model_name

    if '_seer' in filename:
        model_name = model_string.replace('_seer', '').lower()
        model_uid = 'seer_' + ''.join(model_name.split('-')[0:2])

        if 'INFT' in filename:
            model_uid += '_inft'

    if '_clip' in filename:
        model_name = model_string.replace('_clip', '')
        model_uid = 'clip_' + model_name.replace('-', '').lower()

    if '_slip' in filename:
        model_name = (model_string.replace('_slip', '')
                      .replace('Ep100', 'Max')
                      .replace('L-CLIP-CC', 'B-CLIP-CC')
                      .replace('L-SLIP-CC', 'B-SLIP-CC'))

        model_uid = 'slip_' + model_name.replace('-', '_').lower()

        if 'SLIP' in filename:
            # Drops the '_slip' segment from the uid body; the leading
            # 'slip_' prefix has no preceding underscore and is kept.
            model_uid = model_uid.replace('_slip', '')

        if 'CC12M' not in filename:
            model_uid += '_yfcc15m'

    if '_ipcl' in filename:
        model_name = (model_string.replace('-', '_')
                      .replace('imagenet', 'imagenet1k')
                      .replace('places256', 'places2')
                      .replace('openimages', 'openimagesv6'))

        model_uid = 'visionlab_' + model_name

    if '_yolo' in filename:
        model_uid = model_string.replace('_yolo', '')

    if '_monoculardepth' in filename:
        model_name = model_string.replace('_monoculardepth', '')
        model_uid = 'midas_' + model_name.lower()

        if model_uid == 'midas_midas':
            model_uid = 'midas'

    if '_bit_expert' in filename:
        model_name = model_string.replace('_bit_expert', '').lower()
        model_uid = '_'.join(['bit_expert', model_name.split('-')[-1]])

    model_types = ['_segmentation', '_detection']
    if any(model_type in filename for model_type in model_types):
        model_name = model_string
        for model_type in model_types:
            model_name = model_name.replace(model_type, '')

        model_uid = 'detectron_' + model_name

    hyperparams = ['ImageNet1K_V1', 'ImageNet1K_V2']
    if any(hyperparam in filename for hyperparam in hyperparams):
        model_uid = 'torchvision_' + model_string.replace('-', '_').lower()

    return model_uid


# Ignore randomly-initialised model results and notebook artifacts.
filtered_file_list = [file for file in file_list
                      if '_random' not in file and '.ipynb' not in file]

# Start from "everything is unaccounted" and tick files off as they convert.
files_unaccounted = copy(filtered_file_list)

# Maps new model uid -> original filename.
# NOTE: if two files map to the same uid the later one silently overwrites
# the earlier; the length check below will then report a mismatch.
conversions = {}
for filename in filtered_file_list:
    model_uid = _filename_to_model_uid(filename, original_model_info)

    # Only accept uids that are actually known to deepdive.
    if model_uid is not None and model_uid in deepdive_model_uids:
        conversions[model_uid] = filename
        files_unaccounted.remove(filename)

if len(conversions) != len(filtered_file_list):
    print('Missing models? Check conversion.')

if len(files_unaccounted) != 0:
    print(files_unaccounted)

Missing models? Check conversion.
['efficientnetv2_rw_s_imagenet.parquet', 'coat_lite_mini_imagenet.parquet']


In [22]:
# Preview the first listed source file to check its schema.
pd.read_parquet('{}/{}'.format(source_dir, file_list[0]))

Unnamed: 0,model,train_type,model_layer_index,model_layer,measurement,image_type,score_type,score,alpha
0,efficientnetv2_rw_s,imagenet,1,Conv2d-1,valence,Object,ev_score,-0.081182,0.100000
1,efficientnetv2_rw_s,imagenet,1,Conv2d-1,valence,Object,pearson_r,0.098516,0.100000
2,efficientnetv2_rw_s,imagenet,1,Conv2d-1,valence,Object,ev_score,-0.081182,0.177828
3,efficientnetv2_rw_s,imagenet,1,Conv2d-1,valence,Object,pearson_r,0.098516,0.177828
4,efficientnetv2_rw_s,imagenet,1,Conv2d-1,valence,Object,ev_score,-0.081182,0.316228
...,...,...,...,...,...,...,...,...,...
335995,efficientnetv2_rw_s,imagenet,448,Linear-1,beauty,Combo,pearson_r,0.660218,31622.776602
335996,efficientnetv2_rw_s,imagenet,448,Linear-1,beauty,Combo,ev_score,0.205066,56234.132519
335997,efficientnetv2_rw_s,imagenet,448,Linear-1,beauty,Combo,pearson_r,0.642236,56234.132519
335998,efficientnetv2_rw_s,imagenet,448,Linear-1,beauty,Combo,ev_score,0.139229,100000.000000


### File Transfer

In [23]:
output_dir = '../deep_affect/fresh_results/{}'.format(benchmark)

# exist_ok makes the cell safely re-runnable without a check-then-create step.
os.makedirs(output_dir, exist_ok=True)

# Legacy identifier columns, replaced by the single 'model_uid' column.
columns_to_drop = ['model', 'train_type']

for model_uid, source_name in tqdm(conversions.items()):
    source_file = os.path.join(source_dir, source_name)
    output_file = os.path.join(output_dir, model_uid + '.parquet')

    # Existing outputs are never overwritten, so skip them before paying
    # for the parquet read and transform.
    if os.path.exists(output_file):
        continue

    # NOTE(review): the preview of a source file shows an 'Unnamed: 0'
    # column; it is carried over unchanged here — confirm that is intended.
    data = pd.read_parquet(source_file).drop(columns=columns_to_drop)
    data.insert(0, 'model_uid', model_uid)

    data.to_parquet(output_file, index=None)

  0%|          | 0/67 [00:00<?, ?it/s]