## Introduction
This notebook was used to scrape model-size information from Hugging Face so that the model size can be presented alongside the other results. Model size is a better parameter for deciding whether to use these models in production environments.
 

In [17]:
import os
import ssl
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as soup
from tqdm.auto import tqdm
# Swap the stdlib's default HTTPS context factory for the unverified one, so
# connections that rely on that default skip TLS certificate verification.
# NOTE(review): this weakens security for the whole kernel, and `requests`
# (used for the scraping below) presumably does not go through this hook --
# confirm whether the override is still needed.
ssl._create_default_https_context = ssl._create_unverified_context

def get_model_size(model_name, hf_access_token, timeout=30):
    """Scrape the listed size of a timm model's ``pytorch_model.bin`` from Hugging Face.

    Args:
        model_name: Repository name under the ``timm`` organisation.
        hf_access_token: Token sent as a Bearer ``Authorization`` header.
        timeout: Seconds to wait for the HTTP response before giving up, so a
            stalled connection cannot hang the scrape forever.

    Returns:
        The first line of the text of the "Download file" link that points at
        ``pytorch_model.bin``; ``"Model Not Found"`` on HTTP 404 or when the
        file listing contains no such link; ``"Model Authorization Error"``
        on HTTP 401.

    Raises:
        Exception: For any other HTTP status code.
    """
    url = f"https://huggingface.co/timm/{model_name}/tree/main"
    headers = {"Authorization": f"Bearer {hf_access_token}"}
    response = requests.get(url, headers=headers, timeout=timeout)

    if response.status_code == 404:
        return "Model Not Found"
    if response.status_code == 401:
        return "Model Authorization Error"
    if response.status_code != 200:
        raise Exception(f"Error in getting model size for {model_name}")

    page_soup = soup(response.content, "html.parser")
    download_links = page_soup.find_all('a', {'title': 'Download file'})
    weight_links = [
        link for link in download_links
        if "pytorch_model.bin" in link.get('href', '')
    ]
    if not weight_links:
        # The repository exists but lists no pytorch_model.bin. Reuse the
        # existing placeholder (which downstream code already treats as
        # missing) instead of crashing the whole scrape with an IndexError.
        return "Model Not Found"

    return weight_links[0].text.split("\n")[0]

def get_model_sizes(model_names, size_csv_path, hf_access_token,
                    batch_size=45, pause_seconds=30):
    """Scrape the size of every model, resuming from a partially written CSV.

    Each scraped ``model_name,model_size`` pair is appended to
    ``size_csv_path`` immediately, so an interrupted run can pick up where it
    stopped: the number of rows already in the file is used as the index of
    the next model to scrape.

    Args:
        model_names: Sequence of model names, in the order they are scraped.
        size_csv_path: Header-less CSV used as both cache and output.
        hf_access_token: Token forwarded to ``get_model_size``.
        batch_size: Pause after this many requests (falsy disables pausing).
        pause_seconds: Length of each pause, in seconds.

    Returns:
        DataFrame read from ``size_csv_path`` without a header row, i.e. with
        integer column labels ``0`` (model name) and ``1`` (size text).
    """
    # Used only when there is nothing on disk and nothing to scrape.
    model_size_df = pd.DataFrame(columns=[0, 1])

    # Get the index to resume from: the number of rows already in the CSV.
    try:
        model_size_df = pd.read_csv(size_csv_path, header=None)
        model_resume_idx = model_size_df.shape[0]
        print(f"Resuming from index {model_resume_idx}")
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # No file yet, or a file without any rows: start from scratch.
        model_resume_idx = 0

    if model_resume_idx >= len(model_names):
        print("All models have been scraped")
        return model_size_df

    pending_names = model_names[model_resume_idx:]
    total_pending = len(pending_names)
    for count, model_name in enumerate(tqdm(pending_names), start=1):
        model_size = get_model_size(model_name, hf_access_token)
        # Re-open per row so every result is on disk before the next request.
        with open(size_csv_path, "a") as f:
            f.write(f"{model_name},{model_size}\n")
        # Throttle between batches; no point sleeping after the final row.
        if batch_size and count % batch_size == 0 and count < total_pending:
            time.sleep(pause_seconds)

    # The file has no header row; reading it with the default header would
    # swallow the first model and rename the columns, so match the resume read.
    return pd.read_csv(size_csv_path, header=None)


# Factor that converts a size expressed in the given unit into MB
# (binary steps, matching the existing 1 GB = 1024 MB conversion).
_UNIT_TO_MB = {
    "B": 1 / 1024 ** 2,
    "BYTE": 1 / 1024 ** 2,
    "BYTES": 1 / 1024 ** 2,
    "KB": 1 / 1024,
    "MB": 1,
    "GB": 1024,
    "TB": 1024 ** 2,
}


def _size_text_to_mb(size_text):
    """Convert one ``"<number> <unit>"`` string to MB; NaN if it cannot be parsed."""
    parts = str(size_text).split()
    if len(parts) != 2:
        return np.nan
    number, unit = parts
    multiplier = _UNIT_TO_MB.get(unit.upper())
    if multiplier is None:
        return np.nan
    try:
        return float(number) * multiplier
    except ValueError:
        return np.nan


def get_size_in_mb(combined_df):
    """Convert the non-null ``model_size`` strings of ``combined_df`` to MB.

    Args:
        combined_df: DataFrame with a ``model_size`` column holding strings
            such as ``"346 MB"`` or ``"1.22 GB"``, or null for unknown sizes.

    Returns:
        1-D float array with one entry per non-null ``model_size`` value, in
        row order. Entries whose text or unit is not recognised are NaN
        rather than being silently scaled as if they were gigabytes.
    """
    model_sizes = combined_df["model_size"].dropna()
    return np.array([_size_text_to_mb(text) for text in model_sizes], dtype=float)

def get_combined_df(score_df, model_size_df):
    """Attach the scraped sizes to the score table as a ``model_size`` column.

    The two frames are placed side by side (aligned on their index, not on
    the model name). Column ``1`` of ``model_size_df`` becomes ``model_size``
    and column ``0`` is discarded. The scraper's placeholder strings for
    models that could not be read are replaced with NaN.

    Args:
        score_df: Score table containing the columns selected below.
        model_size_df: Frame with integer column labels ``0`` and ``1``.

    Returns:
        DataFrame with ``model``, ``model_size`` and the score columns.
    """
    placeholder_values = ("Model Not Found", "Model Authorization Error")
    ordered_columns = [
        'model', 'model_size', 'top1', 'top1_err', 'top5', 'top5_err',
        'param_count', 'img_size', 'crop_pct', 'interpolation',
        'top1_diff', 'top5_diff', 'rank_diff',
    ]

    def _blank_out_placeholder(value):
        return np.nan if value in placeholder_values else value

    side_by_side = (
        pd.concat([score_df, model_size_df], axis=1)
        .rename(columns={1: "model_size"})
        .drop(columns=[0])
    )
    merged = side_by_side.loc[:, ordered_columns]
    merged["model_size"] = merged["model_size"].apply(_blank_out_placeholder)
    return merged


def get_final_df(score_df_path, scraped_size_path, hf_access_token):
    """Build the score table extended with a numeric ``model_size_in_mb`` column.

    Reads the scores from ``score_df_path``, obtains the size of every model
    via ``get_model_sizes`` (which scrapes Hugging Face and caches rows in
    ``scraped_size_path``), joins both with ``get_combined_df`` and converts
    the textual sizes with ``get_size_in_mb``. Rows without a size are left
    without a value in ``model_size_in_mb``.

    Args:
        score_df_path: CSV with the model scores; must have a ``model`` column.
        scraped_size_path: CSV used to store the scraped sizes.
        hf_access_token: Hugging Face token used for the scraping requests.

    Returns:
        A copy of the combined table restricted to the reported columns.
    """
    scores = pd.read_csv(score_df_path)

    # Model sizes from Hugging Face, one per model in the score table.
    sizes = get_model_sizes(scores['model'].values, scraped_size_path, hf_access_token)

    # Scores and sizes side by side.
    merged = get_combined_df(scores, sizes)

    # Only rows with a known size receive a converted value; the converter
    # returns exactly one number per such row, in the same order.
    has_size = merged['model_size'].notnull()
    rows_with_size = merged.index[has_size]
    merged.loc[rows_with_size, 'model_size_in_mb'] = get_size_in_mb(merged)

    # Keep only the columns that are reported.
    output_columns = [
        'model', 'model_size_in_mb', 'top1', 'top1_err', 'top5', 'top5_err',
        'param_count', 'img_size', 'crop_pct', 'interpolation',
        'top1_diff', 'top5_diff', 'rank_diff',
    ]
    return merged[output_columns].copy()

In [18]:
orig_score_path = "results-imagenet-real.csv"
scraped_size_path = "model_sizes.csv"

# Credentials must never be stored in the notebook: read the Hugging Face
# token from the environment instead. Any token that was previously committed
# here should be treated as compromised and revoked.
hf_access_token = os.environ.get("HF_TOKEN")
if not hf_access_token:
    raise RuntimeError("Set the HF_TOKEN environment variable to a Hugging Face access token")

get_final_df(orig_score_path, scraped_size_path, hf_access_token)

Resuming from index 1080
All models have been scraped


Unnamed: 0,model,model_size_in_mb,top1,top1_err,top5,top5_err,param_count,img_size,crop_pct,interpolation,top1_diff,top5_diff,rank_diff
0,eva02_large_patch14_448.mim_m38m_ft_in22k_in1k,1249.28,91.129,8.871,98.713,1.287,305.08,448,1.000,bicubic,1.077,-0.335,0
1,eva_giant_patch14_336.clip_ft_in1k,4147.20,91.058,8.942,98.602,1.399,1013.01,336,1.000,bicubic,1.592,-0.224,5
2,eva02_large_patch14_448.mim_in22k_ft_in22k_in1k,1249.28,91.022,8.978,98.683,1.317,305.08,448,1.000,bicubic,1.052,-0.329,-1
3,eva_giant_patch14_560.m30m_ft_in22k_in1k,4157.44,90.969,9.031,98.672,1.328,1014.45,560,1.000,bicubic,1.183,-0.320,-1
4,eva02_large_patch14_448.mim_in22k_ft_in1k,1249.28,90.920,9.080,98.685,1.315,305.08,448,1.000,bicubic,1.298,-0.265,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1075,efficientvit_m0.r224_in1k,9.76,71.091,28.909,89.589,10.411,2.35,224,0.875,bicubic,7.821,4.413,0
1076,lcnet_050.ra2_in1k,7.60,70.402,29.598,88.825,11.175,1.88,224,0.875,bicubic,7.264,4.443,0
1077,tf_mobilenetv3_small_minimal_100.in1k,8.29,70.096,29.904,88.516,11.485,2.04,224,0.875,bilinear,7.202,4.278,0
1078,tinynet_e.in1k,8.30,66.810,33.190,86.280,13.720,2.04,106,0.875,bicubic,6.944,4.518,0
