In [None]:
import pandas as pd
from huggingface_hub import HfApi
from tqdm import tqdm

## Get the full list of models and their main information

In [None]:
# Create a Hub API client and request the listing of models.
# NOTE(review): list_models() presumably returns a lazy iterator, so the
# actual requests would only happen when `models` is consumed — confirm.
api = HfApi()
models = api.list_models()


In [None]:
# Output column name -> attribute of the listed model object it is copied from.
model_fields = {
    'id': 'id',
    'task': 'pipeline_tag',
    'created_at': 'created_at',
    'downloads': 'downloads',
    'downloads_all_time': 'downloads_all_time',
    'tags': 'tags',
    'likes': 'likes',
    'library_name': 'library_name',
    'trending_score': 'trending_score',
}

# Flatten every listed model into a plain dict so the records can be loaded
# into a DataFrame afterwards. `total` is a hard-coded count that only sizes
# the progress bar; it does not limit how many models are processed.
full_list = []
for model in tqdm(models, desc='Processing models', total=1172011):
    record = {column: getattr(model, attribute) for column, attribute in model_fields.items()}
    full_list.append(record)


In [None]:
# Turn the collected per-model records into a table and persist it to disk.
data = pd.DataFrame(data=full_list)
data.to_parquet(path='D:/Alejandria/output/models_hf.parquet')


In [None]:
# Preview the first three rows of the table.
data.head(3)

## Get the metrics

In [None]:
from tqdm import tqdm
import pandas as pd
from time import sleep
from transformers import AutoConfig
# Enable tqdm's pandas integration (progress-bar variants of apply/map on
# pandas objects). NOTE(review): no progress_* call is visible in this
# notebook section — confirm it is still needed.
tqdm.pandas()

In [None]:
# Reload the model table from the parquet file (same path the listing section
# writes to), so this section can run without listing the Hub again.
data = pd.read_parquet('D:/Alejandria/output/models_hf.parquet')

In [None]:
def get_config(model_id):
    """Return the configuration of a model as a JSON string.

    Args:
        model_id: Identifier passed to ``AutoConfig.from_pretrained``.

    Returns:
        The result of ``config.to_json_string()``, or ``None`` when the
        configuration could not be loaded or serialised.
    """
    try:
        config = AutoConfig.from_pretrained(model_id)
        return config.to_json_string()
    except Exception:
        # Best-effort lookup: any ordinary failure maps to None. Catching
        # Exception (not a bare except) lets KeyboardInterrupt and SystemExit
        # propagate so a long-running batch can still be stopped.
        return None

In [None]:
# Fetch the config JSON for each model in parallel, using one worker per CPU core
from joblib import Parallel, delayed
import multiprocessing

# One worker per CPU core reported by the OS.
num_cores = multiprocessing.cpu_count()

# Lazily build one get_config task per model id; tqdm reports progress as the
# tasks are handed to the worker pool.
config_tasks = (
    delayed(get_config)(model_id)
    for model_id in tqdm(data['id'], desc='Getting max position', total=len(data))
)
results = Parallel(n_jobs=num_cores)(config_tasks)

In [None]:
# Attach the get_config outputs (JSON string, or None where loading failed)
# as a new column. Relies on `results` still holding the config-fetch output.
data['config'] = results

### Get readme metrics

In [None]:
from huggingface_hub import hf_hub_download
import pandas as pd

In [None]:
def get_readme(model_id):
    """Download the ``README.md`` of a model and return the download result.

    Args:
        model_id: Repository identifier passed to ``hf_hub_download``.

    Returns:
        Whatever ``hf_hub_download`` returns for ``README.md`` (the local path
        of the downloaded file, not its text), or ``None`` when the download
        failed.
    """
    try:
        readme = hf_hub_download(model_id, filename='README.md')
        return readme
    except Exception:
        # Best-effort download: any ordinary failure maps to None. Catching
        # Exception (not a bare except) lets KeyboardInterrupt and SystemExit
        # propagate so a long-running batch can still be stopped.
        return None

In [None]:
# One worker per CPU core reported by the OS.
num_cores = multiprocessing.cpu_count()

# Lazily build one get_readme task per model id; tqdm reports progress as the
# tasks are handed to the worker pool.
readme_tasks = (
    delayed(get_readme)(model_id)
    for model_id in tqdm(data['id'], desc='Getting readmes', total=len(data))
)
results = Parallel(n_jobs=num_cores)(readme_tasks)

In [None]:
# Attach the get_readme outputs as a new column (None where the download failed).
# NOTE(review): get_readme returns the hf_hub_download result, presumably a
# local file path rather than the README text — confirm this is intended.
data['readme'] = results