In [1]:
import pandas as pd
from huggingface_hub import HfApi
from tqdm import tqdm

## Get the full list of models and their main information

In [2]:
# Client for the Hugging Face Hub API (no token is passed here).
api = HfApi()
# Handle over the Hub's model listing; it is consumed by the loop in the next cell.
# NOTE(review): presumably a lazy iterator that pages through the Hub — confirm.
models = api.list_models()


In [3]:
# Flatten every model entry into a plain dict so the listing can be turned into a DataFrame.
# No `total` is passed to tqdm: the number of models on the Hub keeps changing, so a
# hard-coded count goes stale (the previous one had already been exceeded by the actual
# run). Without it tqdm simply reports the running count and the iteration rate.
full_list = []
for model in tqdm(models, desc='Processing models'):
    full_list.append({
        'id': model.id,
        'task': model.pipeline_tag,
        'created_at': model.created_at,
        'downloads': model.downloads,
        'downloads_all_time': model.downloads_all_time,
        'tags': model.tags,
        'likes': model.likes,
        'library_name': model.library_name,
        'trending_score': model.trending_score,
    })


Processing models: 1172145it [04:52, 4008.31it/s]                             


In [None]:
# Build one row per model from the collected dicts.
data = pd.DataFrame(full_list)

# Persist the listing so the later sections can start from disk instead of re-querying the Hub.
data.to_parquet('D:/Alejandria/output/models_hf.parquet')


In [5]:
# Preview the first three rows to sanity-check the collected columns.
data.head(3)

Unnamed: 0,id,task,created_at,downloads,downloads_all_time,tags,likes,library_name,trending_score
0,Qwen/QwQ-32B-Preview,text-generation,2024-11-27 15:50:55+00:00,33626,,"[transformers, safetensors, qwen2, text-genera...",969,transformers,969.0
1,tencent/HunyuanVideo,,2024-12-01 06:00:34+00:00,0,,"[license:other, region:us]",238,,238.0
2,Lightricks/LTX-Video,image-to-video,2024-10-31 12:36:00+00:00,32304,,"[diffusers, safetensors, ltx-video, text-to-vi...",564,diffusers,203.0


## Get the metrics

In [1]:
import datetime
import bs4
import requests
import re
from tqdm import tqdm
import pandas as pd
from datetime import datetime
import numpy as np
from time import sleep
tqdm.pandas()

In [2]:
# Reload the saved model listing and keep only its first 1000 rows for the metric lookups below.
data = pd.read_parquet('D:/Alejandria/output/models_hf.parquet').head(1000)

In [3]:
def get_max_position(model, timeout=30):
    """Fetch a model's ``max_position_embeddings`` from its ``config.json`` on the Hub.

    Args:
        model: Repository id, e.g. ``"Qwen/QwQ-32B-Preview"``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A ``(model, value)`` tuple. ``value`` is ``None`` when the request fails or
        times out, the response status is not 200, the body is not valid JSON, or
        the key is absent. When the config has a ``text_config`` section the value
        is read from there; otherwise it is read from the top level.
    """
    # Random 1-4 second pause (randint's upper bound is exclusive) to spread requests out.
    sleep(np.random.randint(1, 5))
    url = f"https://huggingface.co/{model}/raw/main/config.json"

    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Network error or timeout: report "no value" instead of aborting the whole batch.
        return model, None

    if response.status_code != 200:
        return model, None

    try:
        config = response.json()
    except ValueError:
        # Body was not valid JSON.
        return model, None

    if not isinstance(config, dict):
        return model, None

    # A `text_config` section, when present, takes precedence over the top level.
    section = config['text_config'] if 'text_config' in config else config
    if not isinstance(section, dict):
        return model, None
    return model, section.get('max_position_embeddings')

In [4]:
# Look up the max position of every model concurrently, using one worker per CPU core
from joblib import Parallel, delayed
import multiprocessing

num_cores = multiprocessing.cpu_count()

# Lazily build one delayed call per model id; tqdm reports progress as they are dispatched.
tasks = (
    delayed(get_max_position)(model_id)
    for model_id in tqdm(data['id'], desc='Getting max position', total=len(data))
)
results = Parallel(n_jobs=num_cores)(tasks)

Getting max position: 100%|██████████| 1000/1000 [02:22<00:00,  7.01it/s]


In [5]:
# Split the (model id, max position) pairs into two parallel tuples.
# The ids are bound to `model_ids` rather than `id`, so the builtin id() is not
# shadowed for the rest of the kernel session.
model_ids, max_emb = zip(*results)

In [6]:
# Attach the fetched values as a new column. max_emb is a plain tuple, so the values are
# matched to rows by position, not by model id.
# NOTE(review): assumes the parallel run returned results in the same order as data['id'] — confirm.
data['max_position'] =  max_emb

In [7]:
# Display the frame to inspect the newly added max_position column.
data

Unnamed: 0,id,task,created_at,downloads,downloads_all_time,tags,likes,library_name,trending_score,max_position
0,Qwen/QwQ-32B-Preview,text-generation,2024-11-27 15:50:55+00:00,33626,,"[transformers, safetensors, qwen2, text-genera...",969,transformers,969.0,32768.0
1,tencent/HunyuanVideo,,2024-12-01 06:00:34+00:00,0,,"[license:other, region:us]",238,,238.0,
2,Lightricks/LTX-Video,image-to-video,2024-10-31 12:36:00+00:00,32304,,"[diffusers, safetensors, ltx-video, text-to-vi...",564,diffusers,203.0,
3,Djrango/Qwen2vl-Flux,text-to-image,2024-11-25 02:37:41+00:00,0,,"[diffusers, safetensors, flux, qwen2vl, stable...",352,diffusers,193.0,
4,AIDC-AI/Marco-o1,text-generation,2024-11-13 02:37:28+00:00,9709,,"[transformers, safetensors, qwen2, text-genera...",599,transformers,185.0,32768.0
...,...,...,...,...,...,...,...,...,...,...
995,BioMistral/BioMistral-7B,text-generation,2024-02-14 11:33:32+00:00,13561,,"[transformers, pytorch, tensorboard, mistral, ...",406,transformers,2.0,32768.0
996,briaai/BRIA-2.3,text-to-image,2024-02-18 09:22:05+00:00,719,,"[diffusers, safetensors, text-to-image, legal ...",29,diffusers,2.0,
997,ShineChen1024/MagicClothing,,2024-02-20 02:26:57+00:00,0,,"[arxiv:2403.01779, arxiv:2404.09512, license:c...",102,,2.0,
998,yanolja/EEVE-Korean-Instruct-10.8B-v1.0,text-generation,2024-02-22 04:39:04+00:00,25650,,"[transformers, safetensors, llama, text-genera...",135,transformers,2.0,4096.0


### Get README metrics

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import os

# Open-source language model used to read the documents
model_name = "tiiuae/falcon-7b-instruct"  # Replace with the open-source model of your choice

tokenizer = AutoTokenizer.from_pretrained(model_name)

# NOTE(review): trust_remote_code=True allows Python code shipped in the model repository
# to run locally — only keep it for repositories you trust.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    trust_remote_code=True,
)

# Text-generation pipeline wrapping the model and tokenizer loaded above
qa_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

def extract_metrics_from_document(document: str):
    """
    Uses a language model to extract metrics information from a document.

    Args:
        document: Text to analyse (for example a model README).

    Returns:
        The text generated by the model in answer to the question. The prompt
        (document + question) is not echoed back in the returned string.
    """
    question = "Does this document talk about the metrics of the model? If it does, return a JSON with the metrics. Use as keys the dataset it was tested on, the metric name, and the metric value."
    prompt = f"Document:\n{document}\n\n{question}"
    # `max_new_tokens` budgets only the generated answer. The previous `max_length=512`
    # also counted the prompt tokens, so any document of README size left no room to
    # generate. `return_full_text=False` keeps the prompt out of the returned text.
    # NOTE(review): documents longer than the model's context window are still not
    # truncated here — confirm whether inputs need to be shortened upstream.
    response = qa_pipeline(
        prompt,
        max_new_tokens=512,
        num_return_sequences=1,
        return_full_text=False,
    )
    return response[0]['generated_text']

def get_readme_metrics(model):
    """Download the raw ``README.md`` of a model repository on the Hub.

    Args:
        model: Repository id, e.g. ``"tencent/HunyuanVideo"``.

    Returns:
        The README text when the request succeeds with status 200, otherwise ``None``.
    """
    # randint's upper bound is exclusive, so this always pauses for exactly 1 second.
    sleep(np.random.randint(1, 2))
    url = f"https://huggingface.co/{model}/raw/main/README.md"
    response = requests.get(url)
    # The check used to be inverted: it returned the error body on failure and
    # None on success. Return the README only for a successful response.
    if response.status_code == 200:
        return response.text
    return None
       
def process_documents(documents_folder: str):
    """
    Runs metric extraction over every ``.txt`` file located directly in a folder.

    Args:
        documents_folder: Path of the folder to scan (sub-folders are not entered).

    Returns:
        A dict mapping each ``.txt`` file name to the output of
        ``extract_metrics_from_document`` for that file's contents.
    """
    extracted = {}
    for entry in os.listdir(documents_folder):
        full_path = os.path.join(documents_folder, entry)
        # Skip directories and anything that is not a .txt file.
        if not (os.path.isfile(full_path) and entry.endswith(".txt")):
            continue
        with open(full_path, "r", encoding="utf-8") as handle:
            text = handle.read()
        extracted[entry] = extract_metrics_from_document(text)
    return extracted

# Folder holding the .txt documents to analyse (placeholder — point it at a real folder)
documents_folder = "path_to_documents"

# Run the extraction over every document in that folder
metrics_results = process_documents(documents_folder)

# Report each file's result, followed by a 50-dash separator line
for file_name, metrics in metrics_results.items():
    print(f"Metrics for {file_name}:", metrics, "-" * 50, sep="\n")


In [8]:
# Fetch a single README by hand to inspect the raw HTTP response.
# Plain string literal: the URL contains no placeholders, so the f-prefix was unnecessary.
url = "https://huggingface.co/tencent/HunyuanVideo/raw/main/README.md"
response = requests.get(url)