In [1]:
import time
import tempfile
import shutil
import pandas as pd
from huggingface_hub import list_models, model_info, hf_hub_download
from tqdm import tqdm
from typing import List, Dict, Optional
from pathlib import Path
import json
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
import logging
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer
import torch
import gc

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

  from .autonotebook import tqdm as notebook_tqdm


# Data Acquisition: Building the Dataset through the Hugging Face Hub API

### Overview
The goal of this step is to construct a robust and informative dataset containing metadata for every model hosted on the Hugging Face Hub. This dataset will serve as a ground truth for a Retrieval-Augmented Generation (RAG) system.

### The Hugging Face Hub API
The [Hugging Face Hub API](https://huggingface.co/docs/huggingface_hub/package_reference/hf_api) provides programmatic access to the platform's extensive model repository through several endpoints, each serving different purposes and offering varying levels of detail.<br>
The `list_models()` method allows iteration over ModelInfo objects but does not return complete metadata, even with the <b>full=True</b> parameter. To access all relevant information, we must call [`model_info()`](https://huggingface.co/docs/huggingface_hub/v0.32.3/en/package_reference/hf_api#huggingface_hub.ModelInfo) individually for each model ID. This yields full metadata, but at the cost of a large number of API requests, which can impact performance and scalability.

----

### Metadata Fields
The fields that we can retrieve and those that are most useful for our purposes include:
- **model_id**: Unique identifier for the model.
- **base_model**: Identifier of the base model from which this model derives (e.g., for fine-tuned models).
- **author**: The creator or organization behind the model.
- **license**: Licensing information for the model.
- **language**: Language of the model's training data or metadata.
- **downloads**: Number of times the model has been downloaded.
- **likes**: Number of likes the model has received.
- **tags**: Tags associated with the model for easier categorization.
- **pipeline_tag**: The pipeline tag associated with the model (e.g., text-generation, image-classification).
- **library_name**: The library name associated with the model (e.g., transformers, diffusers).
- **created_at**: Timestamp of when the model was created.
- **readme_file**: The readme file of the model repository, which may contain additional context and information about the model.

---

### The Challenge of Context
While the metadata fields provide valuable insights, they often lack sufficient context to fully understand the model's capabilities, limitations, and training methodology. The readme file of each model repository is a crucial resource for this additional context, but it comes with its own set of challenges:
- **Inconsistency**: Not all models have a readme file, and those that do may vary significantly in content quality and relevance.
- **Information Overload**: Some readme files may contain excessive or irrelevant information, making it difficult to extract useful insights.
- **Lack of Control**: The content of readme files is user-generated, so we cannot guarantee the presence or quality of information.
- **Performance**: Downloading readme files for a large number of models can be time-consuming and resource-intensive.

In [6]:
# Configuration
HF_TOKEN = "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"
MAX_WORKERS = 5
BATCH_SIZE = 100
CHECKPOINT_FILE = "models_scraping_checkpoint.json"
MODEL_LIMIT = 1100
# Cache Directory
CHACE_DIR = "temp_cache"
os.makedirs(CHACE_DIR, exist_ok=True)
TEMP_CACHE_DIR = tempfile.mkdtemp(prefix="hf_temp_cache_", dir=CHACE_DIR)
# Generating Embeddings Batch Size
EMBEDDINGS_BATCH_SIZE = 200

In [7]:
def get_readme_from_repository(repository_id: str):
    """
    Fetches the README.md file from a given Hugging Face dataset repository.

    Args:
        repository_id (str): The ID of the Hugging Face dataset repository.
    Returns:
        str: The content of the README.md file, or an empty string if not found.
    """
    try:
        readme_content = hf_hub_download(
            repo_id=repository_id,
            filename="README.md",
            token=HF_TOKEN,
            cache_dir=TEMP_CACHE_DIR
        )

        with open(readme_content, "r", encoding="utf-8") as f:
            readme_text = f.read()

        return readme_text
    except Exception as e:
        logger.error(f"Failed to download README for {repository_id}: {e}")
        return ""


def process_single_model(model_id: str) -> Optional[Dict]:
    """
    Processes a single model metadata from the Hugging Face Hub.

    Args:
        model_id (str): The ID of the model to process.
    Returns:
        Optional[Dict]: A dictionary containing the processed metadata, or None if an error occurs.
    """
    try:
        info = model_info(model_id, token=HF_TOKEN)
        card_data = info.cardData if hasattr(info, 'cardData') and info.cardData else {}

        readme = get_readme_from_repository(model_id)

        return {
            'model_id': model_id,
            'base_model': getattr(card_data, 'base_model', None),
            'author': getattr(info, 'author', None),
            'readme_file': readme,
            'license': getattr(card_data, 'license', None),
            'language': getattr(card_data, 'language', None),
            'downloads': getattr(info, 'downloads', 0),
            'likes': getattr(info, 'likes', 0),
            'tags': ', '.join(info.tags) if hasattr(info, 'tags') and info.tags else '',
            'pipeline_tag': getattr(info, 'pipeline_tag', None),
            'library_name': getattr(info, 'library_name', None),
            'created_at': getattr(info, 'created_at', None),
        }
    except Exception as e:
        logger.error(f"Error processing {model_id}: {e}")
        return None


def process_batch_threaded(model_ids: List[str]) -> List[Dict]:
    """Process a batch of models using ThreadPoolExecutor"""
    results = []

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # Submit all tasks
        future_to_model = {
            executor.submit(process_single_model, model_id): model_id
            for model_id in model_ids
        }

        # Collect results with progress bar
        for future in tqdm(as_completed(future_to_model),
                           total=len(model_ids),
                           desc=f"Processing batch"):
            try:
                result = future.result(timeout=60)  # 60 second timeout
                if result:
                    results.append(result)
            except Exception as e:
                model_id = future_to_model[future]
                logger.error(f"Timeout/Error for {model_id}: {e}")

    return results


def save_checkpoint(data):
    """
    Saves the processed data to a JSON checkpoint file.

    Args:
        data (list): List of processed dataset metadata dictionaries.
    """
    try:
        with open(CHECKPOINT_FILE, 'w') as f:
            json.dump(data, f, indent=2, default=str)
        logging.info(f"Checkpoint saved: {len(data)} datasets processed")
        print(f"Progress: {len(data)}/{len(datasets)} datasets completed")
    except Exception as e:
        logging.error(f"Could not save checkpoint: {e}")


def cleanup_temp_cache():
    """
    Cleans up the temporary cache directory used for storing readme files.
    """
    try:
        if os.path.exists(TEMP_CACHE_DIR):
            shutil.rmtree(TEMP_CACHE_DIR)
            logging.info(f"Current temporary cache {TEMP_CACHE_DIR} deleted.")
    except Exception as e:
        logging.error(f"Could not delete current temp cache dir {TEMP_CACHE_DIR}: {e}")


def clean_all_cache_folders():
    """
    Cleans up all cache folders in the CHACE_DIR directory.
    This function deletes all files and directories within the CHACE_DIR.
    """
    try:
        if os.path.exists(CHACE_DIR):
            for item in os.listdir(CHACE_DIR):
                item_path = os.path.join(CHACE_DIR, item)
                if os.path.isdir(item_path):
                    shutil.rmtree(item_path)
                    logging.info(f"Deleted cache folder: {item_path}")
                else:
                    os.remove(item_path)
                    logging.info(f"Deleted cache file: {item_path}")

            logging.info(f"All cache folders in {CHACE_DIR} cleaned up.")
    except Exception as e:
        logging.error(f"Could not delete cache folders in {CHACE_DIR}: {e}")

In [8]:
# Load Checkpoint (if exists)
checkpoint_data = []
start_index = 0

if Path(CHECKPOINT_FILE).exists():
    try:
        with open(CHECKPOINT_FILE, 'r') as f:
            checkpoint_data = json.load(f)
        start_index = len(checkpoint_data)
        print(f"Loaded checkpoint: {start_index} models already processed")
    except Exception as e:
        print(f"Could not load checkpoint file: {e}")
else:
    print("No checkpoint file found. Starting from scratch.")

Loaded checkpoint: 169986 models already processed


In [10]:
print("Fetching models from Hugging Face Hub...")
models = list(list_models(limit=MODEL_LIMIT))
print(f"Fetched {len(models)} models")

# Prepare models to process
models_to_process = models[start_index:]
all_data = checkpoint_data.copy()

print(f"Models remaining to process: {len(models_to_process)}")

Fetching models from Hugging Face Hub...
Fetched 1100 models
Models remaining to process: 0


In [None]:
# Process Models in Batches
if models_to_process:
    total_batches = (len(models_to_process) - 1) // BATCH_SIZE + 1

    for i in range(0, len(models_to_process), BATCH_SIZE):
        batch_models = models_to_process[i:i + BATCH_SIZE]
        batch_ids = [m.modelId for m in batch_models]

        current_batch = i // BATCH_SIZE + 1
        print(f"\nProcessing batch {current_batch}/{total_batches}")
        print(f"Batch size: {len(batch_ids)} models")

        # Process batch
        batch_results = process_batch_threaded(batch_ids)
        all_data.extend(batch_results)

        # Save checkpoint
        save_checkpoint(all_data)

        # Clean up temporary temp_cache directory
        cleanup_temp_cache()

        # Rate limiting
        time.sleep(0.5)

In [11]:
df = pd.DataFrame(all_data)

print(f"\nScraping completed!")
print(f"Total models processed: {len(df)}")
print(f"Dataset shape: {df.shape}")
df.head()


Scraping completed!
Total models processed: 169986
Dataset shape: (169986, 12)


Unnamed: 0,model_id,base_model,author,readme_file,license,language,downloads,likes,tags,pipeline_tag,library_name,created_at
0,Qwen/Qwen3-Embedding-0.6B-GGUF,[Qwen/Qwen3-0.6B-Base],Qwen,---\nlicense: apache-2.0\nbase_model:\n- Qwen/...,apache-2.0,,10264,307,"gguf, arxiv:2506.05176, base_model:Qwen/Qwen3-...",,,2025-06-05 08:34:51+00:00
1,fishaudio/openaudio-s1-mini,,fishaudio,---\ntags:\n- text-to-speech\nlicense: cc-by-n...,cc-by-nc-sa-4.0,"[zh, en, de, ja, fr, es, ko, ar, nl, ru, it, p...",1649,204,"dual_ar, text-to-speech, zh, en, de, ja, fr, e...",text-to-speech,,2025-05-31 11:57:47+00:00
2,deepseek-ai/DeepSeek-R1-0528,,deepseek-ai,---\nlicense: mit\nlibrary_name: transformers\...,mit,,104615,1918,"transformers, safetensors, deepseek_v3, text-g...",text-generation,transformers,2025-05-28 09:46:42+00:00
3,mistralai/Magistral-Small-2506,[mistralai/Mistral-Small-3.1-24B-Instruct-2503],mistralai,---\nlanguage:\n- en\n- fr\n- de\n- es\n- pt\n...,apache-2.0,"[en, fr, de, es, pt, it, ja, ko, ru, zh, ar, f...",101,231,"vllm, safetensors, mistral, conversational, en...",text-generation,vllm,2025-06-04 10:51:21+00:00
4,google/gemma-3n-E4B-it-litert-preview,,google,---\nlicense: gemma\npipeline_tag: image-text-...,gemma,,0,1074,"image-text-to-text, arxiv:1905.07830, arxiv:19...",image-text-to-text,,2025-05-18 19:24:14+00:00


### Cleaning Markdown Readme Files
This step involves cleaning the readme files extracted from the Hugging Face models to ensure that they contain only relevant textual content, while preserving titles and important information. The cleaning process will remove unnecessary formatting, images, links, and other non-essential elements.

In [12]:
import re


def clean_markdown(text):
    # Remove YAML front matter
    text = re.sub(r'^---.*?---\s*', '', text, flags=re.DOTALL)

    # Remove HTML tags
    text = re.sub(r'<[^>]+>', '', text)

    # Remove images (![alt](url))
    text = re.sub(r'!\[.*?\]\(.*?\)', '', text)

    # Remove markdown links but keep the visible text: [text](url) → text
    text = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', text)

    # Remove tables (lines containing |, excluding bullet points)
    text = re.sub(r'^\s*\|.*\|.*$', '', text, flags=re.MULTILINE)

    # Remove code blocks (``` ... ```)
    text = re.sub(r'```.*?```', '', text, flags=re.DOTALL)

    # Remove inline code (`code`)
    text = re.sub(r'`[^`]+`', '', text)

    # Remove blockquotes (> ...)
    text = re.sub(r'^\s*>.*$', '', text, flags=re.MULTILINE)

    # Remove citation block (```)
    text = re.sub(r'^@misc.*?```', '', text, flags=re.DOTALL)

    # Remove extra newlines
    text = re.sub(r'\n{2,}', '\n\n', text)

    # Trim whitespace
    return text.strip()


# Clean the readme text in the DataFrame
df['readme_file'] = df['readme_file'].apply(clean_markdown)

In [13]:
print("Readme empty string count:", (df['readme_file'] == '').sum())
print("Removing models with empty readme...")
df = df[df['readme_file'] != '']
print("DataFrame shape after removing empty readmes:", df.shape)
print("Applying markdown cleaning to readme files")
#df['readme_file'] = df['readme_file'].apply(clean_markdown)
print("Final shape of DataFrame:", df.shape)
df.head()

Readme empty string count: 81827
Removing models with empty readme...
DataFrame shape after removing empty readmes: (88159, 12)
Applying markdown cleaning to readme files
Final shape of DataFrame: (88159, 12)


Unnamed: 0,model_id,base_model,author,readme_file,license,language,downloads,likes,tags,pipeline_tag,library_name,created_at
0,Qwen/Qwen3-Embedding-0.6B-GGUF,[Qwen/Qwen3-0.6B-Base],Qwen,# Qwen3-Embedding-0.6B-GGUF\n\n \n\n## High...,apache-2.0,,10264,307,"gguf, arxiv:2506.05176, base_model:Qwen/Qwen3-...",,,2025-06-05 08:34:51+00:00
1,fishaudio/openaudio-s1-mini,,fishaudio,# OpenAudio S1\n\n**OpenAudio S1** is a leadin...,cc-by-nc-sa-4.0,"[zh, en, de, ja, fr, es, ko, ar, nl, ru, it, p...",1649,204,"dual_ar, text-to-speech, zh, en, de, ja, fr, e...",text-to-speech,,2025-05-31 11:57:47+00:00
2,deepseek-ai/DeepSeek-R1-0528,,deepseek-ai,# DeepSeek-R1-0528\n\n \n\n \n \n \n \n...,mit,,104615,1918,"transformers, safetensors, deepseek_v3, text-g...",text-generation,transformers,2025-05-28 09:46:42+00:00
3,mistralai/Magistral-Small-2506,[mistralai/Mistral-Small-3.1-24B-Instruct-2503],mistralai,# Model Card for Magistral-Small-2506\n\nBuild...,apache-2.0,"[en, fr, de, es, pt, it, ja, ko, ru, zh, ar, f...",101,231,"vllm, safetensors, mistral, conversational, en...",text-generation,vllm,2025-06-04 10:51:21+00:00
4,google/gemma-3n-E4B-it-litert-preview,,google,# Gemma 3n model card\n\n**Model Page**: Gemma...,gemma,,0,1074,"image-text-to-text, arxiv:1905.07830, arxiv:19...",image-text-to-text,,2025-05-18 19:24:14+00:00


### Generating Content Embeddings
This is a preprocessing step in order to generate embeddings for the content of the models. The embeddings will be used to compare and rank models based on their metadata and readme content, enabling efficient retrieval in a RAG system. The model used is an open source model: [jinaai/jina-embeddings-v3](https://huggingface.co/jinaai/jina-embeddings-v3)

A few important considerations:
- The embedding model is not specifically trained on structured metadata fields (e.g. license, tags), so it may not fully capture their semantic weight or relevance.
- README files often contain noisy, inconsistent, or sparse information. This can affect the quality of the resulting embeddings.
- Some fields such as `license`,`language` or `tags` might be missing or incomplete.


### Weighted Embeddings Approach
In this section, we will generate embeddings for the datasets using a weighted approach. Each field in the dataset will be assigned a weight based on its importance for content matching and retrieval. The weights will be used to combine the embeddings of different fields into a single embedding vector.

In [15]:
df = pd.read_csv("Output/models_hg_cleaned.csv")
df = df.sample(30000, random_state=42).reset_index(drop=True)
print(f"Dataset shape: {df.shape}")
print(df.columns)

Dataset shape: (30000, 12)
Index(['model_id', 'base_model', 'author', 'readme_file', 'license',
       'language', 'downloads', 'likes', 'tags', 'pipeline_tag',
       'library_name', 'created_at'],
      dtype='object')


In [16]:
df.head()

Unnamed: 0,model_id,base_model,author,readme_file,license,language,downloads,likes,tags,pipeline_tag,library_name,created_at
0,pnparam/xlsr_comb2,,pnparam,# xlsr_comb2\n\nThis model is a fine-tuned ver...,apache-2.0,,7,0,"transformers, pytorch, wav2vec2, automatic-spe...",automatic-speech-recognition,transformers,2023-02-25 15:37:17+00:00
1,Wlad777/test,,Wlad777,10+9,,,0,0,region:us,,,2023-02-23 08:04:39+00:00
2,VAZaytsev/ppo_LL,,VAZaytsev,# PPO Agent Playing LunarLander-v2\n\n This i...,,,0,0,"tensorboard, LunarLander-v2, ppo, deep-reinfor...",reinforcement-learning,,2023-03-12 14:24:05+00:00
3,cardiffnlp/twitter-xlm-roberta-base-sentiment-...,,cardiffnlp,# cardiffnlp/twitter-xlm-roberta-base-sentimen...,,,48289,28,"transformers, pytorch, xlm-roberta, text-class...",text-classification,transformers,2022-12-01 00:32:11+00:00
4,alireza7/ARMAN-SS-100-persian-base,,alireza7,More information about models is available here.,,,21,0,"transformers, pytorch, pegasus, text2text-gene...",text2text-generation,transformers,2022-03-02 23:29:05+00:00


In [None]:
embedding_model = "jinaai/jina-embeddings-v3"

FIELD_WEIGHTS = {
    'model_id': 0.05,  # Low relevance – just an identifier, rarely informative for semantic search.
    'base_model': 0.10,  # Medium relevance – useful to understand the model's architecture or foundation.
    'author': 0.05,  # Low relevance – rarely impacts model capabilities or domain.
    'license': 0.05,  # Low relevance – important for legal use but not for content relevance.
    'language': 0.10,  # Medium relevance – essential when queries specify language preferences.
    'tags': 0.20,  # High relevance – concise, curated keywords help capture the model's purpose.
    'pipeline_tag': 0.15,  # High relevance – explicitly defines the task (e.g., classification, QA).
    'library_name': 0.10,  # Medium relevance – relevant when a specific framework is required.
    'readme_file': 0.20  # Highest relevance – contains descriptive context but may be noisy or verbose.
}

In [None]:
def generate_weighted_embeddings(data, batch_size):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = AutoModel.from_pretrained(embedding_model, trust_remote_code=True).to(device)

    text_columns = list(FIELD_WEIGHTS.keys())
    data[text_columns] = data[text_columns].fillna("").astype(str)

    embeddings = []

    for start in tqdm(range(0, len(data), batch_size), desc="Processing Weighted Embeddings"):
        end = min(start + batch_size, len(data))
        batch_data = data.iloc[start:end]

        # Build flat list of texts to embed and keep mapping
        texts_to_embed = []
        index_map = []  # (row_index_in_batch, field_name)

        for i, row in batch_data.iterrows():
            for field in text_columns:
                text = row[field].strip()
                if text:
                    texts_to_embed.append(text)
                    index_map.append((i - start, field))  # i-start: relative index in batch

        try:
            # Batch encode 
            with torch.no_grad():
                all_embeddings = model.encode(texts_to_embed, task="text-matching")

            # Build empty structures for rows
            row_embs = [[] for _ in range(len(batch_data))]
            row_weights = [0.0 for _ in range(len(batch_data))]

            # Populate embeddings and weights
            for emb, (row_idx, field) in zip(all_embeddings, index_map):
                weight = FIELD_WEIGHTS[field]
                row_embs[row_idx].append(np.array(emb) * weight)
                row_weights[row_idx] += weight

            # Compute final embedding per row
            for i in range(len(batch_data)):
                if row_embs[i]:
                    combined_emb = np.sum(row_embs[i], axis=0) / row_weights[i]
                else:
                    combined_emb = len(all_embeddings[0]) if all_embeddings else 1024
                embeddings.append(combined_emb.tolist())

            torch.cuda.empty_cache()
            gc.collect()
        except Exception as e:
            print(f"An error occurred during processing the batch:{start}-{end}. Exception: {e}")
            print("Embeddings will be filled with empty lists.")
            embeddings.extend([[]] * (end - start))

    del model
    torch.cuda.empty_cache()

    data['embeddings'] = embeddings

    return data


df = generate_weighted_embeddings(df, batch_size=EMBEDDINGS_BATCH_SIZE)

In [None]:
df.head()
df.to_csv("Output/datasets_hg_weighted_emb.csv", index=False)

### Classic Embeddings Approach
In this section, we will generate embeddings for the datasets using a classic approach. This involves concatenating the text from multiple fields into a single string and generating embeddings for that combined text. The embeddings will be generated using the same model as before: [jinaai/jina-embeddings-v3](https://huggingface.co/jinaai/jina-embeddings-v3).

In [None]:
embedding_model = "jinaai/jina-embeddings-v3"


def generate_content_embeddings(data, batch_size):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = AutoModel.from_pretrained(embedding_model, trust_remote_code=True).to(device)

    # Assicura che tutte le colonne coinvolte siano stringhe, anche se sono liste o NaN
    text_columns = ['model_id', 'base_model', 'author', 'license', 'language', 'tags', 'pipeline_tag', 'library_name',
                    'readme_file']

    for col in text_columns:
        data[col] = data[col].astype(str)

    data[text_columns] = data[text_columns].fillna("").astype(str)

    data['full_text'] = (
            data['model_id'] + "\n" +
            data['base_model'] + "\n" +
            data['author'] + "\n" +
            data['license'] + "\n" +
            data['language'] + "\n" +
            data['tags'] + "\n" +
            data['pipeline_tag'] + "\n" +
            data['library_name'] + "\n" +
            data['readme_file']
    )

    embeddings = []

    for start in tqdm(range(0, len(data), batch_size), desc="Processing Embeddings Batches"):
        end = min(start + batch_size, len(data))

        try:
            batch_texts = data['full_text'].iloc[start:end].tolist()

            with torch.no_grad():
                batch_embeddings = model.encode(batch_texts, task="text-matching").tolist()

            embeddings.extend(batch_embeddings)

            torch.cuda.empty_cache()
            gc.collect()

        except Exception as e:
            print(f"An error occurred during processing the batch:{start}-{end}. Exception: {e}")
            print("Embeddings will be filled with empty lists.")
            embeddings.extend([[]] * (end - start))

    del model
    torch.cuda.empty_cache()

    data['embeddings'] = embeddings
    data = data.drop(columns=['full_text'])
    return data

In [None]:
df = generate_content_embeddings(df, batch_size=5)
df.head()

In [None]:
df.to_csv("Output/models_hg_embeddings.csv", index=False)