In [None]:
import time
import tempfile
import shutil
import pandas as pd
from huggingface_hub import HfApi, list_models, model_info, hf_hub_download
from tqdm import tqdm
from typing import List, Dict, Optional
from pathlib import Path
import json
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
import logging


# Configure logging: set the root logger to INFO and create the module-level
# logger used by the scraping helpers below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Data Acquisition: Building the Dataset through the Hugging Face Hub API

### Overview
The goal of this step is to construct a robust and informative dataset containing metadata for every model hosted on the Hugging Face Hub. This dataset will serve as a ground truth for a Retrieval-Augmented Generation (RAG) system.

### The Hugging Face Hub API
The [Hugging Face Hub API](https://huggingface.co/docs/huggingface_hub/package_reference/hf_api) provides programmatic access to the platform's extensive model repository through several endpoints, each serving different purposes and offering varying levels of detail.<br>
The `list_models()` method allows iteration over ModelInfo objects but does not return complete metadata, even with the <b>full=True</b> parameter. To access all relevant information, we must call [`model_info()`](https://huggingface.co/docs/huggingface_hub/v0.32.3/en/package_reference/hf_api#huggingface_hub.ModelInfo) individually for each model ID. This yields full metadata, but at the cost of a large number of API requests, which can impact performance and scalability.

----

### Metadata Fields
The fields that we can retrieve and those that are most useful for our purposes include:
- **model_id**: Unique identifier for the model.
- **base_model**: Identifier of the base model from which this model derives (e.g., for fine-tuned models).
- **author**: The creator or organization behind the model.
- **license**: Licensing information for the model.
- **language**: Language of the model's training data or metadata.
- **downloads**: Number of times the model has been downloaded.
- **likes**: Number of likes the model has received.
- **tags**: Tags associated with the model for easier categorization.
- **pipeline_tag**: The pipeline tag associated with the model (e.g., text-generation, image-classification).
- **library_name**: The library name associated with the model (e.g., transformers, diffusers).
- **created_at**: Timestamp of when the model was created.
- **readme_file**: The readme file of the model repository, which may contain additional context and information about the model.

---

### The Challenge of Context
While the metadata fields provide valuable insights, they often lack sufficient context to fully understand the model's capabilities, limitations, and training methodology. The readme file of each model repository is a crucial resource for this additional context, but it comes with its own set of challenges:
- **Inconsistency**: Not all models have a readme file, and those that do may vary significantly in content quality and relevance.
- **Information Overload**: Some readme files may contain excessive or irrelevant information, making it difficult to extract useful insights.
- **Lack of Control**: The content of readme files is user-generated, so we cannot guarantee the presence or quality of information.
- **Performance**: Downloading readme files for a large number of models can be time-consuming and resource-intensive.

In [None]:
# Configuration
# SECURITY: never commit an access token to the notebook. The token is read
# from the HF_TOKEN environment variable; when it is unset the value is None.
# Any token that was previously hard-coded here must be revoked on the Hub.
HF_TOKEN = os.environ.get("HF_TOKEN")
# Number of worker threads used to fetch model metadata concurrently
MAX_WORKERS = 5
# Number of models processed between two checkpoint writes
BATCH_SIZE = 100
# JSON file holding the rows collected so far (used to resume a run)
CHECKPOINT_FILE = "models_scraping_checkpoint.json"
# Maximum number of models requested from the Hub listing
MODEL_LIMIT = 1100
# Cache Directory
# NOTE: "CHACE_DIR" is a misspelling of "CACHE_DIR"; the name is kept because
# the cache clean-up helper in this notebook references it.
CHACE_DIR = "temp_cache"
os.makedirs(CHACE_DIR, exist_ok=True)
# Per-run scratch directory for downloaded README files
TEMP_CACHE_DIR = tempfile.mkdtemp(prefix="hf_temp_cache_", dir=CHACE_DIR)
# Generating Embeddings Batch Size
EMBEDDINGS_BATCH_SIZE = 200

In [None]:
def get_readme_from_repository(repository_id: str):
    """Return the README.md text of a Hub repository.

    The file is downloaded into TEMP_CACHE_DIR and read as UTF-8. Any failure
    (download or read) is logged and an empty string is returned instead.

    Args:
        repository_id: Repository identifier passed to ``hf_hub_download``.

    Returns:
        The README contents, or ``""`` when it could not be obtained.
    """
    try:
        local_readme_path = hf_hub_download(
            repo_id=repository_id,
            filename="README.md",
            token=HF_TOKEN,
            cache_dir=TEMP_CACHE_DIR,
        )
        with open(local_readme_path, "r", encoding="utf-8") as readme_handle:
            return readme_handle.read()
    except Exception as e:
        # Best effort: a missing or unreadable README must not abort the run.
        logger.error(f"Failed to download README for {repository_id}: {e}")
        return ""

def process_single_model(model_id: str) -> Optional[Dict]:
    """Collect one dataset row for ``model_id``.

    Fetches the model's metadata with ``model_info``, downloads its README and
    flattens both into a plain dictionary.

    Args:
        model_id: Identifier of the model to look up.

    Returns:
        A dictionary with the metadata fields and the README text, or ``None``
        when anything raised (the error is logged).
    """
    try:
        info = model_info(model_id, token=HF_TOKEN)
        # A missing or falsy card falls back to an empty dict, so the getattr
        # lookups on it below resolve to None.
        card_data = getattr(info, 'cardData', None) or {}
        readme = get_readme_from_repository(model_id)
        tag_list = getattr(info, 'tags', None) or []

        record = {'model_id': model_id}
        record['base_model'] = getattr(card_data, 'base_model', None)
        record['author'] = getattr(info, 'author', None)
        record['readme_file'] = readme
        record['license'] = getattr(card_data, 'license', None)
        record['language'] = getattr(card_data, 'language', None)
        record['downloads'] = getattr(info, 'downloads', 0)
        record['likes'] = getattr(info, 'likes', 0)
        record['tags'] = ', '.join(tag_list)
        record['pipeline_tag'] = getattr(info, 'pipeline_tag', None)
        record['library_name'] = getattr(info, 'library_name', None)
        record['created_at'] = getattr(info, 'created_at', None)
        return record
    except Exception as e:
        logger.error(f"Error processing {model_id}: {e}")
        return None

def process_batch_threaded(model_ids: List[str]) -> List[Dict]:
    """Run ``process_single_model`` over ``model_ids`` on a thread pool.

    Args:
        model_ids: Identifiers of the models in this batch.

    Returns:
        The truthy rows, in completion order (not input order). Models whose
        task raised or returned a falsy value are left out.
    """
    collected = []

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        # Remember which model each future belongs to for error reporting.
        pending = {}
        for model_id in model_ids:
            pending[pool.submit(process_single_model, model_id)] = model_id

        finished = tqdm(as_completed(pending),
                        total=len(model_ids),
                        desc="Processing batch")
        for done in finished:
            try:
                # as_completed only yields finished futures, so this timeout
                # is a safety net that is not expected to trigger.
                row = done.result(timeout=60)
            except Exception as e:
                logger.error(f"Timeout/Error for {pending[done]}: {e}")
                continue
            if row:
                collected.append(row)

    return collected

def save_checkpoint(data, total=None):
    """Persist the rows collected so far to CHECKPOINT_FILE as JSON.

    The data is written to a sibling ``.tmp`` file first and then moved over
    the checkpoint with ``os.replace``, so an interruption during the write
    cannot leave a truncated checkpoint behind.

    Args:
        data: List of row dictionaries collected so far. Values that are not
            JSON serializable are stringified (``default=str``).
        total: Optional overall number of models, used only for the progress
            message. When omitted, the message shows the completed count only.

    Returns:
        None. Failures are logged and never raised.
    """
    tmp_path = f"{CHECKPOINT_FILE}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, default=str)
        os.replace(tmp_path, CHECKPOINT_FILE)
    except Exception as e:
        logging.error(f"Could not save checkpoint: {e}")
        return

    # Reporting happens outside the try block so that a reporting problem can
    # never be mistaken for a failed save.
    logging.info(f"Checkpoint saved: {len(data)} models processed")
    if total is None:
        print(f"Progress: {len(data)} models completed")
    else:
        print(f"Progress: {len(data)}/{total} models completed")


def cleanup_temp_cache():
    """Delete this run's temporary cache directory (TEMP_CACHE_DIR), if present.

    Errors are logged and never raised.
    """
    try:
        if not os.path.exists(TEMP_CACHE_DIR):
            return
        shutil.rmtree(TEMP_CACHE_DIR)
        logging.info(f"Current temporary cache {TEMP_CACHE_DIR} deleted.")
    except Exception as e:
        logging.error(f"Could not delete current temp cache dir {TEMP_CACHE_DIR}: {e}")


def clean_all_cache_folders():
    """Remove every entry inside CHACE_DIR, keeping the directory itself.

    Sub-directories are removed recursively and plain files are unlinked.
    Errors are logged and never raised; the first error stops the clean-up.
    """
    try:
        if not os.path.exists(CHACE_DIR):
            return

        for entry_name in os.listdir(CHACE_DIR):
            entry_path = os.path.join(CHACE_DIR, entry_name)
            if not os.path.isdir(entry_path):
                os.remove(entry_path)
                logging.info(f"Deleted cache file: {entry_path}")
                continue
            shutil.rmtree(entry_path)
            logging.info(f"Deleted cache folder: {entry_path}")

        logging.info(f"All cache folders in {CHACE_DIR} cleaned up.")
    except Exception as e:
        logging.error(f"Could not delete cache folders in {CHACE_DIR}: {e}")

In [None]:
# Load Checkpoint (if exists)
# Defaults describe a fresh run; they are overwritten only when a checkpoint
# file is found and parsed successfully.
checkpoint_data = []
start_index = 0

if not Path(CHECKPOINT_FILE).exists():
    print("No checkpoint file found. Starting from scratch.")
else:
    try:
        with open(CHECKPOINT_FILE, 'r') as f:
            checkpoint_data = json.load(f)
        # One saved row per processed model.
        start_index = len(checkpoint_data)
        print(f"Loaded checkpoint: {start_index} models already processed")
    except Exception as e:
        # An unreadable checkpoint is reported and the run starts over.
        print(f"Could not load checkpoint file: {e}")

In [None]:
print("Fetching models from Hugging Face Hub...")
models = list(list_models(limit=MODEL_LIMIT))
print(f"Fetched {len(models)} models")

# Prepare models to process
# Resume by model ID instead of by list position: models that failed are not
# stored in the checkpoint, so the number of saved rows is not a reliable
# offset into the listing and would cause finished models to be processed
# (and stored) twice. Models without a saved row are simply tried again.
processed_ids = {
    row.get('model_id') for row in checkpoint_data if isinstance(row, dict)
}
models_to_process = [m for m in models if m.modelId not in processed_ids]
all_data = checkpoint_data.copy()

print(f"Models remaining to process: {len(models_to_process)}")

In [None]:
# Process Models in Batches
if models_to_process:
    # Ceiling division: number of BATCH_SIZE-sized slices needed.
    total_batches = (len(models_to_process) - 1) // BATCH_SIZE + 1

    batch_starts = range(0, len(models_to_process), BATCH_SIZE)
    for current_batch, i in enumerate(batch_starts, start=1):
        batch_models = models_to_process[i:i + BATCH_SIZE]
        batch_ids = []
        for m in batch_models:
            batch_ids.append(m.modelId)

        print(f"\nProcessing batch {current_batch}/{total_batches}")
        print(f"Batch size: {len(batch_ids)} models")

        # Fetch the batch concurrently and append its rows to the running total
        batch_results = process_batch_threaded(batch_ids)
        all_data.extend(batch_results)

        # Persist progress after every batch so the run can be resumed
        save_checkpoint(all_data)

        # Drop the README files downloaded for this batch
        cleanup_temp_cache()

        # Short pause between batches (rate limiting)
        time.sleep(0.5)

In [None]:
# Build the dataset table from the collected rows and report its size.
df = pd.DataFrame(all_data)

print("\nScraping completed!")
print(f"Total models processed: {df.shape[0]}")
print(f"Dataset shape: {df.shape}")
df.head()

### Cleaning Markdown Readme Files
This step involves cleaning the readme files extracted from the Hugging Face models to ensure that they contain only relevant textual content, while preserving titles and important information. The cleaning process will remove unnecessary formatting, images, links, and other non-essential elements.

In [None]:
import html
import re


def clean_markdown(text):
    """
    Clean markdown text and extract only textual content while preserving titles.

    Args:
        text (str): Raw markdown text. ``None``, non-string values (such as
            NaN) and empty strings produce an empty result.

    Returns:
        str: Cleaned text with titles preserved. Every line is stripped and
        blank lines are dropped, so the result is the non-empty lines joined
        by single newlines.
    """
    if not isinstance(text, str) or not text:
        return ""

    # Remove YAML front matter (--- ... ---). It is only recognised at the very
    # start of the document: matching it at any line start would delete all
    # content located between two horizontal rules.
    text = re.sub(r'\A---\s*\n.*?\n---\s*(?:\n|\Z)', '', text, flags=re.DOTALL)

    # Remove code blocks (triple backticks) first, so characters inside code
    # ('<', '>', '#', '_', ...) cannot interact with the patterns below
    text = re.sub(r'```[\s\S]*?```', '', text)

    # Remove HTML tags but keep the content
    text = re.sub(r'<[^>]+>', '', text)

    # Remove images ![alt text](url) or ![alt text][ref]
    text = re.sub(r'!\[.*?\]\([^)]*\)', '', text)
    text = re.sub(r'!\[.*?\]\[[^\]]*\]', '', text)

    # Remove standalone image references [image]: url
    text = re.sub(r'^\s*\[.*?\]:\s*https?://.*$', '', text, flags=re.MULTILINE)

    # Convert headers to plain text (preserve titles)
    # Handle # ## ### #### ##### ###### headers
    text = re.sub(r'^#{1,6}\s+(.+)$', r'\1', text, flags=re.MULTILINE)

    # Remove links but keep the link text [text](url) -> text
    text = re.sub(r'\[([^\]]*)\]\([^)]*\)', r'\1', text)

    # Remove reference-style links [text][ref] -> text
    text = re.sub(r'\[([^\]]*)\]\[[^\]]*\]', r'\1', text)

    # Remove link references [ref]: url
    text = re.sub(r'^\s*\[.*?\]:\s*.*$', '', text, flags=re.MULTILINE)

    # Remove inline code `code` -> code
    text = re.sub(r'`([^`]*)`', r'\1', text)

    # Remove horizontal rules (---, *** or ___) before the emphasis patterns,
    # which would otherwise consume part of a '***' / '___' rule
    text = re.sub(r'^[-*_]{3,}\s*$', '', text, flags=re.MULTILINE)

    # Remove bold and italic formatting
    text = re.sub(r'\*\*\*(.*?)\*\*\*', r'\1', text)  # Bold italic
    text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)  # Bold
    text = re.sub(r'\*(.*?)\*', r'\1', text)  # Italic
    # Underscore emphasis must not touch word characters on its outer sides;
    # otherwise identifiers such as bert_base_uncased lose their underscores.
    text = re.sub(r'(?<!\w)___(.+?)___(?!\w)', r'\1', text)  # Bold italic
    text = re.sub(r'(?<!\w)__(.+?)__(?!\w)', r'\1', text)  # Bold
    text = re.sub(r'(?<!\w)_(.+?)_(?!\w)', r'\1', text)  # Italic

    # Remove strikethrough ~~text~~ -> text
    text = re.sub(r'~~(.*?)~~', r'\1', text)

    # Remove blockquotes > text -> text
    text = re.sub(r'^>\s*', '', text, flags=re.MULTILINE)

    # Remove list markers (-, *, +, numbers)
    text = re.sub(r'^[\s]*[-*+]\s+', '', text, flags=re.MULTILINE)
    text = re.sub(r'^[\s]*\d+\.\s+', '', text, flags=re.MULTILINE)

    # Remove tables (simple approach - remove lines with | characters)
    text = re.sub(r'^.*\|.*$', '', text, flags=re.MULTILINE)

    # Unescape HTML entities (much more comprehensive than manual replacement)
    text = html.unescape(text)

    # Normalise whitespace: strip every line and keep only the non-empty ones
    stripped_lines = (line.strip() for line in text.split('\n'))
    return '\n'.join(line for line in stripped_lines if line)

In [None]:
print("Readme empty string count:", (df['readme_file'] == '').sum())
print("Removing models with empty readme...")
# Take an explicit copy of the filtered rows: assigning a column on a filtered
# slice triggers pandas' SettingWithCopyWarning and is not guaranteed to
# update the intended object.
df = df[df['readme_file'] != ''].copy()
print("DataFrame shape after removing empty readmes:", df.shape)
print("Applying markdown cleaning to readme files")
df['readme_file'] = df['readme_file'].apply(clean_markdown)
print("Final shape of DataFrame:", df.shape)
df.head()

### Generating Content Embeddings
This is a preprocessing step in order to generate embeddings for the content of the models. The embeddings will be used to compare and rank models based on their metadata and readme content, enabling efficient retrieval in a RAG system. The model used is an open source model: [jinaai/jina-embeddings-v3](https://huggingface.co/jinaai/jina-embeddings-v3)

A few important considerations:
- The embedding model is not specifically trained on structured metadata fields (e.g. license, tags), so it may not fully capture their semantic weight or relevance.
- README files often contain noisy, inconsistent, or sparse information. This can affect the quality of the resulting embeddings.
- Some fields, such as `license`, `language`, or `tags`, might be missing or incomplete.

At this stage, we apply a simple approach: we concatenate all available metadata fields along with the README content into a single string, which is then embedded. <br>
A potentially more effective approach would be to embed each field separately and combine the results as a weighted sum, with each field's weight reflecting its importance or relevance to the model's capabilities.

In [None]:
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer
import torch
import gc

embedding_model = "jinaai/jina-embeddings-v3"


def generate_content_embeddings(data, batch_size):
    """Embed the concatenated metadata and README text of every row.

    The text columns are converted to strings (missing values become empty
    strings), joined with newlines into one text per row and encoded in
    batches with the model named by ``embedding_model``.

    Args:
        data: DataFrame containing the columns 'model_id', 'base_model',
            'author', 'license', 'language', 'tags', 'pipeline_tag',
            'library_name' and 'readme_file'. It is not modified.
        batch_size: Number of rows passed to each ``encode`` call; must be
            at least 1.

    Returns:
        A copy of ``data`` whose text columns are strings and which has an
        additional 'embeddings' column holding one list per row. Rows of a
        batch that failed to encode get an empty list.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = AutoModel.from_pretrained(embedding_model, trust_remote_code=True).to(device)

    # Make sure every column involved is a string, even when it holds lists
    # or missing values
    text_columns = ['model_id', 'base_model', 'author', 'license', 'language', 'tags', 'pipeline_tag', 'library_name', 'readme_file']

    # Work on a copy so the caller's DataFrame is left untouched.
    data = data.copy()
    # Fill missing values BEFORE converting to str: converting first turns
    # None/NaN into the literal text "None"/"nan", which would then be embedded.
    data[text_columns] = data[text_columns].fillna("").astype(str)

    # One newline-separated text per row, fields in text_columns order
    full_texts = [
        "\n".join(row)
        for row in data[text_columns].itertuples(index=False, name=None)
    ]

    embeddings = []

    for start in tqdm(range(0, len(full_texts), batch_size), desc="Processing Embeddings Batches"):
        end = min(start + batch_size, len(full_texts))
        batch_texts = full_texts[start:end]

        try:
            with torch.no_grad():
                batch_embeddings = model.encode(batch_texts, task="text-matching").tolist()

            embeddings.extend(batch_embeddings)

        except Exception as e:
            # Best effort: keep the rows aligned by inserting placeholders.
            print(f"An error occurred during processing the batch:{start}-{end}. Exception: {e}")
            print("Embeddings will be filled with empty lists.")
            # Distinct list objects, so the placeholder rows do not alias
            embeddings.extend([] for _ in batch_texts)

        finally:
            # Release memory between batches, also after a failed batch
            torch.cuda.empty_cache()
            gc.collect()

    del model
    torch.cuda.empty_cache()

    data['embeddings'] = embeddings
    return data

In [None]:
# Generate the embeddings, passing five rows to each encode call.
# NOTE(review): EMBEDDINGS_BATCH_SIZE is defined in the configuration cell but
# is not used here — confirm which batch size is intended.
df = generate_content_embeddings(df, batch_size=5)
df.head()

In [None]:
# Write the final dataset to CSV without the index column.
# NOTE(review): the list-valued 'embeddings' cells are presumably written as
# their string representation and would need parsing on reload — confirm.
df.to_csv("huggingface_models_embeddings.csv", index=False)