In [1]:
import time
import tempfile
import os
import re
import html
import shutil
import pandas as pd
from huggingface_hub import hf_hub_download, list_datasets
from tqdm import tqdm
from pathlib import Path
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
import logging
import warnings

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


# Data Acquisition: Building a Dataset from Hugging Face Hub API

### Overview
The goal of this step is to construct a robust and informative dataset containing metadata for every dataset hosted on the Hugging Face Hub. This dataset will serve as a ground truth for a Retrieval-Augmented Generation (RAG) system.

### The Hugging Face Hub API
The [Hugging Face Hub API](https://huggingface.co/docs/huggingface_hub/package_reference/hf_api) provides programmatic access to the platform's extensive model repository through several endpoints, each serving different purposes and offering varying levels of detail.<br>
The [`list_datasets()`](https://huggingface.co/docs/huggingface_hub/v0.32.3/en/package_reference/hf_api#huggingface_hub.HfApi.list_datasets) method allows iteration over `DatasetInfo` objects; with the `full=True` parameter we can retrieve all the metadata associated with each dataset.

---

### Metadata Fields
The fields that we can retrieve and those that are most useful for our purposes include:
- **dataset_id**: Unique identifier for the dataset.
- **author**: The creator or maintainer of the dataset.
- **created_at**: Timestamp indicating when the dataset was created.
- **readme_file**: The readme file of the dataset repository, which may contain additional context and information about the dataset.
- **downloads**: Number of times the dataset has been downloaded.
- **likes**: Number of likes the dataset has received.
- **tags**: Tags associated with the dataset, useful for categorization.
- **language**: Language of dataset’s data or metadata.
- **license**: License under which the dataset is released.
- **multilinguality**: Whether the dataset is multilingual. Options are: ‘monolingual’, ‘multilingual’, ‘translation’, ‘other’.
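- **size_categories**: The number of examples in the dataset. Options are: `n<1K`, `1K<n<10K`, `10K<n<100K`, `100K<n<1M`, `1M<n<10M`, `10M<n<100M`, `100M<n<1B`, `1B<n<10B`, `10B<n<100B`, `100B<n<1T`, `n>1T`, and `other`.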
- **task-categories**: The tasks the dataset is intended for, such as ‘text-classification’, ‘text-generation’, etc.

---

### The Challenge of Context
While the metadata fields provide valuable insights, they often lack the necessary context to fully understand the dataset's purpose and content. The readme file is a crucial component that can fill this gap, offering detailed explanations, usage examples, and additional information that is not captured in the metadata alone, but it comes with its own set of challenges that we've seen before:
- **Inconsistency**: Not all datasets have a readme file, and those that do may vary significantly in content quality and relevance.
- **Information Overload**: Some readme files may contain excessive or irrelevant information, making it difficult to extract useful insights.
- **Lack of Control**: The content of readme files is user-generated, so we cannot guarantee the presence or quality of information.
- **Performance**: Downloading readme files for a large number of datasets can be time-consuming and resource-intensive.

In [2]:
# Configuration
# The access token is read from the HF_TOKEN environment variable instead of
# being embedded in the notebook: a hard-coded token leaks through version
# control and shared notebooks. The token that used to be written here should
# be treated as compromised and revoked.
# When the variable is unset HF_TOKEN is None; presumably huggingface_hub then
# falls back to its locally stored login, if any — confirm against its docs.
HF_TOKEN = os.environ.get("HF_TOKEN")
MAX_WORKERS = 5  # threads used per batch of README downloads
BATCH_SIZE = 10  # datasets processed between two checkpoints
CHECKPOINT_FILE = "dataset_scraping_checkpoint.json"
MODEL_LIMIT = 100  # passed as `limit` to list_datasets()
# Cache Directory
# NOTE: "CHACE" is a historical misspelling of "CACHE"; the name is kept because
# other cells reference it.
CHACE_DIR = "temp_cache"
os.makedirs(CHACE_DIR, exist_ok=True)
# Fresh per-session sub-directory that README downloads are cached into.
TEMP_CACHE_DIR = tempfile.mkdtemp(prefix="hf_temp_cache_", dir=CHACE_DIR)
# Generating Embeddings Batch Size
EMBEDDINGS_BATCH_SIZE = 200

In [3]:
def get_readme_from_repository(repository_id: str):
    """Return the text of a dataset repository's README.md.

    The file is fetched with ``hf_hub_download`` (repo type ``dataset``) into
    ``TEMP_CACHE_DIR`` and read back as UTF-8.

    Args:
        repository_id: Hub identifier of the dataset repository.

    Returns:
        The README contents, or an empty string when the download or the read
        fails for any reason (the failure is logged).
    """
    try:
        # hf_hub_download returns the local path of the cached file.
        local_path = hf_hub_download(
            repo_id=repository_id,
            filename="README.md",
            repo_type="dataset",
            token=HF_TOKEN,
            cache_dir=TEMP_CACHE_DIR,
        )
        return Path(local_path).read_text(encoding="utf-8")
    except Exception as e:
        logger.error(f"Failed to download README for {repository_id}: {e}")
        return ""


def _read_card_field(card_data, field):
    """Return ``field`` from card metadata given as a mapping or an object, else None."""
    if card_data is None:
        return None
    if isinstance(card_data, dict):
        return card_data.get(field)
    return getattr(card_data, field, None)


def process_single_dataset(dataset_info):
    """Build one flat metadata record for a dataset, including its README text.

    Args:
        dataset_info: Object describing one dataset. It must expose ``id``;
            ``author``, ``created_at``, ``downloads``, ``likes`` and ``tags``
            are read when present. Card metadata is taken from ``cardData``
            or, when that is missing or None, from ``card_data``
            (presumably the attribute name used by newer huggingface_hub
            releases — confirm against the installed version).

    Returns:
        A dict with the twelve record fields, or None when processing raised
        (the error is logged).
    """
    try:
        dataset_id = dataset_info.id

        card_data = getattr(dataset_info, 'cardData', None)
        if card_data is None:
            card_data = getattr(dataset_info, 'card_data', None)

        readme_text = get_readme_from_repository(dataset_id)

        return {
            "dataset_id": dataset_id,
            "author": getattr(dataset_info, 'author', None),
            "created_at": getattr(dataset_info, 'created_at', None),
            "readme_file": readme_text,
            "downloads": getattr(dataset_info, 'downloads', 0),
            "likes": getattr(dataset_info, 'likes', 0),
            "tags": getattr(dataset_info, 'tags', None),
            # Card fields are read through a helper so that both attribute-style
            # objects and plain dicts yield their values instead of None.
            "language": _read_card_field(card_data, 'language'),
            "license": _read_card_field(card_data, 'license'),
            "multilinguality": _read_card_field(card_data, 'multilinguality'),
            "size_categories": _read_card_field(card_data, 'size_categories'),
            "task-categories": _read_card_field(card_data, 'task_categories'),
        }
    except Exception as e:
        # getattr keeps the handler itself from raising when `id` is the
        # attribute that was missing.
        failed_id = getattr(dataset_info, 'id', '<unknown>')
        logger.error(f"Error processing dataset {failed_id}: {e}")
        return None


def process_batch_threaded(dataset_list):
    """Run ``process_single_dataset`` over a batch on a thread pool.

    Args:
        dataset_list: Dataset info objects; each must expose ``id``.

    Returns:
        The truthy results, in completion order (not input order). Tasks whose
        result retrieval raises are logged and skipped.
    """
    collected = []

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        # Remember which dataset each future belongs to, for error reporting.
        pending = {}
        for info in dataset_list:
            submitted = pool.submit(process_single_dataset, info)
            pending[submitted] = info.id

        progress = tqdm(
            as_completed(pending),
            total=len(dataset_list),
            desc="Processing batch",
        )
        for finished in progress:
            try:
                # NOTE(review): futures yielded by as_completed() are already
                # done, so this timeout does not bound a slow download.
                outcome = finished.result(timeout=60)
            except Exception as e:
                dataset_id = pending[finished]
                logger.error(f"Timeout/Error for {dataset_id}: {e}")
                continue
            if outcome:
                collected.append(outcome)

    return collected


def save_checkpoint(data, total=None):
    """Persist the records collected so far and report progress.

    The JSON is written to a temporary sibling file and then moved over
    ``CHECKPOINT_FILE`` with ``os.replace`` so that an interruption during the
    write cannot leave a truncated checkpoint behind.

    Args:
        data: List of records to store; values json cannot encode are
            serialised with ``str``.
        total: Optional overall number of datasets for the progress line. When
            omitted, the length of the notebook-level ``datasets`` list is used
            if that name exists; otherwise the total is left out.

    Returns:
        None. Failures while writing are logged, not raised.
    """
    try:
        tmp_path = f"{CHECKPOINT_FILE}.tmp"
        with open(tmp_path, 'w') as f:
            json.dump(data, f, indent=2, default=str)
        os.replace(tmp_path, CHECKPOINT_FILE)
        logging.info(f"Checkpoint saved: {len(data)} datasets processed")
    except Exception as e:
        logging.error(f"Could not save checkpoint: {e}")
        return

    # Progress reporting is kept outside the try block so that a missing
    # `datasets` global is not misreported as a failed save.
    if total is None:
        try:
            total = len(globals()["datasets"])
        except (KeyError, TypeError):
            total = None

    if total is None:
        print(f"Progress: {len(data)} datasets completed")
    else:
        print(f"Progress: {len(data)}/{total} datasets completed")


def cleanup_temp_cache():
    """Delete the session's temporary cache directory if it exists.

    Errors are logged rather than raised.
    """
    try:
        if not os.path.exists(TEMP_CACHE_DIR):
            return
        shutil.rmtree(TEMP_CACHE_DIR)
        logging.info(f"Current temporary cache {TEMP_CACHE_DIR} deleted.")
    except Exception as e:
        logging.error(f"Could not delete current temp cache dir {TEMP_CACHE_DIR}: {e}")


def clean_all_cache_folders():
    """Empty the cache root: remove every sub-directory and file inside it.

    The root directory itself is kept. Errors are logged rather than raised.
    """
    try:
        if not os.path.exists(CHACE_DIR):
            return

        for entry_name in os.listdir(CHACE_DIR):
            item_path = os.path.join(CHACE_DIR, entry_name)
            if not os.path.isdir(item_path):
                os.remove(item_path)
                logging.info(f"Deleted cache file: {item_path}")
                continue
            shutil.rmtree(item_path)
            logging.info(f"Deleted cache folder: {item_path}")

        logging.info(f"All cache folders in {CHACE_DIR} cleaned up.")
    except Exception as e:
        logging.error(f"Could not delete cache folders in {CHACE_DIR}: {e}")


In [5]:
# Load Checkpoint (if exists)
# Defaults used when there is no checkpoint or it cannot be read.
checkpoint_data = []
start_index = 0

checkpoint_path = Path(CHECKPOINT_FILE)
if not checkpoint_path.exists():
    print("No checkpoint file found. Starting from scratch.")
else:
    try:
        with checkpoint_path.open('r') as f:
            checkpoint_data = json.load(f)
        # The number of stored records doubles as the resume offset.
        start_index = len(checkpoint_data)
        print(f"Loaded checkpoint: {start_index} datasets already processed")
    except Exception as e:
        print(f"Could not load checkpoint file: {e}")

Loaded checkpoint: 162400 datasets already processed


In [148]:
print("Fetching dataset info from Hugging Face Hub...")
datasets = list(list_datasets(limit=MODEL_LIMIT, full=True))
print(f"Total datasets found: {len(datasets)}")

# Resume by dataset id rather than by list position. A positional slice
# (datasets[start_index:]) is only correct if the checkpoint holds exactly one
# row per listed dataset in the same order; that breaks as soon as a dataset
# fails and is dropped, or the Hub listing changes between runs.
processed_ids = {
    row["dataset_id"]
    for row in checkpoint_data
    if isinstance(row, dict) and "dataset_id" in row
}
datasets_to_process = [info for info in datasets if info.id not in processed_ids]
all_data = checkpoint_data.copy()

print(f"Datasets remaining to process: {len(datasets_to_process)}")

Fetching dataset info from Hugging Face Hub...
Total datasets found: 100
Datasets remaining to process: 100


In [149]:
if datasets_to_process:
    remaining = len(datasets_to_process)
    # Ceiling division: number of BATCH_SIZE-sized slices needed.
    total_batches = (remaining - 1) // BATCH_SIZE + 1

    for current_batch, offset in enumerate(range(0, remaining, BATCH_SIZE), start=1):
        batch_datasets = datasets_to_process[offset:offset + BATCH_SIZE]

        print(f"\nProcessing batch {current_batch}/{total_batches}")
        print(f"Batch size: {len(batch_datasets)} datasets")

        # Collect the records for this slice and add them to the running total.
        all_data.extend(process_batch_threaded(batch_datasets))

        # Persist everything gathered so far, then drop the downloaded READMEs.
        save_checkpoint(all_data)
        cleanup_temp_cache()

        # Short pause between batches as rate limiting.
        time.sleep(0.5)


Processing batch 1/10
Batch size: 10 datasets


Processing batch: 100%|██████████| 10/10 [00:01<00:00,  8.44it/s]
INFO:root:Checkpoint saved: 10 datasets processed
INFO:root:Current temporary cache C:\Workspace\DigitalTwins\temp_cache\hf_temp_cache_x1qys5ba deleted.


Progress: 10/100 datasets completed

Processing batch 2/10
Batch size: 10 datasets


Processing batch: 100%|██████████| 10/10 [00:01<00:00,  6.92it/s]
INFO:root:Checkpoint saved: 20 datasets processed
INFO:root:Current temporary cache C:\Workspace\DigitalTwins\temp_cache\hf_temp_cache_x1qys5ba deleted.


Progress: 20/100 datasets completed

Processing batch 3/10
Batch size: 10 datasets


Processing batch: 100%|██████████| 10/10 [00:01<00:00,  7.13it/s]
INFO:root:Checkpoint saved: 30 datasets processed
INFO:root:Current temporary cache C:\Workspace\DigitalTwins\temp_cache\hf_temp_cache_x1qys5ba deleted.


Progress: 30/100 datasets completed

Processing batch 4/10
Batch size: 10 datasets


Processing batch: 100%|██████████| 10/10 [00:01<00:00,  5.00it/s]
INFO:root:Checkpoint saved: 40 datasets processed
INFO:root:Current temporary cache C:\Workspace\DigitalTwins\temp_cache\hf_temp_cache_x1qys5ba deleted.


Progress: 40/100 datasets completed

Processing batch 5/10
Batch size: 10 datasets


Processing batch: 100%|██████████| 10/10 [00:01<00:00,  7.46it/s]
INFO:root:Checkpoint saved: 50 datasets processed
INFO:root:Current temporary cache C:\Workspace\DigitalTwins\temp_cache\hf_temp_cache_x1qys5ba deleted.


Progress: 50/100 datasets completed

Processing batch 6/10
Batch size: 10 datasets


Processing batch: 100%|██████████| 10/10 [00:01<00:00,  7.24it/s]
INFO:root:Checkpoint saved: 60 datasets processed
INFO:root:Current temporary cache C:\Workspace\DigitalTwins\temp_cache\hf_temp_cache_x1qys5ba deleted.


Progress: 60/100 datasets completed

Processing batch 7/10
Batch size: 10 datasets


Processing batch: 100%|██████████| 10/10 [00:01<00:00,  9.90it/s]
INFO:root:Checkpoint saved: 70 datasets processed
INFO:root:Current temporary cache C:\Workspace\DigitalTwins\temp_cache\hf_temp_cache_x1qys5ba deleted.


Progress: 70/100 datasets completed

Processing batch 8/10
Batch size: 10 datasets


Processing batch: 100%|██████████| 10/10 [00:00<00:00, 12.98it/s]
INFO:root:Checkpoint saved: 80 datasets processed
INFO:root:Current temporary cache C:\Workspace\DigitalTwins\temp_cache\hf_temp_cache_x1qys5ba deleted.


Progress: 80/100 datasets completed

Processing batch 9/10
Batch size: 10 datasets


Processing batch:   0%|          | 0/10 [00:00<?, ?it/s]ERROR:__main__:Failed to download README for mamachang/medical-reasoning: 404 Client Error. (Request ID: Root=1-684828b7-7bb400e765d1d4bf58d3535d;edffb06a-ffcb-43d1-be5c-187ccf2df6b1)

Entry Not Found for url: https://huggingface.co/datasets/mamachang/medical-reasoning/resolve/main/README.md.
Processing batch: 100%|██████████| 10/10 [00:01<00:00,  8.18it/s]
INFO:root:Checkpoint saved: 90 datasets processed
INFO:root:Current temporary cache C:\Workspace\DigitalTwins\temp_cache\hf_temp_cache_x1qys5ba deleted.


Progress: 90/100 datasets completed

Processing batch 10/10
Batch size: 10 datasets


Processing batch: 100%|██████████| 10/10 [00:00<00:00, 10.58it/s]
INFO:root:Checkpoint saved: 100 datasets processed
INFO:root:Current temporary cache C:\Workspace\DigitalTwins\temp_cache\hf_temp_cache_x1qys5ba deleted.


Progress: 100/100 datasets completed


In [150]:
# One row per collected record.
df = pd.DataFrame(all_data)

summary_lines = [
    "\nScraping completed!",
    f"Total datasets processed: {len(df)}",
    f"Dataset shape: {df.shape}",
    "Cleaning up all cache folders...",
]
print("\n".join(summary_lines))
clean_all_cache_folders()

INFO:root:All cache folders in temp_cache cleaned up.



Scraping completed!
Total datasets processed: 100
Dataset shape: (100, 12)
Cleaning up all cache folders...


### Cleaning Markdown Readme Files
This step involves cleaning the readme files extracted from the Hugging Face datasets to ensure that they contain only relevant textual content, while preserving titles and important information. The cleaning process will remove unnecessary formatting, images, links, and other non-essential elements.

In [152]:
def clean_markdown(text):
    """Reduce a README in Markdown to plain prose, keeping headings and text.

    Removed: fenced code blocks, leading YAML front matter, HTML comments and
    tags, images, tables, inline code and blockquotes. Markdown links are
    replaced by their visible text. Runs of blank lines are collapsed to one.

    Args:
        text: README contents. Anything that is not a ``str`` (for example
            None or NaN from a DataFrame) is treated as an empty README.

    Returns:
        The cleaned text with surrounding whitespace stripped.
    """
    if not isinstance(text, str):
        return ""

    # Remove code blocks (``` ... ```) first, so that `<`, `[` or `|` inside a
    # fence cannot be half-consumed by the rules below and break the fence.
    text = re.sub(r'```.*?```', '', text, flags=re.DOTALL)

    # Remove YAML front matter: only at the very start, and only when both
    # `---` delimiters sit on their own lines. A `---` inside a YAML value
    # therefore no longer ends the block early.
    text = re.sub(
        r'\A---[ \t]*\r?\n(?:.*?\r?\n)?---[ \t]*(?:\r?\n|\Z)',
        '',
        text,
        flags=re.DOTALL,
    )

    # Remove HTML comments and tags. A tag must start with a letter (optionally
    # after `/`) and may not contain another `<`, so comparisons such as
    # `1K<n<10K` or `a < b` are left alone.
    text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)
    text = re.sub(r'</?[A-Za-z][^<>]*>', '', text)

    # Remove images (![alt](url))
    text = re.sub(r'!\[.*?\]\(.*?\)', '', text)

    # Remove markdown links but keep the visible text: [text](url) -> text
    text = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', text)

    # Remove table rows (lines that start with | and contain a second |)
    text = re.sub(r'^\s*\|.*\|.*$', '', text, flags=re.MULTILINE)

    # Remove inline code (`code`)
    text = re.sub(r'`[^`]+`', '', text)

    # Remove blockquotes (> ...)
    text = re.sub(r'^\s*>.*$', '', text, flags=re.MULTILINE)

    # Remove citation block (```)
    # NOTE(review): without re.MULTILINE `^` matches only at the start of the
    # text, and balanced fences are already gone, so this rule only fires for a
    # text that begins with `@misc` and still contains a stray fence. Kept
    # unchanged; confirm whether bare BibTeX entries should be removed too.
    text = re.sub(r'^@misc.*?```', '', text, flags=re.DOTALL)

    # Drop trailing spaces/tabs so whitespace-only lines count as blank, then
    # collapse three or more newlines into a single blank line.
    text = re.sub(r'[ \t]+$', '', text, flags=re.MULTILINE)
    text = re.sub(r'\n{3,}', '\n\n', text)

    # Trim whitespace
    return text.strip()


# Run every README in the DataFrame through clean_markdown, element by element
df['readme_file'] = df['readme_file'].map(clean_markdown)

In [153]:
print("Readme empty string count:", (df['readme_file'] == '').sum())
print("Removing models with empty readme...")
# .copy() makes the filtered frame independent, so the column assignment below
# is not a chained assignment on a slice of the previous frame (pandas would
# warn about it, but warnings are silenced at the top of the notebook).
df = df[df['readme_file'] != ''].copy()
print("DataFrame shape after removing empty readmes:", df.shape)
print("Applying markdown cleaning to readme files")
# NOTE(review): an earlier cell already applies clean_markdown to this column,
# so this second pass looks redundant — confirm before removing either.
df['readme_file'] = df['readme_file'].apply(clean_markdown)
print("Final shape of DataFrame:", df.shape)
df.head()

Readme empty string count: 8
Removing models with empty readme...
DataFrame shape after removing empty readmes: (92, 12)
Applying markdown cleaning to readme files
Final shape of DataFrame: (92, 12)


Unnamed: 0,dataset_id,author,created_at,readme_file,downloads,likes,tags,language,license,multilinguality,size_categories,task-categories
0,open-thoughts/OpenThoughts3-1.2M,open-thoughts,2025-05-28 21:51:11+00:00,paper |\ndataset |\nmodel\n\n \n\n# OpenThough...,3695,75,"[task_categories:text-generation, license:apac...",,apache-2.0,,,[text-generation]
1,a-m-team/AM-DeepSeek-R1-0528-Distilled,a-m-team,2025-06-04 01:50:01+00:00,## 📘 Dataset Summary\n\nThis dataset is a high...,2501,46,"[task_categories:text-generation, language:en,...","[en, zh]",,,[1M<n<10M],[text-generation]
2,fka/awesome-chatgpt-prompts,fka,2022-12-13 23:47:45+00:00,🧠 Awesome ChatGPT Prompts [CSV dataset]\n\nThi...,20949,7893,"[task_categories:question-answering, license:c...",,cc0-1.0,,[100K<n<1M],[question-answering]
3,yandex/yambda,yandex,2025-05-27 10:41:39+00:00,# Yambda-5B — A Large-Scale Multi-modal Datase...,39566,154,"[license:apache-2.0, size_categories:1B<n<10B,...",,apache-2.0,,[1B<n<10B],
4,Hcompany/WebClick,Hcompany,2025-04-30 19:44:42+00:00,# WebClick: A Multimodal Localization Benchmar...,3557,45,"[task_categories:visual-document-retrieval, la...",[en],apache-2.0,,,[visual-document-retrieval]


### Generating Content Embeddings
This is a preprocessing step that generates embeddings for the content of the datasets. The embeddings will be used to compare and rank datasets based on their metadata and readme content, enabling efficient retrieval in a RAG system. The model used is an open-source model: [jinaai/jina-embeddings-v3](https://huggingface.co/jinaai/jina-embeddings-v3)

In [140]:
from transformers import AutoModel
import torch
import gc

embedding_model = "jinaai/jina-embeddings-v3"


def _stringify_cell(value):
    """Render one DataFrame cell as text, mapping missing values to ''."""
    if value is None or value is pd.NaT or value is pd.NA:
        return ""
    if isinstance(value, float) and value != value:  # NaN is the only float != itself
        return ""
    return str(value)


def generate_content_embeddings(data, batch_size):
    """Embed each row's concatenated metadata and README text.

    The text columns are converted to strings (missing values become empty
    strings), joined with newlines into one document per row, and encoded in
    batches with the model named by ``embedding_model``.

    Args:
        data: DataFrame holding the metadata columns listed in
            ``text_columns`` below. The README column may be called
            ``readme_file`` (as produced earlier in this notebook) or
            ``readme``. The caller's frame is not modified.
        batch_size: Number of rows encoded per model call; must be positive.

    Returns:
        A copy of ``data`` whose text columns are strings and which has an
        extra ``embeddings`` column holding one list per row. Rows of a batch
        that failed to encode get an empty list.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = AutoModel.from_pretrained(embedding_model, trust_remote_code=True).to(device)

    # Work on a copy so the caller's DataFrame keeps its original dtypes.
    data = data.copy()

    # The scraping cells store the README under 'readme_file'; 'readme' is
    # still accepted for frames that use the older name.
    readme_column = 'readme_file' if 'readme_file' in data.columns else 'readme'
    text_columns = ['dataset_id', 'author', 'created_at', readme_column,
                    'tags', 'language', 'license', 'multilinguality', 'size_categories',
                    'task-categories']

    # Missing values must be blanked before stringifying; casting first would
    # turn them into the literal texts "None" / "nan".
    for col in text_columns:
        data[col] = data[col].map(_stringify_cell).astype(str)

    full_text = data[text_columns[0]]
    for col in text_columns[1:]:
        full_text = full_text + "\n" + data[col]
    data['full_text'] = full_text

    embeddings = []

    for start in tqdm(range(0, len(data), batch_size), desc="Processing Embeddings Batches"):
        end = min(start + batch_size, len(data))

        # The try block belongs inside the loop: every batch is encoded, and a
        # failing batch is padded so `embeddings` stays aligned with the rows.
        try:
            batch_texts = data['full_text'].iloc[start:end].tolist()

            with torch.no_grad():
                batch_embeddings = model.encode(batch_texts, task="text-matching").tolist()

            embeddings.extend(batch_embeddings)

        except Exception as e:
            print(f"An error occurred during processing the batch:{start}-{end}. Exception: {e}")
            print("Embeddings will be filled with empty lists.")
            # One fresh list per row, so rows do not share the same object.
            embeddings.extend([] for _ in range(end - start))

        finally:
            torch.cuda.empty_cache()
            gc.collect()

    del model
    torch.cuda.empty_cache()

    data['embeddings'] = embeddings
    return data.drop(columns=['full_text'])

ImportError: cannot import name 'Automodel' from 'transformers' (C:\Users\Diego\miniconda3\envs\DigitalTwins\Lib\site-packages\transformers\__init__.py)

In [None]:
# Embed every row in batches of EMBEDDINGS_BATCH_SIZE; df is rebound to the
# returned frame, which carries the added `embeddings` column.
df = generate_content_embeddings(df, batch_size=EMBEDDINGS_BATCH_SIZE)
df.head()

In [1]:
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing.
os.makedirs("Output", exist_ok=True)
df.to_csv("Output/datasets_hg_embeddings.csv", index=False)

NameError: name 'df' is not defined