In [1]:
import time
import pandas as pd
from huggingface_hub import HfApi, list_models, model_info, hf_hub_download
from tqdm import tqdm
from typing import List, Dict, Optional
from pathlib import Path
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
import logging

# Configure logging: install the root logger's default handler at INFO level,
# then create this module's logger, which the scraper's batch processing and
# checkpoint saving report through.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class HuggingfaceScraper:
    """Scrape model metadata and README text from the Hugging Face Hub.

    Models are listed once, processed in batches on a thread pool, and the
    accumulated records are written to a JSON checkpoint after every batch so
    an interrupted run can be resumed.

    Args:
        token: Hugging Face access token passed to every Hub request.
        max_workers: Number of threads used to process one batch.
        batch_size: Number of models processed between checkpoint writes.

    Raises:
        ValueError: If ``max_workers`` or ``batch_size`` is smaller than 1.
    """

    def __init__(self, token:str, max_workers: int = 5, batch_size: int = 100):
        # Fail early with a clear message instead of a late error from
        # ``range(..., step=0)`` or ``ThreadPoolExecutor(max_workers=0)``.
        if max_workers < 1:
            raise ValueError("max_workers must be at least 1")
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        self.api = HfApi(token=token)
        self.max_workers = max_workers
        self.batch_size = batch_size

    @staticmethod
    def _card_field(card_data, name: str):
        """Read one metadata field from card data given as a mapping or an object."""
        if isinstance(card_data, dict):
            return card_data.get(name)
        return getattr(card_data, name, None)

    @staticmethod
    def _pending_model_ids(model_ids: List[str], checkpoint_data: List[Dict]) -> List[str]:
        """Return the ids from ``model_ids`` that have no record in the checkpoint.

        Matching on ``model_id`` (rather than on the number of saved records)
        keeps the resume correct when some models failed and were therefore
        never written to the checkpoint.
        """
        done = {
            record.get('model_id')
            for record in checkpoint_data
            if isinstance(record, dict)
        }
        return [model_id for model_id in model_ids if model_id not in done]

    @staticmethod
    def _load_checkpoint(checkpoint_path: Path) -> List[Dict]:
        """Load previously saved records; return an empty list if unavailable."""
        if not checkpoint_path.exists():
            return []
        try:
            with open(checkpoint_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except Exception as e:
            logger.error(f"Could not load checkpoint file: {e}")
            return []
        if not isinstance(data, list):
            logger.error("Could not load checkpoint file: expected a JSON list of records")
            return []
        return data

    @staticmethod
    def _save_checkpoint(checkpoint_path: Path, records: List[Dict]) -> None:
        """Write all records to the checkpoint without clobbering the old file on failure."""
        # Write to a sibling temp file first, then swap it in, so an interrupt
        # during the dump cannot leave a truncated checkpoint behind.
        tmp_path = checkpoint_path.with_name(checkpoint_path.name + '.tmp')
        try:
            with open(tmp_path, 'w', encoding='utf-8') as f:
                # default=str serialises non-JSON values such as datetimes.
                json.dump(records, f, indent=2, default=str)
            tmp_path.replace(checkpoint_path)
            logger.info(f"Checkpoint saved: {len(records)} models processed")
        except Exception as e:
            logger.error(f"Could not save checkpoint: {e}")

    def get_readme_from_model(self, model_id) -> str:
        """Download ``README.md`` for ``model_id`` and return its text.

        Best-effort: any failure (download or read) is logged and an empty
        string is returned.
        """
        try:
            # hf_hub_download returns the local path of the downloaded file.
            readme_path = hf_hub_download(
                repo_id=model_id,
                filename="README.md",
                token=self.api.token,
            )
            return Path(readme_path).read_text(encoding='utf-8')
        except Exception as e:
            logger.warning(f"Error fetching README for {model_id}: {e}")
            return ''

    def process_single_model(self, model_id: str) -> Optional[Dict]:
        """Collect one flat record of metadata for ``model_id``.

        Returns:
            A dict with the model's id, card metadata (base model, license,
            language), author, README text, download/like counts, tags,
            pipeline tag, library name and creation time; or ``None`` if the
            model info request or record construction failed.
        """
        try:
            info = model_info(model_id, token=self.api.token)
            # The card metadata attribute is `card_data` in current
            # huggingface_hub releases and `cardData` in older ones
            # (TODO confirm against the pinned version); accept either.
            card_data = (
                getattr(info, 'card_data', None)
                or getattr(info, 'cardData', None)
                or {}
            )

            readme = self.get_readme_from_model(model_id)

            return {
                'model_id': model_id,
                'base_model': self._card_field(card_data, 'base_model'),
                'author': getattr(info, 'author', None),
                'readme_file': readme,
                'license' : self._card_field(card_data, 'license'),
                'language' : self._card_field(card_data, 'language'),
                'downloads': getattr(info, 'downloads', 0),
                'likes': getattr(info, 'likes', 0),
                'tags': ', '.join(getattr(info, 'tags', None) or []),
                'pipeline_tag': getattr(info, 'pipeline_tag', None),
                'library_name': getattr(info, 'library_name', None),
                'created_at': getattr(info, 'created_at', None),
            }
        except Exception as e:
            logger.error(f"Error processing {model_id}: {e}")
            return None

    def process_batch_threaded(self, model_ids: List[str]) -> List[Dict]:
        """Process a batch of models using ThreadPoolExecutor.

        Results are returned in completion order, not input order. Models
        whose processing failed are omitted.
        """
        if not model_ids:
            return []

        results = []

        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            # Submit all tasks
            future_to_model = {
                executor.submit(self.process_single_model, model_id): model_id
                for model_id in model_ids
            }

            # Collect results with progress bar
            for future in tqdm(as_completed(future_to_model),
                               total=len(model_ids),
                               desc="Processing batch"):
                try:
                    # as_completed only yields finished futures, so result()
                    # returns immediately; a timeout here would never fire.
                    result = future.result()
                    if result:
                        results.append(result)
                except Exception as e:
                    model_id = future_to_model[future]
                    logger.error(f"Timeout/Error for {model_id}: {e}")

        return results

    def scrape_models_checkpoint(self, checkpoint_file:str ="scraping_checkpoint.json", limit: int = 1100) -> pd.DataFrame:
        """Scrape up to ``limit`` models, resuming from ``checkpoint_file`` if present.

        Args:
            checkpoint_file: JSON file holding the records gathered so far. It
                is rewritten after every batch.
            limit: Maximum number of models requested from ``list_models``.

        Returns:
            A DataFrame with one row per successfully processed model,
            checkpointed rows first. Rows reloaded from the checkpoint carry
            values as JSON stored them (e.g. ``created_at`` as a string).
        """
        checkpoint_path = Path(checkpoint_file)
        all_data = self._load_checkpoint(checkpoint_path)

        print("Fetching models from Hugging Face Hub...")
        models = list(list_models(limit=limit))
        print("Models fetched.")

        # Skip only the models that already have a saved record. Models that
        # failed in an earlier run have no record and are retried.
        pending_ids = self._pending_model_ids([m.modelId for m in models], all_data)
        total_batches = (len(pending_ids) + self.batch_size - 1) // self.batch_size

        for batch_number, start in enumerate(range(0, len(pending_ids), self.batch_size), start=1):
            batch_ids = pending_ids[start:start + self.batch_size]

            print(f"Processing batch {batch_number}/{total_batches}")

            # Process batch
            all_data.extend(self.process_batch_threaded(batch_ids))

            # Save checkpoint
            self._save_checkpoint(checkpoint_path, all_data)

            # Rate limiting: brief pause between batches (not needed after the last one)
            if batch_number < total_batches:
                time.sleep(1)

        return pd.DataFrame(all_data)
    

# Example usage
if __name__ == "__main__":
    import os

    # Never embed an access token in the notebook: it ends up in version
    # control and in shared copies. Read it from the environment instead.
    # Any token that was previously committed here should be revoked.
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        raise RuntimeError(
            "Set the HF_TOKEN environment variable to a Hugging Face access token."
        )

    scraper = HuggingfaceScraper(token=hf_token, max_workers=5, batch_size=100)

    # Resumes from the checkpoint file if one exists from an earlier run.
    df = scraper.scrape_models_checkpoint(checkpoint_file="scraping_checkpoint.json")
            
    

  from .autonotebook import tqdm as notebook_tqdm


Fetching models from Hugging Face Hub...
Models fetched.


In [3]:
# Preview up to the first 200 scraped rows as this cell's output; `df` is the
# DataFrame built by the scraping cell above.
df.head(200)

Unnamed: 0,model_id,base_model,author,readme_file,license,language,downloads,likes,tags,pipeline_tag,library_name,created_at
0,ByteDance-Seed/BAGEL-7B-MoT,[Qwen/Qwen2.5-7B-Instruct],ByteDance-Seed,---\nlicense: apache-2.0\nbase_model:\n- Qwen/...,apache-2.0,,5831,826,"bagel-mot, any-to-any, arxiv:2505.14683, base_...",any-to-any,bagel-mot,2025-05-19 23:27:50+00:00
1,mistralai/Devstral-Small-2505,[mistralai/Devstrall-Small-2505],mistralai,---\nlanguage:\n- en\n- fr\n- de\n- es\n- pt\n...,apache-2.0,"[en, fr, de, es, pt, it, ja, ko, ru, zh, ar, f...",121541,635,"vllm, safetensors, mistral, text2text-generati...",text2text-generation,vllm,2025-05-12 21:49:21+00:00
2,deepseek-ai/DeepSeek-R1-0528,,deepseek-ai,,,,0,968,"safetensors, deepseek_v3, conversational, cust...",text-generation,,2025-05-28 09:46:42+00:00
3,sarvamai/sarvam-m,[mistralai/Mistral-Small-3.1-24B-Base-2503],sarvamai,---\nlibrary_name: transformers\nlicense: apac...,apache-2.0,"[en, bn, hi, kn, gu, mr, ml, or, pa, ta, te]",2378,192,"transformers, safetensors, mistral, text-gener...",text-generation,transformers,2025-05-20 06:39:05+00:00
4,google/gemma-3n-E4B-it-litert-preview,,google,---\nlicense: gemma\npipeline_tag: image-text-...,gemma,,0,628,"image-text-to-text, arxiv:1905.07830, arxiv:19...",image-text-to-text,,2025-05-18 19:24:14+00:00
...,...,...,...,...,...,...,...,...,...,...,...,...
195,fofr/sdxl-emoji,stabilityai/stable-diffusion-xl-base-1.0,fofr,---\nlicense: creativeml-openrail-m\ntags:\n ...,creativeml-openrail-m,,479,472,"diffusers, text-to-image, stable-diffusion, lo...",text-to-image,diffusers,2024-06-20 09:05:06+00:00
196,prithivMLmods/Qwen2-VL-OCR-2B-Instruct,[Qwen/Qwen2-VL-2B-Instruct],prithivMLmods,---\nlicense: apache-2.0\ndatasets:\n- unsloth...,apache-2.0,[en],126345,77,"transformers, safetensors, qwen2_vl, image-tex...",image-text-to-text,transformers,2024-12-19 01:57:34+00:00
197,deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B,,deepseek-ai,---\nlicense: mit\nlibrary_name: transformers\...,mit,,1358368,1201,"transformers, safetensors, qwen2, text-generat...",text-generation,transformers,2025-01-20 09:04:18+00:00
198,Qwen/Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B,Qwen,---\nlicense: apache-2.0\nlicense_link: https:...,apache-2.0,[en],1310905,440,"transformers, safetensors, qwen2, text-generat...",text-generation,transformers,2024-09-17 14:10:29+00:00
