# 对 HuggingFace 公开大模型的数据分析

最受用户喜欢的文本生成模型是怎样的？

# 数据爬取

Hugging Face 仅提供 100 页（每页30个模型）的数据供浏览，也即 3000 个模型，但存放的数据远比这个多（截至2024.12.15有 1,201,412 个模型）。

因此爬取全站的模型是不可行的，哪怕将范围拉到最近7天内的文本生成模型，就有超过 3000 个模型。因此，我们的爬取策略转换为

- 爬取前 3000 个最受用户喜欢的文本生成模型

对其进行数据分析。

In [1]:
import requests
import json
import os
from typing import List
# this url was reverse engineered
base_url = "https://huggingface.co/models-json"

def get_single_page_models(p) -> List[dict]:
    '''30 models per page'''
    assert 0 <= p <= 99, 'page must be positive and less than or equal to 99'
    models = []
    cache_path = f'cache/models_page_{p}.json'
    # read from cache if exists
    if os.path.exists(cache_path):
        with open(cache_path, 'r') as f:
            models = json.load(f)
    else:
        resp = requests.get(base_url, params={"p": p,
                                              "sort": "likes",
                                              "pipeline_tag": "text-generation",
                                              "withCount": True    # default true
                                              })
        models = resp.json().get('models', [])
        
        if not os.path.exists('cache'):
            os.makedirs('cache')
        
        with open(cache_path, 'w') as f:
            json.dump(models, f)
    
    return models

# example of model data
get_single_page_models(1)[0]

{'author': 'Qwen',
 'authorData': {'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/620760a26e3b7210c2ff1943/-s1gyJfvbE1RgO5iBeNOi.png',
  'fullname': 'Qwen',
  'name': 'Qwen',
  'type': 'org',
  'isHf': False,
  'isMod': False,
  'isEnterprise': False,
  'followerCount': 6262},
 'downloads': 107082,
 'gated': False,
 'id': 'Qwen/QwQ-32B-Preview',
 'inference': 'warm',
 'lastModified': '2024-11-29T06:57:37.000Z',
 'likes': 1302,
 'pipeline_tag': 'text-generation',
 'private': False,
 'repoType': 'model',
 'isLikedByUser': False,
 'widgetOutputUrls': []}

## 二级爬取

模型参数作为标签不出现在概览中，针对模型大小的分析，我们考虑通过正则表达式来获取模型大小，例如

`meta-llama/Llama-3.3-70B-Instruct` 就可以pattern匹配得到模型大小为70B。

问题: 如果模型名不包含参数大小, 方法失效.

此外, 有一些模型的参数比较特殊, 例如 `mistralai/Mixtral-8x7B-Instruct-v0.1` 提取出来是 8x7B? 7B? 但实际上是 46.7B.

于是我们认为有需要进入详细页面爬取具体的模型参数, 来获取更精准的模型大小.

![model size](images/model_size.png)

In [4]:
from lxml import html
import re

def get_model_para(model: str, cached=True):
    '''given a model name, return the model size in billion.
    >>> print(get_model_para('CohereForAI/c4ai-command-r7b-12-2024'))
    8.03
    '''
    author, model_name = model.split('/')
    cache_path = f'cache/{author}-{model_name}.html'
    
    if os.path.exists(cache_path) and cached:
        with open(cache_path, 'r') as f:
            raw_html = f.read()
    else:
        base_url = 'https://huggingface.co/'
        resp = requests.get(base_url + model)
        raw_html = resp.content.decode('utf-8')
        
        if cached:
            with open(cache_path, 'w') as f:
                f.write(raw_html)
    
    tree = html.fromstring(raw_html)
    try:
        para = tree.xpath('//div[@class="px-1.5"]')[0]
        model_size = re.findall(r'(\d+\.\d+)B params', para.text)[0]
        model_size = round(float(model_size))
    except:
        # no model size found at tag
        model_size = None
    return model_size


# example usage
print(get_model_para('CohereForAI/c4ai-command-r7b-12-2024'))

ParserError: Document is empty

## Sqlite ORM

定义数据模型，将数据模型映射到数据库表中，数据存储在本地数据库 SQlite 中

In [7]:
import datetime

from sqlalchemy import Integer, String, DateTime, ForeignKey
from sqlalchemy.orm import DeclarativeBase
from sqlalchemy.orm import Mapped, mapped_column, relationship

class Base(DeclarativeBase):
    pass


class Author(Base):
    __tablename__ = 'Author'
    
    id: Mapped[int] = mapped_column(primary_key=True)
    name: Mapped[str] = mapped_column(String(40), unique=True)
    type: Mapped[str] = mapped_column(String(40))
    isEnterprise: Mapped[bool] = mapped_column(Integer)
    
    models: Mapped[List["Model"]] = relationship(
        "Model", back_populates="author", cascade="all, delete-orphan"
    )
    
    def __repr__(self):
        return f"<{self.name}>"
    
    def to_dict(self):
        return {
            "id": self.id,
            "name": self.name,
            "type": self.type,
            "isEnterprise": self.isEnterprise,
        }


class Model(Base):
    __tablename__ = 'Model'
    
    id: Mapped[int] = mapped_column(primary_key=True)
    name: Mapped[str] = mapped_column(String(40), unique=True)
    lastModified: Mapped[datetime.datetime] = mapped_column(DateTime)
    downloads: Mapped[int] = mapped_column(Integer)
    likes: Mapped[int] = mapped_column(Integer)
    
    author_id: Mapped[int] = mapped_column(Integer, ForeignKey('Author.id'))
    author: Mapped[Author] = relationship("Author", back_populates="models")
    
    def __repr__(self):
        return f"<{self.name}>"
    
    def to_dict(self):
        return {
            "id": self.id,
            "name": self.name,
            "lastModified": self.lastModified,
            "downloads": self.downloads,
            "likes": self.likes,
            "author_id": self.author_id,
        }

模型序列化器：给一个 dict，返回一个对象

In [10]:
def author_seralizer(author_data) -> Author:
    # print(author_data)
    author = Author(
        name=author_data['name'],
        type=author_data['type'],
        isEnterprise=author_data.get('isEnterprise', False),
    )
    return author

def model_serializer(model_data) -> Model:
    model = Model(
        name=model_data['id'],
        lastModified=datetime.datetime.strptime(model_data['lastModified'], '%Y-%m-%dT%H:%M:%S.%fZ'),
        downloads=model_data['downloads'],
        likes=model_data['likes'],
        author=author_seralizer(model_data['authorData']),
    )
    return model

## 数据库操纵

### 创建数据库

In [14]:
from sqlalchemy import create_engine
engine = create_engine('sqlite:///data.db', echo=False)
# If no migration
Base.metadata.drop_all(engine)
Base.metadata.create_all(engine)

### 爬取与存放数据

In [17]:
from sqlalchemy.orm import Session

mx_page = 99
with Session(engine) as session:
    for page in range(0, mx_page + 1):
        models = get_single_page_models(page)
        for model in models:
            model_obj = model_serializer(model)
            # 如果作者已存在，则使用已存在的作者对象，否则重复创建会导致唯一约束冲突
            existing_author = session.query(Author).filter_by(name=model_obj.author.name).first()
            if existing_author:
                model_obj.author = existing_author
            session.add(model_obj)
        
    session.commit()

## 转换为 Dataframe

In [19]:
import pandas as pd

models = pd.read_sql_table('Model', 'sqlite:///data.db')
models.head()

Unnamed: 0,id,name,lastModified,downloads,likes,author_id
0,1,meta-llama/Meta-Llama-3-8B,2024-09-27 15:52:33,621675,5916,1
1,2,bigscience/bloom,2023-07-28 17:50:20,14616,4796,2
2,3,mistralai/Mixtral-8x7B-Instruct-v0.1,2024-08-19 13:18:42,1966899,4227,3
3,4,meta-llama/Llama-2-7b,2024-04-17 08:12:44,0,4189,1
4,5,meta-llama/Llama-2-7b-chat-hf,2024-04-17 08:40:48,1054649,4074,1


In [20]:
authors = pd.read_sql_table('Author', 'sqlite:///data.db')
authors.head()

Unnamed: 0,id,name,type,isEnterprise
0,1,meta-llama,org,1
1,2,bigscience,org,0
2,3,mistralai,org,1
3,4,microsoft,org,0
4,5,google,org,0


In [21]:
models.describe()

Unnamed: 0,id,lastModified,downloads,likes,author_id
count,3000.0,3000,3000.0,3000.0,3000.0
mean,1500.5,2024-02-23 06:01:02.547000064,53430.11,119.395667,201.052333
min,1.0,2021-03-03 01:44:59,0.0,19.0,1.0
25%,750.75,2023-10-02 23:15:24,231.0,27.0,25.0
50%,1500.5,2024-03-16 01:28:38.500000,1325.0,43.0,119.0
75%,2250.25,2024-07-29 09:05:29.249999872,5638.0,92.25,312.25
max,3000.0,2024-12-16 06:28:17,12202330.0,5916.0,788.0
std,866.169729,,456822.2,310.613872,207.443234


In [22]:
authors.describe()

Unnamed: 0,id,isEnterprise
count,788.0,788.0
mean,394.5,0.048223
std,227.620298,0.214374
min,1.0,0.0
25%,197.75,0.0
50%,394.5,0.0
75%,591.25,0.0
max,788.0,1.0


# 数据预处理

## 模型参数

In [26]:
def extract_model_size(model_name):
    pattern = r'(\d+(?:x\d+)?[Bb])'
    match = re.search(pattern, model_name)
    return match.group(1) if match else None

探索: regex 的可行性?

In [33]:
models.loc[:, 'model_size'] = models['name'].apply(
    extract_model_size)
models.head()

Unnamed: 0,id,name,lastModified,downloads,likes,author_id,model_size
0,1,meta-llama/Meta-Llama-3-8B,2024-09-27 15:52:33,621675,5916,1,8B
1,2,bigscience/bloom,2023-07-28 17:50:20,14616,4796,2,
2,3,mistralai/Mixtral-8x7B-Instruct-v0.1,2024-08-19 13:18:42,1966899,4227,3,8x7B
3,4,meta-llama/Llama-2-7b,2024-04-17 08:12:44,0,4189,1,7b
4,5,meta-llama/Llama-2-7b-chat-hf,2024-04-17 08:40:48,1054649,4074,1,7b


In [35]:
models.isna().sum()

id                0
name              0
lastModified      0
downloads         0
likes             0
author_id         0
model_size      563
dtype: int64

In [37]:
def to_int(model_size):
    pattern = r'(\d+)[Bb]'
    if model_size:
        if type(model_size) == int:
            return model_size
        match = re.search(pattern, model_size)
        if match:
            return int(match.group(1))
        else:
            return None
    return None

print(to_int('0x128b'))
print(to_int(127))
print(to_int(None))

128
127
None


In [39]:
models.loc[:, 'model_size'] = models['model_size'].apply(to_int)

## 缺省值处理

In [42]:
models['model_size'] = models.apply(lambda row: get_model_para(row['name']) if pd.isna(row['model_size']) else row['model_size'], axis=1)

UnicodeEncodeError: 'gbk' codec can't encode character '\xf1' in position 16884: illegal multibyte sequence

In [None]:
models.to_csv('models.csv', index=False)

## 数据集分割

一些分割数据集的工具函数

In [None]:
models = pd.read_csv('models.csv')

In [None]:
def split_qualified_models(models : pd.DataFrame, likes_threshold, download_threshold):
    criteria = (models['downloads'] < download_threshold) | (models['likes'] < likes_threshold)
    
    low_quality_models = models[criteria]
    high_quality_models = models[~criteria]
    
    return high_quality_models, low_quality_models


def split_models_by_time(models : pd.DataFrame, time_threshold):
    criteria = models['lastModified'] > time_threshold
    
    recent_models = models[criteria]
    old_models = models[~criteria]
    
    return recent_models, old_models

In [None]:
likes_threshold = models.describe().loc['25%','likes']
download_threshold = 10

high_quality_models, low_quality_models = split_qualified_models(models, likes_threshold, download_threshold)
high_quality_models.describe()

In [None]:
time_threshold = '2024-01-01 00:00:00'  # this year
high_quality_models_24_after, high_quality_models_24_before = split_models_by_time(
    high_quality_models, time_threshold)
high_quality_models_24_after.describe()

# 数据可视化

## 模型参数量

### 探究2024年开源社区最喜爱的模型参数量

In [None]:
def reset_index_from_one(models):
    models = models.reset_index(drop=True)
    models.index +=1
    return models
def top_features(models,feature,top_n=10):
    top_models = models.nlargest(top_n,feature)
    return top_models

#### 2024年里下载量前十的模型

In [None]:
top_downloads_models = reset_index_from_one(top_features(high_quality_models_24_after,'downloads',10))
top_downloads_models

In [None]:
import matplotlib.pyplot as plt
import numpy as np
def extract_model_size(model_size):
    #将模型参数量转换为数值
    if model_size is None:
        return 0
    match = re.search(r'(\d+)', str(model_size))
    return int(match.group(1)) if match else 0

In [None]:
top_downloads_models['model_size'] = top_downloads_models['model_size'].apply(extract_model_size)
top_downloads_models_sorted = top_downloads_models.sort_values(by='downloads', ascending=True)
colors = plt.cm.viridis(np.linspace(0, 1, len(top_downloads_models)))#让每个模型柱状颜色不同
plt.figure(figsize=(12, 8))
plt.barh(top_downloads_models_sorted['name'], top_downloads_models_sorted['model_size'], color=colors)
plt.title('Top 10 Models by Model downloads')
plt.xlabel('Model Size (B)')
plt.ylabel('Model Name')
plt.xlim(0, 10)
#plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
high_quality_models_24_before['model_size'] = high_quality_models_24_before['model_size'].apply(extract_model_size)
high_quality_models_24_after['model_size'] = high_quality_models_24_after['model_size'].apply(extract_model_size)

#### 2024年之前所有高质量NlP大模型参数量的分布

In [None]:
bins = [0, 1, 20, 40, 60,80,float('inf')]
labels = ['<1B','1B-20B','20B-40B','40B-60B','60B-80B','>80B']
size_distribution = pd.cut(high_quality_models_24_before['model_size'], bins=bins, labels=labels, right=False)

# 绘制饼状图
plt.figure(figsize=(8, 8))
plt.pie(size_distribution.value_counts(), labels=labels, autopct='%1.1f%%', startangle=140, colors=plt.cm.Paired.colors)
plt.title('before 2024 Distribution of Model Sizes')
plt.show()

#### 2024年所有高质量NlP大模型参数量的分布

In [None]:
bins = [0, 1, 20, 40, 60,80,float('inf')]
labels = ['<1B','1B-20B','20B-40B','40B-60B','60B-80B','>80B']
size_distribution = pd.cut(high_quality_models_24_after['model_size'], bins=bins, labels=labels, right=False)

# 绘制饼状图
plt.figure(figsize=(8, 8))
plt.pie(size_distribution.value_counts(), labels=labels, autopct='%1.1f%%', startangle=140, colors=plt.cm.Paired.colors)
plt.title('after 2024 Distribution of Model Sizes')
plt.show()

## 时间分布

## 作者类型