In [1]:
from langchain_openai import ChatOpenAI
import os
from langchain_core.output_parsers import StrOutputParser
# Point both proxy environment variables at the local proxy on port 8890.
os.environ.update({
    "HTTP_PROXY": "http://127.0.0.1:8890",
    "HTTPS_PROXY": "http://127.0.0.1:8890",
})

# UCloud API key, read from the UCLOUD_API_KEY environment variable
# (None when the variable is not set).
API_KEY = os.getenv("UCLOUD_API_KEY")
API_BASE = "https://api.modelverse.cn/v1"

# Chat model served through the OpenAI-compatible endpoint at API_BASE.
model = ChatOpenAI(
    model="deepseek-ai/DeepSeek-V3-0324",
    openai_api_base=API_BASE,
    openai_api_key=API_KEY,
)
output_parser = StrOutputParser()


In [28]:
from langchain_core.documents import Document
from langchain_community.document_loaders import DirectoryLoader, TextLoader

# Load the files under HuggingFace/ with TextLoader, decoding them as UTF-8.
# DirectoryLoader forwards `loader_kwargs` to `loader_cls` for each file path.
loader = DirectoryLoader(
    "HuggingFace/",
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},
)
docs = loader.load()

docs

[Document(metadata={'source': 'HuggingFace\\agibot-world_results.txt'}, page_content='**agibot-world/AgiBotWorld-Beta**  \n分类：数据集  \nlike数：34  \ndownload数：37482  \n最后更新时间：2025-04-12T08:41:06.000Z  \nurl: https://huggingface.co/datasets/agibot-world/AgiBotWorld-Beta  \n\n**agibot-world/AgiBotWorld-Alpha**  \n分类：数据集  \nlike数：198  \ndownload数：5777  \n最后更新时间：2025-07-10T11:49:26.000Z  \nurl: https://huggingface.co/datasets/agibot-world/AgiBotWorld-Alpha  \n\n**agibot-world/AgiBotWorldChallenge-2025**  \n分类：数据集  \nlike数：17  \ndownload数：6432  \n最后更新时间：2025-08-29T03:13:25.000Z  \nurl: https://huggingface.co/datasets/agibot-world/AgiBotWorldChallenge-2025  \n\n**agibot-world/AgiBotDigitalWorld**  \n分类：数据集  \nlike数：34  \ndownload数：5311  \n最后更新时间：2025-02-27T01:38:57.000Z  \nurl: https://huggingface.co/datasets/agibot-world/AgiBotDigitalWorld  \n\n**agibot-world/GenieSimAssets**  \n分类：数据集  \nlike数：12  \ndownload数：2220  \n最后更新时间：2025-07-14T12:38:00.000Z  \nurl: https://huggingface.co/datasets/agibo

In [40]:
from langchain.output_parsers import CommaSeparatedListOutputParser

# Parser used as the last stage of the chain; it turns the model's
# comma-separated reply into one flat list of strings.
csvparser = CommaSeparatedListOutputParser()

# Output-format instruction substituted into the prompt's {format_instruction}
# slot: asks for CSV with the five columns listed and nothing else.
format_instruction = (
    "您的响应应该是csv格式的逗号分隔值的列表，"
    "必须是：`分类，url, 项目名称, 下载量, like数量`。"
    "你的回答必须只包含每列的标题以及csv文件中的内容，"
    "没有任何其他信息，例如''' 和 csv。"
)

In [34]:
from langchain_core.prompts import PromptTemplate

# Prompt with two slots: {context} receives the text to classify and
# {format_instruction} receives the required output format.
prompt = PromptTemplate.from_template(
    """
请查看以下内容并回答问题：

{context}

请分类每一个项目，相同类别的写在一起，不同类别直接空开一行。

{format_instruction}
"""
)

In [42]:
# Run the chain on the first document only.
chain = prompt | model | csvparser

# Pass the document's text rather than the Document object: formatting the
# object into the template would insert its string form (a
# "page_content=... metadata=..." wrapper) instead of the plain content.
result = chain.invoke({
    "context": docs[0].page_content,
    "format_instruction": format_instruction,
})
print(result)

['分类', 'url', '项目名称', '下载量', 'like数量', '数据集', 'https://huggingface.co/datasets/agibot-world/AgiBotWorld-Beta', 'agibot-world/AgiBotWorld-Beta', '37482', '34', '数据集', 'https://huggingface.co/datasets/agibot-world/AgiBotWorld-Alpha', 'agibot-world/AgiBotWorld-Alpha', '5777', '198', '数据集', 'https://huggingface.co/datasets/agibot-world/AgiBotWorldChallenge-2025', 'agibot-world/AgiBotWorldChallenge-2025', '6432', '17', '数据集', 'https://huggingface.co/datasets/agibot-world/AgiBotDigitalWorld', 'agibot-world/AgiBotDigitalWorld', '5311', '34', '数据集', 'https://huggingface.co/datasets/agibot-world/GenieSimAssets', 'agibot-world/GenieSimAssets', '2220', '12', '数据集', 'https://huggingface.co/datasets/agibot-world/EWMBench', 'agibot-world/EWMBench', '230', '0']


In [43]:
import csv
# Column layout requested from the model; its length is the width of one CSV record.
HEADERS = ["分类", "url", "项目名称", "下载量", "like数量"]


def chunk(lst, n):
    """Yield consecutive slices of ``lst`` containing at most ``n`` items each."""
    for i in range(0, len(lst), n):
        yield lst[i:i+n]


def _rows_from_cells(cells, headers=HEADERS):
    """Convert the parser's flat list of cells into fixed-width CSV rows.

    Args:
        cells: Flat list of strings produced by the output parser.
        headers: Expected column titles; their count is the row width.

    Returns:
        A list of rows, each exactly ``len(headers)`` cells wide. A short
        final row is padded with empty strings.
    """
    width = len(headers)
    # Drop the leading header record only when the model actually emitted it;
    # slicing unconditionally would discard a data record if it did not.
    if cells and cells[0].strip() == headers[0]:
        cells = cells[width:]
    return [row + [''] * (width - len(row)) for row in chunk(cells, width)]


# The chain does not depend on the document, so it is built once.
chain = prompt | model | csvparser

# Open the output once for the whole run. The file is opened in append mode,
# so no header row is written here: it would be repeated on every run.
with open('huggingface_csv.csv', 'a', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    for doc in docs:
        # Pass the document text, not the Document object, so the prompt
        # contains the content itself rather than the object's string form.
        result = chain.invoke({
            "context": doc.page_content,
            "format_instruction": format_instruction,
        })
        print(result)

        writer.writerows(_rows_from_cells(result))
        # Empty record as a visual separator between documents.
        writer.writerow([])

['分类', 'url', '项目名称', '下载量', 'like数量', '数据集', 'https://huggingface.co/datasets/agibot-world/AgiBotWorld-Beta', 'agibot-world/AgiBotWorld-Beta', '37482', '34', '数据集', 'https://huggingface.co/datasets/agibot-world/AgiBotWorld-Alpha', 'agibot-world/AgiBotWorld-Alpha', '5777', '198', '数据集', 'https://huggingface.co/datasets/agibot-world/AgiBotWorldChallenge-2025', 'agibot-world/AgiBotWorldChallenge-2025', '6432', '17', '数据集', 'https://huggingface.co/datasets/agibot-world/AgiBotDigitalWorld', 'agibot-world/AgiBotDigitalWorld', '5311', '34', '数据集', 'https://huggingface.co/datasets/agibot-world/GenieSimAssets', 'agibot-world/GenieSimAssets', '2220', '12', '数据集', 'https://huggingface.co/datasets/agibot-world/EWMBench', 'agibot-world/EWMBench', '230', '0']
['分类', 'url', '项目名称', '下载量', 'like数量', '模型', 'https://huggingface.co/ECNU-CILab/ArtAug-lora-FLUX.1dev-v1', 'ECNU-CILab/ArtAug-lora-FLUX.1dev-v1', '95', '8', '模型', 'https://huggingface.co/ECNU-CILab/ExVideo-SVD-128f-v1', 'ECNU-CILab/ExVideo-SVD