In [1]:
from langchain_openai import ChatOpenAI
import os
from langchain_core.output_parsers import StrOutputParser
# Route outbound HTTP(S) traffic through a local proxy.
# NOTE(review): the address is machine-specific; adjust or remove it on hosts
# that do not run a proxy on 127.0.0.1:8890.
os.environ["HTTP_PROXY"] = "http://127.0.0.1:8890"
os.environ["HTTPS_PROXY"] = "http://127.0.0.1:8890"

# UCloud API key, read from the environment so it never lands in the notebook.
API_KEY = os.getenv("UCLOUD_API_KEY")
if not API_KEY:
    # Fail fast with an actionable message. Without an explicit key the
    # underlying OpenAI client may fall back to OPENAI_API_KEY, which would
    # then be sent to API_BASE instead of the intended UCloud credential.
    raise RuntimeError(
        "UCLOUD_API_KEY is not set; export it before running this notebook."
    )

# OpenAI-compatible endpoint used for every model call below.
API_BASE = "https://api.modelverse.cn/v1"
model = ChatOpenAI(
    openai_api_key=API_KEY,
    openai_api_base=API_BASE,
    model="deepseek-ai/DeepSeek-V3-0324")

# Plain-string parser, kept available for chains that want raw text output.
output_parser = StrOutputParser()


In [2]:
from langchain_core.documents import Document
from langchain_community.document_loaders import DirectoryLoader, TextLoader

# Load everything under Github/ through TextLoader, decoding files as UTF-8.
# The encoding is forwarded via loader_kwargs rather than a wrapper callable.
loader = DirectoryLoader(
    "Github/",
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},
)
docs = loader.load()

docs

[Document(metadata={'source': 'Github\\4paradigm_results.txt'}, page_content='**OpenMLDB**  \n分类：2. 数据集  \nstar数字：1662  \n最后更新时间：2025-09-02  \n项目描述：OpenMLDB is an open-source machine learning database that provides a feature platform computing consistent features for training and inference.  \nurl: https://github.com/4paradigm/OpenMLDB  \n\n---\n\n**k8s-vgpu-scheduler**  \n分类：3. 工具  \nstar数字：572  \n最后更新时间：2025-09-02  \n项目描述：OpenAIOS vGPU device plugin for Kubernetes is originated from the OpenAIOS project to virtualize GPU device memory, in order to allow applications to access larger memory space than its physical capacity. It is designed for ease of use of extended device memory for AI workloads.  \nurl: https://github.com/4paradigm/k8s-vgpu-scheduler  \n\n---\n\n**AutoX**  \n分类：3. 工具  \nstar数字：539  \n最后更新时间：2025-08-07  \n项目描述：AutoX is an efficient automl tool, which is mainly aimed at data mining tasks with tabular data.  \nurl: https://github.com/4paradigm/AutoX  \n\n---\n\n**opena

In [3]:
from langchain.output_parsers import CommaSeparatedListOutputParser

# Parser that turns the model's comma-separated reply into a flat list of strings.
csvparser = CommaSeparatedListOutputParser()

# Output-format instruction injected into the prompt.
# The column list uses ASCII commas only: the list parser splits on "," and a
# full-width "，" between the first two column names would merge them into a
# single token if the model echoed the header verbatim, breaking any code that
# relies on a four-token header.
# NOTE(review): the trailing ''' is presumably meant to forbid Markdown code
# fences (```csv); confirm the intended wording before changing it.
format_instruction = "您的响应应该是csv格式的逗号分隔值的列表，必须是：`分类, url, 项目名称, star数量`。你的回答必须只包含每列的标题以及csv文件中的内容，没有任何其他信息，例如\'\'\' 和 csv。"

In [4]:
from langchain_core.prompts import PromptTemplate

# Prompt with two placeholders: {context} receives the material to classify and
# {format_instruction} receives the output-format rules. The template text is
# spelled out line by line so every newline is explicit.
prompt = PromptTemplate.from_template(
    "\n"
    "请查看以下内容并回答问题：\n"
    "\n"
    "{context}\n"
    "\n"
    "请分类每一个项目，相同类别的写在一起，不同类别直接空开一行。\n"
    "\n"
    "{format_instruction}\n"
)

In [5]:
# Pipeline: fill the prompt -> call the chat model -> parse the reply into a list.
chain = prompt | model | csvparser

# Smoke-test on the first document only.
# Pass the document's text (page_content) rather than the Document object:
# formatting the object into the template would insert its string form, which
# wraps the text in a page_content='...' metadata={...} envelope.
result = chain.invoke(
    {"context": docs[0].page_content, "format_instruction": format_instruction}
)
print(result)

['分类', 'url', '项目名称', 'star数量', '2. 数据集', 'https://github.com/4paradigm/OpenMLDB', 'OpenMLDB', '1662', '3. 工具', 'https://github.com/4paradigm/k8s-vgpu-scheduler', 'k8s-vgpu-scheduler', '572', '3. 工具', 'https://github.com/4paradigm/AutoX', 'AutoX', '539', '4. 其他', 'https://github.com/4paradigm/openaios-platform', 'openaios-platform', '98', '4. 其他', 'https://github.com/4paradigm/pafka', 'pafka', '67']


In [6]:
import csv

# Column order requested from the model; also the width of every CSV row.
HEADERS = ["分类", "url", "项目名称", "star数量"]
# Destination file, resolved relative to the notebook's working directory.
OUTPUT_CSV = 'github_csv.csv'


def chunk(lst, n):
    """Yield consecutive slices of ``lst`` holding at most ``n`` items each."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]


def _rows_from_tokens(tokens, headers=HEADERS):
    """Convert the parser's flat token list into fixed-width CSV rows.

    Args:
        tokens: Flat list of strings produced by the list output parser.
        headers: Expected column names; their count defines the row width.

    Returns:
        A list of rows, each exactly ``len(headers)`` cells wide. A leading
        header is removed only when it matches ``headers``, so a reply that
        omits the header does not lose its first data row. A short final row
        is padded with empty strings.
    """
    width = len(headers)
    tokens = list(tokens)
    leading = [str(tok).strip() for tok in tokens[:width]]
    body = tokens[width:] if leading == list(headers) else tokens
    return [list(row) + [''] * (width - len(row)) for row in chunk(body, width)]


# Build the pipeline once; it does not depend on the document being processed.
chain = prompt | model | csvparser

# Open the file a single time in write mode so re-running this cell regenerates
# the file instead of appending duplicate rows. No header row is written, which
# keeps the layout a fresh run produced before: data rows for each document
# followed by one empty separator row.
with open(OUTPUT_CSV, 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    for doc in docs:
        # Send the document text, not the Document object, into the prompt.
        result = chain.invoke(
            {"context": doc.page_content, "format_instruction": format_instruction}
        )
        print(result)

        writer.writerows(_rows_from_tokens(result))
        writer.writerow([])  # blank line between documents

['分类', 'url', '项目名称', 'star数量', '2. 数据集', 'https://github.com/4paradigm/OpenMLDB', 'OpenMLDB', '1662', '3. 工具', 'https://github.com/4paradigm/k8s-vgpu-scheduler', 'k8s-vgpu-scheduler', '572', '3. 工具', 'https://github.com/4paradigm/AutoX', 'AutoX', '539', '4. 其他', 'https://github.com/4paradigm/openaios-platform', 'openaios-platform', '98', '4. 其他', 'https://github.com/4paradigm/pafka', 'pafka', '67']
['分类', 'url', '项目名称', 'star数量', '1. 模型', 'https://github.com/AgibotTech/agibot_x1_infer', 'agibot_x1_infer', '1717', '1. 模型', 'https://github.com/AgibotTech/agibot_x1_train', 'agibot_x1_train', '1563', '1. 模型', 'https://github.com/AgibotTech/EnerVerse-AC', 'EnerVerse-AC', '111', '4. 其他', 'https://github.com/AgibotTech/agibot_x1_hardware', 'agibot_x1_hardware', '978', '4. 其他', 'https://github.com/AgibotTech/Genie-Envisioner', 'Genie-Envisioner', '240', '3. 工具', 'https://github.com/AgibotTech/genie_sim', 'genie_sim', '273', '2. 数据集', 'https://github.com/AgibotTech/EWMBench', 'EWMBench', '82']