<a href="https://colab.research.google.com/github/Crossme0809/langchain-tutorials/blob/main/Using_OpenAI__LangChain_And_HDBSCAN_Clustering_Documents.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# Print the version of the `python3.10` binary found on PATH.
# NOTE(review): this is not necessarily the interpreter running this kernel — confirm.
!python3.10 --version

Python 3.10.12


## **安装依赖**

In [None]:
# Install the packages listed in requirements.txt (path is relative to the working directory).
%pip install -r requirements.txt

In [18]:
import os

import hdbscan
import pandas as pd

from langchain import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from newsapi import NewsApiClient

from dotenv import load_dotenv

# Load variables from a .env file into the process environment.
# Presumably this is where NEWSAPI_API_KEY (read below via os.getenv) comes from — verify.
load_dotenv()

True


## 从相关来源获取新闻报道（因为接口限速原因，这里每组来源最多获取 50 条用于演示）

In [29]:
# NewsAPI client; the key is taken from the NEWSAPI_API_KEY environment variable.
newsapi = NewsApiClient(api_key=os.environ.get("NEWSAPI_API_KEY"))

# Source identifiers, kept in two groups that are each queried on their own.
sources_1 = ["the-washington-post", "the-wall-street-journal", "business-insider"]
sources_2 = ["google-news"]

In [30]:
# Query NewsAPI once per source group and collect every returned article
# into a single flat list.
recent_articles = []
for source in (sources_1, sources_2):
    response = newsapi.get_everything(
        sources=",".join(source),
        language="zh",
        page_size=50,
    )
    recent_articles += response["articles"]

In [None]:
# Preview only: report how many articles were fetched and show the first few
# records, instead of dumping the entire list into the cell output.
print(f"Fetched {len(recent_articles)} articles")
recent_articles[:3]

## 生成新闻文章的嵌入向量

In [32]:
# One text per article: the title, a blank line, then the description
# (an empty string when the description is missing).
docs = [
    "\n\n".join((article["title"], article["description"] or ""))
    for article in recent_articles
]

In [33]:
# Embed every document text with the OpenAI embeddings wrapper.
embedder = OpenAIEmbeddings(chunk_size=1000)
embeddings = embedder.embed_documents(docs)

## 对文档进行聚类，并将结果存储在一个 DataFrame 中

In [34]:
# Fit HDBSCAN on the embedding vectors; the fitted estimator's labels_ are
# used afterwards to tag each article with its cluster.
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=3,
    min_samples=3,
    gen_min_span_tree=True,
)
hdb = clusterer.fit(embeddings)

In [35]:
# One row per fetched article, tagged with the cluster label from the fitted
# HDBSCAN model.
df = pd.DataFrame({
    "title": [article["title"] for article in recent_articles],
    "description": [article["description"] for article in recent_articles],
    "cluster": hdb.labels_,
})
# Drop documents that were not assigned to any cluster (label -1).
# The explicit .copy() makes the filtered frame independent of the original,
# so writing a new column into it later does not raise SettingWithCopyWarning.
df = df.query("cluster != -1").copy()

## 根据每个聚类中的文档生成聚类主题

In [36]:



def get_prompt():
    """Build the chat prompt used to ask for a topic title for a set of articles.

    The prompt consists of a system message with the instructions and a human
    message that embeds the ``{articles}`` placeholder.

    Returns:
        ChatPromptTemplate: a two-message template whose only input variable
        is ``articles``.
    """
    system_message = SystemMessagePromptTemplate.from_template(
        "你是一位记者专家。你要帮我为新闻文章写一个引人注目的主题标题。"
    )
    human_message = HumanMessagePromptTemplate.from_template(
        "使用以下文章，写一个能概括这些文章的主题标题。\n\nARTICLES:{articles}\n\nTOPIC TITLE:"
    )
    return ChatPromptTemplate(
        input_variables=["articles"],
        messages=[system_message, human_message],
    )


prompt = get_prompt()

# The chain does not depend on the cluster being processed, so it is built
# once here and reused for every cluster.
chain = LLMChain(
    llm=ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo-0613"),
    prompt=prompt,
    verbose=False,
)

# For each cluster: concatenate its articles, ask the LLM for a topic title,
# and store that title on every row of the cluster.
for c in df.cluster.unique():
    in_cluster = df.cluster == c
    articles_str = "\n".join(
        # A missing description becomes an empty string, so the literal text
        # "None" is never sent to the model.
        f"{article['title']}\n{article['description'] or ''}\n"
        for article in df.loc[in_cluster].to_dict(orient="records")
    )
    result = chain.run({"articles": articles_str})
    df.loc[in_cluster, "topic_title"] = result

In [41]:
# Inspect one cluster: print its generated topic title, then show its first rows
# with column truncation disabled so the full text is visible.
c = 1
cluster_rows = df[df.cluster == c]
with pd.option_context("display.max_colwidth", None):
    print(cluster_rows.topic_title.values[0])
    display(cluster_rows.head())

"粽情端午：龙舟竞渡、非遗手工技艺和粽子包制活动丰富多彩"


Unnamed: 0,title,description,cluster,topic_title
11,龙舟竞渡华亭湖！松江区第十三届端午龙舟赛上午开赛_郊野 - 新民网,龙舟竞渡华亭湖！松江区第十三届端午龙舟赛上午开赛_郊野 新民网,1,"""粽情端午：龙舟竞渡、非遗手工技艺和粽子包制活动丰富多彩"""
17,端午假期首日长三角铁路迎来客流高峰预计发送旅客340万人次 - 无锡新传媒,端午假期首日长三角铁路迎来客流高峰预计发送旅客340万人次 无锡新传媒,1,"""粽情端午：龙舟竞渡、非遗手工技艺和粽子包制活动丰富多彩"""
19,看演出体验非遗手工技艺北京西城端午活动精彩纷呈 - beijing.qianlong.com,看演出体验非遗手工技艺北京西城端午活动精彩纷呈 beijing.qianlong.com,1,"""粽情端午：龙舟竞渡、非遗手工技艺和粽子包制活动丰富多彩"""
24,《颂·黄钟大吕》在国家大剧院音乐厅上演 - China Daily,《颂·黄钟大吕》在国家大剧院音乐厅上演 China Daily,1,"""粽情端午：龙舟竞渡、非遗手工技艺和粽子包制活动丰富多彩"""
27,龙舟竞渡正端午长三角龙舟邀请赛在金山山阳镇举行_新民社会 - 新民网,龙舟竞渡正端午长三角龙舟邀请赛在金山山阳镇举行_新民社会 新民网,1,"""粽情端午：龙舟竞渡、非遗手工技艺和粽子包制活动丰富多彩"""
