In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from tqdm.notebook import tqdm
import sys
import numpy as np
sys.path.append('..')
import config
import os
import csv

from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

from opencc import OpenCC

# 初始化转换器，t2s表示从繁体转简体
cc = OpenCC('t2s')

data_path_prefix = config.data_path
vector_db_path = config.vector_db_path
os.environ["OPENAI_API_KEY"] = config.api_key
os.environ["OPENAI_API_BASE"] = config.api_base
model_name = config.model
# model_name = 'gpt-4'
retrive_top_k = config.top_k

In [2]:
embed_model = OpenAIEmbeddings(
    openai_api_base=os.environ["OPENAI_API_BASE"],
    openai_api_key=os.environ["OPENAI_API_KEY"],
    request_timeout=60,
)

In [3]:
def chat(system_message=None, human_message=None, model_name=model_name):
    answer = ''
    api_key = os.environ["OPENAI_API_KEY"]
    api_base = os.environ["OPENAI_API_BASE"]
    message=[{"role":"system","content":system_message},{"role":"user","content":human_message}]
    client = ChatOpenAI(
        openai_api_key=api_key,
        openai_api_base=api_base,
        model=model_name,
        temperature=0.2,
    )
    answer = client.invoke(message).content.strip()
    return answer

In [4]:
# 读取tag.txt文件
tags_data = pd.read_csv(os.path.join(data_path_prefix, 'tag.txt'), sep='\t', header=None, names=['user_id', 'tags'])
# 提取所有的标签，并统计每个标签的出现次数
tag_counts = dict()
all_tags = tags_data['tags'].dropna().str.split('_')
for tag in tqdm(all_tags):
    for t in tag:
        simplified_text = cc.convert(t)
        tag_counts[simplified_text] = tag_counts.get(simplified_text, 0) + 1

  0%|          | 0/275717 [00:00<?, ?it/s]

In [5]:
# 将标签出现次数转化为数据框
tag_count_distribution = pd.DataFrame.from_dict(tag_counts, orient='index', columns=['count']).sort_values('count', ascending=False)

# 重要类别
k = 500
main_tags = tag_count_distribution[:k]
# 长尾类别（注意此处因为目前是测试阶段，因此限制了出现次数的上下限，正式运行时，此处可以改为>5。>5的数据共有16000条左右，还在可以接受的范围里。另，全量共有28w条左右的标签，不可能全量运行）
long_tail_tags = tag_count_distribution[(tag_count_distribution['count']>20) & (tag_count_distribution['count']<26)][k:]

In [6]:
def vectorstore_from_kg(data=None, output_path: str = None, embed_model=embed_model, save=True
):
    if data is None:
        data = []
    if not os.path.exists(output_path):
        feature = data
        vectorstore = FAISS.from_texts(texts=feature, embedding=embed_model)
        if save:
            vectorstore.save_local(output_path)
    else:
        vectorstore = FAISS.load_local(output_path, embed_model, allow_dangerous_deserialization=True)
    return vectorstore

In [7]:
# 对重要标签进行处理，对齐其中语义相近的标签
tag_des = []
split_flag = '，该标签的描述为：'
if os.path.exists(os.path.join(data_path_prefix, "main_tags.csv")):
    df = pd.read_csv(os.path.join(data_path_prefix, "main_tags.csv"))
    # 读取向量数据库
    vector_db = vectorstore_from_kg(output_path=vector_db_path)
else:
    # 将标签逐个进行描述，增加额外的信息，使得嵌入向量更加合理
    for tag in tqdm(main_tags.index):
        system_messgae = "你是一个对用户兴趣标签十分了解的专家，请在50个字以内描述一下输入给你的用户兴趣标签。请注意，输入的词语并不是单纯的词，而是一个用户的兴趣，你需要做的是阐述当这个词被当作用户画像的一部分时，它的含义是什么。"
        human_message = tag
        answer = chat(system_messgae, human_message)
        tag_des.append(answer)
    main_tags["tag_des"] = tag_des

    # 把标签及其描述处理成如[音乐，该标签的描述为XXX]的形式
    data = list(
        main_tags.reset_index(drop=False).apply(
            lambda x: x[0] + split_flag + x["tag_des"], axis=1
        )
    )
    # 创建临时向量数据库
    vector_db_temp = vectorstore_from_kg(data, vector_db_path, save=False)

    # 对重要标签中的每个标签，都从向量数据库中检索top2的标签，其中第一个为该标签本身，第二个为与其相似的标签
    data_list = []
    for i in tqdm(data):
        retrieve = [
            (j.page_content, s)
            for j, s in vector_db_temp.similarity_search_with_relevance_scores(i, 2)
        ]
        data = [
            retrieve[0][0],  # tag1
            tag_count_distribution.loc[retrieve[0][0].split(split_flag)[0]][
                "count"
            ],  # tag1_count
            retrieve[1][0],  # tag2
            tag_count_distribution.loc[retrieve[1][0].split(split_flag)[0]][
                "count"
            ],  # tag2_count
            retrieve[0][1]
            - retrieve[1][1],  # tag1与tag2检索分数的差值，用于衡量两个标签有多相近
        ]
        data_list.append(data)
    df = pd.DataFrame(
        data_list,
        columns=["tag1", "tag1_count", "tag2", "tag2_count", "relevance_score_diff"],
    )

    # 将相近的标签进行对齐，生成最终的tag
    df["final_tag"] = df.apply(
        lambda x: (
            x["tag1"]
            if x["relevance_score_diff"]
            >= 0.1  # 差值大于0.1的不考虑合并，经观察，没有合并的必要
            or (
                len(x["tag1"].split(split_flag)[0])
                == len(x["tag2"].split(split_flag)[0])
                and x["tag1_count"] >= x["tag2_count"]
            )
            or len(x["tag1"].split(split_flag)[0])
            < len(
                x["tag2"].split(split_flag)[0]
            )  # 考虑标签的长度，希望最终的标签尽可能短，过长的标签有较大可能是如“偶稀饭睡觉觉”这类描述性标签，转化为“爱睡觉”是比较合理的。另外，当标签长度一致的时候，使用出现次数进行进一步的判断。
            else x["tag2"]
        ),
        axis=1,
    )
    
    # 把标签及其描述处理成如[音乐，该标签的描述为XXX]的形式
    data = list(df['final_tag'].drop_duplicates())
    # 创建向量数据库
    vector_db = vectorstore_from_kg(data, vector_db_path, save=True)
    
    df.to_csv(os.path.join(data_path_prefix, "main_tags.csv"), index=False)

In [11]:
data_list = []
data_temp = pd.read_csv(os.path.join(data_path_prefix, 'processed_tags.csv'), header=None)
check_point = len(data_temp)
long_tail_tags_list= long_tail_tags.index.tolist()[check_point:]
for i, tag in tqdm(enumerate(long_tail_tags_list), total=len(long_tail_tags_list)):
    # 为每个tag生成一段描述
    system_messgae = "你是一个对用户兴趣标签十分了解的专家，请在50个字以内描述一下输入给你的用户兴趣标签。请注意，输入的词语并不是单纯的词，而是一个用户的兴趣，你需要做的是阐述当这个词被当作用户画像的一部分时，它的含义是什么。"
    human_message = tag
    answer = chat(system_messgae, human_message)
    tag_with_desc = tag + split_flag + answer
    # 利用标签及其描述进行检索
    retrieve_tags = [(i.page_content.split(split_flag)[0]) for i, _ in vector_db.similarity_search_with_relevance_scores(tag_with_desc, retrive_top_k)]
    
    # 读取分组的prompt
    with open('./prompt.txt', 'r') as f:
        prompt = f.read()
    # 进行标签分组
    system_messgae = '你是一个标签分组的标注员，请协助用户从候选标签组中选择出一个与输入标签最匹配的标签组，若用户提供的候选列表中没有合适的标签组，请返回None'
    human_message = prompt.replace('<<<tag>>>', tag_with_desc).replace('<<<candidates>>>', ','.join(retrieve_tags))
    answer = chat(system_messgae, human_message)
    data_for_tag = [i+check_point, tag, answer, retrieve_tags, tag_with_desc]
    data_list.append(data_for_tag)
    # 将结果持续追加至本地
    with open(os.path.join(data_path_prefix, 'processed_tags.csv'), 'a', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(data_for_tag)

41


  0%|          | 0/382 [00:00<?, ?it/s]