In [1]:
import os
# Set both proxy environment variables to the same local address in one call.
os.environ.update({
    "http_proxy": "127.0.0.1:7890",
    "https_proxy": "127.0.0.1:7890",
})

In [2]:
from openai import OpenAI

# Credentials are read from the environment so they never have to be written
# into the notebook. Both names still default to "" when the variable is unset.
api_key = os.environ.get("OPENAI_API_KEY", "")
api_base = os.environ.get("OPENAI_BASE_URL", "")
# An empty string is not a usable endpoint, so pass None in that case and let
# the SDK pick its default base URL.
client = OpenAI(api_key=api_key, base_url=api_base or None)

In [3]:
import telebot

# The bot token is read from the TELEGRAM_BOT_TOKEN environment variable so it
# is never stored in the notebook; it defaults to "" when the variable is unset.
BOT_TOKEN = os.environ.get("TELEGRAM_BOT_TOKEN", "")
bot = telebot.TeleBot(BOT_TOKEN)

In [4]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors

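# Assumes df has already been loaded and contains the 'Abstract' and 'Title' columns plus a link column (NOTE: the code below reads the link column as 'href', lowercase).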

# 1. Generate embeddings
def create_embeddings(text):
    """Return the embedding vector for ``text``.

    Sends ``text`` to the embeddings endpoint of the module-level ``client``
    with the 'text-embedding-3-large' model and returns the ``embedding``
    attribute of the first item in the response's ``data``.
    """
    result = client.embeddings.create(
        model='text-embedding-3-large',  # swap in another embedding model here if needed
        input=text,
    )
    first_item = result.data[0]
    return first_item.embedding

# Generate embeddings and store them in the 'Abstract embeddings' column of df
def generate_embeddings_for_dataframe(df, file_path):
    """Fill in missing abstract embeddings and save the frame when anything changed.

    Args:
        df: DataFrame with an 'Abstract' column. An 'Abstract embeddings'
            column is created when absent. The frame is modified in place.
        file_path: CSV path the frame is written to (without the index) when
            at least one embedding was generated.

    Returns:
        The same DataFrame, with every previously missing embedding filled in.
    """
    column = 'Abstract embeddings'

    if column not in df.columns:
        df[column] = None
    elif df[column].dtype != object:
        # A column that is entirely empty in the CSV is read back as float64,
        # which cannot hold a list per cell; object dtype can.
        df[column] = df[column].astype(object)

    embeddings_updated = False

    # Only rows whose embedding is missing trigger an embeddings request.
    for i, abstract in df['Abstract'].items():
        current = df.at[i, column]
        # pd.isna() on a list-like returns an element-wise array whose truth
        # value is ambiguous, so only scalars (None / NaN) count as missing.
        if pd.api.types.is_scalar(current) and pd.isna(current):
            df.at[i, column] = create_embeddings(abstract)
            embeddings_updated = True

    # Write back only when something changed, to avoid needless I/O.
    if embeddings_updated:
        df.to_csv(file_path, index=False)
        print(f"### 已更新嵌入并保存到 {file_path} ###")
    else:
        print("### 没有新的嵌入需要更新，跳过保存 ###")

    return df

import ast  # 用于将字符串转化为列表

# Normalise the abstract embeddings so that string entries become Python objects
def process_abstract_embeddings(abstract_embeddings):
    """Parse string-encoded embeddings and pass every other entry through.

    Each string entry is evaluated with ``ast.literal_eval``. A string that
    fails to parse is reported with ``print`` and left out of the result, so
    the returned list can be shorter than the input. Non-string entries are
    appended unchanged.
    """
    vectors = []
    for raw in abstract_embeddings:
        if not isinstance(raw, str):
            vectors.append(raw)
            continue
        try:
            parsed = ast.literal_eval(raw)
        except (ValueError, SyntaxError):
            print(f"无法解析嵌入：{raw}")
        else:
            vectors.append(parsed)
    return vectors

# 2. Find the top_k papers whose abstract embeddings are closest to the request embedding
def find_similar_papers(request_vector, df, top_k=5):
    """Return the papers whose abstract embeddings are nearest to ``request_vector``.

    Args:
        request_vector: Embedding of the user's request.
        df: DataFrame with 'Abstract embeddings', 'Title' and a link column
            ('href', or 'Href' when 'href' is absent).
        top_k: Maximum number of papers to return. It is capped at the number
            of rows that have a usable embedding.

    Returns:
        A list of dicts with keys 'Title', 'Href' and 'Distance', in the order
        returned by ``kneighbors``. Empty when no row has a usable embedding
        or when ``top_k`` is below 1.
    """
    # Embeddings read back from CSV are strings, so each one is parsed. Rows are
    # processed one at a time so that a row dropped by the parser cannot shift
    # the positions of the rows after it.
    row_positions = []
    vectors = []
    for position, raw in enumerate(df['Abstract embeddings'].tolist()):
        parsed = process_abstract_embeddings([raw])
        if parsed and pd.api.types.is_list_like(parsed[0]):
            row_positions.append(position)
            vectors.append(parsed[0])

    # NearestNeighbors cannot return more neighbours than it was fitted on.
    n_neighbors = min(top_k, len(vectors))
    if n_neighbors < 1:
        return []

    nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm='ball_tree').fit(vectors)
    distances, indices = nbrs.kneighbors([request_vector])

    # Keep reading 'href' when it exists; otherwise accept the capitalised name.
    link_column = 'href' if 'href' in df.columns else 'Href'

    # Collect title and link of each neighbour, mapped back to its DataFrame row.
    similar_papers = []
    for index, distance in zip(indices[0], distances[0]):
        position = row_positions[index]
        similar_papers.append({
            'Title': df['Title'].iloc[position],
            'Href': df[link_column].iloc[position],
            'Distance': distance
        })

    return similar_papers

In [5]:
# 3. Recommendation flow
@bot.message_handler(commands=['recommend'])
def recommend_initialize(message):
    """
    Ask the user to pick a field, then route their next message to handle_fields.
    """
    field_names = ('Education', 'Computer_Science', 'Medicine', 'Literature', 'Other')
    # One-time reply keyboard: the fields are offered as tappable choices.
    keyboard = telebot.types.ReplyKeyboardMarkup(one_time_keyboard=True)
    for name in field_names:
        keyboard.add(name)  # one add() call per field
    chat_id = message.chat.id
    bot.send_message(chat_id, "请选择一个领域:", reply_markup=keyboard)
    bot.register_next_step_handler(message, handle_fields)

def handle_fields(message):
    """Validate the chosen field and ask the user what they are looking for.

    The reply is user-controlled text that ends up inside a file path, so it
    is accepted only when it is one of the fields offered by the keyboard.
    Any other reply (including a non-text message, where ``message.text`` is
    None) is rejected with a hint and the flow stops.
    """
    allowed_fields = ('Education', 'Computer_Science', 'Medicine', 'Literature', 'Other')
    selected_field = (message.text or "").strip()  # e.g. Education
    if selected_field not in allowed_fields:
        bot.send_message(message.chat.id, "无效的领域,请重新发送 /recommend 并从键盘中选择一个领域。")
        return

    file_path = f"data/data_crawled/database_persistent/{selected_field}.csv"
    bot.send_message(message.chat.id, "您需要我推荐什么文章?")
    bot.register_next_step_handler(message, recommend_papers, file_path, top_k=3)

def recommend_papers(message, file_path, top_k=5):
    """Recommend papers from ``file_path`` for the request in ``message``.

    Args:
        message: Telegram message whose text is the user's request.
        file_path: CSV file holding the papers of the selected field.
        top_k: Maximum number of papers to recommend; capped at the number of
            rows in the file.

    The user is told, instead of the handler raising, when the message has no
    text, when the file does not exist, or when it contains no papers.
    """
    chat_id = message.chat.id

    request = message.text
    if not request or not request.strip():
        # Photos, stickers etc. carry no text to embed.
        bot.send_message(chat_id, "请发送文字描述您想要的文章。")
        return

    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        bot.send_message(chat_id, "该领域暂无数据,请重新发送 /recommend 选择其他领域。")
        return
    if df.empty:
        bot.send_message(chat_id, "该领域暂无数据,请重新发送 /recommend 选择其他领域。")
        return

    # Embed the user's request.
    request_embedding = create_embeddings(request)

    # Fill in any missing abstract embeddings and save them back to the file.
    df = generate_embeddings_for_dataframe(df, file_path)

    # The neighbour search cannot return more papers than there are rows.
    similar_papers = find_similar_papers(request_embedding, df, top_k=min(top_k, len(df)))
    if not similar_papers:
        bot.send_message(chat_id, "没有找到相关的文章。")
        return

    # Assemble the list of recommendations. The value is a distance, so it is
    # labelled as one: a smaller number means a closer match.
    total_content = "Here are the top recommended papers based on your request:\n\n"
    for paper in similar_papers:
        total_content += f"Title: {paper['Title']}\n"
        total_content += f"Link: {paper['Href']}\n"
        total_content += f"Distance (lower is more similar): {paper['Distance']}\n\n"

    # Let the chat model phrase the final reply.
    messages = [
        {"role": "system", "content": "You are an AI assistant that helps with academic paper recommendations."},
        {"role": "user", "content": total_content}
    ]

    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages
    )
    # Fall back to the raw list if the model returned no text.
    reply = completion.choices[0].message.content or total_content

    try:
        bot.send_message(chat_id, reply, parse_mode="Markdown")
    except telebot.apihelper.ApiTelegramException:
        # Telegram rejects text whose Markdown entities do not parse (e.g. a
        # stray '_' in a title or link); resend the same reply as plain text.
        bot.send_message(chat_id, reply)

In [6]:
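# test (disabled) -- NOTE: the call below predates the current recommend_papers(message, file_path, top_k) signature and would need updating before use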
# file_path = "data/data_crawled/database_persistent/Education.csv"
# df = pd.read_csv(file_path)

# # df = df.drop(columns=['Abstract embeddings'])
# # df.to_csv(file_path, index=False)

# # Generate recommendations for the user's request
# request = "I want a paper about empathy"
# recommended_papers = recommend_papers(request, df, file_path, top_k=3)  # note: top_k must not exceed len(df)

# print(recommended_papers)

In [7]:
# Alternative entry point, kept for reference: bot.infinity_polling()
# Start the bot's polling loop with its default settings so the handlers
# registered above begin receiving messages.
bot.polling()

### 没有新的嵌入需要更新，跳过保存 ###
