# In [None]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import numpy as np
import pandas as pd
import re
import os
import requests
from PyPDF2 import PdfReader
from langchain_community.utilities import GoogleSerperAPIWrapper
from concurrent.futures import ProcessPoolExecutor
import concurrent.futures
import logging
import time
import spacy
import jieba
import uuid
import json

nlp = spacy.load("zh_core_web_sm")
from concurrent.futures import ThreadPoolExecutor
import fitz
import faiss
import os
from sentence_transformers import SentenceTransformer

# In [None]:
# Chat-completions endpoint that the get_llm_response functions below POST to.
url = "https://api.siliconflow.cn/v1/chat/completions"
# NOTE(review): the get_llm_response functions pass a module-level `headers`
# to requests.post, but the only assignment visible in this file is the
# commented-out (credential-redacted) template below. Unless `headers` is
# defined elsewhere, the resulting NameError is swallowed by their broad
# `except Exception` and they return their failure string on every call.
# headers = {
#     "Authorization": "Bearer ***",
#     "Content-Type": "application/json"
# }
def get_llm_response(question, timeout=120):
    """Send a question to the chat-completions endpoint and return the reply.

    The question is appended to a fixed Chinese instruction prompt and sent as
    a single user message to the ``Qwen/Qwen2-72B-Instruct`` model. The
    request uses the module-level ``url`` and ``headers``.

    NOTE(review): a later definition in this file reuses the name
    ``get_llm_response`` with a ``(question, context)`` signature; once that
    definition has executed it replaces this one.

    Args:
        question: The user's question, interpolated at the end of the prompt.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout ``requests.post`` can block indefinitely on a
            stalled connection.

    Returns:
        The ``content`` of the first choice's message when the endpoint
        answers with HTTP 200. For any other status code, or if any exception
        is raised while sending the request or reading the reply, a message is
        printed and the string ``"请求失败"`` ("request failed") is returned.
    """
    payload = {
        "model": "Qwen/Qwen2-72B-Instruct",
        "messages": [
            {
                "role": "user",
                "content": f"作为消化科医生，请用精简的中文回答，不要透露AI身份。直接给出结论，不要重复问题。请严格地以连续的自然段形式作答，不要分条列举。输出生成的内容前，请仔细检查文本内容，禁止使用###、##、#、***、**、*、 这样的格式标记内容，禁止重点标记、标题、空格。禁止回复答案来源，例如‘根据xxx研究’、‘根据搜索材料’、‘根据提供的信息’、‘基于提供的资料’之类的话术：{question}"
            }
        ],
        "stream": False,
        "max_tokens": 800,
        "temperature": 0.4,
        "top_p": 0.8,
        "top_k": 5,
        "frequency_penalty": 0.5,
        "n": 1,
        "response_format": {"type": "text"}
    }

    try:
        # Bounded wait: a hung connection now surfaces as a handled failure
        # instead of blocking the caller forever.
        response = requests.post(url, json=payload, headers=headers, timeout=timeout)
        if response.status_code == 200:
            return response.json()["choices"][0]["message"]["content"]
        print(f"请求失败，状态码：{response.status_code}")
        return "请求失败"
    except Exception as e:
        # Deliberately broad: callers always receive a string, never an
        # exception (this also covers malformed response bodies).
        print(f"请求发生错误：{e}")
        return "请求失败"

# Sentence-embedding model loaded from a local directory; process_chunks and
# process_question call its encode() method.
# NOTE(review): the path is a hard-coded absolute Windows path, and the
# directory name suggests all-MiniLM-L6-v2 — confirm that model is suitable
# for the Chinese text handled in this file.
model = SentenceTransformer('D:/python/BERTopic/all_MiniLM_L6_v2')

def sliding_window(text, window_size=20, step_size=15):
    """Split *text* into overlapping chunks of consecutive sentences.

    Sentences are taken from the module-level spaCy pipeline ``nlp``
    (``doc.sents``). Windows start at sentence 0 and advance by ``step_size``
    sentences; each chunk is the window's sentences joined by a single space.

    Unlike a plain ``range(0, n - window_size + 1, step_size)`` loop, the
    final window may be shorter than ``window_size``. This keeps text with
    fewer than ``window_size`` sentences (which would otherwise yield no
    chunks at all) and the trailing sentences that follow the last full
    window.

    Args:
        text: The text to split.
        window_size: Maximum number of sentences per chunk; must be positive.
        step_size: Number of sentences between window starts; must be
            positive.

    Returns:
        A list of chunk strings; empty if no sentences are found.

    Raises:
        ValueError: If ``window_size`` or ``step_size`` is not positive.
    """
    # Validate before running the (comparatively expensive) spaCy pipeline.
    if window_size <= 0 or step_size <= 0:
        raise ValueError("window_size and step_size must be positive")

    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]
    total = len(sentences)

    chunks = []
    start = 0
    while start < total:
        chunks.append(' '.join(sentences[start:start + window_size]))
        # Stop as soon as a window has reached the last sentence, so the tail
        # is emitted exactly once.
        if start + window_size >= total:
            break
        start += step_size
    return chunks

def process_chunks(chunks):
    """Encode a batch of text chunks with the module-level ``model``.

    Args:
        chunks: The texts to embed; passed straight to ``model.encode`` with
            ``batch_size=8``.

    Returns:
        Whatever ``model.encode`` returns, or ``None`` if it raises. In the
        failure case the error is printed rather than propagated.
    """
    encoded = None
    try:
        encoded = model.encode(chunks, batch_size=8)
    except Exception as exc:
        # Best-effort: report the problem and fall through to return None.
        print(f"Error processing chunks: {exc}")
    return encoded
    
# Chunk the source text and embed the chunks in batches of `batch_size`.
# NOTE(review): `text` is not assigned anywhere in the visible part of this
# file; it must already exist when this runs.
documents = sliding_window(text)
results = []
batch_size = 8 
batches = [documents[i:i + batch_size] for i in range(0, len(documents), batch_size)]
for batch in batches:
    # process_chunks returns None when encoding fails, so a failed batch is
    # stored as a None entry in `results`.
    result = process_chunks(batch)
    results.append(result)

# Stack the per-batch results into one array. process_question looks up
# documents[idx] for each row id the index returns, so the row order here is
# expected to match the order of `documents`.
# NOTE(review): np.concatenate raises if `results` is empty (no chunks were
# produced) or contains a None from a failed batch — confirm this is the
# intended failure mode.
embeddings = np.concatenate(results, axis=0)

# NOTE(review): these two loads repeat loads made earlier in this file with
# the same arguments.
model = SentenceTransformer('D:/python/BERTopic/all_MiniLM_L6_v2')
nlp = spacy.load("zh_core_web_sm")
# Flat L2 index whose dimensionality is the embedding width.
index = faiss.IndexFlatL2(embeddings.shape[1]) 
# The vectors are converted to float32 before being added to the index.
index.add(np.array(embeddings, dtype=np.float32)) 

def get_llm_response(question, context, timeout=120):
    """Ask the chat-completions endpoint a question with reference context.

    The question and the reference context are interpolated into a fixed
    Chinese instruction prompt and sent as a single user message to the
    ``Qwen/Qwen2-72B-Instruct`` model. The request uses the module-level
    ``url`` and ``headers``.

    Args:
        question: The patient's question.
        context: Reference text the model may draw on; the prompt tells the
            model to ignore it where it does not help.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout ``requests.post`` can block indefinitely on a
            stalled connection.

    Returns:
        The ``content`` of the first choice's message when the endpoint
        answers with HTTP 200. For any other status code, or if any exception
        is raised while sending the request or reading the reply, a message is
        printed and the string ``"请求失败"`` ("request failed") is returned.
    """
    payload = {
        "model": "Qwen/Qwen2-72B-Instruct",
        "messages": [
            {
                "role": "user",
                # NOTE(review): the backslash continuation joins the two
                # source lines, so the second line's leading indentation is
                # part of the prompt text. Left unchanged here to keep the
                # prompt byte-identical.
                "content":  f"作为消化科医生，请用精简的中文回答，不要透露AI身份。直接给出结论，不要重复问题。请严格地以连续的自然段形式作答，不要分条列举。输出生成的内容前，请仔细检查文本内容，禁止使用###、##、#、***、**、*、 这样的格式标记内容，禁止重点标记、标题、空格。禁止回复答案来源，例如‘根据xxx研究’、‘根据搜索材料’、‘根据提供的信息’、‘基于提供的资料’之类的话术。患者问题：{question}。\
                    参考内容不一定与问题完全相关，若参考内容对问题没有贡献，可以考虑无视。如果参考部分内容，请流畅地把相关内容加入到你的回答中，参考信息如下：{context}。"
            }
        ],
        "stream": False,
        "max_tokens": 800,
        "temperature": 0.4,
        "top_p": 0.8,
        "top_k": 5,
        "frequency_penalty": 0.5,
        "n": 1,
        "response_format": {"type": "text"}
    }

    try:
        # Bounded wait: a hung connection now surfaces as a handled failure
        # instead of blocking the caller forever.
        response = requests.post(url, json=payload, headers=headers, timeout=timeout)
        if response.status_code == 200:
            return response.json()["choices"][0]["message"]["content"]
        print(f"请求失败，状态码：{response.status_code}")
        return "请求失败"
    except Exception as e:
        # Deliberately broad: callers always receive a string, never an
        # exception (this also covers malformed response bodies).
        print(f"请求发生错误：{e}")
        return "请求失败"
    
def get_top_5_answers(question, timeout=15):
    """Return the snippets of the first five organic Serper search results.

    POSTs the question to the Serper search endpoint and reads the
    ``organic`` list from the JSON reply.

    Args:
        question: The search query.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout ``requests.post`` can block indefinitely on a
            stalled connection.

    Returns:
        A list of up to five snippet strings; a result without a ``snippet``
        key contributes ``'No Snippet Available'``. An empty list is returned
        (after printing an error) when the status code is not 200, and also
        when the request itself fails or the body is not valid JSON — so a
        search outage degrades the same way as a non-200 reply instead of
        raising.
    """
    url = "https://google.serper.dev/search"
    payload = json.dumps({
        "q": question
    })
    # NOTE(review): the API key is hard-coded (redacted here); consider
    # loading it from configuration instead.
    headers = {
        'X-API-KEY': '***',
        'Content-Type': 'application/json'
    }

    try:
        response = requests.post(url, headers=headers, data=payload, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Error: {exc}")
        return []

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return []

    try:
        data = response.json()
    except ValueError as exc:
        # requests raises a ValueError subclass when the body is not JSON.
        print(f"Error: {exc}")
        return []

    organic_results = data.get('organic', [])
    return [
        result.get('snippet', 'No Snippet Available')
        for result in organic_results[:5]
    ]
    
def process_question(question):
    """Answer a question using local chunks plus web-search snippets.

    Embeds the question with the module-level ``model``, fetches the five
    nearest chunks from the module-level FAISS ``index``, appends the snippets
    returned by ``get_top_5_answers`` (joined with the Chinese full stop), and
    passes the question and combined context to the LLM helper.

    Args:
        question: The user's question.

    Returns:
        Whatever ``get_llm_response_with_formatting`` returns.
    """
    query_embedding = model.encode([question])
    k = 5
    _, indices = index.search(np.array(query_embedding, dtype=np.float32), k)
    # FAISS fills missing neighbours with -1 when the index holds fewer than
    # k vectors; skip them so documents[-1] is not silently added as context.
    context = " ".join(documents[idx] for idx in indices[0] if idx >= 0)
    serper_result = "。".join(get_top_5_answers(question))
    context += f" {serper_result}"
    # NOTE(review): get_llm_response_with_formatting is not defined in the
    # visible part of this file; confirm it exists, or whether
    # get_llm_response(question, context) was intended.
    answer = get_llm_response_with_formatting(question, context)
    return answer