In [None]:
import os

# Read the key from the OPENAI_API_KEY environment variable so it never has to
# be typed (and saved) into the notebook; falls back to '' exactly as before.
openai_api_key = os.environ.get("OPENAI_API_KEY", "")
# Path of the pickle to annotate. Should be the output of the first_pass_gen.
work_on_data = 'data_with_reasonings_0.pkl'

In [1]:
from typing import Annotated, Literal, TypedDict, Any
from langchain_core.messages import HumanMessage, AIMessage, ToolMessage
from langchain_core.tools import tool
from langgraph.checkpoint.memory import MemorySaver
from langgraph.graph import END, START, StateGraph, MessagesState
from langgraph.prebuilt import ToolNode
from langchain_ollama import ChatOllama
from langchain_core.output_parsers import StrOutputParser
import pandas as pd
import pickle
from tqdm import tqdm
import os
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
import json
import uuid
from langchain_core.documents import Document
from time import time
import chromadb
from langgraph.graph.message import add_messages
from pydantic import Field, BaseModel
from langchain_community.retrievers import BM25Retriever
import loguru
import pickle
import requests as rq
from bs4 import BeautifulSoup
from urllib.parse import quote, quote_plus, urlparse, parse_qs, urlunparse, urlencode
from concurrent.futures import ThreadPoolExecutor
from langchain_core.utils.function_calling import convert_to_openai_function
from langchain_openai import ChatOpenAI

In [2]:
# Structured-output schema: a short rationale plus the list of retrieval
# queries the model wants to issue.
# Not referenced by any cell visible in this notebook; presumably mirrored from
# the first-pass notebook -- TODO confirm whether it is still needed here.
# (Notes are kept as `#` comments: a docstring on a pydantic model is included
# in its generated JSON schema.)
class RetrievalRequest(BaseModel):

    # Why the model chose the queries below.
    reasoning: str = Field(
        description="The process of how you arrived on the following queries you are going to make based on the input. Reasoning should be concise and contain only important points."
    )
    # The queries themselves, each phrased as a question.
    queries: list[str] = Field(
        description="The queries you want to make. Each should be a question."
    )

# Structured-output schema holding a condensed version of a retrieved message.
# Not referenced by any cell visible in this notebook -- TODO confirm whether
# it is still needed here.
class RefinedResult(BaseModel):

    # The condensed text (the description asks for 2 to 3 sentences at most).
    refined_result: str = Field(
        description="The refined result of the given message. Should be in no more than 2 to 3 sentences. Should contain no more content than what's related to the query."
    )

# Structured-output schema for a yes/no judgement, with its justification, on
# whether the doctor's answer can be deduced from the given information.
# Not referenced by any cell visible in this notebook -- TODO confirm whether
# it is still needed here.
class ReasoningCheck(BaseModel):

    # Justification for the verdict below.
    reasoning: str = Field(
        description="The reason why you assert that you can deduce the answer of the professional doctor from the given information, or why you cannot deduce it. Reasoning should be concise and contain only important points."
    )
    # The verdict itself.
    deducible: bool = Field(description="Whether you can deduce the answer of the professional doctor from the given information.")

# Structured-output schema for one reply sent to the patient.
# rework() reads `.is_asking` and `.text` from the object stored under
# each['response_structured_with_raw']['parsed']; presumably those objects are
# instances of this class restored from the pickle -- TODO confirm.
class Response(BaseModel):

    # Why the reply asks for more information or tells the user something.
    reasoning: str = Field(
        description="The reason why you ask the user for more information or respond to them. Reasoning should be concise and contain only important points."
    )
    # True -> the reply is an "ask"; False -> it is a "tell".
    # NOTE(review): the description below contains the typo "Wehther"; it is
    # runtime schema text and is deliberately left untouched here.
    is_asking: bool = Field(
        description="Wehther your sentence asks the user for more information or tell them something. If this is true, this response will be marked as ask, or else it will be marked as tell."
    )
    # Important medical entities mentioned in the conversation.
    entities: list[str] = Field(
        description="The medical entities related to this conversation. Only the important ones should be included."
    )
    # The reply text itself.
    text: str = Field(
        description="The text sent to the patient."
    )

# One exchange of a dialogue: what the patient said and what the doctor said.
Conversation = TypedDict(
    "Conversation",
    {
        "patient": str,
        "doctor": str,
    },
)

# A retrieval query paired with the refined result obtained for it.
RetrievalItem = TypedDict(
    "RetrievalItem",
    {
        "query": str,
        "refined_result": str,
    },
)

# Dict shape with the raw AI message, the parsed object and any parsing error.
# rework() reads the 'parsed' entry of each['response_structured_with_raw'];
# presumably that value has this shape -- TODO confirm against the producer.
StructuredOutputWithRaw = TypedDict(
    "StructuredOutputWithRaw",
    {
        "raw": AIMessage,
        "parsed": Any,
        "parsing_error": Any,
    },
)

In [3]:
# Ollama-served chat model; rework() uses it to extract the question part of a reply.
AUX_MODEL_NAME = "qwen2.5:32b"
aux_llm = ChatOllama(model=AUX_MODEL_NAME)

In [4]:
# Structured-output schema requested from aux_llm in rework(): the question
# part of a sentence.
# Notes are kept as `#` comments on purpose: a docstring on a pydantic model is
# included in its generated JSON schema and would change what the model sees.
# NOTE(review): the description says "teacher" while the rest of the notebook
# is about doctors and patients -- confirm whether that wording is intended.
class AskingPart(BaseModel):

    asking: str = Field(
        description="The part of the sentence that the teacher asked."
    )
# Chinese prompt template: asks the model to repeat the question part of the
# sentence substituted for `{}` and to answer in the AskingPart format.
# Filled with str.format() in rework().
prompt_ask = "重复句子\n{}\n的问句部分，以 AskingPart 的方式给出。"

In [5]:
# Chinese prompt template for the object-extraction step in rework().
# It lists seven kinds of things a doctor may ask about (disease, clinical
# presentation, drug, surgery, body part, examination item, and consultation
# concepts such as sex/age/onset time), then asks for every such object in the
# doctor's question -- substituted for the single `{}` placeholder -- in the
# MedicalAskContent format. It also instructs that every returned item must
# appear in the question and that each category may have zero or more items.
prompt = """在医患对话场景中，医生通常会提问以下几种对象。

- 疾病：即具体的疾病名称。
- 临床表现：即患者的症状，例如：头疼。
- 药物：即具体的药品名称，例如：阿司匹林。
- 手术：即具体的手术名称，例如：阑尾切除，心脏搭桥。
- 身体部位：即身体部位，例如：头，颈，胸，背，腰，腿，足。
- 检查项目：包括检查项目，检查结果，检查结论。
- 问诊医学概念：包括性别,年龄, 职业, 发病时间, 伴随症状，饮食情况，症状程度，曾用药，是否手术等等。

现在，对于以下这个医生的提问，

{}

给出医生提问中的所有对象，以 MedicalAskContent 的格式给出。

注意，你给出的所有内容必须出现在医生的提问中。每种对象可能不存在，也可能存在任意数目个。
"""

# Structured-output schema requested from llm in rework(): the medical objects
# found in a doctor's question, one list per category. Every list defaults to
# empty, so a category with no matches may simply be omitted by the model.
# Notes are kept as `#` comments on purpose: a docstring on a pydantic model is
# included in its generated JSON schema and would change what the model sees.
class MedicalAskContent(BaseModel):
    
    # Disease names.
    disease: list[str] = Field(
        default_factory=list,
        description="医生提问中的疾病对象"
    )
    # Symptoms / clinical presentation.
    symptom: list[str] = Field(
        default_factory=list,
        description="医生提问中的症状对象"
    )
    # Drug names.
    # NOTE(review): the field is spelled `medcine`; it is a schema key and the
    # attribute name on stored results, so renaming it would affect consumers.
    medcine: list[str] = Field(
        default_factory=list,
        description="医生提问中的药物对象"
    )
    # Surgery names.
    surgery: list[str] = Field(
        default_factory=list,
        description="医生提问中的手术对象"
    )
    # Body parts.
    body_part: list[str] = Field(
        default_factory=list,
        description="医生提问中的身体部位对象"
    )
    # Examination items.
    medical_check: list[str] = Field(
        default_factory=list,
        description="医生提问中的检查项目对象"
    )
    # Consultation concepts.
    concept: list[str] = Field(
        default_factory=list,
        description="医生提问中的问诊医学概念对象"
    )

In [6]:
# Load the records to annotate. The context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
# NOTE: unpickling can execute arbitrary code -- only point work_on_data at
# files produced by your own pipeline.
with open(work_on_data, "rb") as pickle_file:
    ld = pickle.load(pickle_file)

In [8]:
# OpenAI chat model; rework() uses it for the MedicalAskContent extraction.
OPENAI_MODEL_NAME = "gpt-4o-mini"
llm = ChatOpenAI(model=OPENAI_MODEL_NAME, api_key=openai_api_key)

In [9]:
def rework(each):
    """Annotate one record in place with its question text and medical objects.

    Args:
        each: Mutable mapping whose ``['response_structured_with_raw']['parsed']``
            entry exposes ``is_asking`` and ``text`` attributes.

    Keys written to ``each``:
        is_asking: Copy of the parsed ``is_asking`` flag.
        ask_sentence: ``""`` when the response is not a question; otherwise the
            ``asking`` field returned by ``aux_llm`` for the response text.
        objs: Result of the ``MedicalAskContent`` extraction by ``llm``. Only
            written when the response is a question.

    Returns:
        None. The record is modified in place.
    """
    parsed = each['response_structured_with_raw']['parsed']
    if not parsed.is_asking:
        each['is_asking'] = False
        each['ask_sentence'] = ""
        return
    each['is_asking'] = True
    asking_part = aux_llm.with_structured_output(AskingPart).invoke(
        prompt_ask.format(parsed.text)
    )
    each['ask_sentence'] = asking_part.asking
    # Fill the template with the extracted question text itself. Formatting the
    # AskingPart object instead would embed its repr ("asking='...'") in the
    # prompt sent to the second model.
    each['objs'] = llm.with_structured_output(MedicalAskContent).invoke(
        prompt.format(asking_part.asking)
    )

In [10]:
# Annotate every record in place; rework() returns nothing and only adds keys.
# Access stays index-based because only len() and integer indexing are known
# to be supported by ld.
record_count = len(ld)
for idx in tqdm(range(record_count)):
    record = ld[idx]
    rework(record)

100%|██████████| 4/4 [00:09<00:00,  2.37s/it]


In [11]:
# Persist the annotated records back to the input path.
# Opening work_on_data with "wb" truncates it immediately, so a failing
# pickle.dump would destroy the input. Write to a sibling temp file first and
# swap it in only after the dump has completed and the handle is closed.
tmp_output_path = f"{work_on_data}.tmp"
with open(tmp_output_path, "wb") as out_file:
    pickle.dump(ld, out_file)
os.replace(tmp_output_path, work_on_data)