## Azure调用

In [None]:
import os
# Configure the OpenAI client through environment variables so that requests
# are sent to an Azure OpenAI resource instead of the public OpenAI endpoint.
os.environ["OPENAI_API_TYPE"] = "azure"
os.environ["OPENAI_API_VERSION"] = "2023-05-15"
#os.environ["OPENAI_API_VERSION"] = "2023-03-15-preview"
# Base URL of the Azure OpenAI resource.
os.environ["OPENAI_API_BASE"] = "https://omdbeta-ae-01.openai.azure.com"
# API key (redacted in this copy of the notebook).
os.environ["OPENAI_API_KEY"] = "***"

In [2]:
# Chat-style call through the LangChain API
import os
from langchain.chat_models import AzureChatOpenAI
from langchain.schema import HumanMessage

# "gpt-4" is passed as the Azure deployment name.
llm = AzureChatOpenAI(deployment_name="gpt-4")

# Send a single human message and print the message object that comes back.
msg = HumanMessage(content="Explain step by step. How old is the president of USA?")
print(llm(messages=[msg]))

content="As an AI developed by OpenAI, I can't provide real-time data or updates. But here's a general way to find out the age of the current president of the USA:\n\n1. Firstly, you need to know who the current president is. As of my last update in October 2021, the president of the United States is Joe Biden.\n\n2. Joe Biden was born on November 20, 1942.\n\n3. To calculate his age, subtract the year he was born from the current year. If the current date is before November 20, subtract an additional year because his birthday hasn't occurred yet this year.\n\nRemember, for the most accurate and up-to-date information, you should look up the current president and their date of birth on a trusted source like a news website or an encyclopedia."


## 开源模型调用方式

In [None]:
import pandas as pd
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter,MarkdownTextSplitter
import langchain.text_splitter
from langchain.vectorstores import Chroma
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.embeddings import HuggingFaceBgeEmbeddings
import IPython
import sentence_transformers
from langchain.vectorstores import Chroma
import pandas as pd

In [None]:
# File and directory paths used by the following cells
qa_file = './qa.csv'  # raw Q&A table
qa_file_cleaned = './qa_clean.csv'  # cleaned text written by the preprocessing cell
embedding_file = './embedding'  # directory where the vector store is persisted

In [None]:
# Q&A preprocessing: turn each entry into one chunk of the cleaned file
df=pd.read_csv(qa_file)
# Strip each 'Similar questions' value, remove every '\n\n' inside it, then
# append a single '\n\n' so that each entry ends with the chunk separator.
df['new'] = df['Similar questions'].str.strip().str.replace('\n\n', '')  + '\n\n'
df['new']  # bare expression: has no effect here because it is not the last statement of the cell
# Written without header or index; 'utf-8-sig' puts a byte-order mark at the start of the file.
df['new'].to_csv(qa_file_cleaned,encoding='utf-8-sig', header=False, index=False)

In [None]:
## Document splitting
# Read with 'utf-8-sig' because the cleaned file is written with that encoding:
# plain 'utf8' would keep the byte-order mark as the first character of the
# first chunk, so that chunk would no longer equal its source question.
md_loader = TextLoader(qa_file_cleaned, encoding='utf-8-sig')
md_doc = md_loader.load()

# Split on the blank-line separator that ends every entry of the cleaned file.
# NOTE(review): chunk_size=25 is small; confirm that it yields exactly one
# question per chunk for the real data.
markdown_splitter = CharacterTextSplitter(separator = '\n\n',  chunk_size=25, chunk_overlap=0 )
md_docs = markdown_splitter.split_documents(md_doc)
print(len(md_docs))

In [None]:
## Vectorise the knowledge base
from langchain.vectorstores import Chroma
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.embeddings import HuggingFaceBgeEmbeddings
import IPython
import sentence_transformers

# Model choice; candidates are listed at https://huggingface.co/models?pipeline_tag=sentence-similarity&sort=downloads
model_name = "BAAI/bge-large-zh-v1.5"
encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity
embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    #model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

# Replace the wrapper's client with a SentenceTransformer built from the same model name.
# NOTE(review): presumably kept so a device (e.g. 'mps') can be passed here — confirm it is still needed.
embeddings.client = sentence_transformers.SentenceTransformer(embeddings.model_name) #, device='mps'
print("embeddings", embeddings)

In [None]:
# Build the vector store from the chunks and persist it in a local directory
db = Chroma.from_documents(md_docs, embeddings, persist_directory=embedding_file)

# Re-open the store from the persisted directory
db = Chroma(persist_directory=embedding_file, embedding_function=embeddings)
# print(dir(db))
# Fetch a single stored record as a quick sanity check.
db.get(limit =1)

In [None]:
# Retrieve reference Q&A pairs from the knowledge base
import warnings 
warnings.filterwarnings("ignore")

def RetriveQA(question):
    """Build a reference-context string from the closest knowledge-base questions.

    Queries the module-level vector store ``db`` for stored questions similar
    to ``question`` (similarity score threshold 0.7, at most 3 hits). For each
    of the two best hits, the matching ``'Standard answer'`` is looked up in
    the module-level ``df_qa`` table by exact equality on
    ``'Similar questions'``.

    NOTE(review): ``df_qa`` is not defined in the visible cells (the table
    loaded there is named ``df``) — confirm where it comes from.

    Args:
        question: User question to search for.

    Returns:
        str: One labelled question line and one labelled answer line per
        reference, each terminated by a newline. Empty string when no hit
        passes the threshold or no hit has a matching row in ``df_qa``.
    """
    retriever = db.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={"score_threshold": .7, "k": 3},
    )
    similar_docs = retriever.get_relevant_documents(question)

    parts = []
    # Only the two best hits are used as references.
    for doc in similar_docs[:2]:
        # Strip double quotes and newlines (presumably CSV quoting left over
        # from the cleaned file) so the text can be compared with the table.
        ref_question = doc.page_content.replace('"', '').replace('\n', '')
        answers = df_qa.loc[df_qa['Similar questions'] == ref_question, 'Standard answer'].values
        if len(answers) == 0:
            # No row for this chunk: skip it. Concatenating the empty array,
            # as the old code did, turned the whole result into an empty array.
            continue
        # Use the first match as plain text. Adding the NumPy array itself to
        # the strings produced an ndarray result instead of a str.
        parts.append("【参考问题】" + ref_question + '\n' + "【参考答案】" + str(answers[0]) + '\n')
    return ''.join(parts)

## openai调用方式

In [None]:
import openai
import gradio as gr
import math
import os

# API key (redacted in this copy), set both on the openai module and in the environment.
openai.api_key = "***"
os.environ["OPENAI_API_KEY"] = '***'

# Speaker markers. The backslash is escaped explicitly: "\A" and "\H" are not
# valid escape sequences and trigger a warning on current Python versions.
# The runtime values are unchanged (a literal backslash followed by the name).
# NOTE(review): "\nAI:" / "\nHuman:" may have been intended — confirm.
start_sequence = "\\AI:"
restart_sequence = "\\Human:"

## Fine-tuned model to call
#ft_model = 'ada:ft-personal-2023-05-30-08-57-22'
ft_model = 'ada:ft-personal-2023-06-20-07-48-30'

prompt = " "

##LABEL转换
def testc(a):
    if a == 0:
        label = 'Booking'
    elif a == 1:
        label = 'Cancellation'
    elif a == 2:
        label = 'Carer'
    elif a == 3:
        label = 'Login'
    elif a == 4:
        label = 'Minor'
    elif a == 5:
        label = 'Others'
    elif a == 6:
        label = 'Patient Particulars'
    elif a == 7:
        label = 'Payment'
    elif a == 8:
        label = 'Record'
    elif a == 9:
        label = 'Registration'
    elif a == 10:
        label = 'Rescheduling'
    else:
        label = 'Others'
    return label

## Get the predicted class and the TOP-3 class confidences
def generate_response(prompt):
    """Classify ``prompt`` with the fine-tuned model and rank the candidate classes.

    Requests one completion token together with the top-5 token
    log-probabilities, maps the generated token to a label with ``testc`` and
    turns the log-probabilities into a small confidence table.

    Args:
        prompt: Full prompt text sent to the model.

    Returns:
        tuple: ``(label, top3)``. ``label`` is the label of the generated
        class id. ``top3`` is a DataFrame with at most three rows, sorted by
        descending ``'Confidence'``, including the columns ``'Type_adj'``,
        ``'Confidence'`` and ``'LABEL'``. Both are ``None`` when the response
        cannot be parsed (for example, a token that is not an integer).

    Errors raised by the API call itself are not caught here.
    """
    completion = openai.Completion.create(
        model=ft_model,
        prompt=prompt,
        temperature=0,
        max_tokens=1,  # only the first generated token is requested
        #top_p=1,
        logprobs=5,
        frequency_penalty=0,
        presence_penalty=0,
    )
    #return completion['choices'][0]
    try:
        choice = completion['choices'][0]
        predicted_label = testc(int(choice.text))

        # Log-probabilities of the candidate tokens at the first position.
        token_logprobs = choice['logprobs']['top_logprobs'][0].to_dict()
        df_op = (
            pd.DataFrame.from_dict(token_logprobs, orient='index', columns=['Confidence_log'])
            .reset_index()
            .rename(columns={'index': 'Type'})
        )
        df_op['Confidence'] = df_op['Confidence_log'].apply(math.exp)
        # Tokens that differ only by surrounding whitespace convert to the
        # same integer, so the groupby below adds their probabilities together.
        df_op['Type_adj'] = df_op['Type'].astype('int')
        top3 = (
            df_op.groupby('Type_adj').sum().reset_index()
            .sort_values(by='Confidence', ascending=False)
            .iloc[:3, :]
        )
        top3['LABEL'] = top3['Type_adj'].apply(testc)
    except Exception:
        # Best effort: any parsing problem yields "no result". Unlike a bare
        # except, this lets KeyboardInterrupt and SystemExit propagate.
        return None, None
    return predicted_label, top3


## Main entry point
def my_chatbot(input, history):
    """Classify one user message and return the label with its confidence table.

    Args:
        input: Text entered by the user.
        history: Accepted for interface compatibility but not used. Every
            call is handled statelessly, so earlier turns never reach the
            prompt.

    Returns:
        tuple: The ``(label, top3)`` pair produced by ``generate_response``.
    """
    # Prompt = current message followed by the '###' separator block.
    my_input = input + '\n\n###\n\n'
    # str.replace returns a new string; the result used to be discarded, so
    # the substitution never took effect.
    # NOTE(review): ' two' -> '2' also removes the leading space — confirm
    # that this is the normalisation the model expects.
    my_input = my_input.replace(' two', '2')
    output1, output2 = generate_response(my_input)
    return output1, output2

In [None]:
## Do not run this cell routinely
## It is only needed the first time the local vector store is built
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import OpenAI,VectorDBQA
from langchain.document_loaders import DirectoryLoader
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders.sitemap import SitemapLoader
from langchain.document_loaders import UnstructuredPDFLoader
from getpass import getpass
import os
import openai
import gradio as gr
import math

# API key (redacted in this copy), set both on the openai module and in the environment.
openai.api_key = "***"
os.environ["OPENAI_API_KEY"] = '***'

# OpenAI embedding function used for the Chroma index built in this cell.
# NOTE(review): this rebinds `embeddings`, which an earlier cell assigns to the
# BGE model — confirm the two pipelines are not meant to share the name.
embeddings = OpenAIEmbeddings()

# Add a document source to the vector store
def add_documents(loader, instance):
    """Load documents, split them into chunks and add the chunks to a store.

    Args:
        loader: Object whose ``load()`` returns the documents to index.
        instance: Vector store whose ``add_documents()`` receives the chunks.
    """
    raw_docs = loader.load()
    # Chunks of up to 1000 characters with 100 characters of overlap, trying
    # the separators in the listed order.
    chunker = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
        separators=["\n\n", "\n", ".", ";", ",", " ", ""],
    )
    instance.add_documents(chunker.split_documents(raw_docs))


# Create the Chroma instance backed by a local persistence directory
instance = Chroma(embedding_function=embeddings, persist_directory="./DataProcessing/embedding_index")

# Add the local knowledge base (CSV file): HA-GO frequently asked questions
loader = TextLoader('./常見問題.csv')
add_documents(loader, instance)

# Persist the vector store to disk
instance.persist()

In [None]:
# from flask import Flask, request,make_response
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA, RetrievalQAWithSourcesChain
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate

embeddings = OpenAIEmbeddings()

# Open the persisted vector store for local queries
instance = Chroma(persist_directory="./DataProcessing/embedding_index", embedding_function=embeddings)

# Prompt template. In English it reads: "You are a hospital app assistant;
# study the [Question]s and matching [Answer]s in the supplied knowledge base,
# find the closest [Question] and output only its [Answer]. Output nothing
# else; if unsure, return "9999"."
# {summaries} and {question} are the two variables filled in by the chain.
tech_template = """你是一位医院APP助手，请根据提供的知识库学习【Question】以及对应的【Answer】，找出最相近的【Question】并只输出其对应的【Answer】

注意：只输出对应的【Answer】即可，不需要任何其他的内容，如果不确定，就请返回"9999"

{summaries}
Q: {question}
A: """


PROMPT = PromptTemplate(
    template=tech_template, input_variables=["summaries","question"]
)

# Retrieval chain: retrieved documents are "stuffed" into the prompt above and
# answered by gpt-3.5-turbo at temperature 0.
qa = RetrievalQAWithSourcesChain.from_chain_type(llm=ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0),
                                                chain_type="stuff",
                                                #retriever=instance.as_retriever(search_type="mmr", search_kwargs={"k":1}),
                                                retriever=instance.as_retriever(),
                                                chain_type_kwargs={"prompt": PROMPT},
                                                reduce_k_below_max_tokens=True
                                )


def getAnswer(index):
    """Look up knowledge-base rows for the value returned by the model.

    ``index`` is first read as an integer id and matched against the
    ``'Index'`` column of the module-level ``qa_answer`` table. When it is not
    an integer, it is treated as text and matched as a literal substring of
    the ``'Question'`` column.

    NOTE(review): ``qa_answer`` is not defined in the visible cells — confirm
    where it is loaded.

    Args:
        index: Integer id (or its string form), or question text.

    Returns:
        DataFrame with the ``'Question'`` and ``'Answer'`` columns of the
        matching rows; empty when nothing matches.
    """
    try:
        row_id = int(index)
    except (TypeError, ValueError):
        # Not a number: literal (non-regex) substring search, so characters
        # such as '?' or '(' in the text are not read as a pattern.
        # na=False keeps missing questions from breaking the boolean mask.
        mask = qa_answer['Question'].str.contains(index, regex=False, na=False)
    else:
        mask = qa_answer['Index'] == row_id
    return qa_answer.loc[mask, ['Question', 'Answer']]

def qa_hago(query):
    """Answer ``query`` from the knowledge base, or return a fixed apology.

    Runs the retrieval chain ``qa``, passes the chain's ``'answer'`` to
    ``getAnswer`` and formats the first matching row.

    Args:
        query: User question.

    Returns:
        str: ``'Most Relevant Question: …\\nAnswer: …'`` for the first matching
        row, or the fixed fallback message when the lookup fails or returns
        no rows.

    Errors raised by the chain call itself are not caught here.
    """
    chain_output = qa({"question": query}, return_only_outputs=True)
    model_answer = chain_output['answer']
    try:
        matches = getAnswer(model_answer)
        # values[0] raises IndexError when no row matched; that case is
        # handled by the fallback below.
        qs = 'Most Relevant Question: ' + str(matches['Question'].values[0]) + '\n'
        ans = 'Answer: ' + str(matches['Answer'].values[0])
        res = qs + ans
    except Exception:
        # Best effort: any lookup failure yields the apology text. Unlike a
        # bare except, this lets KeyboardInterrupt and SystemExit propagate.
        res = '我是一个基于大语言模型的医疗智能助手，抱歉这个问题在现有知识库中无法找到相关答案，请您联系人工服务，谢谢您，祝您快乐'
    return res

In [None]:
import gradio as gr
import pandas as pd

# Gradio UI for the complaint classifier: a text box, two buttons that show
# the predicted class and the TOP-3 confidences, and a bar-plot button.
blocks = gr.Blocks()

with blocks as demo: 
    gr.Markdown("""<h1><center>Complaints Classification for HA</center></h1>""")
    gr.Markdown("""<h3><center>Produced by Tencent Healthcare</center></h3>""")
    
    # Passed to the callbacks as the `history` argument of my_chatbot.
    state = gr.State()
    
    with gr.Row(scale=3,min_width=600):
        # Input box for the complaint text.
        txt = gr.Textbox(show_label=False, placeholder="请输入用户投诉内容，我可以帮你进行分类",lines=5).style(container=False)
    
    with gr.Row():
        btn = gr.Button("输出类别")  # shows the predicted class
        reverse_btn = gr.Button("输出置信度")  # shows the TOP-3 confidences
        
    output1 = gr.Textbox(label="判定类别：")
    output2 = gr.Textbox(value = '',label="TOP3类别置信度：",lines=4)
    
    def testa(txt,state):
        """Return only the first element (the label) of my_chatbot's result."""
        a,b = my_chatbot(txt,state)
        return a
    
    def testb(txt,state):
        """Return only the second element (the confidence table) of my_chatbot's result."""
        a,b = my_chatbot(txt,state)
        return b
    

    
    # Each handler calls my_chatbot on its own, so every click issues a new model request.
    btn.click(testa, inputs=[txt,state], outputs=output1)
    reverse_btn.click(testb, [txt,state], outputs=output2)
    #reverse_btn.click(my_chatbot, [txt], output2, _js="(s, v, o) => o + ' ' + v + ' ' + s")
    
    btn_plot = gr.Button("输出柱状图")  # draws the confidence bar plot
    #inp = inputs.Textbox()
    #df_op = testb(txt,state)
    # NOTE(review): txt.submit(...) registers an event handler, so df_op is
    # presumably the handle it returns rather than testb's DataFrame. Using it
    # as the BarPlot's initial value looks unintended — confirm against the
    # installed gradio version.
    df_op = txt.submit(testb, inputs=[txt, state], outputs=None)
    output3 = gr.BarPlot(value=df_op,x='LABEL',y='Confidence', vertical=False,title='类别置信度')
    btn_plot.click(testb, [txt,state], output3)
    #output4 = gr.Interface(None, None, None)
demo.launch()