  ![Simo.Chat](./image/SimonChat.png)

#### 导包

In [1]:
import os
import re
import time
import spacy
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv, find_dotenv

#### 获取环境变量

In [2]:
load_dotenv("info.env", override=True)
API_BASE = os.getenv("API_BASE")
API_KEY = os.getenv("API_KEY")

#### 创建LLM实例

In [3]:
client = OpenAI(api_key=API_KEY, base_url=API_BASE)

#### 功能函数定义

In [4]:
nlp = spacy.load('en_core_web_md')

In [5]:
CLIFF = pd.read_csv("./CLIFF/CLIFF_error_only.csv", encoding="iso-8859-1")

In [6]:
# 选择样例
def choose_sample(index):
    dataset = pd.read_csv("./CLIFF/CLIFF_error_only.csv", encoding="iso-8859-1")
    if index <= 226:
        text_doc = dataset["doc"][index-1]
        text_summ = dataset["summ"][index-1]
    
        return text_doc, text_summ

In [7]:
# 实体分析
def entity_analysis(doc, summ, nlp=nlp):
    list_D = []
    list_S = []
    doc_D = nlp(doc)
    doc_S = nlp(summ)
    
    for ent in doc_D.ents:
        list_D.append(ent.text+"\t"+"["+ent.label_+"]")
        
    for ent in doc_S.ents:
        list_S.append(ent.text+"\t"+"["+ent.label_+"]")
    
    ents_D = "\n".join(list_D)
    ents_S = "\n".join(list_S)
    
    entities_in_D = {ent.text for ent in doc_D.ents}
    unique_entities_in_S = {ent for ent in doc_S.ents if ent.text not in entities_in_D}
    ents_inconsist =  [ent.text for ent in unique_entities_in_S]
    
    return ents_D, ents_S, ents_inconsist

In [23]:
# 实体消歧
def entity_resolution(document, summary):
    doc_D = nlp(document)
    doc_S = nlp(summary)
    new_summary = summary
    ents_D = {ent.text for ent in doc_D.ents}
    unique_ents_S = {ent for ent in doc_S.ents if ent.text not in ents_D}
    
    for ent in unique_ents_S:
        print(ent.text, ent.label_)
        
        if ent.label_ == "DATE":
            new_summary = re.sub(ent.text, "(?)", new_summary)
            continue
        
        ents_candidate = [e.text for e in doc_D.ents if e.label_==ent.label_]
        # ents_candidate = list(set(ents_candidate))
    
        if len(ents_candidate) == 0:
            new_summary = re.sub(ent.text, "(?)", new_summary)
            continue
        
        max_similarity = -1
        most_similar_token = None
        doc_ent = nlp(ent.text)
        for token in ents_candidate:
            doc_token = nlp(token)
            similarity = doc_ent.similarity(doc_token)
            if similarity > max_similarity:
                max_similarity = similarity
                most_similar_token = token
        print(f"""The most similar word to '{ent.text}' is '{most_similar_token}' with a similarity of {max_similarity:.4f}
        """)
        if max_similarity >= 0.5:
            new_summary = re.sub(ent.text, most_similar_token, new_summary)
        else:
            new_summary = re.sub(ent.text, "(?)", new_summary)
        print(f"""Updated summary: {new_summary}""")
    
    return new_summary

In [9]:
# 实体修改
def entity_alignment(doc, summ, ents_inconsist):
    
    prompt = f"""
        
        The content of the summary is inconsistent with the document. 
        We have identified words in the summary that do not match the document in Inconsistency.
        
        Document: {doc}
        Summary: {summ}
        Inconsistency: {ents_inconsist}
        
        You need to remove these inconsistent words from Summary according to Document directly.
        Only output the revised summary.
        
    """
    
    model = "yi-34b-chat-0205"
    messages = [
        {"role": "system", "content": "You are now an summary evaluation assistant."}, 
        {"role": "user", "content": f"{prompt}"}
    ]
    
    completion = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0.8
    )
    
    return completion.choices[0].message.content

In [10]:
# 事实检测 Version1
def factuality_detection(doc, summ, ents_inconsist):
    
    prompt = f"""
        
        Please analyze whether the content in the summary is consistent with the content in the document.
        
        Document: {doc}
        Summary: {summ}
        
    """
    
    model = "yi-34b-chat-0205"
    messages = [
        {"role": "system", "content": "You are now an summary evaluation assistant."}, 
        {"role": "user", "content": f"{prompt}"}
    ]
    
    completion = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0.8
    )
    
    return completion.choices[0].message.content

In [11]:
# 事实检测 Version2
def factuality_detection(doc, summ, ents_inconsist):
    
    prompt = f"""
        根据文档[Document: {doc}]
        请你检查并修正以下摘要中的错误部分：
        [Summary: {summ}]，
        已知的错误包括[{ents_inconsist}]。
        请给出修正后的摘要:
    """
    
    model = "yi-34b-chat-0205"
    messages = [
        {"role": "system", "content": "You are now an summary evaluation assistant."}, 
        {"role": "user", "content": f"{prompt}"}
    ]
    
    completion = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0.8
    )
    
    return completion.choices[0].message.content

In [12]:
# 摘要修复
def summary_revise(doc, summ):
    
    prompt = f"""
        Based on the content in the Document, complete the fill-in-the-blank(?) in the Summary with the information from the Document.
        Document: {doc}
        Summary: {summ}
        Output the Summary with the blanks filled in:
    """
    
    model = "yi-34b-chat-0205"
    messages = [
        {"role": "system", "content": "You are now an summary evaluation assistant."}, 
        {"role": "user", "content": f"{prompt}"}
    ]
    
    completion = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0.8
    )
    
    return completion.choices[0].message.content

In [13]:
# 格式处理
import json
    
def deep_replace_question_mark(obj):
    if isinstance(obj, dict):
        return {k: deep_replace_question_mark(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [deep_replace_question_mark(item) for item in obj]
    else:
        return '?'

In [14]:
# 通用信息抽取

def schema_UIE (text, masked_schema):
    
    prompt = f"""
        Please extract key information from the provided source document to fill in the placeholders of schema represented by '?'s.
        If it is difficult to fill in appropriate content, it can be left blank.
        source document:
        {text}
        schema:
        {masked_schema}
        Only output the result.
    """
    
    completion = client.chat.completions.create(
            model="yi-34b-chat-0205",
            messages=[{"role": "user", "content": f"{prompt}"}]
            )
    return completion.choices[0].message.content


def uie(doc, summ):
    prompt = """
        Please extract triples from the text entirely and output them in JSON format, 
        pay special attention to extracting information about numbers, places, times and people:
        
        EXAMPLE
        18-year-old Bob enjoys eating apples and bananas, is skilled at playing basketball and badminton, and his girlfriend is Alice.
        OUTPUT
        { "Bob": { 
            "age": "18-year-old",
            "enjoys eating": ["apples", "bananas"], 
            "is skilled at": ["basketball", "badminton"], 
            "girlfriend": "Alice" } 
        }
    """
    completion = client.chat.completions.create(
            model="yi-34b-chat-0205",
            messages=[{"role": "user", "content": f"{prompt}\nEXAMPLE: {summ}\nOUTPUT:"}],
            temperature=0
    )
    
    uie_summ = json.loads(completion.choices[0].message.content)
    uie_schema = deep_replace_question_mark(uie_summ)
    uie_doc = schema_UIE(doc, uie_schema)
    
    return uie_doc, uie_summ

In [15]:
# Chatbot交互部分
def add_text(history, text):
    history = history + [(text, None)]
    return history, gr.Textbox(value="", interactive=False)

def bot(history):
    messages = [{"role": "user", "content": history[-1][0]}]
    completion = client.chat.completions.create(
        model="yi-34b-chat-0205",
        messages=messages,
        stream=True,
        temperature=0
    )

    history[-1][1] = ""
    for chunk in completion:
        for choice in chunk.choices:
            content = choice.delta.content
            if content:
                history[-1][1] += content
                time.sleep(0.02)
                yield history

In [16]:
# 修改提示
def generate_prompt(uie_doc, uie_summ):
    prompt = f"""
        Answer the question in Chinese:
        Please output the different part between the following two JSON structures?
        DOCUMENT.JSON
        {uie_doc}
        SUMMARY.JSON
        {uie_summ}
    """
    
    return prompt   

In [17]:
# 修改提示
def generate_prompt(doc, uie_doc, uie_summ):
    prompt = f"""
        下面两个json信息是从文档中提取的关键信息，试比较哪一个与文档不一致？
        文档：
        {doc}
        DOCUMENT.JSON
        {uie_doc}
        SUMMARY.JSON
        {uie_summ}
    """
    
    return prompt  

#### SimonChat UI

In [25]:
import gradio as gr

theme = gr.themes.Default(neutral_hue="blue")

with gr.Blocks(theme=theme) as demo:
    
    gr.Markdown("# 🤖Simon.Chat")
    gr.Markdown("## 🤔Document-Summary Pair")
    with gr.Row("Document-Summary Pair"):
        text_doc = gr.Textbox(
            label="Document",
            placeholder="输入待检测的文档"
        )
        text_summ = gr.Textbox(
            label="Summary",
            placeholder="输入待检测的摘要"
        )
        
    with gr.Accordion("Open and choose a group of samples", open=False):
        sample_slider = gr.Slider(
            minimum=1,
            maximum=226,
            value=1,
            step=1,
            interactive=True,
            label="Sample Index",
        )
        sample_btn = gr.Button("Confirm")  
        sample_btn.click(fn=choose_sample, inputs=[sample_slider], outputs=[text_doc, text_summ])
        
    gr.Markdown("## 🤔Entity Analysis")
    with gr.Row("Entity Analysis"):
        ents_doc = gr.Textbox(label="Entities in Document")
        ents_summ = gr.Textbox(label="Entities in Summary")
    ents_inconsist = gr.Textbox(label="Inconsistent entities between Document and Summary")
    ea_btn = gr.Button("Entity Analysis")
    
    with gr.Tab("Revise 1"):
        rv1_summ = gr.Textbox(label="Revise 1")
        rv1_btn = gr.Button("Revise 1")  
    with gr.Tab("Revise 2"):
        rv2_summ = gr.Textbox(label="Revise 2")
        rv2_btn = gr.Button("Revise 2")  
    
    gr.Markdown("## 🤔UIE Analysis")
    with gr.Row("UIE Analysis"):
        uie_doc = gr.Textbox(label="Structure in Document")
        uie_summ = gr.Textbox(label="Structure in Summary")
    uie_btn = gr.Button("UIE Analysis")
    
    gr.Markdown("## 🤔Real-time Q&A")
    chatbot = gr.Chatbot(
        [],
        elem_id="chatbot",
        bubble_full_width=False,
        avatar_images=("./image/UserIcon.jpg", "./image/SimonIcon.png"),
        height=500
    )
    txt = gr.Textbox(
        scale=4,
        show_label=False,
        placeholder="你想问我什么问题？",
        container=False,
        label="Simon",
    )
    with gr.Accordion("Open and generate a prompt for revise", open=False):
        prompt_btn = gr.Button("Generate")
    
    ea_btn.click(fn=entity_analysis, inputs=[text_doc, text_summ], outputs=[ents_doc, ents_summ, ents_inconsist])
    # uie_btn.click(fn=uie, inputs=[text_doc, text_summ], outputs=[uie_doc, uie_summ])
    # rv1_btn.click(fn=entity_alignment, inputs=[text_doc, text_summ, ents_inconsist], outputs=[rv1_summ])
    rv1_btn.click(fn=entity_resolution, inputs=[text_doc, text_summ], outputs=[rv1_summ])
    # rv2_btn.click(fn=factuality_detection, inputs=[text_doc, rv1_summ, ents_inconsist], outputs=[rv2_summ])
    rv2_btn.click(fn=summary_revise, inputs=[text_doc, rv1_summ], outputs=[rv2_summ])
    uie_btn.click(fn=uie, inputs=[text_doc, rv2_summ], outputs=[uie_doc, uie_summ])
    prompt_btn.click(fn=generate_prompt, inputs=[text_doc, uie_doc, uie_summ], outputs=[txt])
    
    commit_btn = gr.Button("Chat",scale=2)
    commit_btn.click(fn=add_text, inputs=[chatbot, txt], outputs=[chatbot, txt]).then(
        bot, chatbot, chatbot, api_name="bot_response"
    ).then(lambda: gr.Textbox(interactive=True), None, [txt], queue=False)
    txt_msg = txt.submit(add_text, [chatbot, txt], [chatbot, txt], queue=True).then(
        bot, chatbot, chatbot, api_name="bot_response"
    )
    txt_msg.then(lambda: gr.Textbox(interactive=True), None, [txt], queue=False)
    
    clear = gr.ClearButton(
        components=[chatbot, txt], value="Clear All"
    )
    
demo.launch()

Running on local URL:  http://127.0.0.1:7864

To create a public link, set `share=True` in `launch()`.




96 CARDINAL
The most similar word to '96' is '68' with a similarity of 0.8412
        
Updated summary: Steven Davies hit 68 as Surrey beat Yorkshire by 24 runs to reach the One-Day Cup final at Lord's.
24 CARDINAL
The most similar word to '24' is '13' with a similarity of 0.7637
        
Updated summary: Steven Davies hit 68 as Surrey beat Yorkshire by 13 runs to reach the One-Day Cup final at Lord's.
One-Day Cup DATE
Steven Davies PERSON
The most similar word to 'Steven Davies' is 'Adam Lyth' with a similarity of 0.8039
        
Updated summary: Adam Lyth hit 68 as Surrey beat Yorkshire by 13 runs to reach the (?) final at Lord's.
73-year-old DATE
Aberdeenshire GPE
The most similar word to 'Aberdeenshire' is 'Portsoy' with a similarity of 0.3739
        
Updated summary: A (?) woman has appeared in court accused of causing a four-vehicle crash in (?).
four CARDINAL
The most similar word to 'four' is 'three' with a similarity of 0.9791
        
Updated summary: A (?) woman has appeare