# Environment Setup

**Google Colab Only**

# Reference Generation/Modification

### Import packages

In [2]:
# Route HTTP(S) traffic through a proxy. This is only needed where the OpenAI
# API is not directly reachable (e.g. for users in China); adjust or remove
# these two lines otherwise.
import os
os.environ["https_proxy"] = "100.64.0.2:11080"
os.environ["http_proxy"] = "100.64.0.2:11080"
import bs4
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.messages import HumanMessage
import json
# SECURITY: the API key must never be embedded in the notebook. It is taken
# from the OPENAI_API_KEY environment variable, and only when that is unset
# the user is prompted for it (the input is not echoed). Any key that has
# ever been committed to source control should be revoked.
if not os.environ.get("OPENAI_API_KEY"):
    import getpass
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")
# Switch to the project root so that the local `data_generator` package
# imported below and the relative `database/...` paths used later resolve.
os.chdir("/home/wangqi/Projects/LLM_Examinee")
from langchain.docstore.base import Docstore
from langchain.docstore.document import Document
from langchain.prompts import ( # type: ignore
    ChatPromptTemplate,
    FewShotChatMessagePromptTemplate,
)
import tqdm
from data_generator.prompt_templates import SRT_instruction,SRT_few_shot_input,SRT_few_shot_output

### Load SRT Documents

In [3]:
def get_data(file_path, mode):
    '''
    Extract knowledge entries and metadata from an iaq JSON file.

    Args:
        file_path: Path to a JSON file whose top level is an object; only
            the values of that object are read.
        mode: '教材' to read a textbook file, or '例题' to read a file of
            example questions.

    Returns:
        A ``(knowledge_list, meta_data)`` tuple.

        * mode '教材': ``knowledge_list`` is a list of strings of the form
          ``"<parent> : <title> : <paragraph>"``, one per paragraph found
          under any key of ``sub_block`` whose name contains "sub_block" or
          "mid_data". Items titled '经典例题详解' are skipped.
          ``meta_data`` holds 'source' (the file name) and, when the path
          contains '财务' or '法规', a matching 'class' entry.
        * mode '例题': ``knowledge_list`` is a list of dicts with the keys
          章节/题型/问题/答案/解析, built only from items that carry all of
          'instruction', 'input', 'output' and 'CoT'. ``meta_data`` holds
          'source' (the file name up to its first dot).

    Raises:
        ValueError: If ``mode`` is neither '教材' nor '例题'.
    '''
    if mode not in ('教材', '例题'):
        raise ValueError(f"Unsupported mode: {mode!r} (expected '教材' or '例题')")

    # The files contain Chinese text, so decode explicitly as UTF-8 instead
    # of relying on the platform's default encoding.
    with open(file_path, encoding='utf-8') as file:
        data = json.load(file)

    file_name = os.path.basename(file_path)

    if mode == '教材':
        knowledge_list = []
        for item in data.values():
            parent_data = item['parent']
            sub_block = item['sub_block']
            sub_title = sub_block['title']
            # Worked-example sections are not textbook knowledge; skip them.
            if sub_title == '经典例题详解':
                continue
            for key, blocks in sub_block.items():
                if "sub_block" in key or "mid_data" in key:
                    knowledge_list.extend(
                        f"{parent_data} : {sub_title} : {block['paragraph']}"
                        for block in blocks
                        if 'paragraph' in block
                    )
        meta_data = {'source': file_name}
        # The subject is inferred from the whole path, not just the file
        # name; 'class' stays unset when neither marker is present.
        if '财务' in file_path:
            meta_data['class'] = '财务'
        elif '法规' in file_path:
            meta_data['class'] = '法规'
        return knowledge_list, meta_data

    # mode == '例题'
    meta_data = {'source': file_name.split('.')[0]}
    required_fields = ('instruction', 'input', 'output', 'CoT')
    knowledge_list = [
        {
            "章节": meta_data['source'],
            "题型": item['instruction'],
            "问题": item['input'],
            "答案": item['output'],
            "解析": item['CoT'],
        }
        for item in data.values()
        if all(field in item for field in required_fields)
    ]
    return knowledge_list, meta_data

In [8]:
# Build the textbook knowledge base: every extracted passage becomes one
# LangChain Document, and all Documents are indexed in a FAISS vector store.
doc_folder = '/home/wangqi/Projects/LLM_Examinee/database/iaq_document/json_files/doc'
title_doc = []
knowledge_doc = []
for f_file in os.listdir(doc_folder):
    file_path = os.path.join(doc_folder, f_file)
    # Parse one textbook JSON file; the returned meta_data is not used here.
    knowledge_list, meta_data = get_data(file_path, mode='教材')
    # Wrap each passage string in a Document (page content only).
    knowledge_doc.extend(Document(page_content=text) for text in knowledge_list)
# Embed all passages with OpenAI embeddings and index them.
knowledge_db = FAISS.from_documents(
    knowledge_doc,
    embedding=OpenAIEmbeddings(model="text-embedding-3-small"),
)

In [9]:
# Collect the mock-exam questions from every JSON file in the folder.
question_folder = '/home/wangqi/Projects/LLM_Examinee/database/iaq_document/json_files/test/mock'
q_list = []
for f_file in os.listdir(question_folder):
    file_path = os.path.join(question_folder, f_file)
    # get_data returns (questions, meta_data); only the questions are kept.
    q_list += get_data(file_path, mode='例题')[0]

### Pointwise Rerank

In [22]:
def reference_evaluator(question, answer, k, vector_db):
    """Retrieve up to ``k`` references for a question and have an LLM rate each.

    Args:
        question: The question text; also used as the retrieval query.
        answer: The answer (with analysis) inserted into the rating prompt.
        k: Maximum number of retrieved references to rate. A value of zero
            or less yields an empty result.
        vector_db: A vector store exposing ``similarity_search(query, k=...)``
            whose results have a ``page_content`` attribute.

    Returns:
        A list with one dict per rated reference, in retrieval order:
        ``{"reference": <page content>, "DPR_rank": <0-based rank>,
        "rating": <raw LLM response text>}``. The rating is NOT parsed; it is
        whatever string the model returned.
    """
    if k <= 0:
        return []

    # Pass k explicitly: without it LangChain's similarity_search falls back
    # to its default number of hits (4), so asking for more than 4 references
    # would silently have no effect. The slice guards against stores that
    # return more than requested.
    doc_candidates = vector_db.similarity_search(question, k=k)[:k]

    # One client is enough for all candidates; it does not depend on the loop.
    llm = ChatOpenAI(model_name = 'gpt-4o-mini', temperature=0)

    res = []
    for rank, doc in enumerate(doc_candidates):
        # NOTE(review): the prompt announces a 0-5 scale but only defines
        # scores 1-5; left unchanged so existing ratings stay comparable.
        prompt = f"""
        Given the question, the answer with analysis, and the reference materials, please rate the reference materials based on their ability to support the solution to this question. The rating scale is from 0 to 5 points.

        Scoring Criteria for Reference Materials:
        - 1 point: The reference material is not related to the question.
        - 2 points: The reference material is somewhat related but does not provide substantial support.
        - 3 points: The reference material is related and provides some support for the answer.
        - 4 points: The reference material is highly relevant and provides significant support for the answer.
        - 5 points: The reference material is perfectly relevant and essential for understanding and solving the question.

        Your response must be formatted as JSON, with the  "ratings" as the keys.

        The question is as follows:
        {question}

        The answer and analysis are as follows:
        {answer}

        The reference materials are as follows:
        {doc.page_content}
        """
        response = llm.invoke([HumanMessage(content=prompt)]).content
        res.append({"reference": doc.page_content, "DPR_rank": rank, "rating": response})
    return res

### Rewrites Labelling

### Data Augmentation

In [10]:
def SRT_Data_Aug(question, answer):
    """Ask the LLM to explain a question's solution and list its key points.

    Args:
        question: The exam question text.
        answer: The answer (and analysis, if available) for the question.

    Returns:
        The model's reply as a plain string. The system prompt asks for a
        "问题解析" section followed by a numbered "考查要点" list, but the
        reply is returned unparsed.
    """
    SRT_DA_instruction = \
    "你学识渊博，请阅读这道保荐代表人考试试题，我提供了答案，首先请你一步一步地结合解析思考一下正确答案是如何得到的。\
    然后结合题目答案与解析复盘一下这道题目的考察要点。\n\
    输出格式请遵循:\
    问题解析:.....\n\
    考查要点:1. ......\n2. .....\n3. ......"
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", SRT_DA_instruction),
            ("human", "试题如下：{question}\n答案与解析如下：{answer}"),
        ]
    )
    llm = ChatOpenAI(model_name = 'gpt-4o', temperature=0)
    # Feed the input dict straight into the prompt. Mapping each key through
    # its own RunnablePassthrough would hand the WHOLE input dict to both
    # slots, so {question} and {answer} would each render as the repr of the
    # full dict instead of the individual texts.
    LI_chain = prompt | llm | StrOutputParser()
    return LI_chain.invoke({"question": question, "answer": answer})

In [None]:
# Annotate every loaded question with the LLM-generated analysis / key points.
output_data = []
for i, question_item in enumerate(tqdm.tqdm(q_list)):
    # NOTE(review): only the "答案" field is passed as the answer although the
    # prompt speaks of "答案与解析"; confirm whether "解析" should be included.
    annotation = SRT_Data_Aug(question_item["问题"], question_item["答案"])
    new_entry = {"问题": question_item["问题"], "考点": annotation}
    output_data.append(new_entry)
    # Print a sample every 50 questions as a progress / sanity check.
    if i % 50 == 0:
        print(new_entry)


# Write with 'w', not 'a': appending a second JSON array to an existing file
# yields a file that json.load can no longer parse.
with open("database/iaq_document/annotation/SRC_LI.json", 'w', encoding='utf-8') as f:
    json.dump(output_data, f, ensure_ascii=False, indent=4)

In [17]:
# Turn the annotations into instruction-tuning records: the question is the
# input and its annotated key points are the target output.
instruction = "用户的查询是一道保荐代表人试题，你需要重写这道试题以便模型更好地检索。重写的方式将此题改写成题目的考察意图。"
lt_with_ins = [
    {"instruction": instruction, "input": item["问题"], "output": item["考点"]}
    for item in output_data
]
# Write with 'w', not 'a': appending a second JSON array to an existing file
# yields a file that json.load can no longer parse.
with open("database/iaq_document/annotation/SRC_SFT.json", 'w', encoding='utf-8') as f:
    json.dump(lt_with_ins, f, ensure_ascii=False, indent=4)

## Filtering LI with knowledge points

In [None]:
def jaccard_similarity(entry_a, entry_b):
    """Return the Jaccard similarity of two strings' whitespace-separated tokens.

    Args:
        entry_a: First text.
        entry_b: Second text.

    Returns:
        ``|A ∩ B| / |A ∪ B|`` as a float in [0, 1], where A and B are the sets
        of tokens obtained with ``str.split()``. When neither string contains
        a token the result is 0.0 (no evidence of overlap) instead of a
        division-by-zero error.
    """
    # Tokenize on whitespace. Text written without spaces (e.g. unsegmented
    # Chinese) therefore counts as a single token.
    set_a = set(entry_a.split())
    set_b = set(entry_b.split())

    union = len(set_a | set_b)
    if union == 0:
        return 0.0
    return len(set_a & set_b) / union

def is_match_jaccard(entry_a, entry_b, threshold=0.3):
    """Return 1 if the Jaccard similarity of the two texts reaches ``threshold``, else 0."""
    reaches_threshold = jaccard_similarity(entry_a, entry_b) >= threshold
    return int(reaches_threshold)