# Environment Setup

**Google Colab Only**

# Reference Generation/Modification

### Import packages

In [4]:
# Users in China may need to configure an HTTP/HTTPS proxy before running this notebook
import os
import bs4
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.messages import HumanMessage
import json
from langchain.docstore.base import Docstore
from langchain.docstore.document import Document
from langchain.prompts import ( # type: ignore
    ChatPromptTemplate,
    FewShotChatMessagePromptTemplate,
)
import tqdm

USER_AGENT environment variable not set, consider setting it to identify your requests.


### Rewrites Labeling

#### SRCQA Rewrites Labeling

In [1]:
def get_data(file_path, mode):
    """Load one IAQ JSON file and return ``(knowledge_list, meta_data)``.

    Args:
        file_path: Path to the JSON file. The part after the last ``'/'`` is
            used as the file name for the metadata.
        mode: ``'教材'`` (textbook) or ``'例题'`` (example questions).

    Returns:
        A 2-tuple ``(knowledge_list, meta_data)``.

        * ``'教材'``: ``knowledge_list`` holds strings of the form
          ``"<parent> : <sub_block title> : <paragraph>"``; ``meta_data``
          holds ``source`` (the file name) and, when the path contains
          ``'财务'`` or ``'法规'``, a ``class`` entry with that value.
        * ``'例题'``: ``knowledge_list`` holds dicts with the keys
          ``章节``/``题型``/``问题``/``答案``/``解析``; ``meta_data`` holds
          ``source`` (the file name up to its first dot).

    Raises:
        ValueError: If ``mode`` is neither ``'教材'`` nor ``'例题'``.
    """
    if mode not in ('教材', '例题'):
        # Previously an unknown mode fell through and returned None, which
        # only surfaced later as an opaque unpacking error in the caller.
        raise ValueError(f"Unsupported mode: {mode!r} (expected '教材' or '例题')")

    # The files contain Chinese text; an explicit encoding keeps the load
    # independent of the platform's default locale.
    with open(file_path, encoding='utf-8') as file:
        data = json.load(file)
    file_name = file_path.split('/')[-1]

    if mode == '教材':
        knowledge_list = []
        for item in data.values():
            parent_data = item['parent']
            sub_block = item['sub_block']
            sub_title = sub_block['title']
            # The worked-examples section is skipped in textbook mode.
            if sub_title == '经典例题详解':
                continue
            for key, blocks in sub_block.items():
                if "sub_block" in key or "mid_data" in key:
                    # Only dict entries that carry a 'paragraph' contribute text.
                    knowledge_list.extend(
                        f"{parent_data} : {sub_title} : {block['paragraph']}"
                        for block in blocks
                        if isinstance(block, dict) and 'paragraph' in block
                    )
        meta_data = {'source': file_name}
        if '财务' in file_path:
            meta_data['class'] = '财务'
        elif '法规' in file_path:
            meta_data['class'] = '法规'
        return knowledge_list, meta_data

    # mode == '例题'
    meta_data = {'source': file_name.split('.')[0]}
    required_keys = ('instruction', 'input', 'output', 'CoT')
    # Entries missing any of the required fields are dropped.
    knowledge_list = [
        {
            "章节": meta_data['source'],
            "题型": item['instruction'],
            "问题": item['input'],
            "答案": item['output'],
            "解析": item['CoT'],
        }
        for item in data.values()
        if all(key in item for key in required_keys)
    ]
    return knowledge_list, meta_data

In [None]:
# Build the textbook knowledge base: every string extracted by get_data
# becomes one LangChain Document, and all documents are indexed in FAISS
# with OpenAI "text-embedding-3-small" embeddings.
doc_folder = '/home/wangqi/Projects/LLM_Examinee/data/iaq_document/json_files/doc'
title_doc = []
knowledge_doc = []
# os.listdir returns entries in arbitrary order; sorting keeps the order in
# which documents are added to the index reproducible between runs.
for f_file in sorted(os.listdir(doc_folder)):
    file_path = os.path.join(doc_folder, f_file)
    # load the json file
    knowledge_list, meta_data = get_data(file_path, mode = '教材')
    # Construct the LangChain documents. The file-level metadata returned by
    # get_data used to be discarded; it is now attached to each document
    # (copied, so documents do not share one mutable dict).
    knowledge_doc.extend(
        Document(page_content=knowledge, metadata=dict(meta_data))
        for knowledge in knowledge_list
    )
knowledge_db = FAISS.from_documents(knowledge_doc, embedding=OpenAIEmbeddings(model="text-embedding-3-small"))

In [None]:
# Collect the example questions of every file in the mock-test folder.
question_folder = '/home/wangqi/Projects/LLM_Examinee/data/iaq_document/json_files/test/mock'
q_list = []
for f_file in os.listdir(question_folder):
    file_path = os.path.join(question_folder, f_file)
    # Only the question list is needed here; the metadata is ignored.
    questions, _ = get_data(file_path, mode='例题')
    q_list += questions

In [2]:
def SRT_Data_Aug(question, answer):
    """Ask gpt-4o to analyse an exam question and list its key points.

    The system prompt (in Chinese) asks the model to reason step by step
    about how the supplied answer is obtained and then to summarise what the
    question examines, using the output layout ``问题解析:`` followed by
    ``考查要点:``.

    Args:
        question: Text of the exam question, substituted for ``{question}``.
        answer: Answer text, substituted for ``{answer}``.

    Returns:
        The model reply as a string (via ``StrOutputParser``).
    """
    SRT_DA_instruction = \
    "你学识渊博，请阅读这道保荐代表人考试试题，我提供了答案，首先请你一步一步地结合解析思考一下正确答案是如何得到的。\
    然后结合题目答案与解析复盘一下这道题目的考察要点。\n\
    输出格式请遵循:\
    问题解析:.....\n\
    考查要点:1. ......\n2. .....\n3. ......"
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", SRT_DA_instruction),
            ("human", "试题如下：{question}\n答案与解析如下：{answer}"),
        ]
    )
    llm = ChatOpenAI(model_name = 'gpt-4o', temperature=0)
    # The prompt takes the input dict directly. The previous
    # {"question": RunnablePassthrough(), "answer": RunnablePassthrough()}
    # step handed the *whole* input dict to both keys, so each placeholder
    # was rendered as the repr of the full dict instead of its own field.
    LI_chain = prompt | llm | StrOutputParser()
    return LI_chain.invoke({"question": question, "answer": answer})

In [None]:
# Annotate every loaded question with SRT_Data_Aug and store the results as
# instruction-tuning records (instruction / input / output).
# A local alias is used for the progress bar because other cells in this
# notebook run `from tqdm import tqdm`, which rebinds the name `tqdm` to the
# function and would make `tqdm.tqdm(...)` fail afterwards.
from tqdm import tqdm as progress_bar

output_data = []
for i, item in enumerate(progress_bar(q_list)):
    annotation = SRT_Data_Aug(item["问题"], item["答案"])
    new_entry = {"问题": item["问题"], "考点": annotation}
    output_data.append(new_entry)
    # Print a sample every 50 questions to monitor the annotation quality.
    if i % 50 == 0:
        print(new_entry)
instruction = "用户的查询是一道保荐代表人试题，你需要重写这道试题以便模型更好地检索。重写的方式将此题改写成题目的考察意图。"
lt_with_ins = [
    {"instruction": instruction, "input": entry["问题"], "output": entry["考点"]}
    for entry in output_data
]
# Write with 'w', not 'a': appending a second JSON array to an existing file
# produces a document that json.load can no longer parse.
# NOTE(review): this path is relative, while the post-processing cell reads
# an absolute path under r-r-rewriter/data/SRCQA — confirm the working directory.
with open("data/SRCQA/SRC_SFT.json", 'w', encoding='utf-8') as f:
    json.dump(lt_with_ins, f, ensure_ascii=False, indent=4)

In [15]:
def keep_after_substring(s, marker='考查要点:'):
    """Return the part of ``s`` that follows the first occurrence of ``marker``.

    Args:
        s: Text to search.
        marker: Substring that introduces the part to keep. Defaults to the
            Chinese "key points" label used by this cell.

    Returns:
        Everything after the first ``marker``, or ``''`` when ``marker`` does
        not occur in ``s``.
    """
    # partition() splits only at the first marker, so text after a repeated
    # marker is kept; split(marker)[1] used to cut the result off there.
    _, found, tail = s.partition(marker)
    return tail if found else ''  # empty string when the marker is absent
# Post-process the SRCQA records: reduce each 'output' field to the text
# returned by keep_after_substring and save the result to a separate file.
with open("/home/wangqi/Projects/r-r-rewriter/data/SRCQA/SRC_SFT.json", 'r', encoding='utf-8') as f:
    SFT_json = json.load(f)
for record in SFT_json:
    record['output'] = keep_after_substring(record['output'])
with open("/home/wangqi/Projects/r-r-rewriter/data/SRCQA/SRC_SFT_P.json", 'w', encoding='utf-8') as f:
    json.dump(SFT_json, f, ensure_ascii=False, indent=4)

#### SyllabusQA/FintextQA Rewrites Labeling

In [12]:
def QA_Data_Aug(question, answer):
    """Ask gpt-4o to analyse a question/answer pair and list its key points.

    The system prompt asks the model to reason step by step about how the
    supplied answer is obtained and then to review the key points, using the
    output layout ``Problem Analysis:`` followed by
    ``Key Points of Examination:``.

    Args:
        question: Question text, substituted for ``{question}``.
        answer: Answer text, substituted for ``{answer}``.

    Returns:
        The model reply as a string (via ``StrOutputParser``).
    """
    DA_instruction = \
    "You are knowledgeable. Please read this question. I have provided the answer. First, please think about how the correct answer is obtained step by step in combination with the analysis. \
    Then, review the key points of this question in combination with the answer and analysis.\n\
    Please follow the output format:\
    Problem Analysis:.....\n\
    Key Points of Examination: 1. ......\n2. .....\n3. ......"
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", DA_instruction),
            ("human", "The question is: {question}\nThe answer is: {answer}"),
        ]
    )
    llm = ChatOpenAI(model_name = 'gpt-4o', temperature=0)
    # The prompt takes the input dict directly. The previous
    # {"question": RunnablePassthrough(), "answer": RunnablePassthrough()}
    # step handed the *whole* input dict to both keys, so each placeholder
    # was rendered as the repr of the full dict instead of its own field.
    LI_chain = prompt | llm | StrOutputParser()
    return LI_chain.invoke({"question": question, "answer": answer})

def keep_after_substring(s, marker='Key Points of Examination:'):
    """Return the part of ``s`` that follows the first occurrence of ``marker``.

    Args:
        s: Text to search.
        marker: Substring that introduces the part to keep. Defaults to the
            English "key points" label used by this cell.

    Returns:
        Everything after the first ``marker``, or ``''`` when ``marker`` does
        not occur in ``s``.
    """
    # partition() splits only at the first marker, so text after a repeated
    # marker is kept; split(marker)[1] used to cut the result off there.
    _, found, tail = s.partition(marker)
    return tail if found else ''  # empty string when the marker is absent

SyllabusQA Rewrites Labeling

In [4]:
# Set the HTTP and HTTPS proxy environment variables to the same endpoint.
os.environ.update({
    'http_proxy': 'http://100.64.0.2:11080',
    'https_proxy': 'http://100.64.0.2:11080',
})

In [None]:
# Annotate the SyllabusQA training split with QA_Data_Aug and store the
# results as instruction-tuning records (instruction / input / output).
import pandas as pd
from  tqdm import tqdm
exam_fn = '/home/wangqi/Projects/r-r-rewriter/data/SyllabusQA/data/dataset_split/train.csv'
df = pd.read_csv(exam_fn)
output_data = []
# Start at row 0: read_csv already consumes the header line, so starting at
# 1 skipped the first data row.
for i in tqdm(range(len(df))):
    sub_df = df.iloc[i]
    query = sub_df['question']
    ground_truth = sub_df['answer']
    annotation = QA_Data_Aug(query, ground_truth)
    new_entry = {"question": query, "rewrites": annotation}
    output_data.append(new_entry)
    # Print a sample every 50 rows to monitor the annotation quality.
    if i % 50 == 0:
        print(new_entry)
instruction = "The user's query is a question about Course Syllabi. You need to rewrite the user's query for better retrieval by the model. In the rewritten query, the user's intention of inquiry should be clarified."
lt_with_ins = [
    {"instruction": instruction, "input": item["question"], "output": item["rewrites"]}
    for item in output_data
]
# Write with 'w', not 'a': appending a second JSON array to an existing file
# produces a document that json.load can no longer parse.
with open("/home/wangqi/Projects/r-r-rewriter/data/SyllabusQA/Syllabus_SFT.json", 'w', encoding='utf-8') as f:
    json.dump(lt_with_ins, f, ensure_ascii=False, indent=4)

In [None]:
# Post-process the SyllabusQA records: reduce each 'output' field to the text
# returned by keep_after_substring and save the result to a separate file.
with open("/home/wangqi/Projects/r-r-rewriter/data/SyllabusQA/Syllabus_SFT.json", 'r', encoding='utf-8') as f:
    SFT_json = json.load(f)
for record in SFT_json:
    record['output'] = keep_after_substring(record['output'])
with open("/home/wangqi/Projects/r-r-rewriter/data/SyllabusQA/Syllabus_SFT_P.json", 'w', encoding='utf-8') as f:
    json.dump(SFT_json, f, ensure_ascii=False, indent=4)

FintextQA Rewrites Labeling

In [None]:
# Annotate every question/answer pair of the FintextQA training file with
# QA_Data_Aug and store the results as instruction-tuning records.
from tqdm import tqdm

file_path = '/home/wangqi/Projects/r-r-rewriter/data/FintextQA/fin_dataset_train.json'

# Explicit encoding keeps the load independent of the platform default.
with open(file_path, 'r', encoding='utf-8') as file:
    content = json.load(file)
output_data = []
for item in tqdm(content):
    # Skip items without 'questions' or whose 'questions' is not a list.
    if 'questions' not in item or not isinstance(item['questions'], list):
        continue
    for sub_item in item['questions']:
        question, answer = sub_item.get('question', ''), sub_item.get('answer', '')
        # Pairs with an empty question or answer are not annotated.
        if not (question and answer):
            continue
        annotation = QA_Data_Aug(question, answer)
        output_data.append({"question": question, "answer": answer, "rewrites": annotation})

instruction = "The user's query is a question about finance. You need to rewrite the user's query for better retrieval by the model. In the rewritten query, the user's intention of inquiry should be clarified."
lt_with_ins = [
    {"instruction": instruction, "input": item["question"], "output": item["rewrites"]}
    for item in output_data
]
# Write with 'w', not 'a': appending a second JSON array to an existing file
# produces a document that json.load can no longer parse.
with open("/home/wangqi/Projects/r-r-rewriter/data/FintextQA/Fintext_SFT.json", 'w', encoding='utf-8') as f:
    json.dump(lt_with_ins, f, ensure_ascii=False, indent=4)

In [13]:
# Post-process the FintextQA records: reduce each 'output' field to the text
# returned by keep_after_substring and save the result to a separate file.
# NOTE(review): this reads data/Fintext_SFT.json, whereas the labeling cell in
# this notebook writes data/FintextQA/Fintext_SFT.json — confirm the file was
# moved on purpose or align the two paths.
with open("/home/wangqi/Projects/r-r-rewriter/data/Fintext_SFT.json", 'r', encoding='utf-8') as f:
    SFT_json = json.load(f)
for record in SFT_json:
    record['output'] = keep_after_substring(record['output'])
with open("/home/wangqi/Projects/r-r-rewriter/data/Fintext_SFT_P.json", 'w', encoding='utf-8') as f:
    json.dump(SFT_json, f, ensure_ascii=False, indent=4)