In [1]:
import json
import pandas as pd

In [2]:
def jsonl_to_dataframe(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                # Try loading the JSON object from each line
                data.append(pd.json.loads(line.strip()))  # strip to remove any leading/trailing whitespace
            except ValueError as e:
                print(f"Error parsing line: {e}")
                continue  # skip malformed lines
    return pd.DataFrame(data)

In [3]:
data_set1 = pd.read_csv("parsed_set1_chain_gpt4omini.csv")
data_set2 = pd.read_csv("parsed_set2_chain_gpt4omini.csv")
data_set3 = pd.read_csv("parsed_set3_chain_gpt4omini.csv")
data_set4 = pd.read_csv("parsed_set4_chain_gpt4omini.csv")

In [4]:
import ast
spotlight = pd.concat([data_set1, data_set2, data_set3, data_set4], ignore_index=True)
for idx in range(len(spotlight)):
    spotlight["doc"][idx] = ast.literal_eval(spotlight["doc"][idx])

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  spotlight["doc"][idx] = ast.literal_eval(spotlight["doc"][idx])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
 

In [5]:
with open('retrieved_idx_set1_chain.txt', 'r', encoding='utf-8') as f:
    retrieved_idx_set1 = json.load(f)

with open('retrieved_idx_set2_chain.txt', 'r', encoding='utf-8') as f:
    retrieved_idx_set2 = json.load(f)

with open('retrieved_idx_set3_chain.txt', 'r', encoding='utf-8') as f:
    retrieved_idx_set3 = json.load(f)

with open('retrieved_idx_set4_chain.txt', 'r', encoding='utf-8') as f:
    retrieved_idx_set4 = json.load(f)

In [6]:
retrieved_idx = []

for item in retrieved_idx_set1:
    retrieved_idx.append(item)
for item in retrieved_idx_set2:
    retrieved_idx.append(item)
for item in retrieved_idx_set3:
    retrieved_idx.append(item)
for item in retrieved_idx_set4:
    retrieved_idx.append(item)

In [7]:
data = retrieved_idx

In [8]:
def process_data(data):
    new_dict = {}
    accum = 0
    
    for item in data:
        for key in item.keys():
            if key not in new_dict:
                accum += 1
                new_dict[key] = accum * 100
    
    for item in data:
        for key, values in item.items():
            base_value = new_dict[key]
            item[key] = [base_value + i for i in values]
    
    return new_dict, data

In [9]:
new_dict, data = process_data(data)

In [10]:
updated_data = {}
inverted_chunk_name = {}
for idx in range(len(data)):
    tmp_dict = data[idx]
    updated_data[idx] = []
    for key in tmp_dict.keys():
        tmp_chunk = tmp_dict[key]
        for item in tmp_chunk:
            inverted_chunk_name[item] = key
            updated_data[idx].append(item)


# Now construct the new dataframe with
* Key | Attribute | Contraint | Chunk List | Level Set | Raw Question | Answer

In [14]:
from tqdm import tqdm
from utils import *

struct_info_dict = {}
new_spotlight = {}
new_spotlight["key"] = []
new_spotlight["attribute"] = []
new_spotlight["constraint"] = []
new_spotlight["chunk_list"] = []
new_spotlight["type"] = []
new_spotlight["set"] = []
new_spotlight["question"] = []
new_spotlight["answer"] = []
new_spotlight["doc"] = []

for idx in tqdm(range(len(spotlight))):
    if spotlight["type"][idx] == "paper":
        new_spotlight["key"].append(spotlight["question"][idx])
        new_spotlight["attribute"].append("cite/reference")
        new_spotlight["constraint"].append("graph")
        new_spotlight["chunk_list"].append(updated_data[idx])
        new_spotlight["set"].append(spotlight["set"][idx])
        new_spotlight["type"].append(spotlight["type"][idx])
        new_spotlight["question"].append(spotlight["instruction"][idx])
        new_spotlight["answer"].append(spotlight["answer"][idx])
        new_spotlight["doc"].append(spotlight["doc"][idx])

    if spotlight["type"][idx] != "paper":
        parsed = spotlight["parsed"][idx]
        type_word, merge_meta = parse_input(parsed)
        if type_word not in struct_info_dict:
            struct_info_dict[type_word] = {}
        struct_info_dict[type_word][idx] = merge_meta

        key = merge_meta[0][0]
        if key.endswith("Inc."):
            key = key[:-5]
        if key.endswith("标题"):
            key = key[:-2]
        attribute = merge_meta[0][1]
        constraint = merge_meta[1]
        if constraint and "conditional selection: " in constraint:
            constraint = constraint[23:]
        new_spotlight["key"].append(key)
        new_spotlight["attribute"].append(attribute)
        new_spotlight["constraint"].append(constraint)
        new_spotlight["chunk_list"].append(updated_data[idx])
        new_spotlight["set"].append(spotlight["set"][idx])
        new_spotlight["type"].append(spotlight["type"][idx])
        new_spotlight["question"].append(spotlight["question"][idx])
        new_spotlight["answer"].append(spotlight["answer"][idx])
        new_spotlight["doc"].append(spotlight["doc"][idx])


100%|██████████| 409/409 [00:00<00:00, 24271.97it/s]


In [15]:
df_new_spotlight = pd.DataFrame(new_spotlight)

In [16]:
with open('skylined_legal_chunks.json', 'r', encoding='utf-8') as f:
    legal_divided_chunks_raw = json.load(f)

with open('skylined_legal_chunks2.json', 'r', encoding='utf-8') as f:
    legal_divided_chunks_raw2 = json.load(f)

with open('financial_docs_divided_chunks.json', 'r', encoding='utf-8') as f:
    financial_divided_chunks = json.load(f)

with open('skylined_paper_dict.json', 'r', encoding='utf-8') as f:
    paper_divided_chunks = json.load(f)

In [17]:
with open("chunks_answer_chain.json", 'r', encoding='utf-8') as f:
    answer_4_selected_chunk = json.load(f)

# Now use the answer and other chunks to answer all questions

* For each query in dataset (spotlight), obtain its corresponding chunk list and for each chunk
    * if the chunk is visited (determined by checking if the key and chunk_idx exists in answer_4_selected_chunk), retrieve the answer in it for LLMs;
    * else, retrieve the chunk raw text and pass to LLMs
    * LLMs will generate based on the above answer & raw chunk texts


In [None]:
import requests

def get_gpt_response(query):
    """发送最终提问，并获取 GPT 的回复"""
    conversation_history.append({"role": "user", "content": query})  # 加入提问
    
    data = {
        "model": "gpt-3.5-turbo",
        "messages": conversation_history,  # 传递完整的对话历史
        "temperature": 0
    }
    
    try:
        response = requests.post(url, headers=headers, data=json.dumps(data))
        response_json = response.json()
        
        if "choices" in response_json:
            answer = response_json["choices"][0]["message"]["content"]
            conversation_history.append({"role": "assistant", "content": answer})
            # print("> Answer: ", answer)
            return answer
        else:
            print("⚠️ API 响应异常:", response_json)
            return "API_ERROR"
    
    except requests.exceptions.RequestException as e:
        print(f"⚠️ 请求异常: {e}")
        return "REQUEST_ERROR"
    except json.JSONDecodeError:
        print("⚠️ JSON 解析错误")
        return "JSON_ERROR"



url = ""
headers = { 
    "Content-Type": "application/json", 
    "Authorization": ""
}

# 存储对话历史
conversation_history = [
    {"role": "system", "content": "你是一个有帮助的助手。"}  # 设定初始角色
]

def add_context(text):
    """向 GPT 添加多段长文本作为上下文"""
    conversation_history.append({"role": "user", "content": text})

In [19]:
import tiktoken

def count_tokens(data, encoding_name="cl100k_base"):
    """
    计算字典中所有 token 的数量，使用 OpenAI 的 tiktoken 进行计算。
    - data: dict，可能包含嵌套结构的字典
    - encoding_name: str，指定编码方式（默认 "cl100k_base"）
    """
    encoding = tiktoken.get_encoding(encoding_name)
    
    if isinstance(data, dict):
        return sum(count_tokens(v, encoding_name) for v in data.values())
    elif isinstance(data, list):
        return sum(count_tokens(v, encoding_name) for v in data)
    elif isinstance(data, str):
        return len(encoding.encode(data))  # 使用 tiktoken 计算 token 数量
    else:
        return 0

In [25]:
df_new_spotlight_set1 = df_new_spotlight[df_new_spotlight["set"] == 2].reset_index(False)

In [None]:
from tqdm import tqdm
import requests
import sys
import json
import re
def extract_text_or_preserve(text):
    # 使用正则表达式提取 ' ' 中的内容
    matches = re.findall(r"'([^']*)'", text)
    return matches if matches else [text]

# with open('gpt4omini_answer_chain_set3.json', 'r', encoding='utf-8') as f:
#     final_answer = json.load(f)

final_answer = []

for query_idx in tqdm(range(len(final_answer), len(df_new_spotlight_set1))):
    conversation_history.clear()  # 清空对话历史
    question = df_new_spotlight_set1["question"][query_idx]
    key = df_new_spotlight_set1["key"][query_idx]
    attribute = df_new_spotlight_set1["attribute"][query_idx]
    constraint = df_new_spotlight_set1["constraint"][query_idx]
    chunk_lst = df_new_spotlight_set1["chunk_list"][query_idx]
    type = df_new_spotlight_set1["type"][query_idx]
    corresponding_chunks = {} 
    legal_divided_chunks = legal_divided_chunks_raw
    for chunk_key in chunk_lst:
        doc_name = inverted_chunk_name[chunk_key]
        chunk_idx_in_doc = chunk_key - new_dict[doc_name]
        if type == "paper":
            paper_name = paper_divided_chunks[doc_name]["file_name"]
            corresponding_chunks[paper_name] = paper_divided_chunks[doc_name]["content"]
        if doc_name in answer_4_selected_chunk:
            if type == "legal":
                if attribute != "判决结果":
                    legal_divided_chunks = legal_divided_chunks_raw
                    if "案由" in answer_4_selected_chunk[doc_name]:
                        corresponding_chunks[doc_name] = "(" + doc_name + ", 案由): " + answer_4_selected_chunk[doc_name]["案由"]
                    else:
                        corresponding_chunks[doc_name] = legal_divided_chunks[doc_name]
                else:
                    legal_divided_chunks = legal_divided_chunks_raw2
                    if "判决结果" in answer_4_selected_chunk[doc_name]:
                        corresponding_chunks[doc_name] = "(" + doc_name + ", 判决结果): " + answer_4_selected_chunk[doc_name]["判决结果"]
                    else:
                        corresponding_chunks[doc_name] = legal_divided_chunks[doc_name]
        

            if type == "financial":
                if doc_name not in corresponding_chunks:
                    corresponding_chunks[doc_name] = {}
                financial_doc_text = financial_divided_chunks[doc_name]
                chunk_name = list(financial_doc_text.keys())[chunk_idx_in_doc]
                chunk_idx_in_doc = str(chunk_idx_in_doc)
                if (chunk_idx_in_doc in answer_4_selected_chunk[doc_name]) and (attribute in answer_4_selected_chunk[doc_name][chunk_idx_in_doc]) and (answer_4_selected_chunk[doc_name][chunk_idx_in_doc][attribute] != "void"):
                    corresponding_chunks[doc_name][chunk_name] = "(" + doc_name + ", " + attribute + "): " + answer_4_selected_chunk[doc_name][chunk_idx_in_doc][attribute]
                    if key in doc_name:
                        break
                else:
                    corresponding_chunks[doc_name][chunk_name] = financial_divided_chunks[doc_name][chunk_name]
        else:
            if type == "legal":
                corresponding_chunks[doc_name] = legal_divided_chunks[doc_name]
            if type == "financial":
                if doc_name not in corresponding_chunks:
                    corresponding_chunks[doc_name] = {}
                financial_doc_text = financial_divided_chunks[doc_name]
                chunk_name = list(financial_doc_text.keys())[chunk_idx_in_doc]
                corresponding_chunks[doc_name][chunk_name] = financial_divided_chunks[doc_name][chunk_name]
    instruction = "We will provide a question and a list of textual chunks the question is related to, some is short, some is very long."
    add_context(instruction)
    
    if type == "legal":
        for key in corresponding_chunks:
            chunk_str = corresponding_chunks[key]
            add_context(f"Key: {key}; Content: {chunk_str}")
            # constraint = extract_text_or_preserve(constraint)[0]
        record_response = get_gpt_response(f"{question} Do NOT give any other explanations")
    
    elif type == "paper":
        record_response = ''
        response_dict = dict()
        token_count = count_tokens(corresponding_chunks)
        if token_count > 120000:
            for paper_name in corresponding_chunks:
                ######if the inputs is too long, use this
                conversation_history.clear()
                ######if the inputs is too long, use this

                chunks = corresponding_chunks[paper_name]
                add_context("***********" + paper_name + "***********\n\n" + chunks)
                response_dict[paper_name] = get_gpt_response(f"{question} You only need to consider references/citations of the provided paper names as {str(list(corresponding_chunks.keys()))}. The target paper is {key}. Find whether the paper names in the paper list is in target paper's references and find whether the target paper is cited in other papers in the paper list. Do NOT give any explanations")
            response_str = json.dumps(response_dict, ensure_ascii=False)
            print(response_str)
            record_response = get_gpt_response(f"{question} You only need to consider references/citations of the provided paper names as {str(list(corresponding_chunks.keys()))}. The target paper is {key}. Find whether the paper names in the paper list is in target paper's references and find whether the target paper is cited in other papers in the paper list. Do NOT give any explanations")

        else:
            for paper_name in corresponding_chunks:
                chunks = corresponding_chunks[paper_name]
                add_context("***********" + paper_name + "***********\n\n" + chunks)
            record_response = get_gpt_response(f"{question} You only need to consider references/citations of the provided paper names as {str(list(corresponding_chunks.keys()))}. The target paper is {key}. Find whether the paper names in the paper list is in target paper's references and find whether the target paper is cited in other papers in the paper list. Do NOT give any explanations")
    elif type == "financial":
        record_response = ''
        response_dict = dict()
        token_count = count_tokens(corresponding_chunks)
        print(token_count)
        if token_count > 120000:
            for doc_n_record in corresponding_chunks:
                ######if the inputs is too long, use this
                conversation_history.clear()
                ######if the inputs is too long, use this
                chunks = corresponding_chunks[doc_n_record]
                token_count = count_tokens(chunks)
                if token_count > 120000:
                    for key in chunks:
                        conversation_history.clear()
                        add_context("doc_name: " + doc_n_record + "\n\n")
                        chunk_str = corresponding_chunks[doc_n_record][key]
                        add_context(chunk_str)
                        response_dict[doc_n_record] = dict()
                        response_dict[doc_n_record][key] = get_gpt_response(f"What is ({key}, {attribute})? Do NOT give any explanations.")
                else:
                    add_context("doc_name: " + doc_n_record + "\n\n")
                    for key in chunks:
                        chunk_str = corresponding_chunks[doc_n_record][key]
                        add_context(chunk_str)
                        
                    response_dict[doc_n_record] = get_gpt_response(f"What is ({key}, {attribute})? Do NOT give any explanations.")
                response_str = json.dumps(response_dict, ensure_ascii=False)
            ######if the inputs is too long, use this


            record_response = get_gpt_response(f"{response_str} Please answer the question according to the provided contents {question}. Do NOT give any explanations")
        else:
            for doc_n_record in corresponding_chunks:
                chunks = corresponding_chunks[doc_n_record]
                add_context(doc_n_record + ": ")
        
                for key in chunks:
                    chunk_str = corresponding_chunks[doc_n_record][key]
                    add_context(chunk_str)
            record_response = get_gpt_response(f"Please answer the question according to the provided contents {question}. Do NOT give any explanations")

    if record_response in ["API_ERROR", "REQUEST_ERROR", "JSON_ERROR"]:
        print(f"⚠️ 发生错误，保存当前进度 ({query_idx})...")
        with open('gpt4omini_answer_chain_set2.json', 'w', encoding='utf-8') as f:
            json.dump(final_answer, f, ensure_ascii=False, indent=4)
        sys.exit()
    else:
        final_answer.append(record_response)
        print(record_response)

with open('gpt4omini_answer_chain_set2.json', 'w', encoding='utf-8') as f:
    json.dump(final_answer, f, ensure_ascii=False, indent=4)

  1%|          | 1/143 [00:02<04:46,  2.02s/it]

["Self-Rewarding Language Models", "KTO: Model Alignment as Prospect Theoretic Optimization", "DI Orca-Math: Unlocking the potential of SLMs in Grade School Math"]


  1%|▏         | 2/143 [00:04<04:44,  2.02s/it]

["Blind Quality Assessment for in-the-Wild Images via Hierarchical Feature Fusion and Iterative Mixed Database Training", "Generalized Portrait Quality Assessment", "PICNIQ: Pairwise Comparisons for Natural Image Quality Assessment", "Dual-Branch Network for Portrait Image Quality Assessment"]
