In [2]:
## Feature-engineering experiments
import os
from os.path import join
from tqdm import tqdm
from collections import defaultdict as dd
from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz
import numpy as np
from tqdm import trange
from sklearn.metrics import classification_report, precision_recall_fscore_support, average_precision_score
import logging

import utils
import settings

# Emit timestamped INFO-level log records for the rest of the notebook.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

data_dir = settings.DATA_TRACE_DIR

# The three dataset splits; judging by the file names only the training split
# carries answers ("ans" vs "wo_ans").
papers_train = utils.load_json('', "paper_source_trace_train_ans.json")
papers_valid = utils.load_json('', "paper_source_trace_valid_wo_ans.json")
papers_test = utils.load_json('', "paper_source_trace_test_wo_ans.json")


# NOTE: data_dir is re-pointed at the working directory here, so the value read
# from settings above is not used for locating the XML files.
data_dir = './'
in_dir = join(data_dir, 'paper-xml')

# Names of every *.xml file found directly inside in_dir.
files = [entry for entry in os.listdir(in_dir) if entry.endswith('.xml')]

import re
from collections import Counter
import collections
import json


def clean_text(text):
    """Reduce *text* to lowercase ASCII letters separated by single spaces.

    Every run of characters other than ``a-z`` / ``A-Z`` is collapsed into one
    space, the result is lowercased, and surrounding whitespace is removed.
    """
    letters_only = re.sub(r'[^a-zA-Z]+', ' ', text)
    lowered = letters_only.lower()
    return lowered.strip()

import re
def get_pattern(input_string):
    """Return the first single line that is enclosed by blank lines.

    The pattern looks for text sitting between two ``\\n\\n`` separators; since
    ``.`` does not match a newline, the captured text never spans lines.
    The stripped capture is returned, or ``""`` when nothing matches.
    """
    found = re.search(r'\n\n(.*?)\n\n', input_string)
    return found.group(1).strip() if found else ""
    

## test
# Paper ids of each split, in the same order as the loaded records.
id_train = [record['_id'] for record in papers_train]
id_valid = [record['_id'] for record in papers_valid]
id_test = [record['_id'] for record in papers_test]



2024-06-12 21:47:36,504 loading paper_source_trace_train_ans.json ...
2024-06-12 21:47:36,534 paper_source_trace_train_ans.json loaded
2024-06-12 21:47:36,535 loading paper_source_trace_valid_wo_ans.json ...
2024-06-12 21:47:36,543 paper_source_trace_valid_wo_ans.json loaded
2024-06-12 21:47:36,544 loading paper_source_trace_test_wo_ans.json ...
2024-06-12 21:47:36,553 paper_source_trace_test_wo_ans.json loaded


In [3]:
def read_file(file, in_dir):
    '''
    Read one paper XML file and parse it.

    Args:
        file: file name, relative to in_dir.
        in_dir: directory that contains the file.

    Returns:
        (xml, bs): the raw file content as a string and the BeautifulSoup
        document built from it with the "xml" parser.
    '''
    file_path = join(in_dir, file)
    # The context manager closes the handle; this function is called once per
    # paper, so leaving handles open would leak one descriptor per file.
    with open(file_path, encoding='utf-8') as f:
        xml = f.read()
    bs = BeautifulSoup(xml, "xml")
    return xml, bs

def get_ref_list(bs):
    '''
    Collect the cleaned title of every bibliography entry of a paper.

    Args:
        bs: parsed paper XML (BeautifulSoup document).

    Returns:
        dict mapping the ``xml:id`` of each <biblStruct> (None when the
        attribute is missing) to its title after clean_text().
    '''
    this_ref_list = {}
    for ref in bs.find_all("biblStruct"):
        key = ref.get('xml:id')

        # Look for <title level="a"> first, then <title level="m">.
        # find() returns None when there is no such tag, so test for that
        # explicitly instead of catching the resulting AttributeError.
        title_tag = ref.find('title', level="a")
        if title_tag is None:
            title_tag = ref.find('title', level="m")

        if title_tag is not None:
            title = title_tag.get_text()
        else:
            # No usable <title>: take the first line of the entry's plain text
            # that is enclosed by blank lines ("" when there is none).
            title = get_pattern(ref.getText())

        this_ref_list[key] = clean_text(title)
    return this_ref_list


def get_body_ref_count(body):
    '''
    Count how often each bibliography entry is cited inside *body*.

    Args:
        body: parsed element whose ``<ref type="bibr">`` descendants are scanned.

    Returns:
        collections.Counter keyed by the ``target`` attribute with "#"
        characters stripped from both ends; refs without a ``target`` are
        ignored.
    '''
    cited_ids = (
        ref['target'].strip("#")
        for ref in body.find_all('ref', type='bibr')
        if ref.has_attr('target')
    )
    return collections.Counter(cited_ids)


def get_bs_title(bs):
    '''
    Return the paper's main title after clean_text().

    The title is the text of the first tag named "title" whose ``type``
    attribute equals "main".
    '''
    def _is_main_title(tag):
        return tag.name == "title" and tag.get('type') == 'main'

    main_title = bs.find(_is_main_title)
    return clean_text(main_title.text)


    ##--------------------- authors_info_dict --------------------------
def get_author_info(bs):
    '''
    Extract name and affiliation data for the authors found in *bs*.

    Only <author> tags that have an <affiliation> child as well as both a
    <forename> and a <surname> are kept; all others are skipped.

    Returns:
        list of dicts with keys "forename", "surname", "country" ("" when no
        <country> tag exists) and "org_names" (texts of every
        <orgName type="institution"> under the author).
    '''
    authors_info = []

    for author in bs.find_all('author'):
        if not author.affiliation:
            continue

        # Attribute access on a tag yields None when the child is missing.
        # Checking for that replaces the former blanket `except: pass`, which
        # also hid unrelated errors.
        forename_tag = author.forename
        surname_tag = author.surname
        if forename_tag is None or surname_tag is None:
            continue

        country_tag = author.find('country')
        country = country_tag.get_text(strip=True) if country_tag is not None else ""

        org_names = author.find_all('orgName', attrs={"type": "institution"})
        authors_info.append({
            "forename": forename_tag.get_text(strip=True),
            "surname": surname_tag.get_text(strip=True),
            "country": country,
            "org_names": [org_name.get_text(strip=True) for org_name in org_names]
        })

    return authors_info

def find_bib_context(xml, bs, dist=100):
    '''
    Collect the raw-XML context around every in-text citation.

    For each element of type "bibr" that has a ``target``, the markup
    ``<ref type="bibr" target="...">text</ref>`` is rebuilt and every
    occurrence of that exact string is located in *xml*.

    Args:
        xml: raw XML text of the paper.
        bs: parsed document for the same paper.
        dist: number of characters kept before and after the start of each
            occurrence.

    Returns:
        defaultdict(list) mapping the bibliography id (``target`` without its
        first character) to the list of context snippets, with newlines and
        carriage returns replaced by spaces and the ends stripped.
    '''
    bib_to_context = dd(list)
    bibr_strs_to_bid_id = {}
    for item in bs.find_all(type='bibr'):
        if "target" not in item.attrs:
            continue
        bib_id = item.attrs["target"][1:]
        item_str = "<ref type=\"bibr\" target=\"{}\">{}</ref>".format(item.attrs["target"], item.get_text())
        bibr_strs_to_bid_id[item_str] = bib_id

    for item_str, bib_id in bibr_strs_to_bid_id.items():
        # str.find walks the document at C speed; advancing by one character
        # keeps the "every start position" semantics of the former
        # per-character startswith scan.
        pos = xml.find(item_str)
        while pos != -1:
            # Clamp the window start: a negative start index would be read
            # relative to the end of the string and yield a wrong/empty slice.
            start = max(0, pos - dist)
            snippet = xml[start: pos + dist]
            bib_to_context[bib_id].append(snippet.replace("\n", " ").replace("\r", " ").strip())
            pos = xml.find(item_str, pos + 1)
    return bib_to_context

def  get_bib_context(bs, xml):
    '''
    Return find_bib_context(xml, bs) for the given document.

    Thin wrapper that takes its arguments in (bs, xml) order and keeps the
    default context width of find_bib_context.
    '''
    return find_bib_context(xml, bs)

def find_section_number(header):
    '''
    Work out the section number of a <head> element.

    Resolution order:
      1. the element's ``n`` attribute, returned unchanged when non-empty;
      2. otherwise the heading text, checked section by section: a Roman
         numeral followed by "." (I. to IX.) or, for the first three sections,
         the keywords "introduction" / "related" / "method" (case-insensitive).

    The checks run from section 1 upwards and the first hit wins, so a keyword
    of a lower section takes precedence over a higher Roman numeral.

    Returns:
        the ``n`` attribute value, a string such as '1.', or None when the
        header is missing/empty or nothing matches.
    '''
    if not header:
        return None

    section_number = header.get('n')
    if section_number:
        return section_number

    text = header.text
    if not text:
        return None

    # Collect whole Roman-numeral tokens only. A plain substring test such as
    # 'I.' in text also fires for "II.", "VI." or "XIII.", which mapped those
    # headings to section 1. The lookbehind rejects numerals glued to a letter.
    numerals = set(re.findall(r'(?<![A-Za-z])(VIII|VII|VI|IV|IX|III|II|I|V)\.', text))
    lowered = text.lower()

    if 'I' in numerals or 'introduction' in lowered:
        return '1.'
    if 'II' in numerals or 'related' in lowered:
        return '2.'
    if 'III' in numerals or 'method' in lowered:
        return '3.'

    for numeral, number in (('IV', '4.'), ('V', '5.'), ('VI', '6.'),
                            ('VII', '7.'), ('VIII', '8.'), ('IX', '9.')):
        if numeral in numerals:
            return number
    return None

def get_section_bib_number(bs):
    '''
    Count citations per bibliography entry separately for sections 1 to 6.

    Every <div> that contains a <head> is visited in document order. When
    find_section_number() yields a number for the head, its integer part
    becomes the current section; divs whose head yields no number are
    attributed to the section seen last. Citations are
    ``<ref type="bibr">`` tags with a ``target``; the key is the target with
    "#" stripped.

    Returns:
        tuple of six collections.Counter objects, for sections 1..6 in order.
        Citations falling outside sections 1..6 are not counted.
    '''
    # counters[i] belongs to section i + 1.
    counters = [collections.Counter() for _ in range(6)]
    now_section_index = 0

    for section in bs.find_all("div"):
        header = section.find("head")
        if not header:
            continue

        section_number = find_section_number(header)
        if section_number:
            try:
                now_section_index = int(section_number.split('.')[0])
            except ValueError:
                # Non-numeric numbering (e.g. "A.1"): not one of the tracked
                # sections, so stop attributing citations until the next
                # numbered heading.
                now_section_index = 0

        if not 1 <= now_section_index <= 6:
            continue

        counter = counters[now_section_index - 1]
        for reference in section.find_all('ref', type='bibr'):
            if reference.has_attr('target'):
                counter[reference['target'].strip("#")] += 1

    # Return only after every section was visited; returning from inside the
    # loop stopped after the first headed div and produced None when no div
    # had a head.
    return tuple(counters)
    

## Features parsed from the source paper

# Lowercase romanised surnames; the helpers below count a surname as Chinese
# when its lowercased form is exactly one of these entries.
# NOTE(review): many entries appear twice (the second and fourth rows repeat
# most of the first and second); duplicates do not change the `in` checks.
# NOTE(review): 'hang', 'ang', 'ong' and 'ie' look like name endings rather
# than full surnames, but they are only ever compared as whole strings --
# confirm whether suffix matching was intended.
Chinese_surname_list = ['li', 'wang', 'zhang', 'liu', 'chen', 'yang', 'huang', 'zhao', 'wu', 'zhou', 'xiong', 'xiu', 'sun', 'ma', 'zhu', 'hu', 'guo', 'he', 'gao', 'lin', 'luo', 'zheng', 'qian', 'zhen', 'tong', 'zeng', 'zen', 'zhuang',
                        'liang', 'xie', 'song', 'tang', 'dong', 'yuan', 'cai', 'feng', 'xiao', 'jiang', 'shi', 'hang', 'ang', 'ong', 'ie', 
                        'xu', 'sun', 'ma', 'zhu', 'hu', 'guo', 'he', 'gao', 'lin', 'luo', 'zheng', 'qian', 'zhen', 'tong', 'zeng', 'zen', 'zhuang',
                        'liang', 'xie', 'song', 'tang', 'dong', 'yuan', 'cai', 'feng', 'xiao', 'jiang', 'shi', 'hang', 'ang', 'ong', 'ie', 'chua',
                        'tian', 'jia', 'pan', 'du', 'dai', 'wei', 'yu', 'bai', 'han', 'gu', 'yao', 'kuang', 'shuang','qi','mei']

def get_num_author_source(paper):
    '''
    Return the number of authors listed in ``paper['authors']``.

    Each author record must provide a 'name' key.
    '''
    author_names = []
    for author in paper['authors']:
        author_names.append(author['name'])
    return len(author_names)

def get_num_chinese_author_source(paper):
    '''
    Count the authors of *paper* whose surname is in Chinese_surname_list.

    The surname is taken to be the last space-separated token of the author's
    'name' and is compared in lowercase.
    '''
    chinese_count = 0
    for author in paper['authors']:
        last_token = author['name'].split(' ')[-1]
        if last_token.lower() in Chinese_surname_list:
            chinese_count += 1
    return chinese_count


## Reference (bibliography) information

def get_reference_info(bs):
    '''
    Summarise every bibliography entry (<biblStruct>) of a paper.

    Returns:
        dict mapping each entry's ``xml:id`` to a dict with
          'num_author'         - number of <surname> tags in the entry,
          'num_chinese_author' - how many of those surnames (lowercased) are
                                 in Chinese_surname_list,
          'journal'            - text of the first <title level="m">, else of
                                 the first <title level="j">, else "".
    '''
    this_paper_list = {}
    for ref in bs.find_all("biblStruct"):
        key = ref.get('xml:id')
        surname_list = [surname.get_text() for surname in ref.find_all("surname")]

        # find() returns None when no such title exists; test for it directly
        # rather than relying on a blanket except around get_text().
        journal_tag = ref.find("title", {"level": "m"}) or ref.find("title", {"level": "j"})
        journal = journal_tag.get_text() if journal_tag is not None else ""

        this_paper_list[key] = {
            'num_author': len(surname_list),
            'num_chinese_author': len([x for x in surname_list if x.lower() in Chinese_surname_list]),
            'journal': journal,
        }
    return this_paper_list

    
def extract_text(bs, max_sections=10):
    '''
    Extract the plain text of the first <div> sections, with citations
    rewritten as bracketed ids.

    Every ``<ref type="bibr">`` whose ``target`` contains "#b<digits>" is
    replaced in the tree by the string "[b<digits>]" before the text is read.

    NOTE: this modifies *bs* in place; the replaced <ref> tags are gone from
    the tree afterwards.

    Args:
        bs: parsed element to read from.
        max_sections: maximum number of <div> elements to process
            (defaults to 10).

    Returns:
        list with one text string per processed <div>.
    '''
    total_text = []
    for section in bs.find_all('div', limit=max_sections):
        for ref in section.find_all('ref', type='bibr'):
            target = ref.get('target')
            # Refs without a target cannot be numbered; leave them untouched
            # (previously this case was hidden by a blanket except).
            if target is None:
                continue
            ref_num = re.search(r'#b(\d+)', target)
            if ref_num:
                ref.replace_with('[b{}]'.format(ref_num.group(1)))

        total_text.append(section.get_text(separator=' ', strip=True))
    return total_text


### First pass: clean the data

In [4]:
import random
import re
from typing import List, Union

class PaperParser:
    '''
    Wrapper around one parsed paper XML that indexes its citations.

    On construction it locates the <body>, collects the in-text
    ``<ref type="bibr">`` tags and the <biblStruct> bibliography entries,
    records first author and year per bibliography entry in ``ref_info``, and
    splits the in-text citations into those with and without a ``target``.

    ``paper_id`` and ``title`` are initialised to None and are not filled in
    by this class.
    '''

    def __init__(self, bs, xml):
        self.paper_id = None
        self.title = None
        self.body = None
        self.bs = bs
        self.xml = xml
        # xml:id of a bibliography entry -> {'first_author', 'ref_years'}
        self.ref_info = {}
        self.preprocess()
        self.extract_ref_author_info()
        self.check_reference()

    def check_reference(self):
        '''
        Split the in-text citations by whether they carry a ``target``.

        Stores the citation texts in ``numbered_references_texts`` (with
        target) and ``unnumbered_references_texts`` (without), plus their
        counts in ``n_ref_valid`` / ``n_ref_invalid``.
        '''
        self.unnumbered_references_texts = [
            ref.get_text() for ref in self.references if not ref.has_attr('target')
        ]
        self.numbered_references_texts = [
            ref.get_text() for ref in self.references if ref.has_attr('target')
        ]
        self.n_ref_valid = len(self.numbered_references_texts)
        self.n_ref_invalid = len(self.unnumbered_references_texts)

    def extract_references_from_text_digit(self, text) -> Union[str, None]:
        '''
        Turn a numeric citation such as "[3]" or "[3; 7]" into a bibliography id.

        Only the first number of the first bracketed group is used; citation
        number k maps to id 'b<k-1>'.

        Returns:
            the id string, or None when *text* has no such group or the number
            is below 1 (which would give a negative id).
        '''
        pattern = re.compile(r'\[\s*(\d+(?:\s*;\s*\d+)*)\s*\]')
        match = pattern.search(text)
        if match is None:
            return None

        first_number = int(match.group(1).split(';')[0].strip())
        if first_number < 1:
            return None
        return 'b' + str(first_number - 1)

    @staticmethod
    def _extract_ref_year(ref):
        '''
        Best-effort publication year of one bibliography entry.

        Sources are tried in order until one yields a value:
          1. the ``when`` attribute of <date type="published">;
          2. a <biblScope unit="page"> whose text is an integer in 1900..2100;
          3. a standalone four-digit number in the first <title>.
        Returns None when none of them applies.
        '''
        date_tag = ref.find('date', {'type': 'published'})
        if date_tag is not None:
            when = date_tag.get('when')
            if when:
                return when

        for tag in ref.find_all('biblScope'):
            if tag.get('unit') != 'page':
                continue
            try:
                year = int(tag.text)
            except ValueError:
                continue
            if 1900 <= year <= 2100:
                return tag.text

        title_tag = ref.find('title')
        if title_tag:
            match = re.search(r'\b(\d{4})\b', title_tag.text)
            if match:
                return match.group(1)

        return None

    def extract_ref_author_info(self):
        '''
        Fill ``ref_info`` with first author and year of every bibliography entry.

        The key is the entry's ``xml:id``; 'first_author' is the text of the
        first <surname> (None when there is none) and 'ref_years' the value
        from _extract_ref_year(), cut to its first four characters when it
        contains a "-".
        '''
        for ref in self.references_bib:
            key = ref.get('xml:id')

            surnames = ref.find_all("surname")
            first_author = surnames[0].get_text() if surnames else None

            # Each fallback now runs whenever the previous source produced no
            # value; before, later sources were only reached on an exception,
            # which the second extractor never raised.
            ref_years = self._extract_ref_year(ref)
            if ref_years and '-' in ref_years:
                ref_years = ref_years[:4]

            self.ref_info[key] = {'first_author': first_author, 'ref_years': ref_years}

    def infer_ref_number_from_text(self, text):
        '''
        Guess which bibliography entry a citation text refers to.

        Returns an id such as 'b1', or None when nothing can be inferred.

        Strategy, in order:
          1. exactly one entry whose first author occurs in the text
             (case-insensitive substring) -> that entry;
          2. among author matches, those whose year also occurs in the text:
             a single one wins; with several, a character following the year
             ("2019b") selects the a/b/c/d-th of them, falling back to the
             first;
          3. with no author+year match, a numeric citation like "[3]";
          4. otherwise the first author-only match, if any.
        '''
        if text is None:
            return None

        # Four digits immediately followed by a word character, e.g. "2019b".
        regex_with_letter = re.compile(r'\d{4}\w')
        lowered = text.lower()

        # Entries whose first author is mentioned in the text.
        candidates = [
            (key, value)
            for key, value in self.ref_info.items()
            if key is not None
            and value['first_author'] is not None
            and value['first_author'].lower() in lowered
        ]
        author_name_match = [key for key, _ in candidates]

        if len(author_name_match) == 1:
            return author_name_match[0]

        # Narrow down by publication year.
        good_match = [
            (key, value)
            for key, value in candidates
            if value['ref_years'] is not None and value['ref_years'] in text
        ]

        if len(good_match) == 1:
            # Return the matched entry itself; the loop variable used here
            # before pointed at whatever entry happened to be iterated last.
            return good_match[0][0]

        if len(good_match) > 1:
            suffix = regex_with_letter.search(text)
            if suffix:
                # 'a' -> first match, 'b' -> second, ...; anything else or an
                # out-of-range letter falls back to the first match.
                index = 'abcd'.find(suffix.group()[-1])
                if 0 <= index < len(good_match):
                    return good_match[index][0]
                return good_match[0][0]
        else:
            # No author+year match: the citation may be purely numeric.
            digit_ref = self.extract_references_from_text_digit(text)
            if digit_ref:
                return digit_ref

        if author_name_match:
            return author_name_match[0]

        return None

    def replace_refs_with_inferred_numbers(self, body):
        '''
        Give every target-less citation in *body* an inferred ``target``.

        For each ``<ref type="bibr">`` without a ``target`` the id is inferred
        from the citation text; when one is found, the tag gets
        ``target="#<id>"`` and its text becomes "<old text> (<id>)".

        *body* is modified in place and stored as both ``body_raw`` and
        ``body_processed`` (the same object).
        '''
        self.body_raw = body

        for ref in body.find_all('ref', type="bibr"):
            if ref.has_attr('target'):
                continue
            inferred_ref_number = self.infer_ref_number_from_text(ref.text)
            if inferred_ref_number is not None:
                ref['target'] = '#' + inferred_ref_number
                ref.string = "{} ({})".format(ref.text, inferred_ref_number)

        self.body_processed = body

    def preprocess(self):
        '''
        Locate the body, the in-text citations and the bibliography entries.
        '''
        self.body = self.bs.find('body')
        self.references = self.body.find_all('ref', type='bibr')
        self.references_bib = self.bs.find_all("biblStruct")

    def __repr__(self):
        return f'The paper with id {self.paper_id}, title {self.title}'

In [5]:
# Containers for the extracted results, each keyed by paper id unless noted.
submission_ref_list = {}
submission_ref_year_list = {}
body_ref_count = {}
pid_to_title = {}
title_to_pid = {}
authors_info_dict = {}
bib_to_contexts_dict = {}
total_author_data_sup = {}

# Citation counts for sections 1..6 of each paper.
body_ref_count_1 = {}
body_ref_count_2 = {}
body_ref_count_3 = {}
body_ref_count_4 = {}
body_ref_count_5 = {}
body_ref_count_6 = {}

other_featuers = {}

# Every paper id of the train, validation and test splits.
total_id = id_train + id_valid + id_test

# paper id -> PaperParser whose body has inferred citation targets filled in.
papers_list = {}

for file in tqdm(files):

    # The paper id is the file name up to its first dot.
    paper_key = file.split('.')[0]

    # Skip XML files that belong to none of the three splits.
    if paper_key not in total_id:
        continue

    xml, bs = read_file(file, in_dir)
    paper = PaperParser(bs = bs, xml = xml)
    body = bs.find('body')
    # Modifies the body in place and stores it on paper.body_processed.
    paper.replace_refs_with_inferred_numbers(body)
    papers_list[paper_key] = paper

  0%|          | 0/7541 [00:00<?, ?it/s]

100%|██████████| 7541/7541 [01:04<00:00, 117.49it/s]


In [6]:
# Counts the papers processed below; it is not read anywhere in this cell.
iii = 0
for file in tqdm(files):

    # The paper id is the file name up to its first dot.
    paper_key = file.split('.')[0]

    # Skip XML files that belong to none of the three splits.
    if paper_key not in total_id:
        continue

    iii = iii + 1
    # Parse the file again: `bs` here is a fresh tree, separate from the one
    # held (and modified) by the PaperParser built in the previous cell.
    xml, bs = read_file(file, in_dir)

    # Fetch the already-processed paper.
    paper = papers_list[paper_key]

    # Body with inferred citation targets.
    body = paper.body_processed

    this_ref_list = get_ref_list(bs)

    ## Titles of the paper's bibliography entries
    submission_ref_list[paper_key] = this_ref_list

    ## Citation counts per bibliography entry (from the processed body)
    body_ref_count[paper_key] = get_body_ref_count(body)

    title = get_bs_title(bs)

    pid_to_title[paper_key] = title
    title_to_pid[title] = paper_key

    ## Author information
    authors_info = get_author_info(bs)
    authors_info_dict[paper_key] = authors_info

    # Raw-XML context around each citation
    bib_to_contexts_dict[paper_key] = get_bib_context(bs, xml)

    ## Citation counts per section (computed on the freshly parsed tree)
    reference_counter_1, reference_counter_2, reference_counter_3, reference_counter_4, reference_counter_5, reference_counter_6 = get_section_bib_number(bs)
    body_ref_count_1[paper_key] = reference_counter_1
    body_ref_count_2[paper_key] = reference_counter_2
    body_ref_count_3[paper_key] = reference_counter_3
    body_ref_count_4[paper_key] = reference_counter_4
    body_ref_count_5[paper_key] = reference_counter_5
    body_ref_count_6[paper_key] = reference_counter_6

    # Per-reference author counts and journal names
    total_author_data_sup[paper_key] = get_reference_info(bs)    

100%|██████████| 7541/7541 [11:30<00:00, 10.92it/s]  


In [7]:
## Persist all extracted feature tables in a single pickle file.
import pickle

processed_data_name = 'processed_data_0601.pickle'

# The position of each table inside the tuple is the on-disk layout;
# consumers have to unpack it in exactly this order.
processed_data = (
    submission_ref_list,
    body_ref_count,
    pid_to_title,
    title_to_pid,
    authors_info_dict,
    body_ref_count_1,
    body_ref_count_2,
    body_ref_count_3,
    body_ref_count_4,
    body_ref_count_5,
    body_ref_count_6,
    total_author_data_sup,
    bib_to_contexts_dict,
)

with open(processed_data_name, 'wb') as f:
    pickle.dump(processed_data, f)

In [8]:
#### Build GPT prompts
#### Prompt type 1: ask directly for a confidence score per inspiring reference
## prompt
def get_prompt_gpt_short_3(text):
    '''
    Build the "confidence score" prompt for one paper.

    The prompt asks the model to find the paper's source papers and to answer
    in JSON, mapping each reference number to a confidence score between 0
    and 1. *text* is appended verbatim at the end of the prompt.

    This function used to be defined twice in a row with identical bodies;
    only one definition is kept.
    '''
    return f'''**** I have a task to identify the source papers of a given paper, which author indicates that they inspire them most based on its text. 
                **** I will now give you a text of academic papers, to find the most pertinent source papers:
                Firstly, Determine the primary challenges outlined in the paper, and understand the algorithm proposed by the author.
                Then, look for key phrases such as "inspired by", "motivated by", "inspired us", "motivated us", "take inspiration", "the pioneering/previous work", "following.. we adopt ... to solve the challenge/problem", "we use... based on to achieve..." or other language that indicates a strong reliance on previous research for developing the paper's core contributions.
                If key phrases exist, locate the key phrases in the text and find the sources papers which are indicated by these key phrases.
                If key phrases do not exist or for other reasons, identify the novel methods and approaches the paper introduces to tackle these challenges and locate references that are directly linked to these main challenges and methods.
                Verify that your answer do not include the ref papers appearing at the begining part of the text which describe the historical findings  like "someone et al. proposed...", normally they are not direct related to the paper's topic.
                Verify that the source papers are directly relevant to the paper's novel contributions very directly.
                Specifically highlight any references that are preceded by phrases indicating direct inspiration or motivation, such as 'Inspired by [reference]', and make these references a priority in the list
                Please provide a concise list of source papers based on the aforementioned criteria, ideally limiting the selection to the most central references that heavily influenced the main contributions of the paper. 
                **** Normally you should return less than 8 source papers. ****
                **** Please re-evaluate your result by the following metric: 		Is the main idea of paper p inspired by the reference？
* 		Is the core method of paper p derived from the reference？
* 		Is the reference essential for paper p? Without the work of this reference, paper p cannot be completed. 
Then, you should return your result in the json format, with the key is “reference number” and the value is “confidence score” between 0 and 1.
                **** The text of the paper is:{text}'''

import pickle

# For every paper of a split: take the processed body, extract the text of its
# sections (citations rewritten as [bN]), join the sections with "." and wrap
# the result in the get_prompt_gpt_short_3 prompt.

### Prompts for the training split: get_prompt_gpt_short_3
total_prompt_train = {
    paper_key: get_prompt_gpt_short_3(
        '.'.join(extract_text(papers_list[paper_key].body_processed))
    )
    for paper_key in tqdm(id_train)
}
with open('20240528_get_prompt_gpt_short_3_train.pkl', 'wb') as f:
    pickle.dump(total_prompt_train, f)


### Prompts for the validation split: get_prompt_gpt_short_3
total_prompt_valid = {
    paper_key: get_prompt_gpt_short_3(
        '.'.join(extract_text(papers_list[paper_key].body_processed))
    )
    for paper_key in tqdm(id_valid)
}
with open('20240528_get_prompt_gpt_short_3_valid.pkl', 'wb') as f:
    pickle.dump(total_prompt_valid, f)


### Prompts for the test split: get_prompt_gpt_short_3
total_prompt_test = {
    paper_key: get_prompt_gpt_short_3(
        '.'.join(extract_text(papers_list[paper_key].body_processed))
    )
    for paper_key in tqdm(id_test)
}
with open('20240528_get_prompt_gpt_short_3_test.pkl', 'wb') as f:
    pickle.dump(total_prompt_test, f)

# Sanity check: one prompt per paper in each split.
print(len(total_prompt_train))
print(len(total_prompt_valid))
print(len(total_prompt_test))

100%|██████████| 788/788 [00:00<00:00, 1468.20it/s]
100%|██████████| 394/394 [00:00<00:00, 1355.73it/s]
100%|██████████| 394/394 [00:00<00:00, 1435.72it/s]

788
394
394





In [10]:
#### Build GPT prompts
#### Prompt type 2: ask for the inspiring references grouped by level of inspiration
## prompt
def get_prompt_gpt_json(text):
    '''
    Build the "graded inspiration" prompt for one paper.

    The prompt asks the model to find the paper's source papers and to answer
    in JSON with the keys 'Summary', 'Direct Inspiration',
    'Indirect Inspiration' and 'Other Inspiration', citing references as
    b1, b2, ... *text* is appended verbatim at the end of the prompt.
    '''
    return f'''**** I have a task to identify the source papers of a given paper, which author indicates that they inspire them most based on its text.
                **** I will now give you a text of academic papers, to find the most pertinent source papers:
                Firstly, Determine the primary challenges outlined in the paper, and understand the algorithm proposed by the author.
                Then, look for key phrases such as "inspired by", "motivated by", "inspired us", "motivated us", "take inspiration", "the pioneering/previous work", "following.. we adopt ... to solve the challenge/problem", "we use... based on to achieve..." or other language that indicates a strong reliance on previous research for developing the paper's core contributions.
                If key phrases exist, locate the key phrases in the text and find the sources papers which are indicated by these key phrases.
                If key phrases do not exist or for other reasons, identify the novel methods and approaches the paper introduces to tackle these challenges and locate references that are directly linked to these main challenges and methods.
                Verify that your answer do not include the ref papers appearing at the begining part of the text which describe the historical findings  like "someone et al. proposed...", normally they are not direct related to the paper's topic.
                Verify that the source papers are directly relevant to the paper's novel contributions very directly.
                Specifically highlight any references that are preceded by phrases indicating direct inspiration or motivation, such as 'Inspired by [reference]', and make these references a priority in the list
                Please provide a concise list of source papers based on the aforementioned criteria, ideally limiting the selection to the most central references that heavily influenced the main contributions of the paper. 
                **** Normally you should return less than 8 source papers. ****
                **** please give your answer in four parts: 0. **Summary of the challenges and inspirations of the paper**; 1. **Direct Inspiration/Motivation**; 2.** Indirect Inspiration/Motivation**; 3.**Other important ispiration/motivation**, and You MUST return the reference number in the answer.
                **** please give your answer in four parts in the json format with keys ['Summary','Direct Inspiration','Indirect Inspiration','Other Inspiration'], and return the reference number in the format of b1, b2... 
                **** The text of the paper is:{text}'''


import pickle

# For every paper of a split: take the processed body, extract the text of its
# sections, join the sections with "." and wrap the result in the
# get_prompt_gpt_json prompt.

### Prompts for the training split: get_prompt_gpt_json
total_prompt_train = {
    paper_key: get_prompt_gpt_json(
        '.'.join(extract_text(papers_list[paper_key].body_processed))
    )
    for paper_key in tqdm(id_train)
}
with open('20240528_get_prompt_gpt_json_3_train.pkl', 'wb') as f:
    pickle.dump(total_prompt_train, f)


### Prompts for the validation split: get_prompt_gpt_json
total_prompt_valid = {
    paper_key: get_prompt_gpt_json(
        '.'.join(extract_text(papers_list[paper_key].body_processed))
    )
    for paper_key in tqdm(id_valid)
}
with open('20240528_get_prompt_gpt_json_3_valid.pkl', 'wb') as f:
    pickle.dump(total_prompt_valid, f)


### Prompts for the test split: get_prompt_gpt_json
total_prompt_test = {
    paper_key: get_prompt_gpt_json(
        '.'.join(extract_text(papers_list[paper_key].body_processed))
    )
    for paper_key in tqdm(id_test)
}
with open('20240528_get_prompt_gpt_json_3_test.pkl', 'wb') as f:
    pickle.dump(total_prompt_test, f)

# Sanity check: one prompt per paper in each split.
print(len(total_prompt_train))
print(len(total_prompt_valid))
print(len(total_prompt_test))

100%|██████████| 788/788 [00:00<00:00, 3570.39it/s]
100%|██████████| 394/394 [00:00<00:00, 3361.60it/s]
100%|██████████| 394/394 [00:00<00:00, 3457.84it/s]

788
394
394





In [11]:
# Spot-check: display the generated prompt for a single test paper.
total_prompt_test['622183525aee126c0f23c7c2']

'**** I have a task to identify the source papers of a given paper, which author indicates that they inspire them most based on its text.\n                **** I will now give you a text of academic papers, to find the most pertinent source papers:\n                Firstly, Determine the primary challenges outlined in the paper, and understand the algorithm proposed by the author.\n                Then, look for key phrases such as "inspired by", "motivated by", "inspired us", "motivated us", "take inspiration", "the pioneering/previous work", "following.. we adopt ... to solve the challenge/problem", "we use... based on to achieve..." or other language that indicates a strong reliance on previous research for developing the paper\'s core contributions.\n                If key phrases exist, locate the key phrases in the text and find the sources papers which are indicated by these key phrases.\n                If key phrases do not exist or for other reasons, identify the novel method

In [12]:
#### Prompt type 3: only generated for the test set
def get_prompt_gpt_short_4(text):
    """Return the detailed source-paper-tracing prompt for one paper.

    The prompt consists of a fixed instruction block (key-phrase search,
    re-evaluation hints, and a request for a JSON answer mapping reference
    numbers to confidence scores) followed directly by the paper text.

    Args:
        text: Body text of the paper; it is formatted into the prompt right
            after the "The text of the paper is:" marker.

    Returns:
        The complete prompt string.
    """
    # Static part of the prompt. Whitespace and line layout are part of the
    # prompt and are kept exactly as they were.
    instructions = '''**** I have a task to identify the source papers of a given paper, which author indicates that they inspire them most based on its text. 
                **** I will now give you a text of academic papers, to find the most pertinent source papers:
                Firstly, Determine the primary challenges outlined in the paper, and understand the algorithm proposed by the author.
                Then, look for key phrases such as "inspired by", "motivated by", "inspired us", "motivated us", "take inspiration", "the pioneering/previous work", "following.. we adopt ... to solve the challenge/problem", "we use... based on to achieve..." or other language that indicates a strong reliance on previous research for developing the paper's core contributions.
                If key phrases exist, locate the key phrases in the text and find the sources papers which are indicated by these key phrases.
                If key phrases do not exist or for other reasons, identify the novel methods and approaches the paper introduces to tackle these challenges and locate references that are directly linked to these main challenges and methods.
                Verify that your answer do not include the ref papers appearing at the begining part of the text which describe the historical findings  like "someone et al. proposed...", normally they are not direct related to the paper's topic.
                Verify that the source papers are directly relevant to the paper's novel contributions very directly.
                Specifically highlight any references that are preceded by phrases indicating direct inspiration or motivation, such as 'Inspired by [reference]', and make these references a priority in the list
                Please provide a concise list of source papers based on the aforementioned criteria, ideally limiting the selection to the most central references that heavily influenced the main contributions of the paper. 
                **** Normally you should return less than 8 source papers. ****
                **** You can also infer the answer by evaluating the titles of each paper. ****
               **** You should re-evaluate your answer by: Emphasize Novel Contributions: Instead of broadly asking for "challenges", explicitly ask for the novel contributions of the paper. This helps focus the search for references that directly contribute to those specific aspects.
Prioritize Methodological Similarity: Instruct the search to prioritize references that share strong methodological similarities with the paper being analyzed. For example, papers that also employ minimax optimization or focus on sample efficiency in data augmentation.
Look for Comparative Phrases: Guide the search towards phrases that indicate comparisons with previous work, such as "Unlike [reference], we...", "Improving upon [reference], our method...", or "Similar to [reference] in terms of [aspect], but...".
#**** Please think step by step. Then, you should return your result in the json format, with the key is “reference number” and the value is “confidence score” between 0 and 1.
                **** The text of the paper is:'''
    # The paper text is appended immediately after the marker, with no separator.
    return f'{instructions}{text}'

In [13]:
### Build the detailed prompts (get_prompt_gpt_short_4) for the test papers and save them.
import pickle

total_prompt_test = {}
for paper_key in tqdm(id_test):
    # papers_list is indexed by paper id; each entry exposes `body_processed`.
    this_paper = papers_list[paper_key]
    # extract_text yields the text pieces of the body; they are glued with '.'
    # to form the single text blob that is embedded in the prompt.
    text = '.'.join(extract_text(this_paper.body_processed))
    total_prompt_test[paper_key] = get_prompt_gpt_short_4(text)

# Persist the {paper id -> prompt} mapping as a pickle file.
with open('20240528_get_prompt_gpt_detailed_test.pkl', 'wb') as f:
    pickle.dump(total_prompt_test, f)

100%|██████████| 394/394 [00:00<00:00, 3139.50it/s]


In [14]:
#### Prompt type 4: prompt enriched with extra information (test + train)


def get_prompt_gpt_short_5(text, author_note = None, refs_list = None):
    """Return the detailed tracing prompt, optionally enriched with notes and reference titles.

    The prompt is the concatenation, without any separator, of:
    the fixed instruction block, the author-note section (only when
    ``author_note`` is given), the paper text section, and the reference-title
    section (only when ``refs_list`` is given).

    Args:
        text: Body text of the paper.
        author_note: Optional author note; the note section is omitted when
            this is ``None``.
        refs_list: Optional collection of reference titles, formatted into the
            prompt with its default string representation. The section is
            omitted when this is ``None`` (previously the prompt then contained
            the literal text "None").

    Returns:
        The complete prompt string.
    """
    # Fixed instruction block; whitespace and line layout are part of the prompt.
    part1 = f'''**** I have a task to identify the source papers of a given paper, which author indicates that they inspire them most based on its text. 
                **** I will now give you a text of academic papers, to find the most pertinent source papers:
                Firstly, Determine the primary challenges outlined in the paper, and understand the algorithm proposed by the author.
                Then, look for key phrases such as "inspired by", "motivated by", "inspired us", "motivated us", "take inspiration", "the pioneering/previous work", "following.. we adopt ... to solve the challenge/problem", "we use... based on to achieve..." or other language that indicates a strong reliance on previous research for developing the paper's core contributions.
                If key phrases exist, locate the key phrases in the text and find the sources papers which are indicated by these key phrases.
                If key phrases do not exist or for other reasons, identify the novel methods and approaches the paper introduces to tackle these challenges and locate references that are directly linked to these main challenges and methods.
                Verify that your answer do not include the ref papers appearing at the begining part of the text which describe the historical findings  like "someone et al. proposed...", normally they are not direct related to the paper's topic.
                Verify that the source papers are directly relevant to the paper's novel contributions very directly.
                Specifically highlight any references that are preceded by phrases indicating direct inspiration or motivation, such as 'Inspired by [reference]', and make these references a priority in the list
                Please provide a concise list of source papers based on the aforementioned criteria, ideally limiting the selection to the most central references that heavily influenced the main contributions of the paper. 
                **** Normally you should return less than 8 source papers. ****
                **** You can also infer the answer by evaluating the titles of each paper. ****
               **** You should re-evaluate your answer by: Emphasize Novel Contributions: Instead of broadly asking for "challenges", explicitly ask for the novel contributions of the paper. This helps focus the search for references that directly contribute to those specific aspects.
Prioritize Methodological Similarity: Instruct the search to prioritize references that share strong methodological similarities with the paper being analyzed. For example, papers that also employ minimax optimization or focus on sample efficiency in data augmentation.
Look for Comparative Phrases: Guide the search towards phrases that indicate comparisons with previous work, such as "Unlike [reference], we...", "Improving upon [reference], our method...", or "Similar to [reference] in terms of [aspect], but...".
#**** Please think step by step. Then, you should return your result in the json format, with the key is “reference number” and the value is “confidence score” between 0 and 1.'''

    # Optional author-note section.
    if author_note is not None:
        part_note = f'''**** The author notes are: {author_note}, you can infer the answer based on this information'''
    else:
        part_note = ''

    part_text = f'''**** The text of the paper is:{text}'''

    # Optional reference-title section; skipped when no list was supplied so
    # the prompt never claims that the titles are "None".
    if refs_list is not None:
        part_refs = f'''**** The titles of the references are: {refs_list}. You can infer the answer based on this information'''
    else:
        part_refs = ''

    # NOTE: the sections are joined without separators on purpose, so prompts
    # built with a reference list stay identical to previously generated ones.
    return part1 + part_note + part_text + part_refs


### Build the enriched prompts (get_prompt_gpt_short_5) for the training papers and save them.
import pickle

# Index the raw training records by id once, instead of scanning the whole
# list for every paper. setdefault keeps the FIRST record for a repeated id,
# which matches the previous "first match" lookup.
raw_train_by_id = {}
for record in papers_train:
    raw_train_by_id.setdefault(record['_id'], record)

total_prompt_train = {}
for paper_key in tqdm(id_train):
    # papers_list is indexed by paper id; each entry exposes `body_processed`.
    this_paper = papers_list[paper_key]
    text = '.'.join(extract_text(this_paper.body_processed))

    # 'notes' is optional on a record; None makes the prompt builder omit
    # the author-note section.
    note = raw_train_by_id[paper_key].get('notes')
    refs_list = submission_ref_list[paper_key]
    total_prompt_train[paper_key] = get_prompt_gpt_short_5(text, author_note = note, refs_list = refs_list)

# Persist the {paper id -> prompt} mapping as a pickle file.
with open('20240528_get_prompt_gpt_detailed_train_V2.pkl', 'wb') as f:
    pickle.dump(total_prompt_train, f)


### Build the enriched prompts (get_prompt_gpt_short_5) for the test papers and save them.
import pickle

# Index the raw test records by id once, instead of scanning the whole list
# for every paper. setdefault keeps the FIRST record for a repeated id, which
# matches the previous "first match" lookup.
raw_test_by_id = {}
for record in papers_test:
    raw_test_by_id.setdefault(record['_id'], record)

total_prompt_test = {}
for paper_key in tqdm(id_test):
    # papers_list is indexed by paper id; each entry exposes `body_processed`.
    this_paper = papers_list[paper_key]
    text = '.'.join(extract_text(this_paper.body_processed))

    # 'notes' is optional on a record; None makes the prompt builder omit
    # the author-note section.
    note = raw_test_by_id[paper_key].get('notes')
    refs_list = submission_ref_list[paper_key]
    total_prompt_test[paper_key] = get_prompt_gpt_short_5(text, author_note = note, refs_list = refs_list)

# Persist the {paper id -> prompt} mapping as a pickle file.
with open('20240528_get_prompt_gpt_detailed_test_V2.pkl', 'wb') as f:
    pickle.dump(total_prompt_test, f)

# Sanity check: number of prompts generated for train and test.
print(len(total_prompt_train), len(total_prompt_test))

100%|██████████| 788/788 [00:00<00:00, 2594.60it/s]
100%|██████████| 394/394 [00:00<00:00, 2777.78it/s]

788 394





In [15]:
### Prompt type 5 (test): take the author's note into account
def get_prompt_gpt_short_note(text, notes):
    """Return a tracing prompt that is driven by the author's note.

    The prompt holds a fixed instruction block (understand the note, find the
    source papers it points to, re-check them against four questions, answer
    as JSON of reference number -> confidence score), followed by the author's
    note and then the paper text.

    Args:
        text: Body text of the paper.
        notes: The author's note for the paper.

    Returns:
        The complete prompt string.
    """
    # Static instructions up to and including the note marker. Whitespace and
    # line layout are part of the prompt and are kept exactly as they were.
    guidance = '''**** I have a task to identify the source papers of a given paper, which author indicates that they inspire them most based on its text. 
                **** I will now give you a text of academic papers and the author's note of the paper. To find the most pertinent source papers:
                Firstly, Understand the note given by arthor. Identify the main challenges and novel methods and approaches the paper introduces.
                **** Then, find all  relevant source papers that the author based on the authors'note.
                **** Please re-evaluate your result by the following metric: 		Is the main idea of paper p inspired by the reference？
* 		Is the core method of paper p derived from the reference？
* 		Is the reference essential for paper p? Without the work of this reference, paper p cannot be completed. 
*       Is the contribution of this reference paper mentioned in the author's note.
Then, you should return your result in the json format, with the key is “reference number” (like b1, b2, ...) and the value is “confidence score” between 0 and 1.
                **** The author's note is:'''
    # Marker that introduces the paper text; it starts on a new line.
    text_header = '''
                **** The text of the paper is:'''
    return f'{guidance}{notes}{text_header}{text}'

### Build note-based prompts (get_prompt_gpt_short_note) for the test papers and save them.
### Papers whose raw record has no 'notes' field are skipped.
import pickle

# Index the raw test records by id once, instead of scanning the whole list
# for every paper. setdefault keeps the FIRST record for a repeated id, which
# matches the previous "first match" lookup.
raw_test_by_id = {}
for record in papers_test:
    raw_test_by_id.setdefault(record['_id'], record)

total_prompt_test = {}
for paper_key in tqdm(id_test):
    raw_paper = raw_test_by_id[paper_key]
    # Guard first: without an author note there is nothing to build, so the
    # text extraction is not performed for such papers.
    if 'notes' not in raw_paper:
        continue

    # papers_list is indexed by paper id; each entry exposes `body_processed`.
    this_paper = papers_list[paper_key]
    text = '.'.join(extract_text(this_paper.body_processed))
    total_prompt_test[paper_key] = get_prompt_gpt_short_note(text, notes = raw_paper['notes'])

# Persist the {paper id -> prompt} mapping as a pickle file.
with open('20240528_get_prompt_gpt_detailed_test_note.pkl', 'wb') as f:
    pickle.dump(total_prompt_test, f)

100%|██████████| 394/394 [00:00<00:00, 2951.00it/s]
