In [1]:
## 尝试做特征
import os
from os.path import join
from tqdm import tqdm
from collections import defaultdict as dd
from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz
import numpy as np
from tqdm import trange
from sklearn.metrics import classification_report, precision_recall_fscore_support, average_precision_score
import logging

import utils
import settings

logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO)
logger = logging.getLogger(__name__)

data_dir = settings.DATA_TRACE_DIR

papers_train = utils.load_json('', "paper_source_trace_train_ans.json")
papers_valid = utils.load_json('', "paper_source_trace_valid_wo_ans.json")
papers_test = utils.load_json('',  "paper_source_trace_test_wo_ans.json")


files = []
data_dir = './'
in_dir = join(data_dir, 'paper-xml')
for f in os.listdir(in_dir):
    if f.endswith('.xml'):
        files.append(f)

import re
from collections import Counter
import collections
import json


def clean_text(text):
    return re.sub(r'[^a-zA-Z]+', ' ', text).lower().strip()

import re
def get_pattern(input_string):
    pattern = r'\n\n(.*?)\n\n'

# 执行正则表达式搜索
    match = re.search(pattern, input_string)

    # 检查是否有匹配，如果有，则输出匹配到的内容
    if match:
        return match.group(1).strip()
    else:
        return ""
    
## 读数据-GPT
## 读数据


## test
id_train = [ x['_id'] for x in papers_train]
id_valid = [ x['_id'] for x in papers_valid]
id_test = [ x['_id'] for x in papers_test]



2024-06-13 08:35:50,102 loading paper_source_trace_train_ans.json ...
2024-06-13 08:35:50,150 paper_source_trace_train_ans.json loaded
2024-06-13 08:35:50,151 loading paper_source_trace_valid_wo_ans.json ...
2024-06-13 08:35:50,158 paper_source_trace_valid_wo_ans.json loaded
2024-06-13 08:35:50,159 loading paper_source_trace_test_wo_ans.json ...
2024-06-13 08:35:50,168 paper_source_trace_test_wo_ans.json loaded


In [2]:
## saved processed_data
import pickle
processed_data_name = 'processed_data_0601.pickle'
with open(processed_data_name, 'rb') as f:
    processed_data = pickle.load(f)

In [3]:
submission_ref_list,  body_ref_count,  pid_to_title,  title_to_pid,  authors_info_dict,  body_ref_count_1, body_ref_count_2, body_ref_count_3, body_ref_count_4, body_ref_count_5, body_ref_count_6, total_author_data_sup, bib_to_contexts_dict = processed_data

In [4]:
### Read paper
def read_file(file, in_dir):
    '''
    读取XML格式论文文件
    '''
    file_path = join(in_dir, file)
    f = open(file_path, encoding='utf-8')
    xml = f.read()
    bs = BeautifulSoup(xml, "xml")    
    return xml, bs

def get_ref_list(bs):
    '''
    获取输入文件中所有引用对应的标题
    输入: bs格式的论文XML文件
    输出: dict格式的论文引用标题
    '''
    this_ref_list = {}
    references = bs.find_all("biblStruct")
    for ref in references:
        key = ref.get('xml:id')
        try:
            title = ref.find('title', level="a").get_text()
        except:
            try:
                title = ref.find('title', level="m").get_text()
            except:
                title = ref.getText()
                title = get_pattern(title)
        this_ref_list[key] = clean_text(title)
    return this_ref_list


def get_body_ref_count(body):
    '''
    获取论文引用数目
    输入: bs格式的论文XML文件
    输出: dict格式的论文引用数目
    '''
    #找到正文
    references = body.find_all('ref', type='bibr')
    reference_counter = collections.Counter()
    for reference in references:
        if reference.has_attr('target'):
            reference_counter[reference['target'].strip("#")] += 1
    return reference_counter


def get_bs_title(bs):
    '''
    获取标题
    '''
    ##--------------------- pid_to_title / title_to_pid --------------------------
    title = clean_text(bs.find(lambda tag: tag.name == "title" and tag.get('type') == 'main').text)
    return title


    ##--------------------- authors_info_dict --------------------------
def get_author_info(bs):
    authors_info = []
    authors = bs.find_all('author')

    # Loop through all authors and extract name and affiliation
    for author in authors:
        # Extracting the author's name (forename and surname)
        if author.affiliation:
            try:
                forename = author.forename.get_text(strip=True)
                surname = author.surname.get_text(strip=True)
                try:
                    country = author.find('country').get_text(strip=True)
                except:
                    country = ""
                org_names = author.find_all('orgName', attrs={"type": "institution"})
                authors_info.append({
                    "forename": forename,
                    "surname": surname,
                    "country": country,
                    "org_names": [org_name.get_text(strip=True) for org_name in org_names]
                })
            except:
                pass
    
    return authors_info

def find_bib_context(xml, bs, dist=100):
    bib_to_context = dd(list)
    bibr_strs_to_bid_id = {}
    for item in bs.find_all(type='bibr'):
        if "target" not in item.attrs:
            continue
        bib_id = item.attrs["target"][1:]
        item_str = "<ref type=\"bibr\" target=\"{}\">{}</ref>".format(item.attrs["target"], item.get_text())
        bibr_strs_to_bid_id[item_str] = bib_id

    for item_str in bibr_strs_to_bid_id:
        bib_id = bibr_strs_to_bid_id[item_str]
        cur_bib_context_pos_start = [ii for ii in range(len(xml)) if xml.startswith(item_str, ii)]
        for pos in cur_bib_context_pos_start:
            bib_to_context[bib_id].append(xml[pos - dist: pos + dist].replace("\n", " ").replace("\r", " ").strip())
    return bib_to_context

def  get_bib_context(bs, xml):
    return find_bib_context(xml, bs)

def find_section_number(header):
    if not header:
        return None
    
    section_number = header.get('n')
    
    if section_number:
        return section_number

    
    text = header.text
    
    if not text:
        return None
    
    if 'I.' in text or 'introduction' in text.lower():
        return '1.'
    if 'II.' in text or 'related' in text.lower():
        return '2.'
    if 'III.' in text or 'method' in text.lower():
        return '3.'
    if 'IV.' in text:
        return '4.'
    if 'V.' in text:
        return '5.'
    if 'VI.' in text:
        return '6.'
    if 'VII.' in text:
        return '7.'
    if 'VIII.' in text:
        return '8.'
    if 'IX.' in text:
        return '9.'
    else:
        return None

def get_section_bib_number(bs):
    sections = bs.find_all("div")
    reference_counter_1 = collections.Counter()
    reference_counter_2 = collections.Counter()
    reference_counter_3 = collections.Counter()
    reference_counter_4 = collections.Counter()
    reference_counter_5 = collections.Counter()
    reference_counter_6 = collections.Counter()

    now_section_index = 0

    for section in sections:
        if not section.find('head'):
            continue
        
        header = section.find("head")
        #section_number = header.get('n')
        section_number = find_section_number(header)

        #if not section_number:
        #    continue

        if section_number:

            section_index = int(section_number.split('.')[0])
            section_title = header.text.strip()
            now_section_index = section_index

        references = section.find_all('ref', type='bibr')
        for reference in references:
            if reference.has_attr('target'):
                if now_section_index == 1:
                    reference_counter_1[reference['target'].strip("#")] += 1
                elif now_section_index == 2:
                    reference_counter_2[reference['target'].strip("#")] += 1
                elif now_section_index == 3:
                    reference_counter_3[reference['target'].strip("#")] += 1
                elif now_section_index == 4:
                    reference_counter_4[reference['target'].strip("#")] += 1
                elif now_section_index == 5:
                    reference_counter_5[reference['target'].strip("#")] += 1
                elif now_section_index == 6:
                    reference_counter_6[reference['target'].strip("#")] += 1
        
        return reference_counter_1, reference_counter_2, reference_counter_3, reference_counter_4, reference_counter_5, reference_counter_6
    

## 解析原文的feat

Chinese_surname_list = ['li', 'wang', 'zhang', 'liu', 'chen', 'yang', 'huang', 'zhao', 'wu', 'zhou', 'xiong', 'xiu', 'sun', 'ma', 'zhu', 'hu', 'guo', 'he', 'gao', 'lin', 'luo', 'zheng', 'qian', 'zhen', 'tong', 'zeng', 'zen', 'zhuang',
                        'liang', 'xie', 'song', 'tang', 'dong', 'yuan', 'cai', 'feng', 'xiao', 'jiang', 'shi', 'hang', 'ang', 'ong', 'ie', 
                        'xu', 'sun', 'ma', 'zhu', 'hu', 'guo', 'he', 'gao', 'lin', 'luo', 'zheng', 'qian', 'zhen', 'tong', 'zeng', 'zen', 'zhuang',
                        'liang', 'xie', 'song', 'tang', 'dong', 'yuan', 'cai', 'feng', 'xiao', 'jiang', 'shi', 'hang', 'ang', 'ong', 'ie', 'chua',
                        'tian', 'jia', 'pan', 'du', 'dai', 'wei', 'yu', 'bai', 'han', 'gu', 'yao', 'kuang', 'shuang','qi','mei']

def get_num_author_source(paper):
    surname_list = [x['name'] for x in paper['authors']]
    num_author = len(surname_list)
    return num_author

def get_num_chinese_author_source(paper):
    surname_list = [x['name'] for x in paper['authors']]
    surname_list = [x.split(' ')[-1] for x in surname_list]
    chinese_author_list = [x for x in surname_list if x.lower() in Chinese_surname_list]
    return len(chinese_author_list)


## 引用的信息

def get_reference_info(bs):
    this_paper_list = {}
    references = bs.find_all("biblStruct") 
    for ref in references:
        key = ref.get('xml:id')
        this_ref_list = {}
        surname_list = [surname.get_text() for surname in ref.find_all("surname")]
        this_ref_list['num_author'] = len(surname_list)
        this_ref_list['num_chinese_author'] = len([x for x in surname_list if x.lower() in Chinese_surname_list])
        try:
            journal = ref.find("title", {"level": "m"}) or ref.find("title", {"level": "j"})
            journal = journal.get_text()
        except:
            journal = ""
        this_ref_list['journal'] = journal
        this_paper_list[key] = this_ref_list
    return this_paper_list

    
def extract_text(bs):
    sections = bs.find_all('div', limit=10) 
    total_text = []
    for i, section in enumerate(sections):
        # 处理引用标记
        for ref in section.find_all('ref', type='bibr'):
            # 获取编号，假设target属性类似于"#b20"
            try:
                ref_num = re.search(r'#b(\d+)', ref.get('target'))
                if ref_num:
                    # 将 ref 替换为如 [b20]
                    ref.replace_with('[b{}]'.format(ref_num.group(1)))
            except:
                pass

        # 从div标签中提取文本
        text = section.get_text(separator=' ', strip=True)
        total_text.append(text)
    return total_text


import random
import re
from typing import List, Union

class PaperParser:
    def __init__(self, bs, xml):
        self.paper_id = None
        self.title = None 
        self.body = None
        self.bs = bs
        self.xml = xml
        self.ref_info = {}
        self.preprocess()
        self.extract_ref_author_info()
        self.check_reference()

    def check_reference(self):
        unnumbered_references_texts = []
        numbered_references_texts = []
        for ref in self.references:
            if not ref.has_attr('target'):
                unnumbered_references_texts.append(ref.get_text())
            else:
                numbered_references_texts.append(ref.get_text())
        
        self.unnumbered_references_texts = unnumbered_references_texts
        self.numbered_references_texts = numbered_references_texts
        self.n_ref_valid = len(self.numbered_references_texts)
        self.n_ref_invalid = len(self.unnumbered_references_texts)


    def extract_references_from_text_digit(self, text) -> Union[List[int], None]:
        # 定义正则表达式，用于匹配引用格式
        pattern = re.compile(r'\[\s*(\d+(?:\s*;\s*\d+)*)\s*\]')
        
        # 在文本中搜索匹配的模式
        matches = pattern.findall(text)
        
        # 用于存储解析后的引用序号
        references = []
        
        # 如果找到匹配模式，解析引用序号
        if matches:
            for match in matches:
                # 分割每个匹配到的文本，并转换成整数序号
                refs = match.split(';')
                for ref in refs:
                    references.append(int(ref.strip()))  # 移除可能的空格，并将文本转换成整数
            return 'b' + str(references[0] - 1)
        else:
            return None  # 如果没有找到匹配，则返回None

    def extract_ref_author_info(self):
        '''
        获取作者信息, key为b_key
        '''

        def extract_year(ref):
            return ref.find('date', {'type': 'published'}).get('when')
        
        def extract_year_v2(ref):
            # 使用BeautifulSoup解析XML数据
            
            # 寻找所有的<biblScope>标签
            bibl_scope_tags = ref.find_all('biblScope')
            
            # 遍历找到的标签，寻找可能包含年份信息的标签
            for tag in bibl_scope_tags:
                if tag.get('unit') == 'page':
                    # 尝试将内容转换为整数，以验证其是否可能是年份
                    try:
                        year = int(tag.text)
                        # 根据需要可以添加更多的验证步骤，例如检查年份范围等
                        # 如果内容成功转换为整数，并且看起来像合理的年份，返回它
                        if 1900 <= year <= 2100:
                            return tag.text
                    except ValueError:
                        # 如果内容不能转换为整数，继续遍历
                        continue
            
            return None


        def extract_year_from_title(ref):
            # 使用BeautifulSoup解析XML数据
            
            # 寻找<title>标签
            title_tag = ref.find('title')
            
            if title_tag:
                # 使用正则表达式从标题文本中提取年份
                match = re.search(r'\b(\d{4})\b', title_tag.text)
                if match:
                    return match.group(1)
            
            return None
        
        for ref in self.references_bib:
            #title_tag = ref.find('title')
            suffix = None
            key = ref.get('xml:id')
            try:
                first_author = [surname.get_text() for surname in ref.find_all("surname")][0]
            except:
                first_author = None
            try:
                ref_years = extract_year(ref)
            except:
                try:
                    ref_years = extract_year_v2(ref)
                except:
                    try:
                        ref_years = extract_year_from_title(ref)
                    except:
                        ref_years = None

            if ref_years:
                if '-' in ref_years:
                    ref_years = ref_years[:4]
        

            self.ref_info[key] = {'first_author': first_author, 'ref_years': ref_years}


    def infer_ref_number_from_text(self, text):
        '''返回如b1...'''
        ## 第一种情况, ... et al.的形式
        ## 从解析的ref_info去匹配
        # 用于匹配年份且不以字母结尾的正则表达式
        #print('start')
        regex_no_letter = re.compile(r'\d{4}(?!\w)')

        # 用于匹配年份以字母结尾的正则表达式
        regex_with_letter = re.compile(r'\d{4}\w')

        good_match = []

        #先匹配作者名字
        author_name_match = []
        for key, value in self.ref_info.items():
            if key is None:
                continue
            first_author, ref_years = value['first_author'], value['ref_years']
            #print(f'first_author is {first_author}, ref_years is {ref_years}')
            if first_author is not None and text is not None:
                if first_author.lower() in text.lower():
                    author_name_match.append(key)

        #print(f'author_name_match is {author_name_match}')
        
        if len(author_name_match) == 1:
                return author_name_match[0]
        
        ## 否则，需要按年份来匹配
        for key, value in self.ref_info.items():
            if key is None:
                continue
            first_author, ref_years = value['first_author'], value['ref_years']
            #print(f'first_author is {first_author}, ref_years is {ref_years}, text is {text}')
            if first_author is not None and text is not None and ref_years is not None:
                if first_author.lower() in text.lower() and ref_years in text:
                    good_match.append((key, value))

        #print(f'good_match is {good_match}')
        
        if len(good_match) == 1:
            return key
        
        elif len(good_match) > 1:
            # 多个匹配
            ## 检查以abc结尾
            if regex_with_letter.search(text):
                ## 找到最后的那个字母
                last_letter = regex_with_letter.search(text).group()[-1]
                if last_letter == 'a':
                    return good_match[0][0]
                elif last_letter == 'b':
                    return good_match[1][0]
                elif last_letter == 'c' and len(good_match) > 2:
                    return good_match[2][0]
                elif last_letter == 'd' and len(good_match) > 3:
                    return good_match[3][0]
                else:
                    return good_match[0][0]


        elif len(good_match) == 0:
            ## 猜测是数字的形式
            if self.extract_references_from_text_digit(text):
                return self.extract_references_from_text_digit(text)
        
        if len(author_name_match) > 0:
            return author_name_match[0]
                    
        return None
            
    def replace_refs_with_inferred_numbers(self, body):

        self.body_raw = body

        # 找到所有的引用标签
        refs = body.find_all('ref', type="bibr")
        
        for ref in refs:
            if not ref.has_attr('target'):  # 如果没有 target 属性，则尝试推断引用编号
                #print(f'ref.text is {ref.text}')
                inferred_ref_number = self.infer_ref_number_from_text(ref.text)
                #print(inferred_ref_number)
                if inferred_ref_number is not None:
                    ref['target'] = '#' + inferred_ref_number  # 添加推断出的编号
                    ref.string = "{} ({})".format(ref.text, inferred_ref_number)
        
        # 返回处理后的XML数据
        self.body_processed = body
        
        

    def preprocess(self):
        '''
        初步的解析
        '''
        self.body = self.bs.find('body')
        self.references = self.body.find_all('ref', type='bibr')
        self.references_bib = self.bs.find_all("biblStruct")

        ### Check 引用
        
    def __repr__(self):
        return f'The paper with id {self.paper_id}, title {self.title}'

In [5]:
# 结果的记录
other_featuers = {}

total_id = id_train + id_valid + id_test

papers_list = {}

for file in tqdm(files):

    # 读文件
    paper_key = file.split('.')[0]

    # 无关数据不处理
    if paper_key not in total_id:
        continue

    xml, bs = read_file(file, in_dir)
    paper = PaperParser(bs = bs, xml = xml)
    body = bs.find('body')
    paper.replace_refs_with_inferred_numbers(body)
    papers_list[paper_key] = paper

  0%|          | 0/7541 [00:00<?, ?it/s]

100%|██████████| 7541/7541 [01:13<00:00, 102.19it/s]


In [6]:
### 读数据 - gpt4给出的Inspiration数据
import json
with open('gpt4_res_parse_train_level_parsed.json', 'rb') as f:
    gpt4_res_level_train = json.load(f)

with open('gpt4_res_parse_valid_level_parsed.json', 'rb') as f:
    gpt4_res_level_valid = json.load(f)

with open('gpt4_res_parse_test_level_parsed.json', 'rb') as f:
    gpt4_res_level_test = json.load(f)


In [7]:
### 读数据 - gpt4给出的直接打分数据

with open('gpt4_res_parse_train_short_parsed.json', 'rb') as f:
    gpt4_res_short_train = json.load(f)

with open('gpt4_res_parse_valid_short_parsed.json', 'rb') as f:
    gpt4_res_short_valid = json.load(f)

with open('gpt4_res_parse_test_short_parsed.json', 'rb') as f:
    gpt4_res_short_test = json.load(f)


In [8]:
### 读数据 - OPUS给出的直接打分数据

with open('opus_res_parse_json_test.json', 'rb') as f:
    opus_res_test = json.load(f)

In [9]:
### 读数据 - gpt4给出的仅在test上打分的数据
with open('gpt4_turbo_res_parse_json_test.json', 'rb') as f:
    gpt4_res_test = json.load(f)

with open('gpt4_turbo_res_parse_json_test_v2.json', 'rb') as f:
    gpt4_res_test_v2 = json.load(f)

with open('GPT4_res_test_V3_parse.json', 'rb') as f:
    GPT4_res_test_V3_parse = json.load(f)

In [10]:
### 读gemini数据
with open('gemini_res_parse_json_test.json', 'rb') as f:
    gemini_res_test = json.load(f)

with open('gemini_res_parse_json_test_round2.json', 'rb') as f:
    gemini_res_test_round2 = json.load(f)
    
with open('gemini_res_parse_json_test_round3.json', 'rb') as f:
    gemini_res_test_round3 = json.load(f)

with open('gemini_res_parse_json_test_round4.json', 'rb') as f:
    gemini_res_test_round4 = json.load(f)


In [11]:
### 读取基于note的数据
with open('GPT4_res_test_note_parse.json', 'rb') as f:
    gpt4_res_test_note = json.load(f)


In [12]:
def get_is_CVPR(x):
    x = str(x)
    if 'cvpr' in x.lower() or 'computer vision and pattern recognition' in x.lower():
        return 1
    else:
        return 0
    
def get_is_NIPS(x):
    x = str(x)
    if 'nips' in x.lower() or 'neurips' in x.lower() or 'neural information processing systems' in x.lower():
        return 1
    else:
        return 0
    
def get_is_ICML(x):
    x = str(x)
    if 'icml' in x.lower() or 'international conference on machine learning' in x.lower():
        return 1
    else:
        return 0
    
def get_is_ns(x):
    x = str(x)
    if 'nature' in x.lower() or 'science' in x.lower():
        return 1
    else:
        return 0  

def get_is_ECCV(x):
    x = str(x)
    if 'eccv' in x.lower():
        return 1
    else:
        return 0  
    
def get_is_KDD(x):
    x = str(x)
    if 'kdd' in x.lower():
        return 1
    else:
        return 0  
    
def get_is_famous(x):
    if x == '' or x is np.nan or x is None:
        return 0
    else:
        return 1
    

conference_parse_dict = {
    'cvpr':'cvpr',
    'computer vision and pattern recognition':'cvpr',
    'iccv':'iccv',
    'international conference on computer vision':'iccv',
    'eccv':'eccv',
    'europ. conf. computer vision':'eccv',
    'european conference on computer vision':'eccv',
    'icml':'icml',
    'international conference on machine learning':'icml',
    'iclr':'iclr',
    'international conference on learning representations':'iclr',
    'nips':'neurips',
    'neurips':'neurips',
    'neural information processing systems':'neurips',
    'aaai':'aaai',
    'advancement of artificial intelligence':'aaai',
    'ijcai':'ijcai',
    'joint conference on artificial intelligence':'ijcai',
    'kdd':'kdd',
    'knowledge discovery and data mining':'kdd',
    'sigkdd':'kdd',
    'acm sigkdd international conference on knowledge discovery and data mining':'kdd',
    'www':'www',
    'world wide web':'www',
    'sigir':'sigir',
    'research and development in information retrieval':'sigir',
    'naacl':'naacl',
    'acl':'acl',
    'association for computational linguistics':'acl',
    'emnlp':'emnlp',
    'empirical methods in natural language processing':'emnlp',
    'north american chapter of the association for computational linguistics':'naacl',
    'icde':'icde',
    'international conference on data engineering':'icde',
    'vldb':'vldb',
    'very large data bases':'vldb',
    'pvldb':'vldb',
    'the vldb journal':'vldb',
    'sigmod':'sigmod',
    'acm sigmod conference':'sigmod',
    'acm sigmod international conference on management of data':'sigmod',
    'icdm':'icdm',
    'conference on data mining':'icdm',
    'sdm':'sdm',
    'conference on data mining':'sdm',
    'cikm':'cikm',
    'conference on information and knowledge management':'cikm',
    'cell':'cell',
    'nature':'nature',
    'science':'science',
    'pattern analysis and machine intelligence':'pami',
    'pami':'pami',
    'jmlr':'jmlr',
    'machine learning':'jmlr',
    'british conference on machine vision':'bmvc',
    'bmvc':'bmvc',
    'uai':"uai",
    "operating systems":"os",
    'eurosys':'os',
    'chemical':'chemistry'
}



def conference_parse(x):
    x = str(x).lower()
    for k, v in conference_parse_dict.items():
        if k in x:
            return v
    return None

def is_type_vision(x):
    x = str(x).lower()
    if x in ['cvpr', 'eccv', 'iccv', 'bmvc']:
        return 1
    else:
        return 0
    
def is_type_NLP(x):
    x = str(x).lower()
    if x in ['acl','naacl','emnlp']:
        return 1
    else:
        return 0
    
def is_type_ml(x):
    x = str(x).lower()
    if x in ['icml','neurips','iclr', 'pami', 'jmlr', 'uai']:
        return 1
    else:
        return 0
    
def is_type_datamining(x):
    x = str(x).lower()
    if x in ['www','kdd','sigir','cikm','waim','sigmod','vldb','icde']:
        return 1
    else:
        return 0
    

def is_type_ai(x):
    x = str(x).lower()
    if x in ['aaai', 'ijcai']:
        return 1
    else:
        return 0
    
def is_type_os(x):
    x = str(x).lower()
    if x in ['os']:
        return 1
    else:
        return 0
    
def is_type_chem(x):
    x = str(x).lower()
    if x in ['chemistry']:
        return 1
    else:
        return 0

# 开始做特征

total_papers = papers_train + papers_valid + papers_test


def make_train_features(types = 'train'):

    features_train = {}
    labels_train = {}

    # 获取生产特征的类型
    if types == 'train':
        papers = papers_train
        gpt_res_label = gpt4_res_level_train
        gpt_res_short = gpt4_res_short_train
    elif types == 'valid':
        papers = papers_valid
        gpt_res_label = gpt4_res_level_valid
        gpt_res_short = gpt4_res_short_valid
    elif types == 'test':
        papers = papers_test
        gpt_res_label = gpt4_res_level_test
        gpt_res_short = gpt4_res_short_test
        opus_res = opus_res_test
        gpt4_turbo_res = gpt4_res_test
        gemini_res = gemini_res_test
        gpt4_res_v2 = gpt4_res_test_v2
        gemini_res_round2 = gemini_res_test_round2
        gemini_res_round3 = gemini_res_test_round3
        gemini_res_round4 = gemini_res_test_round4
        gpt4_res_note = gpt4_res_test_note
        gpt_res_short_r2 = GPT4_res_test_V3_parse



    # 迭代所有paper
    for key in tqdm([x['_id'] for x in papers]):
        features_key = {}
        labels_key = {}
        context =  bib_to_contexts_dict[key]
        this_paper = papers_list[key]
        ref_info = this_paper.ref_info
        #citation_list = submission_ref_citation[key]


        if types == 'train':
            ref_source = [x for x in papers if x['_id'] == key][0]['refs_trace']
            ref_nums = [x['referenced_serial_number'] for x in ref_source if 'referenced_serial_number' in x.keys()]
            ref_keys = ['b' + str(x) for x in ref_nums]
            #total_ref_keys = list(set(body_ref_count[key].keys()) + set(ref_keys))
            for b_key in body_ref_count[key].keys():
                if b_key in ref_keys:
                    labels_key[b_key] = 1
                else:
                    labels_key[b_key] = 0

        elif types == 'valid' or types == 'test':
            for b_key in body_ref_count[key].keys():
                labels_key[b_key] = 0


        try:
            this_gpt_res_label = gpt_res_label[key]
            this_gpt_res_short = gpt_res_short[key]
        except:
            this_gpt_res_label = {}
            this_gpt_res_short = {}
        
        try:
            this_opus_res = opus_res[key]
        except:
            this_opus_res = {}
        
        try:
            this_turbo_res = gpt4_turbo_res[key]
        except:
            this_turbo_res = {}

        try:
            this_turbo_res_v2 = gpt4_res_v2[key]
        except:
            this_turbo_res_v2 = {}

        try:
            this_gemini_res = gemini_res[key]
        except:
            this_gemini_res = {}

        try:
            this_gemini_res_round2 = gemini_res_round2[key]
        except:
            this_gemini_res_round2 = {}

        try:
            this_gemini_res_round3 = gemini_res_round3[key]
        except:
            this_gemini_res_round3 = {}

        try:
            this_gemini_res_round4 = gemini_res_round4[key]
        except:
            this_gemini_res_round4 = {}

        try:
            this_gpt4_res_note = gpt4_res_note[key]
        except:
            this_gpt4_res_note = {}

        try:
            this_gpt_res_short_r2 = gpt_res_short_r2[key]
        except:
            this_gpt_res_short_r2 = {}


        max_ref_year = -1
        for ref_key in ref_info.keys():
            if ref_info[ref_key]['ref_years'] is not None:
                ref_year = int(ref_info[ref_key]['ref_years'])
                if ref_year > max_ref_year:
                    max_ref_year = ref_year

        # 制作每个引用文献的特征
        for b_key in body_ref_count[key].keys():
            if b_key not in submission_ref_list[key].keys():
                continue

            if int(b_key[1:]) > 500:
                continue
            
            b_feature = {}

            ## 该引用文献在文本的出现次数
            b_feature['ref_count'] = body_ref_count[key][b_key]

            ## ADD 论文年份和引用年份
            ref_year = ref_info[b_key]['ref_years']
            if ref_year is None:
                ref_year = 2024
            ref_year = int(ref_year)

            b_feature['max_ref_year'] = max_ref_year
            b_feature['ref_year'] = ref_year
            b_feature['ref_year_diff'] = max_ref_year - ref_year

            ## ADD 引用次数
            '''
            if b_key in citation_list.keys():
                b_feature['n_citation'] = citation_list[b_key]
            else:
                b_feature['n_citation'] = -1
            '''

            ###
            paper = [x for x in papers if x['_id'] == key][0]
            try:
                b_feature['venue'] = paper['venue']
            except:
                b_feature['venue'] = ""
            try:
                num_author_source = get_num_author_source(paper)
                num_chinese_author_source = get_num_chinese_author_source(paper)
            except:
                num_author_source = 0
                num_chinese_author_source = 0
            
            b_feature['num_author_source'] = num_author_source
            b_feature['num_chinese_author_source'] = num_chinese_author_source



            ## 该引用文献在在历史正样本中的出现次数
            
            assert b_key in submission_ref_list[key].keys(), f'key is {key}, b_key is {b_key}'
            title = clean_text(submission_ref_list[key][b_key])
            try:
                b_feature['positive_count'] = counter_train_sources[title]
            except:
                b_feature['positive_count'] = 0
            

            ## 是否出现在所有文件中
            '''
            if title in title_to_pid.keys():
                b_feature['in_files_list'] = 1
            else:
                b_feature['in_files_list'] = 0
            '''


            b_feature['num_author'] = total_author_data_sup[key][b_key]['num_author']
            b_feature['num_chinese_author'] = total_author_data_sup[key][b_key]['num_chinese_author']
            b_feature['journal'] = total_author_data_sup[key][b_key]['journal']
            b_feature['chinese_rate'] = b_feature['num_chinese_author'] / (b_feature['num_author'] + 0.01)
            
            ## 该引用文献的上下文特征
            num_inspired_by = 0
            num_motivated_by = 0
            num_we_present = 0
            num_based_on = 0

            try:
                b_context = context[b_key]
                for context_piece in b_context:
                    context_piece = context_piece.lower()
                    if 'inspired' in context_piece:
                        num_inspired_by += 1
                    if 'motivated' in context_piece:
                        num_motivated_by += 1
                    if 'we present' in context_piece or 'we propose' in context_piece:
                        num_we_present += 1
                    if 'based on' in context_piece:
                        num_based_on += 1
            except:
                pass
            
            b_feature['num_inspired_by'] = num_inspired_by
            b_feature['num_motivated_by'] = num_motivated_by
            b_feature['num_we_present'] = num_we_present
            b_feature['based_on'] = num_based_on

            ## 段落引用特征
            b_feature['ref_count_1'] = body_ref_count_1[key][b_key]
            b_feature['ref_count_2'] = body_ref_count_2[key][b_key]
            b_feature['ref_count_3'] = body_ref_count_3[key][b_key]
            b_feature['ref_count_4'] = body_ref_count_4[key][b_key]
            b_feature['ref_count_5'] = body_ref_count_5[key][b_key]
            b_feature['ref_count_6'] = body_ref_count_6[key][b_key]

            ## Venue 特征
            b_feature['is_CVPR_journal'] = get_is_CVPR(b_feature['journal'])
            b_feature['is_NIPS_journal'] = get_is_NIPS(b_feature['journal'])
            b_feature['is_ICML_journal'] = get_is_ICML(b_feature['journal'])
            b_feature['is_ns_journal'] = get_is_ns(b_feature['journal'])
            b_feature['is_ECCV_journal'] = get_is_ECCV(b_feature['journal'])
            b_feature['is_KDD_journal'] = get_is_KDD(b_feature['journal'])

            b_feature['is_CVPR_venue'] = get_is_CVPR(b_feature['venue'])
            b_feature['is_NIPS_venue'] = get_is_NIPS(b_feature['venue'])
            b_feature['is_ICML_venue'] = get_is_ICML(b_feature['venue'])
            b_feature['is_ns_venue'] = get_is_ns(b_feature['venue'])
            b_feature['is_ECCV_venue'] = get_is_ECCV(b_feature['venue'])
            b_feature['is_KDD_venue'] = get_is_KDD(b_feature['venue'])

            b_feature['venue_type'] = conference_parse(b_feature['venue'])
            b_feature['journal_type'] = conference_parse(b_feature['journal'])

            b_feature['journal_type_nlp'] = is_type_NLP(b_feature['journal_type'])
            b_feature['venue_type_nlp'] = is_type_NLP(b_feature['venue_type'])

            b_feature['joirnal_type_ml'] = is_type_ml(b_feature['journal_type'])
            b_feature['venue_type_ml'] = is_type_ml(b_feature['venue_type'])

            b_feature['journal_type_dm'] = is_type_datamining(b_feature['journal_type'])
            b_feature['venue_type_dm'] = is_type_datamining(b_feature['venue_type'])

            b_feature['journal_type_ai'] = is_type_ai(b_feature['journal_type'])
            b_feature['venue_type_ai'] = is_type_ai(b_feature['venue_type'])

            b_feature['journal_type_os'] = is_type_os(b_feature['journal_type'])
            b_feature['venue_type_os'] = is_type_os(b_feature['venue_type'])

            b_feature['journal_type_chem'] = is_type_chem(b_feature['journal_type'])
            b_feature['venue_type_chem'] = is_type_chem(b_feature['venue_type'])

            b_feature['journal_type_vision'] = is_type_vision(b_feature['journal_type'])
            b_feature['venue_type_vision'] = is_type_vision(b_feature['venue_type'])

            try:
                b_feature['context'] = '.'.join([x for x in bib_to_contexts_dict[key][b_key]])
            except:
                b_feature['context'] = ""
    
            features_key[b_key] = b_feature

            ### gpt 特征
            for ii in range(5):
                b_feature[f'gpt_res_label_{ii}'] = 0
                b_feature[f'gpt_res_short_{ii}'] = 0
                b_feature[f'gpt_res_short_{ii}_r2'] = 0
            
            for ii in range(5):
                try:
                    vv = this_gpt_res_label[ii]
                    if b_key in vv['s3']:
                        b_feature[f'gpt_res_label_{ii}'] = 1
                    if b_key in vv['s2']:
                        b_feature[f'gpt_res_label_{ii}'] = 2
                    if b_key in vv['s1']:
                        b_feature[f'gpt_res_label_{ii}'] = 3
                except:
                    b_feature[f'gpt_res_label_{ii}'] = 0 

                try:
                    vv = this_gpt_res_short[ii]
                    if b_key in vv.keys():
                        b_feature[f'gpt_res_short_{ii}'] = vv[b_key]
                
                except:
                    b_feature[f'gpt_res_short_{ii}'] = 0

                try:
                    vv = this_gpt_res_short_r2[ii]
                    if b_key in vv.keys():
                        b_feature[f'gpt_res_short_{ii}_r2'] = float(vv[b_key])
                except:
                    b_feature[f'gpt_res_short_{ii}_r2'] = 0


            b_feature['opus_res'] = 0
            for ii in range(10):
                b_feature[f'turbo_res_{ii}'] = 0
                b_feature[f'turbo_res_r2_{ii}'] = 0
                b_feature[f'gpt4_res_note_{ii}'] = 0
            b_feature['gemini_res'] = 0
            b_feature['gemini_res_round2'] = 0
            b_feature['gemini_res_round3'] = 0
            b_feature['gemini_res_round4'] = 0
            
            ### opus特征
            if types == 'test':
                if b_key in this_opus_res.keys():
                    b_feature['opus_res'] = this_opus_res[b_key]
                
                if b_key in this_gemini_res.keys():
                    if isinstance(this_gemini_res[b_key], (int, float)):
                        b_feature['gemini_res'] = this_gemini_res[b_key]

                if b_key in this_gemini_res_round2.keys():
                    if isinstance(this_gemini_res_round2[b_key], (int, float)):
                        b_feature['gemini_res_round2'] = this_gemini_res_round2[b_key]

                if b_key in this_gemini_res_round3.keys():
                    if isinstance(this_gemini_res_round3[b_key], (int, float)):
                        b_feature['gemini_res_round3'] = this_gemini_res_round3[b_key]

                if b_key in this_gemini_res_round4.keys():
                    if isinstance(this_gemini_res_round4[b_key], (int, float)):
                        b_feature['gemini_res_round4'] = this_gemini_res_round4[b_key]

            ### turbo特征
                for ii in range(10):
                    try:
                        vv = this_turbo_res[ii]
                        if b_key in vv.keys():
                            b_feature[f'turbo_res_{ii}'] = vv[b_key]
                    except:
                        pass

            ### turbo特征v2
                for ii in range(len(this_turbo_res_v2)):
                    try:
                        vv = this_turbo_res_v2[ii]
                        if b_key in vv.keys():
                            b_feature[f'turbo_res_r2_{ii}'] = float(vv[b_key])
                    except:
                        pass
                
            ### gpt4 note 特征
                for ii in range(len(gpt4_res_note)):
                    try:
                        vv = this_gpt4_res_note[ii]
                        if b_key in vv.keys():
                            b_feature[f'gpt4_res_note_{ii}'] = float(vv[b_key])
                    except:
                        pass

    
        labels_train[key] = labels_key
        features_train[key] = features_key

    ## 建立模型

    wide_features = []
    total_labels = []

    for key in features_train:
        this_features = features_train[key]
        this_labels = labels_train[key]
        max_ref_count = -1
        max_positive_count = -1

        for b_key in this_features:
            this_feature = this_features[b_key]
            if this_feature['ref_count'] > max_ref_count:
                max_ref_count = this_feature['ref_count']
            if this_feature['positive_count'] > max_positive_count:
                max_positive_count = this_feature['positive_count']
        
        for b_key in this_features:
            this_feature = this_features[b_key]
            this_feature['max_ref_ratio'] = this_feature['ref_count'] / max_ref_count
            this_feature['positive_ref_ratio'] = this_feature['positive_count'] / (max_positive_count + 0.001)
            this_feature['paper_key'] = key
            this_feature['b_key'] = b_key
            total_labels.append(this_labels[b_key])
            wide_features.append(this_feature)

    import pandas as pd
    wide_features = pd.DataFrame(wide_features)
    total_labels = pd.DataFrame(total_labels)
    wide_features['label'] = total_labels
    return wide_features

In [21]:
train_feat = make_train_features(types='train')
test_feat = make_train_features(types='test')

train_feat.to_csv('train_feat.csv')
test_feat.to_csv('test_feat.csv')

100%|██████████| 788/788 [00:01<00:00, 415.94it/s]
100%|██████████| 394/394 [00:02<00:00, 179.47it/s]


In [22]:
## XGB
## 用random forest 二分类
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import catboost as cb

y_col = 'label'
x_col = [x for x in train_feat.columns if x != y_col]
x_col = [x for x in x_col if x != 'b_key' and x != 'paper_key' and x != 'pred' and x != 'positive_ref_ratio' and x != 'journal']
x_col = [x for x in x_col if x != 'positive_count']
x_col = [x for x in x_col if x != 'venue' and x != 'Unnamed: 0']
x_col = [x for x in x_col if x != 'venue_type' and x != 'journal_type' and x != 'context']
x_col = [x for x in x_col if x != 'gemini_res' and x != 'gemini_res_round2' and x!= 'gemini_res_round3']

X_train, X_test, y_train, y_test = train_test_split(train_feat[x_col], train_feat['label'], test_size=0.1, random_state=0)

In [None]:
## save 
import sklearn.externals
import joblib

In [19]:
    clf = lgb.LGBMClassifier(n_estimators = 50, 
                             learning_rate = 0.1, 
                             num_leaves = 50, 
                             max_depth = 5,
                             sample_fraction=0.8,
                             random_state=42, 
                             n_jobs=-1,
                             verbose=-1)
    clf.fit(X_train, y_train)
    pred = clf.predict_proba(test_feat[x_col])
    pred = pred[:,1]
    test_feat['pred_lgb'] = pred
    joblib.dump(clf, 'lgb.pkl')


    clf = cb.CatBoostClassifier(iterations = 50, 
                                learning_rate = 0.077, 
                                depth = 5,
                                random_seed=42, 
                                verbose=0)
    
    clf.fit(X_train, y_train)
    pred = clf.predict_proba(test_feat[x_col])
    pred = pred[:,1]
    test_feat['pred_cb'] = pred
    joblib.dump(clf, 'catboost.pkl')

    test_feat['pred'] = test_feat['pred_lgb'] * 0.4 + test_feat['pred_cb'] * 0.6

    def get_rule_score(row):
        gpt_labels = []
        gpt_shorts = []
        turbo_res = []
        turbo_res_r2 = []
        turbo_res_note = []
        gpt_shorts_v2 = []
        for ii in range(5):
            gpt_shorts.append(row[f'gpt_res_short_{ii}'])
            gpt_labels.append(row[f'gpt_res_label_{ii}'])
            gpt_shorts_v2.append(row[f'gpt_res_short_{ii}_r2'])
        opus_res = row['opus_res']
        for ii in range(10):
            turbo_res.append(row[f'turbo_res_{ii}'])
            turbo_res_r2.append(row[f'turbo_res_r2_{ii}'])
            turbo_res_note.append(row[f'gpt4_res_note_{ii}'])

        gemini_res = row['gemini_res']
        gemini_res_r2 =  row['gemini_res_round2']
        gemini_res_r3 =  row['gemini_res_round3']
        gemini_res_r4 =  row['gemini_res_round4']

        num_inspired_by = row['num_inspired_by']
        num_motivated_by = row['num_motivated_by']
            

        def score2rank(x):
            if x >= 0.9:
                return 3
            elif x >= 0.5:
                return 2
            elif x > 0.4:
                return 1
            else:
                return 0
            
        opus_rank = score2rank(opus_res)
        gemini_rank = score2rank(gemini_res)
        gemini_rank_r2 = score2rank(gemini_res_r2)
        gemini_rank_r3 = score2rank(gemini_res_r3)
        gemini_rank_r4 = score2rank(gemini_res_r4)
        gemini_res_total = [gemini_res, gemini_res_r2, gemini_res_r3, gemini_res_r4]

        turbo_rank_max = np.max([score2rank(x) for x in turbo_res])
        turbo_rank_mean = np.median([score2rank(x) for x in turbo_res])
        turbo_rank_min = np.min([score2rank(x) for x in turbo_res])

        gpt4o_rank_max = np.max([score2rank(x) for x in gpt_shorts])
        gpt4o_rank_mean = np.median([score2rank(x) for x in gpt_shorts])
        gpt4o_rank_min = np.min([score2rank(x) for x in gpt_shorts])

        turbo_rank_max_r2 = np.max([score2rank(x) for x in turbo_res_r2])
        turbo_rank_mean_r2 = np.median([score2rank(x) for x in turbo_res_r2])
        turbo_rank_min_r2 = np.min([score2rank(x) for x in turbo_res_r2])

        turbo_res_note_max = np.max([score2rank(x)  for x in turbo_res_note])
        turbo_res_note_mean = np.median([score2rank(x)  for x in turbo_res_note])
        turbo_res_note_min = np.min([score2rank(x)  for x in turbo_res_note])

        col1 =  [opus_rank] 
        col2 =  [gemini_rank] + [gemini_rank_r3] + [gemini_rank_r2]
        col3 =  [turbo_rank_max] + [gpt4o_rank_max] + [turbo_rank_max_r2] + [turbo_rank_mean_r2] + [turbo_rank_mean] + [gpt4o_rank_mean] 
        col4 = [turbo_res_note_max] + [turbo_res_note_mean] + [gemini_rank_r4]
        
        
        rule_counter1 = Counter(col1)
        rule_counter2 = Counter(col2)
        rule_counter3 = Counter(col3)
        rule_counter4 = Counter(col4)

        num_31, num_21, num_11 = rule_counter1[3] , rule_counter1[2], rule_counter1[1]
        num_32, num_22, num_12 = rule_counter2[3] , rule_counter2[2], rule_counter2[1]
        num_33, num_23, num_13 = rule_counter3[3] , rule_counter3[2], rule_counter3[1]
        num_34, num_24, num_14 = rule_counter4[3] , rule_counter4[2], rule_counter4[1]

        k1 = 2
        k2 = 1
        k3 = 0.5
        k4 = 1

        score1 = num_31 * k1 + num_32 * k2 + num_33 * k3 + num_34 * k4
        score2 = num_21 * k1 + num_22 * k2 + num_23 * k3 + num_24 * k4
        score3 = num_11 * k1 + num_12 * k2 + num_13 * k3 + num_14 * k4
        
        score = score1 * 4 + score2 * 2 + score3 
        
        

        ### ADD Penalty
        total_single_score = gpt_shorts + gpt_labels + gpt_shorts_v2 + turbo_res_note + turbo_res + turbo_res_r2 + gemini_res_total + [opus_res]
        total_single_score = sorted(total_single_score)
        if total_single_score[20] <= 0.2:
             score = score / 4
        
        return score 

    
    test_feat['rule_score'] = test_feat.apply(get_rule_score, axis = 1)


    def rerank(df, rate_rule):
        df['pred_rule'] = df['pred'] + df['rule_score'] * rate_rule
        return df

    test_feat_rule =  test_feat.groupby(['paper_key']).apply(rerank, rate_rule = 0.035).reset_index(drop = True)

  test_feat_rule =  test_feat.groupby(['paper_key']).apply(rerank, rate_rule = 0.035).reset_index(drop = True)


In [20]:
import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))


# 写入submission
sample_submission = "submission_example_test.json"
sample_submission = json.load(open(sample_submission, "r"))
import copy
sample_submission_submit = copy.deepcopy(sample_submission)

for sub_key in sample_submission.keys():
    my_ans = []
    my_pred_res = test_feat_rule.loc[test_feat_rule['paper_key'] == sub_key]
    num_pred = len(sample_submission[sub_key])
    for i in range(num_pred):
        b_key = 'b' + str(i)
        try:
            #my_ans.append(sigmoid(my_pred_res.loc[my_pred_res['b_key'] == b_key]['pred'].values[0]))
            my_ans.append(sigmoid(my_pred_res.loc[my_pred_res['b_key'] == b_key]['pred_rule'].values[0]))
        except:
            my_ans.append(0)

    sample_submission_submit[sub_key] = my_ans


import json
with open('sample_submission_0607_1.json', 'w') as f:
    json.dump(sample_submission_submit, f)



In [7]:
len(total_id)

1576