In [1]:
import re

def remove_arxiv_entries(input_path, output_path):
    with open(input_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # 使用正则表达式提取每一条记录
    entries = re.findall(r'TY  - .*?ER  -', content, flags=re.DOTALL)

    # 筛选不包含 'PB  - arXiv' 的条目
    filtered_entries = [entry for entry in entries if 'PB  - arXiv' not in entry]

    # 合并为一个字符串，两个换行符分隔
    new_content = '\n\n'.join(filtered_entries)

    # 保存到输出文件
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(new_content)

# 用法示例
remove_arxiv_entries(R'D:\Programs\Codes\Skill-Up\search-for-papers\data\arxiv\html_result\arxiv_results_multi_1647_tad_tal copy.ris', R'D:\Programs\Codes\Skill-Up\search-for-papers\data\arxiv\html_result\arxiv_results_multi_1647_tad_tal_copy_output_without_arxiv.ris')


In [3]:
import requests
from requests.exceptions import ReadTimeout, RequestException

# 用户直接输入文件路径
input_path = R"D:\Programs\Codes\Skill-Up\search-for-papers\data\arxiv\html_result\arxiv_results_multi_1647_tad_tal_copy_output_without_arxiv.ris"
output_path = R"D:\Programs\Codes\Skill-Up\search-for-papers\data\arxiv\html_result\arxiv_results_multi_1647_tad_tal_copy_output_without_arxiv_withdoi.ris"
log_path = R"../../data/all/arxiv_results_multi_1647_tad_tal_withdo_.log"
error_log_path = R"../../data/all/arxiv_results_multi_1647_tad_tal_withdo_.log"
found_entries_path = R"D:\Programs\Codes\Skill-Up\search-for-papers\data\arxiv\html_result\found_doi_entries.txt"


def parse_ris_entries(lines):
    entries = []
    current = []
    for line in lines:
        line = line.rstrip('\n')
        if line.startswith('TY  -'):
            if current:
                entries.append(current)
            current = [line]
        elif current:
            current.append(line)
            if line.startswith('ER  -'):
                entries.append(current)
                current = []
    if current:
        entries.append(current)
    return entries


def extract_key_info(entry):
    info = {}
    for line in entry:
        if line.startswith('TI  -') and 'title' not in info:
            info['title'] = line[6:].strip()
        elif line.startswith('AU  -'):
            info.setdefault('authors', []).append(line[6:].strip())
        elif line.startswith('PY  -') and 'year' not in info:
            info['year'] = line[6:].strip()
    return info


def query_crossref(title, authors=None, year=None):
    params = {'query.title': title, 'rows': 1}
    if year:
        params['filter'] = f'from-pub-date:{year},until-pub-date:{year}'
    if authors:
        params['query.author'] = authors[0]
    try:
        response = requests.get('https://api.crossref.org/works', params=params, timeout=10)
        response.raise_for_status()
        items = response.json().get('message', {}).get('items', [])
        if items:
            return items[0].get('DOI')
    except ReadTimeout:
        error_log.append({'title': title, 'authors': authors, 'year': year})
        return None
    except RequestException:
        return None
    return None


def enrich_entry_with_doi(entry, error_log):
    info = extract_key_info(entry)
    title = info.get('title')
    authors = info.get('authors')
    year = info.get('year')
    doi = query_crossref(title, authors, year)
    if doi:
        for idx, line in enumerate(entry):
            if line.startswith('ER  -'):
                entry.insert(idx, f'DO  - {doi}')
                break
    return doi


def main():
    try:
        with open(input_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
    except FileNotFoundError:
        print(f"无法找到输入文件: {input_path}")
        return

    entries = parse_ris_entries(lines)
    missing_count = 0
    enriched_count = 0
    error_log = []  # 超时或错误需后续处理
    found_entries = []  # 存放成功补全条目信息

    with open(output_path, 'w', encoding='utf-8') as fout, open(log_path, 'w', encoding='utf-8') as flog:
        for entry in entries:
            if any(line.startswith('DO  -') for line in entry):
                fout.write('\n'.join(entry) + '\n\n')
            else:
                doi = enrich_entry_with_doi(entry, error_log)
                info = extract_key_info(entry)
                title = info.get('title', '<无标题>')
                year = info.get('year', '<未知年份>')
                if doi:
                    enriched_count += 1
                    found_entries.append({'title': title, 'year': year, 'doi': doi})
                    # 实时打印获取到 DOI 的文献
                    print(f"{enriched_count}. {title} ({year}) - DOI: {doi}")
                    fout.write('\n'.join(entry) + '\n\n')
                else:
                    missing_count += 1
                    authors = '; '.join(info.get('authors', ['<未知作者>']))
                    flog.write(f"{missing_count}. 标题: {title}\n")
                    flog.write(f"   作者: {authors}\n")
                    flog.write(f"   出版年: {year}\n")
                    flog.write(f"   原始条目行数: {len(entry)}\n---\n")

    if error_log:
        with open(error_log_path, 'w', encoding='utf-8') as ef:
            for idx, info in enumerate(error_log, 1):
                ef.write(f"{idx}. 标题: {info.get('title','<无标题>')}\n")
                ef.write(f"   作者: {'; '.join(info.get('authors', ['<未知作者>']))}\n")
                ef.write(f"   出版年: {info.get('year','<未知年份>')}\n---\n")

    print(f"处理完成：共 {len(entries)} 条文献；缺失 DOI 的有 {missing_count} 条，成功补全 DOI 的有 {enriched_count} 条。")
    print(f"输出文件：{output_path}，日志文件：{log_path}")
    if found_entries:
        print(f"成功补全 DOI 的文献总览已打印。共有 {len(found_entries)} 条。")

if __name__ == '__main__':
    main()

1. Mpl: Mine the Pseudo Labels for Semi-Supervised Temporal Action Localization (2025) - DOI: 10.2139/ssrn.5215075
2. Graph-Based Framework for Temporal Human Action Recognition and Segmentation in Industrial Context (2025) - DOI: 10.2139/ssrn.5208198
3. Once and for All: Linking-Free Online Action Detection (2025) - DOI: 10.2139/ssrn.5200324
4. DSTAdapter:Divided Spatial-Temporal Adapter Fine-tuning Method for Sign Language Recognition (2025) - DOI: 10.21203/rs.3.rs-6259023/v1
5. Machine Learning-Powered Automatic Detection and Prediction of Crack and Corrosion Using Spatiotemporal Measurement from Distributed Fiber Optic Sensors (2025) - DOI: 10.2139/ssrn.5189560
6. Temporal Segmentation Modeling with Sample Augmentation for Moving Infrared Small Target Detection (2025) - DOI: 10.2139/ssrn.5181885
7. Integrated Seasonal-Trend Decomposition Using Loess for Multi-Head Self-Attention Mechanism and Bidirectional Long Short-Term Memory Based Reference Evapotranspiration Prediction (2025) 

In [1]:
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET
import time
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

# 全局锁用于控制API请求速率和输出同步
request_lock = threading.Lock()
last_request_time = 0

def get_arxiv_doi(title):
    """使用标题查询arXiv API，获取arXiv ID并构造DOI。"""
    global last_request_time
    
    # 如果标题包含冒号，只使用冒号后面的部分
    if ':' in title:
        title = title.split(':', 1)[1].strip()
    
    encoded_title = urllib.parse.quote(title)
    url = f'http://export.arxiv.org/api/query?search_query=ti:{encoded_title}&max_results=1'
    
    try:
        # 确保请求间隔至少3秒
        with request_lock:
            current_time = time.time()
            time_since_last = current_time - last_request_time
            if time_since_last < 3:
                time.sleep(3 - time_since_last)
            last_request_time = time.time()
        
        with urllib.request.urlopen(url) as response:
            data = response.read()
        
        root = ET.fromstring(data)
        ns = {'atom': 'http://www.w3.org/2005/Atom', 'arxiv': 'http://arxiv.org/schemas/atom'}
        
        entry = root.find('atom:entry', ns)
        if entry is None:
            return None, f"未找到标题为 '{title}' 的文章。"
        
        id_element = entry.find('atom:id', ns)
        if id_element is None or not id_element.text:
            return None, "无法获取arXiv ID。"
        
        arxiv_id = id_element.text.split('/')[-1]  # 例如：2504.13460v3
        doi_element = entry.find('arxiv:doi', ns)
        if doi_element is not None and doi_element.text:
            return doi_element.text, f"DOI: {doi_element.text}"
        
        constructed_doi = f"10.48550/arXiv.{arxiv_id}"
        return constructed_doi, f"构造的DOI: {constructed_doi} (元数据中无显式DOI)"
            
    except urllib.error.URLError as e:
        return None, f"获取数据时出错: {e}"
    except ET.ParseError:
        return None, "解析XML响应时出错。"

def parse_ris_file(file_path):
    """解析RIS文件，将其分为条目列表，每个条目为行列表。"""
    entries = []
    current_entry = []
    
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                current_entry.append(line)
            if line.startswith('ER  -'):
                entries.append(current_entry)
                current_entry = []
    
    return entries

def process_single_entry(entry, index):
    """处理单个RIS条目，返回处理后的条目和消息。"""
    publisher = None
    title = None
    
    for line in entry:
        if line.startswith('PB  -'):
            publisher = line[6:].strip()
        elif line.startswith('TI  -'):
            title = line[6:].strip()
    
    if publisher == 'arXiv' and title:
        with request_lock:
            print(f"\n处理标题: {title}")
        doi, message = get_arxiv_doi(title)
        with request_lock:
            print(message)
        
        if doi:
            new_entry = [line for line in entry if not line.startswith('ER  -')]
            new_entry.append(f'DO  - {doi}')
            new_entry.append('ER  -')
            return index, new_entry, True
        else:
            return index, entry, True
    else:
        return index, entry, False

def process_ris_entries(entries):
    """使用多线程处理RIS条目，为arXiv条目添加DOI，并分类条目。"""
    arxiv_entries = [None] * len(entries)  # 保持顺序
    other_entries = []
    max_workers = 4  # 控制最大线程数，避免过载arXiv API
    
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # 提交所有arXiv条目处理任务
        future_to_index = {
            executor.submit(process_single_entry, entry, i): i
            for i, entry in enumerate(entries)
        }
        
        # 收集结果
        for future in as_completed(future_to_index):
            index, processed_entry, is_arxiv = future.result()
            if is_arxiv:
                arxiv_entries[index] = processed_entry
            else:
                other_entries.append(processed_entry)
    
    # 过滤掉None（非arXiv条目已在other_entries中）
    arxiv_entries = [e for e in arxiv_entries if e is not None]
    
    return arxiv_entries, other_entries

def write_ris_file(entries, output_path):
    """将条目写入新的RIS文件。"""
    with open(output_path, 'w', encoding='utf-8') as f:
        for entry in entries:
            for line in entry:
                f.write(line + '\n')
            f.write('\n')

def main(input_path, output_path):
    print(f"读取RIS文件: {input_path}")
    entries = parse_ris_file(input_path)
    print(f"输入文件中找到 {len(entries)} 个条目。")
    
    arxiv_entries, other_entries = process_ris_entries(entries)
    print(f"\n处理了 {len(arxiv_entries)} 个arXiv条目和 {len(other_entries)} 个非arXiv条目。")
    
    # 合并条目：先arXiv，再其他
    all_entries = arxiv_entries + other_entries
    print(f"将 {len(all_entries)} 个条目写入输出文件: {output_path}")
    write_ris_file(all_entries, output_path)
    print("处理完成。")

if __name__ == "__main__":
    input_path = r"D:\Programs\Codes\Skill-Up\search-for-papers\data\arxiv\html_result\arxiv_results_multi_1647_tad_tal.ris"
    output_path = r"D:\Programs\Codes\Skill-Up\search-for-papers\data\arxiv\html_result\arxiv_results_multi_1647_tad_tal_updated_muilt.ris"
    main(input_path, output_path)

读取RIS文件: D:\Programs\Codes\Skill-Up\search-for-papers\data\arxiv\html_result\arxiv_results_multi_1647_tad_tal.ris
输入文件中找到 1647 个条目。

处理标题: Chain-of-Thought Textual Reasoning for Few-shot Temporal Action Localization

处理标题: Attention in Diffusion Model: A Survey

处理标题: FDDet: Frequency-Decoupling for Boundary Refinement in Temporal Action Detection

处理标题: SMILE: Infusing Spatial and Motion Semantics in Masked Video Learning
构造的DOI: 10.48550/arXiv.2504.13460v3 (元数据中无显式DOI)

处理标题: Chapter-Llama: Efficient Chaptering in Hour-Long Videos with LLMs
构造的DOI: 10.48550/arXiv.1708.05296v1 (元数据中无显式DOI)
构造的DOI: 10.48550/arXiv.2007.06866v1 (元数据中无显式DOI)

处理标题: Towards Precise Action Spotting: Addressing Temporal Misalignment in Labels with Dynamic Label Assignment

处理标题: Modeling Multiple Normal Action Representations for Error Detection in Procedural Tasks
构造的DOI: 10.48550/arXiv.2504.00527v1 (元数据中无显式DOI)

处理标题: Automatic Detection of Intro and Credits in Video using CLIP and Multihead Attention
构造的D