In [1]:
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET

def get_arxiv_doi(title):
    # Encode the title for URL safety and construct the query
    encoded_title = urllib.parse.quote(title)
    url = f'http://export.arxiv.org/api/query?search_query=ti:{encoded_title}&max_results=1'
    
    try:
        # Send request to arXiv API
        with urllib.request.urlopen(url) as response:
            data = response.read()
        
        # Parse XML response
        root = ET.fromstring(data)
        
        # Define namespaces
        ns = {
            'atom': 'http://www.w3.org/2005/Atom',
            'arxiv': 'http://arxiv.org/schemas/atom'
        }
        
        # Find the first entry
        entry = root.find('atom:entry', ns)
        if entry is None:
            return "No articles found for the given title."
        
        # Extract arXiv ID
        id_element = entry.find('atom:id', ns)
        if id_element is None or not id_element.text:
            return "Could not retrieve arXiv ID."
        arxiv_id = id_element.text.split('/')[-1]  # e.g., 2504.13460v3
        
        # Check for existing DOI
        doi_element = entry.find('arxiv:doi', ns)
        if doi_element is not None and doi_element.text:
            return f"DOI: {doi_element.text}"
        
        # Construct DOI from arXiv ID
        constructed_doi = f"10.48550/arXiv.{arxiv_id}"
        return f"Constructed DOI: {constructed_doi} (no explicit DOI in metadata)"
            
    except urllib.error.URLError as e:
        return f"Error fetching data: {e}"
    except ET.ParseError:
        return "Error parsing XML response."

if __name__ == "__main__":
    # Test with the provided title
    title = "Chain-of-Thought Textual Reasoning for Few-shot Temporal Action Localization"
    result = get_arxiv_doi(title)
    print(result)

Constructed DOI: 10.48550/arXiv.2504.13460v3 (no explicit DOI in metadata)


In [None]:
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET
import time

def get_arxiv_doi(title):
    """使用标题查询arXiv API，获取arXiv ID并构造DOI。"""
    # 如果标题包含冒号，只使用冒号后面的部分
    if ':' in title:
        title = title.split(':', 1)[1].strip()
    
    encoded_title = urllib.parse.quote(title)
    url = f'http://export.arxiv.org/api/query?search_query=ti:{encoded_title}&max_results=1'
    
    try:
        with urllib.request.urlopen(url) as response:
            data = response.read()
        
        root = ET.fromstring(data)
        ns = {'atom': 'http://www.w3.org/2005/Atom', 'arxiv': 'http://arxiv.org/schemas/atom'}
        
        entry = root.find('atom:entry', ns)
        if entry is None:
            return None, f"未找到标题为 '{title}' 的文章。"
        
        id_element = entry.find('atom:id', ns)
        if id_element is None or not id_element.text:
            return None, "无法获取arXiv ID。"
        
        arxiv_id = id_element.text.split('/')[-1]  # 例如：2504.13460v3
        doi_element = entry.find('arxiv:doi', ns)
        if doi_element is not None and doi_element.text:
            return doi_element.text, f"DOI: {doi_element.text}"
        
        constructed_doi = f"10.48550/arXiv.{arxiv_id}"
        return constructed_doi, f"构造的DOI: {constructed_doi} (元数据中无显式DOI)"
            
    except urllib.error.URLError as e:
        return None, f"获取数据时出错: {e}"
    except ET.ParseError:
        return None, "解析XML响应时出错。"

def parse_ris_file(file_path):
    """解析RIS文件，将其分为条目列表，每个条目为行列表。"""
    entries = []
    current_entry = []
    
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                current_entry.append(line)
            if line.startswith('ER  -'):
                entries.append(current_entry)
                current_entry = []
    
    return entries

def process_ris_entries(entries):
    """处理RIS条目，为arXiv条目添加DOI，并分类条目。"""
    arxiv_entries = []
    other_entries = []
    
    for entry in entries:
        publisher = None
        title = None
        
        # 提取PB和TI
        for line in entry:
            if line.startswith('PB  -'):
                publisher = line[6:].strip()
            elif line.startswith('TI  -'):
                title = line[6:].strip()
        
        if publisher == 'arXiv' and title:
            print(f"\n处理标题: {title}")
            doi, message = get_arxiv_doi(title)
            print(message)
            
            if doi:
                # 在ER之前插入DO字段
                new_entry = [line for line in entry if not line.startswith('ER  -')]
                new_entry.append(f'DO  - {doi}')
                new_entry.append('ER  -')
                arxiv_entries.append(new_entry)
            else:
                # 如果DOI获取失败，保持条目不变
                arxiv_entries.append(entry)
            
            # 遵守arXiv API速率限制
            time.sleep(0)
        else:
            other_entries.append(entry)
    
    return arxiv_entries, other_entries

def write_ris_file(entries, output_path):
    """将条目写入新的RIS文件。"""
    with open(output_path, 'w', encoding='utf-8') as f:
        for entry in entries:
            for line in entry:
                f.write(line + '\n')
            f.write('\n')

def main(input_path, output_path):
    print(f"读取RIS文件: {input_path}")
    entries = parse_ris_file(input_path)
    print(f"输入文件中找到 {len(entries)} 个条目。")
    
    arxiv_entries, other_entries = process_ris_entries(entries)
    print(f"\n处理了 {len(arxiv_entries)} 个arXiv条目和 {len(other_entries)} 个非arXiv条目。")
    
    # 合并条目：先arXiv，再其他
    all_entries = arxiv_entries + other_entries
    print(f"将 {len(all_entries)} 个条目写入输出文件: {output_path}")
    write_ris_file(all_entries, output_path)
    print("处理完成。")

if __name__ == "__main__":
    input_path = r"D:\Programs\Codes\Skill-Up\search-for-papers\data\arxiv\html_result\arxiv_results_multi_1647_tad_tal.ris"
    output_path = r"D:\Programs\Codes\Skill-Up\search-for-papers\data\arxiv\html_result\arxiv_results_multi_1647_tad_tal_updated.ris"
    main(input_path, output_path)

读取RIS文件: D:\Programs\Codes\Skill-Up\search-for-papers\data\arxiv\html_result\arxiv_results_multi_1647_tad_tal.ris
输入文件中找到 1647 个条目。

处理标题: Chain-of-Thought Textual Reasoning for Few-shot Temporal Action Localization
构造的DOI: 10.48550/arXiv.2504.13460v3 (元数据中无显式DOI)

处理标题: Attention in Diffusion Model: A Survey
构造的DOI: 10.48550/arXiv.1708.05296v1 (元数据中无显式DOI)

处理标题: FDDet: Frequency-Decoupling for Boundary Refinement in Temporal Action Detection
构造的DOI: 10.48550/arXiv.2007.06866v1 (元数据中无显式DOI)

处理标题: SMILE: Infusing Spatial and Motion Semantics in Masked Video Learning
构造的DOI: 10.48550/arXiv.2504.00527v1 (元数据中无显式DOI)

处理标题: Chapter-Llama: Efficient Chaptering in Hour-Long Videos with LLMs
构造的DOI: 10.48550/arXiv.2504.00072v1 (元数据中无显式DOI)

处理标题: Towards Precise Action Spotting: Addressing Temporal Misalignment in Labels with Dynamic Label Assignment
构造的DOI: 10.48550/arXiv.2504.00149v1 (元数据中无显式DOI)

处理标题: Modeling Multiple Normal Action Representations for Error Detection in Procedural Task

: 

In [None]:
# 筛选出没有DO的条目
def parse_ris_entries(ris_text):
    entries = ris_text.strip().split('\nER  -')
    entries = [entry.strip() + '\nER  -' for entry in entries if entry.strip()]
    return entries

def filter_entries_without_doi(entries):
    no_doi_entries = []
    for entry in entries:
        if 'DO  -' not in entry:
            no_doi_entries.append(entry)
    return no_doi_entries

# 示例 RIS 内容（可替换为从文件读取）
with open(R'D:\Programs\Codes\Skill-Up\search-for-papers\data\scopus\3892_tad_tal_scopus_20250515.ris', 'r', encoding='utf-8') as f:
    ris_content = f.read()

# 处理和筛选
entries = parse_ris_entries(ris_content)
no_doi_entries = filter_entries_without_doi(entries)

# 打印结果
for i, entry in enumerate(no_doi_entries, 1):
    print(f"Entry {i} without DOI:\n{entry}\n{'-'*40}")


Entry 1 without DOI:
TY  - CONF
AU  - Xiong, T.
AU  - Wei, W.
AU  - Xu, K.
AU  - Chen, D.
TI  - SA-DETR:Span Aware Detection Transformer for Moment Retrieval
PY  - 2025
T2  - Proceedings - International Conference on Computational Linguistics, COLING
VL  - Part F206484-1
SP  - 7634
EP  - 7647
UR  - https://www.scopus.com/inward/record.uri?eid=2-s2.0-85218506386&partnerID=40&md5=a00effe39f4c900caf8b1435095241e5
AD  - Cognitive Computing and Intelligent Information Processing (CCIIP) Laboratory, School of Computer Science and Technology, Huazhong University of Science and Technology, China
AD  - Joint Laboratory of HUST and Pingan Property & Casualty Research (HPL), China
AD  - Ping An Property & Casualty Insurance company of China, Ltd., China
AB  - Moment Retrieval aims to locate specific video segments related to the given text. Recently, DETR-based methods, originating from Object Detection, have emerged as effective solutions for Moment Retrieval. These approaches focus on multimoda

In [1]:
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET
import requests
from requests.exceptions import ReadTimeout, RequestException
import time

input_path = r"D:\Programs\Codes\Skill-Up\search-for-papers\data\scopus\3892_tad_tal_scopus_20250515.ris"
output_path = r"D:\Programs\Codes\Skill-Up\search-for-papers\data\scopus\3892_tad_tal_scopus_20250515_updated.ris"
log_path = r"../../data/all/arxiv_results_multi_1647_tad_tal_withdo.log"
error_log_path = r"../../data/all/arxiv_results_multi_1647_tad_tal_withdo_error.log"


def parse_ris_entries(lines):
    entries = []
    current = []
    for line in lines:
        line = line.strip()
        if line.startswith("TY  -"):
            if current:
                entries.append(current)
            current = [line]
        elif current:
            current.append(line)
            if line.startswith("ER  -"):
                entries.append(current)
                current = []
    if current:
        entries.append(current)
    return entries


def extract_key_info(entry):
    info = {}
    for line in entry:
        if line.startswith("TI  -") and 'title' not in info:
            info['title'] = line[6:].strip()
        elif line.startswith("AU  -"):
            info.setdefault('authors', []).append(line[6:].strip())
        elif line.startswith("PY  -") and 'year' not in info:
            info['year'] = line[6:].strip()
        elif line.startswith("PB  -") and 'publisher' not in info:
            info['publisher'] = line[6:].strip()
    return info


def get_arxiv_doi(title):
    # Encode the title for URL safety and construct the query
    encoded_title = urllib.parse.quote(title)
    url = f'http://export.arxiv.org/api/query?search_query=ti:{encoded_title}&max_results=1'
    
    try:
        # Send request to arXiv API
        with urllib.request.urlopen(url) as response:
            data = response.read()
        
        # Parse XML response
        root = ET.fromstring(data)
        
        # Define namespaces
        ns = {
            'atom': 'http://www.w3.org/2005/Atom',
            'arxiv': 'http://arxiv.org/schemas/atom'
        }
        
        # Find the first entry
        entry = root.find('atom:entry', ns)
        if entry is None:
            return None
        
        # Extract arXiv ID
        id_element = entry.find('atom:id', ns)
        if id_element is None or not id_element.text:
            return None
        arxiv_id = id_element.text.split('/')[-1]  # e.g., 2504.13460v3
        
        # Check for existing DOI
        doi_element = entry.find('arxiv:doi', ns)
        if doi_element is not None and doi_element.text:
            return doi_element.text
        
        # Construct DOI from arXiv ID
        return f"10.48550/arXiv.{arxiv_id}"
            
    except (urllib.error.URLError, ET.ParseError):
        return None


def query_crossref(title, authors=None, year=None):
    params = {'query.title': title, 'rows': 1}
    if year:
        params['filter'] = f'from-pub-date:{year},until-pub-date:{year}'
    if authors:
        params['query.author'] = authors[0]
    try:
        response = requests.get("https://api.crossref.org/works", params=params, timeout=10)
        response.raise_for_status()
        items = response.json().get('message', {}).get('items', [])
        if items:
            return items[0].get('DOI')
    except ReadTimeout:
        raise
    except RequestException:
        return None
    return None


def insert_doi(entry, doi):
    for idx, line in enumerate(entry):
        if line.startswith("ER  -"):
            entry.insert(idx, f"DO  - {doi}")
            break


def main():
    try:
        with open(input_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
    except FileNotFoundError:
        print(f"无法找到输入文件: {input_path}")
        return

    entries = parse_ris_entries(lines)
    error_log = []
    missing = []

    with open(output_path, 'w', encoding='utf-8') as fout, \
         open(log_path, 'w', encoding='utf-8') as flog:

        for entry in entries:
            # 检查是否已有 DO
            has_doi = any(line.startswith('DO  -') for line in entry)
            
            if has_doi:
                # 直接写入已有 DOI 的条目
                for ln in entry:
                    fout.write(ln + '\n')
                fout.write('\n')
                continue

            # 只有没有 DOI 的条目才进行处理
            info = extract_key_info(entry)
            title = info.get('title', '<无标题>')
            authors = info.get('authors', [])
            year = info.get('year', '')
            publisher = info.get('publisher', '')

            doi = None
            # 首先尝试 arXiv
            if 1:
                doi = get_arxiv_doi(title)
                if doi:
                    print(f"arXiv DOI found: {title} -> {doi}")
                    insert_doi(entry, doi)
                else:
                    time.sleep(0.1)  # 遵守 arXiv 请求频率限制

            # 如果 arXiv 没找到，尝试 Crossref
            if not doi:
                try:
                    doi = query_crossref(title, authors, year)
                    if doi:
                        print(f"Crossref DOI found: {title} -> {doi}")
                        insert_doi(entry, doi)
                except ReadTimeout:
                    error_log.append(info)
                    print(f"Timeout error for: {title}")
                    flog.write(f"Timeout: {title} | Authors: {'; '.join(authors)} | Year: {year}\n")
                    # 即使超时，也写入原条目

            # 如果仍然没有 DOI，记录并输出
            if not doi:
                print(f"Failed to find DOI for: {title}")
                missing.append(title)
                flog.write(f"Missing: {title} | Authors: {'; '.join(authors)} | Year: {year}\n")

            # 写入条目（无论是否找到 DOI）
            for ln in entry:
                fout.write(ln + '\n')
            fout.write('\n')

    # 写入错误日志
    if error_log:
        with open(error_log_path, 'w', encoding='utf-8') as ef:
            for idx, info in enumerate(error_log, 1):
                ef.write(f"{idx}. 标题: {info.get('title', '<无标题>')}\n")
                ef.write(f"   作者: {'; '.join(info.get('authors', ['<未知作者>']))}\n")
                ef.write(f"   出版年: {info.get('year', '<未知年份>')}\n---\n")

    print("\n处理完成。")
    print(f"总共处理条目数：{len(entries)}")
    print(f"无DOI且获取失败的条目数：{len(missing)}")
    print(f"超时/需重试条目数：{len(error_log)}")
    print(f"输出文件路径：{output_path}")


if __name__ == '__main__':
    main()

Crossref DOI found: SA-DETR:Span Aware Detection Transformer for Moment Retrieval -> 10.1016/j.compag.2025.110373
Crossref DOI found: VideoQA-TA: Temporal-Aware Multi-Modal Video Question Answering -> 10.1109/tcsvt.2024.3490665
arXiv DOI found: 18th European Conference on Computer Vision, ECCV 2024 -> 10.4204/EPTCS.282
arXiv DOI found: 18th European Conference on Computer Vision, ECCV 2024 -> 10.4204/EPTCS.282
arXiv DOI found: 18th European Conference on Computer Vision, ECCV 2024 -> 10.4204/EPTCS.282
arXiv DOI found: 18th European Conference on Computer Vision, ECCV 2024 -> 10.4204/EPTCS.282
arXiv DOI found: 18th European Conference on Computer Vision, ECCV 2024 -> 10.4204/EPTCS.282
arXiv DOI found: 18th European Conference on Computer Vision, ECCV 2024 -> 10.4204/EPTCS.282
arXiv DOI found: 18th European Conference on Computer Vision, ECCV 2024 -> 10.4204/EPTCS.282
arXiv DOI found: 18th European Conference on Computer Vision, ECCV 2024 -> 10.4204/EPTCS.282
arXiv DOI found: 18th Europe