In [1]:
# 1.将SingFile导出的html文件进行解析生成ris文件
# 首先鉴于当前的scopus并不支持对于arxiv的批量导出，同时arxiv尽管是预印版，但是有着一些高引的文章，值得参考
# 51s 1326篇文献
import re
import os
from datetime import datetime
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor

# 清洗文本
def clean_text(text):
    if not text:
        return ""
    return re.sub(r'\s+', ' ', text).strip()

# 处理单个 HTML 文件，提取文献信息
def process_html_file(filepath):
    with open(filepath, "r", encoding="utf-8") as f:
        html = f.read()
    soup = BeautifulSoup(html, 'html.parser')
    current_time = datetime.now().strftime("%Y/%m/%d/%H:%M:%S")
    documents = []
    rows = soup.find_all('tr', class_='TableItems-module__m0Z0b')

    for row in rows:
        preprint_span = row.find('span', string=re.compile('Preprint.*开放获取'))
        if preprint_span and not row.find('h3'):
            continue

        title_div = row.find('div', class_='TableItems-module__sHEzP')
        if title_div and title_div.find('h3'):
            doc = {}
            title_span = title_div.find('h3').find('span')
            doc['title'] = clean_text(title_span.get_text(strip=True)) if title_span else ''

            author_div = row.find('div', class_='author-list')
            authors = []
            if author_div:
                author_buttons = author_div.find_all('button')
                for button in author_buttons:
                    author_name = button.find('span', class_='Typography-module__lVnit')
                    if author_name:
                        authors.append(clean_text(author_name.get_text(strip=True)))
            doc['authors'] = authors

            source_div = row.find('div', class_='DocumentResultsList-module__tqiI3')
            doc['publisher'] = clean_text(source_div.find('span').get_text(strip=True)) if source_div else ''

            year_div = row.find('div', class_='TableItems-module__TpdzW')
            doc['year'] = clean_text(year_div.find('span').get_text(strip=True)) if year_div else ''

            abstract = ''
            current_row = row
            while True:
                next_row = current_row.find_next('tr')
                if not next_row:
                    break
                abstract_div = next_row.find('div', class_='Abstract-module__ukTwj')
                if abstract_div:
                    abstract = clean_text(abstract_div.get_text(strip=True))
                    break
                current_row = next_row
            doc['abstract'] = abstract
            doc['current_time'] = current_time
            documents.append(doc)
    return documents

# 将文献信息写入 RIS 格式
def write_ris(documents, output_path):
    with open(output_path, "a", encoding="utf-8") as f:
        for doc in documents:
            f.write("TY  - GEN\n")
            for author in doc['authors']:
                f.write(f"AU  - {author}\n")
            f.write(f"TI  - {doc['title']}\n")
            if doc['abstract']:
                f.write(f"AB  - {doc['abstract']}\n")
            if doc['publisher']:
                f.write(f"PB  - {doc['publisher']}\n")
            if doc['year']:
                f.write(f"PY  - {doc['year']}\n")
            st = doc['title'].split(":")[0] if ":" in doc['title'] else doc['title']
            f.write(f"ST  - {st}\n")
            f.write(f"Y2  - {doc['current_time']}\n")
            f.write("ER  -\n\n")

# 主函数：多线程读取多个文件并合并写入 RIS 文件
def main():
    html_dir = R"../../data/arxiv/html"
    ris_output_path = "../../data/arxiv/html_result/arxiv_results_multi.ris"
    
    filenames = [
        # "Scopus - 文献搜索结果 ｜ 已登录 (2025_5_5 21：53：23).html",
        # "Scopus - 文献搜索结果 ｜ 已登录 (2025_5_5 21：53：02).html",
        # "Scopus - 文献搜索结果 ｜ 已登录 (2025_5_5 21：52：37).html",
        # "Scopus - 文献搜索结果 ｜ 已登录 (2025_5_5 21：52：11).html",
        # "Scopus - 文献搜索结果 ｜ 已登录 (2025_5_5 21：51：47).html",
        # "Scopus - 文献搜索结果 ｜ 已登录 (2025_5_5 21：51：20).html",
        # "Scopus - 文献搜索结果 ｜ 已登录 (2025_5_5 21：49：05).html",
        R"D:\Programs\Codes\Skill-Up\search-for-papers\data\arxiv\html\Scopus - 文献搜索结果 ｜ 已登录 (2025_5_27 17：08：04).html"
    ]

    filepaths = [os.path.join(html_dir, name) for name in filenames]

    # 多线程处理文件
    all_documents = []
    with ThreadPoolExecutor(max_workers=7) as executor:
        results = list(executor.map(process_html_file, filepaths))
        for docs in results:
            all_documents.extend(docs)

    # 写入 RIS
    write_ris(all_documents, ris_output_path)
    print(f"已成功处理 {len(filepaths)} 个文件，共导出 {len(all_documents)} 条文献。")

if __name__ == "__main__":
    main()


已成功处理 1 个文件，共导出 7 条文献。


In [1]:
# 2.对于每次scopus更新的arxiv多出来的条目列出来，并且生成相应的ris文件
import re
import os
from datetime import datetime
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor

# 清洗文本
def clean_text(text):
    if not text:
        return ""
    return re.sub(r'\s+', ' ', text).strip()

# 解析 RIS 文件，提取条目
def parse_ris_file(ris_filepath):
    entries = []
    current_entry = {}
    with open(ris_filepath, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line.startswith("TY  -"):
                current_entry = {}
            elif line.startswith("ER  -"):
                if current_entry:
                    entries.append(current_entry)
            elif line:
                key, value = map(str.strip, line.split(" - ", 1))
                if key in current_entry:
                    if isinstance(current_entry[key], list):
                        current_entry[key].append(value)
                    else:
                        current_entry[key] = [current_entry[key], value]
                else:
                    current_entry[key] = value
    return entries

# 处理单个 HTML 文件，提取文献信息
def process_html_file(filepath):
    with open(filepath, "r", encoding="utf-8") as f:
        html = f.read()
    soup = BeautifulSoup(html, 'html.parser')
    current_time = datetime.now().strftime("%Y/%m/%d/%H:%M:%S")
    documents = []
    rows = soup.find_all('tr', class_='TableItems-module__m0Z0b')

    for row in rows:
        preprint_span = row.find('span', string=re.compile('Preprint.*开放获取'))
        if preprint_span and not row.find('h3'):
            continue

        title_div = row.find('div', class_='TableItems-module__sHEzP')
        if title_div and title_div.find('h3'):
            doc = {}
            title_span = title_div.find('h3').find('span')
            doc['title'] = clean_text(title_span.get_text(strip=True)) if title_span else ''

            author_div = row.find('div', class_='author-list')
            authors = []
            if author_div:
                author_buttons = author_div.find_all('button')
                for button in author_buttons:
                    author_name = button.find('span', class_='Typography-module__lVnit')
                    if author_name:
                        authors.append(clean_text(author_name.get_text(strip=True)))
            doc['authors'] = authors

            source_div = row.find('div', class_='DocumentResultsList-module__tqiI3')
            doc['publisher'] = clean_text(source_div.find('span').get_text(strip=True)) if source_div else ''

            year_div = row.find('div', class_='TableItems-module__TpdzW')
            doc['year'] = clean_text(year_div.find('span').get_text(strip=True)) if year_div else ''

            abstract = ''
            current_row = row
            while True:
                next_row = current_row.find_next('tr')
                if not next_row:
                    break
                abstract_div = next_row.find('div', class_='Abstract-module__ukTwj')
                if abstract_div:
                    abstract = clean_text(abstract_div.get_text(strip=True))
                    break
                current_row = next_row
            doc['abstract'] = abstract
            doc['current_time'] = current_time
            documents.append(doc)
    return documents

# 将文献信息写入 RIS 格式
def write_ris(documents, output_path):
    with open(output_path, "w", encoding="utf-8") as f:
        for doc in documents:
            f.write("TY  - GEN\n")
            for author in doc['authors']:
                f.write(f"AU  - {author}\n")
            f.write(f"TI  - {doc['title']}\n")
            if doc['abstract']:
                f.write(f"AB  - {doc['abstract']}\n")
            if doc['publisher']:
                f.write(f"PB  - {doc['publisher']}\n")
            if doc['year']:
                f.write(f"PY  - {doc['year']}\n")
            st = doc['title'].split(":")[0] if ":" in doc['title'] else doc['title']
            f.write(f"ST  - {st}\n")
            f.write(f"Y2  - {doc['current_time']}\n")
            f.write("ER  -\n\n")

# 检测重复并返回不重复的文献
def deduplicate_documents(new_documents, existing_ris_filepath):
    existing_entries = parse_ris_file(existing_ris_filepath)
    existing_titles = {entry['TI'].lower() for entry in existing_entries if 'TI' in entry}
    
    unique_documents = []
    duplicate_count = 0
    
    for doc in new_documents:
        if doc['title'].lower() not in existing_titles:
            unique_documents.append(doc)
        else:
            duplicate_count += 1
    
    return unique_documents, duplicate_count

# 主函数
def main():
    # 配置路径（需要用户修改）
    html_dir = "../../data/arxiv/html"  # HTML 文件目录
    existing_ris_filepath = "../../data/arxiv/html_result/arxiv_results_multi_1647_tad_tal.ris"  # 现有 RIS 文件
    output_ris_filepath = "../../data/arxiv/html_result/unique_arxiv_results.ris"  # 输出不重复 RIS 文件
    
    # HTML 文件列表（示例，用户需替换为实际文件名）
    filenames = [
        "Scopus - 文献搜索结果 ｜ 已登录 (2025_5_21 16：10：18).html",
    ]
    filepaths = [os.path.join(html_dir, name) for name in filenames]

    # 多线程处理 HTML 文件
    all_documents = []
    with ThreadPoolExecutor(max_workers=4) as executor:
        results = list(executor.map(process_html_file, filepaths))
        for docs in results:
            all_documents.extend(docs)

    # 去重
    unique_documents, duplicate_count = deduplicate_documents(all_documents, existing_ris_filepath)

    # 写入不重复的 RIS 文件
    if unique_documents:
        write_ris(unique_documents, output_ris_filepath)
    
    # 打印结果
    print(f"处理了 {len(filepaths)} 个 HTML 文件，共提取 {len(all_documents)} 条文献。")
    print(f"发现 {duplicate_count} 条重复文献。")
    print(f"导出了 {len(unique_documents)} 条不重复文献到 {output_ris_filepath}。")

if __name__ == "__main__":
    main()

处理了 1 个 HTML 文件，共提取 200 条文献。
发现 193 条重复文献。
导出了 7 条不重复文献到 ../../data/arxiv/html_result/unique_arxiv_results.ris。


In [9]:
import requests
from requests.exceptions import ReadTimeout, RequestException

# 用户直接输入文件路径
input_path = R"D:\Programs\Codes\Skill-Up\search-for-papers\data\arxiv\html_result\arxiv_results_multi_1647_tad_tal.ris"
output_path = R"D:\Programs\Codes\Skill-Up\search-for-papers\data\arxiv\html_result\arxiv_results_multi_1647_tad_tal_withdoi.ris"
log_path = R"../../data/all/arxiv_results_multi_1647_tad_tal_withdo.log"
error_log_path = R"../../data/all/arxiv_results_multi_1647_tad_tal_withdo.log"


def parse_ris_entries(lines):
    entries = []
    current = []
    for line in lines:
        line = line.rstrip('\n')
        if line.startswith('TY  -'):
            if current:
                entries.append(current)
            current = [line]
        elif current:
            current.append(line)
            if line.startswith('ER  -'):
                entries.append(current)
                current = []
    if current:
        entries.append(current)
    return entries


def extract_key_info(entry):
    info = {}
    for line in entry:
        if line.startswith('TI  -') and 'title' not in info:
            info['title'] = line[6:].strip()
        elif line.startswith('AU  -'):
            info.setdefault('authors', []).append(line[6:].strip())
        elif line.startswith('PY  -') and 'year' not in info:
            info['year'] = line[6:].strip()
    return info


def query_crossref(title, authors=None, year=None):
    params = {'query.title': title, 'rows': 1}
    if year:
        params['filter'] = f'from-pub-date:{year},until-pub-date:{year}'
    if authors:
        params['query.author'] = authors[0]
    try:
        response = requests.get('https://api.crossref.org/works', params=params, timeout=10)
        response.raise_for_status()
        items = response.json().get('message', {}).get('items', [])
        if items:
            return items[0].get('DOI')
    except ReadTimeout:
        # 超时，不影响主流程，将在后续重新处理
        raise
    except RequestException:
        # 其他网络错误
        return None
    return None


def enrich_entry_with_doi(entry, error_log):
    info = extract_key_info(entry)
    title = info.get('title')
    authors = info.get('authors')
    year = info.get('year')
    try:
        doi = query_crossref(title, authors, year)
    except ReadTimeout:
        # 记录超时条目以便后续重试
        error_log.append(info)
        return None
    if doi:
        for idx, line in enumerate(entry):
            if line.startswith('ER  -'):
                entry.insert(idx, f'DO  - {doi}')
                break
    return doi


def main():
    try:
        with open(input_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
    except FileNotFoundError:
        print(f"无法找到输入文件: {input_path}")
        return

    entries = parse_ris_entries(lines)
    missing_count = 0
    enriched_count = 0
    error_log = []  # 存放超时或 API 错误需后续处理的条目信息

    with open(output_path, 'w', encoding='utf-8') as fout, \
         open(log_path, 'w', encoding='utf-8') as flog:

        for entry in entries:
            if any(line.startswith('DO  -') for line in entry):
                for ln in entry:
                    fout.write(ln + '\n')
                fout.write('\n')
            else:
                doi = enrich_entry_with_doi(entry, error_log)
                if doi:
                    enriched_count += 1
                    for ln in entry:
                        fout.write(ln + '\n')
                    fout.write('\n')
                else:
                    missing_count += 1
                    info = extract_key_info(entry)
                    title = info.get('title', '<无标题>')
                    authors = '; '.join(info.get('authors', ['<未知作者>']))
                    year = info.get('year', '<未知年份>')
                    flog.write(f"{missing_count}. 标题: {title}\n")
                    flog.write(f"   作者: {authors}\n")
                    flog.write(f"   出版年: {year}\n")
                    flog.write(f"   原始条目行数: {len(entry)}\n")
                    flog.write("---\n")

    # 将需后续重试的条目写入单独日志
    if error_log:
        with open(error_log_path, 'w', encoding='utf-8') as ef:
            for idx, info in enumerate(error_log, 1):
                ef.write(f"{idx}. 标题: {info.get('title','<无标题>')}\n")
                ef.write(f"   作者: {'; '.join(info.get('authors', ['<未知作者>']))}\n")
                ef.write(f"   出版年: {info.get('year','<未知年份>')}\n")
                ef.write("---\n")

    print(f"处理完成：共 {len(entries)} 条文献；缺失 DOI 的有 {missing_count} 条，成功补全 DOI 的有 {enriched_count} 条。")
    if error_log:
        print(f"有 {len(error_log)} 条因网络超时或错误未处理，将记录至 {error_log_path} 以便后续重试。")
    print(f"输出文件：{output_path}，日志文件：{log_path}")


if __name__ == '__main__':
    main()

处理完成：共 1647 条文献；缺失 DOI 的有 110 条，成功补全 DOI 的有 1537 条。
有 50 条因网络超时或错误未处理，将记录至 ../../data/all/arxiv_results_multi_1647_tad_tal_withdo.log 以便后续重试。
输出文件：D:\Programs\Codes\Skill-Up\search-for-papers\data\arxiv\html_result\arxiv_results_multi_1647_tad_tal_withdoi.ris，日志文件：../../data/all/arxiv_results_multi_1647_tad_tal_withdo.log


In [4]:
#!/usr/bin/env python3
"""
脚本功能：
1. 读取一个 .ris 文件，
2. 提取所有条目（以 'TY  -' 开始，以 'ER  -' 结束），
3. 判断每个条目中是否包含 DOI（以 'DO  -' 开头的行），
4. 将包含 DOI 的条目写入新的 .ris 文件，
5. 将缺失 DOI 的条目的关键信息（如 TI、AU、PY）记录到日志文件。

使用方法：
    直接运行脚本，脚本会提示输入文件路径和输出文件路径，无需命令行参数。
"""

# 用户直接输入文件路径
input_path = "../../data/all/4500_4514_6415_arxiv_results_multi_1640_tad_tal_3890_tad_tal_scopus_20250509_886_tad_tal_wos_20250509_885_dep.ris"
output_path = "../../data/all/4500_4514_6415_arxiv_results_multi_1640_tad_tal_3890_tad_tal_scopus_20250509_886_tad_tal_wos_20250509_885_dep_doi.ris"
log_path = "../../data/all/4500_4514_6415_arxiv_results_multi_1640_tad_tal_3890_tad_tal_scopus_20250509_886_tad_tal_wos_20250509_885_dep_doi.log"


def parse_ris_entries(lines):
    """
    将 RIS 文件内容分割成若干条目，每条目是若干行列表。
    依据：条目以 'TY  -' 行开始，以 'ER  -' 行结束。"""
    entries = []
    current = []
    for line in lines:
        line = line.rstrip('\n')
        if line.startswith('TY  -'):
            if current:
                entries.append(current)
            current = [line]
        elif current:
            current.append(line)
            if line.startswith('ER  -'):
                entries.append(current)
                current = []
    if current:
        entries.append(current)
    return entries


def extract_key_info(entry):
    """
    从条目中提取关键信息，用于日志：
    标题 (TI)、作者 (AU)、出版年份 (PY)。"""
    info = {}
    for line in entry:
        if line.startswith('TI  -') and 'title' not in info:
            info['title'] = line[6:].strip()
        elif line.startswith('AU  -'):
            info.setdefault('authors', []).append(line[6:].strip())
        elif line.startswith('PY  -') and 'year' not in info:
            info['year'] = line[6:].strip()
    return info


def main():
    try:
        with open(input_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
    except FileNotFoundError:
        print(f"无法找到输入文件: {input_path}")
        return

    entries = parse_ris_entries(lines)
    missing_count = 0
    with open(output_path, 'w', encoding='utf-8') as fout, open(log_path, 'w', encoding='utf-8') as flog:
        for entry in entries:
            has_doi = any(line.startswith('DO  -') for line in entry)
            if has_doi:
                for ln in entry:
                    fout.write(ln + '\n')
                fout.write('\n')
            else:
                info = extract_key_info(entry)
                title = info.get('title', '<无标题>')
                authors = '; '.join(info.get('authors', ['<未知作者>']))
                year = info.get('year', '<未知年份>')
                flog.write(f"{missing_count+1}. 标题: {title}\n")
                flog.write(f"   作者: {authors}\n")
                flog.write(f"   出版年: {year}\n")
                flog.write(f"   原始条目行数: {len(entry)}\n")
                flog.write("---\n")
                missing_count += 1

    print(f"处理完成：共 {len(entries)} 条文献；缺失 DOI 的有 {missing_count} 条，已记录至 {log_path}。输出文件：{output_path}")


if __name__ == '__main__':
    main()

处理完成：共 4500 条文献；缺失 DOI 的有 806 条，已记录至 ../../data/all/4500_4514_6415_arxiv_results_multi_1640_tad_tal_3890_tad_tal_scopus_20250509_886_tad_tal_wos_20250509_885_dep_doi.log。输出文件：../../data/all/4500_4514_6415_arxiv_results_multi_1640_tad_tal_3890_tad_tal_scopus_20250509_886_tad_tal_wos_20250509_885_dep_doi.ris


In [1]:
# 这个主要是对于LNCS已经归档的文献找到其归属的会议
# 去除Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)的杂项
import re
from pathlib import Path

# 扩展会议缩写列表
abbrs = [
    'ECCV', 'ACCV', 'ACPR', 'CGI', 'DAGM-GCPR', 'ICANN', 'ICIAP', 'ICIC',
    'ICIG', 'ICIRA', 'ICONIP', 'ICPR', 'ISVC', 'MMM', 'PCM', 'PRCV',
    'PRICAI', 'SCIA', 'CAAI', 'NPC', 'ADMA', 'CVM', 'HCC', 'ICSI',
    'CCBR', 'ICFEM', 'MICCAI', 'DMAH', 'MICCAI', 'CAAI',
    'ICR', 'Euro-Par', 'MLDM', 'IbPRIA', 'ICPRAI',
    'ICCSA', 'ICPRAI', 'CAIP', 'ICDAR', 'CICAI', 'BDA'
    
]

# 处理单个 RIS 条目的函数
def process_ris_entry(entry: str) -> str:
    # 匹配 T2 和 N1 字段
    t2_match = re.search(r'^(T2  - .+)$', entry, re.MULTILINE)
    n1_match = re.search(r'^N1  - (.*?)$', entry, re.MULTILINE | re.DOTALL)

    if t2_match and n1_match:
        t2_value = t2_match.group(1)
        n1_content = n1_match.group(1)

        # 仅处理 Lecture Notes in Computer Science 条目
        if 'Lecture Notes in Computer Science' in t2_value:
            # 在 N1 中寻找缩写和年份，例如 'ECCV 2018'
            pattern = r'\b(' + '|'.join(map(re.escape, abbrs)) + r')\s+(\d{4})\b'
            match = re.search(pattern, n1_content)
            if match:
                abbr = match.group(1)
                year = match.group(2)
                new_t2 = f'T2  - {abbr} {year}'
                # 替换原有 T2 行
                entry = re.sub(r'^(T2  - .+)$', new_t2, entry, flags=re.MULTILINE)

    return entry

# 主处理函数
def process_ris_file(input_path: str, output_path: str):
    text = Path(input_path).read_text(encoding='utf-8')
    # 保留 ER 结束标记并分割条目
    raw_entries = re.split(r'\nER  -\s*\n', text.strip(), flags=re.DOTALL)
    processed = []

    for raw in raw_entries:
        entry = raw.strip()
        if not entry:
            continue
        # 恢复 ER 标记，处理后再添加
        processed_entry = process_ris_entry(entry + '\nER  -\n')
        processed.append(processed_entry)

    # 写入输出文件
    result = '\n'.join(processed).strip() + '\n'
    Path(output_path).write_text(result, encoding='utf-8')

# 命令行执行示例
if __name__ == '__main__':
    import argparse

    # parser = argparse.ArgumentParser(description='Process RIS file, updating T2 based on conference abbreviations.')
    # parser.add_argument('input', help='Input RIS file path')
    # parser.add_argument('output', help='Output RIS file path')
    # args = parser.parse_args()

    process_ris_file(R"D:\Programs\Codes\Skill-Up\search-for-papers\data\all\20250516_3892_888_1647-6427.ris", R"D:\Programs\Codes\Skill-Up\search-for-papers\data\all\20250516_3892_888_1647-6427_withoutLNCS.ris")
    # print(f"Processed RIS file saved to D:\Users\tang\Desktop\LNCS_20250511_303\LNCS_20250511_303.ris")

In [2]:
# 3.处理重复的条目，保留最详细的，并且在日志里面输出当前去重过的期刊 二重 去重逻辑 题目 DOI 现在存在bug -和空格区分不开
# 20250508最好的条目是arxiv\data\20250508_scopus_3837_tad_tal 20250508_wos_886_tad_tal_3891-3_3888_deduplication_end.ris
import uuid
import logging
import datetime
import re
from collections import defaultdict

def normalize_title(title):
    """Normalize title for comparison by removing case and punctuation."""
    return re.sub(r'[^\w\s]', '', title.lower()).strip()

def parse_ris_file(file_path):
    """Parse RIS file and return a list of entries."""
    entries = []
    current_entry = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if line == 'ER  -':
                if current_entry:
                    entries.append(current_entry)
                    current_entry = {}
            elif line:
                tag, value = line.split('  - ', 1) if '  - ' in line else (line, '')
                current_entry[tag] = current_entry.get(tag, []) + [value]
    return entries

def count_entry_fields(entry):
    """Count the number of fields (data lines) in an entry."""
    return sum(len(values) for values in entry.values())

def deduplicate_by_field(entries, field, normalize=False):
    """Deduplicate entries based on a specified field, keeping the one with most fields."""
    field_to_entries = defaultdict(list)
    for entry in entries:
        field_value = entry.get(field, [''])[0]
        if field_value:  # Only process entries with the field
            key = normalize_title(field_value) if normalize else field_value
            field_to_entries[key].append(entry)
    
    deduplicated = []
    log_messages = []
    
    for key, entries_group in field_to_entries.items():
        if len(entries_group) > 1:
            # Sort by number of fields (descending) and keep the one with most fields
            entries_group.sort(key=count_entry_fields, reverse=True)
            kept_entry = entries_group[0]
            deduplicated.append(kept_entry)
            # Log removed entries
            for removed_entry in entries_group[1:]:
                log_messages.append(
                    f"Removed duplicate entry with {field} '{key}' "
                    f"(kept {count_entry_fields(kept_entry)} fields, "
                    f"removed {count_entry_fields(removed_entry)} fields, "
                    f"title: '{removed_entry.get('TI', [''])[0]}')"
                )
        else:
            deduplicated.append(entries_group[0])
    
    # Add entries that didn't have the field
    for entry in entries:
        if not entry.get(field, [''])[0]:
            deduplicated.append(entry)
    
    return deduplicated, log_messages

def deduplicate_entries(entries):
    """Deduplicate entries first by TI, then by DO."""
    # Step 1: Deduplicate by TI
    entries, ti_log_messages = deduplicate_by_field(entries, 'TI', normalize=True)
    
    # Step 2: Deduplicate by DO
    entries, do_log_messages = deduplicate_by_field(entries, 'DO', normalize=False)
    
    return entries, ti_log_messages + do_log_messages

def write_ris_file(entries, output_path):
    """Write deduplicated entries to a new RIS file with a blank line between entries."""
    with open(output_path, 'w', encoding='utf-8') as file:
        for i, entry in enumerate(entries):
            for tag, values in entry.items():
                for value in values:
                    file.write(f"{tag}  - {value}\n")
            file.write("ER  -\n")
            if i < len(entries) - 1:  # Add blank line between entries, but not after the last
                file.write("\n")

def setup_logging():
    """Set up logging to a file."""
    logging.basicConfig(
        filename=f'deduplication_log_{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}.txt',
        level=logging.INFO,
        format='%(asctime)s - %(message)s'
    )

def main(input_file, output_file):
    setup_logging()
    
    # Parse RIS file
    entries = parse_ris_file(input_file)
    
    # Deduplicate entries
    deduplicated_entries, log_messages = deduplicate_entries(entries)
    
    # Write to output file
    write_ris_file(deduplicated_entries, output_file)
    
    # Log results
    for message in log_messages:
        logging.info(message)
    
    logging.info(f"Processed {len(entries)} entries, kept {len(deduplicated_entries)} entries")
    print(f"Deduplication complete. Output written to {output_file}")
    print(f"Log written to deduplication_log_*.txt")

if __name__ == "__main__":
    input_file = R"D:\Programs\Codes\Skill-Up\search-for-papers\data\all\20250516_3892_888_1647-6427_withoutLNCS.ris"  # Replace with your input RIS file path
    output_file = R"D:\Programs\Codes\Skill-Up\search-for-papers\data\all\20250516_3892_888_1647-6427_withoutLNCS_deplication.ris"  # Replace with your desired output RIS file path
    main(input_file, output_file)

Deduplication complete. Output written to D:\Programs\Codes\Skill-Up\search-for-papers\data\all\20250516_3892_888_1647-6427_withoutLNCS_deplication.ris
Log written to deduplication_log_*.txt


In [10]:
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET

def get_arxiv_doi(title):
    # Encode the title for URL safety and construct the query
    encoded_title = urllib.parse.quote(title)
    url = f'http://export.arxiv.org/api/query?search_query=ti:{encoded_title}&max_results=1'
    
    try:
        # Send request to arXiv API
        with urllib.request.urlopen(url) as response:
            data = response.read()
        
        # Parse XML response
        root = ET.fromstring(data)
        
        # Define namespaces
        ns = {
            'atom': 'http://www.w3.org/2005/Atom',
            'arxiv': 'http://arxiv.org/schemas/atom'
        }
        
        # Find the first entry
        entry = root.find('atom:entry', ns)
        if entry is None:
            return "No articles found for the given title."
        
        # Extract DOI
        doi_element = entry.find('arxiv:doi', ns)
        if doi_element is not None and doi_element.text:
            return doi_element.text
        else:
            return "No DOI available for this article."
            
    except urllib.error.URLError as e:
        return f"Error fetching data: {e}"
    except ET.ParseError:
        return "Error parsing XML response."

if __name__ == "__main__":
    # Get title input from user
    title = input("Enter the article title: ")
    result = get_arxiv_doi("Chain-of-Thought Textual Reasoning for Few-shot Temporal Action Localization")
    print(f"DOI: {result}")

DOI: No DOI available for this article.


In [1]:
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET
import requests
from requests.exceptions import ReadTimeout, RequestException
import time

input_path = r"D:\Programs\Codes\Skill-Up\search-for-papers\data\wos\889_tad_tal_wos_20250515_888.ris"
output_path = r"D:\Programs\Codes\Skill-Up\search-for-papers\data\wos\889_tad_tal_wos_20250515_888_updatedoi.ris"
log_path = r"../../data/all/arxiv_results_multi_1647_tad_tal_withdo.log"
error_log_path = r"../../data/all/arxiv_results_multi_1647_tad_tal_withdo_error.log"


def parse_ris_entries(lines):
    entries = []
    current = []
    for line in lines:
        line = line.strip()
        if line.startswith("TY  -"):
            if current:
                entries.append(current)
            current = [line]
        elif current:
            current.append(line)
            if line.startswith("ER  -"):
                entries.append(current)
                current = []
    if current:
        entries.append(current)
    return entries


def extract_key_info(entry):
    info = {}
    for line in entry:
        if line.startswith("TI  -") and 'title' not in info:
            info['title'] = line[6:].strip()
        elif line.startswith("AU  -"):
            info.setdefault('authors', []).append(line[6:].strip())
        elif line.startswith("PY  -") and 'year' not in info:
            info['year'] = line[6:].strip()
        elif line.startswith("PB  -") and 'publisher' not in info:
            info['publisher'] = line[6:].strip()
    return info


def get_arxiv_doi(title):
    if ':' in title:
        title = title.split(':', 1)[1].strip()
    encoded_title = urllib.parse.quote(title)
    url = f"http://export.arxiv.org/api/query?search_query=ti:{encoded_title}&max_results=1"

    try:
        with urllib.request.urlopen(url) as response:
            data = response.read()
        root = ET.fromstring(data)
        ns = {'atom': 'http://www.w3.org/2005/Atom', 'arxiv': 'http://arxiv.org/schemas/atom'}
        entry = root.find('atom:entry', ns)
        if entry is None:
            return None
        id_elem = entry.find('atom:id', ns)
        if id_elem is None or not id_elem.text:
            return None
        arxiv_id = id_elem.text.split('/')[-1]
        doi_elem = entry.find('arxiv:doi', ns)
        if doi_elem is not None and doi_elem.text:
            return doi_elem.text
        return f"10.48550/arXiv.{arxiv_id}"
    except:
        return None


def query_crossref(title, authors=None, year=None):
    params = {'query.title': title, 'rows': 1}
    if year:
        params['filter'] = f'from-pub-date:{year},until-pub-date:{year}'
    if authors:
        params['query.author'] = authors[0]
    try:
        response = requests.get("https://api.crossref.org/works", params=params, timeout=10)
        response.raise_for_status()
        items = response.json().get('message', {}).get('items', [])
        if items:
            return items[0].get('DOI')
    except ReadTimeout:
        raise
    except RequestException:
        return None
    return None


def insert_doi(entry, doi):
    for idx, line in enumerate(entry):
        if line.startswith("ER  -"):
            entry.insert(idx, f"DO  - {doi}")
            break


def main():
    try:
        with open(input_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
    except FileNotFoundError:
        print(f"无法找到输入文件: {input_path}")
        return

    entries = parse_ris_entries(lines)
    error_log = []
    enriched = []
    missing = []

    with open(output_path, 'w', encoding='utf-8') as fout, \
         open(log_path, 'w', encoding='utf-8') as flog:

        for entry in entries:
            if any(line.startswith('DO  -') for line in entry):
                for ln in entry:
                    fout.write(ln + '\n')
                fout.write('\n')
                continue

            info = extract_key_info(entry)
            title = info.get('title', '<无标题>')
            authors = info.get('authors', [])
            year = info.get('year', '')
            publisher = info.get('publisher', '')

            doi = None
            if publisher.lower() == 'arxiv':
                doi = get_arxiv_doi(title)
                if doi:
                    print(f"arXiv DOI: {title}")
                    insert_doi(entry, doi)
                else:
                    time.sleep(0.1)  # 遵守 arXiv 请求频率限制

            if not doi:
                try:
                    doi = query_crossref(title, authors, year)
                    if doi:
                        print(f"Crossref DOI: {title}")
                        insert_doi(entry, doi)
                except ReadTimeout:
                    error_log.append(info)
                    continue

            if not doi:
                print(f"Missing DOI: {title}")
                missing.append(title)
                flog.write(f"Missing: {title} | Authors: {'; '.join(authors)} | Year: {year}\n")

            for ln in entry:
                fout.write(ln + '\n')
            fout.write('\n')

    if error_log:
        with open(error_log_path, 'w', encoding='utf-8') as ef:
            for idx, info in enumerate(error_log, 1):
                ef.write(f"{idx}. 标题: {info.get('title', '<无标题>')}\n")
                ef.write(f"   作者: {'; '.join(info.get('authors', ['<未知作者>']))}\n")
                ef.write(f"   出版年: {info.get('year', '<未知年份>')}\n---\n")

    print("\n处理完成。")
    print(f"总共处理条目数：{len(entries)}")
    print(f"获取失败（无DOI）的条目数：{len(missing)}")
    print(f"超时/需重试条目数：{len(error_log)}")
    print(f"输出文件路径：{output_path}")


if __name__ == '__main__':
    main()

Missing DOI: Exploring Temporal Context and Human Movement Dynamics for Online Action Detection in Videos
Missing DOI: Weakly-Supervised Temporal Action Localization by Inferring Salient Snippet-Feature
Missing DOI: Self-Supervised Video Action Localization with Adversarial Temporal Transforms
Missing DOI: Learning Disentangled Classification and Localization Representations for Temporal Action Localization
Missing DOI: Large Receptive Field Boundary Matching Networks for Generating Better Proposals
Missing DOI: Temporal Perception and Reasoning in Videos
Missing DOI: Deep Learning for Action Understanding in Video
Missing DOI: Frequency Selective Augmentation for Video Representation Learning
Missing DOI: Bullying10K: A Large-Scale Neuromorphic Dataset towards Privacy-Preserving Bullying Recognition
Missing DOI: Anchor-Free Pipeline Temporal Action Localisation
Missing DOI: Visual Representation Learning with Progressive Data Scarcity

处理完成。
总共处理条目数：887
获取失败（无DOI）的条目数：11
超时/需重试条目数：45


In [5]:
# 筛选出没有DO的条目
def parse_ris_entries(ris_text):
    entries = ris_text.strip().split('\nER  -')
    entries = [entry.strip() + '\nER  -' for entry in entries if entry.strip()]
    return entries

def filter_entries_without_doi(entries):
    no_doi_entries = []
    for entry in entries:
        if 'DO  -' not in entry:
            no_doi_entries.append(entry)
    return no_doi_entries

# 示例 RIS 内容（可替换为从文件读取）
with open(R'D:\Programs\Codes\Skill-Up\search-for-papers\data\wos\889_tad_tal_wos_20250515_888.ris', 'r', encoding='utf-8') as f:
    ris_content = f.read()

# 处理和筛选
entries = parse_ris_entries(ris_content)
no_doi_entries = filter_entries_without_doi(entries)

# 打印结果
for i, entry in enumerate(no_doi_entries, 1):
    print(f"Entry {i} without DOI:\n{entry}\n{'-'*40}")


Entry 1 without DOI:
TY  - JOUR
AU  - Gwon, Huiwon
AU  - Jo, Hyejeong
AU  - Jo, Sunhee
AU  - Jung, Chanho
TI  - A Bi-directional Information Learning Method Using Reverse Playback Video for Fully Supervised Temporal Action Localization
TI  - 완전지도 시간적 행동 검출에서역재생 비디오를 이용한 양방향 정보 학습 방법
T2  - Journal of IKEEE
T2  - 전기전자학회논문지
M3  - research-article
AB  - Recently, research on temporal action localization has been actively conducted. In this paper, unlike existingmethods, we propose two approaches for learning bidirectional information by creating reverse playback videosfor fully supervised temporal action localization. One approach involves creating training data by combiningreverse playback videos and forward playback videos, while the other approach involves training separate modelson videos with different playback directions. Experiments were conducted on the THUMOS-14 dataset usingTALLFormer. When using both reverse and forward playback videos as training data, the performance was 5.1%l

In [2]:
# 查找完整的DOI号的完整代码
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET
import requests
from requests.exceptions import ReadTimeout, RequestException
import time

input_path = r"D:\Programs\Codes\Skill-Up\search-for-papers\data\arxiv\html_result\arxiv_results_multi.ris"
output_path = r"D:\Programs\Codes\Skill-Up\search-for-papers\data\arxiv\html_result\arxiv_results_multi_withdoi.ris"
log_path = r"../../data/all/arxiv_results_multi_1647_tad_tal_withdo.log"
error_log_path = r"../../data/all/arxiv_results_multi_1647_tad_tal_withdo_error.log"


def parse_ris_entries(lines):
    entries = []
    current = []
    for line in lines:
        line = line.strip()
        if line.startswith("TY  -"):
            if current:
                entries.append(current)
            current = [line]
        elif current:
            current.append(line)
            if line.startswith("ER  -"):
                entries.append(current)
                current = []
    if current:
        entries.append(current)
    return entries


def extract_key_info(entry):
    info = {}
    for line in entry:
        if line.startswith("TI  -") and 'title' not in info:
            info['title'] = line[6:].strip()
        elif line.startswith("AU  -"):
            info.setdefault('authors', []).append(line[6:].strip())
        elif line.startswith("PY  -") and 'year' not in info:
            info['year'] = line[6:].strip()
        elif line.startswith("PB  -") and 'publisher' not in info:
            info['publisher'] = line[6:].strip()
    return info


def get_arxiv_doi(title):
    if ':' in title:
        title = title.split(':', 1)[1].strip()
    encoded_title = urllib.parse.quote(title)
    url = f"http://export.arxiv.org/api/query?search_query=ti:{encoded_title}&max_results=1"

    try:
        with urllib.request.urlopen(url) as response:
            data = response.read()
        root = ET.fromstring(data)
        ns = {'atom': 'http://www.w3.org/2005/Atom', 'arxiv': 'http://arxiv.org/schemas/atom'}
        entry = root.find('atom:entry', ns)
        if entry is None:
            return None
        id_elem = entry.find('atom:id', ns)
        if id_elem is None or not id_elem.text:
            return None
        arxiv_id = id_elem.text.split('/')[-1]
        doi_elem = entry.find('arxiv:doi', ns)
        if doi_elem is not None and doi_elem.text:
            return doi_elem.text
        return f"10.48550/arXiv.{arxiv_id}"
    except:
        return None


def query_crossref(title, authors=None, year=None):
    params = {'query.title': title, 'rows': 1}
    if year:
        params['filter'] = f'from-pub-date:{year},until-pub-date:{year}'
    if authors:
        params['query.author'] = authors[0]
    try:
        response = requests.get("https://api.crossref.org/works", params=params, timeout=10)
        response.raise_for_status()
        items = response.json().get('message', {}).get('items', [])
        if items:
            return items[0].get('DOI')
    except ReadTimeout:
        raise
    except RequestException:
        return None
    return None


def insert_doi(entry, doi):
    for idx, line in enumerate(entry):
        if line.startswith("ER  -"):
            entry.insert(idx, f"DO  - {doi}")
            break


def main():
    try:
        with open(input_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
    except FileNotFoundError:
        print(f"无法找到输入文件: {input_path}")
        return

    entries = parse_ris_entries(lines)
    error_log = []
    missing = []

    with open(output_path, 'w', encoding='utf-8') as fout, \
         open(log_path, 'w', encoding='utf-8') as flog:

        for entry in entries:
            # 检查是否已有 DO
            has_doi = any(line.startswith('DO  -') for line in entry)
            
            if has_doi:
                # 直接写入已有 DOI 的条目
                for ln in entry:
                    fout.write(ln + '\n')
                fout.write('\n')
                continue

            # 只有没有 DOI 的条目才进行处理
            info = extract_key_info(entry)
            title = info.get('title', '<无标题>')
            authors = info.get('authors', [])
            year = info.get('year', '')
            publisher = info.get('publisher', '')

            doi = None
            # 首先尝试 arXiv
            if publisher.lower() == 'arxiv':
                doi = get_arxiv_doi(title)
                if doi:
                    print(f"arXiv DOI found: {title} -> {doi}")
                    insert_doi(entry, doi)
                else:
                    time.sleep(0.1)  # 遵守 arXiv 请求频率限制

            # 如果 arXiv 没找到，尝试 Crossref
            if not doi:
                try:
                    doi = query_crossref(title, authors, year)
                    if doi:
                        print(f"Crossref DOI found: {title} -> {doi}")
                        insert_doi(entry, doi)
                except ReadTimeout:
                    error_log.append(info)
                    print(f"Timeout error for: {title}")
                    flog.write(f"Timeout: {title} | Authors: {'; '.join(authors)} | Year: {year}\n")
                    # 即使超时，也写入原条目

            # 如果仍然没有 DOI，记录并输出
            if not doi:
                print(f"Failed to find DOI for: {title}")
                missing.append(title)
                flog.write(f"Missing: {title} | Authors: {'; '.join(authors)} | Year: {year}\n")

            # 写入条目（无论是否找到 DOI）
            for ln in entry:
                fout.write(ln + '\n')
            fout.write('\n')

    # 写入错误日志
    if error_log:
        with open(error_log_path, 'w', encoding='utf-8') as ef:
            for idx, info in enumerate(error_log, 1):
                ef.write(f"{idx}. 标题: {info.get('title', '<无标题>')}\n")
                ef.write(f"   作者: {'; '.join(info.get('authors', ['<未知作者>']))}\n")
                ef.write(f"   出版年: {info.get('year', '<未知年份>')}\n---\n")

    print("\n处理完成。")
    print(f"总共处理条目数：{len(entries)}")
    print(f"无DOI且获取失败的条目数：{len(missing)}")
    print(f"超时/需重试条目数：{len(error_log)}")
    print(f"输出文件路径：{output_path}")


if __name__ == '__main__':
    main()

arXiv DOI found: Learning Streaming Video Representation via Multitask Training -> 10.48550/arXiv.2504.20041v1
arXiv DOI found: Live: Learning Video LLM with Streaming Speech Transcription at Scale -> 10.48550/arXiv.2504.16030v1
arXiv DOI found: Bridge the Gap: From Weak to Full Supervision for Temporal Action Localization with PseudoFormer -> 10.48550/arXiv.2504.14860v1
arXiv DOI found: Grounding-MD: Grounded Video-language Pre-training for Open-World Moment Detection -> 10.48550/arXiv.2408.16990v2
arXiv DOI found: HDBFormer: Efficient RGB-D Semantic Segmentation with A Heterogeneous Dual-Branch Framework -> 10.1109/LSP.2024.3496588
Timeout error for: Msvt: Multi-Grained Spatial and Vmamba Temporal for Few-Shot Action Recognition
Failed to find DOI for: Msvt: Multi-Grained Spatial and Vmamba Temporal for Few-Shot Action Recognition
Timeout error for: F3SET: TOWARDS ANALYZING FAST, FREQUENT, AND FINE-GRAINED EVENTS FROM VIDEOS
Failed to find DOI for: F3SET: TOWARDS ANALYZING FAST, FREQ