In [1]:
# 1.将SingFile导出的html文件进行解析生成ris文件
# 首先鉴于当前的scopus并不支持对于arxiv的批量导出，同时arxiv尽管是预印版，但是有着一些高引的文章，值得参考
# 51s 1326篇文献
import re
import os
from datetime import datetime
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor

# 清洗文本
def clean_text(text):
    if not text:
        return ""
    return re.sub(r'\s+', ' ', text).strip()

# 处理单个 HTML 文件，提取文献信息
def process_html_file(filepath):
    with open(filepath, "r", encoding="utf-8") as f:
        html = f.read()
    soup = BeautifulSoup(html, 'html.parser')
    current_time = datetime.now().strftime("%Y/%m/%d/%H:%M:%S")
    documents = []
    rows = soup.find_all('tr', class_='TableItems-module__m0Z0b')

    for row in rows:
        preprint_span = row.find('span', string=re.compile('Preprint.*开放获取'))
        if preprint_span and not row.find('h3'):
            continue

        title_div = row.find('div', class_='TableItems-module__sHEzP')
        if title_div and title_div.find('h3'):
            doc = {}
            title_span = title_div.find('h3').find('span')
            doc['title'] = clean_text(title_span.get_text(strip=True)) if title_span else ''

            author_div = row.find('div', class_='author-list')
            authors = []
            if author_div:
                author_buttons = author_div.find_all('button')
                for button in author_buttons:
                    author_name = button.find('span', class_='Typography-module__lVnit')
                    if author_name:
                        authors.append(clean_text(author_name.get_text(strip=True)))
            doc['authors'] = authors

            source_div = row.find('div', class_='DocumentResultsList-module__tqiI3')
            doc['publisher'] = clean_text(source_div.find('span').get_text(strip=True)) if source_div else ''

            year_div = row.find('div', class_='TableItems-module__TpdzW')
            doc['year'] = clean_text(year_div.find('span').get_text(strip=True)) if year_div else ''

            abstract = ''
            current_row = row
            while True:
                next_row = current_row.find_next('tr')
                if not next_row:
                    break
                abstract_div = next_row.find('div', class_='Abstract-module__ukTwj')
                if abstract_div:
                    abstract = clean_text(abstract_div.get_text(strip=True))
                    break
                current_row = next_row
            doc['abstract'] = abstract
            doc['current_time'] = current_time
            documents.append(doc)
    return documents

# 将文献信息写入 RIS 格式
def write_ris(documents, output_path):
    with open(output_path, "a", encoding="utf-8") as f:
        for doc in documents:
            f.write("TY  - GEN\n")
            for author in doc['authors']:
                f.write(f"AU  - {author}\n")
            f.write(f"TI  - {doc['title']}\n")
            if doc['abstract']:
                f.write(f"AB  - {doc['abstract']}\n")
            if doc['publisher']:
                f.write(f"PB  - {doc['publisher']}\n")
            if doc['year']:
                f.write(f"PY  - {doc['year']}\n")
            st = doc['title'].split(":")[0] if ":" in doc['title'] else doc['title']
            f.write(f"ST  - {st}\n")
            f.write(f"Y2  - {doc['current_time']}\n")
            f.write("ER  -\n\n")

# 主函数：多线程读取多个文件并合并写入 RIS 文件
def main():
    html_dir = R"../../data/arxiv/html"
    ris_output_path = "../../data/arxiv/html_result/arxiv_results_multi.ris"
    
    filenames = [
        "Scopus - 文献搜索结果 ｜ 已登录 (2025_5_5 21：53：23).html",
        "Scopus - 文献搜索结果 ｜ 已登录 (2025_5_5 21：53：02).html",
        "Scopus - 文献搜索结果 ｜ 已登录 (2025_5_5 21：52：37).html",
        "Scopus - 文献搜索结果 ｜ 已登录 (2025_5_5 21：52：11).html",
        "Scopus - 文献搜索结果 ｜ 已登录 (2025_5_5 21：51：47).html",
        "Scopus - 文献搜索结果 ｜ 已登录 (2025_5_5 21：51：20).html",
        "Scopus - 文献搜索结果 ｜ 已登录 (2025_5_5 21：49：05).html",
    ]

    filepaths = [os.path.join(html_dir, name) for name in filenames]

    # 多线程处理文件
    all_documents = []
    with ThreadPoolExecutor(max_workers=7) as executor:
        results = list(executor.map(process_html_file, filepaths))
        for docs in results:
            all_documents.extend(docs)

    # 写入 RIS
    write_ris(all_documents, ris_output_path)
    print(f"已成功处理 {len(filepaths)} 个文件，共导出 {len(all_documents)} 条文献。")

if __name__ == "__main__":
    main()


已成功处理 7 个文件，共导出 1326 条文献。


In [4]:
# 2.对于每次scopus更新的arxiv多出来的条目列出来，并且生成相应的ris文件
import re
import os
from datetime import datetime
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor

# 清洗文本
def clean_text(text):
    if not text:
        return ""
    return re.sub(r'\s+', ' ', text).strip()

# 解析 RIS 文件，提取条目
def parse_ris_file(ris_filepath):
    entries = []
    current_entry = {}
    with open(ris_filepath, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line.startswith("TY  -"):
                current_entry = {}
            elif line.startswith("ER  -"):
                if current_entry:
                    entries.append(current_entry)
            elif line:
                key, value = map(str.strip, line.split(" - ", 1))
                if key in current_entry:
                    if isinstance(current_entry[key], list):
                        current_entry[key].append(value)
                    else:
                        current_entry[key] = [current_entry[key], value]
                else:
                    current_entry[key] = value
    return entries

# 处理单个 HTML 文件，提取文献信息
def process_html_file(filepath):
    with open(filepath, "r", encoding="utf-8") as f:
        html = f.read()
    soup = BeautifulSoup(html, 'html.parser')
    current_time = datetime.now().strftime("%Y/%m/%d/%H:%M:%S")
    documents = []
    rows = soup.find_all('tr', class_='TableItems-module__m0Z0b')

    for row in rows:
        preprint_span = row.find('span', string=re.compile('Preprint.*开放获取'))
        if preprint_span and not row.find('h3'):
            continue

        title_div = row.find('div', class_='TableItems-module__sHEzP')
        if title_div and title_div.find('h3'):
            doc = {}
            title_span = title_div.find('h3').find('span')
            doc['title'] = clean_text(title_span.get_text(strip=True)) if title_span else ''

            author_div = row.find('div', class_='author-list')
            authors = []
            if author_div:
                author_buttons = author_div.find_all('button')
                for button in author_buttons:
                    author_name = button.find('span', class_='Typography-module__lVnit')
                    if author_name:
                        authors.append(clean_text(author_name.get_text(strip=True)))
            doc['authors'] = authors

            source_div = row.find('div', class_='DocumentResultsList-module__tqiI3')
            doc['publisher'] = clean_text(source_div.find('span').get_text(strip=True)) if source_div else ''

            year_div = row.find('div', class_='TableItems-module__TpdzW')
            doc['year'] = clean_text(year_div.find('span').get_text(strip=True)) if year_div else ''

            abstract = ''
            current_row = row
            while True:
                next_row = current_row.find_next('tr')
                if not next_row:
                    break
                abstract_div = next_row.find('div', class_='Abstract-module__ukTwj')
                if abstract_div:
                    abstract = clean_text(abstract_div.get_text(strip=True))
                    break
                current_row = next_row
            doc['abstract'] = abstract
            doc['current_time'] = current_time
            documents.append(doc)
    return documents

# 将文献信息写入 RIS 格式
def write_ris(documents, output_path):
    with open(output_path, "w", encoding="utf-8") as f:
        for doc in documents:
            f.write("TY  - GEN\n")
            for author in doc['authors']:
                f.write(f"AU  - {author}\n")
            f.write(f"TI  - {doc['title']}\n")
            if doc['abstract']:
                f.write(f"AB  - {doc['abstract']}\n")
            if doc['publisher']:
                f.write(f"PB  - {doc['publisher']}\n")
            if doc['year']:
                f.write(f"PY  - {doc['year']}\n")
            st = doc['title'].split(":")[0] if ":" in doc['title'] else doc['title']
            f.write(f"ST  - {st}\n")
            f.write(f"Y2  - {doc['current_time']}\n")
            f.write("ER  -\n\n")

# 检测重复并返回不重复的文献
def deduplicate_documents(new_documents, existing_ris_filepath):
    existing_entries = parse_ris_file(existing_ris_filepath)
    existing_titles = {entry['TI'].lower() for entry in existing_entries if 'TI' in entry}
    
    unique_documents = []
    duplicate_count = 0
    
    for doc in new_documents:
        if doc['title'].lower() not in existing_titles:
            unique_documents.append(doc)
        else:
            duplicate_count += 1
    
    return unique_documents, duplicate_count

# 主函数
def main():
    # 配置路径（需要用户修改）
    html_dir = "../../data/arxiv/html"  # HTML 文件目录
    existing_ris_filepath = "../../data/arxiv/html_result/arxiv_results_multi_1635_tad_tal.ris"  # 现有 RIS 文件
    output_ris_filepath = "../../data/arxiv/html_result/unique_arxiv_results.ris"  # 输出不重复 RIS 文件
    
    # HTML 文件列表（示例，用户需替换为实际文件名）
    filenames = [
        "Scopus - 文献搜索结果 ｜ 已登录 (2025_5_9 16：50：37).html",
    ]
    filepaths = [os.path.join(html_dir, name) for name in filenames]

    # 多线程处理 HTML 文件
    all_documents = []
    with ThreadPoolExecutor(max_workers=4) as executor:
        results = list(executor.map(process_html_file, filepaths))
        for docs in results:
            all_documents.extend(docs)

    # 去重
    unique_documents, duplicate_count = deduplicate_documents(all_documents, existing_ris_filepath)

    # 写入不重复的 RIS 文件
    if unique_documents:
        write_ris(unique_documents, output_ris_filepath)
    
    # 打印结果
    print(f"处理了 {len(filepaths)} 个 HTML 文件，共提取 {len(all_documents)} 条文献。")
    print(f"发现 {duplicate_count} 条重复文献。")
    print(f"导出了 {len(unique_documents)} 条不重复文献到 {output_ris_filepath}。")

if __name__ == "__main__":
    main()

处理了 1 个 HTML 文件，共提取 200 条文献。
发现 195 条重复文献。
导出了 5 条不重复文献到 ../../data/arxiv/html_result/unique_arxiv_results.ris。


In [3]:
import requests
from requests.exceptions import ReadTimeout, RequestException

# 用户直接输入文件路径
input_path = "../../data/all/4500_4514_6415_arxiv_results_multi_1640_tad_tal_3890_tad_tal_scopus_20250509_886_tad_tal_wos_20250509_885_dep.ris"
output_path = "../../data/all/4500_4514_6415_arxiv_results_multi_1640_tad_tal_3890_tad_tal_scopus_20250509_886_tad_tal_wos_20250509_885_dep_doi_enriched.ris"
log_path = "../../data/all/4500_4514_6415_arxiv_results_multi_1640_tad_tal_3890_tad_tal_scopus_20250509_886_tad_tal_wos_20250509_885_dep_doi_enriched.log"
error_log_path = "../../data/all/4500_4514_6415_arxiv_results_multi_1640_tad_tal_3890_tad_tal_scopus_20250509_886_tad_tal_wos_20250509_885_dep_doi_errors.log"


def parse_ris_entries(lines):
    entries = []
    current = []
    for line in lines:
        line = line.rstrip('\n')
        if line.startswith('TY  -'):
            if current:
                entries.append(current)
            current = [line]
        elif current:
            current.append(line)
            if line.startswith('ER  -'):
                entries.append(current)
                current = []
    if current:
        entries.append(current)
    return entries


def extract_key_info(entry):
    info = {}
    for line in entry:
        if line.startswith('TI  -') and 'title' not in info:
            info['title'] = line[6:].strip()
        elif line.startswith('AU  -'):
            info.setdefault('authors', []).append(line[6:].strip())
        elif line.startswith('PY  -') and 'year' not in info:
            info['year'] = line[6:].strip()
    return info


def query_crossref(title, authors=None, year=None):
    params = {'query.title': title, 'rows': 1}
    if year:
        params['filter'] = f'from-pub-date:{year},until-pub-date:{year}'
    if authors:
        params['query.author'] = authors[0]
    try:
        response = requests.get('https://api.crossref.org/works', params=params, timeout=10)
        response.raise_for_status()
        items = response.json().get('message', {}).get('items', [])
        if items:
            return items[0].get('DOI')
    except ReadTimeout:
        # 超时，不影响主流程，将在后续重新处理
        raise
    except RequestException:
        # 其他网络错误
        return None
    return None


def enrich_entry_with_doi(entry, error_log):
    info = extract_key_info(entry)
    title = info.get('title')
    authors = info.get('authors')
    year = info.get('year')
    try:
        doi = query_crossref(title, authors, year)
    except ReadTimeout:
        # 记录超时条目以便后续重试
        error_log.append(info)
        return None
    if doi:
        for idx, line in enumerate(entry):
            if line.startswith('ER  -'):
                entry.insert(idx, f'DO  - {doi}')
                break
    return doi


def main():
    try:
        with open(input_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
    except FileNotFoundError:
        print(f"无法找到输入文件: {input_path}")
        return

    entries = parse_ris_entries(lines)
    missing_count = 0
    enriched_count = 0
    error_log = []  # 存放超时或 API 错误需后续处理的条目信息

    with open(output_path, 'w', encoding='utf-8') as fout, \
         open(log_path, 'w', encoding='utf-8') as flog:

        for entry in entries:
            if any(line.startswith('DO  -') for line in entry):
                for ln in entry:
                    fout.write(ln + '\n')
                fout.write('\n')
            else:
                doi = enrich_entry_with_doi(entry, error_log)
                if doi:
                    enriched_count += 1
                    for ln in entry:
                        fout.write(ln + '\n')
                    fout.write('\n')
                else:
                    missing_count += 1
                    info = extract_key_info(entry)
                    title = info.get('title', '<无标题>')
                    authors = '; '.join(info.get('authors', ['<未知作者>']))
                    year = info.get('year', '<未知年份>')
                    flog.write(f"{missing_count}. 标题: {title}\n")
                    flog.write(f"   作者: {authors}\n")
                    flog.write(f"   出版年: {year}\n")
                    flog.write(f"   原始条目行数: {len(entry)}\n")
                    flog.write("---\n")

    # 将需后续重试的条目写入单独日志
    if error_log:
        with open(error_log_path, 'w', encoding='utf-8') as ef:
            for idx, info in enumerate(error_log, 1):
                ef.write(f"{idx}. 标题: {info.get('title','<无标题>')}\n")
                ef.write(f"   作者: {'; '.join(info.get('authors', ['<未知作者>']))}\n")
                ef.write(f"   出版年: {info.get('year','<未知年份>')}\n")
                ef.write("---\n")

    print(f"处理完成：共 {len(entries)} 条文献；缺失 DOI 的有 {missing_count} 条，成功补全 DOI 的有 {enriched_count} 条。")
    if error_log:
        print(f"有 {len(error_log)} 条因网络超时或错误未处理，将记录至 {error_log_path} 以便后续重试。")
    print(f"输出文件：{output_path}，日志文件：{log_path}")


if __name__ == '__main__':
    main()

处理完成：共 4500 条文献；缺失 DOI 的有 30 条，成功补全 DOI 的有 776 条。
有 1 条因网络超时或错误未处理，将记录至 ../../data/all/4500_4514_6415_arxiv_results_multi_1640_tad_tal_3890_tad_tal_scopus_20250509_886_tad_tal_wos_20250509_885_dep_doi_errors.log 以便后续重试。
输出文件：../../data/all/4500_4514_6415_arxiv_results_multi_1640_tad_tal_3890_tad_tal_scopus_20250509_886_tad_tal_wos_20250509_885_dep_doi_enriched.ris，日志文件：../../data/all/4500_4514_6415_arxiv_results_multi_1640_tad_tal_3890_tad_tal_scopus_20250509_886_tad_tal_wos_20250509_885_dep_doi_enriched.log


In [4]:
#!/usr/bin/env python3
"""
脚本功能：
1. 读取一个 .ris 文件，
2. 提取所有条目（以 'TY  -' 开始，以 'ER  -' 结束），
3. 判断每个条目中是否包含 DOI（以 'DO  -' 开头的行），
4. 将包含 DOI 的条目写入新的 .ris 文件，
5. 将缺失 DOI 的条目的关键信息（如 TI、AU、PY）记录到日志文件。

使用方法：
    直接运行脚本，脚本会提示输入文件路径和输出文件路径，无需命令行参数。
"""

# 用户直接输入文件路径
input_path = "../../data/all/4500_4514_6415_arxiv_results_multi_1640_tad_tal_3890_tad_tal_scopus_20250509_886_tad_tal_wos_20250509_885_dep.ris"
output_path = "../../data/all/4500_4514_6415_arxiv_results_multi_1640_tad_tal_3890_tad_tal_scopus_20250509_886_tad_tal_wos_20250509_885_dep_doi.ris"
log_path = "../../data/all/4500_4514_6415_arxiv_results_multi_1640_tad_tal_3890_tad_tal_scopus_20250509_886_tad_tal_wos_20250509_885_dep_doi.log"


def parse_ris_entries(lines):
    """
    将 RIS 文件内容分割成若干条目，每条目是若干行列表。
    依据：条目以 'TY  -' 行开始，以 'ER  -' 行结束。"""
    entries = []
    current = []
    for line in lines:
        line = line.rstrip('\n')
        if line.startswith('TY  -'):
            if current:
                entries.append(current)
            current = [line]
        elif current:
            current.append(line)
            if line.startswith('ER  -'):
                entries.append(current)
                current = []
    if current:
        entries.append(current)
    return entries


def extract_key_info(entry):
    """
    从条目中提取关键信息，用于日志：
    标题 (TI)、作者 (AU)、出版年份 (PY)。"""
    info = {}
    for line in entry:
        if line.startswith('TI  -') and 'title' not in info:
            info['title'] = line[6:].strip()
        elif line.startswith('AU  -'):
            info.setdefault('authors', []).append(line[6:].strip())
        elif line.startswith('PY  -') and 'year' not in info:
            info['year'] = line[6:].strip()
    return info


def main():
    try:
        with open(input_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
    except FileNotFoundError:
        print(f"无法找到输入文件: {input_path}")
        return

    entries = parse_ris_entries(lines)
    missing_count = 0
    with open(output_path, 'w', encoding='utf-8') as fout, open(log_path, 'w', encoding='utf-8') as flog:
        for entry in entries:
            has_doi = any(line.startswith('DO  -') for line in entry)
            if has_doi:
                for ln in entry:
                    fout.write(ln + '\n')
                fout.write('\n')
            else:
                info = extract_key_info(entry)
                title = info.get('title', '<无标题>')
                authors = '; '.join(info.get('authors', ['<未知作者>']))
                year = info.get('year', '<未知年份>')
                flog.write(f"{missing_count+1}. 标题: {title}\n")
                flog.write(f"   作者: {authors}\n")
                flog.write(f"   出版年: {year}\n")
                flog.write(f"   原始条目行数: {len(entry)}\n")
                flog.write("---\n")
                missing_count += 1

    print(f"处理完成：共 {len(entries)} 条文献；缺失 DOI 的有 {missing_count} 条，已记录至 {log_path}。输出文件：{output_path}")


if __name__ == '__main__':
    main()

处理完成：共 4500 条文献；缺失 DOI 的有 806 条，已记录至 ../../data/all/4500_4514_6415_arxiv_results_multi_1640_tad_tal_3890_tad_tal_scopus_20250509_886_tad_tal_wos_20250509_885_dep_doi.log。输出文件：../../data/all/4500_4514_6415_arxiv_results_multi_1640_tad_tal_3890_tad_tal_scopus_20250509_886_tad_tal_wos_20250509_885_dep_doi.ris


In [2]:
# 这个主要是对于LNCS已经归档的文献找到其归属的会议
# 去除Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)的杂项
import re
from pathlib import Path

# 扩展会议缩写列表
abbrs = [
    'ECCV', 'ACCV', 'ACPR', 'CGI', 'DAGM-GCPR', 'ICANN', 'ICIAP', 'ICIC',
    'ICIG', 'ICIRA', 'ICONIP', 'ICPR', 'ISVC', 'MMM', 'PCM', 'PRCV',
    'PRICAI', 'SCIA', 'CAAI', 'NPC', 'ADMA', 'CVM', 'HCC', 'ICSI',
    'CCBR', 'ICFEM', 'MICCAI', 'DMAH', 'MICCAI', 'CAAI',
    'ICR', 'Euro-Par', 'MLDM', 'IbPRIA', 'ICPRAI',
    'ICCSA', 'ICPRAI', 'CAIP', 'ICDAR', 'CICAI'
    
]

# 处理单个 RIS 条目的函数
def process_ris_entry(entry: str) -> str:
    # 匹配 T2 和 N1 字段
    t2_match = re.search(r'^(T2  - .+)$', entry, re.MULTILINE)
    n1_match = re.search(r'^N1  - <p>(.*?)</p>$', entry, re.MULTILINE | re.DOTALL)

    if t2_match and n1_match:
        t2_value = t2_match.group(1)
        n1_content = n1_match.group(1)

        # 仅处理 Lecture Notes in Computer Science 条目
        if 'Lecture Notes in Computer Science' in t2_value:
            # 在 N1 中寻找缩写和年份，例如 'ECCV 2018'
            pattern = r'\b(' + '|'.join(map(re.escape, abbrs)) + r')\s+(\d{4})\b'
            match = re.search(pattern, n1_content)
            if match:
                abbr = match.group(1)
                year = match.group(2)
                new_t2 = f'T2  - {abbr} {year}'
                # 替换原有 T2 行
                entry = re.sub(r'^(T2  - .+)$', new_t2, entry, flags=re.MULTILINE)

    return entry

# 主处理函数
def process_ris_file(input_path: str, output_path: str):
    text = Path(input_path).read_text(encoding='utf-8')
    # 保留 ER 结束标记并分割条目
    raw_entries = re.split(r'\nER  -\s*\n', text.strip(), flags=re.DOTALL)
    processed = []

    for raw in raw_entries:
        entry = raw.strip()
        if not entry:
            continue
        # 恢复 ER 标记，处理后再添加
        processed_entry = process_ris_entry(entry + '\nER  -')
        processed.append(processed_entry)

    # 写入输出文件
    result = '\n'.join(processed).strip() + '\n'
    Path(output_path).write_text(result, encoding='utf-8')

# 命令行执行示例
if __name__ == '__main__':
    import argparse

    # parser = argparse.ArgumentParser(description='Process RIS file, updating T2 based on conference abbreviations.')
    # parser.add_argument('input', help='Input RIS file path')
    # parser.add_argument('output', help='Output RIS file path')
    # args = parser.parse_args()

    process_ris_file(R"D:\Users\tang\Desktop\LNCS_20250511_303\LNCS_20250511_303.ris", R"D:\Users\tang\Desktop\LNCS_20250511_303\LNCS_20250511_303.ris")
    # print(f"Processed RIS file saved to D:\Users\tang\Desktop\LNCS_20250511_303\LNCS_20250511_303.ris")