In [3]:
# 1.将SingleFile获取得到的文件提取出当前的具体ris文件 1635条->66.4s
import re
import os
from datetime import datetime
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor

# Normalize whitespace in extracted text.
def clean_text(text):
    """Collapse every whitespace run into one space and trim both ends.

    Falsy input (``None`` or an empty string) yields an empty string.
    """
    collapsed = re.sub(r'\s+', ' ', text) if text else ""
    return collapsed.strip()

# Parse one saved HTML result page and extract its bibliographic records.
def process_html_file(filepath):
    """Extract document records from one saved result-list HTML page.

    Every ``<tr class="TableItems-module__m0Z0b">`` row that carries a title
    (an ``<h3>`` inside the ``TableItems-module__sHEzP`` div) becomes one
    record. Rows flagged as open-access preprints that carry no ``<h3>`` are
    skipped.

    Args:
        filepath: Path to a UTF-8 encoded HTML file.

    Returns:
        A list of dicts with the keys ``title``, ``authors`` (list of str),
        ``publisher``, ``year``, ``abstract`` and ``current_time``. Values
        that cannot be found are empty strings (an empty list for authors).
    """
    with open(filepath, "r", encoding="utf-8") as f:
        html = f.read()
    soup = BeautifulSoup(html, 'html.parser')
    # One timestamp per file; it is stored on every record of that file.
    current_time = datetime.now().strftime("%Y/%m/%d/%H:%M:%S")

    def first_span_text(container):
        """Return the cleaned text of the first <span> in *container*, or ''."""
        # Guard both levels: the div may be missing, and so may its span.
        span = container.find('span') if container else None
        return clean_text(span.get_text(strip=True)) if span else ''

    def title_heading(tr):
        """Return the title <h3> of a document row, or None for other rows."""
        title_div = tr.find('div', class_='TableItems-module__sHEzP')
        return title_div.find('h3') if title_div else None

    documents = []
    for row in soup.find_all('tr', class_='TableItems-module__m0Z0b'):
        preprint_span = row.find('span', string=re.compile('Preprint.*开放获取'))
        if preprint_span and not row.find('h3'):
            continue

        heading = title_heading(row)
        if heading is None:
            continue

        doc = {}
        title_span = heading.find('span')
        doc['title'] = clean_text(title_span.get_text(strip=True)) if title_span else ''

        authors = []
        author_div = row.find('div', class_='author-list')
        if author_div:
            for button in author_div.find_all('button'):
                author_name = button.find('span', class_='Typography-module__lVnit')
                if author_name:
                    authors.append(clean_text(author_name.get_text(strip=True)))
        doc['authors'] = authors

        doc['publisher'] = first_span_text(
            row.find('div', class_='DocumentResultsList-module__tqiI3'))
        doc['year'] = first_span_text(
            row.find('div', class_='TableItems-module__TpdzW'))

        # Look for the abstract in the rows that follow this one, but stop at
        # the next document row: without that stop, a record that has no
        # abstract would silently receive the abstract of a later record.
        abstract = ''
        next_row = row.find_next('tr')
        while next_row:
            abstract_div = next_row.find('div', class_='Abstract-module__ukTwj')
            if abstract_div:
                abstract = clean_text(abstract_div.get_text(strip=True))
                break
            if title_heading(next_row) is not None:
                break
            next_row = next_row.find_next('tr')
        doc['abstract'] = abstract
        doc['current_time'] = current_time
        documents.append(doc)
    return documents

# Serialize extracted records in RIS format.
def write_ris(documents, output_path):
    """Append every document to *output_path* as a RIS ``GEN`` record.

    The file is opened in append mode, so existing content is kept and a
    second call adds the records again. Abstract, publisher and year lines
    are written only when the value is non-empty. ``ST`` holds the part of
    the title before the first colon (the whole title if there is none).
    """
    with open(output_path, "a", encoding="utf-8") as out:
        emit = out.write
        for record in documents:
            emit("TY  - GEN\n")
            for name in record['authors']:
                emit(f"AU  - {name}\n")
            emit(f"TI  - {record['title']}\n")

            abstract = record['abstract']
            if abstract:
                emit(f"AB  - {abstract}\n")
            publisher = record['publisher']
            if publisher:
                emit(f"PB  - {publisher}\n")
            year = record['year']
            if year:
                emit(f"PY  - {year}\n")

            # partition() returns the whole string when no colon is present.
            short_title = record['title'].partition(":")[0]
            emit(f"ST  - {short_title}\n")
            emit(f"Y2  - {record['current_time']}\n")
            emit("ER  -\n\n")

# Entry point: parse several saved pages in worker threads and write one RIS file.
def main():
    """Parse the hard-coded HTML exports concurrently and write them as RIS.

    Each file is handled by ``process_html_file`` in a thread pool; the
    per-file results are concatenated in file order and passed to
    ``write_ris`` in a single call.
    """
    html_dir = R"D:\Users\tang\Downloads"
    ris_output_path = "../data/arxiv_results_multi_20250508_scopus_arxiv_1635_tad_tal.ris"

    filenames = (
        R"20250508_scopus_arxiv_0-200_tad_tal.html",
        R"20250508_scopus_arxiv_201-400_tad_tal.html",
        R"20250508_scopus_arxiv_401-600_tad_tal.html",
        R"20250508_scopus_arxiv_601-800_tad_tal.html",
        R"20250508_scopus_arxiv_801-1000_tad_tal.html",
        R"20250508_scopus_arxiv_1001-1200_tad_tal.html",
        R"20250508_scopus_arxiv_1201-1400_tad_tal.html",
        R"20250508_scopus_arxiv_1401-1600_tad_tal.html",
        R"20250508_scopus_arxiv_1601-1635_tad_tal.html",
    )
    filepaths = []
    for name in filenames:
        filepaths.append(os.path.join(html_dir, name))

    # executor.map yields results in input order, so records stay in file order.
    with ThreadPoolExecutor(max_workers=9) as executor:
        per_file_docs = list(executor.map(process_html_file, filepaths))
    all_documents = [doc for docs in per_file_docs for doc in docs]

    write_ris(all_documents, ris_output_path)
    print(f"已成功处理 {len(filepaths)} 个文件，共导出 {len(all_documents)} 条文献。")


if __name__ == "__main__":
    main()


已成功处理 9 个文件，共导出 1635 条文献。


In [4]:
# 2.后期为了评定当前arxiv的文献是否值得看，需要给定DOI号，然后看一下它的引用在进行分析
# 需要自己手动补充因为SSL失败而导入DOI失败的文件
import requests
import time
import re

def query_doi(title, min_similarity=0.9):
    """Look up the DOI of a paper by title through the CrossRef REST API.

    CrossRef's ``query.title`` search is a relevance-ranked fuzzy search, so
    its top hit can be a different paper when the requested one is not
    indexed. The DOI of the top hit is therefore returned only when one of
    the hit's titles is sufficiently similar to *title*.

    Args:
        title: Paper title to search for.
        min_similarity: Minimum ``difflib`` similarity ratio (0..1) between
            the normalized requested title and a title of the top hit.
            Pass 0 to accept the top hit without checking.

    Returns:
        The DOI string, or None when nothing was found, the top hit does not
        match the title, or the request failed (the error is printed).
    """
    import difflib  # local import: this block is the only user

    def comparable(text):
        # Ignore case, punctuation and spacing differences between sources.
        return re.sub(r'[\W_]+', ' ', text).strip().casefold()

    url = 'https://api.crossref.org/works'
    params = {
        'query.title': title,
        'rows': 1
    }
    try:
        r = requests.get(url, params=params, timeout=10)
        r.raise_for_status()
        items = r.json().get('message', {}).get('items', [])
        if not items:
            return None
        top_hit = items[0]
        doi = top_hit.get('DOI')
        if not doi or min_similarity <= 0:
            return doi
        wanted = comparable(title)
        # CrossRef returns the title as a list of strings.
        for candidate in top_hit.get('title') or []:
            matcher = difflib.SequenceMatcher(
                None, wanted, comparable(candidate), autojunk=False)
            if matcher.ratio() >= min_similarity:
                return doi
    except Exception as e:
        # Deliberately broad: this is a best-effort lookup inside a long batch
        # run, so one bad response must not abort the whole job.
        print(f"Error querying CrossRef for title `{title}`: {e}")
    return None

def insert_doi_lines(ris_text):
    """Add a ``DO`` line to every RIS record whose DOI can be looked up.

    The text is cut at each ``ER  -`` line. For every record the ``TI``
    value is passed to ``query_doi``; when a DOI comes back, a ``DO  - ...``
    line is inserted directly before the record's ``ER`` line.

    All other text is passed through unchanged: the blank lines between
    records, and whatever follows the last ``ER`` line, are copied as they
    are. (Joining the records with an extra newline would add one more blank
    line on every pass and drop the trailing text.)

    Args:
        ris_text: Complete content of a RIS file.

    Returns:
        The RIS text with the DOI lines inserted.
    """
    # The capturing group keeps the ER lines in the result:
    # [record0, er0, record1, er1, ..., trailing_text]
    parts = re.split(r'(\nER  -.*)', ris_text)
    pieces = []

    for rec, ending in zip(parts[0::2], parts[1::2]):
        # Skip empty records (a stray ER line with nothing before it).
        if not rec.strip():
            continue

        # The title is the line that starts with "TI  - ".
        m = re.search(r'^TI  - (.+)$', rec, flags=re.MULTILINE)
        title = m.group(1).strip() if m else None

        doi = query_doi(title) if title else None
        if doi:
            rec = rec + f'\nDO  - {doi}'

        # Put the ER line back behind the (possibly extended) record.
        pieces.append(rec + ending)

    # Text after the last ER line (usually just blank lines).
    pieces.append(parts[-1])
    return "".join(pieces)

def main():
    """Read the exported RIS file, insert DOIs and save the result as a new file."""
    infile = R'../data/arxiv_results_multi_20250508_scopus_arxiv_1635_tad_tal.ris'
    outfile = '../data/arxiv_results_multi_20250508_scopus_arxiv_1635_tad_tal_with_doi.ris'

    # Load the original RIS text.
    with open(infile, 'r', encoding='utf-8') as source:
        original_text = source.read()

    # All lookups finish before the output file is opened.
    enriched_text = insert_doi_lines(original_text)

    with open(outfile, 'w', encoding='utf-8') as target:
        target.write(enriched_text)

    print(f"已生成带 DOI 的 RIS: {outfile}")


if __name__ == '__main__':
    main()


Error querying CrossRef for title `Injecting Explainability and Lightweight Design into Weakly Supervised Video Anomaly Detection Systems`: HTTPSConnectionPool(host='api.crossref.org', port=443): Read timed out. (read timeout=10)
Error querying CrossRef for title `MARINE: A Computer Vision Model for Detecting Rare Predator-Prey Interactions in Animal Videos`: HTTPSConnectionPool(host='api.crossref.org', port=443): Read timed out. (read timeout=10)
Error querying CrossRef for title `A Simulated Two-Stream Network via Multi-level Distillation of Reviewed Features and Decoupled Logits for Video Action Recognition`: HTTPSConnectionPool(host='api.crossref.org', port=443): Read timed out. (read timeout=10)
Error querying CrossRef for title `Structure-Aware Human Body Reshaping with Adaptive Affinity-Graph Network`: HTTPSConnectionPool(host='api.crossref.org', port=443): Read timed out. (read timeout=10)
Error querying CrossRef for title `Segment Anything Model Can Not Segment Anything: Asses

In [6]:
# 2.2 处理没有DOI查询失败的文献进行再次查询
import requests
import time
import re
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def setup_session():
    """Build a ``requests`` session whose HTTPS requests are retried.

    The retry policy allows 3 retries with a backoff factor of 1 and also
    retries on the HTTP status codes 429, 500, 502, 503 and 504. It is
    mounted for ``https://`` URLs only.
    """
    retry_policy = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
    )
    http_session = requests.Session()
    http_session.mount('https://', HTTPAdapter(max_retries=retry_policy))
    return http_session

def query_doi(title, session, min_similarity=0.9):
    """Look up the DOI of a paper by title through the CrossRef REST API.

    CrossRef's ``query.title`` search is a relevance-ranked fuzzy search, so
    its top hit can be a different paper when the requested one is not
    indexed. The DOI of the top hit is therefore returned only when one of
    the hit's titles is sufficiently similar to *title*.

    Args:
        title: Paper title to search for.
        session: Object offering a requests-style
            ``get(url, params=..., timeout=...)`` method.
        min_similarity: Minimum ``difflib`` similarity ratio (0..1) between
            the normalized requested title and a title of the top hit.
            Pass 0 to accept the top hit without checking.

    Returns:
        The DOI string, or None when nothing was found, the top hit does not
        match the title, or the request failed (the error is printed).
    """
    import difflib  # local import: this block is the only user

    def comparable(text):
        # Ignore case, punctuation and spacing differences between sources.
        return re.sub(r'[\W_]+', ' ', text).strip().casefold()

    url = 'https://api.crossref.org/works'
    params = {
        'query.title': title,
        'rows': 1
    }
    try:
        r = session.get(url, params=params, timeout=20)
        r.raise_for_status()
        items = r.json().get('message', {}).get('items', [])
        if not items:
            return None
        top_hit = items[0]
        doi = top_hit.get('DOI')
        if not doi or min_similarity <= 0:
            return doi
        wanted = comparable(title)
        # CrossRef returns the title as a list of strings.
        for candidate in top_hit.get('title') or []:
            matcher = difflib.SequenceMatcher(
                None, wanted, comparable(candidate), autojunk=False)
            if matcher.ratio() >= min_similarity:
                return doi
    except Exception as e:
        # Deliberately broad: this is a best-effort lookup inside a long batch
        # run, so one bad response must not abort the whole job.
        print(f"Error querying CrossRef for title `{title}`: {e}")
    return None

def update_doi_lines(ris_text):
    """Re-query DOIs for the RIS records that do not have a ``DO`` line yet.

    The text is cut at each ``ER  -`` line. Records that already contain a
    ``DO`` line are copied unchanged. For the others the ``TI`` value is
    passed to ``query_doi``; a DOI that comes back is inserted as a
    ``DO  - ...`` line directly before the record's ``ER`` line.

    All other text is passed through unchanged: the blank lines between
    records, and whatever follows the last ``ER`` line, are copied as they
    are. (Joining the records with an extra newline would add one more blank
    line on every pass and drop the trailing text.)

    Args:
        ris_text: Complete content of a RIS file.

    Returns:
        The RIS text with the newly found DOI lines inserted.
    """
    # The capturing group keeps the ER lines in the result:
    # [record0, er0, record1, er1, ..., trailing_text]
    parts = re.split(r'(\nER  -.*)', ris_text)
    pieces = []

    # The context manager closes the session (and its pooled connections)
    # even when a lookup raises.
    with setup_session() as session:
        for rec, ending in zip(parts[0::2], parts[1::2]):
            # Skip empty records (a stray ER line with nothing before it).
            if not rec.strip():
                continue

            m = re.search(r'^TI  - (.+)$', rec, flags=re.MULTILINE)
            title = m.group(1).strip() if m else None

            has_doi = re.search(r'^DO  - ', rec, flags=re.MULTILINE) is not None

            if not has_doi and title:
                doi = query_doi(title, session)
                if doi:
                    rec = rec + f'\nDO  - {doi}'
                time.sleep(1)  # Avoid rate limiting

            pieces.append(rec + ending)

    # Text after the last ER line (usually just blank lines).
    pieces.append(parts[-1])
    return "".join(pieces)

def main():
    """Load the DOI-enriched RIS file, retry the missing DOIs and save a new file."""
    infile = '../data/arxiv_results_multi_20250508_scopus_arxiv_1635_tad_tal_with_doi.ris'
    outfile = '../data/arxiv_results_multi_20250508_scopus_arxiv_1635_tad_tal_with_doi_updated.ris'

    with open(infile, 'r', encoding='utf-8') as source:
        existing_text = source.read()

    # All lookups finish before the output file is opened.
    updated_text = update_doi_lines(existing_text)

    with open(outfile, 'w', encoding='utf-8') as target:
        target.write(updated_text)

    print(f"Updated RIS file with new DOIs: {outfile}")


if __name__ == '__main__':
    main()

Updated RIS file with new DOIs: ../data/arxiv_results_multi_20250508_scopus_arxiv_1635_tad_tal_with_doi_updated.ris


In [2]:
# 3.处理重复的条目，保留最详细的，并且在日志里面输出当前去重过的期刊 二重 去重逻辑 题目 DOI 现在存在bug -和空格区分不开
# 20250508最好的条目是arxiv\data\20250508_scopus_3837_tad_tal 20250508_wos_886_tad_tal_3891-3_3888_deduplication_end.ris
import uuid
import logging
import datetime
import re
from collections import defaultdict

def normalize_title(title):
    """Return a comparison key for *title*.

    The key ignores letter case and drops every character that is not a
    letter or digit, including whitespace. Titles that differ only in
    punctuation, hyphenation or spacing ("Weakly-Supervised",
    "Weakly Supervised", "Weakly  supervised") therefore get the same key.
    Removing punctuation alone is not enough: a hyphenated and a spaced
    variant would still produce different keys.
    """
    return re.sub(r'[\W_]+', '', title.casefold())

def parse_ris_file(file_path):
    """Parse a RIS file into a list of entries.

    Each entry is a dict that maps a RIS tag (``TI``, ``AU``, ...) to the
    list of values found for that tag, in file order.

    Handling of irregular input:
      * A leading UTF-8 byte order mark is ignored.
      * A line without a RIS tag is treated as a continuation of the
        previous value and appended to it after a newline, instead of being
        stored as a tag of its own.
      * A tag with an empty value (``AB  -``) is stored as ``''``.
      * A final record that is not closed by an ``ER`` line is still returned.

    Args:
        file_path: Path to a UTF-8 encoded RIS file.

    Returns:
        The list of entry dicts; blank lines and empty records are skipped.
    """
    # Two-character tag, two spaces, a dash, then optionally " value".
    tag_line = re.compile(r'^([A-Z][A-Z0-9])  -(?: (.*))?$')

    entries = []
    current_entry = {}
    last_tag = None
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                continue

            match = tag_line.match(line)
            if match and match.group(1) == 'ER':
                if current_entry:
                    entries.append(current_entry)
                current_entry = {}
                last_tag = None
            elif match:
                last_tag = match.group(1)
                current_entry.setdefault(last_tag, []).append(match.group(2) or '')
            elif last_tag is not None:
                # Continuation of a multi-line value (e.g. a long N1 note).
                current_entry[last_tag][-1] += '\n' + line
            else:
                # Untagged text before the first tag of a record: keep it the
                # way it has always been stored rather than dropping it.
                tag, value = line.split('  - ', 1) if '  - ' in line else (line, '')
                current_entry.setdefault(tag, []).append(value)

    # Do not lose a last record that has no closing ER line.
    if current_entry:
        entries.append(current_entry)
    return entries

def count_entry_fields(entry):
    """Return the total number of values stored across all tags of *entry*."""
    return sum(len(values) for values in entry.values())


def _normalize_doi(doi):
    """Return a comparison key for a DOI.

    DOI names are case-insensitive, and exports write them either bare or
    behind a resolver prefix, so the key is lower-cased and the common
    prefixes are removed.
    """
    key = doi.strip().lower()
    for prefix in ('https://doi.org/', 'http://doi.org/',
                   'https://dx.doi.org/', 'http://dx.doi.org/', 'doi:'):
        if key.startswith(prefix):
            return key[len(prefix):].strip()
    return key


def deduplicate_by_field(entries, field, normalize=False):
    """Remove entries that share the same value of *field*.

    Within each group of duplicates the entry with the most values (see
    ``count_entry_fields``) is kept; on a tie the one that came first wins.

    Args:
        entries: Entry dicts as produced by ``parse_ris_file``.
        field: RIS tag to compare; only the first value of the tag is used.
        normalize: ``False`` compares the raw value, ``True`` compares
            ``normalize_title(value)``, and a callable compares
            ``normalize(value)``.

    Returns:
        ``(deduplicated, log_messages)``. ``deduplicated`` lists the kept
        entry of every group in order of first appearance, followed by the
        entries that have no usable value for *field* (missing, empty, or
        normalized to an empty key); those are never treated as duplicates
        of each other. ``log_messages`` has one line per removed entry.
    """
    groups = defaultdict(list)
    without_key = []
    for entry in entries:
        value = (entry.get(field) or [''])[0]
        key = ''
        if value:
            if callable(normalize):
                key = normalize(value)
            elif normalize:
                key = normalize_title(value)
            else:
                key = value
        if key:
            groups[key].append(entry)
        else:
            without_key.append(entry)

    deduplicated = []
    log_messages = []
    for key, group in groups.items():
        # sorted() is stable, so equally detailed entries keep their order.
        ranked = sorted(group, key=count_entry_fields, reverse=True)
        kept_entry = ranked[0]
        deduplicated.append(kept_entry)
        for removed_entry in ranked[1:]:
            log_messages.append(
                f"Removed duplicate entry with {field} '{key}' "
                f"(kept {count_entry_fields(kept_entry)} fields, "
                f"removed {count_entry_fields(removed_entry)} fields, "
                f"title: '{removed_entry.get('TI', [''])[0]}')"
            )

    deduplicated.extend(without_key)
    return deduplicated, log_messages


def deduplicate_entries(entries):
    """Deduplicate entries by normalized title first, then by DOI.

    Returns:
        ``(entries, log_messages)`` with the log lines of both passes.
    """
    # Step 1: deduplicate by TI.
    entries, ti_log_messages = deduplicate_by_field(entries, 'TI', normalize=True)

    # Step 2: deduplicate by DO, ignoring case and resolver prefixes.
    entries, do_log_messages = deduplicate_by_field(entries, 'DO', normalize=_normalize_doi)

    return entries, ti_log_messages + do_log_messages

def write_ris_file(entries, output_path):
    """Write *entries* to *output_path* in RIS format.

    Every value becomes a ``TAG  - value`` line and each entry is closed by
    an ``ER  -`` line. Entries are separated by one blank line; nothing is
    written after the last entry. An existing file is overwritten.
    """
    with open(output_path, 'w', encoding='utf-8') as file:
        rendered = []
        for entry in entries:
            lines = [
                f"{tag}  - {value}\n"
                for tag, values in entry.items()
                for value in values
            ]
            lines.append("ER  -\n")
            rendered.append("".join(lines))
        # Joining with a newline yields the blank separator line between entries.
        file.write("\n".join(rendered))

def setup_logging():
    """Send INFO-level log records to a timestamped ``deduplication_log_*.txt`` file.

    NOTE: ``logging.basicConfig`` has no effect when the root logger already
    has handlers, e.g. when this is called a second time in the same process.
    """
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    log_name = f'deduplication_log_{timestamp}.txt'
    logging.basicConfig(
        filename=log_name,
        level=logging.INFO,
        format='%(asctime)s - %(message)s',
    )

def main(input_file, output_file):
    """Deduplicate the RIS file *input_file* and write the result to *output_file*.

    One log line is written per removed entry, followed by a summary line
    with the number of entries read and kept.
    """
    setup_logging()

    parsed = parse_ris_file(input_file)
    unique_entries, removal_notes = deduplicate_entries(parsed)
    write_ris_file(unique_entries, output_file)

    for note in removal_notes:
        logging.info(note)
    logging.info(f"Processed {len(parsed)} entries, kept {len(unique_entries)} entries")

    print(f"Deduplication complete. Output written to {output_file}")
    print(f"Log written to deduplication_log_*.txt")


if __name__ == "__main__":
    # Replace both paths with your own input and output RIS files.
    input_file = R"D:\Programs\Codes\Skill-Up\learning-smorgasbord\arxiv\data\arxiv_results_multi_20250508_scopus_arxiv_1635_tad_tal.ris"
    output_file = R"D:\Programs\Codes\Skill-Up\learning-smorgasbord\arxiv\data\arxiv_results_multi_20250508_scopus_arxiv_1635_tad_tal_output_deplication.ris"
    main(input_file, output_file)

Deduplication complete. Output written to D:\Programs\Codes\Skill-Up\learning-smorgasbord\arxiv\data\arxiv_results_multi_20250508_scopus_arxiv_1635_tad_tal_output_deplication.ris
Log written to deduplication_log_*.txt


In [15]:
import re
from pathlib import Path

# Full conference name -> acronym, as both appear in the N1 note.
conference_map = {
    'European Conference on Computer Vision': 'ECCV',
    'Asian Conference on Computer Vision': 'ACCV',
    'Asian Conference on Pattern Recognition': 'ACPR',
    'Computer Graphics International': 'CGI',
    'German Conference on Pattern Recognition': 'DAGM-GCPR',
    'International Conference on Artificial Neural Networks': 'ICANN',
    'International Conference on Image Analysis and Processing': 'ICIAP',
    'International Conference on Intelligent Computing': 'ICIC',
    'International Conference on Image and Graphics': 'ICIG',
    'International Conference on Intelligent Robotics and Applications': 'ICIRA',
    'International Conference on Neural Information Processing': 'ICONIP',
    'International Conference on Pattern Recognition': 'ICPR',
    'International Symposium on Visual Computing': 'ISVC',
    'International Conference on Multimedia Modeling': 'MMM',
    'Pacific-Rim Conference on Multimedia': 'PCM',
    'Chinese Conference on Pattern Recognition and Computer Vision': 'PRCV',
    'Pacific Rim International Conference on Artificial Intelligence': 'PRICAI',
    'Scandinavian Conference on Image Analysis': 'SCIA',
}


def process_ris_entry(entry: str) -> str:
    """Replace a generic LNCS ``T2`` value with the conference found in ``N1``.

    The entry is changed only when its ``T2`` value mentions
    "Lecture Notes in Computer Science" and its ``N1  - <p>...</p>`` note
    contains both a conference name from ``conference_map`` and that
    conference's acronym followed by a four-digit year. The first conference
    that satisfies both conditions wins. Otherwise the entry is returned
    unchanged.
    """
    series_field = re.search(r'T2  - (.+)', entry)
    note_field = re.search(r'N1  - <p>(.*?)</p>', entry, re.DOTALL)
    if not (series_field and note_field):
        return entry
    if "Lecture Notes in Computer Science" not in series_field.group(1):
        return entry

    note = note_field.group(1)
    for full_name, acronym in conference_map.items():
        if full_name not in note:
            continue
        # e.g. "ECCV 2020"
        edition = re.search(rf'{re.escape(acronym)}\s+\d{{4}}', note)
        if not edition:
            continue
        replacement = f'{full_name}, {edition.group()}; Conference date'
        return re.sub(r'(T2  - ).+', f"\\1{replacement}", entry)

    return entry

# Whole-file driver.
def process_ris_file(input_path: str, output_path: str):
    """Apply ``process_ris_entry`` to every record of a RIS file.

    The input is cut at each ``\\nER  -`` marker. Every non-empty piece is
    processed and gets its ``ER  -`` line back, and the records are written
    to *output_path* separated by one blank line.
    """
    text = Path(input_path).read_text(encoding='utf-8')

    processed_entries = []
    for chunk in text.strip().split('\nER  -'):
        body = chunk.strip()
        if body:
            processed_entries.append(process_ris_entry(body) + '\nER  -')

    Path(output_path).write_text('\n\n'.join(processed_entries), encoding='utf-8')


# Example usage.
if __name__ == '__main__':
    input_ris = R'D:\Programs\Codes\Skill-Up\learning-smorgasbord\arxiv\data\20250510_scopus_LNCS_tad_tal_303.ris'
    output_ris = R'D:\Programs\Codes\Skill-Up\learning-smorgasbord\arxiv\data\20250510_scopus_LNCS_tad_tal_303_processed.ris'
    process_ris_file(input_ris, output_ris)

    print(f"Processed RIS file saved to {output_ris}")

Processed RIS file saved to D:\Programs\Codes\Skill-Up\learning-smorgasbord\arxiv\data\20250510_scopus_LNCS_tad_tal_303_processed.ris


In [21]:
# 去除Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)的杂项
import re
from pathlib import Path

# Extended list of conference acronyms to look for in the N1 note.
abbrs = [
    'ECCV', 'ACCV', 'ACPR', 'CGI', 'DAGM-GCPR', 'ICANN', 'ICIAP', 'ICIC',
    'ICIG', 'ICIRA', 'ICONIP', 'ICPR', 'ISVC', 'MMM', 'PCM', 'PRCV',
    'PRICAI', 'SCIA', 'CAAI', 'NPC', 'ADMA', 'CVM', 'HCC', 'ICSI',
    'CCBR', 'ICFEM', 'MICCAI', 'DMAH', 'MICCAI', 'CAAI',
    'ICR', 'Euro-Par', 'MLDM', 'IbPRIA', 'ICPRAI',
    'ICCSA', 'ICPRAI', 'CAIP', 'ICDAR', 'CICAI',
]


def process_ris_entry(entry: str) -> str:
    """Shorten a generic LNCS ``T2`` line to ``T2  - <ACRONYM> <year>``.

    The entry is changed only when its ``T2`` line mentions
    "Lecture Notes in Computer Science" and its ``N1  - <p>...</p>`` note
    contains one of the acronyms in ``abbrs`` followed by a four-digit year
    (for example ``ECCV 2018``). The first such occurrence in the note is
    used. Otherwise the entry is returned unchanged.
    """
    series_line = re.search(r'^(T2  - .+)$', entry, re.MULTILINE)
    note = re.search(r'^N1  - <p>(.*?)</p>$', entry, re.MULTILINE | re.DOTALL)
    if series_line is None or note is None:
        return entry
    # Only Lecture Notes in Computer Science entries are rewritten.
    if 'Lecture Notes in Computer Science' not in series_line.group(1):
        return entry

    alternatives = '|'.join(re.escape(acronym) for acronym in abbrs)
    found = re.search(r'\b(' + alternatives + r')\s+(\d{4})\b', note.group(1))
    if found is None:
        return entry

    acronym, year = found.groups()
    return re.sub(r'^(T2  - .+)$', f'T2  - {acronym} {year}', entry, flags=re.MULTILINE)

# Whole-file driver.
def process_ris_file(input_path: str, output_path: str):
    """Apply ``process_ris_entry`` to every record of a RIS file.

    The input is cut at each ``ER  -`` line. Every non-empty record is
    handed to ``process_ris_entry`` with its ``ER  -`` line restored, and the
    records are written to *output_path* one after another, ending with a
    single newline.

    Args:
        input_path: Path of the UTF-8 encoded RIS file to read.
        output_path: Path of the RIS file to write (overwritten).
    """
    text = Path(input_path).read_text(encoding='utf-8')

    # The stripped text ends with "ER  -" and no newline, so the separator
    # must also match at the end of the text. Otherwise the last record keeps
    # its own ER line and receives a second one below.
    raw_entries = re.split(r'\nER  -\s*(?:\n|\Z)', text.strip())

    processed = []
    for raw in raw_entries:
        entry = raw.strip()
        if not entry:
            continue
        # Restore the ER line before processing the record.
        processed.append(process_ris_entry(entry + '\nER  -'))

    result = '\n'.join(processed).strip() + '\n'
    Path(output_path).write_text(result, encoding='utf-8')

# Script entry point.
if __name__ == '__main__':
    import argparse

    # NOTE: command-line parsing is disabled for now; the paths below are used instead.
    # parser = argparse.ArgumentParser(description='Process RIS file, updating T2 based on conference abbreviations.')
    # parser.add_argument('input', help='Input RIS file path')
    # parser.add_argument('output', help='Output RIS file path')
    # args = parser.parse_args()

    input_ris = R'D:\Programs\Codes\Skill-Up\learning-smorgasbord\arxiv\data\20250510_scopus_LNCS_tad_tal_303.ris'
    output_ris = R'D:\Programs\Codes\Skill-Up\learning-smorgasbord\arxiv\data\20250510_scopus_LNCS_tad_tal_303_processed.ris'
    process_ris_file(input_ris, output_ris)
    # Print the path from the raw-string variable. Written inline in a normal
    # f-string, "\a" and "\202" are escape sequences and garble the message.
    print(f"Processed RIS file saved to {output_ris}")

Processed RIS file saved to D:\Programs\Codes\Skill-Up\learning-smorgasbordrxiv\data50510_scopus_LNCS_tad_tal_303_processed.ris


In [27]:
# 比较不同ris相同的条目
def extract_titles_from_ris(path):
    """Return the set of all ``TI`` values found in the RIS file at *path*.

    Only lines that start with ``TI  -`` are considered; the text after that
    prefix is stripped of surrounding whitespace.
    """
    prefix = 'TI  -'
    with open(path, 'r', encoding='utf-8') as f:
        return {
            line[len(prefix):].strip()
            for line in f
            if line.startswith(prefix)
        }

def find_common_titles(ris_path1, ris_path2):
    """Return the sorted list of ``TI`` values that occur in both RIS files.

    Titles are compared as exact strings, as returned by
    ``extract_titles_from_ris``.
    """
    first_titles = extract_titles_from_ris(ris_path1)
    second_titles = extract_titles_from_ris(ris_path2)
    return sorted(first_titles.intersection(second_titles))


if __name__ == '__main__':
    # Replace these with the paths of your own RIS files.
    file1 = R'D:\Programs\Codes\Skill-Up\learning-smorgasbord\arxiv\data\20250510_scopus_LNCS_tad_tal_303_processed_backup.ris'
    file2 = R"D:\Users\tang\Desktop\CCFC\CCFC.ris"

    common_titles = find_common_titles(file1, file2)
    if not common_titles:
        print("未发现相同的 TI 条目。")
    else:
        print("两个 RIS 文件中相同的 TI 条目有：")
        for t in common_titles:
            print(f"- {t}")

两个 RIS 文件中相同的 TI 条目有：
- Dimensionality Deduction for Action Proposals: To Extract or to Select?
- Fast Video Clip Retrieval Method via Language Query
- Get the whole action event by action stage classification
- Improving saliency models by predicting human fixation patches
- Marine Vertebrate Predator Detection and Recognition in Underwater Videos by Region Convolutional Neural Network
- Modeling the temporality of saliency
- Multi-label discriminative weakly-supervised human activity recognition and localization
- Online Aggregated-Event Representation for Multiple Event Detection in Videos
- PON: Proposal Optimization Network for Temporal Action Proposal Generation
- Periodic Action Temporal Localization Method Based on Two-Path Architecture for Product Counting in Sewing Video
- STN-BA: Weakly-Supervised Few-Shot Temporal Action Localization
- TadML: A Fast Temporal Action Detection with Mechanics-MLP
- Temporal Relation-Aware Global Attention Network for Temporal Action Detection
