In [3]:
# 3.处理重复的条目，保留最详细的，并且在日志里面输出当前去重过的期刊
import uuid
import logging
import datetime
import re
from collections import defaultdict

def normalize_title(title):
    """Return a comparison key for a title.

    The key is lowercase, has hyphens treated as word separators, has all
    other punctuation removed and has every run of whitespace collapsed to a
    single space, so "Weakly-Supervised  Learning" and
    "weakly supervised learning" produce the same key.

    Args:
        title: Raw title string.

    Returns:
        The normalized title; an empty string if nothing but punctuation
        and whitespace was supplied.
    """
    # Hyphens separate words; deleting them would glue the words together
    # and make "a-b" differ from "a b".
    spaced = re.sub(r'-+', ' ', title.lower())
    stripped = re.sub(r'[^\w\s]', '', spaced)
    # Removing punctuation can leave doubled spaces behind, so collapse last.
    return re.sub(r'\s+', ' ', stripped).strip()

def parse_ris_file(file_path):
    """Parse a RIS file into a list of entries.

    Each entry is a dict mapping a two-character RIS tag to the list of
    values found for that tag, in file order.

    Handling of irregular input:
      * A leading UTF-8 byte-order mark is ignored.
      * A tag with an empty value (e.g. "N1  - ") is stored as ''.
      * A line that is not a tag line is treated as the continuation of the
        previous value and appended to it with a single space; such a line
        appearing before any tag of the record is ignored.
      * A final record that is not closed by "ER  -" is still returned.

    Args:
        file_path: Path of the UTF-8 encoded RIS file.

    Returns:
        List of dicts of the form {tag: [value, ...]}.
    """
    # Tag, two spaces, hyphen, then optionally a space and the value. The
    # value part is optional because lines are stripped before matching.
    tag_line = re.compile(r'^([A-Z][A-Z0-9]) {2}-(?: (.*))?$')
    entries = []
    current_entry = {}
    last_tag = None
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                continue
            match = tag_line.match(line)
            if match is None:
                # Wrapped value: fold it into the value it continues.
                if last_tag is not None:
                    values = current_entry[last_tag]
                    values[-1] = f"{values[-1]} {line}".strip()
                continue
            tag = match.group(1)
            value = match.group(2) or ''
            if tag == 'ER':
                if current_entry:
                    entries.append(current_entry)
                current_entry = {}
                last_tag = None
                continue
            current_entry.setdefault(tag, []).append(value)
            last_tag = tag
    # Keep a trailing record even when its ER terminator is missing.
    if current_entry:
        entries.append(current_entry)
    return entries

def count_entry_fields(entry):
    """Return how many values an entry holds, summed over all of its tags."""
    return sum(map(len, entry.values()))

def deduplicate_by_field(entries, field, normalize=False):
    """Drop entries that share the same value for ``field``.

    Entries are grouped by the first value of ``field`` (passed through
    ``normalize_title`` when ``normalize`` is true). From every group the
    entry with the most values, as counted by ``count_entry_fields``, is
    kept; on a tie the one that appears first in ``entries`` wins.

    The result preserves input order: a kept entry takes the position at
    which its group was first seen. Entries that lack the field, have an
    empty value for it, or whose value normalizes to an empty string cannot
    be compared and are passed through untouched.

    Args:
        entries: List of entry dicts of the form {tag: [value, ...]}.
        field: RIS tag used as the duplicate key, e.g. 'TI' or 'DO'.
        normalize: Whether to normalize the value before comparing.

    Returns:
        Tuple ``(deduplicated, log_messages)``: the surviving entries and
        one message per removed entry.
    """
    groups = {}       # comparison key -> entries sharing it, in input order
    output_plan = []  # (key, None) for a group slot, (None, entry) otherwise

    for entry in entries:
        # "or ['']" also covers a tag that is present with no values.
        field_value = (entry.get(field) or [''])[0]
        key = normalize_title(field_value) if normalize and field_value else field_value
        if not key:
            # Nothing to compare on; an empty key must not merge entries.
            output_plan.append((None, entry))
            continue
        if key not in groups:
            groups[key] = []
            output_plan.append((key, None))
        groups[key].append(entry)

    deduplicated = []
    log_messages = []

    for key, passthrough in output_plan:
        if key is None:
            deduplicated.append(passthrough)
            continue
        entries_group = groups[key]
        if len(entries_group) == 1:
            deduplicated.append(entries_group[0])
            continue
        # sorted() is stable, so equally detailed entries keep input order,
        # and the grouped lists themselves are left unmodified.
        ranked = sorted(entries_group, key=count_entry_fields, reverse=True)
        kept_entry = ranked[0]
        deduplicated.append(kept_entry)
        for removed_entry in ranked[1:]:
            log_messages.append(
                f"Removed duplicate entry with {field} '{key}' "
                f"(kept {count_entry_fields(kept_entry)} fields, "
                f"removed {count_entry_fields(removed_entry)} fields, "
                f"title: '{removed_entry.get('TI', [''])[0]}')"
            )

    return deduplicated, log_messages

def deduplicate_entries(entries):
    """Run the two deduplication passes: normalized title, then DOI.

    Returns:
        Tuple of the surviving entries and the log messages of both passes,
        title messages first.
    """
    unique_by_title, title_messages = deduplicate_by_field(entries, 'TI', normalize=True)
    unique_by_doi, doi_messages = deduplicate_by_field(unique_by_title, 'DO', normalize=False)
    return unique_by_doi, title_messages + doi_messages

def write_ris_file(entries, output_path):
    """Serialize entries to ``output_path`` as UTF-8 RIS text.

    Every value becomes one "TAG  - value" line, each record is closed by an
    "ER  -" line, and consecutive records are separated by one blank line
    (no blank line follows the last record).
    """
    records = []
    for entry in entries:
        field_lines = [
            f"{tag}  - {value}\n"
            for tag, values in entry.items()
            for value in values
        ]
        records.append("".join(field_lines) + "ER  -\n")
    with open(output_path, 'w', encoding='utf-8') as file:
        # Joining with "\n" yields the blank separator line between records.
        file.write("\n".join(records))

def setup_logging():
    """Send INFO-level root-logger output to a new timestamped log file.

    The file is named ``deduplication_log_<YYYYmmdd_HHMMSS>.txt`` and is
    created in the current working directory, encoded as UTF-8 so that
    non-ASCII titles can be logged on any platform.

    ``logging.basicConfig`` is deliberately not used: it does nothing when
    the root logger already has a handler, which is the case when this
    function runs a second time in the same interpreter (for example when a
    notebook cell is re-executed). Instead the handler installed by a
    previous call is replaced, and handlers owned by other code are left
    alone.
    """
    handler_name = 'deduplication_log'
    log_path = f'deduplication_log_{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}.txt'
    root_logger = logging.getLogger()

    # Retire the handler from an earlier call so messages are not duplicated
    # into a stale file.
    for handler in list(root_logger.handlers):
        if handler.get_name() == handler_name:
            root_logger.removeHandler(handler)
            handler.close()

    file_handler = logging.FileHandler(log_path, encoding='utf-8')
    file_handler.set_name(handler_name)
    file_handler.setFormatter(logging.Formatter('%(asctime)s - %(message)s'))
    root_logger.addHandler(file_handler)
    root_logger.setLevel(logging.INFO)

def main(input_file, output_file):
    """Deduplicate ``input_file`` into ``output_file`` and log what was removed.

    Args:
        input_file: Path of the RIS file to read.
        output_file: Path of the deduplicated RIS file to write.
    """
    setup_logging()

    parsed = parse_ris_file(input_file)
    unique, removal_notes = deduplicate_entries(parsed)
    write_ris_file(unique, output_file)

    # One log line per removed duplicate, followed by the overall summary.
    summary = f"Processed {len(parsed)} entries, kept {len(unique)} entries"
    for note in removal_notes + [summary]:
        logging.info(note)

    print(f"Deduplication complete. Output written to {output_file}")
    print("Log written to deduplication_log_*.txt")

# Script entry point: deduplicate the merged Scopus + Web of Science export.
# Both paths are relative to the current working directory.
if __name__ == "__main__":
    input_file = "../data/20250508_scopus_3837_tad_tal 20250508_wos_886_tad_tal.ris"  # Replace with your input RIS file path
    output_file = "../data/20250508_scopus_3837_tad_tal 20250508_wos_886_tad_tal_deduplication.ris"  # Replace with your desired output RIS file path
    main(input_file, output_file)

Deduplication complete. Output written to ../data/20250508_scopus_3837_tad_tal 20250508_wos_886_tad_tal_deduplication.ris
Log written to deduplication_log_*.txt


In [4]:
import uuid
import logging
import datetime
import re
from collections import defaultdict

def normalize_title(title):
    """Build a case-, punctuation- and hyphen-insensitive key for a title.

    Hyphen runs become a space, the text is lowercased, every character that
    is neither a word character nor whitespace is dropped, and whitespace
    runs are collapsed to one space.
    """
    dehyphenated = re.sub(r'-+', ' ', title)
    bare = re.sub(r'[^\w\s]', '', dehyphenated.lower())
    return re.sub(r'\s+', ' ', bare.strip())

def parse_ris_file(file_path):
    """Parse a RIS file into a list of entries.

    Each entry is a dict mapping a two-character RIS tag to the list of
    values found for that tag, in file order.

    Handling of irregular input:
      * A leading UTF-8 byte-order mark is ignored.
      * A tag with an empty value (e.g. "N1  - ") is stored as ''.
      * A line that is not a tag line is treated as the continuation of the
        previous value and appended to it with a single space; such a line
        appearing before any tag of the record is ignored.
      * A final record that is not closed by "ER  -" is still returned.

    Args:
        file_path: Path of the UTF-8 encoded RIS file.

    Returns:
        List of dicts of the form {tag: [value, ...]}.
    """
    # Tag, two spaces, hyphen, then optionally a space and the value. The
    # value part is optional because lines are stripped before matching.
    tag_line = re.compile(r'^([A-Z][A-Z0-9]) {2}-(?: (.*))?$')
    entries = []
    current_entry = {}
    last_tag = None
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                continue
            match = tag_line.match(line)
            if match is None:
                # Wrapped value: fold it into the value it continues.
                if last_tag is not None:
                    values = current_entry[last_tag]
                    values[-1] = f"{values[-1]} {line}".strip()
                continue
            tag = match.group(1)
            value = match.group(2) or ''
            if tag == 'ER':
                if current_entry:
                    entries.append(current_entry)
                current_entry = {}
                last_tag = None
                continue
            current_entry.setdefault(tag, []).append(value)
            last_tag = tag
    # Keep a trailing record even when its ER terminator is missing.
    if current_entry:
        entries.append(current_entry)
    return entries

def count_entry_fields(entry):
    """Return the total number of values stored in an entry (one per data line)."""
    total = 0
    for values in entry.values():
        total += len(values)
    return total

def deduplicate_by_field(entries, field, normalize=False):
    """Drop entries that share the same value for ``field``.

    Entries are grouped by the first value of ``field`` (passed through
    ``normalize_title`` when ``normalize`` is true). From every group the
    entry with the most values, as counted by ``count_entry_fields``, is
    kept; on a tie the one that appears first in ``entries`` wins.

    The result preserves input order: a kept entry takes the position at
    which its group was first seen. Entries that lack the field, have an
    empty value for it, or whose value normalizes to an empty string cannot
    be compared and are passed through untouched.

    Args:
        entries: List of entry dicts of the form {tag: [value, ...]}.
        field: RIS tag used as the duplicate key, e.g. 'TI' or 'DO'.
        normalize: Whether to normalize the value before comparing.

    Returns:
        Tuple ``(deduplicated, log_messages)``: the surviving entries and
        one message per removed entry.
    """
    groups = {}       # comparison key -> entries sharing it, in input order
    output_plan = []  # (key, None) for a group slot, (None, entry) otherwise

    for entry in entries:
        # "or ['']" also covers a tag that is present with no values.
        field_value = (entry.get(field) or [''])[0]
        key = normalize_title(field_value) if normalize and field_value else field_value
        if not key:
            # Nothing to compare on; an empty key must not merge entries.
            output_plan.append((None, entry))
            continue
        if key not in groups:
            groups[key] = []
            output_plan.append((key, None))
        groups[key].append(entry)

    deduplicated = []
    log_messages = []

    for key, passthrough in output_plan:
        if key is None:
            deduplicated.append(passthrough)
            continue
        entries_group = groups[key]
        if len(entries_group) == 1:
            deduplicated.append(entries_group[0])
            continue
        # sorted() is stable, so equally detailed entries keep input order,
        # and the grouped lists themselves are left unmodified.
        ranked = sorted(entries_group, key=count_entry_fields, reverse=True)
        kept_entry = ranked[0]
        deduplicated.append(kept_entry)
        for removed_entry in ranked[1:]:
            log_messages.append(
                f"Removed duplicate entry with {field} '{key}' "
                f"(kept {count_entry_fields(kept_entry)} fields, "
                f"removed {count_entry_fields(removed_entry)} fields, "
                f"title: '{removed_entry.get('TI', [''])[0]}')"
            )

    return deduplicated, log_messages

def deduplicate_entries(entries):
    """Deduplicate by normalized title (TI), then by exact DOI (DO).

    Returns:
        Tuple of the surviving entries and the combined log messages, in
        pass order.
    """
    passes = (('TI', True), ('DO', False))
    messages = []
    for field, use_normalization in passes:
        entries, pass_messages = deduplicate_by_field(entries, field, normalize=use_normalization)
        messages = messages + pass_messages
    return entries, messages

def write_ris_file(entries, output_path):
    """Write entries to ``output_path`` in RIS form, UTF-8 encoded.

    Records end with an "ER  -" line; a single blank line is placed between
    records and none after the final one.
    """
    with open(output_path, 'w', encoding='utf-8') as file:
        for position, entry in enumerate(entries):
            if position:
                # Blank separator goes before every record except the first.
                file.write("\n")
            for tag, values in entry.items():
                file.writelines(f"{tag}  - {value}\n" for value in values)
            file.write("ER  -\n")

def setup_logging():
    """Send INFO-level root-logger output to a new timestamped log file.

    The file is named ``deduplication_log_<YYYYmmdd_HHMMSS>.txt`` and is
    created in the current working directory, encoded as UTF-8 so that
    non-ASCII titles can be logged on any platform.

    ``logging.basicConfig`` is deliberately not used: it does nothing when
    the root logger already has a handler, which is the case when this
    function runs a second time in the same interpreter (for example when a
    notebook cell is re-executed). Instead the handler installed by a
    previous call is replaced, and handlers owned by other code are left
    alone.
    """
    handler_name = 'deduplication_log'
    log_path = f'deduplication_log_{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}.txt'
    root_logger = logging.getLogger()

    # Retire the handler from an earlier call so messages are not duplicated
    # into a stale file.
    for handler in list(root_logger.handlers):
        if handler.get_name() == handler_name:
            root_logger.removeHandler(handler)
            handler.close()

    file_handler = logging.FileHandler(log_path, encoding='utf-8')
    file_handler.set_name(handler_name)
    file_handler.setFormatter(logging.Formatter('%(asctime)s - %(message)s'))
    root_logger.addHandler(file_handler)
    root_logger.setLevel(logging.INFO)

def main(input_file, output_file):
    """Run the pipeline: parse, deduplicate, write the result, log removals.

    Args:
        input_file: Path of the RIS file to read.
        output_file: Path of the deduplicated RIS file to write.
    """
    setup_logging()

    source_entries = parse_ris_file(input_file)
    kept_entries, dedup_log = deduplicate_entries(source_entries)
    write_ris_file(kept_entries, output_file)

    # Removal details first, then the totals.
    for log_line in dedup_log:
        logging.info(log_line)
    processed, kept = len(source_entries), len(kept_entries)
    logging.info(f"Processed {processed} entries, kept {kept} entries")

    print(f"Deduplication complete. Output written to {output_file}")
    print("Log written to deduplication_log_*.txt")

    
# Script entry point: rerun the deduplication on the merged Scopus + Web of
# Science export. Both paths are relative to the current working directory.
if __name__ == "__main__":
    input_file = "../data/20250508_scopus_3837_tad_tal 20250508_wos_886_tad_tal.ris"  # Replace with your input RIS file path
    output_file = "../data/20250508_scopus_3837_tad_tal 20250508_wos_886_tad_tal_deduplication.ris"  # Replace with your desired output RIS file path
    main(input_file, output_file)

Deduplication complete. Output written to ../data/20250508_scopus_3837_tad_tal 20250508_wos_886_tad_tal_deduplication.ris
Log written to deduplication_log_*.txt


In [1]:
# 找不同
import re
from collections import defaultdict
import uuid

def _finalize_ris_entry(fields):
    """Return a plain dict in which single-valued tags hold a string and multi-valued AU/KW lists are sorted."""
    entry = {}
    for tag, values in fields.items():
        if len(values) == 1:
            entry[tag] = values[0]
        elif tag in ('AU', 'KW'):
            # Sorted so the order in which a source lists them is irrelevant.
            entry[tag] = sorted(values)
        else:
            entry[tag] = values
    return entry


def parse_ris_entries(file_path):
    """Parse a RIS file and return one dict per record.

    A record ends at an ``ER`` tag, or when a ``TY`` tag appears while a
    record is still open. A tag that occurs once maps to a string; a tag
    that occurs several times maps to a list (sorted for AU and KW). A line
    that is not a tag line is appended, after a space, to the most recent
    value of the current record; such a line outside a record is ignored.
    A leading UTF-8 byte-order mark is ignored.

    Args:
        file_path: Path of the UTF-8 encoded RIS file.

    Returns:
        List of entry dicts. An empty list is returned, after printing a
        message, if the file is missing or cannot be parsed.
    """
    # At least one space is required before the hyphen so that wrapped text
    # such as "AI-based ..." is not mistaken for a tag line.
    tag_pattern = re.compile(r'^([A-Z0-9]{2})\s+-\s*(.*)$')
    entries = []
    current_entry = defaultdict(list)
    current_tag = None

    try:
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            for line in file:
                line = line.strip()
                if not line:
                    continue

                match = tag_pattern.match(line)
                if match is None:
                    # Continuation of a multi-line field.
                    if current_tag is not None:
                        current_entry[current_tag][-1] += ' ' + line
                    continue

                tag, value = match.groups()
                if tag == 'ER' or (tag == 'TY' and current_entry):
                    if current_entry:
                        entries.append(_finalize_ris_entry(current_entry))
                        current_entry = defaultdict(list)
                    # Forget the last tag so a stray line between records
                    # cannot be appended to a value that no longer exists.
                    current_tag = None
                    if tag == 'ER':
                        continue

                current_tag = tag
                current_entry[current_tag].append(value.strip())

        # Save the last record when the file does not end with ER.
        if current_entry:
            entries.append(_finalize_ris_entry(current_entry))

        return entries

    except FileNotFoundError:
        print(f"Error: File {file_path} not found.")
        return []
    except Exception as e:
        print(f"Error parsing {file_path}: {str(e)}")
        return []

def find_missing_entries(file1_path, file2_path):
    """Return the entries of file2 whose title (TI) does not occur in file1.

    Titles are compared as exact strings. An entry carrying several TI
    lines is parsed to a list; its titles are joined with a space so it can
    still be compared instead of raising ``TypeError`` (lists are not
    hashable). Entries without a title are never reported.

    Args:
        file1_path: Path of the reference RIS file.
        file2_path: Path of the RIS file to check against the reference.

    Returns:
        List of entry dicts from file2, or ``{"error": <message>}`` when
        either file yields no entries.
    """
    def title_of(entry):
        title = entry.get('TI')
        if isinstance(title, list):
            title = ' '.join(title)
        return title or ''

    entries1 = parse_ris_entries(file1_path)
    entries2 = parse_ris_entries(file2_path)

    if not entries1 or not entries2:
        return {"error": "One or both files could not be parsed."}

    known_titles = {title_of(entry) for entry in entries1}
    known_titles.discard('')

    return [
        entry for entry in entries2
        if title_of(entry) and title_of(entry) not in known_titles
    ]

def generate_report(missing_entries):
    """Render the result of the comparison as a plain-text report.

    Args:
        missing_entries: Either a list of entry dicts, or a dict holding an
            'error' message.

    Returns:
        The report as one newline-joined string.
    """
    header = ["Missing Entries Report", "=" * 30, ""]

    if isinstance(missing_entries, dict) and 'error' in missing_entries:
        return '\n'.join(header + [f"Error: {missing_entries['error']}"])

    lines = header + [
        f"Found {len(missing_entries)} entries in file2 that are not in file1:",
        "-" * 50,
    ]
    for number, entry in enumerate(missing_entries, 1):
        authors = entry.get('AU', 'No authors')
        if isinstance(authors, list):
            # Several authors are stored as a list; show them on one line.
            authors = ', '.join(authors)
        lines.extend((
            f"Entry {number}:",
            f"  Title: {entry.get('TI', 'No title')}",
            f"  Authors: {authors}",
            f"  DOI: {entry.get('DO', 'No DOI')}",
            f"  Journal: {entry.get('T2', 'No journal')}",
            "",
        ))

    return '\n'.join(lines)

def main():
    """Compare the two hard-coded RIS files, print the report and save a copy.

    The copy is written to the current working directory under a name made
    unique with a random UUID.
    """
    reference_ris = R'../data/20250508_scopus_3837_tad_tal 20250508_wos_886_tad_tal_3891-3_3888_deduplication_end_arxiv_1635_5523_deduplication_4374.ris'
    candidate_ris = R'../data/4373.ris'

    report = generate_report(find_missing_entries(reference_ris, candidate_ris))
    print(report)

    report_file = f'missing_entries_report_{uuid.uuid4()}.txt'
    with open(report_file, 'w', encoding='utf-8') as handle:
        handle.write(report)
    print(f"\nReport saved to {report_file}")

# Script entry point: run the comparison only when executed directly.
if __name__ == "__main__":
    main()

Missing Entries Report

Found 2 entries in file2 that are not in file1:
--------------------------------------------------
Entry 1:
  Title: Weakly-Supervised Temporal Action Localization Through Local-Global Background Modeling
  Authors: Huang, Z., Qing, Z., Sang, N., Shao, Y., Wang, X.
  DOI: 10.48550/arXiv.2106.11811
  Journal: CVPR-2021 HACS Challenge - Weakly-supervised Learning Track champion solution (1st Place)

Entry 2:
  Title: Learning Spatio-Temporal Representation With Local and Global Diffusion
  Authors: Mei, T., Ngox, C.-W., Qiuy, Z., Tiany, X., Yaoz, T.
  DOI: 10.1109/cvpr.2019.01233
  Journal: 2019 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)


Report saved to missing_entries_report_348ae166-ba6a-46a6-ba8c-40c1ca093118.txt


In [1]:
# Deduplicate a RIS file (titles compared ignoring hyphens and extra spaces, then by DOI)
import uuid
import logging
import datetime
import re
from collections import defaultdict

def normalize_title(title):
    """Build a hyphen-insensitive comparison key for a title.

    Runs of hyphens are replaced by a space and runs of whitespace are
    collapsed; letter case and all other punctuation are left as they are.
    """
    without_hyphens = re.sub(r'-+', ' ', title)
    return re.sub(r'\s+', ' ', without_hyphens).strip()

def parse_ris_file(file_path):
    """Parse a RIS file into a list of entries.

    Each entry is a dict mapping a two-character RIS tag to the list of
    values found for that tag, in file order.

    Handling of irregular input:
      * A leading UTF-8 byte-order mark is ignored.
      * A tag with an empty value (e.g. "N1  - ") is stored as ''.
      * A line that is not a tag line is treated as the continuation of the
        previous value and appended to it with a single space; such a line
        appearing before any tag of the record is ignored.
      * A final record that is not closed by "ER  -" is still returned.

    Args:
        file_path: Path of the UTF-8 encoded RIS file.

    Returns:
        List of dicts of the form {tag: [value, ...]}.
    """
    # Tag, two spaces, hyphen, then optionally a space and the value. The
    # value part is optional because lines are stripped before matching.
    tag_line = re.compile(r'^([A-Z][A-Z0-9]) {2}-(?: (.*))?$')
    entries = []
    current_entry = {}
    last_tag = None
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                continue
            match = tag_line.match(line)
            if match is None:
                # Wrapped value: fold it into the value it continues.
                if last_tag is not None:
                    values = current_entry[last_tag]
                    values[-1] = f"{values[-1]} {line}".strip()
                continue
            tag = match.group(1)
            value = match.group(2) or ''
            if tag == 'ER':
                if current_entry:
                    entries.append(current_entry)
                current_entry = {}
                last_tag = None
                continue
            current_entry.setdefault(tag, []).append(value)
            last_tag = tag
    # Keep a trailing record even when its ER terminator is missing.
    if current_entry:
        entries.append(current_entry)
    return entries

def count_entry_fields(entry):
    """Return the number of values an entry carries across all tags."""
    lengths = [len(entry[tag]) for tag in entry]
    return sum(lengths)

def deduplicate_by_field(entries, field, normalize=False):
    """Drop entries that share the same value for ``field``.

    Entries are grouped by the first value of ``field`` (passed through
    ``normalize_title`` when ``normalize`` is true). From every group the
    entry with the most values, as counted by ``count_entry_fields``, is
    kept; on a tie the one that appears first in ``entries`` wins.

    The result preserves input order: a kept entry takes the position at
    which its group was first seen. Entries that lack the field, have an
    empty value for it, or whose value normalizes to an empty string cannot
    be compared and are passed through untouched.

    Args:
        entries: List of entry dicts of the form {tag: [value, ...]}.
        field: RIS tag used as the duplicate key, e.g. 'TI' or 'DO'.
        normalize: Whether to normalize the value before comparing.

    Returns:
        Tuple ``(deduplicated, log_messages)``: the surviving entries and
        one message per removed entry.
    """
    groups = {}       # comparison key -> entries sharing it, in input order
    output_plan = []  # (key, None) for a group slot, (None, entry) otherwise

    for entry in entries:
        # "or ['']" also covers a tag that is present with no values.
        field_value = (entry.get(field) or [''])[0]
        key = normalize_title(field_value) if normalize and field_value else field_value
        if not key:
            # Nothing to compare on; an empty key must not merge entries.
            output_plan.append((None, entry))
            continue
        if key not in groups:
            groups[key] = []
            output_plan.append((key, None))
        groups[key].append(entry)

    deduplicated = []
    log_messages = []

    for key, passthrough in output_plan:
        if key is None:
            deduplicated.append(passthrough)
            continue
        entries_group = groups[key]
        if len(entries_group) == 1:
            deduplicated.append(entries_group[0])
            continue
        # sorted() is stable, so equally detailed entries keep input order,
        # and the grouped lists themselves are left unmodified.
        ranked = sorted(entries_group, key=count_entry_fields, reverse=True)
        kept_entry = ranked[0]
        deduplicated.append(kept_entry)
        for removed_entry in ranked[1:]:
            log_messages.append(
                f"Removed duplicate entry with {field} '{key}' "
                f"(kept {count_entry_fields(kept_entry)} fields, "
                f"removed {count_entry_fields(removed_entry)} fields, "
                f"title: '{removed_entry.get('TI', [''])[0]}')"
            )

    return deduplicated, log_messages

def deduplicate_entries(entries):
    """Remove duplicates in two passes: normalized TI, then exact DO.

    Returns:
        Tuple of the remaining entries and every log message produced, with
        the title-pass messages ahead of the DOI-pass messages.
    """
    after_title_pass, log_from_titles = deduplicate_by_field(
        entries, 'TI', normalize=True
    )
    after_doi_pass, log_from_dois = deduplicate_by_field(
        after_title_pass, 'DO', normalize=False
    )
    combined_log = log_from_titles + log_from_dois
    return after_doi_pass, combined_log

def write_ris_file(entries, output_path):
    """Save entries as a UTF-8 RIS file at ``output_path``.

    One "TAG  - value" line is written per value, "ER  -" closes each
    record, and records are separated (not followed) by a blank line.
    """
    last_index = len(entries) - 1
    with open(output_path, 'w', encoding='utf-8') as file:
        for index, entry in enumerate(entries):
            for tag, values in entry.items():
                for value in values:
                    print(f"{tag}  - {value}", file=file)
            # The final record gets the terminator only, no trailing blank.
            terminator = "ER  -\n" if index == last_index else "ER  -\n\n"
            file.write(terminator)

def setup_logging():
    """Send INFO-level root-logger output to a new timestamped log file.

    The file is named ``deduplication_log_<YYYYmmdd_HHMMSS>.txt`` and is
    created in the current working directory, encoded as UTF-8 so that
    non-ASCII titles can be logged on any platform.

    ``logging.basicConfig`` is deliberately not used: it does nothing when
    the root logger already has a handler, which is the case when this
    function runs a second time in the same interpreter (for example when a
    notebook cell is re-executed). Instead the handler installed by a
    previous call is replaced, and handlers owned by other code are left
    alone.
    """
    handler_name = 'deduplication_log'
    log_path = f'deduplication_log_{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}.txt'
    root_logger = logging.getLogger()

    # Retire the handler from an earlier call so messages are not duplicated
    # into a stale file.
    for handler in list(root_logger.handlers):
        if handler.get_name() == handler_name:
            root_logger.removeHandler(handler)
            handler.close()

    file_handler = logging.FileHandler(log_path, encoding='utf-8')
    file_handler.set_name(handler_name)
    file_handler.setFormatter(logging.Formatter('%(asctime)s - %(message)s'))
    root_logger.addHandler(file_handler)
    root_logger.setLevel(logging.INFO)

def main(input_file, output_file):
    """Read ``input_file``, remove duplicate records and write ``output_file``.

    Every removal and a final summary are sent to the log configured by
    ``setup_logging``.

    Args:
        input_file: Path of the RIS file to read.
        output_file: Path of the deduplicated RIS file to write.
    """
    setup_logging()

    original = parse_ris_file(input_file)
    survivors, removals = deduplicate_entries(original)
    write_ris_file(survivors, output_file)

    for removal in removals:
        logging.info(removal)
    logging.info(f"Processed {len(original)} entries, kept {len(survivors)} entries")

    for notice in (
        f"Deduplication complete. Output written to {output_file}",
        "Log written to deduplication_log_*.txt",
    ):
        print(notice)
    
# Script entry point: deduplicate the Scopus LNCS export. Both paths are
# relative to the current working directory.
if __name__ == "__main__":
    input_file = '../data/20250510_scopus_LNCS_tad_tal_303.ris'
    output_file = '../data/20250510_scopus_LNCS_tad_tal_303_processed.ris'
    main(input_file, output_file)



Deduplication complete. Output written to ../data/20250510_scopus_LNCS_tad_tal_303_processed.ris
Log written to deduplication_log_*.txt


In [1]:
import re
from typing import List, Set
import tkinter as tk
from tkinter import filedialog, messagebox
import os

def parse_ris_file(file_path: str) -> List[str]:
    """Return the titles (values of TI lines) found in a RIS file.

    Titles are returned in file order with surrounding whitespace removed.
    A TI line with no value is skipped. A leading UTF-8 byte-order mark is
    ignored.

    Args:
        file_path: Path of the UTF-8 encoded RIS file.

    Returns:
        List of titles; an empty list, after printing a message, if the
        file cannot be read or decoded.
    """
    try:
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            content = file.read()
    except (OSError, UnicodeError) as e:
        print(f"Error reading {file_path}: {e}")
        return []

    # [ \t] rather than \s: \s also matches the newline, which made an empty
    # TI line capture the following line as its title. The text is scanned
    # as a whole because cutting it at every "ER  -" could split a title
    # that happens to contain that sequence.
    raw_titles = re.findall(r'^TI[ \t]+-[ \t]+(.+)$', content, re.MULTILINE)
    return [title.strip() for title in raw_titles if title.strip()]

def compare_ris_files(file1_path: str, file2_path: str, reference_title: str) -> Set[str]:
    """Report whether ``reference_title`` occurs in file1 but not in file2.

    Surrounding whitespace is ignored on the reference title and on every
    title read from either file, so the same title is never treated as
    different merely because of padding.

    Args:
        file1_path: RIS file expected to contain the title.
        file2_path: RIS file checked for the title's absence.
        reference_title: Title to look for.

    Returns:
        A set holding the stripped reference title if it is in file1 and
        not in file2; otherwise an empty set.
    """
    wanted = reference_title.strip()
    # Sets give constant-time membership tests and apply one normalization
    # to both sides of the comparison.
    titles_in_file1 = {title.strip() for title in parse_ris_file(file1_path)}
    titles_in_file2 = {title.strip() for title in parse_ris_file(file2_path)}

    if wanted in titles_in_file1 and wanted not in titles_in_file2:
        return {wanted}
    return set()

def select_file() -> str:
    """Show an "open file" dialog offering RIS files and return the chosen path."""
    ris_first = [("RIS files", "*.ris"), ("All files", "*.*")]
    return filedialog.askopenfilename(filetypes=ris_first)

def main():
    """Ask for two RIS files and report whether a fixed title is in the first only.

    The user picks both files through dialogs; cancelling either dialog
    shows an error and stops. The result is shown in a message box. The
    hidden Tk root window is destroyed on every exit path so that no Tk
    instance is left behind.
    """
    # Message boxes and file dialogs need a Tk root; keep it invisible.
    root = tk.Tk()
    root.withdraw()

    try:
        selected_paths = []
        for ordinal in ("first", "second"):
            messagebox.showinfo("Select File", f"Please select the {ordinal} RIS file.")
            chosen = select_file()
            if not chosen:
                messagebox.showerror("Error", f"No file selected for the {ordinal} RIS file.")
                return
            selected_paths.append(chosen)
        file1_path, file2_path = selected_paths

        # Reference title to compare
        reference_title = "3D Human Pose Estimation with Dilated Sampled Frames"

        different_titles = compare_ris_files(file1_path, file2_path, reference_title)

        if different_titles:
            result = f"The following title was found in {os.path.basename(file1_path)} but not in {os.path.basename(file2_path)}:\n"
            result += "\n".join(different_titles)
        else:
            result = f"The title '{reference_title}' was either not found in {os.path.basename(file1_path)} or present in both files."

        messagebox.showinfo("Comparison Result", result)
    finally:
        root.destroy()

# Script entry point: start the interactive comparison only when executed directly.
if __name__ == "__main__":
    main()

: 