In [3]:
import re
import uuid

def parse_ris_file(file_path, target_t2=None):
    """Extract conference names from the records of a RIS export.

    The file is read line by line.  A line is a tag line when it has the
    RIS shape ``XX  - value`` (two-character tag, two spaces, hyphen); any
    other non-empty line is appended to the most recent field of the
    current record as a continuation.  Records start at ``TY`` and end at
    ``ER``.

    Args:
        file_path: Path of the RIS file to read.
        target_t2: ``T2`` value a record must have to be kept.  When
            ``None`` (the default) the Lecture Notes in Computer Science
            series title is used.

    Returns:
        A list of ``{'T2': ..., 'Conference_Name': ...}`` dicts, one per
        record whose ``T2`` equals the target and whose ``N1`` text
        contains ``Conference name: ...; Conference date:``.  An empty
        list is returned (after printing a message) when the file cannot
        be opened or decoded.
    """
    if target_t2 is None:
        target_t2 = "Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)"

    # Match the tag by its shape instead of a fixed list of prefixes, so
    # that tags such as C3/IS/CY are recognised and wrapped text that only
    # happens to start with "AU", "PY", ... is not mistaken for a tag.
    # Lines are stripped first, hence the tolerant ``\s*`` after the hyphen.
    tag_line = re.compile(r'^([A-Z][A-Z0-9]) {2}-\s*(.*)$')
    conference_pattern = re.compile(
        r'Conference name:\s*(.*?)\s*; Conference date:'
    )
    # Tags whose repeated occurrences are collected into a list.
    list_tags = {'AU', 'KW', 'A2', 'AD'}

    results = []
    current_entry = {}
    current_field = None

    try:
        # utf-8-sig also reads plain UTF-8 and drops a leading BOM, which
        # would otherwise hide the first tag of the file.
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            for raw_line in file:
                line = raw_line.strip()
                if not line:
                    continue

                tag_match = tag_line.match(line)
                if tag_match is None:
                    # Continuation of the previous field (ignored when
                    # there is no open record/field to attach it to).
                    if current_field in current_entry:
                        previous = current_entry[current_field]
                        if isinstance(previous, list):
                            previous[-1] += " " + line
                        else:
                            current_entry[current_field] = previous + " " + line
                    continue

                tag = tag_match.group(1)
                value = tag_match.group(2).strip()
                current_field = tag

                if tag == 'TY':
                    # A new record begins; flush one left open without ER.
                    if current_entry:
                        results.append(current_entry)
                    current_entry = {tag: value}
                elif tag == 'ER':
                    if current_entry:
                        results.append(current_entry)
                    current_entry = {}
                    current_field = None
                elif tag in list_tags:
                    current_entry.setdefault(tag, []).append(value)
                elif tag == 'N1' and tag in current_entry:
                    # Keep every note line rather than only the last one;
                    # the conference details may be on any of them.
                    current_entry[tag] += "; " + value
                else:
                    current_entry[tag] = value

        # Flush a trailing record that was not closed by ER.
        if current_entry:
            results.append(current_entry)

    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
        return []
    except (OSError, ValueError) as e:
        # ValueError covers UnicodeDecodeError raised while reading.
        print(f"Error parsing file: {e}")
        return []

    # Keep records of the target series and pull the conference name out
    # of the N1 notes.
    filtered_results = []
    for entry in results:
        if entry.get('T2') != target_t2:
            continue
        match = conference_pattern.search(entry.get('N1', ''))
        if match:
            filtered_results.append({
                'T2': entry['T2'],
                'Conference_Name': match.group(1).strip()
            })

    return filtered_results

def main():
    """Parse the hard-coded RIS export and print every matching entry."""
    # Adjust the path as needed
    ris_path = R'D:\Programs\Codes\Skill-Up\learning-smorgasbord\arxiv\top\scopus (3).ris'
    matches = parse_ris_file(ris_path)

    # Nothing to show: either no record matched or parsing failed.
    if not matches:
        print("No matching entries found or an error occurred.")
        return

    print("Extracted Entries:")
    for number, record in enumerate(matches, start=1):
        print(f"\nEntry {number}:")
        print(f"T2: {record['T2']}")
        print(f"Conference Name: {record['Conference_Name']}")

# Run main() only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

Extracted Entries:

Entry 1:
T2: Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)
Conference Name: 18th European Conference on Computer Vision, ECCV 2024

Entry 2:
T2: Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)
Conference Name: 20th Pacific Rim International Conference on Artificial Intelligence, PRICAI 2023

Entry 3:
T2: Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)
Conference Name: 45th Annual Conference of the German Association for Pattern Recognition, DAGM-GCPR 2023

Entry 4:
T2: Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)
Conference Name: 18th European Conference on Computer Vision, ECCV 2024

Entry 5:
T2: Lecture Notes in Computer Scie

In [1]:
import re
import uuid

def parse_ris_file(file_path, target_t2=None):
    """Extract conference names from the records of a RIS export.

    The file is read line by line.  A line is a tag line when it has the
    RIS shape ``XX  - value`` (two-character tag, two spaces, hyphen); any
    other non-empty line is appended to the most recent field of the
    current record as a continuation.  Records start at ``TY`` and end at
    ``ER``.

    Args:
        file_path: Path of the RIS file to read.
        target_t2: ``T2`` value a record must have to be kept.  When
            ``None`` (the default) the Lecture Notes in Computer Science
            series title is used.

    Returns:
        A list of ``{'T2': ..., 'Conference_Name': ...}`` dicts, one per
        record whose ``T2`` equals the target and whose ``N1`` text
        contains ``Conference name: ...; Conference date:``.  An empty
        list is returned (after printing a message) when the file cannot
        be opened or decoded.
    """
    if target_t2 is None:
        target_t2 = "Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)"

    # Match the tag by its shape instead of a fixed list of prefixes, so
    # that tags such as C3/IS/CY are recognised and wrapped text that only
    # happens to start with "AU", "PY", ... is not mistaken for a tag.
    # Lines are stripped first, hence the tolerant ``\s*`` after the hyphen.
    tag_line = re.compile(r'^([A-Z][A-Z0-9]) {2}-\s*(.*)$')
    conference_pattern = re.compile(
        r'Conference name:\s*(.*?)\s*; Conference date:'
    )
    # Tags whose repeated occurrences are collected into a list.
    list_tags = {'AU', 'KW', 'A2', 'AD'}

    results = []
    current_entry = {}
    current_field = None

    try:
        # utf-8-sig also reads plain UTF-8 and drops a leading BOM, which
        # would otherwise hide the first tag of the file.
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            for raw_line in file:
                line = raw_line.strip()
                if not line:
                    continue

                tag_match = tag_line.match(line)
                if tag_match is None:
                    # Continuation of the previous field (ignored when
                    # there is no open record/field to attach it to).
                    if current_field in current_entry:
                        previous = current_entry[current_field]
                        if isinstance(previous, list):
                            previous[-1] += " " + line
                        else:
                            current_entry[current_field] = previous + " " + line
                    continue

                tag = tag_match.group(1)
                value = tag_match.group(2).strip()
                current_field = tag

                if tag == 'TY':
                    # A new record begins; flush one left open without ER.
                    if current_entry:
                        results.append(current_entry)
                    current_entry = {tag: value}
                elif tag == 'ER':
                    if current_entry:
                        results.append(current_entry)
                    current_entry = {}
                    current_field = None
                elif tag in list_tags:
                    current_entry.setdefault(tag, []).append(value)
                elif tag == 'N1' and tag in current_entry:
                    # Keep every note line rather than only the last one;
                    # the conference details may be on any of them.
                    current_entry[tag] += "; " + value
                else:
                    current_entry[tag] = value

        # Flush a trailing record that was not closed by ER.
        if current_entry:
            results.append(current_entry)

    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
        return []
    except (OSError, ValueError) as e:
        # ValueError covers UnicodeDecodeError raised while reading.
        print(f"Error parsing file: {e}")
        return []

    # Keep records of the target series and pull the conference name out
    # of the N1 notes.
    filtered_results = []
    for entry in results:
        if entry.get('T2') != target_t2:
            continue
        match = conference_pattern.search(entry.get('N1', ''))
        if match:
            filtered_results.append({
                'T2': entry['T2'],
                'Conference_Name': match.group(1).strip()
            })

    return filtered_results

def extract_acronym(conference_name):
    """Return the acronym found in a conference name, or "" if none is found.

    The primary rule looks at the end of the name for a token made of
    uppercase letters, digits and hyphens, optionally followed by a
    four-digit year (``"..., ECCV 2024"`` -> ``"ECCV"``).  If that fails,
    the first run of two or more uppercase letters anywhere in the name
    (optionally hyphenated) is used.

    Args:
        conference_name: Conference name text to inspect.

    Returns:
        The acronym string, or an empty string when nothing qualifies.
    """
    # The lookahead demands at least one uppercase letter in the token, so
    # a bare year or a lone hyphen at the end of the name ("... 2024") is
    # not reported as the acronym and the fallback still gets a chance.
    match = re.search(
        r',?\s*((?=[0-9-]*[A-Z])[A-Z0-9-]+)(?:\s+\d{4})?$',
        conference_name,
    )
    if match:
        return match.group(1)
    # Fallback: if no trailing acronym, try to find a sequence of uppercase letters
    match = re.search(r'\b([A-Z]{2,}(?:-[A-Z]+)?)\b', conference_name)
    return match.group(1) if match else ""

def main():
    """Parse the hard-coded RIS export, print matching entries and their acronyms."""
    # Adjust the path as needed
    ris_path = R'D:\Programs\Codes\Skill-Up\learning-smorgasbord\arxiv\top\862_2025_05_07_不限定任何内容_scopus.ris'
    matches = parse_ris_file(ris_path)

    # Nothing to show: either no record matched or parsing failed.
    if not matches:
        print("No matching entries found or an error occurred.")
        return

    # Print the extracted entries
    print("Extracted Entries:")
    for number, record in enumerate(matches, start=1):
        print(f"\nEntry {number}:")
        print(f"T2: {record['T2']}")
        print(f"Conference Name: {record['Conference_Name']}")

    # Collect the distinct acronyms, leaving out names that yielded none.
    unique_acronyms = {
        extract_acronym(record['Conference_Name']) for record in matches
    }
    unique_acronyms -= {""}
    print("\nConference Acronyms:")
    for name in sorted(unique_acronyms):
        print(name)

# Run main() only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

Extracted Entries:

Entry 1:
T2: Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)
Conference Name: 18th European Conference on Computer Vision, ECCV 2024

Entry 2:
T2: Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)
Conference Name: 18th European Conference on Computer Vision, ECCV 2024

Entry 3:
T2: Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)
Conference Name: 18th European Conference on Computer Vision, ECCV 2024

Entry 4:
T2: Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)
Conference Name: 18th European Conference on Computer Vision, ECCV 2024

Entry 5:
T2: Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intellig