In [67]:
import re
import os
import pandas as pd
from typing import Dict, List, Pattern

# Regex source for each field pulled out of the RTF markup, keyed by field name
PATTERNS: Dict[str, str] = {
    # "Aged Care ID" label, then "AC" followed by eight digits (group 1)
    'ACN': r'Aged Care ID}{\\f3\\fs24\\cf1 : \s*(AC\d{8})',
    # Two whitespace-separated words after the \header ... \b\cf1 control words;
    # the identical expression is stored under both name keys
    'GivenName': r'\\header.*?\\b\\cf1 (\w+)\s+(\w+)\}',
    'FamilyName': r'\\header.*?\\b\\cf1 (\w+)\s+(\w+)\}',
    # dd/dd/dddd value that follows the "Date of Birth" label (group 1)
    'BirthDate': r'Date of Birth\}\{\\f3\\fs24\\cf1 : \}\{\\f0\\fs24\\cf1 \s*(\d{2}/\d{2}/\d{4})',
    # Text of every {\f0\fs22\cf1 ...} group preceded by this exact \pard prefix
    'other_data': r'\\pard\\intbl\\sl0\\lin108\\rin108 \{\\f0\\fs22\\cf1 ([^}]+)\}',
}

# Compiled counterparts of PATTERNS, built a single time at import/cell-run
COMPILED_PATTERNS: Dict[str, Pattern] = dict(
    (field, re.compile(expression)) for field, expression in PATTERNS.items()
)

def extract_matches(content: str, pattern: Pattern) -> List[str]:
    """Collect the capture groups of every match of ``pattern`` in ``content``.

    Matches are visited left to right and each match contributes all of its
    groups, in group order, to one flat list. A pattern without capture
    groups therefore yields an empty list, and a group that did not
    participate in a match contributes ``None``.
    """
    captured: List[str] = []
    for hit in pattern.finditer(content):
        captured.extend(hit.groups())
    return captured

def create_client_data(file_content: str) -> Dict[str, str]:
    """Build one client record from the text of a single RTF report.

    The ACN, name and birth date come from their own patterns; every other
    field is taken by position from the flat list of 'other_data' captures.

    Args:
        file_content: Full text of one RTF document.

    Returns:
        A dict with the keys 'ACN', 'GivenName', 'FamilyName', 'BirthDate'
        followed by the keys of ``other_fields``, in that order.

    Raises:
        IndexError: If a pattern produced too few captures for a field. The
            message names the field and the position that was missing, so a
            per-file error report says what was not found instead of only
            "list index out of range".
    """
    def _pick(matches: List[str], index: int, field: str) -> str:
        """Return ``matches[index]`` or raise an IndexError naming ``field``."""
        if index >= len(matches):
            raise IndexError(
                f"no value found for '{field}': needed capture at position "
                f"{index} but only {len(matches)} were extracted"
            )
        return matches[index]

    # Extract basic data using named patterns
    acn_matches = extract_matches(file_content, COMPILED_PATTERNS['ACN'])
    # The 'GivenName' pattern has two capture groups, so its flattened result
    # supplies both the given name (position 0) and the family name (position 1).
    name_matches = extract_matches(file_content, COMPILED_PATTERNS['GivenName'])
    dob_matches = extract_matches(file_content, COMPILED_PATTERNS['BirthDate'])
    other_matches = extract_matches(file_content, COMPILED_PATTERNS['other_data'])

    # Position of each remaining field within the 'other_data' captures.
    # NOTE(review): these indices presumably mirror the report's cell layout —
    # confirm against a sample document if the template changes.
    other_fields = {
        'Gender': 1,
        'Address': 4,
        'lives_with': 5,
        'accommodation_type': 6,
        'home_phone': 7,
        'mobile_phone': 8,
        'email': 9,
        'country_of_birth': 13,
        'preferred_language': 15,
        'aboriginal_or_torres_strait_islander_origin': 20,
        'dva_entitlement': 22
    }

    client_data = {
        'ACN': _pick(acn_matches, 0, 'ACN'),
        'GivenName': _pick(name_matches, 0, 'GivenName'),
        'FamilyName': _pick(name_matches, 1, 'FamilyName'),
        'BirthDate': _pick(dob_matches, 0, 'BirthDate'),
    }
    for field, idx in other_fields.items():
        client_data[field] = _pick(other_matches, idx, field)
    return client_data

def process_multiple_documents(directory_path: str, output_file: str) -> None:
    """Extract client data from every RTF file in a directory and save it to Excel.

    Files are selected by a case-insensitive ``.rtf`` extension and visited in
    sorted filename order, so the row order of the output is reproducible
    (``os.listdir`` alone returns entries in arbitrary order). A file that
    cannot be read or parsed is reported on stdout and skipped; the remaining
    files are still processed.

    Args:
        directory_path: Directory whose direct entries are scanned.
        output_file: Path of the Excel workbook to write.

    Returns:
        None. When at least one file was parsed, one row per file is written
        to sheet 'Client Data' of ``output_file``; otherwise nothing is
        written. A status line is printed in both cases.
    """
    client_data_list = []

    for filename in sorted(os.listdir(directory_path)):
        # Accept '.RTF', '.rtf' and mixed-case variants alike
        if not filename.lower().endswith('.rtf'):
            continue

        file_path = os.path.join(directory_path, filename)
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
            # Parse only after the handle has been released
            client_data_list.append(create_client_data(content))
        except Exception as e:
            # Deliberate best-effort: one bad file must not abort the batch
            print(f"Error processing {filename}: {e}")

    # Create DataFrame directly from list of dictionaries
    if client_data_list:
        df = pd.DataFrame(client_data_list)
        df.to_excel(output_file, index=False, sheet_name='Client Data')
        print(f"Data successfully saved to {output_file}")
    else:
        print("No data was processed. Check input directory and file contents.")

In [68]:
# Run the extraction over one folder of RTF reports and write the combined
# table to an Excel workbook.
# NOTE(review): directory_path is an absolute path on one specific machine —
# point it at your own reports folder before re-running.
# output_file is a relative path, so the workbook is written to the kernel's
# current working directory.
directory_path = "/Users/byron/Desktop/Caura/scraper/client_reports/da"
output_file = "extracted_client_data.xlsx"
process_multiple_documents(directory_path, output_file)

Data successfully saved to extracted_client_data.xlsx
