In [None]:
# Install required packages
!pip install xmltodict pyyaml beautifulsoup4

from abc import ABC, abstractmethod
from typing import Dict, Any, List
import xmltodict
import yaml
import json
import re
from bs4 import BeautifulSoup

class CustomDocumentLoader(ABC):
   """Abstract foundation shared by all custom document loaders.

   A concrete loader only has to implement ``_parse_content``; reading the
   file and packaging the result is handled by ``load``.
   """

   def __init__(self, file_path: str):
       # ``metadata`` starts empty; a subclass may fill it in while parsing.
       self.metadata = {}
       self.file_path = file_path

   @abstractmethod
   def _parse_content(self, content: str) -> Dict[str, Any]:
       """Turn the raw text of the document into structured data."""

   def load(self) -> Dict[str, Any]:
       """Read the file as UTF-8 text, parse it and return the result.

       Returns:
           A dict with two keys: ``'content'`` (whatever ``_parse_content``
           returned) and ``'metadata'`` (the loader's ``metadata`` attribute).
       """
       with open(self.file_path, mode='r', encoding='utf-8') as source:
           raw_text = source.read()

       # Parse first: parsing may update self.metadata, and the returned
       # dict must reflect that update.
       parsed = self._parse_content(raw_text)
       return dict(content=parsed, metadata=self.metadata)

class CustomXMLLoader(CustomDocumentLoader):
   """Handler for custom XML formats.

   The XML text is converted to nested dictionaries with ``xmltodict`` and
   reduced to a title plus a list of sections taken from the ``document``
   root element.
   """

   def _parse_content(self, content: str) -> Dict[str, Any]:
       """Parse XML content and extract structured data.

       Args:
           content: The XML document as text.

       Returns:
           The dict built by ``_process_xml_dict``.

       Side effects:
           When a ``metadata`` attribute is found, its value replaces
           ``self.metadata``.
       """
       # Parse XML to dictionary
       xml_dict = xmltodict.parse(content)

       # Extract metadata if available
       metadata = self._find_metadata(xml_dict)
       if metadata is not None:
           self.metadata = metadata

       return self._process_xml_dict(xml_dict)

   def _find_metadata(self, xml_dict: Dict) -> Any:
       """Return the value stored under ``'@metadata'``, or ``None`` if absent.

       xmltodict stores an element's attributes (prefixed with ``@``) inside
       that element's own dict, and the parsed result has the root element as
       its only top-level key. The attribute therefore has to be looked up
       inside the root element; the top level is still checked first so any
       dict that already carries the key there keeps working.
       """
       candidates = [xml_dict]
       candidates.extend(xml_dict.values())
       for candidate in candidates:
           if isinstance(candidate, dict) and '@metadata' in candidate:
               return candidate['@metadata']
       return None

   def _process_xml_dict(self, xml_dict: Dict) -> Dict[str, Any]:
       """Process XML dictionary and extract relevant information.

       Returns:
           An empty dict when there is no ``document`` key; otherwise a dict
           with ``'title'`` (``''`` when missing) and ``'sections'``.
       """
       result = {}

       # Handle common XML structures
       if 'document' not in xml_dict:
           return result

       doc = xml_dict['document']
       if not isinstance(doc, dict):
           # An empty or text-only <document> is not parsed into a dict
           # (xmltodict gives None or a plain string), so there are no child
           # elements to read; treat it as a document without title/sections.
           doc = {}

       result['title'] = doc.get('title', '')
       result['sections'] = self._extract_sections(doc)
       return result

   def _extract_sections(self, doc: Dict) -> List[Dict]:
       """Extract sections from XML document.

       A repeated ``<section>`` element arrives as a list and a single one as
       a lone value; both are normalised to a list.
       """
       sections = []

       if 'section' in doc:
           raw_sections = doc['section']
           if isinstance(raw_sections, list):
               sections.extend(raw_sections)
           else:
               sections.append(raw_sections)

       return sections

class DomainSpecificLoader(CustomDocumentLoader):
   """Loader that extracts fields from text using caller-supplied regex rules."""

   def __init__(self, file_path: str, domain_rules: Dict[str, Any]):
       # ``domain_rules`` maps a result key to the regex pattern used to find it.
       super().__init__(file_path)
       self.domain_rules = domain_rules

   def _parse_content(self, content: str) -> Dict[str, Any]:
       """Run every domain rule over ``content``.

       Returns:
           A dict keyed by rule name whose values are the ``re.findall``
           results for that rule's pattern.
       """
       return {
           name: re.findall(pattern, content)
           for name, pattern in self.domain_rules.items()
       }

class HybridDocumentLoader(CustomDocumentLoader):
   """Handler for documents with mixed formats.

   The document is split on fence lines (lines whose first non-blank
   characters are three backticks). A fence tagged ``xml`` or ``yaml`` opens
   a section of that type; any other fence switches back to plain text.
   """

   def _parse_content(self, content: str) -> Dict[str, Any]:
       """Parse content with multiple format sections.

       Args:
           content: Full text of the document.

       Returns:
           ``{'sections': [...]}`` where each entry has a ``'type'``
           (``'text'``, ``'xml'`` or ``'yaml'``) and the processed
           ``'content'`` for that section.
       """
       return {
           'sections': [
               {
                   'type': section['type'],
                   'content': self._process_section(section),
               }
               for section in self._split_sections(content)
           ]
       }

   def _split_sections(self, content: str) -> List[Dict[str, Any]]:
       """Split the text into raw sections delimited by fence lines."""
       sections = []
       current_section = {'type': 'text', 'content': []}
       fence_indent = 0

       for line in content.split('\n'):
           # Ignore leading whitespace when looking for a fence so that an
           # indented document is still split correctly.
           stripped = line.lstrip()

           if not stripped.startswith('```'):
               current_section['content'].append(
                   self._strip_indent(line, fence_indent)
               )
               continue

           # A fence always closes the running section; empty ones are dropped.
           if current_section['content']:
               sections.append(current_section)

           if stripped.startswith('```xml'):
               section_type = 'xml'
           elif stripped.startswith('```yaml'):
               section_type = 'yaml'
           else:
               section_type = 'text'

           # Fenced content is un-indented by the indentation of its opening
           # fence; plain text is kept exactly as written.
           if section_type == 'text':
               fence_indent = 0
           else:
               fence_indent = len(line) - len(stripped)
           current_section = {'type': section_type, 'content': []}

       if current_section['content']:
           sections.append(current_section)

       return sections

   @staticmethod
   def _strip_indent(line: str, width: int) -> str:
       """Remove at most ``width`` leading whitespace characters from ``line``."""
       return line[:width].lstrip() + line[width:]

   def _process_section(self, section: Dict[str, Any]) -> Any:
       """Convert one raw section into its parsed form according to its type."""
       text = '\n'.join(section['content'])
       if section['type'] == 'xml':
           return xmltodict.parse(text)
       if section['type'] == 'yaml':
           return yaml.safe_load(text)
       return text

def test_custom_loaders():
   """Test various custom document loaders.

   Writes ``test.xml``, ``test.domain`` and ``test.hybrid`` into the current
   working directory, loads each one with the matching loader and prints the
   result as indented JSON.

   The domain and hybrid samples are dedented before being written: the
   literals are indented to line up with this function body, and that
   indentation would otherwise end the ``TEST_RESULTS`` match after its first
   item and put the fence markers away from the start of their lines.
   """
   # Only this demo needs textwrap, so it is imported locally.
   import textwrap

   # Test XML loader
   xml_content = """<?xml version="1.0"?>
   <document>
       <title>Technical Specification</title>
       <section>
           <id>1</id>
           <heading>Overview</heading>
           <content>System specification details...</content>
       </section>
       <section>
           <id>2</id>
           <heading>Requirements</heading>
           <content>System requirements list...</content>
       </section>
   </document>
   """

   # Write as UTF-8 because the loaders read their files back as UTF-8.
   with open('test.xml', 'w', encoding='utf-8') as f:
       f.write(xml_content)

   xml_loader = CustomXMLLoader('test.xml')
   xml_result = xml_loader.load()
   print("XML Document Results:")
   print(json.dumps(xml_result, indent=2))

   # Test domain-specific loader
   domain_content = textwrap.dedent("""
   PATIENT_ID: 12345
   TEST_RESULTS:
   - CBC: Normal
   - Blood Pressure: 120/80
   NOTES: Patient showing good progress
   """)

   domain_rules = {
       'patient_id': r'PATIENT_ID:\s*(\d+)',
       'test_results': r'TEST_RESULTS:\s*((?:-.+\n?)+)',
       'notes': r'NOTES:\s*(.+)'
   }

   with open('test.domain', 'w', encoding='utf-8') as f:
       f.write(domain_content)

   domain_loader = DomainSpecificLoader('test.domain', domain_rules)
   domain_result = domain_loader.load()
   print("\nDomain-Specific Results:")
   print(json.dumps(domain_result, indent=2))

   # Test hybrid document
   hybrid_content = textwrap.dedent("""
   # Documentation
   Regular text section

   ```xml
   <data>
       <item>Value</item>
   </data>
   ```

   ```yaml
   config:
     setting1: value1
     setting2: value2
   ```
   """)

   with open('test.hybrid', 'w', encoding='utf-8') as f:
       f.write(hybrid_content)

   hybrid_loader = HybridDocumentLoader('test.hybrid')
   hybrid_result = hybrid_loader.load()
   print("\nHybrid Document Results:")
   print(json.dumps(hybrid_result, indent=2))

# Run the loader demo only when this file is executed directly, not on import.
if __name__ == "__main__":
   test_custom_loaders()