**Microsoft Office Documents**

In [None]:
# Install required packages
!pip install python-docx openpyxl python-pptx pandas

from typing import List, Dict, Any
import docx
import openpyxl
from pptx import Presentation
import os
import pandas as pd

class OfficeDocumentLoader:
    """Handle various Microsoft Office document formats.

    NOTE(review): this cell defines only the constructor. The ``_load_word``,
    ``_load_excel`` and ``_load_powerpoint`` methods referenced in
    ``__init__`` are not defined in this class body, so instantiating this
    version raises ``AttributeError``. The following cell redefines the class
    with those methods included.
    """
    def __init__(self):
        # Map each supported file extension to the method that loads it.
        self.supported_formats = {
            '.docx': self._load_word,
            '.xlsx': self._load_excel,
            '.pptx': self._load_powerpoint
        }

In [None]:
# Install required packages
!pip install python-docx openpyxl python-pptx pandas

from typing import List, Dict, Any
import docx
import openpyxl
from pptx import Presentation
import os
import pandas as pd

class OfficeDocumentLoader:
    """Load Microsoft Office documents into plain Python structures.

    Supported formats are Word (``.docx``), Excel (``.xlsx``) and PowerPoint
    (``.pptx``). Every loader returns a dictionary with two keys:
    ``'content'`` (the extracted data) and ``'metadata'`` (summary
    information about the document).
    """

    def __init__(self):
        # Maps a lower-case file extension to the method that parses it.
        self.supported_formats = {
            '.docx': self._load_word,
            '.xlsx': self._load_excel,
            '.pptx': self._load_powerpoint,
        }

    def load(self, file_path: str) -> Dict[str, Any]:
        """Load any supported Office document and return its content and metadata.

        Args:
            file_path: Path to the document. The extension selects the
                loader and is matched case-insensitively.

        Returns:
            A dictionary with ``'content'`` and ``'metadata'`` keys, as
            produced by the format-specific loader.

        Raises:
            ValueError: If the file extension is not a supported format.
        """
        file_ext = os.path.splitext(file_path)[1].lower()
        if file_ext not in self.supported_formats:
            raise ValueError(f"Unsupported file format: {file_ext}")

        return self.supported_formats[file_ext](file_path)

    def _load_word(self, file_path: str) -> Dict[str, Any]:
        """Load and process a Word document (.docx).

        ``'content'`` lists every non-blank paragraph (text and style name)
        followed by every table (a list of rows, each a list of cell texts).
        Paragraphs come first and tables second, so the original
        interleaving of the two is not preserved.
        """
        doc = docx.Document(file_path)
        metadata = {
            'paragraphs': len(doc.paragraphs),
            'tables': len(doc.tables),
            'sections': len(doc.sections),
        }

        # Whitespace-only paragraphs are skipped (but still counted above).
        content = [
            {'type': 'paragraph', 'text': para.text, 'style': para.style.name}
            for para in doc.paragraphs
            if para.text.strip()
        ]
        content.extend(
            {
                'type': 'table',
                'data': [[cell.text for cell in row.cells] for row in table.rows],
            }
            for table in doc.tables
        )

        return {
            'content': content,
            'metadata': metadata,
        }

    def _load_excel(self, file_path: str) -> Dict[str, Any]:
        """Load and process an Excel workbook (.xlsx).

        ``'content'`` maps each worksheet title to its records and shape; the
        first row of a sheet is used as the column header. ``'metadata'``
        lists every sheet name in the workbook.
        """
        # data_only=True asks openpyxl for stored cell values instead of
        # formula text.
        workbook = openpyxl.load_workbook(file_path, data_only=True)
        metadata = {
            'sheets': workbook.sheetnames,
            'sheet_count': len(workbook.sheetnames),
        }

        content = {}
        # Iterate real worksheets only: ``sheetnames`` can also include chart
        # sheets, which have no cell rows to iterate.
        for sheet in workbook.worksheets:
            rows = list(sheet.iter_rows(values_only=True))
            if rows:
                content[sheet.title] = self._rows_to_sheet(rows)

        return {
            'content': content,
            'metadata': metadata,
        }

    @classmethod
    def _rows_to_sheet(cls, rows: List[Any]) -> Dict[str, Any]:
        """Convert a non-empty list of row tuples into records plus shape.

        The first row supplies the column labels; the remaining rows are the
        data. Returns ``{'data': <list of row dicts>, 'shape': (rows, cols)}``.
        """
        header, *body = rows
        frame = pd.DataFrame(body, columns=cls._unique_headers(header))
        return {
            'data': frame.to_dict('records'),
            'shape': frame.shape,
        }

    @staticmethod
    def _unique_headers(header) -> List[Any]:
        """Return the header labels with collisions renamed.

        A label that is already unique is returned unchanged. A repeated
        label gets a numeric suffix (``'A'`` -> ``'A.1'``, ``'A.2'``); a
        repeated blank (``None``) header becomes ``'Unnamed.1'`` and so on.
        Without this, row dictionaries built from the frame cannot hold one
        value per column, because dictionary keys must be unique.
        """
        used = set()
        names = []
        for label in header:
            stem = 'Unnamed' if label is None else label
            name = label
            attempt = 0
            # Keep bumping the suffix until the candidate is unused, which
            # also covers a sheet that already contains a literal 'A.1'.
            while name in used:
                attempt += 1
                name = f"{stem}.{attempt}"
            used.add(name)
            names.append(name)
        return names

    def _load_powerpoint(self, file_path: str) -> Dict[str, Any]:
        """Load and process a PowerPoint presentation (.pptx).

        ``'content'`` holds one entry per slide (numbered from 1) with the
        non-blank text of every shape that exposes a ``text`` attribute.
        """
        presentation = Presentation(file_path)
        metadata = {
            'slides': len(presentation.slides),
            'layouts': [layout.name for layout in presentation.slide_layouts],
        }

        content = []
        for slide_num, slide in enumerate(presentation.slides, 1):
            # Shapes without a ``text`` attribute are skipped.
            elements = [
                {'type': 'text', 'content': shape.text}
                for shape in slide.shapes
                if hasattr(shape, "text") and shape.text.strip()
            ]
            content.append({
                'slide_number': slide_num,
                'elements': elements,
            })

        return {
            'content': content,
            'metadata': metadata,
        }

def test_office_loader():
    """Create one sample file per supported format and print what the loader extracts."""

    # Word sample: a title, one paragraph and a 2x2 table with a header row.
    word_doc = docx.Document()
    word_doc.add_heading('Sample Document', 0)
    word_doc.add_paragraph('This is a test paragraph.')
    grid = word_doc.add_table(rows=2, cols=2)
    for column, caption in enumerate(('Header 1', 'Header 2')):
        grid.cell(0, column).text = caption
    word_doc.save('sample.docx')

    # Excel sample: a header row followed by a single data row.
    book = openpyxl.Workbook()
    sheet = book.active
    cells = (('A1', 'Name'), ('B1', 'Value'), ('A2', 'Test'), ('B2', 42))
    for address, value in cells:
        sheet[address] = value
    book.save('sample.xlsx')

    # PowerPoint sample: one slide built from the first layout.
    deck = Presentation()
    first_slide = deck.slides.add_slide(deck.slide_layouts[0])
    first_slide.shapes.title.text = 'Sample Slide'
    first_slide.placeholders[1].text = 'Sample content'
    deck.save('sample.pptx')

    loader = OfficeDocumentLoader()

    # One row per format: banner, file to load, content title, metadata title.
    reports = (
        ("Testing Word document loading:", 'sample.docx',
         "\nWord Content:", "\nWord Metadata:"),
        ("\nTesting Excel document loading:", 'sample.xlsx',
         "\nExcel Content:", "\nExcel Metadata:"),
        ("\nTesting PowerPoint document loading:", 'sample.pptx',
         "\nPowerPoint Content:", "\nPowerPoint Metadata:"),
    )
    for banner, path, content_title, metadata_title in reports:
        print(banner)
        loaded = loader.load(path)
        print(content_title)
        print(loaded['content'])
        print(metadata_title)
        print(loaded['metadata'])

# Run the Office loader demo only when this code is executed as the main
# module, not when it is imported.
if __name__ == "__main__":
   test_office_loader()

**Markdown Documents**

In [None]:
# Install required packages
!pip install mistune python-frontmatter

import mistune
import frontmatter
from typing import Dict, Any, List
import re

class MarkdownDocumentLoader:
    """Process Markdown documents while preserving structure and formatting.

    ``load`` splits off the front matter, parses the body into sections and
    extracts headers, fenced code blocks and lists. All structural scans
    share the same fence rule (a line starting with three backticks), so
    text inside a code block is never mistaken for a header or a list item.
    """

    # A line starting with this marker opens or closes a fenced code block.
    _FENCE = '```'
    # Header with a title: a run of '#', whitespace, then the text.
    _HEADER_RE = re.compile(r'^(#+)\s+(.+)$')
    # A run of '#' followed by whitespace or end of line. Used when
    # sectioning so that '#hashtag' or '#!/bin/sh' is not treated as a header.
    _HEADER_PREFIX_RE = re.compile(r'^(#+)(?:\s|$)')
    # Optional indent, a bullet ('-', '*', '+') or 'N.', whitespace, text.
    _LIST_ITEM_RE = re.compile(r'^(\s*)([-*+]|\d+\.)\s+(.+)$')

    def __init__(self):
        self.markdown_parser = mistune.create_markdown()

    def load(self, file_path: str) -> Dict[str, Any]:
        """Load and process a Markdown document.

        Args:
            file_path: Path to the markdown file (read as UTF-8).

        Returns:
            Dictionary with ``'content'`` (parsed sections), ``'metadata'``
            (front matter as a dict, empty when absent) and ``'structure'``
            (headers, code blocks and lists).
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            post = frontmatter.load(f)

        return {
            'content': self._parse_content(post.content),
            'metadata': dict(post.metadata) if post.metadata else {},
            'structure': self._analyze_structure(post.content)
        }

    def _parse_content(self, content: str) -> List[Dict]:
        """Parse markdown content into structured sections.

        Produces ``'text'``, ``'header'`` and ``'code'`` sections in document
        order. A header section starts with the header title and then
        collects the lines that follow it until the next header or fence.
        Text and header sections with no lines are dropped; a closed code
        block is always kept, even when empty.
        """
        sections = []
        current_section = {'type': 'text', 'content': []}
        in_code_block = False

        for line in content.split('\n'):
            # A fence line toggles code mode and is never stored itself.
            if line.startswith(self._FENCE):
                if in_code_block:
                    sections.append(current_section)
                    current_section = {'type': 'text', 'content': []}
                else:
                    if current_section['content']:
                        sections.append(current_section)
                    current_section = {
                        'type': 'code',
                        'language': line[len(self._FENCE):].strip(),
                        'content': []
                    }
                in_code_block = not in_code_block
                continue

            # Headers are only recognised outside code blocks.
            header = None if in_code_block else self._HEADER_PREFIX_RE.match(line)
            if header:
                if current_section['content']:
                    sections.append(current_section)
                level = len(header.group(1))
                current_section = {
                    'type': 'header',
                    'level': level,
                    'content': [line[level:].strip()]
                }
                continue

            current_section['content'].append(line)

        if current_section['content']:
            sections.append(current_section)

        return sections

    def _analyze_structure(self, content: str) -> Dict[str, List]:
        """Analyze document structure and extract headers, code blocks and lists."""
        return {
            'headers': self._find_headers(content),
            'code_blocks': self._find_code_blocks(content),
            'lists': self._find_lists(content)
        }

    @classmethod
    def _lines_outside_code(cls, content: str) -> List[str]:
        """Return the lines of *content* that are not part of a fenced code block.

        Fence lines themselves are excluded as well.
        """
        outside = []
        in_code_block = False
        for line in content.split('\n'):
            if line.startswith(cls._FENCE):
                in_code_block = not in_code_block
            elif not in_code_block:
                outside.append(line)
        return outside

    def _find_headers(self, content: str) -> List[Dict]:
        """Extract all headers with their levels, ignoring fenced code.

        Returns a list of ``{'level': int, 'text': str}`` in document order.
        """
        headers = []
        for line in self._lines_outside_code(content):
            match = self._HEADER_RE.match(line)
            if match:
                headers.append({
                    'level': len(match.group(1)),
                    'text': match.group(2).strip()
                })
        return headers

    def _find_code_blocks(self, content: str) -> List[Dict]:
        """Extract fenced code blocks and their languages.

        The language is whatever follows the opening fence on the same line
        (stripped), so names such as ``c++`` are supported. A block that is
        never closed is not reported.
        """
        blocks = []
        language = None  # None while outside a code block
        code_lines = []

        for line in content.split('\n'):
            if line.startswith(self._FENCE):
                if language is None:
                    language = line[len(self._FENCE):].strip()
                    code_lines = []
                else:
                    blocks.append({
                        'language': language,
                        'code': '\n'.join(code_lines).strip()
                    })
                    language = None
                continue
            if language is not None:
                code_lines.append(line)

        return blocks

    def _find_lists(self, content: str) -> List[Dict]:
        """Extract both ordered and unordered lists, ignoring fenced code.

        Returns one entry per list; each entry is a list of
        ``{'indent': int, 'type': 'ordered' | 'unordered', 'text': str}``
        items. A blank line ends the current list.
        """
        lists = []
        current_list = []

        for line in self._lines_outside_code(content):
            item = self._LIST_ITEM_RE.match(line)
            if item:
                indent, marker, text = item.groups()
                current_list.append({
                    'indent': len(indent),
                    # Numbered markers end in '.', bullets do not.
                    'type': 'ordered' if marker.endswith('.') else 'unordered',
                    'text': text
                })
            elif current_list and not line.strip():
                lists.append(current_list)
                current_list = []

        if current_list:
            lists.append(current_list)

        return lists

def test_markdown_loader():
    """Write a small Markdown file, load it, and print the extracted metadata and structure."""

    # Sample document: front matter, headers, a fenced code block and a nested list.
    sample_pieces = (
        "---\n",
        "title: Test Document\n",
        "author: John Doe\n",
        "---\n\n",
        "# Introduction\n",
        "This is a test markdown document.\n\n",
        "## Code Example\n",
        "Here's some Python code:\n",
        "```python\n",
        "def hello():\n",
        "    print('Hello, World!')\n",
        "```\n\n",
        "## Lists\n",
        "1. First item\n",
        "2. Second item\n",
        "   - Nested item\n",
        "   * Another item\n\n",
        "## Another Section\n",
        "Regular paragraph text.\n",
    )
    test_content = "".join(sample_pieces)

    # Persist the sample so the loader reads it from disk like a real file.
    with open('test.md', 'w', encoding='utf-8') as handle:
        handle.write(test_content)

    loader = MarkdownDocumentLoader()
    result = loader.load('test.md')

    print("Metadata:")
    print(result['metadata'])

    print("\nHeaders:")
    for heading in result['structure']['headers']:
        marks = '#' * heading['level']
        print(f"{marks} {heading['text']}")

    print("\nCode Blocks:")
    for snippet in result['structure']['code_blocks']:
        print(f"Language: {snippet['language']}")
        print(snippet['code'])

    print("\nLists:")
    for entries in result['structure']['lists']:
        for entry in entries:
            padding = '  ' * entry['indent']
            print(f"{padding}{entry['type']}: {entry['text']}")