In [None]:
import re
from pathlib import Path

# Folder that is scanned for the DEF 14A markdown files (*.md).
# NOTE: machine-specific absolute Windows path; adjust when running elsewhere.
DATA_DIR = Path(r"C:\Users\baran.be\OneDrive - GEA\Projects\VSCode_Workspace\LLMs_TextStudy")

# Bold proposal header, e.g.
#   **Shareholder Proposal on Independent Board Chairman (Item 5 on Proxy Card)**
# The header may wrap over several lines (DOTALL), but the gap between the
# opening "**Shareholder Proposal" and the closing ")**" is length-bounded and
# may not run across another "**Shareholder Proposal" opener. Without these
# limits, a bold "Shareholder Proposal..." heading that has no item reference
# would start a match that swallows everything up to the next "(Item N ...)**".
_PROPOSAL_HEADER_RE = re.compile(
    r'\*\*Shareholder Proposal'
    r'(?:(?!\*\*Shareholder Proposal).){0,400}?'
    r'\(Item\s+\d+'
    r'(?:(?!\*\*Shareholder Proposal).){0,200}?'
    r'\)\*\*',
    re.IGNORECASE | re.DOTALL,
)

# Headings treated as the start of the section that follows the last proposal.
_SECTION_AFTER_PROPOSALS_RE = re.compile(
    "|".join([
        r'\*\*Approval Process',
        r'\*\*Additional Information',
        r'\*\*Other Matters',
        r'\*\*Date for Receipt',
        r'\*\*General\*\*',
        r'Annex I',
    ]),
    re.IGNORECASE,
)


def extract_shareholder_proposals(markdown_content):
    """
    Extracts shareholder proposal sections from a DEF 14A markdown string.

    A proposal starts at a bold header that begins with "Shareholder Proposal"
    and contains an "(Item <number> ...)" reference (matched
    case-insensitively). Each proposal runs up to the start of the next
    proposal header. The last proposal runs up to the first known follow-up
    section heading found after its header, or to the end of the text when no
    such heading is present.

    Args:
        markdown_content: The markdown text of one filing, as a str.

    Returns:
        A list of whitespace-stripped proposal texts in document order, each
        starting with its header. When no proposal header is found, a
        one-element list holding the message
        "No shareholder proposals found in this file." is returned instead.
    """
    headers = list(_PROPOSAL_HEADER_RE.finditer(markdown_content))
    if not headers:
        # Sentinel message kept as-is: existing callers print whatever is returned.
        return ["No shareholder proposals found in this file."]

    starts = [header.start() for header in headers]

    # Look for the closing section only *after* the last header, so a stop
    # phrase that happens to be part of the header text cannot truncate it.
    stop_match = _SECTION_AFTER_PROPOSALS_RE.search(
        markdown_content, headers[-1].end()
    )
    last_end = stop_match.start() if stop_match else len(markdown_content)

    # Every proposal except the last one ends where the next one begins.
    ends = starts[1:] + [last_end]

    return [
        markdown_content[start:end].strip()
        for start, end in zip(starts, ends)
    ]

# --- Example Usage ---

# To try a single filing instead of the whole folder, read it directly, e.g.:
#   content = Path('a2222821zdef14a.md').read_text(encoding='utf-8')
# `data_direct` is not referenced by the rest of this cell.
data_direct = Path("LLMs_TextStudy")

# Collect every markdown file in DATA_DIR, in sorted order.
md_files = sorted(DATA_DIR.glob("*.md"))

print(f"Found {len(md_files)} markdown files:")
for f in md_files:
    print(" -", f.name)


# Placeholder text mimicking the structure of the filings, for a quick check
# without touching the disk:
#
# mock_content = """
# ...
# **Shareholder Proposal on Genetically Modified Ingredients (Item 4 on Proxy Card)**
# Proponent's Statement: ... text ...
# Board of Directors' Statement in Opposition: ... text ...
#
# **Shareholder Proposal on Independent Board Chairman (Item 5 on Proxy Card)**
# Proponent's Statement: ... text ...
# Board of Directors' Statement in Opposition: ... text ...
#
# **Approval Process for Related Person Transactions**
# ...
# """
# extracted_proposals = extract_shareholder_proposals(mock_content)

# Run the extraction on every file and print a preview of each proposal.
for md_path in md_files:
    # Undecodable bytes are replaced rather than aborting the whole run.
    content = md_path.read_text(encoding="utf-8", errors="replace")
    extracted_proposals = extract_shareholder_proposals(content)

    # Blank line, a 200-character rule, then the file name.
    print("\n" + "=" * 200, md_path.name, sep="\n")

    for idx, prop in enumerate(extracted_proposals, start=1):
        # Only the first 400 characters of each proposal are shown.
        print(f"--- Proposal {idx} ---", prop[:400] + "...\n", sep="\n")

In [None]:
# Locate the data folder relative to this code instead of a fixed absolute path.
try:
    BASE_DIR = Path(__file__).resolve().parent
except NameError:
    # `__file__` is not defined when this runs as a notebook cell or in an
    # interactive session; fall back to the current working directory.
    BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR / "LLMs_TextStudy"

# Diagnostics: show where we are looking and whether the folder is present.
print("BASE_DIR:", BASE_DIR)
print("DATA_DIR:", DATA_DIR)
print("Exists?", DATA_DIR.exists())

# List the markdown files found directly inside DATA_DIR, in sorted order.
md_files = sorted(DATA_DIR.glob("*.md"))
print(f"Found {len(md_files)} markdown files:")
for f in md_files:
    print(" -", f.name)