In [None]:
import json
import re
from collections import defaultdict
from datetime import datetime
from pathlib import Path
from typing import List, Dict

import ollama
import yaml

# Helper Functions
def sanitize_path_element(element: str) -> str:
    """Turn a title or folder name into a filesystem-friendly path component.

    Spaces become hyphens, every character that is not a word character,
    hyphen, underscore or dot is dropped, and leading/trailing hyphens are
    trimmed. The result may be an empty string when nothing survives.
    """
    hyphenated = element.replace(' ', '-')
    filtered = re.sub(r'[^\w\-_\.]', '', hyphenated)
    return filtered.strip('-')

def is_horizontal_rule(line: str) -> bool:
    """Check if a line is a Markdown horizontal rule.

    A rule is three or more of ``-``, ``*`` or ``_``. The markers may be
    separated by spaces or tabs (``- - -``, ``* * *``), and surrounding
    whitespace is ignored.

    Args:
        line: A single line of text (without requiring a trailing newline).

    Returns:
        True when the line consists solely of rule markers, False otherwise.
    """
    # Drop the optional spacing between markers so "- - -" is handled like "---".
    compact = re.sub(r'[ \t]+', '', line.strip())
    return re.fullmatch(r'[-*_]{3,}', compact) is not None

# Core Processing Functions
def generate_metadata(content: str, model: str) -> Dict:
    """Generate metadata for a piece of text using a local LLM via ollama.

    Only the first 2000 characters of ``content`` are sent to the model. The
    model is asked for JSON output, which is parsed into a dictionary.

    Args:
        content: The text to analyse (section title plus body).
        model: Name of the ollama model to query.

    Returns:
        The parsed JSON object, expected to carry the keys ``concepts``,
        ``tags`` and ``summary``. An empty dict is returned for blank input,
        when the request fails, or when the reply is not a JSON object.
    """
    if not content.strip():
        return {}

    # The key names are spelled out because the caller reads exactly
    # 'concepts', 'tags' and 'summary' from the result.
    prompt = f"""
        Analyze this text and extract key concepts following these rules:
        1. Identify primary concept (PascalCase)
        2. List 2-5 related concepts (PascalCase)
        3. Generate 1-3 tags (lowercase-with-dashes)
        4. Create a 1-sentence summary

        Respond with a single JSON object using exactly these keys:
        "concepts" (list of strings, primary concept first, then related concepts),
        "tags" (list of strings), "summary" (string).

        Text: {content[:2000]}
        """

    try:
        response = ollama.generate(
            model=model,
            prompt=prompt,
            format="json",
            options={"temperature": 0.2}
        )
        # The generated text is a JSON *string*; it must be decoded before use.
        parsed = json.loads(response["response"])
    except Exception as e:
        # Best effort: metadata is optional, so report the problem and carry on.
        # Exception (not a bare except) keeps Ctrl-C working during long runs.
        print(f"    ⚠️  Metadata generation failed: {e}")
        return {}

    return parsed if isinstance(parsed, dict) else {}

def generate_ai_content(title: str, concepts: List[str], folder_hierarchy: List[str], model: str) -> str:
    """Generate Markdown content for a section using a local LLM via ollama.

    Args:
        title: Section title the content should cover.
        concepts: Concepts extracted for the section; when non-empty they are
            appended to the prompt as additional guidance.
        folder_hierarchy: Folder names from the notes root down to the note,
            joined with " > " to give the model context.
        model: Name of the ollama model to query.

    Returns:
        The generated text prefixed with an "AI Generated Content" banner, or
        a fixed placeholder string when generation fails for any reason.
    """
    try:
        context_path = " > ".join(folder_hierarchy)
        prompt = f"""
        Generate comprehensive content for: {title}
        Context Hierarchy: {context_path}
        Include:
        - Core definitions
        - Practical applications
        - Relationships to parent concepts
        - Simple examples
        Use academic tone with Markdown sections
        """

        # Only extend the prompt when there is something to add, so the
        # prompt is unchanged for sections without extracted concepts.
        if concepts:
            prompt += f"\n        Related concepts to cover: {', '.join(map(str, concepts))}\n"

        response = ollama.generate(
            model=model,
            prompt=prompt,
            options={"temperature": 0.5}
        )
        return f"> **AI Generated Content**\n{response['response']}"
    except Exception as e:
        # Best effort: fall back to a placeholder, but say why. Catching
        # Exception rather than everything lets Ctrl-C abort the run.
        print(f"    ⚠️  AI content generation failed: {e}")
        return "> **AI Generation Failed** - Content placeholder"

def merge_moc_content(existing_content: str, new_moc: str) -> str:
    """Merge existing MOC note content with freshly generated MOC sections.

    Any previously generated "Subtopics" / "Notes" sections are removed from
    the existing content before ``new_moc`` is appended, so repeated runs do
    not accumulate duplicate sections. YAML frontmatter (a leading ``---``
    block) is kept at the top when present.

    Args:
        existing_content: Current text of the MOC file.
        new_moc: Newly generated "Subtopics" / "Notes" sections.

    Returns:
        The merged note text.
    """
    frontmatter = ""
    body = existing_content

    if existing_content.startswith('---'):
        # Look for the closing fence after the opening three dashes.
        frontmatter_end = existing_content.find('---', 3)
        if frontmatter_end != -1:
            frontmatter = existing_content[:frontmatter_end+3]
            body = existing_content[frontmatter_end+3:].lstrip()

    # Strip stale generated sections whether or not frontmatter exists; each
    # match runs lazily up to the next "##" or the end of the text.
    body = re.sub(r'##+ (Subtopics|Notes)[\s\S]*?(?=##|$)', '', body).strip()

    if frontmatter:
        return f"{frontmatter}\n{body}\n\n{new_moc}"
    return f"{body}\n\n{new_moc}".strip()

# Main Processing Pipeline
def process_note_with_metadata(note_path: Path, output_dir: Path, model: str, notes_root: Path):
    """Split one Markdown note into per-section notes with generated metadata.

    The note is split on level-2+ headings. Each section is written as its own
    file under ``output_dir/<sanitized folder hierarchy>/<note stem>/`` with
    YAML frontmatter, and a folder MOC is (re)built afterwards. Errors are
    reported via ``print`` and never propagate to the caller.

    Args:
        note_path: Path of the source note; must be located under ``notes_root``.
        output_dir: Root directory for the generated notes.
        model: Name of the ollama model used for metadata and content generation.
        notes_root: Root directory of the source notes, used to derive the
            relative folder hierarchy.
    """
    try:
        rel_path = note_path.relative_to(notes_root)
        print(f"\n\U0001F4C1 Processing: {rel_path}")

        with open(note_path, 'r', encoding='utf-8') as f:
            content = f.read()

        sections = re.split(r'\n##+ ', content)
        print(f"  \U0001F50D Found {len(sections)} sections")

        sanitized_hierarchy = [sanitize_path_element(p) for p in rel_path.parent.parts]
        note_stem = sanitize_path_element(note_path.stem)

        output_folder = output_dir.joinpath(*sanitized_hierarchy, note_stem)

        # A plain file already occupies the folder's path: use a timestamped
        # folder name instead so mkdir below cannot fail on it.
        if output_folder.exists() and output_folder.is_file():
            new_name = f"{note_stem}-{datetime.now().strftime('%Y%m%d%H%M%S')}"
            output_folder = output_dir.joinpath(*sanitized_hierarchy, new_name)
            print(f"  ⚠️  Renamed conflicting file to: {new_name}")

        output_folder.mkdir(parents=True, exist_ok=True)

        moc_entries = []
        for i, section in enumerate(sections, 1):
            if not section.strip():
                continue

            lines = section.split('\n')
            # The first chunk keeps its leading '#' characters (the split only
            # consumes headings preceded by a newline), so strip them here.
            original_title = lines[0].strip('#').strip()

            if is_horizontal_rule(original_title) or not original_title:
                print(f"  ⚠️  Skipping invalid section: {original_title}")
                continue

            body = '\n'.join(lines[1:])
            print(f"  \U0001F4DD Section {i}: {original_title}")

            try:
                metadata = generate_metadata(f"{original_title}\n\n{body}", model)
                concepts = metadata.get('concepts', [])
                # LLM output is not guaranteed to be a list; normalise it so
                # the slicing and joining below cannot fail.
                if not isinstance(concepts, list):
                    concepts = [concepts] if isinstance(concepts, str) else []
                ai_generated = False

                # NOTE(review): the body is replaced by AI text whenever it has
                # no heading line of its own, which discards the original
                # section text in that case — confirm this is intended.
                if not body.strip() or not re.search(r'^#+ ', body, flags=re.MULTILINE):
                    print("    \U0001F916 Generating AI content...")
                    body = generate_ai_content(original_title, concepts, sanitized_hierarchy, model)
                    ai_generated = True

                # A title made only of stripped characters (e.g. "???") would
                # otherwise produce a hidden ".md" file; fall back to the index.
                sanitized_name = sanitize_path_element(original_title) or f"section-{i}"
                output_path = output_folder / f"{sanitized_name}.md"
                counter = 1
                while output_path.exists():
                    output_path = output_folder / f"{sanitized_name}-{counter}.md"
                    counter += 1

                # One timestamp so 'created' and 'modified' agree exactly.
                timestamp = datetime.now().isoformat()
                frontmatter = {
                    'created': timestamp,
                    'modified': timestamp,
                    'source': f"[[{note_stem}]]",
                    'hierarchy': sanitized_hierarchy,
                    'tags': metadata.get('tags', []),
                    'summary': metadata.get('summary', ''),
                    'concepts': concepts,
                    'ai_generated': ai_generated
                }
                yaml_front = yaml.safe_dump(frontmatter, sort_keys=False, allow_unicode=True)

                note_content = f"---\n{yaml_front}---\n\n"
                note_content += f"# {original_title}\n\n"
                note_content += f"## Context Path\n{' > '.join(sanitized_hierarchy)}\n\n" if sanitized_hierarchy else ""
                note_content += "## Content\n" + body + "\n\n"
                # The first concept is skipped here; only the rest are linked.
                note_content += "## Related Concepts\n" + '\n'.join(f"[[{c}]]" for c in concepts[1:])

                output_path.write_text(note_content, encoding='utf-8')
                moc_entries.append(output_path)
                print(f"    \U0001F4BE Saved to: {output_path.relative_to(output_dir)}")

            except Exception as e:
                # A failing section must not abort the remaining sections.
                print(f"    ❌ Section processing failed: {str(e)}")
                continue

        create_folder_moc(output_folder, moc_entries)

    except Exception as e:
        print(f"❌ Failed to process {note_path.name}: {str(e)}")

def _moc_wikilink(target: str) -> str:
    """Return a wikilink to ``target``, aliased to a hyphen-free label if they differ."""
    label = target.replace('-', ' ')
    return f"[[{target}]]" if label == target else f"[[{target}|{label}]]"


def create_folder_moc(folder_path: Path, entries: List[Path]):
    """Create or update the "Map of Content" note of a folder.

    The MOC lists the folder's sub-directories under "Subtopics" and its
    Markdown files under "Notes", both in sorted order, followed by a link
    back to the parent folder's MOC. An existing MOC file is merged via
    ``merge_moc_content`` rather than overwritten from scratch.

    Links target the real (hyphenated) file names and use a ``|`` alias for
    the human-readable label, so they resolve to the files on disk.

    Args:
        folder_path: Folder whose MOC is written to ``<folder name> MOC.md``.
        entries: Paths of notes written for this folder. Not used: the listing
            is derived from the folder contents instead.
    """
    moc_name = f"{folder_path.name} MOC.md"
    moc_path = folder_path / moc_name

    existing_content = ""
    if moc_path.exists():
        try:
            existing_content = moc_path.read_text(encoding='utf-8')
            print(f"    🔄 Merging with existing MOC content")
        except Exception as e:
            print(f"    ⚠️  Error reading MOC: {str(e)}")

    # iterdir() yields entries in arbitrary order; sort once for stable output.
    children = sorted(folder_path.iterdir())

    new_moc = []
    subfolders = [f for f in children if f.is_dir()]
    if subfolders:
        new_moc.append("## Subtopics")
        new_moc.extend(f"- [[{f.name}/{f.name} MOC]]" for f in subfolders)

    notes = [f for f in children
             if f.is_file() and f != moc_path and f.suffix == '.md']
    if notes:
        new_moc.append("\n## Notes")
        new_moc.extend(f"- {_moc_wikilink(f.stem)}" for f in notes)

    final_content = f"# {folder_path.name} Map of Content\n\n" + '\n'.join(new_moc)
    if existing_content:
        final_content = merge_moc_content(existing_content, '\n'.join(new_moc))

    # Only a filesystem root is its own parent; every other folder gets a back link.
    if folder_path != folder_path.parent:
        parent_moc = folder_path.parent / f"{folder_path.parent.name} MOC.md"
        final_content += f"\n\n⤴️ Back to {_moc_wikilink(parent_moc.stem)}"

    try:
        moc_path.write_text(final_content, encoding='utf-8')
        print(f"    \U0001F4D1 Updated MOC at: {moc_path.relative_to(folder_path.parent)}")
    except Exception as e:
        print(f"    ❌ MOC creation failed: {str(e)}")

# Global Index and Main Execution
def create_global_indices(output_dir: Path):
    """Write ``_HIERARCHY.md``, an indented index of every MOC below ``output_dir``.

    Every file matching ``* MOC.md`` (at any depth, excluding names starting
    with ``_``) becomes one bullet, indented by two spaces per folder level
    below the first. Each bullet links to the MOC's actual file name and
    uses a ``|`` alias with hyphens replaced by spaces as the label, so the
    link resolves to the file on disk.

    Args:
        output_dir: Root directory of the generated notes; the index file is
            written directly into it.
    """
    print("\n📚 Building global indices...")

    bullets = []
    for path in sorted(output_dir.glob("**/* MOC.md")):
        if path.name.startswith('_'):
            continue

        relative_path = path.relative_to(output_dir)
        # A MOC directly in output_dir has no parent parts; clamp at zero.
        depth = max(len(relative_path.parent.parts) - 1, 0)
        indent = "  " * depth

        target = path.stem
        label = target.replace('-', ' ')
        link = f"[[{target}]]" if label == target else f"[[{target}|{label}]]"

        bullets.append(f"{indent}- {link}\n")

    index_content = "# Knowledge Hierarchy Index\n\n" + "".join(bullets)

    index_path = output_dir / "_HIERARCHY.md"
    index_path.write_text(index_content, encoding='utf-8')
    print(f"    📖 Global index created at: {index_path}")

if __name__ == "__main__":
    # Script entry point: process every note under NOTES_ROOT, then build the
    # global index. The paths and the model name are hard-coded for this vault.
    VAULT_ROOT = Path("/home/vikk/Documents/GitHub/College-Notes")
    NOTES_ROOT = VAULT_ROOT / "Notes"
    OUTPUT_DIR = VAULT_ROOT / "Structured_Notes"
    MODEL = "gemma3:12b-it-qat"

    print("\U0001F680 Starting note processing pipeline")
    print(f"\U0001F527 Using model: {MODEL}")
    print(f"\U0001F4C2 Input directory: {NOTES_ROOT}")
    print(f"\U0001F4C2 Output directory: {OUTPUT_DIR}")

    # Scan the tree once so the reported total and the loop below always
    # refer to the same set of notes. Files with "MOC" in the name are skipped.
    note_paths = [p for p in NOTES_ROOT.glob("**/*.md") if "MOC" not in p.name]
    total = len(note_paths)
    processed = 0

    for note_path in note_paths:
        try:
            process_note_with_metadata(note_path, OUTPUT_DIR, MODEL, NOTES_ROOT)
            processed += 1
            print(f"✅ Progress: {processed}/{total} ({processed/total:.1%})")
        except Exception as e:
            print(f"❌ Critical error processing {note_path}, skipping: {str(e)}")

    create_global_indices(OUTPUT_DIR)

    print(f"\n🎉 Processing complete! Success: {processed}/{total} notes")
    print(f"🌐 Index available at: {OUTPUT_DIR}/_HIERARCHY.md")


🚀 Starting note processing pipeline
🔧 Using model: gemma3:12b-it-qat
📂 Input directory: /home/vikk/Documents/GitHub/College-Notes/Notes
📂 Output directory: /home/vikk/Documents/GitHub/College-Notes/Structured_Notes

📁 Processing: Engineering Chemistry/Chemistry.md
  🔍 Found 97 sections
  ⚠️  Skipping invalid section: ___
  📝 Section 2: Greenhouse Effect and Global Warming


/tmp/ipykernel_43969/767119788.py:41: PydanticDeprecatedSince20: The `json` method is deprecated; use `model_dump_json` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  return response.json().get("response", {})


    🤖 Generating AI content...
    💾 Saved to: Engineering-Chemistry/Chemistry/Greenhouse-Effect-and-Global-Warming.md
  📝 Section 3: Scientific mechanisms behind the greenhouse effect
    🤖 Generating AI content...
    💾 Saved to: Engineering-Chemistry/Chemistry/Scientific-mechanisms-behind-the-greenhouse-effect.md
  📝 Section 4: Causes of increasing greenhouse gas concentrations
    🤖 Generating AI content...
    💾 Saved to: Engineering-Chemistry/Chemistry/Causes-of-increasing-greenhouse-gas-concentrations.md
  📝 Section 5: Current data and trends in global warming
    🤖 Generating AI content...
