# 📑 Legal Document Analyzer
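This notebook runs the full analysis pipeline on a legal PDF document: text extraction, document-type detection, segmentation, and per-segment citation extraction and summarization.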

In [None]:
# 📦 Install required packages
!pip install pdfplumber fpdf python-dotenv transformers torch openai google-generativeai ipywidgets

In [None]:
# 🔐 Load environment variables
import os
from dotenv import load_dotenv
load_dotenv()

# At least one of the two provider keys must be set to a non-empty value;
# otherwise stop here with an explicit error.
if not any(os.getenv(key_name) for key_name in ("GOOGLE_API_KEY", "OPENAI_API_KEY")):
    raise ValueError("Missing API keys. Set GOOGLE_API_KEY or OPENAI_API_KEY in .env or code block.")

In [None]:
# 🧠 Import pipeline components
from pipeline.config import Config
from pipeline.parser import PDFParser
from pipeline.type_detector import DocumentTypeDetector
from pipeline.segmenter import DocumentSegmenter
from pipeline.citation_extractor import CitationExtractor
from pipeline.summarizer import DocumentSummarizer
from pipeline.llm_client import LLMClient

In [None]:
# ⚙️ Initialize pipeline
# Single configuration object, passed to every component below that accepts one.
config = Config()
# The PDF parser is constructed without any configuration.
pdf_parser = PDFParser()
# One LLM client instance, reused by the segmenter, citation extractor and summarizer.
llm_client = LLMClient(config)
# The type detector receives only the config (no LLM client).
type_detector = DocumentTypeDetector(config)
segmenter = DocumentSegmenter(config, llm_client)
citation_extractor = CitationExtractor(llm_client)
summarizer = DocumentSummarizer(llm_client)

In [None]:
# 📤 Upload PDF (Google Colab)
from google.colab import files
from pathlib import Path

# Open the Colab upload dialog; the result maps each uploaded file name to its contents.
uploaded = files.upload()

# A cancelled dialog yields an empty mapping. Fail with a clear message
# instead of letting next() raise a bare StopIteration.
if not uploaded:
    raise ValueError("No file was uploaded. Re-run this cell and select a PDF document.")

# Only the first uploaded file is analyzed; tell the user if others are ignored.
file_name = next(iter(uploaded))
if len(uploaded) > 1:
    print(f"Multiple files uploaded; only '{file_name}' will be analyzed.")

pdf_path = Path(file_name)

In [None]:
# 🧪 Run full analysis
# Extract the document text, detect its type, then split it into segments.
text = pdf_parser.extract_text(str(pdf_path))
doc_type_info = type_detector.detect_type(text)
doc_type = doc_type_info['type']
segments = segmenter.segment_document(text, doc_type)

# One result dict per segment: label, content, summary, citations, plus any
# additional keys the segmenter attached to the segment.
results = []
for segment in segments:
    label = segment['label']
    content = segment['content']

    # Citation extraction and summarization are best-effort: a failure on one
    # segment falls back to a placeholder and the loop continues. Catch
    # Exception (not a bare except) so KeyboardInterrupt/SystemExit still
    # stop the cell, and report the error instead of hiding it.
    try:
        citations = citation_extractor.extract_citations(content)
    except Exception as exc:
        print(f"⚠️ Citation extraction failed for '{label}': {exc}")
        citations = {}
    try:
        summary = summarizer.summarize_text(content, context=f"{doc_type} - {label}")
    except Exception as exc:
        print(f"⚠️ Summarization failed for '{label}': {exc}")
        summary = "Summary could not be generated."

    # NOTE(review): the extra keys are spread last, so a segment key named
    # 'summary' or 'citations' would override the values computed above —
    # confirm this is intended.
    extra_fields = {k: v for k, v in segment.items() if k not in ('label', 'content')}
    results.append({
        'label': label,
        'content': content,
        'summary': summary,
        'citations': citations,
        **extra_fields,
    })

print(f"📚 Document Type: {doc_type} | Confidence: {doc_type_info['confidence']:.2f}")

In [None]:
# 📊 Display results
from IPython.display import Markdown, display

# Render each analyzed segment as Markdown: a numbered heading, its summary,
# and (when present) one bullet per non-empty citation category.
for index, entry in enumerate(results, start=1):
    display(Markdown(f"### {index}. {entry['label']}"))
    display(Markdown(f"**Summary:** {entry['summary']}"))

    citations_by_category = entry['citations']
    if not citations_by_category:
        continue

    display(Markdown("**Citations:**"))
    for category, cited in citations_by_category.items():
        if not cited:
            continue
        # Category keys such as "some_key" are shown as "Some Key".
        heading = category.replace('_', ' ').title()
        joined = ', '.join(cited)
        display(Markdown(f"- **{heading}**: {joined}"))