# Compliance and Risk 

This notebook is **fully step-by-step**. It supports two input modes:
1) **Chat Mode:** enter a prompt → the agent responds → a confidence score is estimated → if it is below 60%, the request is automatically redirected to a human.  
2) **Document Mode:** provide a contract file path (PDF/DOCX/TXT) → extract → summarize → risk scan → Monte Carlo → chart + saved artifacts.

### 1. Load environment + print active config (Code)

In [None]:
import os, json
import re
from pathlib import Path

from dotenv import load_dotenv

# Anchor every project file on the notebook's working directory. Later cells
# build their output, example-contract and vector-store paths from PROJECT_DIR.
PROJECT_DIR = Path.cwd()
ENV_PATH = PROJECT_DIR / ".env"
AGENTS_PATH = PROJECT_DIR / "agents.yaml"
TASKS_PATH = PROJECT_DIR / "tasks.yaml"

# These are Path objects (not plain strings) so that .exists() is available.
print("Has .env? ", ENV_PATH.exists())
print("Has agents.yaml? ", AGENTS_PATH.exists())
print("Has tasks.yaml? ", TASKS_PATH.exists())

# Load .env values
load_dotenv(dotenv_path=ENV_PATH)

# LLM settings; each default applies when the variable is absent from the environment.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "ollama")
OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL", "http://localhost:11434/v1")
OPENAI_MODEL = os.getenv("OPENAI_MODEL", "llama3.1:8b-instruct")
# float()/int() raise ValueError if the environment holds a non-numeric value.
OPENAI_TEMPERATURE = float(os.getenv("OPENAI_TEMPERATURE", "0.2"))
OPENAI_MAX_TOKENS = int(os.getenv("OPENAI_MAX_TOKENS", "2048"))

# Retrieval settings (embedding model name and vector-store directory).
EMBED_MODEL = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
CHROMA_DIR = os.getenv("CHROMA_DIR", "./vectorstore")

# Echo the active configuration; the API key is deliberately not printed.
print("Active LLM:", OPENAI_MODEL)
print("Base URL  :", OPENAI_BASE_URL)
print("Temperature:", OPENAI_TEMPERATURE)
print("Max tokens :", OPENAI_MAX_TOKENS)
print("Embeddings :", EMBED_MODEL)
print("Chroma dir :", CHROMA_DIR)


### 2. Load agent and task definitions (YAML)

In [None]:
import yaml

# Parse the agent and task definitions. yaml.safe_load returns None for an
# empty file, so fall back to {} to keep the .keys()/.get() calls working.
with open(AGENTS_PATH, "r", encoding="utf-8") as f:
    agents_cfg = yaml.safe_load(f) or {}
with open(TASKS_PATH, "r", encoding="utf-8") as f:
    tasks_cfg = yaml.safe_load(f) or {}

print("Agents:", list(agents_cfg.keys()))
print("Tasks :", list(tasks_cfg.keys()))

# Show one entry of each kind; a missing entry prints as an empty mapping.
print("\n--- Example agent (compliance_checker) ---")
print(json.dumps(agents_cfg.get("compliance_checker", {}), indent=2))

print("\n--- Example task (risk_task) ---")
print(json.dumps(tasks_cfg.get("risk_task", {}), indent=2))


### 3. Initialize the LLM (LangChain + OpenAI-compatible)

In [None]:
# Uses Ollama via OpenAI-compatible REST (base_url in .env)
try:
    from langchain_openai import ChatOpenAI
except ImportError:
    # Fallback import path for when the langchain_openai package is missing.
    # Only ImportError is caught so unrelated failures are not masked.
    from langchain.chat_models import ChatOpenAI

# Shared chat client, configured from the values loaded from the environment.
llm = ChatOpenAI(
    base_url=OPENAI_BASE_URL,
    api_key=OPENAI_API_KEY,
    model=OPENAI_MODEL,
    temperature=OPENAI_TEMPERATURE,
    max_tokens=OPENAI_MAX_TOKENS,
)

def llm_complete(system_prompt: str, user_prompt: str) -> str:
    """Send a single system + user exchange to ``llm`` and return the reply text.

    Args:
        system_prompt: Content of the system message.
        user_prompt: Content of the user message.

    Returns:
        The response's ``content`` attribute when present, otherwise the
        string form of the response. If anything raises, a bracketed
        ``[LLM error: ...]`` string is returned instead of propagating.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    try:
        response = llm.invoke(messages)
        as_text = str(response)
        return getattr(response, "content", as_text)
    except Exception as exc:
        return f"[LLM error: {exc}]"


### 4. Utilities: text extraction, confidence scoring, risk model, Monte Carlo

In [None]:
import math, random
from PyPDF2 import PdfReader
import docx2txt
import matplotlib.pyplot as plt
from pathlib import Path

# Phrases treated as signs of uncertainty. "maybe" is listed explicitly because
# whole-word matching no longer catches it through "may".
_HEDGE_TERMS = (
    "might", "may", "maybe", "possibly", "unclear",
    "not sure", "cannot determine", "ambiguous",
)
# Phrases treated as signs of a firm statement.
_CERTAINTY_TERMS = (
    "must", "shall", "clearly", "definitely",
    "is required", "complies", "non-compliant",
)


def _count_terms_present(lowered: str, terms) -> int:
    """Return how many of *terms* occur in *lowered* as whole words/phrases."""
    return sum(
        1
        for term in terms
        # Lookarounds instead of \b so terms containing "-" or spaces still work.
        if re.search(rf"(?<!\w){re.escape(term)}(?!\w)", lowered)
    )


def estimate_confidence(text: str) -> float:
    """Heuristic confidence 0..1 based on hedging vs certainty keywords.

    Starts at 0.5, subtracts 0.05 for each hedge term present and adds 0.05
    for each certainty term present. Each term counts once, however often it
    appears. Terms match only as whole words, so "may" does not fire on
    "mayor" and "shall" does not fire on "shallow".

    Args:
        text: Text to score; matching is case-insensitive.

    Returns:
        A score clamped to [0.0, 1.0]; 0.0 for empty or missing text.
    """
    if not text:
        return 0.0
    lowered = text.lower()
    net = _count_terms_present(lowered, _CERTAINTY_TERMS) - _count_terms_present(
        lowered, _HEDGE_TERMS
    )
    # Work in integer percentage points so repeated +/-0.05 float steps cannot
    # drift just below a threshold such as 0.60.
    percent = 50 + 5 * net
    return max(0, min(100, percent)) / 100

def extract_text_from_file(path: str) -> str:
    """Return the text of a PDF, DOCX, TXT or MD file, chosen by extension.

    Failures are not raised: a bracketed message such as
    ``[PDF parse error: ...]`` is returned in place of the text, and any
    other extension yields the ``[Unsupported file type...]`` message.
    """
    suffix = Path(path).suffix.lower()

    if suffix == ".pdf":
        try:
            pdf = PdfReader(path)
            # Pages without extractable text contribute an empty string.
            return "\n".join((page.extract_text() or "") for page in pdf.pages)
        except Exception as exc:
            return f"[PDF parse error: {exc}]"

    if suffix == ".docx":
        try:
            extracted = docx2txt.process(path)
            return extracted or ""
        except Exception as exc:
            return f"[DOCX parse error: {exc}]"

    if suffix in (".txt", ".md"):
        try:
            # Undecodable bytes are dropped rather than raising.
            return Path(path).read_text(encoding="utf-8", errors="ignore")
        except Exception as exc:
            return f"[TXT read error: {exc}]"

    return "[Unsupported file type. Use PDF, DOCX, or TXT.]"

# Lightweight keyword-based risk model (extend with your domain packs).
# Keys must be lowercase because the scanned text is lowercased before matching.
RISK_KEYWORDS = {
    "liability": 2.0, "indemnify": 2.5, "indemnification": 2.5, "penalty": 1.5,
    "termination": 1.8, "default": 2.2, "breach": 2.0, "confidentiality": 1.2,
    "data protection": 2.0, "gdpr": 2.0, "ccpa": 2.0, "hipaa": 2.0,
    "warranty": 1.2, "limitation of liability": 2.3, "force majeure": 1.0,
    "governing law": 0.8, "arbitration": 1.0
}


def keyword_risk_scan(text: str):
    """Return (hits, total_weighted, normalized_risk_0to1).

    Counts case-insensitive occurrences of each ``RISK_KEYWORDS`` entry. A
    stretch of text is credited to one keyword only, and longer keywords win:
    "limitation of liability" counts once under that key and is not counted
    again under "liability".

    Args:
        text: Text to scan. Empty or missing text yields ``({}, 0.0, 0.0)``.

    Returns:
        hits: ``{keyword: {"count", "weight", "weighted"}}`` for keywords
            found, in ``RISK_KEYWORDS`` order.
        total_weighted: Sum of ``count * weight`` over all hits.
        normalized_risk_0to1: ``1 - exp(-0.1 * total_weighted)``.
    """
    if not text:
        return {}, 0.0, 0.0
    lowered = text.lower()

    # The pattern is rebuilt per call so keywords added at runtime are honoured.
    # Longest alternatives come first so the most specific phrase is chosen.
    by_length = sorted(RISK_KEYWORDS, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(k) for k in by_length))
    counts = {}
    for match in pattern.finditer(lowered):
        found = match.group(0)
        counts[found] = counts.get(found, 0) + 1

    hits, total = {}, 0.0
    for keyword, weight in RISK_KEYWORDS.items():
        count = counts.get(keyword, 0)
        if count > 0:
            weighted = count * weight
            hits[keyword] = {"count": count, "weight": weight, "weighted": weighted}
            total += weighted

    # Saturating map from an unbounded weight total onto [0, 1).
    normalized = 1 - math.exp(-0.1 * total)
    return hits, total, normalized

def monte_carlo_risk(base_risk: float, n: int = 2000, seed: int = 42):
    """Simulate ``n`` incident/no-incident trials around ``base_risk``.

    Each trial draws a probability from a Gaussian centred on ``base_risk``
    (sigma 0.1), clamps it to [0, 1], then samples one Bernoulli outcome.
    The clamping pulls the effective mean away from ``base_risk`` when it is
    close to 0 or 1.

    Args:
        base_risk: Centre of the per-trial incident probability.
        n: Number of trials; must be positive.
        seed: Seed for the private random generator. The default of 42
            reproduces the previously hard-coded behaviour.

    Returns:
        ``(success_rate, incident_rate, incidents)`` where ``incidents`` is
        the list of 0/1 outcomes and ``success_rate == 1 - incident_rate``.

    Raises:
        ValueError: If ``n`` is not positive (there is no rate to compute).
    """
    if n <= 0:
        raise ValueError(f"n must be a positive integer, got {n!r}")
    rng = random.Random(seed)
    incidents = []
    for _ in range(n):
        # Draw the probability before the outcome: the order of the two
        # generator calls determines the reproducible sequence.
        p = max(0.0, min(1.0, rng.gauss(base_risk, 0.1)))
        incidents.append(1 if rng.random() < p else 0)
    incident_rate = sum(incidents) / n
    return 1 - incident_rate, incident_rate, incidents


### 5. Chat Mode: define the system prompt & chat function

In [None]:
# Read the compliance-checker entry once; generic wording fills in any field
# the YAML does not provide.
_checker_cfg = agents_cfg.get('compliance_checker', {})
_checker_role = _checker_cfg.get('role', 'a compliance agent')
_checker_goal = _checker_cfg.get('goal', 'Ensure clause compliance and point out risks.')

# System prompt used for every Chat Mode request.
CHAT_SYSTEM = f"""
You are {_checker_role}.
Goal: {_checker_goal}
Be precise, reference frameworks at a high level (GDPR/CCPA/HIPAA/Labor), and propose practical amendments.
"""

def run_chat_mode(user_prompt: str, confidence_threshold: float = 0.60):
    """Answer one prompt and flag it for human review when confidence is low.

    The prompt is sent with ``CHAT_SYSTEM`` through ``llm_complete`` and the
    reply is scored by ``estimate_confidence``. If the score is below
    ``confidence_threshold``, the reply is stripped and a redirect notice is
    appended.

    Returns:
        A dict with keys ``reply``, ``confidence`` and ``routed_to_human``.
    """
    answer = llm_complete(CHAT_SYSTEM, user_prompt)
    score = estimate_confidence(answer)
    needs_human = score < confidence_threshold
    if needs_human:
        notice = f"\n\n— Confidence {score*100:.0f}% < {confidence_threshold*100:.0f}% threshold. Redirecting to human agent..."
        answer = answer.strip() + notice
    return {"reply": answer, "confidence": score, "routed_to_human": needs_human}

# Prompt sent to the agent in Chat Mode. Edit it, then run the next cell to execute.
chat_prompt = "Is this data processing clause compliant with GDPR cross-border transfer rules?"


### 6. Run Chat Mode

In [None]:
# Run Chat Mode on the prompt defined above and report the outcome.
result = run_chat_mode(chat_prompt, confidence_threshold=0.60)
confidence_pct = f"{result['confidence']*100:.1f}%"
print("Reply:\n", result["reply"])
print("\nConfidence:", confidence_pct)
print("Redirected to human?", result["routed_to_human"])


### 7. Document Mode: function to analyze a contract file

In [None]:
import json, csv

# Role labels come from the YAML config; generic labels fill in when absent.
_compliance_role = agents_cfg.get('compliance_checker', {}).get('role', 'Compliance Checker')
_risk_role = agents_cfg.get('risk_assessor', {}).get('role', 'Risk Assessor')

# System prompt used for Document Mode summaries.
DOC_SYSTEM = f"""
You are a contract analysis specialist combining roles of:
- {_compliance_role}
- {_risk_role}
Produce a concise summary, list key obligations, flag potential non-compliance, and suggest concrete clause updates.
"""

# Directory that receives the JSON/CSV artifacts; created if it does not exist.
OUTPUT_DIR = PROJECT_DIR / "outputs"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Prefixes of the bracketed failure messages that extract_text_from_file
# returns instead of raising.
_EXTRACTION_ERROR_PREFIXES = (
    "[PDF parse error:",
    "[DOCX parse error:",
    "[TXT read error:",
    "[Unsupported",
)


def analyze_document(file_path: str):
    """Run the Document Mode pipeline on one contract file.

    Steps: extract text, summarize with the LLM, run the keyword risk scan,
    run the Monte Carlo simulation, score up to 50 paragraphs individually,
    write JSON and CSV artifacts to ``OUTPUT_DIR``, and plot the outcomes.

    Args:
        file_path: Path to a PDF, DOCX, TXT or MD file.

    Returns:
        On success, a dict with keys ``file``, ``summary``,
        ``risk_keyword_hits``, ``total_risk_weight``,
        ``normalized_risk_0to1``, ``monte_carlo`` and ``clause_scores``.
        On failure, ``{"error": message}``; nothing is written or plotted.
    """
    # 1) Extract
    text = extract_text_from_file(file_path)
    # Match the exact failure prefixes. A looser "starts with '[' and contains
    # 'error'" test would reject real contracts such as
    # "[DRAFT] ... errors and omissions ...".
    if text.startswith(_EXTRACTION_ERROR_PREFIXES):
        return {"error": text}
    # Nothing to analyze (e.g. no extractable text), so skip the LLM call.
    if not text.strip():
        return {"error": "[No extractable text found in file.]"}

    # 2) Summarize with LLM (only the first 12000 characters are sent)
    user_prompt = (
        "Summarize the following contract text. Provide:\n"
        "1) 5-8 bullet key points\n"
        "2) Notable obligations and data handling\n"
        "3) Potential compliance gaps\n"
        "4) Suggested clause updates (short, actionable)\n\n"
        f"---\n{text[:12000]}"
    )
    summary = llm_complete(DOC_SYSTEM, user_prompt)

    # 3) Risk scan over the full text + 4) Monte Carlo simulation
    hits, total_w, norm_risk = keyword_risk_scan(text)
    success_rate, incident_rate, incidents = monte_carlo_risk(norm_risk, n=2000)

    # 5) Clause-level proxy scores: blank-line-separated paragraphs, first 50 only
    paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
    clause_scores = []
    for clause_id, paragraph in enumerate(paragraphs[:50], start=1):
        _, weight, score = keyword_risk_scan(paragraph)
        clause_scores.append(
            {"clause_id": clause_id, "risk_weight": weight, "risk_score": score}
        )

    # 6) Save artifacts (JSON + CSV), named after the input file's stem
    base = Path(file_path).stem
    json_path = OUTPUT_DIR / f"{base}_analysis.json"
    csv_path = OUTPUT_DIR / f"{base}_clause_risks.csv"

    out = {
        "file": file_path,
        "summary": summary,
        "risk_keyword_hits": hits,
        "total_risk_weight": total_w,
        "normalized_risk_0to1": norm_risk,
        "monte_carlo": {
            "success_rate": success_rate,
            "incident_rate": incident_rate,
            "trials": len(incidents)
        },
        "clause_scores": clause_scores,
    }
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(out, f, indent=2)
    with open(csv_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=["clause_id", "risk_weight", "risk_score"])
        writer.writeheader()
        writer.writerows(clause_scores)

    # 7) Visualize simulation outcomes (two bins: outcome 0 and outcome 1)
    plt.figure()
    plt.hist(incidents, bins=2)
    plt.title("Monte Carlo Incident Outcomes (0=no incident, 1=incident)")
    plt.xlabel("Outcome")
    plt.ylabel("Frequency")
    plt.show()

    print("Saved:", json_path)
    print("Saved:", csv_path)
    return out


### 8. Run Document Mode

In [None]:
# Point this at your contract file (PDF/DOCX/TXT). For the demo, a tiny TXT
# contract is written on first run and reused afterwards.
example_doc_path = PROJECT_DIR / "example_contract.txt"
if not example_doc_path.exists():
    demo_lines = [
        "This Agreement includes a limitation of liability clause.",
        "Either party may terminate for breach upon written notice.",
        "Data Protection: Controller shall comply with GDPR and CCPA.",
        "Indemnification applies in case of third-party claims arising from default.",
    ]
    example_doc_path.write_text("\n".join(demo_lines), encoding="utf-8")

out = analyze_document(str(example_doc_path))
if "error" not in out:
    # Only the first 1500 characters of the summary are shown.
    print("\n=== Summary (LLM) ===\n", out["summary"][:1500])
    print("\nTotal Risk Weight:", out["total_risk_weight"])
    print("Normalized Risk (0..1):", f"{out['normalized_risk_0to1']:.3f}")
    print("Incident Rate (Monte Carlo):", f"{out['monte_carlo']['incident_rate']:.3f}")
else:
    print(out["error"])


### 9. Inspect loaded YAML

In [None]:
# Dump both loaded YAML configs as indented JSON under a banner each.
for banner, loaded_cfg in (
    ("AGENTS\n======", agents_cfg),
    ("\nTASKS\n=====", tasks_cfg),
):
    print(banner)
    print(json.dumps(loaded_cfg, indent=2))


### 10. RAG setup: embeddings + Chroma client

In [None]:
from pathlib import Path
import chromadb
from chromadb.utils import embedding_functions

# Where to persist the vector DB: CHROMA_DIR from .env, resolved against the
# project directory. The default "./vectorstore" gives PROJECT_DIR/vectorstore;
# an absolute CHROMA_DIR is used as-is.
CHROMA_PATH = Path(PROJECT_DIR) / CHROMA_DIR
CHROMA_PATH.mkdir(parents=True, exist_ok=True)

# Initialize Chroma client (persistent)
chroma_client = chromadb.PersistentClient(path=str(CHROMA_PATH))

# Embedding function (Sentence-Transformers)
# Will download the model on first use; cached afterward.
EMBED_MODEL_NAME = EMBED_MODEL  # from .env (e.g., all-MiniLM-L6-v2)
st_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=EMBED_MODEL_NAME)

# Create or get a collection
COLLECTION_NAME = "compliance_docs"
collection = chroma_client.get_or_create_collection(name=COLLECTION_NAME, embedding_function=st_ef)

print("Vector store path:", CHROMA_PATH)
print("Collection:", COLLECTION_NAME)
print("Embedding model:", EMBED_MODEL_NAME)
