In [None]:

# ============================================================================
# INSTALLATION AND SETUP
# ============================================================================

# Install required packages
import subprocess
import sys

def install_packages():
    """Install or upgrade every third-party package this script imports.

    Each requirement gets its own quiet ``pip install --upgrade`` run using the
    current interpreter, so one failing package does not stop the others. A
    success or failure line is printed per package; nothing is returned.
    """
    requirements = (
        'gradio>=4.46.0',
        'python-docx',
        'PyPDF2',
        'unidecode',
        'openai',
        'sentence-transformers',
        'faiss-cpu',
        'pandas',
        'numpy',
        'transformers',
        'torch',
        'datasets',
    )
    # Shared command prefix; the requirement specifier is appended per run.
    pip_command = [sys.executable, "-m", "pip", "install", "-q", "--upgrade"]

    for requirement in requirements:
        try:
            # check=True turns a non-zero pip exit status into CalledProcessError;
            # capture_output=True keeps pip's own output off the console.
            subprocess.run(pip_command + [requirement], check=True, capture_output=True)
        except subprocess.CalledProcessError as e:
            print(f"✗ Failed to install {requirement}: {e}")
        else:
            print(f"✓ Installed {requirement}")

# Uncomment the line below to install packages
# install_packages()

# ============================================================================
# IMPORTS
# ============================================================================

import os
import io
import re
import json
import tempfile
import shutil
import warnings
from typing import List, Dict, Any, Tuple, Optional
from datetime import datetime
import pandas as pd
import numpy as np

# Document processing
from docx import Document
from docx.enum.text import WD_COLOR_INDEX
from docx.shared import Pt, RGBColor
from docx.oxml.shared import qn
from PyPDF2 import PdfReader
from unidecode import unidecode

# ML and NLP
import torch
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import faiss

# UI
import gradio as gr

# Suppress warnings
warnings.filterwarnings('ignore')

# ============================================================================
# CONFIGURATION AND CONSTANTS
# ============================================================================

# ADGM reference file paths (update these paths according to your setup).
# Maps a short reference name to the location of a PDF/DOCX reference document.
# ADGMKnowledgeBase.load_reference_documents() iterates this mapping, warns about
# and skips any path that does not exist, and stores the key as the 'source' of
# every chunk taken from that file.
# NOTE(review): the '/content/' prefix looks like a hosted-notebook working
# directory — confirm for your environment.
REFERENCE_FILES = {
    'checklist_1': '/content/ADGM CHECKLIST 2.pdf',
    'checklist_2': '/content/ADGM checklist.pdf',
    'employment_2019': '/content/ADGM Standard Employment Contract - ER 2019 - Short Version (May 2024).docx',
    'employment_2024': '/content/ADGM Standard Employment Contract Template - ER 2024 (Feb 2025).docx',
    'data_sources': '/content/Data Sources.docx',
    'data_protection': '/content/OFFICE OF DATA PROTECTION.pdf',
    'resolution_template': '/content/adgm-ra-resolution-multiple-incorporate-shareholders-LTD-incorporation-v2.docx'
}

# Enhanced field mapping for employment contracts.
# Keyed by the field name shown in reports. For each field:
#   "variations" - phrases that count as evidence the field is present;
#                  analyze_employment_contract() normalizes each phrase and tests
#                  it as a plain substring of the normalized contract text, so
#                  short entries such as "pay", "leave" or "role" also match
#                  inside longer words and unrelated phrases.
#   "required"   - when True, a missing field is reported as a "High" issue and
#                  counts against the compliance score; when False, absence is
#                  not reported.
#   "adgm_ref"   - citation copied verbatim into the issue's suggestion and
#                  reference.
# NOTE(review): the section numbers are not verifiable from this file — confirm
# them against the regulation text.
ENHANCED_FIELD_MAP = {
    "Employer name": {
        "variations": ["employer", "company name", "entity name", "organization"],
        "required": True,
        "adgm_ref": "ER 2024, Section 3.1"
    },
    "Employee name": {
        "variations": ["employee name", "individual", "worker", "staff member"],
        "required": True,
        "adgm_ref": "ER 2024, Section 3.2"
    },
    "Date of commencement": {
        "variations": ["start date", "joining date", "commencement date", "employment start"],
        "required": True,
        "adgm_ref": "ER 2024, Section 4.1"
    },
    "Job title": {
        "variations": ["job title", "designation", "position", "role"],
        "required": True,
        "adgm_ref": "ER 2024, Section 4.2"
    },
    "Wages": {
        "variations": ["salary", "remuneration", "compensation", "wages", "pay"],
        "required": True,
        "adgm_ref": "ER 2024, Section 5.1"
    },
    "Pay period": {
        "variations": ["pay period", "monthly", "payment frequency", "salary period"],
        "required": True,
        "adgm_ref": "ER 2024, Section 5.2"
    },
    "Hours or days of work": {
        "variations": ["working hours", "work time", "hours of work", "working days"],
        "required": True,
        "adgm_ref": "ER 2024, Section 6.1"
    },
    "Vacation leave": {
        "variations": ["annual leave", "holiday entitlement", "vacation", "leave"],
        "required": True,
        "adgm_ref": "ER 2024, Section 7.1"
    },
    "Sick leave": {
        "variations": ["sick leave", "medical leave", "illness leave"],
        "required": True,
        "adgm_ref": "ER 2024, Section 7.2"
    },
    "Notice period": {
        "variations": ["notice period", "termination notice", "resignation notice"],
        "required": True,
        "adgm_ref": "ER 2024, Section 8.1"
    },
    "Term (if fixed-term)": {
        "variations": ["contract term", "duration", "fixed term", "contract period"],
        "required": False,
        "adgm_ref": "ER 2024, Section 4.3"
    },
    "Place of work or Remote employee": {
        "variations": ["place of work", "remote employee", "work location", "workplace"],
        "required": True,
        "adgm_ref": "ER 2024, Section 4.4"
    },
    "Disciplinary/Grievance reference": {
        "variations": ["grievance procedure", "disciplinary", "disciplinary procedure"],
        "required": True,
        "adgm_ref": "ER 2024, Section 9.1"
    }
}

# Document type detection patterns.
# Keyed by the type identifier that detect_document_type() returns. For each type:
#   "keywords" - phrases searched for (as normalized substrings) in both the
#                document text and the filename.
#   "weight"   - score added per keyword found in the text; a keyword found in
#                the filename adds half of this weight.
# The type with the highest total score wins; "unknown" is returned when nothing
# matches. Because matching is by substring, short keywords ("apd", "aoa", "moa")
# can also hit inside longer words.
DOCUMENT_TYPES = {
    "employment_contract": {
        "keywords": ["employment contract", "er 2019", "er 2024", "employment regulations",
                    "terms of employment", "contract of employment"],
        "weight": 1.0
    },
    "apd": {
        "keywords": ["appropriate policy document", "apd", "data protection regulations",
                    "gdpr", "data protection policy"],
        "weight": 1.0
    },
    "resolution": {
        "keywords": ["resolution", "authorised signatory", "adoption of articles",
                    "board resolution", "shareholder resolution"],
        "weight": 1.0
    },
    "articles_of_association": {
        "keywords": ["articles of association", "aoa", "company articles"],
        "weight": 1.0
    },
    "memorandum_of_association": {
        "keywords": ["memorandum of association", "moa", "company memorandum"],
        "weight": 1.0
    },
    "branch_registration": {
        "keywords": ["branch registration", "branch in adgm", "branch application"],
        "weight": 1.0
    },
    "business_plan": {
        "keywords": ["business plan", "financial projections", "business model"],
        "weight": 1.0
    },
    "incorporation_form": {
        "keywords": ["incorporation application", "company registration form",
                    "incorporation form"],
        "weight": 1.0
    },
    "ubo_declaration": {
        "keywords": ["ubo declaration", "ultimate beneficial owner", "beneficial ownership"],
        "weight": 1.0
    }
}

# Process checklists based on ADGM requirements.
# Keyed by process name; these keys are the values guess_legal_process() returns
# and the values check_process_completeness() accepts. For each process:
#   "documents"   - the required documents; check_process_completeness() compares
#                   the words of each entry against the uploaded filenames.
#   "description" - human-readable summary (not read by the functions visible in
#                   this part of the file).
PROCESS_CHECKLISTS = {
    "Company Incorporation": {
        "documents": [
            "Business plan",
            "Articles of Association",
            "Memorandum of Association",
            "Resolution authorising incorporation",
            "Incorporation application form",
            "Authorised Signatories (appointment evidence)",
            "Directors (details/evidence)",
            "Registered office (lease + lease registration)",
            "Shareholders details",
            "Ultimate Beneficial Owners (UBO Declaration)",
            "Data Protection contact"
        ],
        "description": "Complete company incorporation in ADGM"
    },
    "Branch Registration": {
        "documents": [
            "Parent Articles of Association (certified, recent)",
            "Latest audited financials of parent company",
            "Parent board resolution to register branch",
            "Registered office in ADGM (lease + registration)",
            "Authorised signatories (UAE/GCC/resident requirement)",
            "All parent directors and secretaries details",
            "Shareholders of parent company (details)",
            "Ultimate Beneficial Owners of parent (25%+ ownership)",
            "Data Protection contact"
        ],
        "description": "Registration of foreign company branch in ADGM"
    },
    "Employment/HR": {
        "documents": [
            "Employment contract (ER 2024 compliant)",
            "HR policy reference (disciplinary/grievance procedures)",
            "Evidence of ADGM entity (for jurisdiction verification)"
        ],
        "description": "Employment documentation compliance"
    },
    "Data Protection": {
        "documents": [
            "Appropriate Policy Document (APD)",
            "Record of Processing Activities (RoPA) reference",
            "Privacy Notice (internal/external) references",
            "Data Protection Impact Assessment (if required)"
        ],
        "description": "Data protection compliance documentation"
    }
}

# ADGM-specific red flags and compliance checks.
# Each entry lists phrases that the analyzers look for as normalized substrings
# of the document text:
#   "correct" / "required" - at least one phrase must be present.
#   "incorrect"            - phrases that indicate a wrong forum.
#   "reference"            - citation copied into the reported issue.
# NOTE(review): the "severity" values here are not read by
# analyze_employment_contract() or analyze_corporate_document(); both hard-code
# the severity of the issues they raise, and the employment analyzer reports a
# missing governing-law clause as "Medium" although this table says "High".
ADGM_RED_FLAGS = {
    "jurisdiction": {
        "correct": ["abu dhabi global market", "adgm", "adgm courts", "adgm jurisdiction"],
        "incorrect": ["uae federal courts", "dubai courts", "abu dhabi courts", "sharjah courts"],
        "severity": "High",
        "reference": "ADGM Companies Regulations 2020, Article 6"
    },
    "governing_law": {
        "required": ["adgm common law", "english common law", "adgm laws"],
        "severity": "High",
        "reference": "ADGM Courts Law 2013"
    },
    "registered_office": {
        "required": ["registered office", "adgm address", "al maryah island"],
        "severity": "Medium",
        "reference": "ADGM Companies Regulations 2020, Article 15"
    }
}

# ============================================================================
# RAG SYSTEM IMPLEMENTATION
# ============================================================================

class ADGMKnowledgeBase:
    """RAG system for ADGM legal knowledge.

    Reference documents are split into text chunks, embedded with a
    sentence-transformer model and stored in a FAISS inner-product index.
    Whenever the model or the index is unavailable, searches fall back to
    plain keyword matching over the same chunks.
    """

    def __init__(self):
        """Create an empty knowledge base and try to load the embedding model."""
        self.embeddings_model = None  # SentenceTransformer, or None if loading failed
        self.knowledge_base = []      # chunk dicts with 'content', 'source', 'length'
        self.embeddings = None        # embedding matrix built by _create_embeddings_index
        self.index = None             # FAISS index built by _create_embeddings_index
        self.initialize_model()

    def initialize_model(self):
        """Initialize the sentence transformer model.

        On failure the error is printed and ``embeddings_model`` is left as
        ``None``, which makes every search use keyword matching instead.
        """
        try:
            self.embeddings_model = SentenceTransformer('all-MiniLM-L6-v2')
            print("✓ RAG embeddings model initialized")
        except Exception as e:
            print(f"✗ Failed to initialize embeddings model: {e}")
            # Fallback to simpler keyword matching
            self.embeddings_model = None

    def load_reference_documents(self):
        """Load and chunk every file listed in ``REFERENCE_FILES``.

        Missing files and per-file errors are reported and skipped. The chunks
        are appended to ``knowledge_base``; if any chunks exist and the
        embedding model is available, the FAISS index is (re)built afterwards.
        """
        print("Loading ADGM reference documents...")

        for ref_name, file_path in REFERENCE_FILES.items():
            try:
                if os.path.exists(file_path):
                    content = self._extract_text_from_file(file_path)
                    if content:
                        # Split content into chunks
                        chunks = self._split_into_chunks(content, ref_name)
                        self.knowledge_base.extend(chunks)
                        print(f"✓ Loaded {ref_name}: {len(chunks)} chunks")
                else:
                    print(f"⚠ Reference file not found: {file_path}")
            except Exception as e:
                print(f"✗ Error loading {ref_name}: {e}")

        if self.knowledge_base and self.embeddings_model is not None:
            self._create_embeddings_index()

        print(f"Knowledge base loaded with {len(self.knowledge_base)} chunks")

    def _extract_text_from_file(self, file_path: str) -> str:
        """Extract text from a PDF or DOCX file.

        Returns an empty string for any other extension or when extraction
        fails (the error is printed).
        """
        try:
            if file_path.lower().endswith('.pdf'):
                reader = PdfReader(file_path)
                return "\n".join([page.extract_text() or "" for page in reader.pages])
            elif file_path.lower().endswith('.docx'):
                doc = Document(file_path)
                texts = [para.text for para in doc.paragraphs if para.text.strip()]
                # Also extract table content
                for table in doc.tables:
                    for row in table.rows:
                        texts.append(" | ".join([cell.text for cell in row.cells]))
                return "\n".join(texts)
        except Exception as e:
            print(f"Error extracting text from {file_path}: {e}")
        return ""

    def _split_into_chunks(self, content: str, source: str, max_chars: int = 500) -> List[Dict]:
        """Group the sentences of ``content`` into chunks of about ``max_chars``.

        Sentences are split on ``.``, ``!`` or ``?`` followed by whitespace (the
        terminator itself is dropped) and joined with single spaces. A single
        sentence longer than ``max_chars`` becomes a chunk of its own.

        Args:
            content: Text to split.
            source: Label stored on every chunk (the reference name).
            max_chars: Size limit checked before appending a sentence.

        Returns:
            A list of dicts with ``content``, ``source`` and ``length`` keys.
        """
        sentences = re.split(r'[.!?]\s+', content)
        chunks = []

        current_chunk = ""
        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue

            # If adding this sentence would make chunk too long, start new chunk
            if len(current_chunk) + len(sentence) > max_chars:
                if current_chunk:
                    chunks.append({
                        'content': current_chunk.strip(),
                        'source': source,
                        'length': len(current_chunk)
                    })
                current_chunk = sentence
            else:
                current_chunk += (" " if current_chunk else "") + sentence

        # Add final chunk
        if current_chunk.strip():
            chunks.append({
                'content': current_chunk.strip(),
                'source': source,
                'length': len(current_chunk)
            })

        return chunks

    def _create_embeddings_index(self):
        """Embed every chunk and build the FAISS index for similarity search.

        Does nothing without an embedding model; failures are printed and leave
        the previous index (if any) in place.
        """
        if self.embeddings_model is None:
            return

        try:
            texts = [chunk['content'] for chunk in self.knowledge_base]
            self.embeddings = self.embeddings_model.encode(texts)

            # Create FAISS index
            dimension = self.embeddings.shape[1]
            self.index = faiss.IndexFlatIP(dimension)  # Inner product for cosine similarity

            # Normalize embeddings for cosine similarity
            faiss.normalize_L2(self.embeddings)
            self.index.add(self.embeddings)

            print(f"✓ Created FAISS index with {len(texts)} embeddings")
        except Exception as e:
            print(f"✗ Error creating embeddings index: {e}")

    def search_relevant_content(self, query: str, top_k: int = 5) -> List[Dict]:
        """Return up to ``top_k`` chunks most relevant to ``query``.

        Uses the FAISS index when both the model and the index exist; otherwise,
        or if the vector search raises, falls back to keyword search. Each
        result is a copy of a chunk with an added ``relevance_score``.
        """
        # Explicit None checks: truthiness of model/index objects is not a
        # reliable "is it initialised" signal.
        if self.embeddings_model is None or self.index is None:
            # Fallback to keyword search
            return self._keyword_search(query, top_k)

        try:
            # Encode query
            query_embedding = self.embeddings_model.encode([query])
            faiss.normalize_L2(query_embedding)

            # Search
            scores, indices = self.index.search(query_embedding, top_k)
            return self._results_from_hits(scores[0], indices[0])
        except Exception as e:
            print(f"Error in RAG search: {e}")
            return self._keyword_search(query, top_k)

    def _results_from_hits(self, scores, indices) -> List[Dict]:
        """Convert one row of index search output into scored chunk copies.

        FAISS pads the result with label ``-1`` when fewer than ``top_k``
        vectors are available. Those labels must be dropped explicitly: ``-1``
        is a valid Python list index and would otherwise return the last chunk
        as a bogus hit.
        """
        results = []
        for score, idx in zip(scores, indices):
            position = int(idx)
            if 0 <= position < len(self.knowledge_base):
                chunk = self.knowledge_base[position].copy()
                chunk['relevance_score'] = float(score)
                results.append(chunk)
        return results

    def _keyword_search(self, query: str, top_k: int) -> List[Dict]:
        """Fallback keyword-based search.

        Scores each chunk by the fraction of normalized query words that occur
        as substrings of the normalized chunk text, and returns the ``top_k``
        best-scoring chunks that matched at least one word.
        """
        query_words = normalize_text(query).split()
        results = []

        for chunk in self.knowledge_base:
            content_normalized = normalize_text(chunk['content'])
            score = sum(1 for word in query_words if word in content_normalized)

            # score > 0 implies query_words is non-empty, so the division is safe
            if score > 0:
                chunk_copy = chunk.copy()
                chunk_copy['relevance_score'] = score / len(query_words)
                results.append(chunk_copy)

        # Sort by relevance and return top_k
        results.sort(key=lambda x: x['relevance_score'], reverse=True)
        return results[:top_k]

# ============================================================================
# UTILITY FUNCTIONS
# ============================================================================

def normalize_text(text: str) -> str:
    """Return a comparison-friendly form of ``text``.

    Leading/trailing whitespace is removed, every whitespace run becomes one
    space, the result is transliterated with ``unidecode`` and lower-cased.
    Falsy input (``None`` or ``""``) yields an empty string.
    """
    if not text:
        return ""
    single_spaced = re.sub(r'\s+', ' ', text.strip())
    transliterated = unidecode(single_spaced)
    return transliterated.lower()

def read_docx_text(file_path: str) -> str:
    """Return the text of a DOCX file, one paragraph or table row per line.

    Non-blank paragraphs come first, followed by table rows whose cells are
    joined with " | ". Any error is printed and an empty string is returned.
    """
    try:
        document = Document(file_path)

        # Paragraphs that contain more than whitespace
        lines = [p.text for p in document.paragraphs if p.text.strip()]

        # Table rows, flattened to a single line each
        for tbl in document.tables:
            flattened_rows = (" | ".join(c.text for c in r.cells) for r in tbl.rows)
            lines.extend(flat for flat in flattened_rows if flat.strip())

        return "\n".join(lines)
    except Exception as e:
        print(f"Error reading DOCX file {file_path}: {e}")
        return ""

def read_pdf_text(file_path: str) -> str:
    """Return the text of a PDF file, pages separated by newlines.

    Pages for which no text is extracted are skipped. Any error is printed
    and an empty string is returned.
    """
    try:
        extracted = (page.extract_text() for page in PdfReader(file_path).pages)
        return "\n".join(page_text for page_text in extracted if page_text)
    except Exception as e:
        print(f"Error reading PDF file {file_path}: {e}")
        return ""

def detect_document_type(text: str, filename: str = "") -> str:
    """Classify a document by scoring ``DOCUMENT_TYPES`` keywords.

    Every keyword found in the normalized text adds the type's weight; every
    keyword found in the normalized filename adds half of it. The first type
    with the strictly highest score is returned, or "unknown" when no keyword
    matches at all.
    """
    body = normalize_text(text)
    name = normalize_text(filename)

    winner, top_score = "unknown", 0
    for candidate, spec in DOCUMENT_TYPES.items():
        weight = spec["weight"]
        needles = [normalize_text(kw) for kw in spec["keywords"]]

        total = 0
        # Content matches count at full weight
        for needle in needles:
            if needle in body:
                total += weight
        # Filename matches count at half weight
        for needle in needles:
            if needle in name:
                total += weight * 0.5

        if total > top_score:
            winner, top_score = candidate, total

    # winner only changes on a score above zero, so it is still "unknown"
    # whenever nothing matched
    return winner

# ============================================================================
# DOCUMENT ANALYSIS FUNCTIONS
# ============================================================================

def analyze_employment_contract(text: str, knowledge_base: ADGMKnowledgeBase) -> Dict:
    """Check an employment contract against the ADGM field and clause rules.

    The contract text is normalized and searched (by substring) for:
      * every field in ``ENHANCED_FIELD_MAP`` - a missing required field is a
        "High" issue;
      * jurisdiction wording from ``ADGM_RED_FLAGS["jurisdiction"]`` - a wrong
        forum is reported even when ADGM is also mentioned somewhere in the
        text, and a document with no ADGM reference at all is reported too;
      * governing-law wording from ``ADGM_RED_FLAGS["governing_law"]``.

    Args:
        text: Raw contract text.
        knowledge_base: Optional knowledge base; when it has an embedding model
            it is queried for supporting reference excerpts.

    Returns:
        A dict with ``missing_fields``, ``found_fields``, ``issues``,
        ``compliance_score`` (0-100: 70 for required fields, 20 for
        jurisdiction, 10 for governing law) and ``adgm_references``.
    """
    normalized_text = normalize_text(text)

    def mentions_any(terms) -> bool:
        """Return True if any normalized term occurs in the contract text."""
        return any(normalize_text(term) in normalized_text for term in terms)

    result = {
        "missing_fields": [],
        "found_fields": [],
        "issues": [],
        "compliance_score": 0,
        "adgm_references": []
    }

    # Check required fields
    for field_name, config in ENHANCED_FIELD_MAP.items():
        if mentions_any(config["variations"]):
            result["found_fields"].append(field_name)
        elif config["required"]:
            result["missing_fields"].append(field_name)
            result["issues"].append({
                "issue": f"Missing required field: {field_name}",
                "severity": "High",
                "suggestion": f"Add {field_name} as required by {config['adgm_ref']}",
                "adgm_reference": config['adgm_ref']
            })

    # Check ADGM jurisdiction compliance. The wrong-forum scan runs regardless
    # of whether ADGM is mentioned: nearly every ADGM document contains the
    # word "ADGM", so gating the scan on its absence would hide the red flag.
    jurisdiction_rules = ADGM_RED_FLAGS["jurisdiction"]
    adgm_jurisdiction_found = mentions_any(jurisdiction_rules["correct"])
    incorrect_jurisdiction = next(
        (term for term in jurisdiction_rules["incorrect"]
         if normalize_text(term) in normalized_text),
        None,
    )

    if incorrect_jurisdiction or not adgm_jurisdiction_found:
        issue_text = "No ADGM jurisdiction reference found"
        if incorrect_jurisdiction:
            issue_text = f"Incorrect jurisdiction reference found: {incorrect_jurisdiction}"

        result["issues"].append({
            "issue": issue_text,
            "severity": "High",
            "suggestion": "Add governing law clause referencing ADGM Courts and ADGM jurisdiction",
            "adgm_reference": jurisdiction_rules["reference"]
        })

    # Check governing law
    governing_law_found = mentions_any(ADGM_RED_FLAGS["governing_law"]["required"])

    if not governing_law_found:
        result["issues"].append({
            "issue": "No explicit governing law clause found",
            "severity": "Medium",
            "suggestion": "Add clause specifying ADGM Common Law as governing law",
            "adgm_reference": ADGM_RED_FLAGS["governing_law"]["reference"]
        })

    # Use RAG to find relevant ADGM employment regulations
    if knowledge_base and knowledge_base.embeddings_model:
        employment_query = "employment contract requirements ADGM ER 2024"
        relevant_content = knowledge_base.search_relevant_content(employment_query, top_k=3)

        for content in relevant_content:
            if content['relevance_score'] > 0.3:  # Threshold for relevance
                result["adgm_references"].append({
                    "source": content["source"],
                    "content_preview": content["content"][:200] + "...",
                    "relevance": content["relevance_score"]
                })

    # Calculate compliance score
    total_required = sum(1 for config in ENHANCED_FIELD_MAP.values() if config["required"])
    found_required = len([f for f in result["found_fields"]
                         if ENHANCED_FIELD_MAP[f]["required"]])

    field_score = (found_required / total_required) * 70 if total_required > 0 else 0
    # Jurisdiction points are only earned when ADGM is named and no wrong forum is
    jurisdiction_ok = adgm_jurisdiction_found and not incorrect_jurisdiction
    jurisdiction_score = 20 if jurisdiction_ok else 0
    governing_law_score = 10 if governing_law_found else 0

    result["compliance_score"] = field_score + jurisdiction_score + governing_law_score

    return result

def analyze_corporate_document(text: str, doc_type: str, knowledge_base: ADGMKnowledgeBase) -> Dict:
    """Analyze corporate documents (Articles, Resolutions, etc.).

    Runs the checks common to all corporate documents (ADGM jurisdiction and
    registered office wording), then the checks specific to ``doc_type``, and
    finally derives a score from the combined issue list.

    Args:
        text: Raw document text.
        doc_type: Type identifier; "articles_of_association", "resolution" and
            "memorandum_of_association" get additional checks.
        knowledge_base: Optional knowledge base; when it has an embedding model
            it is queried for supporting reference excerpts.

    Returns:
        A dict with ``issues``, ``compliance_score`` (100 minus 25 per "High"
        and 10 per "Medium" issue, floored at 0) and ``adgm_references``.
    """
    normalized_text = normalize_text(text)

    result = {
        "issues": [],
        "compliance_score": 0,
        "adgm_references": []
    }

    # Common checks for corporate documents
    # Check ADGM jurisdiction
    if not any(normalize_text(term) in normalized_text
               for term in ADGM_RED_FLAGS["jurisdiction"]["correct"]):
        result["issues"].append({
            "issue": "ADGM jurisdiction not specified",
            "severity": "High",
            "suggestion": "Specify ADGM Courts jurisdiction",
            "adgm_reference": ADGM_RED_FLAGS["jurisdiction"]["reference"]
        })

    # Check registered office requirement
    if not any(normalize_text(term) in normalized_text
               for term in ADGM_RED_FLAGS["registered_office"]["required"]):
        result["issues"].append({
            "issue": "Registered office address not clearly specified",
            "severity": "Medium",
            "suggestion": "Include proper ADGM registered office address",
            "adgm_reference": ADGM_RED_FLAGS["registered_office"]["reference"]
        })

    # Document-specific checks
    specific_analyzers = {
        "articles_of_association": analyze_articles_of_association,
        "resolution": analyze_resolution,
        "memorandum_of_association": analyze_memorandum,
    }
    analyzer = specific_analyzers.get(doc_type)
    if analyzer is not None:
        # Append rather than result.update(...): the analyzers return their own
        # {"issues": [...]} dict, and update() would replace the list built
        # above, silently dropping the common-check findings.
        result["issues"].extend(analyzer(normalized_text).get("issues", []))

    # Use RAG for document-specific guidance
    if knowledge_base and knowledge_base.embeddings_model:
        query = f"{doc_type} requirements ADGM incorporation"
        relevant_content = knowledge_base.search_relevant_content(query, top_k=3)

        for content in relevant_content:
            if content['relevance_score'] > 0.3:
                result["adgm_references"].append({
                    "source": content["source"],
                    "content_preview": content["content"][:200] + "...",
                    "relevance": content["relevance_score"]
                })

    # Calculate basic compliance score
    high_issues = sum(1 for issue in result["issues"] if issue["severity"] == "High")
    medium_issues = sum(1 for issue in result["issues"] if issue["severity"] == "Medium")

    result["compliance_score"] = max(0, 100 - (high_issues * 25) - (medium_issues * 10))

    return result

def analyze_articles_of_association(normalized_text: str) -> Dict:
    """Report essential clauses that Articles of Association appear to lack.

    A clause counts as present when any of its keywords occurs as a substring
    of ``normalized_text`` (expected to be already normalized). Returns
    ``{"issues": [...]}`` with one "High" issue per absent clause.
    """
    clause_keywords = {
        "company name": ["company name", "name of the company"],
        "registered office": ["registered office", "registered address"],
        "objects": ["objects", "purposes", "business activities"],
        "share capital": ["share capital", "authorized capital", "capital"],
        "directors": ["directors", "board of directors"],
        "shareholders": ["shareholders", "members"]
    }

    # A clause is absent when none of its keywords appears in the text
    absent_clauses = [
        clause
        for clause, terms in clause_keywords.items()
        if all(term not in normalized_text for term in terms)
    ]

    return {
        "issues": [
            {
                "issue": f"Missing or unclear {clause} clause",
                "severity": "High",
                "suggestion": f"Include proper {clause} clause as per ADGM requirements",
                "adgm_reference": "ADGM Companies Regulations 2020"
            }
            for clause in absent_clauses
        ]
    }

def analyze_resolution(normalized_text: str) -> Dict:
    """Report the basic elements a resolution appears to lack.

    An element counts as present when any of its keywords occurs as a
    substring of ``normalized_text`` (expected to be already normalized).
    Returns ``{"issues": [...]}`` with one "Medium" issue per absent element.
    """
    element_keywords = {
        "date": ["date", "dated"],
        "quorum": ["quorum", "present"],
        "authorization": ["authorize", "authorise", "resolved"],
        "signature": ["signature", "signed", "director"]
    }

    def is_absent(terms) -> bool:
        """True when none of the terms appears in the text."""
        return all(term not in normalized_text for term in terms)

    findings = []
    for element, terms in element_keywords.items():
        if is_absent(terms):
            findings.append({
                "issue": f"Missing {element} in resolution",
                "severity": "Medium",
                "suggestion": f"Include proper {element} as required for valid resolutions",
                "adgm_reference": "ADGM Companies Regulations 2020"
            })

    return {"issues": findings}

def analyze_memorandum(normalized_text: str) -> Dict:
    """Report essential clauses a Memorandum of Association appears to lack.

    A clause counts as present when any of its keywords occurs as a substring
    of ``normalized_text`` (expected to be already normalized). Returns
    ``{"issues": [...]}`` with one "High" issue per absent clause.
    """
    clause_keywords = {
        "subscribers": ["subscribers", "founding members"],
        "incorporation": ["incorporation", "form a company"],
        "liability": ["liability", "limited liability"],
        "capital": ["capital", "share capital"]
    }

    findings = []
    for clause, terms in clause_keywords.items():
        present = False
        for term in terms:
            if term in normalized_text:
                present = True
                break
        if present:
            continue
        findings.append({
            "issue": f"Missing {clause} clause in memorandum",
            "severity": "High",
            "suggestion": f"Include {clause} clause as required by ADGM",
            "adgm_reference": "ADGM Companies Regulations 2020"
        })

    return {"issues": findings}

# ============================================================================
# PROCESS DETECTION AND CHECKLIST VERIFICATION
# ============================================================================

def guess_legal_process(document_types: List[str], filenames: List[str]) -> str:
    """Guess which legal process a set of uploaded documents belongs to.

    Decision order:
      1. Any Articles, Memorandum or Resolution: "Branch Registration" when a
         branch hint exists, otherwise "Company Incorporation".
      2. Any employment contract: "Employment/HR".
      3. Any Appropriate Policy Document: "Data Protection".
      4. A branch hint on its own: "Branch Registration".
      5. Anything else: "Company Incorporation" (the default).

    A branch hint is the word "branch" in any filename (case-insensitive) or
    in any detected document type (e.g. "branch_registration").

    Args:
        document_types: Detected type identifier of each uploaded document.
        filenames: Names of the uploaded files.

    Returns:
        One of the process names used as keys of ``PROCESS_CHECKLISTS``.
    """
    filename_text = " ".join(filenames).lower()

    # Evaluate the hint once so a detected branch document is honoured even
    # when no Articles/Memorandum/Resolution accompanies it.
    branch_hint = "branch" in filename_text or any(
        "branch" in doc_type for doc_type in document_types
    )

    corporate_types = ("articles_of_association", "memorandum_of_association", "resolution")
    if any(doc_type in corporate_types for doc_type in document_types):
        return "Branch Registration" if branch_hint else "Company Incorporation"

    if "employment_contract" in document_types:
        return "Employment/HR"

    if "apd" in document_types:
        return "Data Protection"

    if branch_hint:
        return "Branch Registration"

    # Default to most common process
    return "Company Incorporation"

# Filler words that say nothing about which document a checklist entry names.
_CHECKLIST_STOPWORDS = frozenset({"all", "and", "for", "the"})


def _checklist_keywords(required_doc: str) -> List[str]:
    """Return the distinct, meaningful words of a checklist entry, in order."""
    normalized = normalize_text(required_doc)
    # Alphanumeric runs only, so "(lease" / "registration)" become real words
    words = re.findall(r"[a-z0-9]+", normalized)
    keywords = [
        word for word in dict.fromkeys(words)
        if len(word) > 2 and word not in _CHECKLIST_STOPWORDS
    ]
    # Never return nothing for a non-empty entry: fall back to the raw tokens
    return keywords or normalized.split()


def check_process_completeness(process: str, uploaded_filenames: List[str]) -> Tuple[int, int, List[str]]:
    """Check uploaded filenames against the document checklist of a process.

    A required document counts as present when at least two of its keywords
    (or its only keyword) occur as substrings of one normalized filename.
    Keywords are the entry's distinct alphanumeric words, excluding words of
    one or two characters and common filler words, so that entries such as
    "Articles of Association" and "Memorandum of Association" cannot satisfy
    each other through "of" + "association".

    Args:
        process: A key of ``PROCESS_CHECKLISTS``.
        uploaded_filenames: Names of the uploaded files.

    Returns:
        ``(present_count, required_count, missing_documents)``; ``(0, 0, [])``
        for an unknown process.
    """
    if process not in PROCESS_CHECKLISTS:
        return 0, 0, []

    required_docs = PROCESS_CHECKLISTS[process]["documents"]
    uploaded_normalized = [normalize_text(name) for name in uploaded_filenames]

    missing_docs = []
    for required_doc in required_docs:
        keywords = _checklist_keywords(required_doc)
        threshold = min(2, len(keywords))

        doc_found = any(
            sum(1 for keyword in keywords if keyword in uploaded_name) >= threshold
            for uploaded_name in uploaded_normalized
        )
        if not doc_found:
            missing_docs.append(required_doc)

    present_count = len(required_docs) - len(missing_docs)
    return present_count, len(required_docs), missing_docs

# ============================================================================
# DOCUMENT ANNOTATION AND OUTPUT GENERATION
# ============================================================================

def annotate_docx_document(input_path: str, issues: List[Dict], output_path: str = None) -> bytes:
    """Add review comments to a DOCX document.

    A red summary paragraph is placed at the top of the document when there is
    at least one issue. Each issue is then written as an italic, severity
    coloured paragraph directly after the first original paragraph containing
    one of the first three normalized words of the issue text; issues without
    a matching paragraph are appended at the end of the document.

    Args:
        input_path: Path of the DOCX file to annotate.
        issues: Issue dicts; the keys ``issue``, ``severity``, ``suggestion``
            and ``adgm_reference`` are read, all optional.
        output_path: When given, the annotated document is also written there.

    Returns:
        The annotated document as bytes, or ``b""`` when annotation failed
        (the error is printed, not raised).
    """
    # python-docx offers Paragraph.insert_paragraph_before() but has no
    # "insert after" counterpart, so the new <w:p> element is attached to the
    # XML tree directly. Imported locally to keep this block self-contained.
    from docx.oxml.shared import OxmlElement
    from docx.text.paragraph import Paragraph

    def _insert_paragraph_after(anchor):
        """Create an empty paragraph right after *anchor* and return it."""
        new_element = OxmlElement("w:p")
        anchor._p.addnext(new_element)
        return Paragraph(new_element, anchor._parent)

    def _severity_color(severity):
        """Map a severity label to its comment colour."""
        if severity == 'High':
            return RGBColor(255, 0, 0)  # Red
        if severity == 'Medium':
            return RGBColor(255, 165, 0)  # Orange
        return RGBColor(0, 0, 255)  # Blue

    def _write_comment(paragraph, number, issue, prefix=""):
        """Fill *paragraph* with the formatted review comment for *issue*."""
        comment_text = (
            f"{prefix}[REVIEW #{number}] "
            f"Issue: {issue.get('issue', 'Unknown issue')}\n"
            f"Severity: {issue.get('severity', 'Medium')}\n"
            f"Suggestion: {issue.get('suggestion', 'Please review')}\n"
            f"ADGM Reference: {issue.get('adgm_reference', 'General ADGM Regulations')}\n"
        )
        comment_run = paragraph.add_run(comment_text)
        comment_run.font.size = Pt(9)
        comment_run.font.color.rgb = _severity_color(issue.get('severity'))
        comment_run.italic = True

    try:
        doc = Document(input_path)

        # Snapshot the original paragraphs before inserting anything, so the
        # keyword search below never matches the summary or earlier comments.
        body_paragraphs = list(doc.paragraphs)
        searchable = (
            [(para, normalize_text(para.text)) for para in body_paragraphs]
            if issues else []
        )

        # Add a review summary at the beginning
        if issues:
            if body_paragraphs:
                summary_para = body_paragraphs[0].insert_paragraph_before()
            else:
                # Body without paragraphs: there is nothing to insert before.
                summary_para = doc.add_paragraph()
            summary_run = summary_para.add_run(
                f"\n=== ADGM COMPLIANCE REVIEW SUMMARY ===\n"
                f"Total Issues Found: {len(issues)}\n"
                f"Review Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n"
                f"{'='*50}\n"
            )
            summary_run.font.size = Pt(10)
            summary_run.font.color.rgb = RGBColor(255, 0, 0)  # Red color
            summary_run.bold = True

        # Process each issue
        for number, issue in enumerate(issues, start=1):
            # First 3 words of the issue text drive the paragraph lookup
            issue_keywords = normalize_text(issue.get("issue", "")).split()[:3]

            anchor = None
            for para, para_text_normalized in searchable:
                if any(keyword in para_text_normalized for keyword in issue_keywords if keyword):
                    anchor = para
                    break

            if anchor is not None:
                _write_comment(_insert_paragraph_after(anchor), number, issue)
            else:
                # No related paragraph found: append the comment at the end
                _write_comment(doc.add_paragraph(), number, issue, prefix="\n")

        # Serialize once and reuse the bytes for the optional file copy
        output_buffer = io.BytesIO()
        doc.save(output_buffer)
        annotated_bytes = output_buffer.getvalue()

        if output_path:
            with open(output_path, "wb") as output_file:
                output_file.write(annotated_bytes)

        return annotated_bytes

    except Exception as e:
        print(f"Error annotating document {input_path}: {e}")
        return b""

def generate_compliance_report(analysis_results: Dict) -> Dict:
    """Generate a comprehensive compliance report from raw analysis results.

    The compliance score starts at 100 and loses 25 points per high-severity
    issue and 10 points per medium-severity issue, floored at 0. Issues with
    any other severity are counted as low severity and carry no penalty.

    Args:
        analysis_results: Dict with the optional keys ``process``,
            ``documents_uploaded``, ``required_documents``,
            ``missing_document`` and ``issues_found``.

    Returns:
        A report dict with the sections ``analysis_summary``,
        ``issue_breakdown``, ``detailed_issues``, ``recommendations``,
        ``next_steps``, ``adgm_references``, ``generated_on`` and
        ``disclaimer``.
    """
    # Local import: the file only imports ``datetime`` from the module.
    from datetime import timezone

    issues = analysis_results.get("issues_found") or []
    missing_documents = analysis_results.get("missing_document") or []
    documents_uploaded = analysis_results.get("documents_uploaded", 0)
    required_documents = analysis_results.get("required_documents", 0)

    # Calculate overall compliance metrics
    total_issues = len(issues)
    high_severity_issues = sum(1 for issue in issues if issue.get("severity") == "High")
    medium_severity_issues = sum(1 for issue in issues if issue.get("severity") == "Medium")

    # Calculate overall compliance score (no issues means no penalty, i.e. 100)
    penalty = (high_severity_issues * 25) + (medium_severity_issues * 10)
    overall_compliance = max(0, 100 - penalty)

    # Determine compliance status
    if overall_compliance >= 90:
        compliance_status = "Excellent"
    elif overall_compliance >= 75:
        compliance_status = "Good"
    elif overall_compliance >= 50:
        compliance_status = "Needs Improvement"
    else:
        compliance_status = "Poor - Significant Issues"

    # Generate recommendations
    recommendations = []
    if high_severity_issues > 0:
        recommendations.append("Address all high-severity issues immediately before submission")
    if medium_severity_issues > 0:
        recommendations.append("Review and resolve medium-severity issues for better compliance")
    if missing_documents:
        recommendations.append("Prepare and upload all missing required documents")

    if not recommendations:
        recommendations.append("Documents appear to be in good compliance with ADGM requirements")

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # reference list is stable between runs (a set would not be).
    adgm_references = list(dict.fromkeys(
        issue.get("adgm_reference")
        for issue in issues
        if issue.get("adgm_reference")
    ))

    # Prepare final report
    report = {
        "analysis_summary": {
            # ``or`` also covers a stored None, which .get()'s default would not
            "process_type": analysis_results.get("process") or "Unknown",
            "documents_uploaded": documents_uploaded,
            "required_documents": required_documents,
            # max(1, ...) guards against division by zero
            "completeness_percentage": round(
                (documents_uploaded / max(1, required_documents)) * 100, 1
            ),
            "missing_documents": missing_documents,
            "overall_compliance_score": overall_compliance,
            "compliance_status": compliance_status
        },
        "issue_breakdown": {
            "total_issues": total_issues,
            "high_severity": high_severity_issues,
            "medium_severity": medium_severity_issues,
            "low_severity": total_issues - high_severity_issues - medium_severity_issues
        },
        "detailed_issues": issues,
        "recommendations": recommendations,
        "next_steps": [
            "Review all flagged issues in the annotated documents",
            "Consult ADGM regulations for specific requirements",
            "Consider professional legal review before final submission",
            "Prepare any missing documents identified in the analysis"
        ],
        "adgm_references": adgm_references,
        # The timestamp is labelled UTC, so it must be taken in UTC
        "generated_on": datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC"),
        "disclaimer": "This automated review is for guidance only. Professional legal advice should be sought for final compliance verification."
    }

    return report

# ============================================================================
# MAIN ANALYSIS ENGINE
# ============================================================================

def analyze_uploaded_documents(file_paths: List[str], knowledge_base: ADGMKnowledgeBase) -> Tuple[List[str], str]:
    """Analyze uploaded documents and build the JSON compliance report.

    Each DOCX/PDF file is read, classified and analyzed; DOCX files with
    issues are additionally annotated and saved next to the input with a
    ``_REVIEWED.docx`` suffix. Files that are unsupported, yield no text, or
    raise during processing are skipped with a printed message.

    Args:
        file_paths: Paths of the uploaded documents.
        knowledge_base: Knowledge base handed to the per-document analyzers.

    Returns:
        A tuple of (paths of the annotated DOCX files that were actually
        written, JSON string holding the basic results, the comprehensive
        report and the annotated file names).
    """

    print(f"Starting analysis of {len(file_paths)} documents...")

    # Initialize results
    results = {
        "process": None,
        "documents_uploaded": 0,
        "required_documents": 0,
        "missing_document": [],
        "issues_found": [],
        "document_analysis": []
    }

    annotated_paths = []
    document_types = []
    filenames = []

    # Process each uploaded file
    for file_path in file_paths:
        try:
            filename = os.path.basename(file_path)
            filenames.append(filename)

            print(f"Processing: {filename}")

            # Extract text based on file type
            if file_path.lower().endswith('.docx'):
                document_text = read_docx_text(file_path)
                can_annotate = True
            elif file_path.lower().endswith('.pdf'):
                document_text = read_pdf_text(file_path)
                can_annotate = False
            else:
                print(f"Unsupported file type: {filename}")
                continue

            if not document_text.strip():
                print(f"No text extracted from: {filename}")
                continue

            # Detect document type
            doc_type = detect_document_type(document_text, filename)
            document_types.append(doc_type)

            print(f"Detected type: {doc_type}")

            # Analyze document based on type
            doc_issues = []
            doc_analysis = {
                "filename": filename,
                "document_type": doc_type,
                "issues": [],
                "compliance_score": 0
            }

            if doc_type == "employment_contract":
                analysis_result = analyze_employment_contract(document_text, knowledge_base)
                doc_issues.extend(analysis_result.get("issues", []))
                doc_analysis.update({
                    "missing_fields": analysis_result.get("missing_fields", []),
                    "found_fields": analysis_result.get("found_fields", []),
                    "compliance_score": analysis_result.get("compliance_score", 0)
                })
            else:
                # Corporate documents and every other type share one analyzer
                analysis_result = analyze_corporate_document(document_text, doc_type, knowledge_base)
                doc_issues.extend(analysis_result.get("issues", []))
                doc_analysis["compliance_score"] = analysis_result.get("compliance_score", 0)

            # Add document reference to issues
            for issue in doc_issues:
                issue["document"] = filename
                issue["document_type"] = doc_type

            doc_analysis["issues"] = doc_issues
            results["document_analysis"].append(doc_analysis)
            results["issues_found"].extend(doc_issues)

            # Annotate DOCX files if there are issues
            if can_annotate and doc_issues:
                try:
                    # The extension check above is case-insensitive, so strip
                    # the suffix by length instead of a case-sensitive replace
                    # (which would also touch ".docx" elsewhere in the path).
                    reviewed_path = f"{file_path[:-len('.docx')]}_REVIEWED.docx"
                    annotated_bytes = annotate_docx_document(file_path, doc_issues, reviewed_path)
                    # annotate_docx_document signals failure with empty bytes
                    if annotated_bytes:
                        annotated_paths.append(reviewed_path)
                        print(f"✓ Annotated: {filename}")
                    else:
                        print(f"✗ Failed to annotate {filename}")
                except Exception as e:
                    print(f"✗ Failed to annotate {filename}: {e}")

        except Exception as e:
            print(f"Error processing {file_path}: {e}")
            continue

    # Determine legal process and check completeness
    if document_types:
        legal_process = guess_legal_process(document_types, filenames)
        present_count, required_count, missing_docs = check_process_completeness(legal_process, filenames)

        results.update({
            "process": legal_process,
            "documents_uploaded": present_count,
            "required_documents": required_count,
            "missing_document": missing_docs
        })

        print(f"Detected process: {legal_process}")
        print(f"Document completeness: {present_count}/{required_count}")

    # Generate comprehensive report
    comprehensive_report = generate_compliance_report(results)

    # Format final output
    final_output = {
        "basic_results": results,
        "comprehensive_report": comprehensive_report,
        "annotated_files": [os.path.basename(path) for path in annotated_paths]
    }

    return annotated_paths, json.dumps(final_output, indent=2, ensure_ascii=False)

# ============================================================================
# GRADIO INTERFACE
# ============================================================================

def create_gradio_interface():
    """Create the Gradio interface for the ADGM Corporate Agent.

    Builds and loads the knowledge base once, then wires a file upload and an
    "Analyze Documents" button to a handler that returns a Markdown status
    summary and the JSON report.

    Returns:
        The configured ``gr.Blocks`` application (not yet launched).
    """

    # Initialize knowledge base
    print("Initializing ADGM Knowledge Base...")
    kb = ADGMKnowledgeBase()
    kb.load_reference_documents()

    def process_files_interface(files):
        """Run the analysis for the uploaded files.

        Returns a ``(status_markdown, report_json)`` pair for the two outputs.
        NOTE(review): the annotated DOCX files live in the temporary directory
        that is deleted before returning and are not part of the outputs, so
        the UI only reports how many files were annotated.
        """
        if not files:
            return "❌ No files uploaded", "Please upload at least one document for analysis."

        # Bound before the try block so the cleanup never sees an unset name
        temp_dir = None
        try:
            # Create temporary directory
            temp_dir = tempfile.mkdtemp(prefix="adgm_analysis_")
            file_paths = []

            # Copy uploaded files to temp directory
            for file_obj in files:
                # Uploads normally expose the path as ``.name``; fall back to
                # the value itself in case a plain path string is delivered.
                source_path = getattr(file_obj, "name", file_obj)
                temp_path = os.path.join(temp_dir, os.path.basename(source_path))
                shutil.copy2(source_path, temp_path)
                file_paths.append(temp_path)

            # Perform analysis
            annotated_files, report_json = analyze_uploaded_documents(file_paths, kb)

            # Parse the report for summary
            try:
                report_data = json.loads(report_json)
                comprehensive_report = report_data.get("comprehensive_report", {})
                analysis_summary = comprehensive_report.get("analysis_summary", {})
                issue_breakdown = comprehensive_report.get('issue_breakdown', {})

                # Create status summary
                status_summary = f"""
✅ **Analysis Complete**

📊 **Process Detected**: {analysis_summary.get('process_type', 'Unknown')}

📄 **Document Status**: {analysis_summary.get('documents_uploaded', 0)}/{analysis_summary.get('required_documents', 0)} required documents uploaded ({analysis_summary.get('completeness_percentage', 0)}% complete)

🎯 **Compliance Score**: {analysis_summary.get('overall_compliance_score', 0)}/100 ({analysis_summary.get('compliance_status', 'Unknown')})

⚠️ **Issues Found**: {issue_breakdown.get('total_issues', 0)} total
   - High Severity: {issue_breakdown.get('high_severity', 0)}
   - Medium Severity: {issue_breakdown.get('medium_severity', 0)}
   - Low Severity: {issue_breakdown.get('low_severity', 0)}

📝 **Reviewed Files**: {len(annotated_files)} document(s) annotated with review comments

🔍 **Missing Documents**: {len(analysis_summary.get('missing_documents', []))} required document(s) missing
"""

                # Add missing documents list if any
                if analysis_summary.get('missing_documents'):
                    status_summary += "\n**Missing Documents:**\n"
                    for doc in analysis_summary.get('missing_documents', []):
                        status_summary += f"   - {doc}\n"

                return status_summary, report_json

            except json.JSONDecodeError:
                return "✅ Analysis complete (parsing error)", report_json

        except Exception as e:
            error_msg = f"❌ Analysis failed: {str(e)}"
            return error_msg, json.dumps({"error": str(e)}, indent=2)

        finally:
            # Cleanup temp directory (best effort; rmtree errors are ignored)
            if temp_dir is not None:
                shutil.rmtree(temp_dir, ignore_errors=True)

    # Create Gradio interface
    with gr.Blocks(
        title="ADGM Corporate Agent",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container {
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
        }
        .title {
            text-align: center;
            background: linear-gradient(90deg, #1e3a8a 0%, #3b82f6 100%);
            color: white;
            padding: 20px;
            margin-bottom: 20px;
            border-radius: 10px;
        }
        .info-box {
            background-color: #f0f9ff;
            border: 1px solid #0284c7;
            padding: 15px;
            border-radius: 8px;
            margin: 10px 0;
        }
        """
    ) as demo:

        # Title and description
        gr.HTML("""
        <div class="title">
            <h1>🏛️ ADGM Corporate Agent</h1>
            <h3>AI-Powered Legal Document Intelligence & Compliance Checker</h3>
            <p>Abu Dhabi Global Market (ADGM) Document Review System</p>
        </div>
        """)

        gr.HTML("""
        <div class="info-box">
            <h4>📋 What this system does:</h4>
            <ul>
                <li><strong>Document Analysis:</strong> Reviews DOCX/PDF files for ADGM compliance</li>
                <li><strong>Process Detection:</strong> Automatically identifies legal processes (Incorporation, Employment, etc.)</li>
                <li><strong>Compliance Checking:</strong> Validates documents against ADGM regulations</li>
                <li><strong>Document Annotation:</strong> Adds review comments directly to DOCX files</li>
                <li><strong>Completeness Verification:</strong> Checks if all required documents are present</li>
                <li><strong>RAG-Enhanced Analysis:</strong> Uses ADGM reference documents for accurate guidance</li>
            </ul>
        </div>
        """)

        # File upload section
        with gr.Row():
            with gr.Column():
                files_input = gr.File(
                    file_count="multiple",
                    file_types=[".docx", ".pdf"],
                    label="📁 Upload Legal Documents",

                )

                analyze_btn = gr.Button(
                    "🔍 Analyze Documents",
                    variant="primary",
                    size="lg"
                )

        # Results section
        with gr.Row():
            with gr.Column():
                status_output = gr.Markdown(
                    label="📊 Analysis Summary",
                    value="Upload documents and click 'Analyze Documents' to begin..."
                )

        with gr.Row():
            with gr.Column():
                report_output = gr.Code(
                    label="📄 Detailed JSON Report",
                    language="json",
                    lines=30,
                    value="Detailed analysis report will appear here..."
                )

        # Footer information
        gr.HTML("""
        <div class="info-box">
            <h4>📚 Supported Document Types:</h4>
            <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); gap: 10px;">
                <div>
                    <strong>Company Formation:</strong>
                    <ul style="margin: 5px 0;">
                        <li>Articles of Association</li>
                        <li>Memorandum of Association</li>
                        <li>Board Resolutions</li>
                        <li>Incorporation Forms</li>
                    </ul>
                </div>
                <div>
                    <strong>Employment & HR:</strong>
                    <ul style="margin: 5px 0;">
                        <li>Employment Contracts</li>
                        <li>HR Policies</li>
                        <li>Disciplinary Procedures</li>
                    </ul>
                </div>
                <div>
                    <strong>Data Protection:</strong>
                    <ul style="margin: 5px 0;">
                        <li>Appropriate Policy Documents</li>
                        <li>Privacy Notices</li>
                        <li>Data Processing Records</li>
                    </ul>
                </div>
                <div>
                    <strong>Branch Registration:</strong>
                    <ul style="margin: 5px 0;">
                        <li>Parent Company Documents</li>
                        <li>Financial Statements</li>
                        <li>Registration Applications</li>
                    </ul>
                </div>
            </div>
        </div>
        """)

        gr.HTML("""
        <div style="text-align: center; padding: 20px; color: #6b7280; font-size: 12px;">
            <p><strong>Disclaimer:</strong> This automated review is for guidance only. Professional legal advice should be sought for final compliance verification.</p>
            <p>© 2024 ADGM Corporate Agent - Powered by AI & RAG Technology</p>
        </div>
        """)

        # Event handlers
        analyze_btn.click(
            fn=process_files_interface,
            inputs=[files_input],
            outputs=[status_output, report_output],
            show_progress=True
        )

    return demo

# ============================================================================
# MAIN EXECUTION
# ============================================================================

if __name__ == "__main__":
    print("🚀 Starting ADGM Corporate Agent...")
    print("📚 Loading knowledge base and initializing system...")

    # Launch settings: public share link, listening on all interfaces at
    # port 7860, with error display and debug mode switched on.
    launch_options = dict(
        share=True,
        inbrowser=True,
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
        debug=True,
    )

    try:
        # Build the UI first, then start serving it
        demo = create_gradio_interface()
        demo.launch(**launch_options)
    except Exception as startup_error:
        import traceback

        print(f"❌ Failed to start application: {startup_error}")
        traceback.print_exc()

# ============================================================================
# TESTING FUNCTIONS (Optional)
# ============================================================================

def test_system_with_sample_documents():
    """Smoke-test document type detection and knowledge base search.

    Prints the detected type for three sample texts and the number of chunks
    returned by one search query (or a fallback note when the knowledge base
    has no embeddings model). The check marks are printed unconditionally:
    nothing is compared against an expected value, so this only shows that
    the calls complete without raising.

    Returns:
        Always True.
    """
    print("🧪 Running system tests...")

    # Build and load the knowledge base
    knowledge_base = ADGMKnowledgeBase()
    knowledge_base.load_reference_documents()

    # (label, sample text) pairs fed to the type detector
    detection_samples = (
        ("employment", "This Employment Contract is made between the Employer and Employee..."),
        ("articles", "Articles of Association of XYZ Company Limited registered in ADGM..."),
        ("resolution", "Board Resolution for incorporation of company in Abu Dhabi Global Market..."),
    )

    print("\n📝 Testing document type detection:")
    for label, sample in detection_samples:
        detected_type = detect_document_type(sample)
        print(f"   {label}: detected as '{detected_type}' ✓")

    print("\n🔍 Testing RAG search:")
    if not knowledge_base.embeddings_model:
        print("   RAG search using keyword fallback ✓")
    else:
        hits = knowledge_base.search_relevant_content("employment contract requirements", top_k=2)
        print(f"   Found {len(hits)} relevant chunks ✓")

    print("\n✅ System test completed successfully!")
    return True

# Uncomment to run tests
# test_system_with_sample_documents()

🚀 Starting ADGM Corporate Agent...
📚 Loading knowledge base and initializing system...
Initializing ADGM Knowledge Base...
✓ RAG embeddings model initialized
Loading ADGM reference documents...
✓ Loaded checklist_1: 0 chunks
✓ Loaded checklist_2: 0 chunks
✓ Loaded employment_2019: 40 chunks
✓ Loaded employment_2024: 50 chunks
✓ Loaded data_sources: 2 chunks
✓ Loaded data_protection: 0 chunks
✓ Loaded resolution_template: 9 chunks
✓ Created FAISS index with 101 embeddings
Knowledge base loaded with 101 chunks
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://02e4f2f966d1148fcb.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Starting analysis of 1 documents...
Processing: ADGM Standard Employment Contract - ER 2019 - Short Version May 2024.docx
Detected type: employment_contract
Error annotating document /tmp/adgm_analysis_4jqrg7go/ADGM Standard Employment Contract - ER 2019 - Short Version May 2024.docx: 'Paragraph' object has no attribute 'insert_paragraph_after'
✓ Annotated: ADGM Standard Employment Contract - ER 2019 - Short Version May 2024.docx
Detected process: Employment/HR
Document completeness: 1/3
Starting analysis of 1 documents...
Processing: ADGM Standard Employment Contract Template - ER 2024 Feb 2025.docx
Detected type: employment_contract
Error annotating document /tmp/adgm_analysis_7_gkjmj2/ADGM Standard Employment Contract Template - ER 2024 Feb 2025.docx: 'Paragraph' object has no attribute 'insert_paragraph_after'
✓ Annotated: ADGM Standard Employment Contract Template - ER 2024 Feb 2025.docx
Detected process: Employment/HR
Document completeness: 1/3
Starting analysis of 1 documents...