Note for Developers:

For quicker access during development, the Gemini API key is currently hard-coded in this notebook and is therefore publicly visible. It will be moved to secure, protected storage soon.

The ngrok link is likewise shared publicly, solely for previewing this project, and is temporary.

Please be aware that the Gemini API may occasionally fail due to free-tier rate limits.

This setup can also be integrated with local LLMs (e.g., newly released GPT open-source models or DeepSeek) using frameworks like Transformers or Ollama for offline or hybrid processing.

Thank you for your understanding.

In [None]:
# @title INSTALL ALL PACKAGES 🎓
!pip install pyngrok streamlit
!pip install docx
!pip install -q -U google-genai
# Install packages
!pip install python-docx
!pip install chromadb python-docx pdfplumber beautifulsoup4


In [None]:
# @title DOWNLOAD ALL DATA OF ADGM 🌴
import os
import requests

# --- 1. Make two folders ---
# Documents to be checked go in user_dir; ADGM reference material goes in ref_dir.
user_dir = "/content/user_uploads"
ref_dir = "/content/adgm_reference"
# exist_ok keeps the cell safe to re-run.
for _folder in (user_dir, ref_dir):
    os.makedirs(_folder, exist_ok=True)

# --- 2. List of user company files (uploaded for checking) ---
# Maps the local file name (saved under user_dir) to the URL it is fetched from.
# NOTE(review): every URL points at assets.adgm.com, so these look like ADGM
# templates/guidance used as stand-ins for real user uploads — confirm.
user_files = {
    # User's company docs
    "Articles_of_Association.docx": "https://assets.adgm.com/download/assets/adgm-ra-model-articles-private-company-limited-by-shares.docx/015402647f0111ef91cdea7ac70a8286",
    "Register_of_Directors.docx": "https://assets.adgm.com/download/assets/Register-of-Directors-template-v1.docx/5fe5cdc26ba511ef92f68ef69f84fa1c",
    "Register_of_Shareholders.docx": "https://assets.adgm.com/download/assets/Template_RegisterOfShareholder-v1-20220107.docx/8fa6bd545b0c11efb61ca6e4a17c9897",
    "UBO_Declaration_Guidance.pdf": "https://assets.adgm.com/download/assets/Beneficial+Ownership+and+Control+Guidance+2021.pdf/628c9b6e6b9b11efbc29f277c33965bd",
    "Employment_Contract_2024.docx": "https://assets.adgm.com/download/assets/ADGM+Standard+Employment+Contract+Template+-+ER+2024+(Feb+2025).docx/ee14b252edbe11efa63b12b3a30e5e3a",
    "Employment_Contract_2019.docx": "https://assets.adgm.com/download/assets/ADGM+Standard+Employment+Contract+-+ER+2019+-+Short+Version+(May+2024).docx/33b57a92ecfe11ef97a536cc36767ef8",
    "Resolution_for_Incorporation.docx": "https://assets.adgm.com/download/assets/adgm-ra-resolution-multiple-incorporate-shareholders-LTD-incorporation-v2.docx/186a12846c3911efa4e6c6223862cd87",
    "Shareholder_Resolution_Amendment_Articles.docx": "https://assets.adgm.com/download/assets/Templates_SHReso_AmendmentArticles-v1-20220107.docx/97120d7c5af911efae4b1e183375c0b2?forcedownload=1"
}

# --- 3. List of reference/guidance/checklist files ---
# Maps the local file name (saved under ref_dir) to the URL it is fetched from.
# NOTE(review): the "Checklist_Company_Setup.pdf" URL names a *branch*
# non-financial-services checklist and the "Checklist_Private_Company_Limited.pdf"
# URL names a company limited by *guarantee* — confirm these are the intended
# checklists for the file names used here.
ref_files = {
    "Checklist_Company_Setup.pdf": "https://www.adgm.com/documents/registration-authority/registration-and-incorporation/checklist/branch-non-financial-services-20231228.pdf",
    "Checklist_Private_Company_Limited.pdf": "https://www.adgm.com/documents/registration-authority/registration-and-incorporation/checklist/private-company-limited-by-guarantee-non-financial-services-20231228.pdf",
    "General_Incorporation.html": "https://www.adgm.com/registration-authority/registration-and-incorporation",
    "Guidance_Policy.html": "https://www.adgm.com/legal-framework/guidance-and-policy-statements",
    "Setting_Up.html": "https://www.adgm.com/setting-up"
}

# --- 4. Simple download function ---
def download_file(url, save_path, timeout=30):
    """Download ``url`` to ``save_path``, streaming the body in 8 KiB chunks.

    Failures are reported on stdout instead of being raised, so a batch of
    downloads keeps going when one link is broken.

    Args:
        url: HTTP(S) address to fetch.
        save_path: Destination path; replaced only after a complete download.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout a stalled connection would block the cell indefinitely.

    Returns:
        True when the file was fully written, False otherwise.
    """
    # Write to a side file first so a failed transfer never leaves a truncated
    # document (or clobbers a good one from an earlier run) at save_path.
    partial_path = save_path + ".part"
    try:
        # The context manager releases the connection even when an error occurs.
        with requests.get(url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            with open(partial_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(partial_path, save_path)
    except (requests.RequestException, OSError) as e:
        if os.path.exists(partial_path):
            os.remove(partial_path)
        print(f"❌ Failed: {url} | Error: {e}")
        return False
    print(f"✅ Downloaded: {os.path.basename(save_path)}")
    return True

# --- 5. Download all user docs ---
for filename, url in user_files.items():
    download_file(url, os.path.join(user_dir, filename))

print("\n📂 All user docs saved to:", user_dir)

# --- 6. Download all reference files ---
for filename, url in ref_files.items():
    target_path = os.path.join(ref_dir, filename)
    if not filename.endswith(".html"):
        download_file(url, target_path)
        continue

    # HTML pages are stored as decoded text (UTF-8) rather than raw bytes.
    try:
        # The timeout keeps a stalled server from hanging the whole cell.
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        with open(target_path, 'w', encoding='utf-8') as f:
            f.write(r.text)
    except (requests.RequestException, OSError) as e:
        print(f"❌ Failed HTML: {url} | Error: {e}")
    else:
        print(f"✅ Saved HTML page: {filename}")

print("\n🗂️ Reference files saved to:", ref_dir)
print("\n🚦 Download complete! Now your folders are organized and easy for RAG and compliance checking.")


In [None]:
# @title RUN THIS BEFORE CREATING RAG FOR FAKE DATA FOR NOW 🎉
import docx
from docx.shared import RGBColor
import os

# --- Set the file path (where Register_of_Directors is) ---
file_path = '/content/user_uploads/Register_of_Directors.docx'

# Sample rows, one value per column, in column order.
SAMPLE_DIRECTORS = [
    ["Alex Lee", "12 Palm St", "Dubai", "UAE", "01/01/1980",
     "Emirati", "CEO", "01/01/2024", "01/05/2025"],
    ["Sara Patel", "45 Lake Rd", "Abu Dhabi", "UAE", "12/12/1990",
     "Indian", "Director", "02/02/2024", "—"],
]
SAMPLE_COMPANY = [
    "ADGM Tech Ltd", "ADGM Office", "CL12345", "Abu Dhabi, UAE", "LLC",
    "ADGM Law", "01/01/2024", "01/02/2024", "01/05/2025", "01/06/2025",
]


def _fill_row(table, row_index, values):
    """Write ``values`` into row ``row_index``; return False if the row is absent."""
    if row_index >= len(table.rows):
        return False
    cells = table.rows[row_index].cells
    for position, value in enumerate(values):
        cells[position].text = value
    return True


# --- 1. Open existing docx file ---
doc = docx.Document(file_path)
tables = list(doc.tables)

# --- 2. Locate the two tables by column count ---
# The company table (>= 10 columns) is resolved first and then excluded from
# the director search: a 10-column table also satisfies ">= 9", so without the
# exclusion both passes could write into the same table.
company_index = next(
    (i for i, t in enumerate(tables) if len(t.columns) >= 10), None
)
director_index = next(
    (i for i, t in enumerate(tables)
     if len(t.columns) >= 9 and i != company_index),
    None,
)

# --- 3. Fill the director rows (row 0 is the header) ---
if director_index is None:
    print("⚠️ No director table (9 columns) found; director sample data skipped.")
else:
    for offset, director in enumerate(SAMPLE_DIRECTORS, start=1):
        # Stop at the first missing row, matching the template's row count.
        if not _fill_row(tables[director_index], offset, director):
            break

# --- 4. Fill the company/firm row ---
if company_index is None:
    print("⚠️ No company table (10+ columns) found; company sample data skipped.")
else:
    _fill_row(tables[company_index], 1, SAMPLE_COMPANY)

# --- 5. Save the updated docx (overwrites the downloaded template) ---
save_path = file_path
doc.save(save_path)
print(f"✅ Saved file with sample data: {save_path}")


In [None]:
# @title CREATE CHROMADB RAG WITH DATA AND TEST👍
import os
import pdfplumber
import docx
import chromadb
from bs4 import BeautifulSoup

# --- SEPARATE FOLDERS FOR FILES ---
# Folder of documents under review; defined here but not read by this cell's
# ingestion loop, which only indexes REF_FOLDER.
USER_FOLDER = "/content/user_uploads"
# Folder of ADGM reference/checklist files that get indexed below.
REF_FOLDER = "/content/adgm_reference"

# --- ChromaDB Setup ---
# The client is pointed at /content/chroma_db, and the "adgm_rules" collection
# is fetched if it already exists or created otherwise.
chroma_client = chromadb.PersistentClient(path="/content/chroma_db")
collection = chroma_client.get_or_create_collection(name="adgm_rules")

# --- Functions to extract text from different file types ---
def extract_text_from_pdf(path):
    """Return the text of the PDF at ``path``, one newline after each page.

    Pages for which pdfplumber yields no text are left out.
    """
    with pdfplumber.open(path) as pdf:
        per_page = (page.extract_text() for page in pdf.pages)
        # Joined inside the `with` so pages are read while the file is open.
        return "".join(f"{content}\n" for content in per_page if content)

def extract_text_from_docx(path):
    """Return the non-blank body paragraphs of a DOCX, joined by newlines.

    Only ``Document.paragraphs`` is read; table contents are not included.
    """
    kept = []
    for paragraph in docx.Document(path).paragraphs:
        content = paragraph.text
        if content.strip():
            kept.append(content)
    return "\n".join(kept)

def extract_text_from_html(path):
    """Return the text of a saved HTML page, one stripped string per line."""
    with open(path, "r", encoding="utf-8") as handle:
        markup = handle.read()
    parsed = BeautifulSoup(markup, "html.parser")
    return parsed.get_text(separator="\n", strip=True)

# --- Chunking function ---
def chunk_text(text, chunk_size=500, overlap=50):
    """Split ``text`` into fixed-size character windows that overlap.

    Consecutive windows start ``chunk_size - overlap`` characters apart.
    Chunking stops once a window reaches the end of the text, so no trailing
    chunk is emitted that lies entirely inside the previous one.

    Args:
        text: String to split.
        chunk_size: Maximum characters per chunk; must be positive.
        overlap: Characters shared by neighbouring chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        List of chunk strings; empty for empty ``text``.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. An
            overlap >= chunk_size would never advance the window.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    total = len(text)
    chunks = []
    for start in range(0, total, step):
        end = start + chunk_size
        chunks.append(text[start:end])
        if end >= total:
            # This window already covers the tail; a further window would
            # only repeat the overlap region.
            break
    return chunks

# --- Add only REFERENCE files to ChromaDB for rules ---
# Extension -> extractor; files with any other extension are ignored.
EXTRACTORS = {
    ".pdf": extract_text_from_pdf,
    ".docx": extract_text_from_docx,
    ".html": extract_text_from_html,
}
UPSERT_BATCH = 200  # keep each request comfortably small

doc_id = 0
# os.listdir order is arbitrary; sorting keeps "<doc_id>_<chunk>" ids stable
# across runs so re-running the cell updates rows instead of shifting ids.
for file in sorted(os.listdir(REF_FOLDER)):
    extractor = EXTRACTORS.get(os.path.splitext(file)[1].lower())
    if extractor is None:
        continue
    file_path = os.path.join(REF_FOLDER, file)

    try:
        text = extractor(file_path)
    except Exception as exc:  # one unreadable/partial download must not stop ingestion
        print(f"⚠️ Skipped {file}: could not extract text ({exc})")
        text = ""

    chunks = chunk_text(text)
    for batch_start in range(0, len(chunks), UPSERT_BATCH):
        batch = chunks[batch_start:batch_start + UPSERT_BATCH]
        positions = range(batch_start, batch_start + len(batch))
        # upsert (not add): the collection is persistent, so a second run of
        # this cell would otherwise try to insert ids that already exist.
        collection.upsert(
            documents=batch,
            metadatas=[{"source": file, "chunk_id": idx} for idx in positions],
            ids=[f"{doc_id}_{idx}" for idx in positions],
        )
    doc_id += 1

print("✅ Reference/checklist files loaded into ChromaDB")

# --- Query the vector DB (from reference, not uploads) ---
query = "Required documents for Company Incorporation"
results = collection.query(query_texts=[query], n_results=3)

# Each result field holds one list per query text; only one query was sent.
matched_docs = results['documents'][0]
matched_meta = results['metadatas'][0]

print("\n🔍 Top Results for Query:")
for doc, meta in zip(matched_docs, matched_meta):
    source_name = meta['source']
    print(f"\n📄 From: {source_name}")
    print(doc)


In [None]:
# @title save app.py first
%%writefile app.py
import os
import re
import json
import docx
from docx.shared import RGBColor
import pdfplumber
import streamlit as st
import google.genai as genai
import zipfile
import io


# ------------------------ SETUP FOLDERS ------------------------
# Uploaded documents, ADGM reference material, and annotated output.
USER_FOLDER = "/content/user_uploads"
REF_FOLDER = "/content/adgm_reference"
REVIEWED_FOLDER = "/content/reviewed_docs"
# Create all three up front; exist_ok makes every Streamlit rerun harmless.
for _folder in (USER_FOLDER, REF_FOLDER, REVIEWED_FOLDER):
    os.makedirs(_folder, exist_ok=True)


# ------------------------ AI SETUP ------------------------
# The key is read from the environment so it never lives in the source file.
# Set GEMINI_API_KEY in the process that launches Streamlit (child processes
# inherit it). Any key that was previously committed in this file should be
# treated as compromised and rotated.
API_KEY = os.environ.get("GEMINI_API_KEY", "")
if not API_KEY:
    st.error("🔑 GEMINI_API_KEY is not set. Export it before starting the app.")
    st.stop()
client = genai.Client(api_key=API_KEY)


# Process name -> document types that must be present for that process.
# Stage 2 compares these names (case-insensitively, as substrings) against the
# type Gemini reports for each uploaded file.
# NOTE(review): "Memorandum of Association" is offered to the classifier but is
# not required here — confirm that is intended.
COMPANY_CHECKLIST = {
    "Company Incorporation": [
        "Articles of Association",
        "Board Resolution",
        "Employment Contract",
        "UBO Declaration",
        "Register of Directors",
        "Register of Shareholders"
    ]
}


# ------------------------ HELPER FUNCTIONS ------------------------
def read_pdf(file_path):
    """Return the text of every page of a PDF, pages separated by newlines.

    Pages for which pdfplumber yields no text are omitted.
    """
    with pdfplumber.open(file_path) as pdf:
        extracted = [page.extract_text() for page in pdf.pages]
    return "\n".join(content for content in extracted if content)


def read_docx(file_path):
    """Return a DOCX's body paragraphs followed by its table rows as text.

    Blank paragraphs are dropped. Each table row becomes one line whose
    non-empty cell texts are joined with " | "; fully empty rows are dropped.
    """
    document = docx.Document(file_path)
    lines = [para.text for para in document.paragraphs if para.text.strip()]
    for table in document.tables:
        for row in table.rows:
            stripped_cells = [cell.text.strip() for cell in row.cells]
            joined = " | ".join(value for value in stripped_cells if value)
            if joined:
                lines.append(joined)
    return "\n".join(lines)


def add_comment(paragraph, comment_text):
    """Append a bold red ``[COMMENT: ...]`` run to the end of ``paragraph``."""
    note = paragraph.add_run(f"  [COMMENT: {comment_text}]")
    note.bold = True
    note.font.color.rgb = RGBColor(255, 0, 0)


def annotate_docx(doc_path, issues, output_path):
    """Write a copy of a DOCX with one inline comment per issue.

    For each issue, the first paragraph containing the issue's keyword
    (case-insensitive) receives the suggestion as a red comment. Body
    paragraphs are searched first, then paragraphs inside table cells, because
    table text is part of what the reviewer model is shown. When no paragraph
    matches, the comment is appended at the end of the document.

    Args:
        doc_path: Path of the DOCX to read.
        issues: Iterable of dicts produced by the model; entries that are not
            dicts or carry no suggestion text are skipped, since the model's
            output is not guaranteed to follow the requested schema.
        output_path: Path the annotated copy is saved to.
    """
    doc = docx.Document(doc_path)

    def _searchable_paragraphs():
        # Body text first, then every paragraph nested in a table cell.
        yield from doc.paragraphs
        for table in doc.tables:
            for row in table.rows:
                for cell in row.cells:
                    yield from cell.paragraphs

    for issue in issues or []:
        if not isinstance(issue, dict):
            continue
        suggestion = str(issue.get("suggestion") or issue.get("issue") or "").strip()
        if not suggestion:
            continue

        # An empty keyword would "match" every paragraph, so treat it as not found.
        keyword = str(issue.get("keyword") or "").strip().lower()
        target = None
        if keyword:
            target = next(
                (p for p in _searchable_paragraphs() if keyword in p.text.lower()),
                None,
            )

        if target is not None:
            add_comment(target, suggestion)
        else:
            add_comment(doc.add_paragraph(), f"(Not found in text) {suggestion}")
    doc.save(output_path)


def parse_json_safe(raw_text):
    """Parse a model reply into a list of issue dicts, never raising.

    The reply is parsed as JSON directly. If that fails, the outermost
    ``[...]`` span is extracted (this copes with Markdown code fences or prose
    around the array), trailing commas are removed, and parsing is retried.

    Args:
        raw_text: The model's reply; ``None`` and non-strings are tolerated.

    Returns:
        A list of dicts. A single JSON object is wrapped in a one-item list;
        non-dict list items and any other JSON value are discarded. An empty
        list is returned when nothing usable can be parsed.
    """
    if not isinstance(raw_text, str) or not raw_text.strip():
        return []

    try:
        parsed = json.loads(raw_text)
    except json.JSONDecodeError:
        match = re.search(r"\[.*\]", raw_text, re.S)
        if not match:
            return []
        candidate = re.sub(r",\s*}", "}", match.group(0))
        candidate = re.sub(r",\s*]", "]", candidate)
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            return []

    # Callers iterate the result and index each item by key, so guarantee
    # the list-of-dicts shape here.
    if isinstance(parsed, dict):
        parsed = [parsed]
    if not isinstance(parsed, list):
        return []
    return [item for item in parsed if isinstance(item, dict)]


# ------------------------ STREAMLIT APP STARTS ------------------------

st.set_page_config(page_title="ADGM Compliance Checker", layout="wide")
st.title("🎉 ADGM Company Compliance Checker")
st.markdown("See every step explained in **easy English with emojis!** 😄")

# ------------------------ FILE UPLOAD SECTION ------------------------
st.header("📤 Step 1: Upload Your Documents")
st.markdown("**Choose one way to add files:** 🤔")

# Create tabs for two options
tab1, tab2 = st.tabs(["📤 Upload New Files", "📂 Use Existing Files"])

with tab1:
    st.markdown("### 🆕 Upload Your Documents Here")
    uploaded_files = st.file_uploader(
        "Choose your ADGM documents",
        type=['pdf', 'docx'],
        accept_multiple_files=True,
        help="📋 Upload PDF or DOCX files only"
    )

    if uploaded_files:
        st.success(f"✅ {len(uploaded_files)} files uploaded! 🎉")

        # Save uploaded files to USER_FOLDER
        for uploaded_file in uploaded_files:
            # The file name is supplied by the client. Keep only its final
            # path component so a crafted name cannot write outside USER_FOLDER.
            safe_name = os.path.basename(uploaded_file.name.replace("\\", "/"))
            if safe_name in ("", ".", ".."):
                st.warning(f"⚠️ Skipped a file with an unusable name: `{uploaded_file.name}`")
                continue
            file_path = os.path.join(USER_FOLDER, safe_name)
            with open(file_path, "wb") as f:
                f.write(uploaded_file.getbuffer())
            st.markdown(f"💾 Saved: `{safe_name}`")

        st.markdown("**💡 Suggestion:** Files saved! Now scroll down to see the magic happen! ✨")

with tab2:
    st.markdown("### 📁 Files Already in Folder")
    existing_files = os.listdir(USER_FOLDER)
    if existing_files:
        st.success(f"✅ Found {len(existing_files)} existing files! 📋")
        for file in existing_files:
            st.markdown(f"📄 `{file}`")
    else:
        st.info("🤷‍♂️ No files found in folder yet. Try uploading some above! ⬆️")

st.markdown("---")  # Nice separator line

# ------------------------ STAGE 1: CLASSIFICATION & DETECTION ------------------------

st.header("🎬 Stage 1: Classification & Detection")
uploaded_types = []   # detected type of each classified file, in listing order
detected_types = {}   # file name -> detected type

user_files = os.listdir(USER_FOLDER)
if not user_files:
    st.warning("No files found in user uploads folder! Upload files above or place .docx or .pdf files in `/content/user_uploads`.")
else:
    for file in user_files:
        file_path = os.path.join(USER_FOLDER, file)
        lowered = file.lower()
        if not lowered.endswith((".docx", ".pdf")):
            continue

        # A corrupt or partial file should be reported, not crash the app.
        try:
            text = read_docx(file_path) if lowered.endswith(".docx") else read_pdf(file_path)
        except Exception as exc:
            st.error(f"⚠️ Could not read `{file}`: {exc}")
            continue

        detect_prompt = f"""
        Classify the type of this legal document.
        Choose only one (write just the name): Employment Contract, Articles of Association, Memorandum of Association, Board Resolution, UBO Declaration, Register of Directors, Register of Shareholders, Other.
        Document text:
        {text[:3000]}
        """
        # The API can fail (for example on free-tier rate limits); report the
        # failure for this file and carry on with the others.
        try:
            resp = client.models.generate_content(model="gemini-2.0-flash", contents=detect_prompt)
        except Exception as exc:
            st.error(f"⚠️ Gemini could not classify `{file}`: {exc}")
            continue

        # resp.text can be None when the reply carries no text.
        doc_type = (resp.text or "").strip()
        if not doc_type:
            st.warning(f"⚠️ Gemini returned no type for `{file}`; file skipped.")
            continue

        uploaded_types.append(doc_type)
        detected_types[file] = doc_type
        st.markdown(f"🟦 **File:** `{file}`  →  **Gemini detected type:** *{doc_type}*")

# ------------------------ STAGE 2: CHECKLIST MATCHING ------------------------

st.header("📋 Stage 2: Checklist Matching")
def fuzzy_match(doc_type, required_type):
    """Return True when ``required_type`` occurs in ``doc_type``, ignoring case."""
    return doc_type.lower().find(required_type.lower()) != -1

process = "Company Incorporation"
required_docs = COMPANY_CHECKLIST[process]
# A required document is missing when no detected type mentions it.
missing_docs = [
    needed for needed in required_docs
    if not any(fuzzy_match(seen, needed) for seen in uploaded_types)
]

if uploaded_types:
    st.markdown(f"🟩 **Required:** {len(required_docs)} | **Uploaded:** {len(uploaded_types)} | **Missing:** {len(missing_docs)}")
    if not missing_docs:
        st.success("✅ All required documents are present! 🎉")
        st.markdown("**Pro Suggestion:** All documents detected. Next, check for mistakes and get suggestions below! 😃")
    else:
        st.error(f"❗ Missing Documents: {missing_docs}")

# ------------------------ STAGE 3: COMPLIANCE REVIEW & ANNOTATION ------------------------

st.header("📑 Stage 3: Compliance Review & Annotation")
all_issues = []


def _usable_issue(issue):
    """Return True when a model-produced issue has a non-empty keyword and suggestion."""
    if not isinstance(issue, dict):
        return False
    keyword = issue.get("keyword")
    suggestion = issue.get("suggestion")
    return (
        isinstance(keyword, str) and bool(keyword.strip())
        and isinstance(suggestion, str) and bool(suggestion.strip())
    )


progress = st.progress(0)
# Only DOCX files are reviewed, so only they count towards the progress bar;
# counting skipped PDFs would keep the bar from ever reaching 100%.
docx_reviews = [
    (name, kind) for name, kind in detected_types.items()
    if name.lower().endswith(".docx")
]
total_docs = len(docx_reviews)
current = 0

for file, doc_type in docx_reviews:
    file_path = os.path.join(USER_FOLDER, file)
    current += 1

    try:
        document_text = read_docx(file_path)
    except Exception as exc:
        st.error(f"⚠️ Could not read `{file}`: {exc}")
        progress.progress(current / total_docs)
        continue

    # NOTE(review): this is a fixed placeholder string, not text retrieved from
    # the reference collection — the model reviews without actual rule text.
    retrieved_rules = "ADGM official checklist details for " + doc_type

    review_prompt = f"""
    You are an ADGM compliance assistant.
    Review the following {doc_type} against ADGM official rules.
    Return ONLY valid JSON in this exact format:
    [
      {{
        "document": "{doc_type}",
        "section": "Clause X",
        "keyword": "word from doc to locate issue",
        "issue": "short description",
        "severity": "High/Medium/Low",
        "suggestion": "how to fix"
      }}
    ]
    ADGM Official Rules:
    {retrieved_rules}
    Uploaded Document:
    {document_text[:4000]}
    """
    # The API can fail (for example on rate limits); report and move on.
    try:
        review_resp = client.models.generate_content(model="gemini-2.0-flash", contents=review_prompt)
    except Exception as exc:
        st.error(f"⚠️ Gemini review failed for `{file}`: {exc}")
        progress.progress(current / total_docs)
        continue

    # The model's output is not guaranteed to follow the schema: keep only
    # entries that carry the fields the annotator and the summary rely on.
    parsed = parse_json_safe(review_resp.text)
    if not isinstance(parsed, list):
        parsed = []
    issues_detected = [issue for issue in parsed if _usable_issue(issue)]

    annotated_path = os.path.join(REVIEWED_FOLDER, f"Reviewed_{file}")
    progress.progress(current / total_docs)

    if issues_detected:
        annotate_docx(file_path, issues_detected, annotated_path)
        all_issues.extend(issues_detected)
        with st.expander(f"📝 Annotated and saved: {file}", expanded=False):
            for issue in issues_detected:
                severity = issue.get("severity", "Unknown")
                st.markdown(f"🔥 **{issue['keyword']}** (Severity: {severity}) → _{issue['suggestion']}_")
    else:
        st.success(f"✅ {file} looks clean! No Gemini problems found.")

# ---- DOWNLOAD BUTTONS for reviewed DOCX and ZIP ----
import glob
# Annotated DOCX files currently present in the reviewed folder.
reviewed_files = []
for entry in os.listdir(REVIEWED_FOLDER):
    if entry.lower().endswith('.docx'):
        reviewed_files.append(entry)

st.header("📥 Download Your Reviewed Files & Report")
# One download button per reviewed document.
for filename in reviewed_files:
    file_path = os.path.join(REVIEWED_FOLDER, filename)
    with open(file_path, "rb") as f:
        st.download_button(
            mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            file_name=filename,
            data=f,
            label=f"⬇️ Download {filename}",
        )

# ZIP Download button for ALL files
if reviewed_files:
    # The archive is built in memory and rewound before being handed over.
    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, "w") as zf:
        for filename in reviewed_files:
            zf.write(os.path.join(REVIEWED_FOLDER, filename), arcname=filename)
    zip_buffer.seek(0)
    st.download_button(
        mime="application/zip",
        file_name="ADGM_Reviewed_Documents.zip",
        data=zip_buffer,
        label="🗂️ Download ALL Reviewed DOCX as ZIP",
    )

# Save and download JSON report
# Collect the results of the earlier stages into one JSON-serialisable report.
final_report = dict(
    process=process,
    documents_uploaded=len(uploaded_types),
    required_documents=len(required_docs),
    missing_documents=missing_docs,
    issues_found=all_issues,
)

# Persist the report next to the app, then offer the saved bytes for download.
with open("final_compliance_report.json", "w") as report_out:
    report_out.write(json.dumps(final_report, indent=2))

with open("final_compliance_report.json", "rb") as report_in:
    report_bytes = report_in.read()
st.download_button("⬇️ Download JSON Report", report_bytes, "final_compliance_report.json", "application/json")

# ------------------------ STAGE 4: SUMMARY & REPORT ------------------------

st.header("🟪 Stage 4: Final Summary Report")

st.markdown("#### 📁 The final compliance report:")
st.json(final_report)

# The report is written relative to the working directory, so show where it
# actually landed instead of assuming /content.
report_path = os.path.abspath("final_compliance_report.json")
st.success(f"✅ Final JSON report saved at `{report_path}`")
st.info(f"🌟 All reviewed (annotated) documents are saved in: `{REVIEWED_FOLDER}`")

if all_issues:
    st.markdown("### 💡 What to Fix (Suggestions):")
    # Only the first eight suggestions are listed to keep the summary short.
    for issue in all_issues[:8]:
        # Issues come from the model; tolerate entries without a suggestion.
        suggestion = issue.get("suggestion") if isinstance(issue, dict) else None
        if suggestion:
            st.markdown(f"🔧 {suggestion}")

st.markdown("> 😄 **Super Example Output:** If you see all green ticks above, your company docs are perfect and ready for ADGM! 🚀")

# ------------------------ END ------------------------

st.markdown("---")
st.subheader("🌈 Need Help?")
st.markdown("If you get any error, want to upload more files, or want a sample file, just ask! 👍")

RUN STREAMLIT APP

In [None]:
from pyngrok import ngrok
import os

# Set Streamlit configA
os.makedirs(".streamlit", exist_ok=True)
with open(".streamlit/config.toml", "w") as f:
    f.write("[server]\nheadless = true\nport = 5000\nenableCORS = false")

# Start ngrok
ngrok.set_auth_token("2UUGMJW8gaZ7Ikrl53By3xYHdLs_6b3ipRxC3rEXwy7JgQv5Y")
public_url = ngrok.connect(5000, domain="prepared-singularly-shepherd.ngrok-free.app")
print(f"🌐 Ngrok URL: {public_url}")

# Run Streamlit
!streamlit run app.py --server.port 5000


In [None]:
# @title LOGIC CODE WITHOUT STREAMLIT
import os
import re
import json
import docx
from docx.shared import RGBColor
import google.genai as genai
import pdfplumber

# --- Folder Setup ---
# Uploaded documents, ADGM reference material, and annotated output.
USER_FOLDER = "/content/user_uploads"
REF_FOLDER = "/content/adgm_reference"
REVIEWED_FOLDER = "/content/reviewed_docs"  # 🟩 Your folder for annotated files!
# 🟢 Make sure every folder exists; exist_ok keeps the cell re-runnable.
for _folder in (USER_FOLDER, REF_FOLDER, REVIEWED_FOLDER):
    os.makedirs(_folder, exist_ok=True)

# --- Gemini Setup ---
# The key is read from the environment so it never lives in the notebook.
# Any key previously committed here should be treated as compromised and rotated.
API_KEY = os.environ.get("GEMINI_API_KEY", "")
if not API_KEY:
    raise RuntimeError("Set the GEMINI_API_KEY environment variable before running this cell.")
client = genai.Client(api_key=API_KEY)

# --- ADGM Checklist ---
# Process name -> document types that must be present for that process.
# Stage 2 compares these names (case-insensitively, as substrings) against the
# type Gemini reports for each file.
# NOTE(review): "Memorandum of Association" is offered to the classifier but is
# not required here — confirm that is intended.
COMPANY_CHECKLIST = {
    "Company Incorporation": [
        "Articles of Association",
        "Board Resolution",
        "Employment Contract",
        "UBO Declaration",
        "Register of Directors",
        "Register of Shareholders"
    ]
}

# --- Helper Functions ---
def read_pdf(file_path):
    """Return the text of every page of a PDF, pages separated by newlines.

    Pages for which pdfplumber yields no text are omitted.
    """
    with pdfplumber.open(file_path) as pdf:
        extracted = [page.extract_text() for page in pdf.pages]
    return "\n".join(content for content in extracted if content)

def read_docx(file_path):
    """Return a DOCX's body paragraphs followed by its table rows as text.

    Blank paragraphs are dropped. Each table row becomes one line whose
    non-empty cell texts are joined with " | "; fully empty rows are dropped.
    """
    document = docx.Document(file_path)
    lines = [para.text for para in document.paragraphs if para.text.strip()]
    for table in document.tables:
        for row in table.rows:
            stripped_cells = [cell.text.strip() for cell in row.cells]
            joined = " | ".join(value for value in stripped_cells if value)
            if joined:
                lines.append(joined)
    return "\n".join(lines)

def add_comment(paragraph, comment_text):
    """Append a bold red ``[COMMENT: ...]`` run to the end of ``paragraph``."""
    note = paragraph.add_run(f"  [COMMENT: {comment_text}]")
    note.bold = True
    note.font.color.rgb = RGBColor(255, 0, 0)

def annotate_docx(doc_path, issues, output_path):
    """Write a copy of a DOCX with one inline comment per issue.

    For each issue, the first paragraph containing the issue's keyword
    (case-insensitive) receives the suggestion as a red comment. Body
    paragraphs are searched first, then paragraphs inside table cells, because
    table text is part of what the reviewer model is shown. When no paragraph
    matches, the comment is appended at the end of the document.

    Args:
        doc_path: Path of the DOCX to read.
        issues: Iterable of dicts produced by the model; entries that are not
            dicts or carry no suggestion text are skipped, since the model's
            output is not guaranteed to follow the requested schema.
        output_path: Path the annotated copy is saved to.
    """
    doc = docx.Document(doc_path)

    def _searchable_paragraphs():
        # Body text first, then every paragraph nested in a table cell.
        yield from doc.paragraphs
        for table in doc.tables:
            for row in table.rows:
                for cell in row.cells:
                    yield from cell.paragraphs

    for issue in issues or []:
        if not isinstance(issue, dict):
            continue
        suggestion = str(issue.get("suggestion") or issue.get("issue") or "").strip()
        if not suggestion:
            continue

        # An empty keyword would "match" every paragraph, so treat it as not found.
        keyword = str(issue.get("keyword") or "").strip().lower()
        target = None
        if keyword:
            target = next(
                (p for p in _searchable_paragraphs() if keyword in p.text.lower()),
                None,
            )

        if target is not None:
            add_comment(target, suggestion)
        else:
            add_comment(doc.add_paragraph(), f"(Not found in text) {suggestion}")
    doc.save(output_path)
    print(f"✅ Annotated document saved: {output_path}")

def parse_json_safe(raw_text):
    """Parse a model reply into a list of issue dicts, never raising.

    The reply is parsed as JSON directly. If that fails, the outermost
    ``[...]`` span is extracted (this copes with Markdown code fences or prose
    around the array), trailing commas are removed, and parsing is retried.

    Args:
        raw_text: The model's reply; ``None`` and non-strings are tolerated.

    Returns:
        A list of dicts. A single JSON object is wrapped in a one-item list;
        non-dict list items and any other JSON value are discarded. An empty
        list is returned when nothing usable can be parsed.
    """
    if not isinstance(raw_text, str) or not raw_text.strip():
        return []

    try:
        parsed = json.loads(raw_text)
    except json.JSONDecodeError:
        match = re.search(r"\[.*\]", raw_text, re.S)
        if not match:
            return []
        candidate = re.sub(r",\s*}", "}", match.group(0))
        candidate = re.sub(r",\s*]", "]", candidate)
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            return []

    # Callers iterate the result and index each item by key, so guarantee
    # the list-of-dicts shape here.
    if isinstance(parsed, dict):
        parsed = [parsed]
    if not isinstance(parsed, list):
        return []
    return [item for item in parsed if isinstance(item, dict)]

# ---------------- STAGE 1: Classification & Detection ----------------

uploaded_types = []   # detected type of each classified file, in listing order
detected_types = {}   # file name -> detected type

print("🎬 Stage 1: Classification & Detection")
for file in os.listdir(USER_FOLDER):
    file_path = os.path.join(USER_FOLDER, file)
    lowered = file.lower()
    if not lowered.endswith((".docx", ".pdf")):
        continue  # Skip unknown file types

    # A corrupt or partial file should be reported, not abort the whole run.
    try:
        text = read_docx(file_path) if lowered.endswith(".docx") else read_pdf(file_path)
    except Exception as exc:
        print(f"⚠️ Could not read {file}: {exc}")
        continue

    # The option list ends with "Other" so the model has a valid answer for
    # documents that fit none of the named types.
    detect_prompt = f"""
    Classify the type of this legal document.
    Choose only one (write just the name): Employment Contract, Articles of Association, Memorandum of Association, Board Resolution, UBO Declaration, Register of Directors, Register of Shareholders, Other.
    Document text:
    {text[:3000]}
    """
    # The API can fail (for example on free-tier rate limits); report the
    # failure for this file and carry on with the others.
    try:
        resp = client.models.generate_content(model="gemini-2.0-flash", contents=detect_prompt)
    except Exception as exc:
        print(f"⚠️ Gemini could not classify {file}: {exc}")
        continue

    # resp.text can be None when the reply carries no text.
    doc_type = (resp.text or "").strip()
    if not doc_type:
        print(f"⚠️ Gemini returned no type for {file}; file skipped.")
        continue

    uploaded_types.append(doc_type)
    detected_types[file] = doc_type
    print(f"🟦 File: {file}  →  Gemini detected type: {doc_type}")

# ---------------- STAGE 2: Checklist Matching ----------------

def fuzzy_match(doc_type, required_type):
    """Return True when ``required_type`` occurs in ``doc_type``, ignoring case."""
    wanted = required_type.lower()
    reported = doc_type.lower()
    return reported.find(wanted) != -1

process = "Company Incorporation"
required_docs = COMPANY_CHECKLIST[process]

# A required document is missing when no detected type mentions it.
missing_docs = [
    needed for needed in required_docs
    if not any(fuzzy_match(seen, needed) for seen in uploaded_types)
]

print("\n📋 Stage 2: Checklist Matching")
print(f"🟩 Required: {len(required_docs)} | Uploaded: {len(uploaded_types)} | Missing: {len(missing_docs)}")
if not missing_docs:
    print("✅ All required documents are present! 🎉")
else:
    print("❗ Missing Documents:", missing_docs)

# ---------------- STAGE 3: Compliance Review & Annotation ----------------

all_issues = []


def _usable_issue(issue):
    """Return True when a model-produced issue has a non-empty keyword and suggestion."""
    if not isinstance(issue, dict):
        return False
    keyword = issue.get("keyword")
    suggestion = issue.get("suggestion")
    return (
        isinstance(keyword, str) and bool(keyword.strip())
        and isinstance(suggestion, str) and bool(suggestion.strip())
    )


print("\n📑 Stage 3: Annotating DOCX files for compliance")
for file, doc_type in detected_types.items():
    file_path = os.path.join(USER_FOLDER, file)
    # Only DOCX files can be annotated; PDFs are classified but not reviewed.
    if not file.lower().endswith(".docx"):
        continue

    try:
        document_text = read_docx(file_path)
    except Exception as exc:
        print(f"⚠️ Could not read {file}: {exc}")
        continue

    # NOTE(review): this is a fixed placeholder string, not text retrieved from
    # the reference collection — the model reviews without actual rule text.
    retrieved_rules = "ADGM official checklist details for " + doc_type

    review_prompt = f"""
    You are an ADGM compliance assistant.
    Review the following {doc_type} against ADGM official rules.
    Return ONLY valid JSON in this exact format:
    [
      {{
        "document": "{doc_type}",
        "section": "Clause X",
        "keyword": "word from doc to locate issue",
        "issue": "short description",
        "severity": "High/Medium/Low",
        "suggestion": "how to fix"
      }}
    ]
    ADGM Official Rules:
    {retrieved_rules}

    Uploaded Document:
    {document_text[:4000]}
    """
    # The API can fail (for example on rate limits); report and move on.
    try:
        review_resp = client.models.generate_content(model="gemini-2.0-flash", contents=review_prompt)
    except Exception as exc:
        print(f"⚠️ Gemini review failed for {file}: {exc}")
        continue

    # The model's output is not guaranteed to follow the schema: keep only
    # entries that carry the fields the annotator relies on.
    parsed = parse_json_safe(review_resp.text)
    if not isinstance(parsed, list):
        parsed = []
    issues_detected = [issue for issue in parsed if _usable_issue(issue)]

    annotated_path = os.path.join(REVIEWED_FOLDER, f"Reviewed_{file}")
    if issues_detected:
        annotate_docx(file_path, issues_detected, annotated_path)
        all_issues.extend(issues_detected)
        print(f"📝 Annotated and saved: {annotated_path}")
    else:
        print(f"✅ {file} looks clean! No Gemini issues found.")

# ---------------- STAGE 4: Save and Print Final Report ----------------

# Collect the results of the earlier stages into one JSON-serialisable report.
final_report = dict(
    process=process,
    documents_uploaded=len(uploaded_types),
    required_documents=len(required_docs),
    missing_documents=missing_docs,
    issues_found=all_issues,
)
# The saved copy uses a 4-space indent; the console copy below uses 2.
with open("/content/final_compliance_report.json", "w") as report_out:
    report_out.write(json.dumps(final_report, indent=4))

print("\n🟪 Stage 4: Final Summary Report")
report_preview = json.dumps(final_report, indent=2)
print(report_preview)
print("✅ Final JSON report saved at /content/final_compliance_report.json")
print(f"🌟 All reviewed documents saved in: {REVIEWED_FOLDER}")

