*This agent will assist in reviewing, validating, and helping users prepare documentation for business incorporation and compliance within the Abu Dhabi Global Market (ADGM) jurisdiction.*

**Install dependencies**



In [1]:
# Installed dependencies
!pip install python-docx
!pip install docx2txt
!pip install langchain
!pip install gradio
!pip install PyPDF2
!pip install sentence-transformers
!pip install langchain-google-genai
!pip install google-generativeai
!pip install langchain-community
!pip install "unstructured[docx]"
!pip install numpy
!pip install chromadb

Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.2.0-py3-none-any.whl (252 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.2.0
Collecting docx2txt
  Downloading docx2txt-0.9-py3-none-any.whl.metadata (529 bytes)
Downloading docx2txt-0.9-py3-none-any.whl (4.0 kB)
Installing collected packages: docx2txt
Successfully installed docx2txt-0.9
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading

Collecting google-ai-generativelanguage==0.6.15 (from google-generativeai)
  Downloading google_ai_generativelanguage-0.6.15-py3-none-any.whl.metadata (5.7 kB)
Downloading google_ai_generativelanguage-0.6.15-py3-none-any.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: google-ai-generativelanguage
  Attempting uninstall: google-ai-generativelanguage
    Found existing installation: google-ai-generativelanguage 0.6.18
    Uninstalling google-ai-generativelanguage-0.6.18:
      Successfully uninstalled google-ai-generativelanguage-0.6.18
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain-google-genai 2.1.9 requires google-ai-generativelanguage<0.7.0,>=0.6.18, but you have google-ai-generativelanguage 0.6.15 which is incompatible.[0m[

Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.10.1-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain-community)
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 k

**Import Libraries**

In [None]:

import json
import os
import xml.etree.ElementTree as ET
from datetime import datetime
from getpass import getpass
from typing import List
from xml.etree.ElementTree import Element, tostring

import gradio as gr
from docx import Document
from docx.opc.constants import CONTENT_TYPE, RELATIONSHIP_TYPE
from docx.opc.packuri import PackURI
from docx.opc.part import Part
from docx.oxml import OxmlElement, parse_xml
from docx.oxml.ns import qn
from google.colab import files
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain_community.document_loaders import PyPDFLoader, UnstructuredWordDocumentLoader
from langchain_google_genai import ChatGoogleGenerativeAI

**Load the Google API key used to access the Gemini LLM**

In [3]:
# Prompt without echoing: getpass keeps the key out of the rendered cell output,
# so it is not saved into the notebook file when the notebook is shared.
GOOGLE_API_KEY = getpass("Enter your Google API key for Gemini: ").strip()
if not GOOGLE_API_KEY:
    raise ValueError("A Google API key is required to query Gemini.")
# Exposed through the environment because query_rag reads it with os.getenv.
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY

Enter your Google API key for Gemini: ··········


**XML for comments part**

In [None]:
# Default XML for comments part
_COMMENTS_PART_DEFAULT_XML_BYTES = b'<w:comments xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"></w:comments>'

**Document Parsing**

In [None]:
#read_docx extracts text from .docx files, while identify_document_type classifies documents based on keywords, enabling tailored processing
def read_docx(file_path):
    """
    Return the text of a .docx file as one newline-separated string.

    Only the paragraphs exposed by ``Document.paragraphs`` are read;
    paragraphs that are empty or whitespace-only are left out.
    """
    kept_lines = []
    for paragraph in Document(file_path).paragraphs:
        if paragraph.text.strip():
            kept_lines.append(paragraph.text)
    return "\n".join(kept_lines)

def identify_document_type(text):
    """
    Identifies the document type based on keywords.

    Full phrases are matched case-insensitively so that headings written in
    capitals (e.g. "ARTICLES OF ASSOCIATION") are recognised. The short forms
    "AoA" / "MoA" stay case-sensitive to avoid matching unrelated lowercase
    letter sequences inside ordinary words.

    Rules are evaluated in order and the first match wins, so a text that
    mentions several document types is classified by the earliest rule.

    Args:
        text: Plain text of the document; ``None`` or "" is treated as empty.

    Returns:
        "Articles of Association", "Memorandum of Association",
        "Employment Contract", or "Unknown" when no keyword is present.
    """
    text = text or ""
    folded = text.casefold()

    # (label returned, case-insensitive phrase, optional case-sensitive abbreviation)
    keyword_rules = (
        ("Articles of Association", "articles of association", "AoA"),
        ("Memorandum of Association", "memorandum of association", "MoA"),
        ("Employment Contract", "employment contract", None),
    )
    for label, phrase, abbreviation in keyword_rules:
        if phrase in folded:
            return label
        if abbreviation is not None and abbreviation in text:
            return label
    return "Unknown"

**RAG Pipeline**

In [None]:
# Implements a RAG pipeline—loading reference documents, creating a vector store with embeddings, and querying with Gemini. This enhances compliance checks with contextual ADGM regulations.
def load_reference_docs():
    """
    Loads ADGM reference documents uploaded to Colab.

    Opens the Colab upload dialog, writes every uploaded file to the current
    working directory and parses the supported ones (.pdf and .docx, matched
    case-insensitively). Unsupported files are reported and skipped instead
    of being dropped silently.

    Returns:
        A list of the documents produced by the loaders; empty when nothing
        usable was uploaded.
    """
    documents = []
    print("Please upload Data Sources.pdf and ADGM reference documents (PDFs and .docx).")
    uploaded = files.upload()
    for filename, content in uploaded.items():
        with open(filename, "wb") as f:
            f.write(content)

        # Compare on the lower-cased name so "Rules.PDF" / "Form.DOCX" are accepted.
        lowered_name = filename.lower()
        if lowered_name.endswith(".pdf"):
            loader = PyPDFLoader(filename)
            documents.extend(loader.load_and_split())
        elif lowered_name.endswith(".docx"):
            loader = UnstructuredWordDocumentLoader(filename)
            documents.extend(loader.load())
        else:
            print(f"Skipping unsupported reference file: {filename}")
    return documents

def setup_rag(documents):
    """
    Build and return the Chroma vector store used for retrieval.

    The given documents are embedded with the
    "sentence-transformers/all-MiniLM-L6-v2" HuggingFace model.
    """
    return Chroma.from_documents(
        documents=documents,
        embedding=HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        ),
    )

def query_rag(vectorstore, query):
    """
    Answer ``query`` with Gemini, grounded on retrieved reference text.

    The three most similar entries are fetched from ``vectorstore``, their
    page contents are joined with newlines, and the result is sent to the
    model together with the query. Returns the ``content`` of the reply.
    """
    retrieved = vectorstore.similarity_search(query, k=3)
    context = "\n".join(hit.page_content for hit in retrieved)
    gemini = ChatGoogleGenerativeAI(
        model="gemini-1.5-flash",
        google_api_key=os.getenv("GOOGLE_API_KEY"),
    )
    prompt = f"Based on ADGM regulations: {context}\n\nQuery: {query}"
    return gemini.invoke(prompt).content

**Compliance Checking**

In [None]:
#Defines compliance rules, infers processes, checks for missing documents, and detects red flags (e.g., missing jurisdiction clause).
# Required documents per legal process, keyed by process name.
# check_missing_documents looks processes up here by that name.
checklists = {
    "Company Incorporation": [
        "Articles of Association", "Memorandum of Association",
        "Incorporation Application Form", "UBO Declaration Form",
        "Register of Members and Directors",
    ],
}

def infer_process(uploaded_doc_types):
    """
    Infers the legal process based on uploaded document types.

    Args:
        uploaded_doc_types: Iterable of document type labels (as returned by
            identify_document_type); ``None`` is treated as no documents.

    Returns:
        "Company Incorporation" when both the Articles and the Memorandum of
        Association are present, "Partial Company Incorporation" when only
        one of the two is present, and "Unknown" otherwise.
    """
    provided = set(uploaded_doc_types or [])
    core_documents = {"Articles of Association", "Memorandum of Association"}

    if core_documents <= provided:
        return "Company Incorporation"
    # Either core document on its own signals an incomplete incorporation
    # package; previously a lone Memorandum of Association was "Unknown".
    if core_documents & provided:
        return "Partial Company Incorporation"
    return "Unknown"

def check_missing_documents(process, uploaded_doc_types):
    """
    Checks for missing documents based on ADGM checklist.

    Args:
        process: Process name as returned by infer_process.
        uploaded_doc_types: Iterable of uploaded document type labels;
            ``None`` is treated as no documents.

    Returns:
        A ``(missing, required)`` tuple of lists. ``missing`` keeps the order
        of ``required`` so the report is stable from run to run.
    """
    if process == "Company Incorporation":
        # Copy so callers cannot mutate the shared module-level checklist.
        required = list(checklists[process])
    elif process == "Partial Company Incorporation":
        required = ["Articles of Association", "Memorandum of Association"]  # Minimal set
    else:
        required = []  # Default to empty for truly unknown

    provided = set(uploaded_doc_types or [])
    # Ordered filter instead of a set difference, whose order is arbitrary.
    missing = [doc_name for doc_name in required if doc_name not in provided]
    return missing, required  # Return both missing and required lists

def detect_red_flags(doc_text, doc_type):
    """
    Return the list of red-flag issues found in a document.

    Only "Articles of Association" documents are checked: one issue is
    reported when the text does not contain the exact phrase "ADGM Courts".
    Every other case yields an empty list.
    """
    lacks_adgm_jurisdiction = (
        doc_type == "Articles of Association" and "ADGM Courts" not in doc_text
    )
    if not lacks_adgm_jurisdiction:
        return []

    jurisdiction_issue = {
        "document": doc_type,
        "section": "Jurisdiction Clause",
        "issue": "Jurisdiction clause does not specify ADGM",
        "severity": "High",
        "suggestion": "Update to ADGM Courts per Companies Regulations 2020, Art. 6",
    }
    return [jurisdiction_issue]

**Comment Generation**

In [None]:
# Adds comments to .docx files using OOXML, embedding compliance suggestions directly into the document.
def add_comment_to_elements_in_place(
    docx_doc: Document, elements: List[Element], author: str, comment_text: str
) -> None:
    """
    Attach one Word review comment spanning ``elements``; mutates ``docx_doc``.

    A ``w:comment`` entry is added to the document's comments part (the part
    is created when the document has none) and the commented range is marked
    in the body with ``w:commentRangeStart`` / ``w:commentRangeEnd`` followed
    by a run that holds the ``w:commentReference``.

    Args:
        docx_doc: Document to annotate.
        elements: Body XML elements (runs or paragraphs) to cover, in
            document order. Nothing happens when the list is empty.
        author: Author name stored on the comment.
        comment_text: Text of the comment.
    """
    if not elements:
        return

    try:
        comments_part = docx_doc.part.part_related_by(RELATIONSHIP_TYPE.COMMENTS)
    except KeyError:
        # No comments part yet: create an empty one and relate it to the document.
        comments_part = Part(
            partname=PackURI("/word/comments.xml"),
            content_type=CONTENT_TYPE.WML_COMMENTS,
            blob=_COMMENTS_PART_DEFAULT_XML_BYTES,
            package=docx_doc.part.package,
        )
        docx_doc.part.relate_to(comments_part, RELATIONSHIP_TYPE.COMMENTS)

    # Keeps the "w" prefix when the comments tree is serialised with tostring().
    ET.register_namespace("w", "http://schemas.openxmlformats.org/wordprocessingml/2006/main")

    # Parts that expose a parsed `element` are serialised from that tree when
    # the document is saved, so the comment must be added to it directly.
    # Blob-backed parts are parsed here and written back below.
    # NOTE(review): which case applies depends on the installed python-docx
    # version and on whether the input document already contains comments.
    live_root = getattr(comments_part, "element", None)
    comments_xml = live_root if live_root is not None else parse_xml(comments_part.blob)

    # Use max(existing id) + 1 rather than the comment count, so a document
    # whose existing ids are not contiguous cannot end up with a duplicate id.
    existing_ids = []
    for existing_comment in comments_xml.findall(qn("w:comment")):
        raw_id = existing_comment.get(qn("w:id"))
        if raw_id is not None and raw_id.isdigit():
            existing_ids.append(int(raw_id))
    comment_id = str(max(existing_ids) + 1 if existing_ids else 0)

    # Create the comment
    comment_element = OxmlElement("w:comment")
    comment_element.set(qn("w:id"), comment_id)
    comment_element.set(qn("w:author"), author)
    comment_element.set(qn("w:date"), datetime.now().isoformat())
    comment_element.set(qn("w:initials"), "AP")

    # Create the text element for the comment
    comment_paragraph = OxmlElement("w:p")
    comment_run = OxmlElement("w:r")
    comment_text_element = OxmlElement("w:t")
    comment_text_element.text = comment_text
    comment_run.append(comment_text_element)
    comment_paragraph.append(comment_run)
    comment_element.append(comment_paragraph)

    comments_xml.append(comment_element)
    if live_root is None:
        comments_part._blob = tostring(comments_xml)

    # Create the commentRangeStart and commentRangeEnd elements
    comment_range_start = OxmlElement("w:commentRangeStart")
    comment_range_start.set(qn("w:id"), comment_id)
    comment_range_end = OxmlElement("w:commentRangeEnd")
    comment_range_end.set(qn("w:id"), comment_id)

    run_tag = qn("w:r")
    first_element, last_element = elements[0], elements[-1]

    # Range markers belong next to runs, not inside them: for run anchors
    # they are inserted as siblings; for container anchors (e.g. paragraphs)
    # they are placed inside the container.
    if first_element.tag == run_tag and first_element.getparent() is not None:
        first_element.addprevious(comment_range_start)
    else:
        paragraph_properties = first_element.find(qn("w:pPr"))
        if paragraph_properties is not None:
            # Keep w:pPr as the first child of its paragraph.
            paragraph_properties.addnext(comment_range_start)
        else:
            first_element.insert(0, comment_range_start)

    if last_element.tag == run_tag and last_element.getparent() is not None:
        last_element.addnext(comment_range_end)
    else:
        last_element.append(comment_range_end)

    # Single run carrying the reference mark. The previous version wrapped
    # this run in a second w:r and appended it inside the first element.
    comment_reference_run = OxmlElement("w:r")
    comment_reference_run_properties = OxmlElement("w:rPr")
    comment_reference_run_properties.append(
        OxmlElement("w:rStyle", {qn("w:val"): "CommentReference"})
    )
    comment_reference_run.append(comment_reference_run_properties)
    comment_reference_element = OxmlElement("w:commentReference")
    comment_reference_element.set(qn("w:id"), comment_id)
    comment_reference_run.append(comment_reference_element)

    # The reference run goes immediately after the end-of-range marker.
    comment_range_end.addnext(comment_reference_run)

def generate_reviewed_doc(input_path, issues, output_path):
    """
    Generates a reviewed .docx file with comments.

    Each issue becomes one comment ("<issue>. Suggestion: <suggestion>")
    anchored to the runs of the first paragraph that has any runs. When the
    document has no such paragraph it is saved without comments.

    Args:
        input_path: Path of the .docx file to review.
        issues: Iterable of dicts with at least "issue" and "suggestion" keys.
        output_path: Where the reviewed copy is written.

    Returns:
        ``output_path``.
    """
    doc = Document(input_path)

    # Resolve the anchor once. Scanning for the first paragraph with runs
    # avoids an IndexError on documents without paragraphs and avoids losing
    # every comment when the document starts with an empty paragraph.
    anchor_elements = []
    for paragraph in doc.paragraphs:
        anchor_elements = [run._element for run in paragraph.runs]
        if anchor_elements:
            break

    if anchor_elements:
        for issue in issues:
            comment_text = f"{issue['issue']}. Suggestion: {issue['suggestion']}"
            add_comment_to_elements_in_place(doc, anchor_elements, "Corporate Agent", comment_text)

    doc.save(output_path)
    return output_path

**Report Generation**

In [None]:
#Creates a structured JSON report summarizing the analysis, facilitating easy review and integration.
def generate_report(process, uploaded_docs, required_docs, issues, missing):
    """
    Build the analysis summary, save it as JSON and return it.

    The summary holds the process name, the number of uploaded and required
    documents, the list of missing documents and the issues found. It is
    written to "output_report.json" in the current working directory.
    """
    report = dict(
        process=process,
        documents_uploaded=len(uploaded_docs),
        required_documents=len(required_docs),
        missing_document=missing,
        issues_found=issues,
    )
    with open("output_report.json", "w") as report_file:
        json.dump(report, report_file, indent=4)
    return report


**Processing Function**

In [None]:
# Orchestrates the workflow—parsing, compliance checking, comment generation, and reporting—for uploaded files
def process_documents(files):
    """
    Processes uploaded .docx files and returns report and reviewed document.

    Every file is parsed, classified, checked for red flags and looked up in
    the RAG system. The inferred process and missing documents are then
    summarised in the report.

    Args:
        files: Uploaded files from Gradio, either path strings or objects
            exposing the path as ``.name``; ``None`` or empty means nothing
            was uploaded.

    Returns:
        ``(report, reviewed_path)``. ``reviewed_path`` is the annotated copy
        of the first uploaded file, or ``None`` when nothing was uploaded.
        ``report`` is the dict from generate_report plus a "rag_guidance"
        list holding the RAG answer obtained for each document (this extra
        key is only on the returned dict, not in the saved JSON file).
    """
    # Accept both plain path strings and file wrappers that carry `.name`.
    paths = [getattr(item, "name", item) for item in (files or [])]

    uploaded_docs = []
    issues = []
    first_file_issues = []
    rag_guidance = []
    for index, path in enumerate(paths):
        text = read_docx(path)
        doc_type = identify_document_type(text)
        uploaded_docs.append(doc_type)

        # Surface the RAG answer instead of discarding it.
        rag_response = query_rag(vectorstore, f"Is this {doc_type} compliant with ADGM regulations?")
        rag_guidance.append({"document": doc_type, "guidance": rag_response})

        file_issues = detect_red_flags(text, doc_type)
        issues.extend(file_issues)
        if index == 0:
            first_file_issues = file_issues

    process = infer_process(uploaded_docs)
    missing, required_docs = check_missing_documents(process, uploaded_docs)  # Unpack both returns

    # Only the first file is annotated, so only its own issues are written
    # into it; issues found in other files remain listed in the report.
    output_doc_path = None
    if paths:
        output_doc_path = generate_reviewed_doc(paths[0], first_file_issues, "reviewed.docx")

    report = generate_report(process, uploaded_docs, required_docs, issues, missing)
    report["rag_guidance"] = rag_guidance

    return report, output_doc_path

**Gradio Interface and Execution**

In [None]:
# Load Reference Documents (Run Once)
# Must run before the Gradio app is used: process_documents reads the
# module-level `vectorstore` created here.
print("Uploading reference documents...")
documents = load_reference_docs()
if not documents:
    # Fail here with a clear message rather than inside the vector store build.
    raise ValueError(
        "No reference documents were loaded; upload at least one .pdf or .docx file and re-run this cell."
    )
vectorstore = setup_rag(documents)

In [None]:
# Creating Gradio Interface
# Uploads are limited to .docx because read_docx can only parse Word documents.
iface = gr.Interface(
    fn=process_documents,
    inputs=[
        gr.File(
            label="Upload .docx Files",
            file_count="multiple",
            file_types=[".docx"],
        )
    ],
    outputs=[gr.JSON(label="Analysis Report"), gr.File(label="Reviewed Document")],
)

In [None]:
# Launching Gradio Interface
# NOTE(review): every request served by this app runs process_documents, which
# queries Gemini with the GOOGLE_API_KEY set earlier in this notebook — confirm
# that exposing it through a shared link is intended before distributing the URL.
iface.launch(share=True)  # Creates a public URL in Colab