In [17]:
!pip install python-docx gradio pypdf2 scikit-learn rapidfuzz



In [19]:
from docx import Document
import os, shutil

os.makedirs("sample_docs", exist_ok=True)

docs_content = {
    "Articles_of_Association.docx": """Articles of Association
Clause 3.1 — Jurisdiction is UAE Federal Courts
Clause 5.2 — Shareholders may transfer shares with approval
Clause 9 — (Signatures missing)
""",
    "Memorandum_of_Association.docx": """Memorandum of Association
Clause 1 — Company name is Example ADGM Ltd
Clause 2 — Registered office located in Abu Dhabi Global Market
""",
    "Board_Resolution.docx": """Board Resolution
It was resolved that John Smith is appointed as the Director of the Company.
(Note: Missing signatory lines)
""",
    "UBO_Declaration_Form.docx": """UBO Declaration Form
Ultimate Beneficial Owner: Jane Doe
Ownership: 120%
Nationality: British
""",
    "Register_of_Members_and_Directors.docx": """Register of Members and Directors
Member: Jane Doe — 60% Ownership
Director: John Smith
"""
}

for filename, content in docs_content.items():
    doc = Document()
    for line in content.split("\n"):
        doc.add_paragraph(line)
    doc.save(f"sample_docs/{filename}")

shutil.make_archive("sample_adgm_docs", 'zip', "sample_docs")
print("Sample test docs created: sample_adgm_docs.zip")

Sample test docs created: sample_adgm_docs.zip


In [20]:
from google.colab import files
uploaded = files.upload()
print("Uploaded:", list(uploaded.keys()))

Saving Data Sources.pdf to Data Sources.pdf
Uploaded: ['Data Sources.pdf']


In [21]:
import re
from pypdf import PdfReader

pdf_file = [f for f in os.listdir() if f.lower().endswith(".pdf")][0]
reader = PdfReader(pdf_file)

full_text = "\n".join([page.extract_text() or "" for page in reader.pages])

patterns = [
    r"Articles of Association",
    r"Memorandum of Association",
    r"Register of Members and Directors",
    r"UBO Declaration Form",
    r"Board Resolution"
]

checklist_auto = sorted(set([m.group(0) for pat in patterns for m in re.finditer(pat, full_text, re.I)]))
print("Checklist extracted:", checklist_auto)

if not checklist_auto:
    checklist_auto = [
        "Articles of Association",
        "Memorandum of Association",
        "Board Resolution",
        "UBO Declaration Form",
        "Register of Members and Directors"
    ]
    print("Fallback checklist used.")

Checklist extracted: []
Fallback checklist used.


In [22]:
from rapidfuzz import fuzz

def extract_text_from_docx(path):
    doc = Document(path)
    return "\n".join([p.text for p in doc.paragraphs if p.text.strip()])

def classify_document(text, refs, threshold=65):
    best = ("Unknown", 0)
    for ref in refs:
        score = fuzz.partial_ratio(ref.lower(), text.lower())
        if score > best[1]:
            best = (ref, score)
    return best if best[1] >= threshold else ("Unknown", best[1])

In [23]:
def rule_based_check(text, doc_type):
    issues = []

    # Jurisdiction check
    if "uae federal" in text.lower() or "federal court" in text.lower():
        issues.append({
            "section": "Jurisdiction clause",
            "issue": "Mentions UAE Federal Courts instead of ADGM Courts",
            "severity": "High",
            "suggestion": "Replace with 'Abu Dhabi Global Market (ADGM) Courts'"
        })

    # Signature check
    if "sign" not in text.lower():
        issues.append({
            "section": "Signature section",
            "issue": "No signature section found",
            "severity": "Medium",
            "suggestion": "Add signature lines for authorized signatories"
        })

    # Ownership check for UBO
    if "ownership" in text.lower():
        for num in re.findall(r"(\d+)%", text):
            if int(num) > 100:
                issues.append({
                    "section": "Ownership percentage",
                    "issue": f"Ownership exceeds 100% ({num}%)",
                    "severity": "High",
                    "suggestion": "Ensure total ownership is 100% or less"
                })

    return issues

In [24]:
def insert_notes(input_path, issues, output_path):
    doc = Document(input_path)
    for issue in issues:
        doc.add_paragraph(f"REVIEW NOTE [{issue['section']}]: {issue['suggestion']}")
    doc.save(output_path)

In [25]:
import json, zipfile

def process_docs(doc_paths, process_name="Company Incorporation"):
    results = []
    reviewed_files = []
    uploaded_types = []

    for path in doc_paths:
        text = extract_text_from_docx(path)
        doc_type, score = classify_document(text, checklist_auto)
        uploaded_types.append(doc_type)
        issues = rule_based_check(text, doc_type)

        out_path = f"reviewed_{os.path.basename(path)}"
        insert_notes(path, issues, out_path)
        reviewed_files.append(out_path)

        results.append({
            "document": doc_type,
            "score": score,
            "issues_found": issues
        })

    missing = [doc for doc in checklist_auto if doc not in uploaded_types]

    report = {
        "process": process_name,
        "documents_uploaded": len(uploaded_types),
        "required_documents": len(checklist_auto),
        "missing_documents": missing,
        "documents": results
    }

    with open("report.json", "w") as f:
        json.dump(report, f, indent=2)

    with zipfile.ZipFile("reviewed_package.zip", "w") as z:
        z.write("report.json")
        for fpath in reviewed_files:
            z.write(fpath)

    return report, "reviewed_package.zip"

In [26]:
sample_paths = [os.path.join("sample_docs", f) for f in os.listdir("sample_docs") if f.endswith(".docx")]
report, zip_file = process_docs(sample_paths)
print(json.dumps(report, indent=2))

from google.colab import files
files.download(zip_file)

{
  "process": "Company Incorporation",
  "documents_uploaded": 5,
  "required_documents": 5,
  "missing_documents": [],
  "documents": [
    {
      "document": "Articles of Association",
      "score": 100.0,
      "issues_found": [
        {
          "section": "Jurisdiction clause",
          "issue": "Mentions UAE Federal Courts instead of ADGM Courts",
          "severity": "High",
          "suggestion": "Replace with 'Abu Dhabi Global Market (ADGM) Courts'"
        }
      ]
    },
    {
      "document": "UBO Declaration Form",
      "score": 100.0,
      "issues_found": [
        {
          "section": "Signature section",
          "issue": "No signature section found",
          "severity": "Medium",
          "suggestion": "Add signature lines for authorized signatories"
        },
        {
          "section": "Ownership percentage",
          "issue": "Ownership exceeds 100% (120%)",
          "severity": "High",
          "suggestion": "Ensure total ownership is 100% 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [27]:
import gradio as gr

def gradio_runner(files):
    paths = [f.name for f in files]
    report, zip_file = process_docs(paths)
    return json.dumps(report, indent=2), zip_file

ui = gr.Interface(
    fn=gradio_runner,
    inputs=gr.File(file_types=[".docx"], file_count="multiple"),
    outputs=["text", gr.File()]
)

ui.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://94e7e572be1c34b418.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


