<a href="https://colab.research.google.com/github/AdityaBhatt3010/AI-Risk-Assessment/blob/main/Risk_Assessment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install spacy PyMuPDF fpdf

Collecting PyMuPDF
  Downloading pymupdf-1.26.0-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting fpdf
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading pymupdf-1.26.0-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m88.3 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: fpdf
  Building wheel for fpdf (setup.py) ... [?25l[?25hdone
  Created wheel for fpdf: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40704 sha256=948c40e34b40841efd2259e48f8ab5bd147eef8e8443acc2719f28bf4c21b88e
  Stored in directory: /root/.cache/pip/wheels/65/4f/66/bbda9866da446a72e206d6484cd97381cbc7859a7068541c36
Successfully built fpdf
Installing collected packages: fpdf, PyMuPDF
Successfully installed PyMuPDF-1.26.0 fpdf-1.7.2


In [4]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m68.9 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [5]:
import fitz  # PyMuPDF
import spacy
import re
import os

# Load English NLP model
nlp = spacy.load("en_core_web_sm")

In [17]:
# Keywords for risk indicators
RISK_INDICATORS = {
    "third_party_data_sharing": [
        "third-party", "3rd party", "external vendor", "data sharing", "shared with vendors"
    ],
    "log_retention": [
        "log retention", "retention period", "logging policy", "audit log retention", "data retention"
    ],
    "source_code_review": [
        "source code review", "code inspection", "static analysis", "manual code review"
    ]
}

# Regex for technical segments
FRAMEWORK_SEGMENTS = {
    "ports": r"\bport\s*\d{1,5}\b",
    "apis": r"\bAPI(?:s)?\b",
    "modules": r"\bmodules?\b",
}

In [7]:
# --- PDF Text Extractor ---
def extract_text_from_pdf(pdf_path):
    if not os.path.exists(pdf_path):
        raise FileNotFoundError("PDF file not found!")

    print(f"📄 Reading PDF: {pdf_path}")
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text

In [8]:

# --- Analyzer Function ---
def analyze_text(text):
    text_lower = text.lower()
    doc = nlp(text_lower)

    result = {
        "risk_indicators": {},
        "framework_segments": {}
    }

    # Match keywords for each risk indicator
    for indicator, keywords in RISK_INDICATORS.items():
        result["risk_indicators"][indicator] = any(keyword in text_lower for keyword in keywords)

    # Use regex to find mentions of framework segments
    for segment, pattern in FRAMEWORK_SEGMENTS.items():
        matches = re.findall(pattern, text, flags=re.IGNORECASE)
        result["framework_segments"][segment] = list(set(matches)) if matches else []

    return result

In [9]:
# --- Display Results ---
def print_analysis(findings):
    print("\n📊 RISK INDICATORS:")
    for indicator, present in findings["risk_indicators"].items():
        status = "✅ Detected" if present else "❌ Not Found"
        print(f"  - {indicator.replace('_', ' ').title()}: {status}")

    print("\n🔍 FRAMEWORK SEGMENTS:")
    for segment, items in findings["framework_segments"].items():
        listed = ', '.join(items) if items else "None"
        print(f"  - {segment.title()}: {listed}")

In [18]:
# --- Main Execution ---
if __name__ == "__main__":
    pdf_file = "/content/sample_soc2_report.pdf"
    try:
        raw_text = extract_text_from_pdf(pdf_file)
        findings = analyze_text(raw_text)
        print_analysis(findings)
    except Exception as e:
        print(f"❌ Error: {e}")

📄 Reading PDF: /content/sample_soc2_report.pdf

📊 RISK INDICATORS:
  - Third Party Data Sharing: ✅ Detected
  - Log Retention: ✅ Detected
  - Source Code Review: ✅ Detected

🔍 FRAMEWORK SEGMENTS:
  - Ports: port 22, port 443
  - Apis: APIs, API
  - Modules: modules, Modules, Module
