In [1]:
# @title Dependencies
# Installs the packages this notebook needs on top of the runtime's preinstalled ones.
# None of the installs below are version-pinned.
%pip install --quiet python-docx\
stix2\
langchain-core\
google-generativeai

# PDF parsing: pymupdf for image extraction, pymupdf4llm for text-to-markdown.
%pip install -U pymupdf4llm\
pymupdf

# NOTE(review): tiktoken is installed but not imported in the imports cell — confirm it is still needed.
%pip install tiktoken

# Client SDK imported as `from mistralai import Mistral`.
%pip install mistralai

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/253.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/177.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.8/177.8 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/65.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.9/65.9 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pymupdf4llm
  Downloading pymupdf4llm-0.0.27-py3-none-any.whl.metadata (4.8 kB)
Collecting pymupdf
  Downloading pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf4llm-0.0.27-py3-none-any.whl (30 kB)
Downloading pymupdf-1.26.4-cp39-abi

In [4]:
# @title Imports
import io
import re
import os
import json
import time
import uuid
import locale
import random
import string
import tempfile
from pprint import pprint
from datetime import datetime
from io import BytesIO
from dataclasses import dataclass
import glob
import ast
from abc import ABC, abstractmethod

# Text and file processing libraries
import html
from PIL import Image
import textwrap
import docx
from docx import Document
from bs4 import BeautifulSoup

import pymupdf4llm
import sys, pymupdf

# Third-party library imports for data manipulation and analysis
import numpy as np
import pandas as pd


# Visualization and display libraries
from IPython.display import HTML, display
import graphviz

# STIX2 Cyber Threat Intelligence objects
import stix2

# Google Colab utilities
from google.colab import userdata
from ipywidgets import FileUpload, widgets, Output
from IPython.display import display, Markdown
import IPython.display

# Google Gemini
import google.generativeai as genai

# Openai
import requests
import httpx
import openai
from openai import OpenAI
import base64

from mistralai import Mistral

# Langchain utilities
from langchain_core.output_parsers.json import parse_json_markdown

# LLM APIs

In [12]:
# @title Abstract API definition

class LlmAPI(ABC):
    """Abstract interface shared by the LLM backend classes in this notebook.

    The signatures mirror how the concrete backends are actually called:
    both request methods take the prompt as ``query`` and
    ``get_full_response`` may additionally receive image bytes. The return
    type is backend specific (plain string, ``ResponseWrapper`` or the raw
    SDK response object).
    """

    @abstractmethod
    def __init__(self, color: str = ""):
        """Configure the backend client.

        Args:
            color: Unused. Kept (now optional) only for backward
                compatibility; the backends defined in this notebook
                neither accept nor use it.
        """
        pass

    @abstractmethod
    def get_response(self, query):
        """Send a text-only ``query`` to the model and return its reply."""
        pass

    @abstractmethod
    def get_full_response(self, query, images=None):
        """Send ``query`` together with optional images and return the reply.

        Args:
            query: Prompt text.
            images: Optional list of raw image bytes. Backends without image
                support ignore it.
        """
        pass

# Adapter that lets a plain reply string be read through a `.text` attribute.
class ResponseWrapper:
    """Hold an LLM reply so callers can access it as ``wrapper.text``."""

    def __init__(self, text):
        # Stored as given; no copying or normalisation happens here.
        setattr(self, "text", text)

In [10]:
# @title Gemini API Class

class GeminiAPI(LlmAPI):
  """Gemini backend built on ``google.generativeai``.

  The API key is read from the Colab secret ``GOOGLE_API_KEY``. Both request
  methods keep the raw SDK response of the latest call in ``self.response``.
  """

  def __init__(self):
    """
    Configuring and initialising gemini api
    """
    genai.configure(api_key=userdata.get('GOOGLE_API_KEY'))

    # Set up the model
    self.generation_config = {
      "temperature": 0.1,
      "top_p": 0.95,
      "top_k": 64,
      "max_output_tokens": 10000
    }
    # Left disabled on purpose, as in the original configuration:
    # "response_mime_type": "application/json"

    # Safety settings: every listed category uses the BLOCK_NONE threshold.
    self.safety_settings = [
      {"category": category, "threshold": "BLOCK_NONE"}
      for category in (
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
      )
    ]

    #Model Selector
    self.model = genai.GenerativeModel('gemini-2.5-flash',
                                  generation_config=self.generation_config,
                                  safety_settings=self.safety_settings)

    self.response = ''

  def get_response(self, query):
    """Send a text-only prompt and return the reply text (``response.text``)."""
    self.response = self.model.generate_content(query)

    return self.response.text

  def get_full_response(self, query, images=None):
    """Send the prompt plus optional images and return the raw SDK response.

    Args:
      query: Prompt text.
      images: Optional list of raw image bytes; ``None`` means no images.

    Returns:
      The object returned by ``generate_content`` (callers read ``.text``).
    """
    self.response = self.model.generate_content(self.create_gemini_content(query, images))
    return self.response

  @staticmethod
  def _guess_image_mime(image_bytes):
    """Return a MIME type based on the leading magic bytes of ``image_bytes``.

    Only JPEG and WebP are detected; everything else keeps the previous
    fixed label "image/png".
    """
    head = bytes(image_bytes[:12])
    if head.startswith(b"\xff\xd8\xff"):
      return "image/jpeg"
    if head[:4] == b"RIFF" and head[8:12] == b"WEBP":
      return "image/webp"
    return "image/png"

  def create_gemini_content(self, prompt, image_bytes_list):
    """Build the content list: the prompt first, then one part per image.

    Args:
      prompt: Prompt text, placed first in the list.
      image_bytes_list: Iterable of raw image bytes, or ``None``. Empty
        entries are skipped.

    Returns:
      list: ``[prompt, {"mime_type": ..., "data": ...}, ...]``.
    """
    contents = [prompt]
    # Each image is a dict with 'mime_type' and 'data'. The MIME type is
    # sniffed because the images handed in are not guaranteed to be PNG.
    for image_bytes in image_bytes_list or []:
      if image_bytes:
        contents.append({
          "mime_type": self._guess_image_mime(image_bytes),
          "data": image_bytes,
        })
    return contents

In [30]:
# @title Groq LLaMA3 API Class

class LlamaAPI(LlmAPI):
  """Llama backend reached through Groq's OpenAI-compatible endpoint.

  The API key is read from the Colab secret ``LLAMA_API_KEY``. Replies are
  returned wrapped in ``ResponseWrapper`` so callers read them via ``.text``.
  """

  def __init__(self):
    self.api_key = userdata.get("LLAMA_API_KEY")
    self.base_url = "https://api.groq.com/openai/v1/"
    # Use a dedicated client rather than assigning openai.api_key /
    # openai.base_url: those module-level settings are shared by all code
    # that uses the openai package in this kernel.
    self.client = OpenAI(api_key=self.api_key, base_url=self.base_url)
    self.model_name = "llama-3.3-70b-versatile"

    self.generation_config = {
      "temperature": 0.1,
      "top_p": 0.95,
      "max_tokens": 8000
    }

  def get_response(self, query):
    """Send ``query`` as a single user message.

    Returns:
      ResponseWrapper: The stripped reply text in ``.text`` (an empty string
      when the API returns no content).
    """
    response = self.client.chat.completions.create(
      model=self.model_name,
      messages=[{"role": "user", "content": query}],
      **self.generation_config
    )
    content = response.choices[0].message.content
    # The message content may be None; avoid calling .strip() on it.
    result = (content or "").strip()
    return ResponseWrapper(result) # send in .text attribute

  # Images are not supported in Groq
  def get_full_response(self, query, images=None):
    """Same as ``get_response``; ``images`` is accepted but ignored."""
    return self.get_response(query)


In [14]:
# @title DeepSeek API Class
class DeepSeekAPI(LlmAPI):
    """DeepSeek chat backend accessed with the ``OpenAI`` client class.

    The key comes from the Colab secret ``DEEPSEEK_API_KEY``. The completion
    object of the most recent call is kept in ``self.response``.
    """

    def __init__(self):
        # Request parameters reused for every call.
        self.model = "deepseek-chat"
        self.temperature = 0.1
        self.top_p = 0.95
        self.max_tokens = 8000
        self.response = None
        self.client = OpenAI(
            base_url="https://api.chatanywhere.org/v1",
            api_key=userdata.get('DEEPSEEK_API_KEY'),
        )

    def get_response(self, query):
        """Send ``query`` as one user message and return the reply string."""
        request = dict(
            model=self.model,
            messages=[{"role": "user", "content": query}],
            temperature=self.temperature,
            top_p=self.top_p,
            max_tokens=self.max_tokens,
        )
        self.response = self.client.chat.completions.create(**request)
        first_choice = self.response.choices[0]
        return first_choice.message.content

    def get_full_response(self, query, images=[]):
        """Delegate to ``get_response``; ``images`` is not used."""
        return self.get_response(query)

In [15]:
# @title GPT-4o Class
class OpenAIAPI(LlmAPI):
    """GPT-4o backend accessed with the ``OpenAI`` client class.

    The completion object of the most recent call is kept in
    ``self.response``; both request methods return the reply string.
    """

    def __init__(self):
        # NOTE(review): this reads the DEEPSEEK_API_KEY secret, the same one
        # the DeepSeek backend uses — presumably one key covers both models on
        # this endpoint; confirm.
        self.client = OpenAI(api_key=userdata.get('DEEPSEEK_API_KEY'), base_url="https://api.chatanywhere.org/v1")

        self.model = "gpt-4o"
        self.response = None
        self.temperature = 0.1
        self.top_p = 0.95
        self.max_tokens = 8000

    def get_response(self, query):
        """Send a text-only ``query`` and return the reply string."""
        completion = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": query}],
            temperature=self.temperature,
            top_p=self.top_p,
            max_tokens=self.max_tokens
        )
        self.response = completion
        return completion.choices[0].message.content

    @staticmethod
    def _image_data_url(image_bytes):
        """Encode raw image bytes as a base64 ``data:`` URL.

        The MIME type is sniffed from the leading magic bytes (JPEG, GIF,
        WebP); anything else is labelled ``image/png``.
        """
        head = bytes(image_bytes[:12])
        if head.startswith(b"\xff\xd8\xff"):
            mime = "image/jpeg"
        elif head[:6] in (b"GIF87a", b"GIF89a"):
            mime = "image/gif"
        elif head[:4] == b"RIFF" and head[8:12] == b"WEBP":
            mime = "image/webp"
        else:
            mime = "image/png"
        encoded = base64.b64encode(bytes(image_bytes)).decode("ascii")
        return f"data:{mime};base64,{encoded}"

    def get_full_response(self, query, image_bytes_list=None):
        """Send ``query`` plus optional images and return the reply string.

        Args:
            query: Prompt text.
            image_bytes_list: Optional list of raw image bytes. Each non-empty
                entry is attached as an ``image_url`` content part; previously
                this argument was accepted but never sent.

        Returns:
            The content of the first choice's message.
        """
        content = [{"type": "text", "text": query}]
        for image_bytes in image_bytes_list or []:
            if image_bytes:
                content.append({
                    "type": "image_url",
                    "image_url": {"url": self._image_data_url(image_bytes)},
                })
        messages = [{"role": "user", "content": content}]
        response = self.client.chat.completions.create(
            model=self.model,
            messages=messages,
            temperature=self.temperature,
            top_p=self.top_p,
            max_tokens=self.max_tokens
        )
        self.response = response
        return response.choices[0].message.content

In [17]:
# @title Mistral API Class
class MistralAPI(LlmAPI):
    """Mistral chat backend using the ``Mistral`` client.

    ``get_response`` returns the reply string and stores it in
    ``self.response``; ``get_full_response`` returns it wrapped in a
    ``ResponseWrapper`` and leaves ``self.response`` untouched.
    """

    def __init__(self, api_key=None, model="ministral-8b-latest"):
        # Fall back to the Colab secret only when no key is passed in.
        self.client = Mistral(
            api_key=userdata.get('MISTRAL_API_KEY') if api_key is None else api_key
        )
        self.model = model
        self.response = ''

    def _complete(self, query):
        """Run one chat completion and return the first choice's content."""
        reply = self.client.chat.complete(
            model=self.model,
            messages=[{"role": "user", "content": query}],
        )
        return reply.choices[0].message.content

    def get_response(self, query):
        """Send ``query`` and return the reply string."""
        self.response = self._complete(query)
        return self.response

    def get_full_response(self, query, images=[]):
        """Send ``query`` (``images`` is not used) and wrap the reply."""
        return ResponseWrapper(self._complete(query))


# Document Parser

In [18]:
class DocumentProcessor:
    """Collect PDFs from an upload widget and accumulate their text and images.

    Creating an instance displays a ``FileUpload`` widget. Whenever the
    widget's value changes, every file in it is parsed and the results are
    accumulated in ``self.files_content`` under the keys ``name``,
    ``content``, ``tables`` and ``images``.
    """

    def __init__(self, title=None):
        # True until process_files() has created the files_content keys.
        self.initial_processing = True

        #Set Incident Title
        self.incident_title = title

        self.pages_to_exclude = set()
        self.files_content = {}
        self.upload_widget = None
        self.setup_simple_widgets()
        self.upload_widget.observe(self.on_file_upload, names='value')

    def setup_simple_widgets(self):
        """Reset any existing upload widget, then create and display a new one."""
        if self.upload_widget is not None:
            # NOTE(review): relies on `value` being a dict and on the private
            # `_counter` attribute — confirm before upgrading ipywidgets.
            self.upload_widget.value.clear()
            self.upload_widget._counter = 0
        self.upload_widget = FileUpload(multiple=True)
        display(self.upload_widget)

    def on_file_upload(self, change):
        """Widget observer callback: parse whatever is currently uploaded."""
        self.process_files()

    def parse_text(self, file_name: str, content: io.BytesIO) -> tuple:
        """Extract whitespace-normalised text and images from one uploaded file.

        Args:
            file_name: Name of the upload; only ``.pdf`` (any letter case) is
                supported.
            content: File contents as a binary stream.

        Returns:
            tuple: ``(cleaned_text, tables, images)`` where ``tables`` is
            always ``None`` and ``images`` is a list of raw image bytes.

        Raises:
            ValueError: If the file is not a PDF.
        """
        tables = None
        # Compare case-insensitively so "REPORT.PDF" is accepted as well.
        if not file_name.lower().endswith('.pdf'):
            raise ValueError(f"Unsupported file type: {file_name}")
        text, images = self.handle_pdf(content, file_name)

        # Collapse every whitespace run (including newlines) to one space.
        cleaned_text = re.sub(r'\s+', ' ', text).strip()
        return cleaned_text, tables, images

    def handle_pdf(self, content, file_name):
        """Return ``(markdown_text, images)`` for a PDF given as a stream.

        Args:
            content: PDF contents passed to ``pymupdf.open(stream=...)``.
            file_name: Not used by the extraction itself.

        Returns:
            tuple: Markdown produced by ``pymupdf4llm.to_markdown`` and a list
            with the raw bytes of every image referenced by every page.
        """
        images = []
        # The context manager closes the document even if extraction fails;
        # previously it was left open.
        with pymupdf.open(stream=content) as document:
            #Image Extraction
            for page_index in range(len(document)):
                page = document[page_index]
                for img in page.get_images(full=True):
                    xref = img[0]
                    images.append(document.extract_image(xref)["image"])

            #Text to Markdown (must run while the document is still open)
            md_text = pymupdf4llm.to_markdown(document)

        return md_text, images

    def process_files(self):
        """Parse every file in the widget and append results to ``files_content``.

        Text and images are accumulated across files and calls; ``name`` and
        ``tables`` hold the values of the most recently parsed file.
        """
        #Initialise
        if self.initial_processing:
            self.files_content['name'] = ""
            self.files_content['content'] = ""
            self.files_content['tables'] = ""
            self.files_content['images'] = []
            self.initial_processing = False

        for name, content in self.upload_widget.value.items():
            text, tables, images = self.parse_text(name, io.BytesIO(content['content']))
            self.files_content['name'] = name
            self.files_content['content'] += text
            self.files_content['tables'] = tables
            self.files_content['images'].extend(images)
            print(f"File Processed!: {name}")

        print("Processing Completed!")

    # Iterate the created pdf_images folder and load every image in bytes format
    def proccess_images(self):
        """Return the bytes of every ``.png`` found under ``./pdf_images/``."""
        img_data = []
        for filename in glob.iglob('./pdf_images/' + '**/*.png', recursive=True):
            with open(filename, "rb") as img_file:
                img_data.append(img_file.read())
        return img_data

    def get_processed_texts(self):
        """Return the accumulated ``files_content`` dict (not a copy)."""
        return self.files_content


# Queries

In [19]:
# @title Summarise report query
def report_query(processed_texts):
    """Build the detailed attack-flow extraction prompt for a CTI report.

    Args:
        processed_texts: Report text; it is interpolated verbatim at the end
            of the prompt.

    Returns:
        str: The complete prompt text.
    """
    # "\\" below produces the same single backslash the prompt always
    # contained; the previous "\ " was an invalid escape sequence, which
    # newer Python versions warn about.
    return f"""
Your task is to process the given section of a Cyber Threat Intelligence (CTI) report from the viewpoint of a cybersecurity analyst with significant expertise in the MITRE ATT&CK framework and recreate the attack flow.

In order to achieve this, you should do the following:
* Analyze the text sentence-by-sentence, if and when necessary, consider additional sentences for context, to identify specific actions taken by attackers.
* Consider only the report as source before creating your response
* Work out the initial attack, which was used to gain access to the system and made the subsequent attacks viable.
* Include each and every step the attackers take, any and all steps, both the obvious and subtle.
* Hierarchy of prerequisite attacks should be mostly linear, as these are steps taken in order, with occasional branching and rejoining.
* Ensure compilation of all the assets compromised in the attack, even the most minor asset compromised should be noted.
* Be as detailed as possible when listing all techniques, if an asset is compromised, the system compromised to achieve this should be noted.
* Any specific script/program/malware used must be noted and added to assets.

For each action/step identified, you will output information in JSON format with the following structure:

1. Action Name: The specific technique utlised/action taken by the attacker, as described in the text, like 'Vulnerability Scanning', 'Exploit Public-Facing Application', etc.
2. Tactic ID: The Tactic ID from the MITRE ATT&CK framework that categorizes the overarching goal of the action (e.g., TA0001 for Initial Access).
3. Technique ID/Sub-technique ID: The specific Technique or Sub-technique ID from the MITRE ATT&CK framework that the action corresponds to.
4. Label(s): the technique concatenated with the Technique id, with a singular hyphen between, returned as an array with a single element (eg. ["Spearphishing Attachment - T1566"])
5. Affected Assets: The asset(s) targeted or compromised by the action, based on the report's context.
6. Prerequisite: the action name and technqiue id of any technique that must be completed previously in the attack for this step to take place. If there are no prerequisites, leave the array empty

Your response should be structured as follows (sample JSON for guidance):
{{techniques[
{{
  "action_name": "Example technique",
  "tactic_id": "TA000X",
  "technique_id": "TXXXX",
  "label":["action_name"-"technique_id"],
  "affected_assets": ["Example Affected Asset 1","Example Affected Asset 2"],
  "prerequisite": ["Action Name - Technqiue ID Prior Action in the flow"]
}}
]
}}
If you use double quotes (\") in any results, please escape them with \\ to avoid poor JSON formating.
Respond with only the required JSON, DO NOT include any preamble or other comments, return only the JSON.

{processed_texts}

"""




In [20]:
# @title Minimal report query
def report_query_minimal(processed_texts):
    """Build the shortest variant of the attack-flow extraction prompt.

    Args:
        processed_texts: Report text; it is interpolated verbatim at the end
            of the prompt.

    Returns:
        str: The complete prompt text.
    """
    # "\\" below produces the same single backslash the prompt always
    # contained; the previous "\ " was an invalid escape sequence, which
    # newer Python versions warn about.
    return f"""
Process the given section of a Cyber Threat Intelligence (CTI) report and identify all attacker actions associated with MITRE ATT&CK techniques to recreate the attack flow.
Output the findings in JSON format containing:

{{techniques[
{{
  "action_name": "Example technique",
  "tactic_id": "TA000X",
  "technique_id": "TXXXX",
  "label":["action_name"-"technique_id"],
  "affected_assets": ["Example Affected Asset 1","Example Affected Asset 2"],
  "prerequisite": ["Action Name - Technqiue ID Prior Action in the flow"]
}}
]
}}


If you use double quotes (\") in any results, please escape them with \\ to avoid poor JSON formating.

Respond with only the required JSON, DO NOT include any preamble or other comments, return only the JSON.

{processed_texts}

"""



In [21]:
# @title Moderate report query
def report_query_moderate(processed_texts):
    """Build the medium-detail variant of the attack-flow extraction prompt.

    Args:
        processed_texts: Report text; it is interpolated verbatim at the end
            of the prompt.

    Returns:
        str: The complete prompt text.
    """
    # "\\" below produces the same single backslash the prompt always
    # contained; the previous "\ " was an invalid escape sequence, which
    # newer Python versions warn about.
    return f"""
Process the given section of a Cyber Threat Intelligence (CTI) report and identify all attacker actions associated with MITRE ATT&CK techniques to recreate the attack flow from the perspective of a cybersecurity analyst familiar with the MITRE ATT&CK framework.
Identify attacker actions in the order they occurred, along with affected assets and any prerequisites between actions.
When assigning MITRE techniques, be as precise as possible based only on the report’s content.
Output your findings in JSON format with:
Action Name
Tactic ID
Technique ID/Sub-technique ID
Label(s)
Affected Assets
Prerequisites

{{techniques[
{{
  "action_name": "Example technique",
  "tactic_id": "TA000X",
  "technique_id": "TXXXX",
  "label":["action_name"-"technique_id"],
  "affected_assets": ["Example Affected Asset 1","Example Affected Asset 2"],
  "prerequisite": ["Action Name - Technqiue ID Prior Action in the flow"]
}}
]
}}


If you use double quotes (\") in any results, please escape them with \\ to avoid poor JSON formating.

Respond with only the required JSON, DO NOT include any preamble or other comments, return only the JSON.

{processed_texts}

"""



# Wrapper


In [22]:
# NOTE: this class rebinds the name `Output` that the imports cell also
# brings in from ipywidgets.
class Output:
  """Run the detailed report prompt against one LLM backend.

  Construction does all the work: it picks the backend, builds the prompt
  from the processor's accumulated text and sends it (with the extracted
  images) to the model. The reply is kept in ``self.report_summarised``; its
  type depends on the backend.
  """

  def __init__(self, doc_processor, model="gemini"):
    """
    Args:
      doc_processor: Object whose ``get_processed_texts()`` returns a dict
        with the keys ``name``, ``content`` and ``images``.
      model: One of "llama", "gemini", "deepseek", "gpt" or "mistral".

    Raises:
      ValueError: If ``model`` is not one of the supported names. Previously
        an unknown name only failed later with an AttributeError.
    """
    #Define Model (lambdas keep backend construction lazy)
    backends = {
      "llama": lambda: LlamaAPI(),
      "gemini": lambda: GeminiAPI(),
      "deepseek": lambda: DeepSeekAPI(),
      "gpt": lambda: OpenAIAPI(),
      "mistral": lambda: MistralAPI(),
    }
    if model not in backends:
      raise ValueError(
        f"Unknown model {model!r}; expected one of {sorted(backends)}"
      )
    self.model = backends[model]()

    #Get Data
    self.doc_processor = doc_processor
    self.doc_object = doc_processor.get_processed_texts()
    # splitext copes with names that contain several dots or none at all,
    # where unpacking name.split(".") into two values raised ValueError.
    self.doc_name = os.path.splitext(self.doc_object['name'])[0]
    self.processed_texts = self.doc_object['content']
    self.processed_images = self.doc_object['images']

    #Query
    self.query = report_query(self.processed_texts)

    self.report_summarised = self.model.get_full_response(self.query,self.processed_images)


# Output

In [25]:
# @title Full File Upload (Unlimited Input)
# Parameter title="" sets the incident title: the value to be shown on top of the graph.
# Creating the processor displays the upload widget; uploaded files are parsed as they arrive.
doc_processor = DocumentProcessor(title="Swift")


FileUpload(value={}, description='Upload', multiple=True)

File Processed!: Sony_Source_.pdf
Processing Completed!


In [26]:
# @title Minimum File Upload (Small Input)
# Separate processor (with its own upload widget) for the small input variant.
doc_processor_small = DocumentProcessor(title="Swift")

FileUpload(value={}, description='Upload', multiple=True)

File Processed!: Sony_Source_.pdf
Processing Completed!


In [None]:
# @title Medium File Upload (Moderate Input)
# Separate processor (with its own upload widget) for the moderate input variant.
doc_processor_medium = DocumentProcessor(title="Swift")

FileUpload(value={}, description='Upload', multiple=True)

File Processed!: Source2.pdf
Processing Completed!


# Experiment

In [28]:
# @title Helper Variables

# One run counter per backend; every experiment cell increments its own
# counter and uses it as the suffix of the JSON file it writes.
counter_llama = counter_gemini = counter_deepseek = counter_gpt = counter_mistral = 0

In [31]:
# @title LLAMA
counter_llama += 1
o = Output(doc_processor_small,"llama")
# parse_json_markdown (imported at the top of the notebook) accepts the reply
# with or without a ```json fence. The previous manual slicing plus an appended
# "}" only produced valid JSON for unfenced replies.
data = parse_json_markdown(o.report_summarised.text)

# Name the file after the processor that supplied the input, so input and
# file name cannot drift apart.
with open(f"{doc_processor_small.incident_title}_LLama_{counter_llama}.json", "w", encoding="utf-8") as f:
  json.dump(data, f, indent=2)


In [None]:
# @title GEMINI

counter_gemini += 1
o = Output(doc_processor_small,"gemini")
# Remove an optional ```json ... ``` fence around the reply before parsing.
text = o.report_summarised.text.strip()
if text.startswith("```json"):
    text = text[len("```json"):].strip()
if text.endswith("```"):
    text = text[:-3].strip()
data = json.loads(text)
# Name the file after the processor that supplied the input; the title was
# previously read from doc_processor_medium although the prompt came from
# doc_processor_small.
with open(f"{doc_processor_small.incident_title}_Gemini_{counter_gemini}.json", "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2)

In [None]:
# @title DeepSeek
counter_deepseek += 1
o = Output(doc_processor_small,"deepseek")
# This backend returns the reply as a plain string. parse_json_markdown
# (imported at the top of the notebook) handles fenced and unfenced replies;
# strip('`').lstrip('json') broke when anything followed the closing fence and
# lstrip removes a character set rather than the "json" prefix.
data = parse_json_markdown(o.report_summarised)
# Name the file after the processor that supplied the input.
with open(f"{doc_processor_small.incident_title}_DeepSeek_{counter_deepseek}.json", "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2)

In [None]:
# @title Mistral
counter_mistral += 1
o = Output(doc_processor,"mistral")
# parse_json_markdown (imported at the top of the notebook) handles fenced and
# unfenced replies; strip('`').lstrip('json') broke when anything followed the
# closing fence and lstrip removes a character set rather than the "json" prefix.
data = parse_json_markdown(o.report_summarised.text)
with open(f"{doc_processor.incident_title}_MISTRAL_{counter_mistral}.json", "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2)

⚠ Images are ignored for Ministral 8B model.


In [None]:
# @title GPT
counter_gpt += 1
o = Output(doc_processor_small,"gpt")
# This backend returns the reply as a plain string. parse_json_markdown
# (imported at the top of the notebook) handles fenced and unfenced replies;
# strip('`').lstrip('json') broke when anything followed the closing fence and
# lstrip removes a character set rather than the "json" prefix.
data = parse_json_markdown(o.report_summarised)
# Name the file after the processor that supplied the input.
with open(f"{doc_processor_small.incident_title}_GPT_{counter_gpt}.json", "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2)
