# Variables

In [0]:
import os
import requests

def get_databricks_http_path(DATABRICKS_HOST: str, DATABRICKS_PAT: str) -> str:
    """Look up the ODBC/HTTP path of a SQL warehouse in a Databricks workspace.

    Calls ``GET https://<host>/api/2.0/sql/warehouses`` with the personal
    access token as a bearer token and reads ``odbc_params.path`` from the
    returned warehouse entries.

    Args:
        DATABRICKS_HOST: Workspace host name without scheme (``https://`` is
            prepended here).
        DATABRICKS_PAT: Personal access token used for the ``Authorization``
            header.

    Returns:
        The path of the last listed warehouse that exposes one, or ``""`` when
        the request does not return HTTP 200 or no warehouse carries a path.

    Raises:
        requests.RequestException: On connection problems or when the request
            times out; these are not swallowed.
    """
    headers = {"Authorization": f"Bearer {DATABRICKS_PAT}"}
    # List all SQL Warehouses. The timeout keeps the notebook from hanging
    # indefinitely when the workspace is unreachable.
    resp = requests.get(
        f"https://{DATABRICKS_HOST}/api/2.0/sql/warehouses",
        headers=headers,
        timeout=30,
    )

    if resp.status_code != 200:
        print("Error fetching warehouses:", resp.text)
        return ""

    http_path = ""
    for wh in resp.json().get("warehouses") or []:
        # Entries without ODBC parameters are skipped instead of raising KeyError.
        path = (wh.get("odbc_params") or {}).get("path")
        if path:
            # NOTE(review): with several warehouses the last one wins, as
            # before — confirm that this is the intended selection rule.
            http_path = path
    return http_path


# ==== CONFIG ====
# Every os.getenv() setting below can be overridden through its environment
# variable; the literals are only fallbacks.
CATALOG = os.getenv("DATABRICKS_CATALOG", "btv_dc30")
SCOPE = os.getenv("DATABRICKS_SCOPE", "btv_dc30")  # secret scope passed to dbutils.secrets.get below
SOURCE_SCHEMA = os.getenv("DATABRICKS_SOURCE_SCHEMA", "silver")
TARGET_SCHEMA = os.getenv("DATABRICKS_TARGET_SCHEMA", "gold")

# Workspace host name WITHOUT scheme: get_databricks_http_path() prepends
# "https://" and SQLALCHEMY_URL prepends "databricks://".
# A notebook-context lookup of the API URL used to precede this assignment,
# but its result was unconditionally overwritten here, so it was dropped.
DATABRICKS_HOST = os.getenv("DATABRICKS_HOST", "dbc-2ee3e0e1-ed8a.cloud.databricks.com")
DATABRICKS_TOKEN = dbutils.secrets.get(SCOPE, "databricks-pat")
DATABRICKS_HTTP_PATH = get_databricks_http_path(DATABRICKS_HOST, DATABRICKS_TOKEN)

# NOTE: this URL embeds the access token — do not print or log it.
SQLALCHEMY_URL = (
    f"databricks://token:{DATABRICKS_TOKEN}@{DATABRICKS_HOST}?"
    f"http_path={DATABRICKS_HTTP_PATH}&catalog={CATALOG}"
)

# OpenAI
os.environ["OPENAI_API_KEY"] = dbutils.secrets.get(SCOPE, "openai-token")

# Documentation folders. The per-source defaults are joined onto
# VECTOR_DOCS_DIR with os.path.join so that an absolute VECTOR_DOCS_DIR stays
# absolute (a "./" prefix would turn it into a relative path).
VECTOR_DOCS_DIR = os.getenv("VECTOR_DOCS_DIR", "./vector_docs")
CIM_DOCS_DIR = os.getenv("CIM_DOCS_DIR", os.path.join(VECTOR_DOCS_DIR, "splunk_cim"))
ZEEK_DOCS_DIR = os.getenv("ZEEK_DOCS_DIR", os.path.join(VECTOR_DOCS_DIR, "zeek"))
SYSMON_DOCS_DIR = os.getenv("SYSMON_DOCS_DIR", os.path.join(VECTOR_DOCS_DIR, "sysmon"))
OSQUERY_DOCS_DIR = os.getenv("OSQUERY_DOCS_DIR", os.path.join(VECTOR_DOCS_DIR, "osquery"))
# Chroma persistence folder, one per catalog.
CHROMA_DIR = os.getenv("CHROMA_DIR", f"./chroma_{CATALOG}_index")

# Install dependencies 

In [0]:
%pip install -q databricks-sql-connector==4.0.5 \
    sqlalchemy==2.0.22 \
    databricks-sqlalchemy==2.0.7 \
    llama-index-llms-openai==0.5.4 \
    llama-index-embeddings-openai==0.5.0 \
    chromadb==1.0.20 \
    requests \
    llama-index==0.13.2 \
    llama-index-vector-stores-chroma==0.5.0 \
    beautifulsoup4==4.13.4 \
    tiktoken==0.11.0

[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
# Load dependencies

In [0]:
import os
import re
import json
import requests
from typing import List, Dict, Tuple


from sqlalchemy import create_engine, inspect, text


# ==== LlamaIndex / Vector DB ====
from llama_index.core import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    Document,
    StorageContext,
    Settings,
    load_index_from_storage
)
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import MetadataMode
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core.callbacks import CallbackManager, TokenCountingHandler
import chromadb
import tiktoken

# Init LLM and embedding models

The code snippet configures a text processing environment using OpenAI models. It sets up an embedding model `text-embedding-3-large` through OpenAIEmbedding for generating vector representations of text, and a language model `gpt-4o` via OpenAI with deterministic output `temperature=0`. 

Both models use the API key stored in the environment variable `OPENAI_API_KEY`. Additionally, a SentenceSplitter named `NODE_PARSER` is initialized to split text into chunks of up to `1024` tokens with an overlap of `100` tokens between consecutive chunks, enabling manageable and context-aware processing for downstream tasks.


In [0]:
# Register the embedding model and the LLM on the global LlamaIndex Settings
# object; both authenticate with the key held in OPENAI_API_KEY.
Settings.embed_model = OpenAIEmbedding(
    api_key=os.environ["OPENAI_API_KEY"],
    model="text-embedding-3-large",
)
Settings.llm = OpenAI(
    api_key=os.environ["OPENAI_API_KEY"],
    model="gpt-4o",
    temperature=0,
)

# Splitter used to cut loaded documents into nodes before indexing.
NODE_PARSER = SentenceSplitter(chunk_overlap=100, chunk_size=1024)

In [0]:
# ==== CIM MODELS ====
# Names of the Splunk CIM data models, in the same order as the MODEL_HINTS keys.
CIM_DATA_MODELS = [
    "Alerts",
    "Authentication",
    "Certificates",
    "Change",
    "Data Access",
    "Data Loss Prevention",
    "Databases",
    "Email",
    "Endpoint",
    "Event Signatures",
    "Interprocess Messaging",
    "Intrusion Detection",
    "Inventory",
    "Java Virtual Machines (JVM)",
    "Malware",
    "Network Resolution (DNS)",
    "Network Sessions",
    "Network Traffic",
    "Performance",
    "Splunk Audit Logs",
    "TicketManagement",
    "Updates",
    "Vulnerabilities",
    "Web",
]


MODEL_HINTS = {
    "Alerts": {
        "app",
        "description",
        "dest",
        "dest_bunit",
        "dest_category",
        "dest_priority",
        "dest_type",
        "id",
        "mitre_technique_id",
        "severity",
        "severity_id",
        "signature",
        "signature_id",
        "src",
        "src_bunit",
        "src_category",
        "src_priority",
        "src_type",
        "tag",
        "type",
        "user",
        "user_bunit",
        "user_category",
        "user_name",
        "user_priority",
        "vendor_account",
        "vendor_product_id",
        "vendor_region"
    },
    "Authentication": {
        "action",
        "app",
        "authentication_method",
        "authentication_service",
        "dest",
        "dest_bunit",
        "dest_category",
        "dest_nt_domain",
        "dest_priority",
        "duration",
        "reason",
        "response_time",
        "signature",
        "signature_id",
        "src",
        "src_bunit",
        "src_category",
        "src_nt_domain",
        "src_priority",
        "src_user",
        "src_user_bunit",
        "src_user_category",
        "src_user_id",
        "src_user_priority",
        "src_user_role",
        "src_user_type",
        "tag",
        "user",
        "user_agent",
        "user_bunit",
        "user_category",
        "user_id",
        "user_priority",
        "user_role",
        "user_type",
        "vendor_account",
        "vendor_product"
    },
    "Certificates": {
        "dest",
        "dest_bunit",
        "dest_category",
        "dest_port",
        "dest_priority",
        "duration",
        "response_time",
        "src",
        "src_bunit",
        "src_category",
        "src_port",
        "src_priority",
        "ssl_end_time",
        "ssl_engine",
        "ssl_hash",
        "ssl_is_valid",
        "ssl_issuer",
        "ssl_issuer_common_name",
        "ssl_issuer_email",
        "ssl_issuer_email_domain",
        "ssl_issuer_locality",
        "ssl_issuer_organization",
        "ssl_issuer_state",
        "ssl_issuer_street",
        "ssl_issuer_unit",
        "ssl_name",
        "ssl_policies",
        "ssl_publickey",
        "ssl_publickey_algorithm",
        "ssl_serial",
        "ssl_session_id",
        "ssl_signature_algorithm",
        "ssl_start_time",
        "ssl_subject",
        "ssl_subject_common_name",
        "ssl_subject_email",
        "ssl_subject_email_domain",
        "ssl_subject_locality",
        "ssl_subject_organization",
        "ssl_subject_state",
        "ssl_subject_street",
        "ssl_subject_unit",
        "ssl_validity_window",
        "ssl_version",
        "tag",
        "transport"
    },
    "Change": {
        "action",
        "change_type",
        "command",
        "dest",
        "dest_bunit",
        "dest_category",
        "dest_ip_range",
        "dest_nt_domain",
        "dest_port_range",
        "dest_priority",
        "direction",
        "dvc",
        "image_id",
        "instance_type",
        "object",
        "object_attrs",
        "object_category",
        "object_id",
        "object_path",
        "protocol",
        "result",
        "result_id",
        "rule_action",
        "src",
        "src_bunit",
        "src_category",
        "src_ip_range",
        "src_nt_domain",
        "src_port_range",
        "src_priority",
        "src_user",
        "src_user_bunit",
        "src_user_category",
        "src_user_name",
        "src_user_priority",
        "src_user_type",
        "status",
        "tag",
        "user",
        "user_agent",
        "user_name",
        "user_type",
        "vendor_account",
        "vendor_product",
        "vendor_product_id",
        "vendor_region"
    },
    "Data Access": {
        "action",
        "app",
        "app_id",
        "dest",
        "dest_name",
        "dest_url",
        "dvc",
        "email",
        "object",
        "object_category",
        "object_id",
        "object_path",
        "object_size",
        "owner",
        "owner_email",
        "owner_id",
        "parent_object",
        "parent_object_category",
        "parent_object_id",
        "src",
        "user",
        "user_agent",
        "user_group",
        "user_role",
        "vendor_account",
        "vendor_product"
    },
    "Data Loss Prevention": {
        "action",
        "app",
        "category",
        "dest",
        "dest_bunit",
        "dest_category",
        "dest_priority",
        "dest_zone",
        "dlp_type",
        "dvc",
        "dvc_bunit",
        "dvc_category",
        "dvc_priority",
        "dvc_zone",
        "object",
        "object_category",
        "object_path",
        "severity",
        "severity_id",
        "signature",
        "signature_id",
        "src",
        "src_bunit",
        "src_category",
        "src_priority",
        "src_user",
        "src_user_bunit",
        "src_user_category",
        "src_user_priority",
        "src_zone",
        "tag",
        "user",
        "user_bunit",
        "user_category",
        "user_priority",
        "vendor_product"
    },
    "Databases": {
        "availability",
        "avg_executions",
        "buffer_cache_hit_ratio",
        "commits",
        "cpu_used",
        "cursor",
        "dest",
        "dest_bunit",
        "dest_category",
        "dest_priority",
        "dump_area_used",
        "duration",
        "elapsed_time",
        "free_bytes",
        "indexes_hit",
        "instance_name",
        "instance_reads",
        "instance_version",
        "instance_writes",
        "last_call_minute",
        "lock_mode",
        "lock_session_id",
        "logical_reads",
        "logon_time",
        "machine",
        "memory_sorts",
        "number_of_users",
        "obj_name",
        "object",
        "os_pid",
        "physical_reads",
        "process_limit",
        "processes",
        "query",
        "query_id",
        "query_plan_hit",
        "query_time",
        "records_affected",
        "response_time",
        "seconds_in_wait",
        "serial_num",
        "session_id",
        "session_limit",
        "session_status",
        "sessions",
        "sga_buffer_cache_size",
        "sga_buffer_hit_limit",
        "sga_data_dict_hit_ratio",
        "sga_fixed_area_size",
        "sga_free_memory",
        "sga_library_cache_size",
        "sga_redo_log_buffer_size",
        "sga_shared_pool_size",
        "sga_sql_area_size",
        "src",
        "src_bunit",
        "src_category",
        "src_priority",
        "start_time",
        "stored_procedures_called",
        "table_scans",
        "tables_hit",
        "tablespace_name",
        "tablespace_reads",
        "tablespace_status",
        "tablespace_used",
        "tablespace_writes",
        "tag",
        "user",
        "user_bunit",
        "user_category",
        "user_priority",
        "vendor_product",
        "wait_state",
        "wait_time"
    },
    "Email": {
        "action",
        "delay",
        "dest",
        "dest_bunit",
        "dest_category",
        "dest_priority",
        "duration",
        "file_hash",
        "file_name",
        "file_size",
        "filter_action",
        "filter_score",
        "internal_message_id",
        "message_id",
        "message_info",
        "orig_dest",
        "orig_recipient",
        "orig_src",
        "process",
        "process_id",
        "protocol",
        "recipient",
        "recipient_count",
        "recipient_domain",
        "recipient_status",
        "response_time",
        "retries",
        "return_addr",
        "signature",
        "signature_extra",
        "signature_id",
        "size",
        "src",
        "src_bunit",
        "src_category",
        "src_priority",
        "src_user",
        "src_user_bunit",
        "src_user_category",
        "src_user_domain",
        "src_user_priority",
        "status_code",
        "subject",
        "tag",
        "url",
        "user",
        "user_bunit",
        "user_category",
        "user_priority",
        "vendor_product",
        "xdelay",
        "xref"
    },
    "Endpoint": {
        "action",
        "cpu_load_percent",
        "creation_time",
        "description",
        "dest",
        "dest_bunit",
        "dest_category",
        "dest_is_expected",
        "dest_port",
        "dest_priority",
        "dest_requires_av",
        "dest_should_timesync",
        "dest_should_update",
        "file_access_time",
        "file_acl",
        "file_create_time",
        "file_hash",
        "file_modify_time",
        "file_name",
        "file_path",
        "file_size",
        "mem_used",
        "original_file_name",
        "os",
        "parent_process",
        "parent_process_exec",
        "parent_process_guid",
        "parent_process_id",
        "parent_process_name",
        "parent_process_path",
        "process",
        "process_current_directory",
        "process_exec",
        "process_guid",
        "process_hash",
        "process_id",
        "process_integrity_level",
        "process_name",
        "process_path",
        "registry_hive",
        "registry_key_name",
        "registry_path",
        "registry_value_data",
        "registry_value_name",
        "registry_value_text",
        "registry_value_type",
        "service",
        "service_dll",
        "service_dll_hash",
        "service_dll_path",
        "service_dll_signature_exists",
        "service_dll_signature_verified",
        "service_exec",
        "service_hash",
        "service_id",
        "service_name",
        "service_path",
        "service_signature_exists",
        "service_signature_verified",
        "src",
        "src_category",
        "src_port",
        "src_priority",
        "src_requires_av",
        "src_should_timesync",
        "src_should_update",
        "start_mode",
        "state",
        "status",
        "tag",
        "transport",
        "transport_dest_port",
        "user",
        "user_bunit",
        "user_category",
        "user_id",
        "user_priority",
        "vendor_product"
    },
    "Event Signatures": {
        "dest",
        "dest_bunit",
        "dest_category",
        "dest_priority",
        "signature",
        "signature_id",
        "tag",
        "vendor_product"
    },
    "Interprocess Messaging": {
        "dest",
        "dest_bunit",
        "dest_category",
        "dest_priority",
        "duration",
        "endpoint",
        "endpoint_version",
        "message",
        "message_consumed_time",
        "message_correlation_id",
        "message_delivered_time",
        "message_delivery_mode",
        "message_expiration_time",
        "message_id",
        "message_priority",
        "message_properties",
        "message_received_time",
        "message_redelivered",
        "message_reply_dest",
        "message_type",
        "parameters",
        "payload",
        "payload_type",
        "request_payload",
        "request_payload_type",
        "request_sent_time",
        "response_code",
        "response_payload_type",
        "response_received_time",
        "response_time",
        "return_message",
        "rpc_protocol",
        "status",
        "tag"
    },
    "Intrusion Detection": {
        "action",
        "category",
        "dest",
        "dest_bunit",
        "dest_category",
        "dest_port",
        "dest_priority",
        "dvc",
        "dvc_bunit",
        "dvc_category",
        "dvc_priority",
        "file_hash",
        "file_name",
        "file_path",
        "ids_type",
        "severity",
        "severity_id",
        "signature",
        "signature_id",
        "src",
        "src_bunit",
        "src_category",
        "src_priority",
        "tag",
        "transport",
        "user",
        "user_bunit",
        "user_category",
        "user_priority",
        "vendor_product"
    },
    "Inventory": {
        "array",
        "blocksize",
        "cluster",
        "cpu_cores",
        "cpu_count",
        "cpu_mhz",
        "description",
        "dest",
        "dest_bunit",
        "dest_category",
        "dest_ip",
        "dest_priority",
        "dns",
        "enabled",
        "family",
        "fd_max",
        "hypervisor",
        "hypervisor_id",
        "inline_nat",
        "interactive",
        "interface",
        "ip",
        "latency",
        "lb_method",
        "mac",
        "mem",
        "mount",
        "name",
        "node",
        "node_port",
        "os",
        "parent",
        "password",
        "read_blocks",
        "read_latency",
        "read_ops",
        "serial",
        "shell",
        "size",
        "snapshot",
        "src_ip",
        "status",
        "storage",
        "tag",
        "time",
        "user",
        "user_bunit",
        "user_category",
        "user_id",
        "user_priority",
        "vendor_product",
        "version",
        "vip_port",
        "write_blocks",
        "write_latency",
        "write_ops"
    },
    "Java Virtual Machines (JVM)": {
        "cm_enabled",
        "cm_supported",
        "committed_memory",
        "compilation_time",
        "cpu_time",
        "cpu_time_enabled",
        "cpu_time_supported",
        "current_cpu_time",
        "current_loaded",
        "current_user_time",
        "daemon_thread_count",
        "free_physical_memory",
        "free_swap",
        "heap_committed",
        "heap_initial",
        "heap_max",
        "heap_used",
        "jvm_description",
        "max_file_descriptors",
        "non_heap_committed",
        "non_heap_initial",
        "non_heap_max",
        "non_heap_used",
        "objects_pending",
        "omu_supported",
        "open_file_descriptors",
        "os",
        "os_architecture",
        "os_version",
        "peak_thread_count",
        "physical_memory",
        "process_name",
        "start_time",
        "swap_space",
        "synch_supported",
        "system_load",
        "tag",
        "thread_count",
        "threads_started",
        "total_loaded",
        "total_processors",
        "total_unloaded",
        "uptime",
        "vendor_product",
        "version"
    },
    "Malware": {
        "action",
        "category",
        "date",
        "dest",
        "dest_bunit",
        "dest_category",
        "dest_nt_domain",
        "dest_priority",
        "dest_requires_av",
        "file_hash",
        "file_name",
        "file_path",
        "product_version",
        "sender",
        "severity_id",
        "signature",
        "signature_id",
        "signature_version",
        "src",
        "src_bunit",
        "src_category",
        "src_priority",
        "tag",
        "url",
        "user",
        "user_bunit",
        "user_category",
        "user_priority",
        "vendor_product"
    },
    "Network Resolution (DNS)": {
        "additional_answer_count",
        "answer",
        "answer_count",
        "authority_answer_count",
        "dest",
        "dest_bunit",
        "dest_category",
        "dest_port",
        "dest_priority",
        "duration",
        "message_type",
        "name",
        "query",
        "query_count",
        "query_type",
        "record_type",
        "reply_code",
        "reply_code_id",
        "response_time",
        "src",
        "src_bunit",
        "src_category",
        "src_port",
        "src_priority",
        "tag",
        "transaction_id",
        "transport",
        "ttl",
        "vendor_product"
    },
    "Network Sessions": {
        "action",
        "dest_bunit",
        "dest_category",
        "dest_dns",
        "dest_ip",
        "dest_mac",
        "dest_nt_host",
        "dest_priority",
        "duration",
        "lease_duration",
        "lease_scope",
        "response_time",
        "signature",
        "signature_id",
        "src_bunit",
        "src_category",
        "src_dns",
        "src_ip",
        "src_mac",
        "src_nt_host",
        "src_priority",
        "tag",
        "user",
        "user_bunit",
        "user_category",
        "user_priority",
        "vendor_product"
    },
    "Network Traffic": {
        "action",
        "app",
        "bytes",
        "bytes_in",
        "bytes_out",
        "channel",
        "dest",
        "dest_bunit",
        "dest_category",
        "dest_interface",
        "dest_ip",
        "dest_mac",
        "dest_port",
        "dest_priority",
        "dest_translated_ip",
        "dest_translated_port",
        "dest_zone",
        "direction",
        "duration",
        "dvc",
        "dvc_bunit",
        "dvc_category",
        "dvc_ip",
        "dvc_mac",
        "dvc_priority",
        "dvc_zone",
        "flow_id",
        "icmp_code",
        "icmp_type",
        "packets",
        "packets_in",
        "packets_out",
        "process_id",
        "protocol",
        "protocol_version",
        "response_time",
        "rule",
        "session_id",
        "src",
        "src_bunit",
        "src_category",
        "src_interface",
        "src_ip",
        "src_mac",
        "src_port",
        "src_priority",
        "src_translated_ip",
        "src_translated_port",
        "src_zone",
        "ssid",
        "tag",
        "tcp_flag",
        "tos",
        "transport",
        "ttl",
        "user",
        "user_bunit",
        "user_category",
        "user_priority",
        "vendor_account",
        "vendor_product",
        "vlan",
        "wifi"
    },
    "Performance": {
        "action",
        "array",
        "blocksize",
        "cluster",
        "cpu_load_mhz",
        "cpu_load_percent",
        "cpu_time",
        "cpu_user_percent",
        "dest",
        "dest_bunit",
        "dest_category",
        "dest_priority",
        "dest_should_timesync",
        "dest_should_update",
        "fan_speed",
        "fd_max",
        "fd_used",
        "hypervisor_id",
        "latency",
        "mem",
        "mem_committed",
        "mem_free",
        "mem_used",
        "mount",
        "parent",
        "power",
        "read_blocks",
        "read_latency",
        "read_ops",
        "resource_type",
        "signature",
        "signature_id",
        "storage",
        "storage_free",
        "storage_free_percent",
        "storage_used",
        "storage_used_percent",
        "swap",
        "swap_free",
        "swap_used",
        "tag",
        "temperature",
        "thruput",
        "thruput_max",
        "uptime",
        "write_blocks",
        "write_latency",
        "write_ops"
    },
    "Splunk Audit Logs": {
        "access_count",
        "access_time",
        "action_mode",
        "action_name",
        "action_status",
        "app",
        "buckets",
        "buckets_size",
        "complete",
        "component",
        "cron",
        "datamodel",
        "digest",
        "duration",
        "earliest",
        "event_id",
        "host",
        "info",
        "is_inprogress",
        "last_error",
        "last_sid",
        "latest",
        "mod_time",
        "orig_rid",
        "orig_sid",
        "retention",
        "rid",
        "savedsearch_name",
        "search",
        "search_et",
        "search_lt",
        "search_name",
        "search_type",
        "sid",
        "signature",
        "size",
        "source",
        "sourcetype",
        "spent",
        "splunk_server",
        "status",
        "summary_id",
        "uri",
        "user",
        "user_bunit",
        "user_category",
        "user_priority",
        "view"
    },
    "TicketManagement": {
        "affect_dest",
        "change",
        "comments",
        "description",
        "dest",
        "dest_bunit",
        "dest_category",
        "dest_priority",
        "incident",
        "priority",
        "problem",
        "severity",
        "severity_id",
        "splunk_id",
        "splunk_realm",
        "src_user",
        "src_user_bunit",
        "src_user_category",
        "src_user_priority",
        "status",
        "tag",
        "ticket_id",
        "time_submitted",
        "user",
        "user_bunit",
        "user_category",
        "user_priority"
    },
    "Updates": {
        "dest",
        "dest_bunit",
        "dest_category",
        "dest_priority",
        "dest_should_update",
        "dvc",
        "file_hash",
        "file_name",
        "severity",
        "severity_id",
        "signature",
        "signature_id",
        "status",
        "tag",
        "vendor_product"
    },
    "Vulnerabilities": {
        "bugtraq",
        "category",
        "cert",
        "cve",
        "cvss",
        "dest",
        "dest_bunit",
        "dest_category",
        "dest_priority",
        "dvc",
        "dvc_bunit",
        "dvc_category",
        "dvc_priority",
        "msft",
        "mskb",
        "severity",
        "severity_id",
        "signature",
        "signature_id",
        "tag",
        "url",
        "user",
        "user_bunit",
        "user_category",
        "user_priority",
        "vendor_product",
        "xref"
    },
    "Web": {
        "action",
        "app",
        "bytes",
        "bytes_in",
        "bytes_out",
        "cached",
        "category",
        "cookie",
        "dest",
        "dest_bunit",
        "dest_category",
        "dest_port",
        "dest_priority",
        "duration",
        "error_code",
        "http_content_type",
        "http_method",
        "http_referrer",
        "http_referrer_domain",
        "http_user_agent",
        "http_user_agent_length",
        "operation",
        "response_time",
        "site",
        "src",
        "src_bunit",
        "src_category",
        "src_priority",
        "status",
        "storage_name",
        "tag",
        "uri_path",
        "uri_query",
        "url",
        "url_domain",
        "url_length",
        "user",
        "user_bunit",
        "user_category",
        "user_priority",
        "vendor_product"
    }
}


# Helpers to download relevant docs

The provided Python code automates the downloading and local storage of various security and logging documentation. It first ensures that the target directories exist and then defines a reusable `download_file()` to fetch files from a URL and save them locally. 

Specific functions are defined to download different sets of documents: `download_splunk_cim_json()` retrieves multiple Splunk CIM JSON files from a GitHub repository, `download_zeek_protocol_docs()` downloads a Zeek protocol PDF, `download_sysmon_docs()` fetches a Sysmon cheatsheet PDF, and `download_osquery_schema()` downloads the osquery schema JSON file. 

Finally, `download_all_docs()` orchestrates these calls so that all relevant documentation is downloaded and organized into its respective directories.

In [0]:
import os
import requests
from bs4 import BeautifulSoup

# Directory to store downloaded documents. Created up front; exist_ok=True
# makes re-running this cell harmless when the folder is already there.
os.makedirs(VECTOR_DOCS_DIR, exist_ok=True)

def download_file(url, local_path, timeout=60):
    """Download a file from a URL and save it locally.

    Args:
        url: Address to fetch with an HTTP GET.
        local_path: Destination file; its parent directory must already exist.
        timeout: Seconds passed to ``requests.get`` so that an unresponsive
            server cannot block the notebook forever.

    Returns:
        True when the file was written, False when the request failed (the
        failure is printed, not raised, so one bad URL does not abort a batch).

    Raises:
        OSError: If ``local_path`` cannot be opened or written.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Failed to download {url}: {e}")
        return False

    with open(local_path, 'wb') as file:
        file.write(response.content)
    print(f"Downloaded: {local_path}")
    return True

def download_splunk_cim_json():
    """Download all Splunk CIM JSON files into ``CIM_DOCS_DIR``.

    Each name in ``files`` is appended to ``base_url`` and stored under the
    same file name locally. Individual failures are reported by
    ``download_file`` and do not stop the remaining downloads.
    """
    base_url = "https://raw.githubusercontent.com/splunk/addonfactory-splunk_sa_cim/master/default/data/models/"
    files = [
        "Alerts.json",
        "Application_State.json",
        "Authentication.json",
        "Certificates.json",
        "Change.json",
        "Change_Analysis.json",
        "Compute_Inventory.json",
        "DLP.json",
        "Data_Access.json",
        "Databases.json",
        "Email.json",
        "Endpoint.json",
        "Event_Signatures.json",
        "Interprocess_Messaging.json",
        "Intrusion_Detection.json",
        "JVM.json",
        "Malware.json",
        "Network_Resolution.json",
        "Network_Sessions.json",
        "Network_Traffic.json",
        "Performance.json",
        "Splunk_Audit.json",
        "Splunk_CIM_Validation.json",
        "Ticket_Management.json",
        "Updates.json",
        "Vulnerabilities.json",
        "Web.json",
    ]
    os.makedirs(CIM_DOCS_DIR, exist_ok=True)
    # Iterate the repository file names, not the MODEL_HINTS display names
    # (e.g. "Data Access"), which carry no ".json" suffix and do not match the
    # file names in the repository.
    for file in files:
        download_file(base_url + file, os.path.join(CIM_DOCS_DIR, file))

def download_zeek_protocol_docs():
    """Download Zeek protocol documentation into ``ZEEK_DOCS_DIR``.

    Every URL in ``urls`` is stored under its own base name. The target is the
    Zeek folder rather than the Sysmon one, so the documents end up in a
    directory named after their own source.
    """
    urls = [
        "https://f.hubspotusercontent00.net/hubfs/8645105/Corelight_May2021/Pdf/002_CORELIGHT_080420_ZEEK_LOGS_US_ONLINE.pdf"
    ]
    for url in urls:
        local_path = os.path.join(ZEEK_DOCS_DIR, os.path.basename(url))
        os.makedirs(os.path.dirname(local_path), exist_ok=True)
        download_file(url, local_path)


def download_sysmon_docs():
    """Fetch the Sysmon reference documents into ``SYSMON_DOCS_DIR``.

    Each source URL is saved under its own base name; the target folder is
    created on demand before the download starts.
    """
    sources = ("https://networkforensic.dk/Sysmon/Files/Sysmon-Cheatsheet.pdf",)
    for source in sources:
        target = os.path.join(SYSMON_DOCS_DIR, os.path.basename(source))
        os.makedirs(os.path.dirname(target), exist_ok=True)
        download_file(source, target)

def download_osquery_schema():
    """Fetch the osquery 5.14.1 schema JSON into ``OSQUERY_DOCS_DIR``."""
    source = "https://raw.githubusercontent.com/jmpsec/osctrl/refs/heads/main/deploy/osquery/data/5.14.1.json"
    target = os.path.join(OSQUERY_DOCS_DIR, "5.14.1.json")
    # Create the destination folder on demand before writing into it.
    os.makedirs(os.path.dirname(target), exist_ok=True)
    download_file(source, target)

def download_all_docs():
    """Run every documentation downloader, one after another.

    Order: Splunk CIM JSON, Zeek protocol docs, Sysmon docs, osquery schema.
    """
    steps = (
        download_splunk_cim_json,
        download_zeek_protocol_docs,
        download_sysmon_docs,
        download_osquery_schema,
    )
    for step in steps:
        step()



# Vector DB bootstrap

This Python function, `build_cim_index`, constructs or loads a persistent vector index for CIM documentation using ChromaDB. It first ensures the Chroma persistence directory exists and initializes a Chroma client and collection, wrapping it in a `ChromaVectorStore` and `StorageContext`. 

If an existing index is found in the collection, it reuses it; otherwise, it traverses the specified `vector_docs_dir`, loading all files with `SimpleDirectoryReader()`, tagging each document with its data model based on its folder, and collecting them. 

The documents are then split into nodes using `NODE_PARSER` and inserted into a new `VectorStoreIndex`, which is persisted in the Chroma directory. The function returns the vector index, either reused or newly built, while logging progress throughout.

In [0]:
def build_cim_index(vector_docs_dir: str = VECTOR_DOCS_DIR, chroma_dir: str = CHROMA_DIR) -> VectorStoreIndex:
    """Return a Chroma-backed ``VectorStoreIndex``, reusing stored data when present.

    Args:
        vector_docs_dir: Root folder that is walked recursively for documents.
        chroma_dir: Directory handed to the persistent Chroma client.

    Returns:
        An index over the existing collection when it already holds entries,
        otherwise a newly populated index.

    Raises:
        FileNotFoundError: If a build is needed and ``vector_docs_dir`` is not
            a directory.
    """
    # The persistence folder must exist before the client is created.
    os.makedirs(chroma_dir, exist_ok=True)

    chroma_client = chromadb.PersistentClient(path=chroma_dir)
    # The collection is named after the configured catalog.
    chroma_collection = chroma_client.get_or_create_collection(CATALOG)
    store = ChromaVectorStore(chroma_collection=chroma_collection)
    ctx = StorageContext.from_defaults(vector_store=store)

    # A non-empty collection means a previous run already indexed the docs.
    already_populated = chroma_collection.count() > 0
    if already_populated:
        print(f"[INFO] Found existing index in {chroma_dir}, reusing Chroma collection...")
        return VectorStoreIndex.from_vector_store(store, storage_context=ctx)

    if not os.path.isdir(vector_docs_dir):
        raise FileNotFoundError(f"CIM docs folder not found: {vector_docs_dir}")

    documents = []
    for folder, _subdirs, file_names in os.walk(vector_docs_dir):
        # Each document is tagged with its containing folder's name
        # (spaces replaced by underscores).
        folder_tag = os.path.basename(folder).replace(" ", "_")
        for file_name in file_names:
            file_path = os.path.join(folder, file_name)
            print(f"[LOAD] {file_path}")
            file_docs = SimpleDirectoryReader(input_files=[file_path]).load_data()
            for doc in file_docs:
                doc.metadata = doc.metadata or {}
                doc.metadata["data_model"] = folder_tag
            documents.extend(file_docs)

    # Start from an empty index bound to the Chroma store, then add the nodes.
    parsed_nodes = NODE_PARSER.get_nodes_from_documents(documents)
    new_index = VectorStoreIndex.from_documents([], storage_context=ctx)
    new_index.insert_nodes(parsed_nodes)

    print(f"[INFO] Built and stored new index at {chroma_dir}")
    return new_index



# Determine CIM model selection/mapping

This code defines a set of functions for mapping a Databricks table to a Splunk CIM (Common Information Model) data model and generating a corresponding CTAS (Create Table As Select) SQL statement. `choose_cim_model()` selects the best-fitting CIM data model for a table by scoring column names against predefined hints, retrieving relevant context from a vector index, and asking an LLM to pick the most appropriate model; if the LLM's answer is not one of the known data models, it falls back to the hint-based heuristic.

`map_columns_to_cim()` uses the LLM, together with CIM documentation retrieved from the vector index, to map each table column to a corresponding CIM field. The LLM returns a JSON array of source-to-CIM mappings, and any column it leaves unmapped defaults to its own name.

Finally, `generate_ctas_sql()` constructs a SQL statement to create a target schema and table in Databricks with columns renamed according to the generated mappings, effectively transforming the source table to conform to the chosen CIM model.

In [0]:
def choose_cim_model(table_name: str, columns: List[Dict], index: VectorStoreIndex) -> str:
    """Pick the Splunk CIM data model that best fits a table.

    A keyword heuristic over ``MODEL_HINTS`` produces a fallback candidate,
    then the LLM is asked to choose from ``CIM_DATA_MODELS`` using context
    retrieved from the vector index.

    Args:
        table_name: Name of the source table.
        columns: Column descriptors; each must provide a ``"name"`` key.
        index: Vector index used to retrieve context for the prompt.

    Returns:
        The LLM's choice when it matches an entry of ``CIM_DATA_MODELS``
        (exactly, or after trimming quotes/punctuation and ignoring case),
        otherwise the heuristic candidate. If neither is available the raw
        LLM answer is returned unvalidated, as before.
    """
    col_names = [c["name"] for c in columns]

    # Heuristic: count (column, hint) pairs where the hint is a substring of
    # the column name; the first model with the highest non-zero count wins.
    best_model = None
    best_score = 0
    for model, hints in MODEL_HINTS.items():
        score = sum(1 for c in col_names for h in hints if h.lower() in c.lower())
        if score > best_score:
            best_score, best_model = score, model

    retriever = index.as_retriever(similarity_top_k=4)
    context = retriever.retrieve(
        f"Which Splunk CIM data model best fits a table named {table_name} with columns: {col_names}?"
    )
    context_text = "\n\n".join(n.node.get_text() for n in context)

    candidate_list = ", ".join(CIM_DATA_MODELS)
    prompt = f"""
    You are a Splunk CIM expert.
    Pick the single best matching Splunk CIM Data Model from this list:
    {candidate_list}

    Table: {table_name}
    Columns: {col_names}

    Context:\n{context_text}

    Answer with only the exact Data Model name from the list.
    """
    resp = Settings.llm.complete(prompt)
    picked = resp.text.strip()
    if picked in CIM_DATA_MODELS:
        return picked

    # LLMs often decorate the answer ("Email", `Email`, Email.) or change its
    # case; map such near-matches back to the canonical name instead of
    # discarding an otherwise valid choice.
    cleaned = picked.strip(" \t\r\n\"'`*.").casefold()
    for candidate in CIM_DATA_MODELS:
        if candidate.casefold() == cleaned:
            return candidate

    if best_model:
        return best_model

    # No valid LLM answer and no heuristic hit: keep the historical behaviour
    # of returning the raw answer, but make the situation visible.
    print(f"[WARN] LLM answer {picked!r} for table {table_name} is not a known CIM data model")
    return picked

def _parse_cim_pairs(raw_text: str, col_names: List[str]) -> List[Tuple[str, str]]:
    """Convert the LLM's JSON reply into validated, collision-free mappings.

    Args:
        raw_text: Text returned by the LLM; expected to be a JSON array of
            objects with ``source_col`` and ``cim_field`` keys.
        col_names: Column names of the source table.

    Returns:
        ``(source_col, target_name)`` pairs covering every entry of
        ``col_names`` exactly once, with unique target names.

    Raises:
        ValueError: If the reply is not valid JSON or not a JSON array.
    """
    payload = (raw_text or "").strip()
    # Models frequently wrap JSON in a Markdown fence despite instructions.
    if payload.startswith("```"):
        payload = payload.strip("`").strip()
        if payload[:4].lower() == "json":
            payload = payload[4:]

    parsed = json.loads(payload)  # json.JSONDecodeError is a ValueError
    if not isinstance(parsed, list):
        raise ValueError("expected a JSON array of mapping objects")

    known_cols = set(col_names)
    seen_sources = set()
    candidates = []
    for item in parsed:
        if not isinstance(item, dict):
            continue
        sc = item.get("source_col")
        cf = item.get("cim_field")
        if not (isinstance(sc, str) and isinstance(cf, str) and sc and cf):
            continue
        # Ignore columns the table does not have and repeated entries; both
        # would yield a SELECT list that fails against the source table.
        if sc not in known_cols or sc in seen_sources:
            continue
        seen_sources.add(sc)
        candidates.append((sc, cf))

    # Columns the LLM left out keep their own name.
    candidates.extend((c, c) for c in col_names if c not in seen_sources)

    # Two columns must not end up with the same output name, otherwise the
    # generated CTAS has duplicate columns. Names are compared ignoring case.
    taken = set()
    result = []
    for sc, cf in candidates:
        target = cf
        if target.casefold() in taken:
            target = sc  # first fall back to the original column name
        base = target
        suffix = 2
        while target.casefold() in taken:
            target = f"{base}_{suffix}"
            suffix += 1
        taken.add(target.casefold())
        result.append((sc, target))
    return result


def map_columns_to_cim(model: str, table_name: str, columns: List[Dict], index: VectorStoreIndex) -> List[Tuple[str, str]]:
    """Map a table's columns to field names of the given Splunk CIM data model.

    Context is retrieved from the vector index and the LLM is asked for a JSON
    array of ``source_col`` / ``cim_field`` objects.

    Args:
        model: Name of the CIM data model to map to.
        table_name: Name of the source table (used in the prompt).
        columns: Column descriptors; each must provide a ``"name"`` key.
        index: Vector index used to retrieve context for the prompt.

    Returns:
        A list of ``(source_col, target_name)`` tuples covering every column.
        When the LLM reply cannot be used, every column maps to itself.
    """
    retriever = index.as_retriever(similarity_top_k=6)
    context = retriever.retrieve(
        f"Splunk CIM {model} model field reference and examples"
    )
    context_text = "\n\n".join(n.node.get_text() for n in context)

    col_names = [c["name"] for c in columns]

    prompt = f"""
    You are mapping a Databricks table's columns to Splunk CIM fields for the **{model}** data model.
    Return ONLY valid JSON without markdown, without ```json``` tags.

    Table: {table_name}
    Columns: {col_names}

    Use ts for all log event timestamps, hostname for all host identifies, computer names, etc

    Using the context excerpts from CIM docs, produce a JSON array of objects with keys: source_col, cim_field.
    If no close match, set cim_field = source_col.

    Context:\n{context_text}

    Return ONLY valid JSON.
    """
    resp = Settings.llm.complete(prompt)

    try:
        return _parse_cim_pairs(resp.text, col_names)
    except Exception as exc:
        # Deliberately broad: a bad LLM reply must not abort the whole run.
        # Report it so an identity mapping is not mistaken for a real result.
        print(f"[WARN] Could not parse column mapping for {table_name} ({exc}); using identity mapping")
        return [(c, c) for c in col_names]


def generate_ctas_sql(catalog: str, source_schema: str, target_schema: str, table: str, mappings: List[Tuple[str, str]]) -> str:
    """Build the SQL that copies a table into the target schema with renamed columns.

    Args:
        catalog: Catalog containing both schemas (inserted unquoted).
        source_schema: Schema the table is read from (inserted unquoted).
        target_schema: Schema the new table is created in (inserted unquoted).
        table: Table name, used for both source and target.
        mappings: ``(source_column, output_column)`` pairs, in SELECT order.

    Returns:
        Two statements separated by ``";\\n"``: ``CREATE SCHEMA IF NOT EXISTS``
        followed by ``CREATE TABLE IF NOT EXISTS ... AS SELECT``.
    """
    def quote(identifier) -> str:
        # Inside a backtick-delimited identifier a literal backtick is written
        # doubled; without this, a name containing one (output names come from
        # LLM output) would end the identifier early and corrupt the statement.
        return "`" + str(identifier).replace("`", "``") + "`"

    select_list = ",\n        ".join(f"{quote(src)} AS {quote(dst)}" for src, dst in mappings)
    quoted_table = quote(table)
    return f"""
CREATE SCHEMA IF NOT EXISTS {catalog}.{target_schema};
CREATE TABLE IF NOT EXISTS {catalog}.{target_schema}.{quoted_table} AS
SELECT
        {select_list}
FROM {catalog}.{source_schema}.{quoted_table};
""".strip()

# Main flow

This main function orchestrates an end-to-end workflow for mapping Databricks tables to Splunk CIM data models and optionally executing the resulting SQL. 

It is designed to first download all relevant documentation for Splunk CIM and related security tools (the `download_all_docs()` call is currently commented out, so the run relies on documents or a Chroma index that already exist locally), then build a persistent vector index of these documents for semantic retrieval. Using SQLAlchemy, it inspects the source schema to get a list of tables and their columns. For each table, it determines the best-fitting CIM model using the vector index, maps table columns to corresponding CIM fields, and generates a CTAS SQL statement to transform the table into the target schema.

All generated SQL is printed. The block that executes the statements against the database when the environment variable `EXECUTE_SQL` is set to true is currently commented out, so as written the notebook only prints the SQL.

In [0]:
def main():
    """Generate and print a CIM-mapped CTAS statement for every source table.

    Builds (or reuses) the vector index, lists the tables of ``SOURCE_SCHEMA``
    through SQLAlchemy, and for each table chooses a CIM model, maps its
    columns and prints the resulting SQL. The engine is disposed when the
    function exits, whether or not an error occurred.
    """
    # Step 1: Download documents for Splunk CIM and security tools
    #download_all_docs()

    # Step 2: Build vector index with Splunk CIM and security tool documents
    index = build_cim_index(VECTOR_DOCS_DIR, CHROMA_DIR)

    # Step 3: Obtain a list of tables
    engine = create_engine(SQLALCHEMY_URL)
    try:
        inspector = inspect(engine)
        tables = inspector.get_table_names(schema=SOURCE_SCHEMA)

        all_outputs: Dict[str, str] = {}
        for table in tables:
            cols = inspector.get_columns(table, schema=SOURCE_SCHEMA)
            model = choose_cim_model(table, cols, index)
            mappings = map_columns_to_cim(model, table, cols, index)
            sql = generate_ctas_sql(CATALOG, SOURCE_SCHEMA, TARGET_SCHEMA, table, mappings)
            all_outputs[table] = sql
            print(f"\n-- Table: {table} | CIM Model: {model}\n{sql}\n")

        # if os.getenv("EXECUTE_SQL", "false").lower() in ("1", "true", "yes"): 
        #     with engine.begin() as conn:
        #         for table, sql in all_outputs.items():
        #             for stmt in [s.strip() for s in sql.split(";\n") if s.strip()]:
        #                 conn.execute(text(stmt))
    finally:
        # Release pooled connections; otherwise each notebook re-run leaves
        # the previous engine's connections open.
        engine.dispose()


if __name__ == "__main__":
    main()

2025-09-04 00:37:20,873 - INFO - Received command c on object id p0


[INFO] Found existing index in ./chroma_btv_dc30_index, reusing Chroma collection...


2025-09-04 00:37:21,493 - INFO - Received status code 200 for POST request
2025-09-04 00:37:21,494 - INFO - HTTP Response with status code 200, message: OK
2025-09-04 00:37:21,496 - INFO - Successfully opened session 01f08927-51a1-1013-9d53-e935becd1e6f
2025-09-04 00:37:21,819 - INFO - Received status code 200 for POST request
2025-09-04 00:37:21,820 - INFO - HTTP Response with status code 200, message: OK
2025-09-04 00:37:21,871 - INFO - Backing off send_request(...) for 0.2s (requests.exceptions.ConnectionError: HTTPSConnectionPool(host='us.i.posthog.com', port=443): Max retries exceeded with url: /batch/ (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0xff1ddf68dac0>: Failed to resolve 'us.i.posthog.com' ([Errno -3] Temporary failure in name resolution)")))
2025-09-04 00:37:22,117 - INFO - Backing off send_request(...) for 0.4s (requests.exceptions.ConnectionError: HTTPSConnectionPool(host='us.i.posthog.com', port=443): Max retries exceeded with url: /b


-- Table: hmail_app | CIM Model: Email
CREATE SCHEMA IF NOT EXISTS btv_dc30.gold;
CREATE TABLE IF NOT EXISTS btv_dc30.gold.`hmail_app` AS
SELECT
        `hostname` AS `host`,
        `ts` AS `_time`,
        `service` AS `service`,
        `session_id` AS `internal_message_id`,
        `client_ip` AS `src`,
        `message` AS `message`
FROM btv_dc30.silver.`hmail_app`;



2025-09-04 00:37:30,524 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-09-04 00:37:30,815 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-04 00:37:30,993 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-09-04 00:37:32,502 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-04 00:37:32,648 - INFO - Received status code 200 for POST request
2025-09-04 00:37:32,648 - INFO - HTTP Response with status code 200, message: OK



-- Table: hmail_imapd | CIM Model: Email
CREATE SCHEMA IF NOT EXISTS btv_dc30.gold;
CREATE TABLE IF NOT EXISTS btv_dc30.gold.`hmail_imapd` AS
SELECT
        `hostname` AS `host`,
        `ts` AS `_time`,
        `service` AS `service`,
        `session_id` AS `internal_message_id`,
        `client_ip` AS `src`,
        `message` AS `message`
FROM btv_dc30.silver.`hmail_imapd`;



2025-09-04 00:37:32,848 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-09-04 00:37:33,423 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-04 00:37:33,663 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-09-04 00:37:36,663 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-04 00:37:36,789 - INFO - Received status code 200 for POST request
2025-09-04 00:37:36,789 - INFO - HTTP Response with status code 200, message: OK



-- Table: hmail_smtp | CIM Model: Email
CREATE SCHEMA IF NOT EXISTS btv_dc30.gold;
CREATE TABLE IF NOT EXISTS btv_dc30.gold.`hmail_smtp` AS
SELECT
        `hostname` AS `host`,
        `ts` AS `_time`,
        `sender` AS `src_user`,
        `recipient` AS `recipient`,
        `client_ip` AS `src`,
        `server_ip` AS `dest`,
        `protocol` AS `protocol`,
        `status_code` AS `status_code`,
        `session_id` AS `internal_message_id`
FROM btv_dc30.silver.`hmail_smtp`;



2025-09-04 00:37:36,935 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-09-04 00:37:37,476 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-04 00:37:37,611 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-09-04 00:37:39,471 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-04 00:37:39,595 - INFO - Received status code 200 for POST request
2025-09-04 00:37:39,595 - INFO - HTTP Response with status code 200, message: OK



-- Table: hmail_smtpd | CIM Model: Email
CREATE SCHEMA IF NOT EXISTS btv_dc30.gold;
CREATE TABLE IF NOT EXISTS btv_dc30.gold.`hmail_smtpd` AS
SELECT
        `hostname` AS `host`,
        `ts` AS `_time`,
        `service` AS `service`,
        `client_ip` AS `src`,
        `message` AS `message`
FROM btv_dc30.silver.`hmail_smtpd`;



2025-09-04 00:37:40,266 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-09-04 00:37:42,063 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-04 00:37:42,322 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-09-04 00:37:43,374 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-04 00:37:43,510 - INFO - Received status code 200 for POST request
2025-09-04 00:37:43,511 - INFO - HTTP Response with status code 200, message: OK



-- Table: hmail_tcpip | CIM Model: Email
CREATE SCHEMA IF NOT EXISTS btv_dc30.gold;
CREATE TABLE IF NOT EXISTS btv_dc30.gold.`hmail_tcpip` AS
SELECT
        `hostname` AS `src`,
        `ts` AS `_time`,
        `service` AS `service`,
        `message` AS `message`
FROM btv_dc30.silver.`hmail_tcpip`;



2025-09-04 00:37:43,692 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


com.databricks.backend.common.rpc.CommandCancelledException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$5(SequenceExecutionState.scala:132)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:132)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:129)
	at scala.collection.immutable.Range.foreach(Range.scala:158)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:129)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:715)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:435)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:435)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.can