# Install required packages

### Install Notes

To run this you will need:
- pip install openai
- pip install PyMuPDF
- pip install vectordb

Note that in order for vectordb to install you must have the Microsoft Build Tools installed. You can do so for free at https://visualstudio.microsoft.com/visual-cpp-build-tools/. In the installer select the C++ build tools in the Workloads section. In the right pane, under "Optional", select the following:
- MSVC v143 - VS 2022 C++ x64/x86 build tools (or Latest Version) <br>
- Windows 11 SDK (or latest version for your OS)

# Import required packages


In [1]:
import sys
import os 
import openai
import fitz
from docarray import BaseDoc
from docarray.typing import NdArray
from typing import Dict
from docarray import DocList
import numpy as np
from vectordb import HNSWVectorDB

# Other Housekeeping

### Define PolicyDoc class and Vector Database

Here we define a new class, PolicyDoc, that will be used to create new vector databases and to query against existing databases.

In [2]:
class PolicyDoc(BaseDoc):
  """Schema for one record in the policy vector database.

  The same class is used both for the records written to the index and for
  the query object passed to a search.
  """
  # Free-form text field; defaults to an empty string.
  text: str = ''
  # Embedding vector declared with 1536 dimensions.
  # NOTE(review): presumably matches the output size of the embedding model
  # used in this notebook -- confirm before switching models.
  embedding: NdArray[1536]
  # String-to-string mapping holding additional information about the record.
  metadata: Dict[str, str] = {}

The cell below defines the vector database. If the database already exists in the current directory, the code will connect to it. If this is a new vector database, you will need to populate it using the rest of the code in this notebook!

In [3]:
# Workspace folder for the database; the "./" prefix makes it relative to the
# current working directory.
index_name = "./epa_policy_vdb"  # In vectordb the database is in a local folder
# Database object for PolicyDoc records, backed by the workspace folder above
# and configured with the 'cosine' space.
db = HNSWVectorDB[PolicyDoc](workspace=index_name, space='cosine')

### Set OpenAI API Key


In [4]:
# Read the key from the OPENAI_API_KEY environment variable. os.getenv returns
# None when the variable is not set, so a missing key is not reported here.
openai.api_key = os.getenv("OPENAI_API_KEY")
# Debug aid, left disabled so the key is not echoed into saved notebook output:
# print(openai.api_key)

### Set repository path
This is where your policy documents are stored


In [5]:
# Folder that is scanned for policy documents; the path is relative to the
# current working directory.
repo_path = '../resources/EPA_Policy_Example/'

# Function to Segment Policy Documents
This function splits a document into segments. The OpenAI API is prompted to split the document into logical sections and subsections and to mark where each section begins and ends.


In [6]:
def split_document_into_segments(doc_text, model="gpt-4o-mini"):
    """Split a document into logical segments using an OpenAI chat model.

    The model is asked to return the document divided into sections and
    subsections, separated by a line containing only ``---``. The reply is
    then split on that delimiter.

    Args:
        doc_text: Full text of the document to segment.
        model: Name of the chat model to call. Mini is used for testing,
            however currently 4o is more accurate.

    Returns:
        A list of non-empty segment strings with surrounding whitespace
        removed. The list is empty when ``doc_text`` is empty or only
        whitespace, or when the model returns no text.
    """
    # Nothing to segment: skip the (billable) API call entirely.
    if not doc_text or not doc_text.strip():
        return []

    # The parser below depends on this marker, so the prompt has to request
    # it explicitly rather than hope the model happens to produce it.
    delimiter = "---"

    # Here we tell OpenAI to segment the document into logical sections
    prompt = (
        "Identify and split the following document into logical sections and subsections. "
        "Provide each section or subsection in the form of text segments, and be sure to "
        "mark where each section begins and ends:\n\n"
        f"{doc_text}\n\n"
        "Return the document split logically into segments based on its content. "
        "Separate consecutive segments with a blank line, a line containing only "
        f"{delimiter}, and another blank line."
    )

    # Here we call the OpenAI API to actually analyze and split the document
    response = openai.chat.completions.create(
        model=model,
        messages=[{"role": "system", "content": "You are an expert document segmenter."},
                  {"role": "user", "content": prompt}],
        # NOTE: the reply is capped at this many tokens, so the tail of a long
        # document can be cut off.
        max_tokens=4000,
        temperature=0,
    )

    # Extract the response from the LLM; content may be None, so fall back to
    # an empty string instead of failing on .split below.
    segmented_text = response.choices[0].message.content or ""

    # Normalise Windows line endings so the delimiter match does not depend
    # on the newline style of the reply.
    normalized = segmented_text.replace("\r\n", "\n")

    # Split on the requested delimiter and drop blank pieces: an empty string
    # is not a useful segment and would be passed on to the embedding step.
    pieces = (piece.strip() for piece in normalized.split(f"\n\n{delimiter}\n\n"))
    return [piece for piece in pieces if piece]

# Function to Create Embeddings for each Segment
This function takes in segments of text and calls OpenAI to provide vector embeddings

In [7]:
def generate_embeddings(segments, embed_model="text-embedding-ada-002", batch_size=64):
    """Return one embedding vector per text segment, in input order.

    Segments are sent to the OpenAI embeddings endpoint in batches instead of
    one request per segment, which reduces the number of network round trips.

    Args:
        segments: Iterable of text segments to embed.
        embed_model: Name of the embedding model to call.
        batch_size: Maximum number of segments sent in a single request.

    Returns:
        A list of embeddings; element ``i`` belongs to segment ``i``. The
        list is empty when ``segments`` is empty.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Materialise once so generators and other one-shot iterables can be sliced.
    segments = list(segments)
    if not segments:
        # No work to do: avoid constructing a client (which needs an API key).
        return []

    client = openai.OpenAI()
    embeddings = []  # Collected vectors, kept aligned with `segments`

    for start in range(0, len(segments), batch_size):
        batch = segments[start:start + batch_size]
        response = client.embeddings.create(input=batch, model=embed_model)
        # Each returned item carries the index of the input it belongs to;
        # sort on it so the output order never depends on the response order.
        ordered = sorted(response.data, key=lambda item: item.index)
        embeddings.extend(item.embedding for item in ordered)

    return embeddings

# Create Policy Document Database
This will loop through the repository specified above and search for .pdf or .txt files. Each time it finds a .pdf or .txt file it will segment the document into sections, embed each of those sections, and then populate the vector database defined earlier with these embedded segments. The database stores the embeddings, the filepath, the actual text that was embedded, and an ID.

In [None]:
# Loop through the repo folder and index every .pdf or .txt file it contains.
# NOTE(review): every run indexes freshly created PolicyDoc objects, so
# re-running this cell against an already populated database presumably adds
# duplicates -- confirm before re-running.

# sorted() gives a reproducible processing order; os.listdir order is arbitrary.
for filename in sorted(os.listdir(repo_path)):
    # Compare extensions case-insensitively so files such as "POLICY.PDF"
    # are not silently skipped.
    lowered_name = filename.lower()

    # Ignore anything that is not a .pdf or .txt file and continue the loop
    if not lowered_name.endswith((".pdf", ".txt")):
        continue

    file_path = os.path.join(repo_path, filename)
    print("Processing " + file_path)

    if lowered_name.endswith(".pdf"):
        # Open the PDF in a context manager so the file handle is released
        # as soon as the text has been extracted.
        with fitz.open(file_path) as pdf:
            policy_text = "\n".join(page.get_text() for page in pdf)
    else:
        # Plain-text file: read it as UTF-8
        with open(file_path, 'r', encoding='utf-8') as file:
            policy_text = file.read()

    # Segment the document text; skip the embedding call when nothing came back
    segments = split_document_into_segments(policy_text)
    if not segments:
        continue

    embeddings = generate_embeddings(segments)

    # Fail loudly instead of pairing segments with the wrong vectors.
    if len(embeddings) != len(segments):
        raise ValueError(
            f"Expected {len(segments)} embeddings for {file_path}, got {len(embeddings)}"
        )

    # Build one PolicyDoc per segment. `text` holds the file path plus the
    # segment number; the segment's actual text is stored under
    # metadata['text'] and the source file under metadata['document'].
    doc_list = [
        PolicyDoc(
            text=file_path + str(i),
            embedding=embedding,
            metadata={'document': file_path, 'text': segment},
        )
        for i, (segment, embedding) in enumerate(zip(segments, embeddings))
    ]

    # Add this file's documents to the vector database
    db.index(inputs=DocList[PolicyDoc](doc_list))

# Test Index


In [10]:
import textwrap

# Same model name as the default used by generate_embeddings when the index
# was populated.
embed_model="text-embedding-ada-002"

# Let's try to search our index! Below is the raw text of an example control
query = textwrap.dedent("""
CA-5 PLAN OF ACTION AND MILESTONES Control: 
    a. Develop a plan of action and milestones for the system to document the planned remediation 
       actions of the organization to correct weaknesses or deficiencies noted during the assessment 
       of the controls and to reduce or eliminate known vulnerabilities in the system; and 
    b. Update existing plan of action and milestones [Assignment: organization-defined frequency] 
       based on the findings from control assessments, independent audits or reviews, and continuous
       monitoring activities.
""")

# Embed the text of the example control, and get the embedding vector itself
embedding_response = openai.embeddings.create(input=query, model=embed_model)
embedding_Vector = np.array(embedding_response.data[0].embedding)

# Turn the example control text and embeddings into a PolicyDoc object
# (metadata is left at its default empty dict)
qe = PolicyDoc(text=query, embedding=embedding_Vector)

# Perform the query, asking for at most 4 matches
res = db.search(inputs=DocList[PolicyDoc]([qe]), limit=4)
# Only one query document was submitted, so its matches are on res[0]
for m in res[0].matches:
    print(m)

[1;35mPolicyDoc[0m[1m([0m
    [33mid[0m=[32m'5e14f4728f87aacddb68a3819d943b39'[0m,
    [33mtext[0m=[32m'../resources/EPA_Policy_Example/information_security_assessment_authorization_and_monitoring_procedure.pd[0m
[32mf7'[0m,
    [33membedding[0m=[1;35mNdArray[0m[1m([0m[1m[[0m [1;36m0.00926604[0m, [1;36m-0.00278223[0m,  [1;36m0.0020672[0m , [33m...[0m,  [1;36m0.00164908[0m,
          [1;36m0.00739621[0m, [1;36m-0.04013897[0m[1m][0m[1m)[0m,
    [33mmetadata[0m=[1m{[0m
        [32m'text'[0m: [32m'### 6. PROCEDURE  \nSIO, ISO and EPA SO or their official designees for EPA-operated systems; and [0m
[32mSM, for systems operated on behalf of the EPA and to the extent made applicable to their management of the system [0m
[32mthrough a contract or other appropriate mechanism, are responsible for implementing the controls in this procedure.[0m
[32mEPA is adopting this procedure agency-wide and expects these officials to develop a plan with tim

[1;35mPolicyDoc[0m[1m([0m
    [33mid[0m=[32m'688d436db740cd6b5cd41e5dfb4d5d4f'[0m,
    [33mtext[0m=[32m'../resources/EPA_Policy_Example/information_security_assessment_authorization_and_monitoring_procedure.pd[0m
[32mf2'[0m,
    [33membedding[0m=[1;35mNdArray[0m[1m([0m[1m[[0m [1;36m0.01593186[0m,  [1;36m0.00282322[0m, [1;36m-0.00679782[0m, [33m...[0m, [1;36m-0.00666394[0m,
         [1;36m-0.01370274[0m, [1;36m-0.00795924[0m[1m][0m[1m)[0m,
    [33mmetadata[0m=[1m{[0m
        [32m'text'[0m: [32m'### 1. PURPOSE  \nThe purpose of this procedure is to facilitate the implementation of [0m
[32mEnvironmental Protection Agency [0m[32m([0m[32mEPA[0m[32m)[0m[32m security control requirements for the Assessment, Authorization and [0m
[32mMonitoring [0m[32m([0m[32mCA[0m[32m)[0m[32m control family, as identified in National Institute of Standards and Technology [0m[32m([0m[32mNIST[0m[32m)[0m[32m Special [0m
[32mPublication 

[1;35mPolicyDoc[0m[1m([0m
    [33mid[0m=[32m'60e933f174b1540337a28c19bc4ba635'[0m,
    [33mtext[0m=[32m'../resources/EPA_Policy_Example/information_security_planning_procedure.pdf6'[0m,
    [33membedding[0m=[1;35mNdArray[0m[1m([0m[1m[[0m [1;36m0.01852034[0m,  [1;36m0.00020712[0m,  [1;36m0.00565512[0m, [33m...[0m,  [1;36m0.01997073[0m,
         [1;36m-0.01325571[0m, [1;36m-0.04604982[0m[1m][0m[1m)[0m,
    [33mmetadata[0m=[1m{[0m
        [32m'text'[0m: [32m'### 5. PROCEDURE  \nSIO, ISO and EPA SO or their official designees for EPA-operated systems; and [0m
[32mSM, for systems operated on behalf of the EPA and to the extent made applicable to their management of the system [0m
[32mthrough a contract or other appropriate mechanism, are responsible for implementing the controls in this procedure.[0m
[32mEPA is adopting this procedure agency-wide, and expects these officials to develop a plan with timelines for [0m
[32madoption for their 

[1;35mPolicyDoc[0m[1m([0m
    [33mid[0m=[32m'0da27d1fd4778a84032de890f11cdcda'[0m,
    [33mtext[0m=[32m'../resources/EPA_Policy_Example/information_security_risk_assessment_procedure.pdf6'[0m,
    [33membedding[0m=[1;35mNdArray[0m[1m([0m[1m[[0m [1;36m0.01924152[0m, [1;36m-0.00356093[0m, [1;36m-0.00022733[0m, [33m...[0m, [1;36m-0.00720516[0m,
         [1;36m-0.00887109[0m, [1;36m-0.03090305[0m[1m][0m[1m)[0m,
    [33mmetadata[0m=[1m{[0m
        [32m'text'[0m: [32m'### 5. PROCEDURE  \nSIO, ISO and EPA SO or their official designees for EPA-operated systems; and [0m
[32mSM, for systems operated on behalf of the EPA and to the extent made applicable to their management of the system [0m
[32mthrough a contract or other appropriate mechanism, are responsible for implementing the controls in this procedure.[0m
[32mEPA is adopting this procedure agency-wide and expects these officials to develop a plan with timelines for [0m
[32madoption for 