In [1]:
import sys
import os
import nltk
import openai
import pinecone
import fitz
from pinecone import Pinecone, ServerlessSpec
from nltk.tokenize import word_tokenize

# Needed for tokenizing (word_tokenize is used when splitting the document)
nltk.download('punkt_tab')

# Set OpenAI key and Pinecone API key. Both are read from environment
# variables; os.getenv returns None when a variable is not set.
openai.api_key = os.getenv("OPENAI_API_KEY")
pinecone_api_key = os.getenv("PINECONE_API_KEY")
# NOTE(review): this rebinds the name `pinecone` from the imported module to a
# Pinecone client instance; the later cells call methods on this client object.
pinecone = Pinecone(api_key=pinecone_api_key)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\butch\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
def split_document_into_segments(doc_text, max_words=250, verbose=True):
    """Split a document into consecutive segments of at most ``max_words`` tokens.

    The text is tokenized with ``word_tokenize`` and the tokens of each
    segment are re-joined with single spaces, so punctuation tokens end up
    space-separated in the returned strings.

    Args:
        doc_text: The full document text.
        max_words: Maximum number of tokens per segment. Defaults to 250.
        verbose: When True (the default), print the token count and every
            segment as it is created. Pass False to keep the cell output short.

    Returns:
        A list of segment strings in document order. The last segment may be
        shorter than ``max_words``; an empty document yields an empty list.

    Raises:
        ValueError: If ``max_words`` is less than 1.
    """
    if max_words < 1:
        raise ValueError("max_words must be a positive integer")

    words = word_tokenize(doc_text)  # Tokenize the document into words
    num_words = len(words)

    if verbose:
        print(f"Number of words: {num_words}")

    segments = []
    # Step through the tokens one window at a time
    for start in range(0, num_words, max_words):
        segment = ' '.join(words[start:start + max_words])
        segments.append(segment)

        if verbose:
            # Print each segment with a clear label
            print(f"Segment {len(segments)}:\n{'-' * 20}\n{segment}\n{'-' * 20}\n")

    return segments

In [3]:
def generate_embeddings(segments, embed_model="text-embedding-ada-002", batch_size=100):
    """Embed each text segment with the OpenAI embeddings endpoint.

    Segments are sent in batches rather than one request per segment, which
    cuts the number of network round trips.

    Args:
        segments: Iterable of text segments to embed.
        embed_model: Name of the embedding model to use.
        batch_size: Maximum number of segments sent per request. Defaults to 100.

    Returns:
        A list of embedding vectors, one per segment, in the same order as
        ``segments``. Empty input returns an empty list without making a request.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    segments = list(segments)
    embeddings = []  # Collected embedding vectors, in input order
    if not segments:
        return embeddings

    client = openai.OpenAI()
    for start in range(0, len(segments), batch_size):
        batch = segments[start:start + batch_size]
        response = client.embeddings.create(input=batch, model=embed_model)
        # Each returned item carries the position of its input within the
        # batch; sort on it so the output order matches the input order.
        ordered_items = sorted(response.data, key=lambda item: item.index)
        embeddings.extend(item.embedding for item in ordered_items)

    return embeddings

In [4]:
def load_document_from_pdf(file_path):
    """Extract the text of a PDF as a single string.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text of every page, in page order, with pages separated by a
        newline.
    """
    # Open the document as a context manager so the file handle is released
    # once the text has been extracted (previously it was never closed).
    with fitz.open(file_path) as pdf_document:
        return "\n".join(page.get_text() for page in pdf_document)

In [5]:
# Relative path of the source PDF. The upsert cell also uses this string as the
# prefix of each vector id and as the 'document' metadata value.
document_path = '../resources/EPA_Policy_Example/information_security_assessment_authorization_and_monitoring_procedure.pdf'

# Pipeline: extract the PDF text, split it into word segments, embed each segment
doc_text = load_document_from_pdf(document_path)
segments = split_document_into_segments(doc_text)
embeddings = generate_embeddings(segments)

Number of words: 3168
Segment 1:
--------------------
Information Security – Assessment , Authorization and Monitoring ( CA ) Procedure Directive No : CIO 2150-P-04.3 Page 1 of 9 Note : IT/IM directives are reviewed annually for content , relevance , and clarity Form Rev . 06/09/2020 IT/IM DIRECTIVE PROCEDURE Issued by the EPA Chief Information Officer , Pursuant to Delegation 1-19 Information Security – Assessment , Authorization and Monitoring ( CA ) Procedure 1 . PURPOSE The purpose of this procedure is to facilitate the implementation of Environmental Protection Agency ( EPA ) security control requirements for the Assessment , Authorization and Monitoring ( CA ) control family , as identified in National Institute of Standards and Technology ( NIST ) Special Publication ( SP ) 800-53 , Revision 5 , Security and Privacy Controls for Information Systems and Organization . 2 . SCOPE These procedures address all United States EPA information and information systems to include informati

In [6]:
# Name of the Pinecone index this notebook (re)creates, connects to, and queries
index_name = "ombtest" 

In [7]:
## This section is optional: it would let a user specify a particular
## index name. It needs to be edited before it can be incorporated into
## the working code.

#input("Please enter the Pinecone index name: ")

#if index_name in [index['name'] for index in pinecone.list_indexes()]:
 #   overwrite_confirmation = input(f"Index '{index_name}' already exists. Do you want to overwrite it? (y/n): ")
  #  if overwrite_confirmation.lower() == 'y':
   #     pinecone.delete_index(index_name)
    #else:
     #   index_name = input("Please enter a new Pinecone index name: ")

# (Re)create the Pinecone index, sized to the embedding dimension. If an index
# with this name already exists it is deleted first, so the index is always
# rebuilt from scratch.
if embeddings:
    embedding_length = len(embeddings[0])
    existing_index_names = [existing['name'] for existing in pinecone.list_indexes()]
    if index_name in existing_index_names:
        pinecone.delete_index(index_name)
    pinecone.create_index(
        index_name,
        dimension=embedding_length,
        metric='cosine',
        spec=ServerlessSpec(cloud='aws', region='us-east-1')
    )
else:
    # Without embeddings there is no vector dimension to size the index with.
    embedding_length = None
    print("Embedding error")

print(f'Total segments created: {len(segments)}')
# Only report the dimension when one was computed; referencing it
# unconditionally raised NameError on the "Embedding error" path.
if embedding_length is not None:
    print(f'Embedding length: {embedding_length}')

Total segments created: 13
Embedding length: 1536


In [8]:
# connect to index: get a handle on the index named by index_name
# (`pinecone` here is the client object created in the setup cell)
index = pinecone.Index(index_name)

In [9]:
# Build one record per segment and send them all to the index in one upsert
if segments:
    to_upsert = []
    for position, segment_text in enumerate(segments):
        # Each record has the form (id, vector, meta_data)
        record_id = document_path + str(position)
        record_metadata = {'document': document_path, 'text': segment_text}
        to_upsert.append((record_id, embeddings[position], record_metadata))
    index.upsert(vectors=to_upsert)

In [10]:
# view index stats
# NOTE(review): the recorded output below shows total_vector_count 0 right after
# the upsert; presumably the stats had not caught up yet. Confirm by re-running.
index_stats = index.describe_index_stats()
index_stats

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [11]:
import textwrap

# Embedding model for the query; same model name as the default used by
# generate_embeddings for the document segments.
embed_model="text-embedding-ada-002"

# Let's try to search our index!
# The query is the text of a control statement; dedent strips the indentation
# shared by all of its lines.
query = textwrap.dedent("""
  CA-5 PLAN OF ACTION AND MILESTONES Control: 
    a. Develop a plan of action and milestones for the system to document the planned remediation 
       actions of the organization to correct weaknesses or deficiencies noted during the assessment 
       of the controls and to reduce or eliminate known vulnerabilities in the system; and 
    b. Update existing plan of action and milestones [Assignment: organization-defined frequency] 
       based on the findings from control assessments, independent audits or reviews, and continuous
       monitoring activities.
""")
# Embed the query text, then ask the index for the 10 closest vectors
# together with their stored metadata.
qe = openai.embeddings.create(input=query, model=embed_model)
res = index.query(vector=qe.data[0].embedding, top_k=10, include_metadata=True)

In [12]:
# Display the query response (the matches with their ids and metadata)
res

{'matches': [{'id': '../resources/EPA_Policy_Example/information_security_assessment_authorization_and_monitoring_procedure.pdf6',
              'metadata': {'document': '../resources/EPA_Policy_Example/information_security_assessment_authorization_and_monitoring_procedure.pdf',
                           'text': 'or upon a significant change to the system '
                                   'or operating environment . CA-7 – '
                                   'Continuous Monitoring For All Systems : 1 '
                                   ') Develop a system-level continuous '
                                   'monitoring strategy and implement '
                                   'continuous monitoring in accordance with '
                                   'the organization-level continuous '
                                   'monitoring strategy that includes : a ) '
                                   'Establishing the following system-level '
                                  