# Import required packages


In [1]:
import os
import sys
import textwrap

import fitz
import openai
import pinecone
from pinecone import Pinecone, ServerlessSpec

# Set API Keys


In [2]:
# Read both service credentials from the environment so that no secret is
# stored in (or printed by) the notebook itself.
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
openai.api_key = os.environ.get("OPENAI_API_KEY")

# NOTE: this rebinds the name `pinecone` from the imported module to a client
# instance; later cells call `pinecone.list_indexes()` and `pinecone.Index(...)`
# on this client object.
pinecone = Pinecone(api_key=pinecone_api_key)

In [7]:
# Directory that is scanned for the .pdf / .txt policy documents to index.
repo_path = "../resources/EPA_Simple_English/"

# Function to Segment Policy Documents


In [None]:
def split_document_into_segments(doc_text):
    """Split a document into logical sections using an OpenAI chat model.

    The model is asked to return the document's sections separated by a line
    containing only ``---``; the reply is then cut on those delimiter lines.

    Args:
        doc_text: Full text of the document to segment.

    Returns:
        A list of non-empty, whitespace-stripped segment strings, in the order
        the model returned them. The list is empty when ``doc_text`` is blank
        or when the model reply carries no text content.
    """
    delimiter = "---"

    # Nothing to segment: skip the API call rather than asking the model to
    # split an empty document.
    if not doc_text or not doc_text.strip():
        return []

    prompt = (
        "Identify and split the following document into logical sections and subsections. "
        "Provide each section or subsection in the form of text segments, and be sure to "
        "mark where each section begins and ends:\n\n"
        f"{doc_text}\n\n"
        "Return the document split logically into segments based on its content. "
        # The parser below relies on this delimiter, so request it explicitly.
        f"Separate consecutive segments with a line containing only {delimiter}."
    )

    # Call the OpenAI API to analyze and split the document
    response = openai.chat.completions.create(
        model="gpt-4o-mini",  # Replace with the correct model name
        messages=[{"role": "system", "content": "You are an expert document segmenter."},
                  {"role": "user", "content": prompt}],
        max_tokens=4000,  # Adjust as necessary based on document length
        temperature=0  # More deterministic splitting
    )

    # The message content can be None (no text returned); treat that as "no segments".
    segmented_text = response.choices[0].message.content
    if not segmented_text:
        return []

    # Cut the reply on every line that consists solely of the delimiter, whether
    # or not the model surrounded it with blank lines.
    segments = []
    current_lines = []
    for line in segmented_text.split("\n"):
        if line.strip() == delimiter:
            segments.append("\n".join(current_lines).strip())
            current_lines = []
        else:
            current_lines.append(line)
    segments.append("\n".join(current_lines).strip())

    # Drop empty pieces (e.g. after a trailing delimiter) so callers never try
    # to embed an empty string.
    return [segment for segment in segments if segment]

In [None]:
# Spot test for split_document_into_segments.
# `doc_text` was not defined by any earlier cell, so on a fresh kernel this cell
# raised NameError. Load the first .txt document found in `repo_path` as the
# sample input instead.
sample_name = next(
    (name for name in sorted(os.listdir(repo_path)) if name.endswith(".txt")),
    None,
)
if sample_name is None:
    raise FileNotFoundError(f"No .txt document found in {repo_path!r} to use as a sample")
with open(os.path.join(repo_path, sample_name), 'r', encoding='utf-8') as sample_file:
    doc_text = sample_file.read()

segments = split_document_into_segments(doc_text)

# Function to Create Embeddings for each Segment

In [None]:
def generate_embeddings(segments, embed_model="text-embedding-ada-002"):
    """Embed every text segment with the OpenAI embeddings endpoint.

    One request is issued per segment, and the ``embedding`` of the first item
    in each response is collected.

    Args:
        segments: Iterable of text segments to embed.
        embed_model: Name of the embedding model to request.

    Returns:
        A list holding one embedding per segment, in input order.
    """
    api = openai.OpenAI()
    return [
        api.embeddings.create(input=chunk, model=embed_model).data[0].embedding
        for chunk in segments
    ]

In [None]:
# Spot test for generate_embeddings: embed the segments produced by the
# segmentation spot test.
embeddings = generate_embeddings(segments=segments)

# Function to Index Embedded Segments

In [None]:
# connect to index
def get_Index(index_name, embedding_length):
    """Return a handle to the named Pinecone index, creating the index if needed.

    Uses the module-level ``pinecone`` client. When ``index_name`` is not among
    the names reported by ``list_indexes()``, a serverless index (AWS,
    us-east-1) with cosine metric and the given dimension is created first.

    Args:
        index_name: Name of the index to connect to.
        embedding_length: Vector dimension used if the index has to be created.

    Returns:
        The object returned by ``pinecone.Index(index_name)``.
    """
    known_names = pinecone.list_indexes().names()
    if index_name not in known_names:
        serverless = ServerlessSpec(cloud='aws', region='us-east-1')
        pinecone.create_index(
            index_name,
            dimension=embedding_length,
            metric='cosine',
            spec=serverless,
        )
    return pinecone.Index(index_name)

# Create Policy Document Database

In [None]:
# Name of the Pinecone index that the policy-document embeddings are written to.
index_name = 'policy-doc-database-pe'

In [None]:
# Walk the repository and ingest every .pdf or .txt policy document:
# extract text -> segment -> embed -> upsert into the Pinecone index.
index = None  # connected (or created) once, when the embedding dimension is known

for filename in os.listdir(repo_path):
    file_path = os.path.join(repo_path, filename)
    if filename.endswith(".pdf"):
        # Use a context manager so the PDF handle is closed once the page text
        # has been extracted.
        with fitz.open(file_path) as pdf_document:
            policy_text = "\n".join(page.get_text() for page in pdf_document)
    elif filename.endswith(".txt"):
        with open(file_path, 'r', encoding='utf-8') as file:
            policy_text = file.read()
    else:
        # Ignore anything that is not a .pdf or .txt file.
        continue

    segments = split_document_into_segments(policy_text)
    if not segments:
        # Nothing to index for this document; skipping here also avoids
        # indexing into an empty embeddings list below.
        continue
    embeddings = generate_embeddings(segments)

    # The index is the same for every document, so look it up only once.
    if index is None:
        index = get_Index(index_name, embedding_length=len(embeddings[0]))

    # Each record has the form (id, vector, metadata); the id is the file path
    # followed by the segment's position within the document.
    to_upsert = [
        (file_path + str(i), embedding, {'document': file_path, 'text': segment})
        for i, (segment, embedding) in enumerate(zip(segments, embeddings))
    ]
    index.upsert(vectors=to_upsert)

# View and Test Index

In [4]:
# Name of the Pinecone index to inspect (same name used when building the database).
index_name = "policy-doc-database-pe"
# Get a handle to that index from the module-level `pinecone` client.
index = pinecone.Index(index_name)
# Fetch the index statistics; the bare expression on the last line makes the
# notebook render them as the cell output.
index_stats = index.describe_index_stats()
index_stats

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 145}},
 'total_vector_count': 145}

In [6]:
# Embedding model used for the query.
embed_model = "text-embedding-ada-002"

# Let's try to search our index!
# The query is the text of a security control; it is embedded and the 4 nearest
# stored segments are requested together with their metadata.
query = textwrap.dedent("""
CA-5 PLAN OF ACTION AND MILESTONES Control: 
    a. Develop a plan of action and milestones for the system to document the planned remediation 
       actions of the organization to correct weaknesses or deficiencies noted during the assessment 
       of the controls and to reduce or eliminate known vulnerabilities in the system; and 
    b. Update existing plan of action and milestones [Assignment: organization-defined frequency] 
       based on the findings from control assessments, independent audits or reviews, and continuous
       monitoring activities.
""")
qe = openai.embeddings.create(input=query, model=embed_model)
res = index.query(vector=qe.data[0].embedding, top_k=4, include_metadata=True)

In [7]:
# Display the raw query response (up to 4 matches, each with its metadata).
res

{'matches': [{'id': '../resources/EPA_Simple_English/SimpleEinglish_information_security_assessment_authorization_and_monitoring_procedure.txt7',
              'metadata': {'document': '../resources/EPA_Simple_English/SimpleEinglish_information_security_assessment_authorization_and_monitoring_procedure.txt',
                           'text': '### 6. PROCEDURE\n'
                                   'The Senior Information Officials (SIOs), '
                                   'Information Security Officers (ISOs), and '
                                   'EPA System Owners (SOs), or their '
                                   'representatives, along with Service '
                                   'Managers (SMs) for systems run on behalf '
                                   'of the EPA, must follow these procedures. '
                                   'They are responsible for implementing '
                                   'these controls and developing a plan with '
              