In [0]:
# --- STEP 1: Install Libraries ---
%pip install pypdf faiss-cpu openai

In [0]:
# --- STEP 2: Import Libraries ---
import os
import openai
import faiss
import numpy as np
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [0]:

# --- STEP 3: Set OpenAI API Key ---
# Read the key from the "open_ai_key" environment variable; the result is None
# when that variable is not set.
openai_api_key = os.environ.get("open_ai_key")


In [0]:
# --- STEP 2: Import Libraries ---
import os
import openai
import faiss
import numpy as np
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# --- STEP 3: Set OpenAI API Key ---
# Store the key on the openai module as well (or assign a key string directly).
openai.api_key = openai_api_key

# --- STEP 4: Initialize OpenAI Client ---
# The client receives the same key explicitly; later cells use this `client`
# object for all embedding and chat requests.
client = openai.OpenAI(api_key=openai_api_key)

# --- STEP 5: Define Embedding Function ---
def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding vector for ``text`` from the OpenAI Embeddings API.

    Newline characters in ``text`` are replaced with single spaces before the
    request is made. The cleaned text is sent as a one-item batch through the
    module-level ``client``, and the embedding of that single item is returned.

    Args:
        text: The string to embed.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` attribute of the first item in the API response.
    """
    single_line = " ".join(text.split("\n"))
    result = client.embeddings.create(model=model, input=[single_line])
    first_item = result.data[0]
    return first_item.embedding

# --- STEP 6: Load PDF Document ---
# Path is relative to the notebook's working directory.
pdf_path = "UKRI-081020-HRPolicyFramework.pdf"  # Replace with your PDF path
loader = PyPDFLoader(pdf_path)
documents = loader.load()

# --- STEP 7: Split Document into Chunks ---
# Chunks of up to 1000 characters, with 200 characters shared between neighbours.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
texts = text_splitter.split_documents(documents)


In [0]:
# --- STEP 8: Generate Embeddings and Create FAISS Index ---
# Fail early with a clear message: with no chunks, the dimension lookup below
# would otherwise raise an opaque IndexError (e.g. when the PDF yields no text).
if not texts:
    raise ValueError(
        "No text chunks to embed; check that the PDF contains extractable text."
    )

# One embeddings request is made per chunk via get_embedding.
embeddings_list = [get_embedding(text.page_content) for text in texts]
embeddings_array = np.array(embeddings_list, dtype="float32")

# Rows are chunks, columns are embedding components.
dimension = embeddings_array.shape[1]
index = faiss.IndexFlatL2(dimension)  # L2 distance for similarity
index.add(embeddings_array)


In [0]:
# Save FAISS Index and Text Chunks
import pickle

# The index goes to its own FAISS file.
faiss.write_index(index, "rag_index.faiss")

# The chunk objects are pickled next to it. Only unpickle this file when it
# comes from a trusted source.
with open("rag_texts.pkl", "wb") as chunk_file:
    pickle.dump(texts, chunk_file)

print("RAG model saved.")

In [0]:
# --- STEP 9: Define Retrieval Function ---
def retrieve_similar(query, index, k=4):
    """Retrieve the nearest indexed chunks for a query.

    Args:
        query: Question text; it is embedded with ``get_embedding``.
        index: FAISS index to search. Must expose ``ntotal`` and ``search``.
        k: Maximum number of neighbours to return.

    Returns:
        A ``(distances, indices)`` pair. Each has a single row (one query) and
        at most ``k`` columns; fewer when the index stores fewer than ``k``
        vectors, and zero columns when the index is empty.
    """
    # FAISS pads results with index -1 when fewer than k vectors are stored.
    # Callers do `texts[idx]`, where -1 would silently select the last chunk,
    # so never ask for more neighbours than the index holds.
    k = min(k, index.ntotal)
    if k <= 0:
        # Nothing to search: return empty result rows without calling the API.
        return (
            np.empty((1, 0), dtype="float32"),
            np.empty((1, 0), dtype="int64"),
        )

    query_embedding = np.array([get_embedding(query)]).astype('float32')
    distances, indices = index.search(query_embedding, k)
    return distances, indices

# --- STEP 10: Example Query and Retrieval ---
# Run one sample question through retrieval; the results are kept in
# `distances` and `indices` for the following cell.
query = "What is the main topic of the document?"
distances, indices = retrieve_similar(query=query, index=index)


In [0]:
# --- STEP 11: Print Results --- # preview the retrievals that will be passed to the context window
print(f"Query: {query}\n")
for distance, idx in zip(distances[0], indices[0]):
    if idx < 0:
        # FAISS uses -1 for "no neighbour found" (fewer than k vectors stored);
        # texts[-1] would wrongly display the last chunk, so skip the padding.
        continue
    print(f"Distance: {distance}")
    # Only the first 200 characters of each chunk are shown.
    print(f"Text: {texts[idx].page_content[:200]}...\n")

In [0]:
# --- STEP 12: Define Answer Generation Function ---
def generate_answer(query, context, model="gpt-3.5-turbo"):
    """Generate an answer using the OpenAI Chat Completions API.

    Args:
        query: The user's question.
        context: Retrieved text inserted into the prompt as supporting context.
        model: Chat model name; defaults to the previously hard-coded model.

    Returns:
        The first choice's message text with surrounding whitespace removed,
        or an empty string when the response carries no text content.
    """
    prompt = f"Answer the question based on the context below.\n\nContext:\n{context}\n\nQuestion: {query}\n\nAnswer:"
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    # The message content can be None; guard so .strip() does not raise.
    content = response.choices[0].message.content
    return (content or "").strip()

# --- STEP 13: Question Answering Loop ---
# Interactive loop: read a question, retrieve matching chunks, and print the
# generated answer until the user types 'exit'.
while True:
    try:
        # Strip surrounding whitespace so inputs like " exit " are recognised.
        user_question = input("Ask a question about the PDF (or type 'exit'): ").strip()
    except EOFError:
        # input() raises EOFError when stdin is closed; stop instead of crashing.
        break
    if user_question.lower() == "exit":
        break
    if not user_question:
        # Nothing to embed or answer; prompt again rather than calling the API.
        continue

    distances, indices = retrieve_similar(user_question, index)

    # Combine relevant contexts. FAISS marks missing neighbours with -1, which
    # would index the last chunk by accident, so those entries are dropped.
    context = "\n".join(texts[idx].page_content for idx in indices[0] if idx >= 0)

    answer = generate_answer(user_question, context)
    print(f"Answer: {answer}\n")

In [0]:
# Print the version string of the Python interpreter running this kernel.
import sys
print(sys.version)