# Retrieval Augmented Generation
### Using MongoDB Atlas and OpenAI

In [None]:
from IPython.display import IFrame

# Public location of the sample resume; rendered inline so the reader can see
# the document the rest of the notebook works on.
PDF_URI = 'https://storage.googleapis.com/indeed_demo/resume.pdf'
IFrame(
    src=PDF_URI,
    width=1280,
    height=1000,
)

## Extract only Summary

In [None]:
import fitz  # PyMuPDF
import requests
from io import BytesIO

# Define the URL of the PDF file
url = "https://storage.googleapis.com/indeed_demo/resume.pdf"

# Fetch the PDF file from the URL. The timeout keeps a stalled connection from
# hanging the notebook, and raise_for_status() stops an HTTP error page from
# being passed to the PDF parser as if it were the resume.
response = requests.get(url, timeout=30)
response.raise_for_status()
pdf_content = BytesIO(response.content)

# Open the PDF and concatenate the text of every page into one string. The
# context manager closes the document as soon as the text has been read.
with fitz.open(stream=pdf_content, filetype="pdf") as pdf_document:
    resume_text = "".join(page.get_text() for page in pdf_document)

# The summary is taken to be the text between the first "Summary" heading and
# the nearest of the headings below that follows it.
start_keyword = "Summary"
end_keywords = ["Highlights", "Experience", "Education", "Skills"]

# Extract the summary paragraph
start_index = resume_text.find(start_keyword)
if start_index != -1:
    summary_start = start_index + len(start_keyword)
    # Position of each closing heading after the summary heading (-1 = absent).
    end_positions = [resume_text.find(kw, summary_start) for kw in end_keywords]
    # If no closing heading exists, the summary runs to the end of the resume.
    end_index = min(
        (pos for pos in end_positions if pos != -1),
        default=len(resume_text),
    )
    summary_text = resume_text[summary_start:end_index].strip()
else:
    # NOTE(review): later cells embed summary_text as-is, so this sentinel
    # string would be embedded and searched like a real summary.
    summary_text = "No summary found."

print(summary_text)



## Connect to MongoDB

In [None]:
from pymongo import MongoClient
import os

# Database and collection that hold the stored resumes.
mongo_db_name = 'careerServices'
mongo_coll_name = 'resumes'

# Read the connection string from the MONGODB_URI environment variable when it
# is set, so real credentials do not have to be pasted into the notebook. The
# original placeholder stays as the fallback.
mongo_client = MongoClient(os.environ.get("MONGODB_URI", "<MONGODB_CLUSTER_LINK>"))
mongo_coll = mongo_client[mongo_db_name][mongo_coll_name]
mongo_db_and_coll_path = f'{mongo_db_name}.{mongo_coll_name}'

# Counting documents doubles as a connectivity check; the trailing expression
# is displayed as the cell's output.
doc_count = mongo_coll.count_documents({})
f'{mongo_db_and_coll_path} document count is {doc_count:,}'

## Convert the summary to vectors

In [None]:
import openai
import os

# Read the API key from the OPENAI_API_KEY environment variable when it is set,
# so the secret is not stored in the notebook. The original placeholder stays
# as the fallback.
openai.api_key = os.environ.get("OPENAI_API_KEY", "<OPENAI_API>")

# Send the summary text to the OpenAI embedding model
response = openai.Embedding.create(
    input=summary_text,
    model="text-embedding-ada-002"
)

# A single input string was sent, so the embedding is read from the first
# entry of the response's 'data' list.
embedding_vectors = response['data'][0]['embedding']

# Print the embedding
print(embedding_vectors)

## Perform Vector Search

In [None]:
# Aggregation pipeline: find the stored summaries whose embedding is closest
# to the query embedding, then keep only the summary text of each match.
pipeline = [
  {
    '$vectorSearch': {
      'index': 'vector_index_2',
      'path': 'summary_embedding',
      'queryVector': embedding_vectors,
      'numCandidates': 150,
      'limit': 4
    }
  }, {
    '$project': {
      '_id': 0,
      'summary_section': 1,
    }
  }
]

result = mongo_coll.aggregate(pipeline)

# Collect the summary text of every match. $project leaves out fields a
# document does not have, so a match without 'summary_section' comes back
# without that key; skip it instead of failing with KeyError.
summary_sections = [
    document['summary_section']
    for document in result
    if 'summary_section' in document
]

# Print the array of summary sections
print(summary_sections)

## Send summaries to OpenAI

In [None]:
# Build the prompt: the instruction and the resume's own summary first,
# followed by one numbered entry per retrieved summary.
prompt_header = f"Improve the following summary by taking influence from the provided summaries:\n\nSummary to Improve:\n{summary_text}\n\nInfluence Summaries:\n"
influence_entries = [
    f"\nSummary {idx}:\n{summary}"
    for idx, summary in enumerate(summary_sections, 1)
]
prompt = prompt_header + "".join(influence_entries)

# Ask the chat model for the rewritten summary.
chat_messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt},
]
completion = openai.ChatCompletion.create(
    model="gpt-4o",
    messages=chat_messages,
)

# The reply text is the content of the first returned choice.
first_choice = completion.choices[0]
improved_summary = first_choice.message['content'].strip()

# Show the original and the improved summary one after the other.
print("Original Summary:\n")
print(summary_text)
print("\nImproved Summary:\n")
print(improved_summary)

