# Retrieval Augmented Generation
### Using MongoDB Atlas and OpenAI

In [1]:
from IPython.display import IFrame

PDF_URI = 'https://storage.googleapis.com/indeed_demo/resume.pdf'
IFrame(PDF_URI, width=1280, height=1000)

## Extract only Summary

In [2]:
import fitz  # PyMuPDF
import requests
from io import BytesIO

# Define the URL of the PDF file
url = "https://storage.googleapis.com/indeed_demo/resume.pdf"

# Fetch the PDF file from the URL
response = requests.get(url)
pdf_content = BytesIO(response.content)

# Open the PDF file
pdf_document = fitz.open(stream=pdf_content, filetype="pdf")

# Extract text from each page and store the whole resume in a variable
resume_text = ""
for page_num in range(len(pdf_document)):
    page = pdf_document.load_page(page_num)
    resume_text += page.get_text()

# Find the Summary section
start_keyword = "Summary"
end_keywords = ["Highlights", "Experience", "Education", "Skills"]

# Extract the summary paragraph
start_index = resume_text.find(start_keyword)
if start_index != -1:
    end_index = min([resume_text.find(kw, start_index) for kw in end_keywords if resume_text.find(kw, start_index) != -1], default=len(resume_text))
    summary_text = resume_text[start_index + len(start_keyword):end_index].strip()
else:
    summary_text = "No summary found."
    

print(summary_text)



Energetic Administrative Assistant with 30+ years experience in high-level executive support roles. Organized
and professional. Dedicated and focused, who excels at prioritizing, completing multiple tasks simultaneously
and following through to achieve project goals. Seeking a role of increased responsibility and authority. Adept at
managing multiple projects with ease using expert time management methods.


## Connect to MongoDB

In [3]:
from pymongo import MongoClient
import os

mongo_db_name = 'careerServices'
mongo_coll_name = 'resumes'

mongo_client = MongoClient("<MONGODB_CLUSTER_LINK>")
mongo_coll = mongo_client[mongo_db_name][mongo_coll_name]
mongo_db_and_coll_path = '{}.{}'.format(mongo_db_name, mongo_coll_name)

doc_count = mongo_coll.count_documents({})
'{} document count is {:,}'.format(mongo_db_and_coll_path, doc_count)

ServerSelectionTimeoutError: <mongodb_cluster_link>:27017: [Errno 8] nodename nor servname provided, or not known, Timeout: 30s, Topology Description: <TopologyDescription id: 66be5b8f798e7e85c040aa3e, topology_type: Unknown, servers: [<ServerDescription ('<mongodb_cluster_link>', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('<mongodb_cluster_link>:27017: [Errno 8] nodename nor servname provided, or not known')>]>

## Covert the summary to vectors

In [20]:
import openai
openai.api_key = "<OPENAI_API>"
# Send the summary text to the OpenAI embedding model
response = openai.Embedding.create(
    input=summary_text,
    model="text-embedding-ada-002"
)

# Extract the embedding vectors
embedding_vectors = response['data'][0]['embedding']

# Print the embedding vectors
print(embedding_vectors)

[-0.04011474549770355, -0.015139140188694, -0.008398113772273064, -0.028607938438653946, -0.004732638597488403, 0.01776396483182907, -0.01327657513320446, -0.002576769096776843, -0.03698616474866867, 0.0050971973687410355, 0.028501885011792183, -0.014489562250673771, 0.0022652370389550924, 0.004981201607733965, -0.004583500791341066, -0.019871778786182404, 0.03345988690853119, -0.0030374391935765743, -0.022417062893509865, -0.012262438423931599, -0.03189559653401375, -0.010830716229975224, -0.00644275126978755, -0.011984048411250114, 0.0025303708389401436, -0.013720674440264702, 0.007987155579030514, -0.02322572097182274, 0.01431722566485405, -0.05172760412096977, 0.01545730046927929, -0.019434308633208275, 0.0023414629977196455, -0.019593387842178345, -0.0030921229626983404, -0.018678676337003708, 0.0076955086551606655, -0.005130338948220015, 0.01121515966951847, -0.007854589261114597, 0.0025668267626315355, 0.017392776906490326, 0.003130235942080617, 0.004507274832576513, 0.000383408

## Perform Vector Search

In [21]:
pipeline = [
  {
    '$vectorSearch': {
      'index': 'vector_index_2', 
      'path': 'summary_embedding', 
      'queryVector': embedding_vectors,
      'numCandidates': 150, 
      'limit': 4
    }
  }, {
    '$project': {
      '_id': 0, 
      'summary_section': 1,
    }
  }
]

result = mongo_coll.aggregate(pipeline)

# Initialize an empty array to store the summary sections
summary_sections = []

# Iterate through the cursor and collect the summary sections
for document in result:
    summary_sections.append(document['summary_section'])

# Print the array of summary sections
print(summary_sections)

['Dedicated and focused Administrative Assistant who excels at prioritizing, completing multiple tasks simultaneously and following through to achieve project goals. Seeking a role of increased responsibility and authority.', "Highly motivated, and a dynamic Human Resources professional with diverse credentials seeking a position with a growing organization to make a positive impact on company policies, and experience career growth. Accomplished human resources and administrative professional with over 3 years' experience. Attentive to detail, able to take initiative, prioritize multiple tasks and manage workload. Resourceful team player with a can-do attitude. Results-driven with strong communication, analytical, and interpersonal skills while maintaining the utmost confidentiality.", 'Dedicated, Driven, and Dynamic with over 20 years of customer service expertise. Motivated to maintain customer satisfaction and contribute to company success with an emphasis in personnel and process m

## Send summaries to OpenAI

In [22]:
# Create the prompt
prompt = f"Improve the following summary by taking influence from the provided summaries:\n\nSummary to Improve:\n{summary_text}\n\nInfluence Summaries:\n"
for idx, summary in enumerate(summary_sections, 1):
    prompt += f"\nSummary {idx}:\n{summary}"

# Create the completion request
completion = openai.ChatCompletion.create(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ]
)

# Extract the improved summary
improved_summary = completion.choices[0].message['content'].strip()

# Print the improved summary

print("Original Summary:\n")
print(summary_text)
print("\nImproved Summary:\n")
print(improved_summary)



Original Summary:

Energetic Administrative Assistant with 30+ years experience in high-level executive support roles. Organized
and professional. Dedicated and focused, who excels at prioritizing, completing multiple tasks simultaneously
and following through to achieve project goals. Seeking a role of increased responsibility and authority. Adept at
managing multiple projects with ease using expert time management methods.

Improved Summary:

Seasoned and energetic Administrative Assistant with over 30 years of high-level executive support experience. Adept at prioritizing, multitasking, and seeing projects through to completion with exceptional organizational skills. Dedicated and focused, capable of managing multiple projects with expert time management methods. Seeking a position with increased responsibility and authority to leverage extensive experience and contribute to organizational success. A resourceful and professional team player, committed to delivering excellence and su