In [24]:
import os
import pdfplumber
from groq import Groq
from dotenv import load_dotenv

load_dotenv()

# Initialize the Groq client with the API key
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))


def extract_text_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        text = ""
        for page in pdf.pages:
            text += page.extract_text()
        return text


def extract_entities_with_groq(text):
    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "system", "content": "You are an AI that extracts structured data from resumes. Output should be in JSON format."},
            {"role": "user", "content": f"Extract key information (like name, contact, skills, education, projects and experience), discarding excess info about education and experience, from the following resume:\n{text}"}
        ],
        model="llama-3.3-70b-versatile",  # Replace with the Groq model you want to use
    )
    return chat_completion.choices[0].message.content



pdf_path = input("Enter resume path: ")  # Replace with your resume file
if not os.path.exists(pdf_path):
    print(f"Error: File '{pdf_path}' not found.")
else:
    resume_text = extract_text_from_pdf(pdf_path)
    if not resume_text.strip():
            print("Error: No text extracted from the PDF. Check the file content.")
    else:
        # Extract entities with Groq
        extracted_info = extract_entities_with_groq(resume_text)
            
        # Print extracted info
        print("Extracted Information:")
        print(extracted_info)


Extracted Information:
Here's the extracted information in JSON format:

```json
{
  "name": "Charles McTurland",
  "contact": {
    "email": "cmcturland@email.com",
    "phone": "(123) 456-7890",
    "linkedin": "linkedin.com/in/charlesmcturland"
  },
  "skills": [
    "Python (Django)",
    "Javascript (NodeJS, ReactJS, jQuery)",
    "SQL (MySQL, PostgreSQL, NoSQL)",
    "HTML5/CSS",
    "AWS",
    "Unix",
    "Git"
  ],
  "education": {
    "degree": "B.S. Computer Science",
    "university": "University of Pittsburgh",
    "duration": "September 2008 - April 2012"
  },
  "experience": [
    {
      "company": "Embark",
      "position": "Software Engineer",
      "duration": "January 2015 - current",
      "location": "New York, NY"
    },
    {
      "company": "MarketSmart",
      "position": "Software Engineer",
      "duration": "April 2012 - January 2015",
      "location": "Washington, DC"
    },
    {
      "company": "Marketing Science Company",
      "position": "Software 

In [1]:
import os
import re
import pdfplumber
import fitz  # PyMuPDF
import json
from groq import Groq
from dotenv import load_dotenv
import chromadb

chroma_client = chromadb.PersistentClient(path="chroma_resume.db")
collection = chroma_client.get_or_create_collection(name="resumes")
# Load environment variables
load_dotenv()

# Initialize the Groq client with the API key
api_key = os.environ.get("GROQ_API_KEY")
if not api_key:
    raise ValueError("Error: GROQ_API_KEY is missing. Please set it in your .env file.")

client = Groq(api_key=api_key)

def extract_text_with_pdfplumber(pdf_path):
    try:
        with pdfplumber.open(pdf_path) as pdf:
            text = "".join(page.extract_text() or "" for page in pdf.pages)
            if text.strip():
                return text
    except Exception as e:
        print(f"pdfplumber failed: {e}")
    return None


def extract_text_with_fitz(pdf_path):
    try:
        doc = fitz.open(pdf_path)
        text = "".join(page.get_text() for page in doc) # type: ignore
        if text.strip():
            return text 
    except Exception as e:
        print(f"fitz (PyMuPDF) failed: {e}")
    return None


def extract_entities_with_groq(text):
    try:
        chat_completion = client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are an AI that extracts key information from resumes. Summarize the candidate's essential details in a few sentences, including their name, contact, skills, education, and job titles. Do not give anything else as output."},
                {"role": "user", "content": f"Extract key information (like name, contact, skills, education, projects, certifications, and experience) from the following resume:\n{text}"}
            ],
            model="llama-3.3-70b-versatile",
        )
        return chat_completion.choices[0].message.content
    except Exception as e:
        print(f"Error calling Groq API: {e}")
    return None

# def clean_llm_output(llm_output):
#     # Remove backticks and "```json" if present
#     cleaned_output = re.sub(r"```(json)?", "", llm_output).strip()
#     return cleaned_output

# def convert_llm_output_to_dict(llm_output):
#     try:
#         cleaned_output = clean_llm_output(llm_output)
#         return json.loads(cleaned_output)
#     except json.JSONDecodeError as e:
#         print(f"Error decoding JSON: {e}")
#         return None


def save_to_chromadb(resume_data, raw_text):
    try:
        # Add resume data to Chroma collection
        collection.add(
            documents=[raw_text],  # Store the raw resume text
            metadatas=[{"summary":resume_data}],  # Store structured data as metadata
            ids=[resume_data.get("name", "unknown").replace(" ", "_").lower()]
        )
        print("Resume data saved to ChromaDB")
    # Save everything back to the file
    except Exception as e:
        print(f"Error saving to ChromaDB: {e}")



if __name__ == "__main__":
    pdf_path = input("Enter resume path: ")
    
    if not os.path.exists(pdf_path):
        print(f"Error: File '{pdf_path}' not found.")
    else:
        resume_text = extract_text_with_pdfplumber(pdf_path)
        
        if not resume_text:
            print("pdfplumber extraction failed or returned empty text. Trying fitz...")
            resume_text = extract_text_with_fitz(pdf_path)
        
        if not resume_text:
            print("Error: No text extracted from the PDF. Check the file content.")
        else:
            # Extract entities with Groq
            extracted_info = extract_entities_with_groq(resume_text)
            # structured_data = convert_llm_output_to_dict(extracted_info)
            if extracted_info:
                save_to_chromadb(extracted_info, resume_text)
                print("Saved Information in JSON file")
                # try:
                #     # Try to parse the result as JSON
                #     parsed_info = json.loads(extracted_info)
                #     print("Extracted Information:")
                #     print(json.dumps(parsed_info, indent=4))
                # except json.JSONDecodeError:
                #     # Handle non-JSON output
                #     print("Extracted Information (raw):")
                #     print(extracted_info)


Error saving to ChromaDB: 'str' object has no attribute 'get'
Saved Information in JSON file


In [49]:
import chromadb
client = chromadb.PersistentClient(path="chroma.db")

collection = client.create_collection(name="food")

collection.add(
    documents=[
        "This is a pen",
        "This is a pineapple",
        "This is an Apple",
        "This is a book"
    ],
    ids=[
        "id1",
        "id2",
        "id3",
        "id4"
    ]
)

results = collection.query(
    query_texts=["This is a query about food"], # Chroma will embed this for you
    n_results=4 # how many results to return
)
print(results)


{'ids': [['id2', 'id3', 'id4', 'id1']], 'embeddings': None, 'documents': [['This is a pineapple', 'This is an Apple', 'This is a book', 'This is a pen']], 'uris': None, 'data': None, 'metadatas': [[None, None, None, None]], 'distances': [[1.2466824285124267, 1.524765415432772, 1.5835720648468317, 1.8873233167385282]], 'included': [<IncludeEnum.distances: 'distances'>, <IncludeEnum.documents: 'documents'>, <IncludeEnum.metadatas: 'metadatas'>]}


In [1]:
import chromadb
client = chromadb.PersistentClient(path="chroma.db")
collection = client.get_collection(name="food")

results = collection.query(
    query_texts=["This is a query about studying"], # Chroma will embed this for you
    n_results=4, # how many results to return
)

# Extract results safely
ids = results.get('ids', [[]])[0] or []
documents = results.get('documents', [[]])[0] or [] # type: ignore
distances = results.get('distances', [[]])[0] or [] # type: ignore

# Print the results
for i in range(len(ids)):
    print(f"Document ID: {ids[i]}")
    print(f"Text: {documents[i]}")
    print(f"Distance: {distances[i]:.4f}")
    print("--------------------")


print(results)

Document ID: id4
Text: This is a book
Distance: 1.4273
--------------------
Document ID: id1
Text: This is a pen
Distance: 1.6124
--------------------
Document ID: id3
Text: This is an Apple
Distance: 1.7944
--------------------
Document ID: id2
Text: This is a pineapple
Distance: 1.8213
--------------------
{'ids': [['id4', 'id1', 'id3', 'id2']], 'embeddings': None, 'documents': [['This is a book', 'This is a pen', 'This is an Apple', 'This is a pineapple']], 'uris': None, 'data': None, 'metadatas': [[None, None, None, None]], 'distances': [[1.4272642063675842, 1.612399273619461, 1.7944306332278257, 1.8212716796584583]], 'included': [<IncludeEnum.distances: 'distances'>, <IncludeEnum.documents: 'documents'>, <IncludeEnum.metadatas: 'metadatas'>]}


In [None]:
import os
import re
import pdfplumber
import fitz  # PyMuPDF
import json
from groq import Groq
from dotenv import load_dotenv
import chromadb

chroma_client = chromadb.PersistentClient(path="chroma_resume.db")
collection = chroma_client.get_or_create_collection(name="resumes")

# Load environment variables
load_dotenv()

# Initialize the Groq client with the API key
api_key = os.environ.get("GROQ_API_KEY")
if not api_key:
    raise ValueError("Error: GROQ_API_KEY is missing. Please set it in your .env file.")

client = Groq(api_key=api_key)

def extract_text_with_pdfplumber(pdf_path):
    try:
        with pdfplumber.open(pdf_path) as pdf:
            text = "".join(page.extract_text() or "" for page in pdf.pages)
            if text.strip():
                return text
    except Exception as e:
        print(f"pdfplumber failed: {e}")
    return None

def extract_text_with_fitz(pdf_path):
    try:
        doc = fitz.open(pdf_path)
        text = "".join(page.get_text() for page in doc)  # type: ignore
        if text.strip():
            return text
    except Exception as e:
        print(f"fitz (PyMuPDF) failed: {e}")
    return None

def extract_entities_with_groq(text):
    try:
        chat_completion = client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are an AI that extracts structured data from resumes. Output should be in JSON format only. Exclude descriptions and work done in job. Do not give anything else as output."},
                {"role": "user", "content": f"Extract key information (like name, contact, skills, education, projects, certifications, and experience) from the following resume:\n{text}"}
            ],
            model="llama-3.3-70b-versatile",
        )
        return chat_completion.choices[0].message.content
    except Exception as e:
        print(f"Error calling Groq API: {e}")
    return None


def convert_llm_output_to_dict(llm_output):
    try:
        cleaned_output = re.sub(r"```(json)?", "", llm_output).strip()
        return json.loads(cleaned_output)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
        return None


def save_to_chromadb(resume_data):
    try:
        if not resume_data:
            print("No resume data to save.")
            return
        
        resume_dict = convert_llm_output_to_dict(resume_data)
        if not resume_dict:
            print("Failed to parse LLM output to JSON.")
            return

        candidate_name = resume_dict.get("name", "unknown").replace(" ", "_").lower()
        summary = json.dumps(resume_dict)
        
        collection.add(
            documents=[summary],  # Store only the summarized data
            metadatas=[{"name": resume_dict.get("name", "unknown")}],  # Minimal metadata
            ids=[candidate_name]
        )
        print(f"Summary data for '{candidate_name}' saved to ChromaDB")
    except Exception as e:
        print(f"Error saving to ChromaDB: {e}")


if __name__ == "__main__":
    pdf_path = input("Enter resume path: ")
    
    if not os.path.exists(pdf_path):
        print(f"Error: File '{pdf_path}' not found.")
    else:
        resume_text = extract_text_with_pdfplumber(pdf_path)
        
        if not resume_text:
            print("pdfplumber extraction failed or returned empty text. Trying fitz...")
            resume_text = extract_text_with_fitz(pdf_path)
        
        if not resume_text:
            print("Error: No text extracted from the PDF. Check the file content.")
        else:
            # Extract entities with Groq
            extracted_info = extract_entities_with_groq(resume_text)
            print(extracted_info)
            if extracted_info:
                save_to_chromadb(extracted_info)
                print("Saved Information to ChromaDB")


```
{
  "name": "Janine Nel",
  "contact": {
    "address": "1515 Pacific Ave, Los Angeles, CA 90291, United States",
    "phone": "3868683442",
    "email": "email@email.com"
  },
  "skills": [
    "AutoCAD",
    "Industry Trends & Sales Forecasting",
    "Knowledge of Technical Diagrams",
    "Engineering",
    "Agile Project Management",
    "English",
    "Dutch"
  ],
  "education": [
    {
      "degree": "Masters in Industrial Engineering",
      "institution": "Harvard University",
      "location": "Miami",
      "date": "January 2019 — May 2022"
    },
    {
      "degree": "Professional Engineering (PE) Exam",
      "institution": "National Council of Examiners for Engineering and Surveying (NCEES)",
      "location": "Newton",
      "date": "January 2018 — December 2019"
    }
  ],
  "certifications": [
    {
      "certification": "Certified Associate in Project Management (CAPM)",
      "institution": "Project Management Institute (PMI)",
      "location": "Seneca, South C

In [None]:
import chromadb
client = chromadb.PersistentClient(path="chroma_resume.db")
collection = client.get_collection(name="resumes2")

results = collection.query(
    query_texts=["who is the most experienced"], # Chroma will embed this for you
    n_results=5, # how many results to return
)

ids = results.get('ids', [[]])[0] or []
distances = results.get('distances', [[]])[0] or [] # type: ignore

for i in range(len(ids)):
    # print(f"Document ID: {i+1}")
    print(f"Name: {ids[i]}")
    print(f"Distance: {distances[i]:.4f}")
    print("--------------------")


Number of requested results 5 is greater than number of elements in index 3, updating n_results = 3


Name: janine_nel
Distance: 1.5689
--------------------
Name: charles_mcturland
Distance: 1.6711
--------------------
Name: cynthia_dwayne
Distance: 1.7637
--------------------


In [2]:
import os
import re
import pdfplumber
import fitz  # PyMuPDF
import json
from groq import Groq
from dotenv import load_dotenv
import chromadb

chroma_client = chromadb.PersistentClient(path="chroma_resume.db")
collection = chroma_client.get_or_create_collection(name="resumes3")

# Load environment variables
load_dotenv()
api_key = os.environ.get("GROQ_API_KEY")
if not api_key:
    raise ValueError("Error: GROQ_API_KEY is missing. Please set it in your .env file.")

client = Groq(api_key=api_key)

def extract_text_with_pdfplumber(pdf_path):
    try:
        with pdfplumber.open(pdf_path) as pdf:
            text = "".join(page.extract_text() or "" for page in pdf.pages)
            if text.strip():
                return text
    except Exception as e:
        print(f"pdfplumber failed: {e}")
    return None

def LLM(text):
    try:
        chat_completion = client.chat.completions.create( 
            messages=[
                {"role": "system", "content": "You are an AI that summarizes resumes. 'Name:' should be used before person's name. Print skills as it is. Mention time frame of experience along with roles in experience.  Do not give descriptions. Do not exceed 2 lines for any given section."},
                {"role": "user", "content": f"Summarize and Extract key information (like personal details, skills, education, projects, certifications, experience and certifications(optional)) from the following resume:\n{text}"}
            ],
            model="llama-3.3-70b-versatile",
        )
        return chat_completion.choices[0].message.content
    except Exception as e:
        print(f"Error calling Groq API: {e}")
    return None

def save_to_chromadb(resume_data):
    try:
        if not resume_data:
            print("No resume data to save.")
            return
        

        name_match = re.search(r"Name:\s*(.*)", resume_data)
        candidate_name = name_match.group(1).strip().replace(" ", "_").lower() if name_match else "unknown"
        
        collection.add(
            documents=[resume_data],  # Store summarized data
            metadatas=[{"name": candidate_name}],  # Store the extracted name in metadata
            ids=[candidate_name]
        )
        print(f"Summary data for '{candidate_name}' saved to ChromaDB")
    except Exception as e:
        print(f"Error saving to ChromaDB: {e}")




pdf_path = input("Enter path: ")

if not os.path.exists(pdf_path):
    print(f"Error: File '{pdf_path}' not found.")
else:
    resume_text = extract_text_with_pdfplumber(pdf_path)
    if not resume_text:
        print("Error: No text extracted from the PDF. Check the file content.")
    else:
        # Extract entities with Groq
        extracted_info = LLM(resume_text)
        if extracted_info:
            #print(extracted_info)
            save_to_chromadb(extracted_info)
            print("Saved Information to ChromaDB")

           

Summary data for 'kristen_connelly' saved to ChromaDB
Saved Information to ChromaDB


In [3]:
res = collection.query(
    query_texts=[""], # Chroma will embed this for you
    n_results=5, # how many results to return
    include=["embeddings","documents","distances"] # type: ignore
)
print(res)

Number of requested results 5 is greater than number of elements in index 3, updating n_results = 3


{'ids': [['kristen_connelly', 'cynthia_dwayne', 'charles_mcturland']], 'embeddings': [array([[-0.09390237, -0.09990636,  0.00696741, ..., -0.0504648 ,
         0.02070054,  0.05681674],
       [-0.0889769 , -0.05071229,  0.0372017 , ..., -0.01357653,
        -0.0081168 ,  0.05889499],
       [-0.05332222, -0.01952961, -0.00477163, ..., -0.07719181,
        -0.04477137,  0.10925198]], shape=(3, 384))], 'documents': [['Name: Kristen Connelly\nContact: 3868683442, email@email.com, 1515 Pacific Ave, Los Angeles, CA 90291\n\nSkills: \nAdobe Premiere Pro, Call Sheets & Sides, Camera Boom, Light Boom, Mic Boom, DaVinci Resolve, Languages: English, Dutch; Flemish\n\nEducation: \nBA in Film and Television, Boston University, FEBRUARY 2021 — PRESENT, Advanced Course in Digital Video Editing, ADMEC Multimedia Institute, JANUARY 2018 — JULY 2018\n\nExperience: \nVideo Production Assistant, Blue Penguin Designs, FEBRUARY 2021 — PRESENT, Video Production Assistant, Botle Bob Advertising, JANUARY 201