In [1]:
!pip install langchain langchain-groq chromadb pandas sentence-transformers pymupdf



In [2]:
# Import required libraries
import os
import uuid

import chromadb
import fitz  # PyMuPDF for PDF extraction
import pandas as pd
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.exceptions import OutputParserException
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_groq import ChatGroq
from sentence_transformers import SentenceTransformer

USER_AGENT environment variable not set, consider setting it to identify your requests.
  from tqdm.autonotebook import tqdm, trange





In [3]:
# Step 1: Set up LLM and ChromaDB Client
# The Groq API key is read from the environment so the secret never lives in
# the notebook source or its saved output.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
if not GROQ_API_KEY:
    raise RuntimeError("Set the GROQ_API_KEY environment variable before running this notebook.")

# temperature=0 keeps generation as deterministic as the model allows.
llm = ChatGroq(temperature=0, groq_api_key=GROQ_API_KEY, model_name="llama-3.2-1b-preview")
client = chromadb.Client()

In [4]:
# Step 2: Extract job description from the given URL
def extract_job_description(url):
    """Load the page at ``url`` and return its text content.

    The URL is handed to ``WebBaseLoader`` as a one-element list, and the
    ``page_content`` of the last document the loader returns is the result.
    """
    documents = WebBaseLoader([url]).load()
    last_document = documents.pop()
    return last_document.page_content

In [5]:

# Step 3: Extract text from PDF resumes
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in the PDF at ``pdf_path``.

    Extraction is best-effort: if the file cannot be opened or a page cannot
    be read, the error is printed and an empty string is returned, so callers
    must treat ``""`` as "no text available".

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        The text of all pages joined in page order, or ``""`` on any error.
    """
    try:
        # The context manager closes the document on every exit path,
        # including when a page fails part-way through extraction.
        with fitz.open(pdf_path) as pdf_document:
            return "".join(
                pdf_document.load_page(page_num).get_text()
                for page_num in range(pdf_document.page_count)
            )
    except Exception as e:
        print(f"Error extracting from {pdf_path}: {e}")
        return ""

In [6]:
# Step 4: Generate embeddings using Sentence Transformer
def generate_embeddings(text, model):
    """Encode ``text`` with ``model`` and return whatever ``model.encode`` yields."""
    embedding = model.encode(text)
    return embedding

In [7]:
# Step 5: Store embeddings in ChromaDB
def store_in_chromadb(collection, skills, resume_text, skills_embedding, resume_embedding, file_path):
    """Store one resume in ``collection``, keyed by its file path.

    The resume text is stored as the document, ``resume_embedding`` as its
    vector, and ``skills`` / ``file_path`` as metadata. Storing the same
    ``file_path`` again replaces the existing record instead of adding a
    duplicate, so the notebook can be re-run safely.

    Args:
        collection: ChromaDB collection to write to.
        skills: Skills value saved in the record's metadata.
        resume_text: Full resume text saved as the document.
        skills_embedding: Accepted for backward compatibility; only the
            resume embedding is stored, so this value is not used.
        resume_embedding: Embedding of ``resume_text``; a numpy array or any
            sequence of floats.
        file_path: Path of the resume file, saved in metadata and used to
            derive the record ID.
    """
    # A deterministic ID (instead of a random one) makes the write idempotent.
    record_id = str(uuid.uuid5(uuid.NAMESPACE_URL, str(file_path)))

    # ChromaDB expects plain lists; accept numpy arrays as well as sequences.
    if hasattr(resume_embedding, "tolist"):
        resume_vector = resume_embedding.tolist()
    else:
        resume_vector = list(resume_embedding)

    collection.upsert(
        ids=[record_id],
        documents=[resume_text],
        embeddings=[resume_vector],
        metadatas=[{
            "skills": skills,
            "file_path": file_path
        }]
    )
    print(f"Stored resume from {file_path} with skills: {skills}")

In [8]:
# Step 6: Load CSV data containing resume paths and skills
def load_csv(csv_path):
    """Read the CSV at ``csv_path`` into a pandas DataFrame and return it."""
    frame = pd.read_csv(csv_path)
    return frame

In [9]:

# Step 7: Initialize the Sentence Transformer model
# Loaded once at module level. process_resumes and find_relevant_resumes both
# pass this same `model` to generate_embeddings, so stored resumes and the job
# description query are encoded by the same model.
model = SentenceTransformer('all-MiniLM-L6-v2')



In [10]:
# Step 8: Create (or reuse) the ChromaDB collection for resume embeddings
# get_or_create_collection keeps this cell re-runnable in a live kernel;
# create_collection raises if the named collection already exists.
collection = client.get_or_create_collection(name="resume_embeddings_chromadb")

In [11]:
# Step 9: Process each row in the CSV to store resume embeddings
def process_resumes(csv_path):
    """Embed and store every resume listed in the CSV at ``csv_path``.

    Each row must provide a ``Techstack`` column (skills) and a ``Paths``
    column (resume PDF path). Rows whose PDF yields no text are skipped, so
    a failed extraction never ends up in the collection as an empty document.

    Args:
        csv_path: Path of the CSV file, passed to ``load_csv``.
    """
    data = load_csv(csv_path)
    for _, row in data.iterrows():
        skills = row['Techstack']
        resume_path = row['Paths']

        # Extract text first: extract_text_from_pdf returns "" on failure, and
        # an embedding of empty text would only add noise to later queries.
        resume_text = extract_text_from_pdf(resume_path)
        if not resume_text.strip():
            print(f"Skipping {resume_path}: no text could be extracted.")
            continue

        # Generate skills and resume embeddings
        skills_embedding = generate_embeddings(skills, model)
        resume_embedding = generate_embeddings(resume_text, model)

        # Store the embeddings and metadata in ChromaDB
        store_in_chromadb(collection, skills, resume_text, skills_embedding, resume_embedding, resume_path)

In [12]:
# Step 10: Retrieve relevant resumes based on job description
def find_relevant_resumes(job_description, top_k=5):
    """Return metadata of the stored resumes closest to ``job_description``.

    Args:
        job_description: Text of the job posting to match against.
        top_k: Maximum number of resumes to return.

    Returns:
        The ``'metadatas'`` entry of the ChromaDB query result: a list holding
        one inner list (for the single query embedding) of metadata dicts,
        best match first. ``[[]]`` when the collection is empty.
    """
    available = collection.count()
    if available == 0:
        # Nothing to search; keep the nested shape callers already index into.
        return [[]]

    job_embedding = generate_embeddings(job_description, model)
    results = collection.query(
        query_embeddings=[job_embedding.tolist()],
        # Never ask for more neighbours than the collection holds.
        n_results=min(top_k, available)
    )
    return results['metadatas']

In [13]:
# Step 11: Generate a tailored cover letter based on job description and selected resume
def generate_cover_letter(job_description, selected_resume_text):
    """Ask the LLM for a cover letter tailored to one job posting.

    The prompt template is filled with ``job_description`` and
    ``selected_resume_text``, piped into the module-level ``llm``, and the
    ``content`` of the model's response is returned.
    """
    cover_letter_template = """
        ### JOB DESCRIPTION:
        {job_description}

        ### INSTRUCTION:
        You are Anwitha Arbi, a Master’s student in Computer Science at Santa Clara University, graduating in June 2025. 
        You are passionate about AI, backend development, data engineering, and full-stack development. 
        Your experience spans AI and machine learning projects, backend architecture, cloud engineering, scalable data engineering solutions, distributed systems, and full-stack applications. 
        You have worked at leading organizations like Nvidia and LTIMindtree, where you contributed to projects in autonomous vehicles, cloud engineering, backend engineering, distributed systems, and LLM-based applications. 
        You are eager to apply these skills to meet the specific needs of the role described.

        Below, you need to write a tailored cover letter to the employer regarding the job mentioned above.
        In your cover letter, introduce yourself and explain why you are a strong candidate for the position. 
        Mention the following relevant work experiences and highlight 4-5 key projects or accomplishments from your resume:

        {selected_resume_text}

        Tailor your writing to the job description, emphasizing how your background and skill set in AI, backend development, data engineering, and full-stack development align with their needs.

        ### COVER LETTER (NO PREAMBLE):
        """
    cover_letter_prompt = PromptTemplate.from_template(cover_letter_template)

    # Prompt -> LLM pipeline; the response object carries the text in `.content`.
    cover_letter_chain = cover_letter_prompt | llm
    prompt_inputs = {
        "job_description": job_description,
        "selected_resume_text": selected_resume_text,
    }
    response = cover_letter_chain.invoke(input=prompt_inputs)
    return response.content

In [18]:
# Step 12: Main function to run the end-to-end process
def main(job_url, csv_path):
    """Run the end-to-end flow: job posting -> best resume -> cover letter.

    Args:
        job_url: URL of the job posting to load.
        csv_path: CSV listing the resume PDF paths and their tech stacks.

    Prints progress and the generated cover letter; returns nothing. If no
    resume matches, a message is printed and no letter is generated.
    """
    # Step 12.1: Extract job description
    job_description = extract_job_description(job_url)
    print("Extracted job description successfully.")

    # Step 12.2: Process resumes from CSV and store embeddings
    process_resumes(csv_path)

    # Step 12.3: Find relevant resumes based on job description
    # find_relevant_resumes returns the query's 'metadatas' entry, which holds
    # one inner list per query; the matches for our single query are at [0].
    query_results = find_relevant_resumes(job_description)
    relevant_resumes = query_results[0] if query_results else []
    print(f"Found {len(relevant_resumes)} relevant resumes.")
    if not relevant_resumes:
        print("No matching resume found; skipping cover letter generation.")
        return

    # Step 12.4: Select the most relevant resume and load its actual text.
    # The query returns only metadata, so the text is re-read from the stored
    # file path; fall back to the skills summary if the PDF cannot be read.
    best_match = relevant_resumes[0]
    selected_resume_text = extract_text_from_pdf(best_match["file_path"])
    if not selected_resume_text:
        selected_resume_text = str(best_match.get("skills", ""))

    # Step 12.5: Generate cover letter
    cover_letter = generate_cover_letter(job_description, selected_resume_text)
    print("Generated cover letter:")
    print(cover_letter)


In [19]:
# Example usage: run the full pipeline for one job posting.
# job_url is the posting page whose text main() loads as the job description.
job_url = "https://www.amazon.jobs/en/jobs/2781927/data-engineer-co-op-winter-internship-2025-us?cmpid=SPLICX0248M&ss=paid&utm_campaign=cxro&utm_content=job_posting&utm_medium=social_media&utm_source=linkedin.com"
# The CSV must provide the 'Paths' and 'Techstack' columns read by process_resumes.
csv_path = "my_portfolio.csv"  # Path to your CSV file containing resume paths and tech stacks
main(job_url, csv_path)

Extracted job description successfully.
Stored resume from C:/Users/anwit/Documents/Resumes/Default Resume/Anwitha_Arbi_Resume.pdf with skills: Python, AWS, React, MongoDB, NoSQL, Distributed Systems, Docker, Kubernetes, Langchain, AI, PostgreSQL, backend, REST API, Java, AI, Vector Database, Gen AI, LLM
Stored resume from C:/Users/anwit/Documents/Resumes/React Resume/Anwitha_Arbi_Resume.pdf with skills: JavaScript, CSS, GraphQL, React.js, MongoDB, Node.js, REST API, Python, Flask, Docker, Kubernetes, AWS, Java
Stored resume from C:/Users/anwit/Documents/Resumes/OpenCV Backend/Anwitha_Arbi_Resume.pdf with skills: Pandas, Numpy, OpenCV, AWS EC2, Langchain, Llama3, MySQL, ChromaDB, Streamlit, Python, Flask, Docker, Kubernetes, React.js, MongoDB, Node.js, REST, Java
Stored resume from C:/Users/anwit/Documents/Resumes/Open CV Distributed Systems ML/Anwitha_Arbi_Resume.pdf with skills: OpenCV, Machine Learning, Spark, Scala, Distributed Systems, Python,  Data Engineering, data warehouse
Fou