## Setup Environment

In [1]:
from dotenv import load_dotenv
import os, json,sys
import numpy as np

# langsh*t (should find an alternative asap!)

from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document # required for splitting the text using lang****
from langchain_core.prompts import ChatPromptTemplate # Could do without it

from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright

In [26]:
# get ai provider api endpoints and keys from .env
load_dotenv()
print(os.getenv("FAST_LLM_API_BASE"))
print(os.getenv("CONTEXT_LLM_API_BASE"))

https://api.groq.com/openai/v1
https://generativelanguage.googleapis.com/v1beta/openai/


## Get Inputs

In [3]:
jobPostingUrl="https://www.linkedin.com/jobs/view/4300570375"

In [4]:
cvPath="examples/cv.pdf"
jobPostingPath = "examples/jobPostingText.txt"
additionalInfoPath = "examples/additionalInfo.txt"

# check if jobPostingUrl is defined
try:
    print(jobPostingUrl)
except:
    jobPostingUrl=""


https://www.linkedin.com/jobs/view/4300570375


## Static inputs
Prompts & Template

In [5]:
def load_config_file(filename):
    """Load configuration files from the config directory"""
    config_path = os.path.join('config', filename)
    with open(config_path, 'r', encoding='utf-8') as file:
        return file.read().strip()

cvTemplate = load_config_file('cvTemplate.txt')
cvKeywordsPrompt = load_config_file('cvKeywordsPrompt.txt')
jobKeywordsPrompt = load_config_file('jobKeywordsPrompt.txt')
jobRagPrompt = load_config_file('jobRagPrompt.txt')
cvEnhancePrompt = load_config_file('cvEnhancePrompt.txt')

print("Everything loaded successfully!")

Everything loaded successfully!


## Process Inputs

Handle CV and Additonal Info

In [6]:
# read cv:
loader = PyPDFLoader(cvPath)
pages = []
for page in loader.load():
    pages.append(page)
# join pages into a single string
cvRawText = "\n".join([page.page_content for page in pages])

# read additional info
with open(additionalInfoPath, 'r', encoding='utf-8') as file:
    additionalInfo = file.read().strip()

# combine cv and additional info
if additionalInfo:
    cvText = f"""
------------
CV Raw Text:
------------
{cvRawText}
---------------
Additional Info:
---------------
{additionalInfo}
    """
else:
    cvText = cvRawText
print(cvText)


------------
CV Raw Text:
------------
Ahmed Taha
Fresh Software Engineer
 ahmedtaha1234@gmail.com  +201557528856  Cairo, Egypt  creative-geek.tech  github.com/Creative-Geek
 linkedin.com/in/ahmed-taha-thecg  Exempted
PROFILE
Freshly graduated Software Engineer with hands-on experience in web development, AI integrations & Automation, and
multimedia production. Skilled in React, Nodejs, Flask, and Python, aspires to create dynamic, user-friendly applications. Has
delivered projects from web solutions to AI-driven tools‚Äîincluding an Arabic Handwriting E2E OCR system. Strong in UI/UX
design and committed to crafting efficient, engaging digital experiences.
PROJECTS
Tasky,AI-Powered Todo List 04/2025 ‚Äì 05/2025
Developed a fullstack todo list app with React, Node.js, and Prisma, focusing on user-friendly design and smooth
animations.
Deployed the client, server, and Postgres database, while enforcing security best practices.
Integrated an AI that turns pasted coworker messages into ta

### Handle Job Posting

Here we're gonna define a couple of functions to handle the job posting.

if job posting is url -> fetch html -> clean it -> embed it -> rag it to get the job posting text

if it's a text then we're done here

In [7]:
# define a couple of functions for url job posting extraction (probably the longest part of this program)
# This üëá was a pain to run inside the jupyter notebook on windows
async def fetchUrl(jobPostingUrl):
    if sys.platform == 'win32':
        import asyncio
        asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
    def run_sync_playwright():
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            context = browser.new_context()
            page = context.new_page()
            page.goto(jobPostingUrl)

            data = {
                'url': page.url,
                'title': page.title(),
                'content': page.content(),
            }

            browser.close()
            return data['content']
    # run the code in a separate thread (because notebook)
    content = await asyncio.to_thread(run_sync_playwright)
    return content


# removing anything but text from the html
def cleanHTML(jobPostingHTML):
    soup = BeautifulSoup(jobPostingHTML, "html.parser")
    
    # Remove script and style tags
    for tag in soup(["script", "style"]):
        tag.decompose() # weird naming but I'll alow it
    
    text = soup.get_text(separator="\n", strip=True) # get text ONLY! (separated by line)
    return text

# a function for generating embeddings
def embed(text):
    # Get ready for embedding
    embeddings = OpenAIEmbeddings(
        model=os.getenv("EMBED_LLM_MODEL_NAME"),
        api_key=os.getenv("EMBED_LLM_API_KEY"),
        base_url=os.getenv("EMBED_LLM_API_BASE"),
    )
    
    # Text splitter definition
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000, chunk_overlap=200
    )
    
    documents = [Document(page_content=text)] # because lang****
    
    # Now split the text
    chunks = text_splitter.split_documents(documents)
    print(f"Created {len(chunks)} chunks.")
    
    # Embed! Put in memory (see? we're not storing anything!)
    print("Creating vector store with OpenAI embeddings...")
    vector_store = InMemoryVectorStore.from_documents(chunks, embeddings)
    # ‚ö†Ô∏è‚ö†Ô∏è BIG TODO: embedding those in one request WILL FAIL with very large text (which is stupid because that's why we're doing RAG in the first place)
    # Solution? batch processing, send multiple requests each with like 16-32 chunks or something
    print("Vector store ready.")
    
    return vector_store

#RAAAAAAAAAAG!
def doRAG(jobExtractedText):
    
    # Embed the job posting
    vector_store = embed(jobExtractedText)
    
    # Retrieve
    retriever = vector_store.as_retriever(search_kwargs={"k": 3}) # could try more chunks but since beautiful soup already cleaned the text, it should be fine
    query = "job title responsibilities qualifications requirements description" # TODO: git gud at prompt engineering
    relevant_pieces = retriever.invoke(query)
    
    # Quick check in case it fails
    if not relevant_pieces:
        print("No relevant chunks found.")
        return None
    combined_context = "\n\n".join([doc.page_content for doc in relevant_pieces[:3]])
    
    # Ask LLM
    FAST_LLM = ChatOpenAI(
        model=os.getenv("FAST_LLM_MODEL_NAME"),
        api_key=os.getenv("FAST_LLM_API_KEY"),
        base_url=os.getenv("FAST_LLM_API_BASE"),
    )
    # Prepare prompt
    completeJobRagPrompt = ChatPromptTemplate.from_messages([
        ("system", jobRagPrompt),
        ("human", "Extract the job details from this text:\n\n{text}")
    ])
    
    
    chain = completeJobRagPrompt | FAST_LLM # "Why is this a chain?" no idea :)
    
    try:
        response = chain.invoke({"text": combined_context})
        # try parsing the response as json
        if hasattr(response, 'content'):
            json_text = response.content
        else:
            json_text = str(response)
            
        json_text = json_text.strip()
        
        # in case it wraps it in a code block (Not a good way but if it works...)
        if json_text.startswith('```json'):
            json_text = json_text[7:]
        if json_text.endswith('```'):
            json_text = json_text[:-3]
        json_text = json_text.strip()
        
        jobPostingText = json.loads(json_text)
        print("\n--- Extraction Complete ---")   
        return jobPostingText
        
    except Exception as error:
        print("Error: ", error)

# this is a simpler embed function that doesn't return a store, instead it returns the vectors directly (for use with keyword embeddings)
def embedSingle(text):
    """Get single embedding for entire text (no chunking)"""
    embeddings = OpenAIEmbeddings(
        model=os.getenv("EMBED_LLM_MODEL_NAME"),
        api_key=os.getenv("EMBED_LLM_API_KEY"),
        base_url=os.getenv("EMBED_LLM_API_BASE"),
    )
    
    # Get single embedding for the whole text
    embedding_vector = embeddings.embed_query(text)
    return np.array(embedding_vector)

async def extract_job_posting_from_url(jobPostingUrl):
    jobPostingHTML = await fetchUrl(jobPostingUrl)
    print("HTML fetched.")
    print(jobPostingHTML[:500])
    jobExtractedText = cleanHTML(jobPostingHTML)
    print("HTML cleaned.")
    jobPostingText = doRAG(jobExtractedText)
    print(json.dumps(jobPostingText, indent=2))
    return jobPostingText


In [8]:
# jobPostingHTML = await fetchUrl(jobPostingUrl)
# print(jobPostingHTML[:500])

In [9]:
# jobExtractedText = cleanHTML(jobPostingHTML)
# print(jobExtractedText[:500])

In [10]:
# jobPostingText = doRAG(jobExtractedText)
# print(json.dumps(jobPostingText, indent=2))

In [11]:
# check if job posting is a url or direct text:

if jobPostingUrl:
    jobPostingText = await extract_job_posting_from_url(jobPostingUrl)
    print(json.dumps(jobPostingText, indent=2))
else:
    with open(jobPostingPath, 'r', encoding='utf-8') as file:
        jobPostingText = file.read().strip()


HTML fetched.
<!DOCTYPE html><html lang="en"><head>
        <meta name="pageKey" content="d_jobs_guest_details">
          
    <meta name="robots" content="max-image-preview:large, noarchive">
      <meta name="bingbot" content="max-image-preview:large">
  
<!----><!---->        <meta name="locale" content="en_US">
<!---->        <meta id="config" data-app-version="2.0.2576" data-call-tree-id="AAY/2UqtZAEcGYKncwOL4g==" data-multiproduct-name="jobs-guest-frontend" data-service-name="jobs-guest-frontend" data-
HTML cleaned.
Created 6 chunks.
Creating vector store with OpenAI embeddings...
Vector store ready.

--- Extraction Complete ---
{
  "role_summary": "Entry\u2011level IT Help Desk professional responsible for managing IT assets, troubleshooting network, hardware and software issues, and delivering user support across the main office and branch locations.",
  "key_responsibilities": [
    "Manage IT department assets.",
    "Solve network problems and device issues in main office a

## Make Some AI Calls ü§ô

First, prepare prompts

In [12]:
# format cv keyword extraction prompt
cvKeywordsPrompt = cvKeywordsPrompt.format(cvText=cvText)
# format job posting keyword extraction prompt
jobKeywordsPrompt = jobKeywordsPrompt.format(jobPostingText=jobPostingText)

In [13]:
print(cvKeywordsPrompt)

You are an expert keyword extraction system specializing in resume and CV analysis. Your task is to extract relevant professional keywords from the provided CV text.

**Instructions:**
1. Extract keywords that are professionally relevant and valuable for job matching, skill assessment, or career analysis
2. Focus on the following categories:
   - Technical skills (programming languages, software, tools, frameworks)
   - Professional skills (project management, leadership, analysis, etc.)
   - Industry terms and domain expertise
   - Job titles and roles
   - Certifications and qualifications
   - Relevant technologies and methodologies
   - Important action verbs that demonstrate capabilities

**Guidelines:**
- Extract single words or short phrases (2-3 words maximum)
- Include both explicit skills mentioned and implied competencies
- Normalize variations (e.g., "JavaScript" and "JS" should be "JavaScript")
- Include both technical and soft skills
- Avoid generic words like "the", "and

012 üòÜ

...I mean, hit the api endpoints

In [14]:
FAST_LLM = ChatOpenAI(model=os.getenv("FAST_LLM_MODEL_NAME"),
                      base_url=os.getenv("FAST_LLM_API_BASE"),
                      api_key=os.getenv("FAST_LLM_API_KEY"),
                      )

cvKeywordsMessages = [(
    "human",
    cvKeywordsPrompt,
)]

jobKeywordsMessages = [(
    "human",
    jobKeywordsPrompt,
)]

cvKeywords = FAST_LLM.invoke(cvKeywordsMessages)
print(cvKeywords.content)
jobKeywords = FAST_LLM.invoke(jobKeywordsMessages)
print(jobKeywords.content)

["Python","C++","JavaScript","TypeScript","React","React Native","Next.js","Vue","Flask","FastAPI","Django","Node.js","WordPress","LLMs","Agent AI","LangChain","Stable Diffusion","Flux","Vertex AI Platform","TensorFlow","Image Processing","Google Cloud","Azure","Docker","GitHub","Git","Jira","Linux","Prisma","SQLite","PostgreSQL","MongoDB","Godot Engine","Qt","Graphic Design","Video Editing","Motion Graphics","Adobe Creative Suite","UI/UX Design","Technical Writing","Content Creation","Communication","Teamwork","Problem Solving","Adaptability","Creativity","Time Management","Detail Oriented","Software Engineer","Software Developer","IT Specialist","Store Developer","Graphics Designer","Tech Content Writer","Fullstack Development","Web Development","AI Integration","Automation","OCR","E2E OCR","CNN","Bi-LSTM","Handwriting Recognition","Generative AI","Agentic LLM","AI for SDGs","FOSS","Markdown","SSR","SEO","RTL Support","Real-time Monitoring","Web Dashboard","Email Alerts","Secure Auth

## Cosine Similarity
Now we need to know how 'similar' the resume is to the job posting.

To do this we make two embeddings:

1.  An embedding for the entire content of the resume.
2.  An embedding for the string of extracted job keywords.

An embedding is just a multi-dimensional vector representing the 'meaning' of a token in relation to other tokens.

So, by knowing the angle between those two vectors, we can know how 'similar' they are in 'meaning'.

In [15]:
# Cosine similarity is the cosine of the angle between the vectors; that is, it is the dot product of the vectors divided by the product of their lengths
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

In [16]:
cvEmbeddingVector = embedSingle(cvKeywords.content)
jobEmbeddingVector = embedSingle(jobKeywords.content)

currentCosineSimilarity = cosine_similarity(cvEmbeddingVector, jobEmbeddingVector)
print(f"Cosine Similarity: {currentCosineSimilarity}")

Cosine Similarity: 0.5730963347432936


## Resume Enhancement

Now the fun part, we know:

1. The job posting text (jobPostingText)
2. The job posting keywords (jobKeywords)
3. The resume text (cvText)
4. The resume keywords (cvKeywords)
5. How similar are the resume and the job posting (currentCosineSimilarity)
6. The cv template we can render the final resume in (cvTemplate)

Let's pass _ALL_ of this to a smart LLM.

In [17]:
def formatCvEnhancePrompt(cvTemplate, cvText, jobPostingText, cvKeywords, jobKeywords, currentCosineSimilarity):
    return cvEnhancePrompt.format(
        cvTemplate=cvTemplate,
        cvText=cvText,
        jobPostingText=jobPostingText,
        cvKeywords=cvKeywords,
        jobKeywords=jobKeywords,
        currentCosineSimilarity=currentCosineSimilarity,
    )

def generateFinalCV(cvEnhancePromptFormatted):

    # format messages
    cvEnhanceMessages = [(
        "human",
        cvEnhancePromptFormatted,
    )]
    
    # define smart LLM
    SMART_LLM = ChatOpenAI(
        model=os.getenv("SMART_LLM_MODEL_NAME"),
        base_url=os.getenv("SMART_LLM_API_BASE"),
        api_key=os.getenv("SMART_LLM_API_KEY"),
    )
    
    # get response
    response = SMART_LLM.invoke(cvEnhanceMessages)
    return response.content

In [18]:
cvEnhancePromptFormatted = formatCvEnhancePrompt(cvTemplate, cvText, jobPostingText, cvKeywords.content, jobKeywords.content, currentCosineSimilarity)
print(cvEnhancePromptFormatted)

You are an expert resume editor and talent acquisition specialist. Your task is to revise the following resume so that it aligns as closely as possible with the provided job description and extracted job keywords, in order to maximize the cosine similarity between the resume and the job keywords.

**Instructions:**
- Carefully review the job description and the list of extracted job keywords.
- Update the candidate's resume by:
    - Emphasizing and naturally incorporating relevant skills, experiences, and keywords from the job description and keyword list.
    - Where appropriate, naturally weave the extracted job keywords into the resume content.
    - Rewriting, adding, or removing resume content as needed to better match the job requirements.
    - Maintaining a natural, professional tone and avoiding keyword stuffing.
    - Where possible, use quantifiable achievements and action verbs.
    - The current cosine similarity score is 0.5731. Revise the resume to further increase this

In [19]:
finalCV = generateFinalCV(cvEnhancePromptFormatted)
print(finalCV)

---
name: Ahmed Taha
header:
  - text: |
      <span style="font-style: italic; font-weight: normal; display: block; margin-top: -7.5px; margin-bottom:5px;">
      Entry-Level IT Help Desk Professional
      </span>
  - text: <span class="iconify" data-icon="tabler:mail"></span> ahmedtaha1234@gmail.com
    link: mailto:ahmedtaha1234@gmail.com
  - text: <span class="iconify" data-icon="tabler:phone"></span> +201557528856
  - text: <span class="iconify" data-icon="tabler:map-pin"></span> Cairo, Egypt
  - text: <span class="iconify" data-icon="tabler:world"></span> creative-geek.tech
    link: https://creative-geek.tech
  - text: <span class="iconify" data-icon="tabler:brand-github"></span> github.com/Creative-Geek
    link: https://github.com/Creative-Geek
  - text: <span class="iconify" data-icon="tabler:brand-linkedin"></span> linkedin.com/in/ahmed-taha-thecg
    link: https://linkedin.com/in/ahmed-taha-thecg
  - text: <span class="iconify" data-icon="tabler:shield-check"></span> Exemp

In [20]:
# save to cv.md
with open("examples/cv.md", "w", encoding="utf-8") as f:
    f.write(finalCV)

In [21]:
# Embed the improved CV and compute the new cosine similarity vs job keywords
newCvEmbeddingVector = embedSingle(finalCV)
newCosineSimilarity = cosine_similarity(newCvEmbeddingVector, jobEmbeddingVector)

print(f"New Cosine Similarity: {newCosineSimilarity:.6f}")
print(f"Improvement over previous: {float(newCosineSimilarity - currentCosineSimilarity):+.6f}")

New Cosine Similarity: 0.533371
Improvement over previous: -0.039725


In [None]:
# Try again if no improvement
if newCosineSimilarity <= currentCosineSimilarity:
    # we'll append the "New Cosine Similarity: 0.533371; Improvement over previous: -0.039725; to the chat messages and make a new call"
    
    # form the similarity string
    similarityString = f"New Cosine Similarity: {newCosineSimilarity:.6f}; Improvement over previous: {float(newCosineSimilarity - currentCosineSimilarity):+.6f}"
    # this is a wrong way of using langc**n but it's a quick fix for now
    cvEnhanceMessages = [
        ("human", cvEnhancePromptFormatted),
        ("assistant", finalCV),
        ("human", similarityString),
    ]
    
    # define smart LLM
    SMART_LLM = ChatOpenAI(
        model=os.getenv("SMART_LLM_MODEL_NAME"),
        base_url=os.getenv("SMART_LLM_API_BASE"),
        api_key=os.getenv("SMART_LLM_API_KEY"),
    )
    
    # get response
    response = SMART_LLM.invoke(cvEnhanceMessages)
    print(response.content)

## HTML Generation

Now we need to convert our cv to a pdf using a custom built API.

You can also use [ohmycv!](https://ohmycv.app/)

In [27]:
import requests
url = os.getenv('SPECIAL_SAUCE_API_URL')

headers = {
    "X-API-Key": os.getenv('SPECIAL_SAUCE_API_KEY')
}

# Prepare the form data
data = {
    "outputFormat": "html",
    "fontSize": "12px",
    "lineHeight": "1.15",
    "marginTop": "45px",
    "marginBottom": "45px",
    "marginLeft": "40px",
    "marginRight": "40px"
}
files = {
    "markdown": ("cv.md", finalCV.encode("utf-8"))
}

# Make POST request
response = requests.post(url, headers=headers, data=data, files=files)

# Check the response
if response.status_code == 200:
    with open("examples/cv.html", "w", encoding="utf-8") as f:
        f.write(response.text)
    print("HTML generated and saved to cv.html")
else:
    print(f"Error: {response.status_code} - {response.text}")

HTML generated and saved to cv.html
