## Setup Environment

In [1]:
from dotenv import load_dotenv
import os, json,sys

# langsh*t (should find an alternative asap!)

from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document # required for splitting the text using lang****
from langchain_core.prompts import ChatPromptTemplate # Could do without it

from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright

In [2]:
# get ai provider api endpoints and keys from .env
load_dotenv()
print(os.getenv("FAST_LLM_API_BASE"))

https://api.groq.com/openai/v1


## Get Inputs

In [3]:
jobPostingUrl="https://www.linkedin.com/jobs/view/4292529710"

In [4]:
cvPath="examples/cv.pdf"
jobPostingPath = "examples/jobPostingText.txt"
additionalInfoPath = "examples/additionalInfo.txt"

# check if jobPostingUrl is defined
try:
    print(jobPostingUrl)
except:
    jobPostingUrl=""


https://www.linkedin.com/jobs/view/4292529710


## Static inputs
Prompts & Template

In [11]:
def load_config_file(filename):
    """Load configuration files from the config directory"""
    config_path = os.path.join('config', filename)
    with open(config_path, 'r', encoding='utf-8') as file:
        return file.read().strip()

cvTemplate = load_config_file('cvTemplate.txt')
cvKeywordsPrompt = load_config_file('cvKeywordsPrompt.txt')
jobKeywordsPrompt = load_config_file('jobKeywordsPrompt.txt')
jobRagPrompt = load_config_file('jobRagPrompt.txt')

print("Everything loaded successfully!")

Everything loaded successfully!


## Process Inputs

Handle CV and Additonal Info

In [6]:
# read cv:
loader = PyPDFLoader(cvPath)
pages = []
for page in loader.load():
    pages.append(page)
# join pages into a single string
cvRawText = "\n".join([page.page_content for page in pages])

# read additional info
with open(additionalInfoPath, 'r', encoding='utf-8') as file:
    additionalInfo = file.read().strip()

# combine cv and additional info
if additionalInfo:
    cvText = f"""
------------
CV Raw Text:
------------
{cvRawText}
---------------
Additional Info:
---------------
{additionalInfo}
    """
else:
    cvText = cvRawText
print(cvText)


------------
CV Raw Text:
------------
Ahmed Taha
Fresh Software Engineer
 ahmedtaha1234@gmail.com  +201557528856  Cairo, Egypt  creative-geek.tech  github.com/Creative-Geek
 linkedin.com/in/ahmed-taha-thecg  Exempted
PROFILE
Freshly graduated Software Engineer with hands-on experience in web development, AI integrations & Automation, and
multimedia production. Skilled in React, Nodejs, Flask, and Python, aspires to create dynamic, user-friendly applications. Has
delivered projects from web solutions to AI-driven tools‚Äîincluding an Arabic Handwriting E2E OCR system. Strong in UI/UX
design and committed to crafting efficient, engaging digital experiences.
PROJECTS
Tasky,AI-Powered Todo List 04/2025 ‚Äì 05/2025
Developed a fullstack todo list app with React, Node.js, and Prisma, focusing on user-friendly design and smooth
animations.
Deployed the client, server, and Postgres database, while enforcing security best practices.
Integrated an AI that turns pasted coworker messages into ta

### Handle Job Posting

Here we're gonna define a couple of functions to handle the job posting.

if job posting is url -> fetch html -> clean it -> embed it -> rag it to get the job posting text

if it's a text then we're done here

In [13]:
# define a couple of functions for url job posting extraction (probably the longest part of this program)
# This üëá was a pain to run inside the jupyter notebook on windows
async def fetchUrl(jobPostingUrl):
    if sys.platform == 'win32':
        import asyncio
        asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
    def run_sync_playwright():
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            context = browser.new_context()
            page = context.new_page()
            page.goto(jobPostingUrl)

            data = {
                'url': page.url,
                'title': page.title(),
                'content': page.content(),
            }

            browser.close()
            return data['content']
    # run the code in a separate thread (because notebook)
    content = await asyncio.to_thread(run_sync_playwright)
    return content


# removing anything but text from the html
def cleanHTML(jobPostingHTML):
    soup = BeautifulSoup(jobPostingHTML, "html.parser")
    
    # Remove script and style tags
    for tag in soup(["script", "style"]):
        tag.decompose() # weird naming but I'll alow it
    
    text = soup.get_text(separator="\n", strip=True) # get text ONLY! (separated by line)
    return text

# a function for generating embeddings
def embed(text):
    # Get ready for embedding
    embeddings = OpenAIEmbeddings(
        model=os.getenv("EMBED_LLM_MODEL_NAME"),
        api_key=os.getenv("EMBED_LLM_API_KEY"),
        base_url=os.getenv("EMBED_LLM_API_BASE"),
    )
    
    # Text splitter definition
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000, chunk_overlap=200
    )
    
    documents = [Document(page_content=text)] # because lang****
    
    # Now split the text
    chunks = text_splitter.split_documents(documents)
    print(f"Created {len(chunks)} chunks.")
    
    # Embed! Put in memory (see? we're not storing anything!)
    print("Creating vector store with OpenAI embeddings...")
    vector_store = InMemoryVectorStore.from_documents(chunks, embeddings)
    # ‚ö†Ô∏è‚ö†Ô∏è BIG TODO: embedding those in one request WILL FAIL with very large text (which is stupid because that's why we're doing RAG in the first place)
    # Solution? batch processing, send multiple requests each with like 16-32 chunks or something
    print("Vector store ready.")
    
    return vector_store

#RAAAAAAAAAAG!
def doRAG(jobExtractedText):
    
    # Embed the job posting
    vector_store = embed(jobExtractedText)
    
    # Retrieve
    retriever = vector_store.as_retriever(search_kwargs={"k": 3}) # could try more chunks but since beautiful soup already cleaned the text, it should be fine
    query = "job title responsibilities qualifications requirements description" # TODO: git gud at prompt engineering
    relevant_pieces = retriever.invoke(query)
    
    # Quick check in case it fails
    if not relevant_pieces:
        print("No relevant chunks found.")
        return None
    combined_context = "\n\n".join([doc.page_content for doc in relevant_pieces[:3]])
    
    # Ask LLM
    FAST_LLM = ChatOpenAI(
        model=os.getenv("FAST_LLM_MODEL_NAME"),
        api_key=os.getenv("FAST_LLM_API_KEY"),
        base_url=os.getenv("FAST_LLM_API_BASE"),
    )
    # Prepare prompt
    completeJobRagPrompt = ChatPromptTemplate.from_messages([
        ("system", jobRagPrompt),
        ("human", "Extract the job details from this text:\n\n{text}")
    ])
    
    
    chain = completeJobRagPrompt | FAST_LLM # "Why is this a chain?" no idea :)
    
    try:
        response = chain.invoke({"text": combined_context})
        # try parsing the response as json
        if hasattr(response, 'content'):
            json_text = response.content
        else:
            json_text = str(response)
            
        json_text = json_text.strip()
        
        # in case it wraps it in a code block (Not a good way but if it works...)
        if json_text.startswith('```json'):
            json_text = json_text[7:]
        if json_text.endswith('```'):
            json_text = json_text[:-3]
        json_text = json_text.strip()
        
        jobPostingText = json.loads(json_text)
        print("\n--- Extraction Complete ---")   
        return jobPostingText
        
    except Exception as error:
        print("Error: ", error)

def extract_job_posting_from_url(jobPostingUrl):
    jobPostingHTML = fetchUrl(jobPostingUrl)
    jobExtractedText = cleanHTML(jobPostingHTML)
    jobPostingText = doRAG(jobExtractedText)
    return jobPostingText


In [8]:
jobPostingHTML = await fetchUrl(jobPostingUrl)
print(jobPostingHTML[:500])

<!DOCTYPE html><html lang="en"><head>
        <meta name="pageKey" content="d_jobs_guest_details">
          
    <meta name="robots" content="max-image-preview:large, noarchive">
      <meta name="bingbot" content="max-image-preview:large">
  
<!----><!---->        <meta name="locale" content="en_US">
<!---->        <meta id="config" data-app-version="2.0.2576" data-call-tree-id="AAY/pCTWOh9fZm0uvlrHyA==" data-multiproduct-name="jobs-guest-frontend" data-service-name="jobs-guest-frontend" data-


In [9]:
jobExtractedText = cleanHTML(jobPostingHTML)
print(jobExtractedText[:500])

Everything To Gain hiring Fully Remote Software Engineer - Cairo in Cairo, Cairo, Egypt | LinkedIn
Skip to main content
LinkedIn
Fully Remote Software Engineer - Cairo in El Qantara Gharb
Expand search
Jobs
This button displays the currently selected search type. When expanded it provides a list of search options that will switch the search inputs to match the current selection.
Jobs
People
Learning
Clear text
Clear text
Clear text
Clear text
Clear text
Join now
Sign in
Fully Remote Software Eng


In [14]:
jobPostingText = doRAG(jobExtractedText)
print(json.dumps(jobPostingText, indent=2))

Created 6 chunks.
Creating vector store with OpenAI embeddings...
Vector store ready.

--- Extraction Complete ---
{
  "role_summary": "Fully remote Software Engineer based in Cairo, responsible for end\u2011to\u2011end development of efficient, secure, and user\u2011friendly software applications.",
  "key_responsibilities": [
    "Design, implement, and maintain software applications that fulfill business needs",
    "Collaborate with product and design teams to define software requirements and technical specifications",
    "Troubleshoot and optimize existing applications for performance and scalability",
    "Engage in code reviews and share knowledge with team members to maintain high coding standards",
    "Research and integrate new technologies to enhance the development process"
  ],
  "required_qualifications": [
    "Proficiency in programming languages like Python and JavaScript",
    "Experience with web application development and familiarity with popular front\u2011end f

In [15]:
# check if job posting is a url or direct text:

if jobPostingUrl:
    jobPostingText = extract_job_posting_from_url(jobPostingUrl)
    print(json.dumps(jobPostingText, indent=2))
else:
    with open(jobPostingPath, 'r', encoding='utf-8') as file:
        jobPostingText = file.read().strip()


TypeError: Incoming markup is of an invalid type: <coroutine object fetchUrl at 0x0000029F0A5E6A40>. Markup must be a string, a bytestring, or an open filehandle.

## Make Some AI Calls ü§ô

First, prepare prompts

In [None]:
# format cv keyword extraction prompt
cvKeywordsPrompt = cvKeywordsPrompt.format(cvText=cvText)
# format job posting keyword extraction prompt
jobKeywordsPrompt = jobKeywordsPrompt.format(jobPostingText=jobPostingText)

In [None]:
print(cvKeywordsPrompt)

012 üòÜ

...I mean, hit the api endpoints

In [None]:
FAST_LLM = ChatOpenAI(model=os.getenv("FAST_LLM_MODEL_NAME"),
                      base_url=os.getenv("FAST_LLM_API_BASE"),
                      api_key=os.getenv("FAST_LLM_API_KEY"),
                      )

cvKeywordsMessages = [(
    "human",
    cvKeywordsPrompt,
)]

jobKeywordsMessages = [(
    "human",
    jobKeywordsPrompt,
)]

cvKeywords = FAST_LLM.invoke(cvKeywordsMessages)
print(cvKeywords.content)
jobKeywords = FAST_LLM.invoke(jobKeywordsMessages)
print(jobKeywords.content)

## Cosine Similarity
Now we need to know how 'similar' the resume is to the job posting.

To do this we make two embeddings:

1.  An embedding for the entire content of the resume.
2.  An embedding for the string of extracted job keywords.

An embedding is just a multi-dimensional vector representing the 'meaning' of a token in relation to other tokens.

So, by knowing the angle between those two vectors, we can know how 'similar' they are in 'meaning'.

In [None]:
# # Make the same AI call 3 times and print results separated by a line
# responses = []
# for _ in range(3):
#     resp = FAST_LLM.invoke(cvKeywordsMessages)
#     responses.append(resp.content.strip())

# print("\n------\n".join(responses))