In [None]:
import pymupdf
import fitz 
from pydantic import BaseModel
import os 
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import time 
from openai import OpenAI
import json 
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, Distance, VectorParams
import numpy as np 
from sentence_transformers import SentenceTransformer
from collections import defaultdict

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from unstructured.partition.pdf import partition_pdf

def pdf_to_text(pdf_path: str) -> str:
    elements = partition_pdf(pdf_path, strategy="fast")
    return "\n".join([el.text.strip() for el in elements if el.text.strip()])


In [3]:
# defining schema for input 

class ResumeInfo(BaseModel):
    name: str
    skills: list = []
    education: list = []
    work_experience: list = []
    projects: list = []
    filepath : str 


json_schema = ResumeInfo.model_json_schema()

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import time 

In [5]:
def LLM_call(prompt):
    client = OpenAI(
    base_url="http://172.16.2.214:8000/v1", 
    api_key="-" 
    )
    response = client.chat.completions.create(
    model="Qwen/Qwen2.5-32B-Instruct-AWQ",
    messages=[
        {"role": "user", "content": prompt}
    ],
   
   extra_body={"guided_json": json_schema}
    )

    print(response.choices[0].message.content)

    return json.loads(response.choices[0].message.content)


In [6]:
# writing the prompt as an input to the LLM 
def parsing_helper(markdown_text,filepath):
  prompt = f"""
  You are a precise and strict **Information Extraction Assistant**.

  Your task is to extract structured data from **unstructured CV text**, strictly following the provided JSON schema.
  add this filepath as well {filepath}

  json schema as follow : 
        name: str
        skills: list = []
        education: list = []
        work_experience: list = []
        projects: list = []
        filepath : str 
    


  ---

  ### Rules:
  - Only extract information that is **explicitly stated** in the CV text.
  - If a field is **missing**, use:'
    - `null` for missing strings
    - `[]` for missing lists
  - Do **not** hallucinate, infer, summarize, or rewrite content.
  - Preserve original text exactly as it appears.
  - Return a **valid JSON object only** — no markdown, no extra explanation.
  ### CV Text: 

  {markdown_text}

  ### Output(matches the schema):
"""
  
  return LLM_call(prompt)

In [7]:
def cv_parser_pipeline(path):
    candidates = []
    if not os.path.isdir(path):
        raise ValueError(f"{path} is not a valid directory!")

    for filename in os.listdir(path):
        file_path = os.path.join(path, filename)
        if os.path.isdir(file_path):
            continue

        ##Calling pdf_to_text and converting each into text and passing it to the llm 

        print("Converting CV to text")

        text =  pdf_to_text(file_path)
        
        ## passing it to the LLM 
        structured_output = parsing_helper(text,file_path)

        candidates.append(structured_output)
    return candidates

In [8]:
candidates = cv_parser_pipeline("Resumes")

Converting CV to text
{
  "name": "AHMAD RASHAD MOJEEB",
  "skills": [
    "Project Management: Hands-on experience with Jira for tracking tasks and managing sprints",
    "Frameworks & Tools: Spring Boot, .NET, React, Git, MySQLL",
    "Development Areas: Full-Stack Development, Game Development, and Operating Systems Simulation",
    "Deployment: Docker (Containerisation), Website Deployment",
    "Programming Languages: Java, Python, C#, SQL, Assembly, C++, C"
  ],
  "education": [
    {
      "degree": "Bachelor of Computer Science",
      "institution": "FAST | nuces",
      "years": "2022-2026"
    },
    {
      "degree": "Alevels",
      "institution": "Supernova school",
      "years": "2020-2022",
      "details": "1Astar 2As"
    },
    {
      "degree": "Olevels",
      "institution": "Supernova school",
      "years": "2018-2020",
      "details": "6Astar 2As"
    }
  ],
  "work_experience": [],
  "projects": [
    "SportSync – Full-Stack Sports Event Management System (Sp

In [41]:

client = QdrantClient("localhost", port=6333)
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")  
client.recreate_collection(
collection_name="cv_data",
vectors_config={
    "skills": VectorParams(size=384, distance=Distance.COSINE),
    "education": VectorParams(size=384, distance=Distance.COSINE),
    "work_experience": VectorParams(size=384, distance=Distance.COSINE),
    "projects": VectorParams(size=384, distance=Distance.COSINE),
    }
)

  client.recreate_collection(


True

In [42]:
required_fields = ["skills", "education", "work_experience", "projects"]

###-------------------------------------------------------------------------
def zero_vector(dim=384):
    return [0.0] * dim
###-------------------------------------------------------------------------
def join_and_embed(field_list,embedding_model):
    if not field_list:
        return zero_vector()  
    pieces = []
    for item in field_list:
        if isinstance(item, dict):
         
            pieces.append(", ".join(f"{k}: {v}" for k, v in item.items()))
        elif isinstance(item, str):
            pieces.append(item)
        else:
       
            pieces.append(str(item))
    text = " ".join(pieces)
    return embedding_model.encode([text])[0].tolist()

###-----------------------------------------------------------------------
def insert_candidate(candidate, collection_name="cv_data"):
    vector_data = {
        field: join_and_embed(candidate.get(field, []),embedding_model)
        for field in required_fields
    }
    
    payload = {}
    if "name" in candidate:
        payload["name"] = candidate["name"]
    if "filepath" in candidate: 
        payload["filepath"] = candidate["filepath"]

    # Point ID
    point_id = candidate.get("id", hash(candidate.get("name", "unknown")) & 0xFFFFFFFFFFFFFFFF)

    point = PointStruct(
        id=point_id,
        vector=vector_data,
        payload=payload
    )
    
    client.upsert(collection_name=collection_name, points=[point])
    print(f"Inserted: {payload.get('name', 'Unnamed')}")

In [43]:
## creating VEC DB 
def create_vec_db(candidates):
    for i, candidate in enumerate(candidates): 
        print("Inserting candidate")
        insert_candidate(candidate)
    scroll_result = client.scroll(
    collection_name="cv_data",
    with_payload=True,
    with_vectors=True,
    limit=100
)

    for point in scroll_result[0]:
        print(f"\nCandidate ID: {point.id}")
        print(f"Name: {point.payload.get('name', 'N/A')}")
        print(f"Filepath: {point.payload.get('filepath', 'N/A')}")  # ✅ This line shows the path
        print("Vectors:")
        for vector_name, vector_values in point.vector.items():
            print(f" - {vector_name} ({len(vector_values)} dims)")
            print(f"   {vector_values[:10]}...")

In [44]:
create_vec_db(candidates)

Inserting candidate
Inserted: AHMAD RASHAD MOJEEB
Inserting candidate
Inserted: Muhammad Azeem Chaudhry

Candidate ID: 15029620614353576281
Name: Muhammad Azeem Chaudhry
Filepath: Resumes\Muhammad Azeem Chaudhry's Resume.pdf
Vectors:
 - education (384 dims)
   [-0.021256179, -0.085238256, 0.02810062, 0.023358196, -0.05861551, -0.058037534, -0.08238744, -0.017035617, -0.083069034, 0.00675746]...
 - skills (384 dims)
   [-0.08107303, -0.18257825, -0.008319617, -0.0040023117, 0.06360502, -0.0043624775, -0.048732035, 0.030923348, -0.017590592, -0.009718532]...
 - projects (384 dims)
   [-0.03959531, -0.09473572, 0.025652448, -0.054258443, 0.0010956578, 0.029833755, -0.0011511324, -0.051974535, -0.0009792366, -0.08130918]...
 - work_experience (384 dims)
   [-0.05999295, -0.026150364, 0.014208963, 0.021188542, 0.024000205, -0.04163701, 0.0057619046, -0.0010891248, -0.035834394, -0.028546486]...

Candidate ID: 15308989954804716007
Name: AHMAD RASHAD MOJEEB
Filepath: Resumes\Ahmad Rashad (7).

In [22]:
def job_description_parser(job_description) : 
    
    job_prompt = f"""
    You are an **Information Extraction Assistant**.

    Your task:
    - Parse the provided **job description**.
    - Extract **explicit information only** — ***do not infer, invent, or assume***.
    - Output a **valid JSON object** that matches the schema shown below.

    ### JSON schema:
    {{
    "skills": [ "list of required skills as short strings" ],
    "work_experience": "explicit description of required work experience, as a string",
    "education": "explicit education or qualification requirements, as a string",
    "projects": [ "list of explicitly mentioned types of projects or domains" ]
    }}

    ### Rules:
    - If a field is not present in the job description, use:
    - an empty list `[]` for list fields,
    - or `null` for string fields.
    - Do **not** add any extra text outside the JSON.
    - Do **not** add markdown or explanations.
    - Preserve the original wording of the job description when filling fields.

    ### Job Description:
    {job_description}
    ### Output(matches the schema):
 """

    return LLM_call(job_prompt)

In [16]:
job_description = input("***Please Enter the Job Description***")
parsed__job_description = job_description_parser(job_description)

{
    "name": "Junior AI Engineer",
    "skills": ["Python", "NumPy", "Pandas", "TensorFlow", "PyTorch", "data structures", "algorithms", "Git", "version control", "communication", "teamwork"],
    "work_experience": ["Prior experience with Git and version control.", "Strong communication and teamwork skills."],
    "projects": ["GitHub", "Kaggle", "any relevant work that showcases your ML projects"]
  ,
    "filepath": "json_output/job_description_junior_ai_engineer.json"
}


In [64]:
job_vectors = {
    field: join_and_embed(parsed__job_description.get(field, []), embedding_model)
    for field in required_fields
}

fields = ["skills", "education", "work_experience", "projects"]
user_weights_raw = {}

print("Please enter weight for each field. Total should sum to 1 (e.g. 0.4, 0.2, etc.)")

# Step 1: Get input for each field
for field in fields:
    while True:
        try:
            weight = float(input(f"Enter weight for '{field}': "))
            if weight < 0:
                raise ValueError
            user_weights_raw[field] = weight
            break
        except ValueError:
            print("Invalid input. Please enter a non-negative number.")

total_weight = sum(user_weights_raw.values())

if abs(total_weight - 1.0) > 1e-6:
    print(f"\n Total weight entered is {total_weight:.3f}, normalizing to 1.")
    user_weights = {k: v / total_weight for k, v in user_weights_raw.items()}
else:
    user_weights = user_weights_raw

print("\n Normalized Weights:")
for field, weight in user_weights.items():
    print(f"  {field}: {weight:.3f}")


results = {}

for field in required_fields:
    hits = client.search(
        collection_name="cv_data",
        query_vector=(field, job_vectors[field]),  # Named vector search
        limit=5,  # Top 5 candidates
        with_payload=True,
        with_vectors=False  # No need to return full vectors now
    )
    results[field] = hits


score_board = defaultdict(float)

for field in results:
    weight = user_weights.get(field, 0)
    for hit in results[field]:
        score_board[hit.id] += hit.score * weight


Please enter weight for each field. Total should sum to 1 (e.g. 0.4, 0.2, etc.)

 Total weight entered is 1.300, normalizing to 1.

 Normalized Weights:
  skills: 0.154
  education: 0.077
  work_experience: 0.308
  projects: 0.462


  hits = client.search(


In [66]:
top_k = int(input("Enter number of top candidates to display: "))

ranked = sorted(score_board.items(), key=lambda x: x[1], reverse=True)

top_candidates = []  

print(f"\n🔍 Top {top_k} matching candidates:\n")
shown = 0

for candidate_id, total_score in ranked:
    point = next(
        (pt for pt in client.scroll(
            collection_name="cv_data",
            with_payload=True,
            with_vectors=False,
            limit=100
        )[0] if pt.id == candidate_id),
        None
    )
    if point:
        candidate_info = {
            "name": point.payload.get("name"),
            "filepath": point.payload.get("filepath"),
            "score": round(total_score, 4),
            "id": candidate_id
        }
        top_candidates.append(candidate_info)

        # Display
        print(f"Name: {candidate_info['name']}")
        print(f"Filepath: {candidate_info['filepath']}")
        print(f"Score: {candidate_info['score']}\n")

        shown += 1
    if shown >= top_k:
        break



🔍 Top 10 matching candidates:

Name: Muhammad Azeem Chaudhry
Filepath: Resumes\Muhammad Azeem Chaudhry's Resume.pdf
Score: 0.1723

Name: AHMAD RASHAD MOJEEB
Filepath: Resumes\Ahmad Rashad (7).pdf
Score: 0.1467



In [None]:
prompt_3 = f""" You are an expert technical recruiter and AI career advisor. Use the information provided below to perform an in-depth candidate evaluation using semantic embeddings retrieved from a vector database (Qdrant).

---

### 🧾 Job Description

{job_description}
---

### 📌 Top {top_k} Candidates from Vector Similarity Search : {top_candidates}

These candidates were retrieved from the Qdrant vector database based on semantic similarity to the job description. Each candidate includes their resume filepath,a vector similarity score and id.
---

Here is all the candidates 
{candidates}

### 🎯 Task

Analyze the candidates above with respect to the job description and perform the following:

1. **Compare** each candidate's qualifications with the job description in terms of:
   - Skills
   - Work experience
   - Relevant projects
   - Educational background

2. **Rank** the candidates from most to least suitable based on the job description and vector match scores.

3. **Justify** your top 1–2 recommendations with detailed reasoning, focusing on fit for the role. Also the decision should not be eccentric to the match scores

4. **Highlight Gaps**:
   - Are there any key missing skills or misalignments?
   - Are there strengths that go beyond the role?

5. **Provide Insights**:
   - Strengths and weaknesses of each candidate
   - A comparison table showing each candidate's match to the job requirements
   - Visual ideas: skill coverage bar chart or score comparison chart
   - Suggest which candidate fits which kind of sub-role (e.g., research-focused, full-stack AI, deployment)

6. Write in a **professional tone** suitable for HR and technical hiring managers.

---

Your goal is to aid the hiring manager in making a well-informed and confident decision based on both semantic similarity and practical job fit. Dont disclose any scores to the user in the output 
 """

In [70]:
client = OpenAI(
base_url="http://172.16.2.214:8000/v1", 
api_key="-" 
)
response = client.chat.completions.create(
model="Qwen/Qwen2.5-32B-Instruct-AWQ",
messages=[
    {"role": "user", "content": prompt_3}
    ],
    )

print(response.choices[0].message.content)

### Candidate Evaluation Report

#### Overview

This report evaluates two top candidates for the Junior AI Engineer position at TechVision based on their resumes and vector similarity scores. The analysis covers skills, work experience, relevant projects, educational background, and alignment with the job description.

#### Candidate Comparison

| Criteria           | Muhammad Azeem Chaudhry                          | AHMAD RASHAD MOJEEB                             |
|--------------------|--------------------------------------------------|--------------------------------------------------|
| **Skills**         | Strong in Python, Machine Learning, NLP, Docker  | Broad range in Full-Stack Development, Game Dev  |
| **Work Experience**| Multiple internships in AI/ML                     | None                                             |
| **Projects**       | AI-driven audio systems, Traffic Sign Classifier  | Full-Stack sports event management               |
| **Education**      | FAS