In [135]:
import pymupdf
import fitz 
from pydantic import BaseModel
import os 
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import time 
from openai import OpenAI
import json 
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, Distance, VectorParams
import numpy as np 
from sentence_transformers import SentenceTransformer

In [136]:
from unstructured.partition.pdf import partition_pdf

def pdf_to_text(pdf_path: str) -> str:
    elements = partition_pdf(pdf_path, strategy="fast")
    return "\n".join([el.text.strip() for el in elements if el.text.strip()])


In [137]:
# defining schema for input 

class ResumeInfo(BaseModel):
    name: str
    skills: list = []
    education: list = []
    work_experience: list = []
    projects: list = []
    filepath : str 


json_schema = ResumeInfo.model_json_schema()

In [138]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import time 

In [139]:
def LLM_call(prompt):
    client = OpenAI(
    base_url="http://172.16.2.214:8000/v1", 
    api_key="-" 
    )
    response = client.chat.completions.create(
    model="Qwen/Qwen2.5-32B-Instruct-AWQ",
    messages=[
        {"role": "user", "content": prompt}
    ],
   
   extra_body={"guided_json": json_schema}
    )

    print(response.choices[0].message.content)

    return json.loads(response.choices[0].message.content)


In [140]:
# writing the prompt as an input to the LLM 
def parsing_helper(markdown_text,filepath):
  prompt = f"""
  You are a precise and strict **Information Extraction Assistant**.

  Your task is to extract structured data from **unstructured CV text**, strictly following the provided JSON schema.
  add this filepath as well {filepath}

  json schema as follow : 
        name: str
        skills: list = []
        education: list = []
        work_experience: list = []
        projects: list = []
        filepath : str 
    


  ---

  ### Rules:
  - Only extract information that is **explicitly stated** in the CV text.
  - If a field is **missing**, use:'
    - `null` for missing strings
    - `[]` for missing lists
  - Do **not** hallucinate, infer, summarize, or rewrite content.
  - Preserve original text exactly as it appears.
  - Return a **valid JSON object only** — no markdown, no extra explanation.
  ### CV Text: 

  {markdown_text}

  ### Output(matches the schema):
"""
  
  return LLM_call(prompt)

In [141]:
def cv_parser_pipeline(path):
    candidates = []
    if not os.path.isdir(path):
        raise ValueError(f"{path} is not a valid directory!")

    for filename in os.listdir(path):
        file_path = os.path.join(path, filename)
        if os.path.isdir(file_path):
            continue

        ##Calling pdf_to_text and converting each into text and passing it to the llm 

        print("Converting CV to text")

        text =  pdf_to_text(file_path)
        
        ## passing it to the LLM 
        structured_output = parsing_helper(text,file_path)

        candidates.append(structured_output)
    return candidates

In [142]:
candidates = cv_parser_pipeline("Resumes")

Converting CV to text
{
  "name": "AHMAD RASHAD MOJEEB",
  "skills": [
    "Project Management: Hands-on experience with Jira for tracking tasks and managing sprints",
    "Frameworks & Tools: Spring Boot, .NET, React, Git, MySQLL",
    "Development Areas: Full-Stack Development, Game Development, and Operating Systems Simulation",
    "Deployment: Docker (Containerisation), Website Deployment",
    "Programming Languages: Java, Python, C#, SQL, Assembly, C++, C"
  ],
  "education": [
    {
      "degree": "Bachelor of Computer Science",
      "institution": "FAST | nuces",
      "duration": "2022-2026"
    },
    {
      "degree": "Alevels",
      "institution": "Supernova school",
      "duration": "1Astar 2As",
      "year": "2020-2022"
    },
    {
      "degree": "Olevels",
      "institution": "Supernova school",
      "duration": "6Astar 2As",
      "year": "2018-2020"
    }
  ],
  "work_experience": [],
  "projects": [
    "SportSync – Full-Stack Sports Event Management System 

In [143]:
candidates

[{'name': 'AHMAD RASHAD MOJEEB',
  'skills': ['Project Management: Hands-on experience with Jira for tracking tasks and managing sprints',
   'Frameworks & Tools: Spring Boot, .NET, React, Git, MySQLL',
   'Development Areas: Full-Stack Development, Game Development, and Operating Systems Simulation',
   'Deployment: Docker (Containerisation), Website Deployment',
   'Programming Languages: Java, Python, C#, SQL, Assembly, C++, C'],
  'education': [{'degree': 'Bachelor of Computer Science',
    'institution': 'FAST | nuces',
    'duration': '2022-2026'},
   {'degree': 'Alevels',
    'institution': 'Supernova school',
    'duration': '1Astar 2As',
    'year': '2020-2022'},
   {'degree': 'Olevels',
    'institution': 'Supernova school',
    'duration': '6Astar 2As',
    'year': '2018-2020'}],
  'work_experience': [],
  'projects': ['SportSync – Full-Stack Sports Event Management System (Spring Boot + React + API integration - worked as scrum master)',
   'BookYourHall – Full-Stack Web Ap

In [144]:

client = QdrantClient("localhost", port=6333)
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")  
client.recreate_collection(
collection_name="cv_data",
vectors_config={
    "skills": VectorParams(size=384, distance=Distance.COSINE),
    "education": VectorParams(size=384, distance=Distance.COSINE),
    "work_experience": VectorParams(size=384, distance=Distance.COSINE),
    "projects": VectorParams(size=384, distance=Distance.COSINE),
    }
)

  client.recreate_collection(


True

In [145]:
required_fields = ["skills", "education", "work_experience", "projects"]

###-------------------------------------------------------------------------
def zero_vector(dim=384):
    return [0.0] * dim
###-------------------------------------------------------------------------
def join_and_embed(field_list,embedding_model):
    if not field_list:
        return zero_vector()  
    pieces = []
    for item in field_list:
        if isinstance(item, dict):
         
            pieces.append(", ".join(f"{k}: {v}" for k, v in item.items()))
        elif isinstance(item, str):
            pieces.append(item)
        else:
       
            pieces.append(str(item))
    text = " ".join(pieces)
    return embedding_model.encode([text])[0].tolist()

###-----------------------------------------------------------------------
def insert_candidate(candidate, collection_name="cv_data"):
    vector_data = {
        field: join_and_embed(candidate.get(field, []),embedding_model)
        for field in required_fields
    }
    
    payload = {}
    if "name" in candidate:
        payload["name"] = candidate["name"]

    # Point ID
    point_id = candidate.get("id", hash(candidate.get("name", "unknown")) & 0xFFFFFFFFFFFFFFFF)

    point = PointStruct(
        id=point_id,
        vector=vector_data,
        payload=payload
    )
    client.upsert(collection_name=collection_name, points=[point])
    print(f"Inserted: {payload.get('name', 'Unnamed')}")

In [146]:
## creating VEC DB 
def create_vec_db(candidates):
    for i, candidate in enumerate(candidates): 
        print("Inserting candidate")
        insert_candidate(candidate)
    scroll_result = client.scroll(
    collection_name="cv_data",
    with_payload=True,
    with_vectors=True,
    limit=100
)

    for point in scroll_result[0]:
        print(f"\nCandidate ID: {point.id}")
        print(f"Name: {point.payload.get('name', 'N/A')}")
        print("Vectors:")
        for vector_name, vector_values in point.vector.items():
            print(f" - {vector_name} ({len(vector_values)} dims)")
            print(f"   {vector_values[:10]}...") 

In [147]:
create_vec_db(candidates)

Inserting candidate
Inserted: AHMAD RASHAD MOJEEB
Inserting candidate
Inserted: Muhammad Azeem Chaudhry

Candidate ID: 3803328281084053030
Name: AHMAD RASHAD MOJEEB
Vectors:
 - work_experience (384 dims)
   [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]...
 - education (384 dims)
   [-0.051823772, -0.03190962, -0.014669334, 0.0022549776, -0.052440222, -0.06750166, -0.13645282, -0.00560706, -0.05132441, -0.01696106]...
 - skills (384 dims)
   [-0.05399983, -0.024360845, -0.009617732, 0.009675335, 0.0064823, -0.068843395, 0.028106099, -0.0030010412, -0.06823482, 0.069343664]...
 - projects (384 dims)
   [-0.009110181, -0.020662395, -0.028295567, -0.055782888, -0.030762274, -0.015234647, -0.06913017, 0.03892276, -0.08747218, 0.06663907]...

Candidate ID: 5292380097305211120
Name: Muhammad Azeem Chaudhry
Vectors:
 - work_experience (384 dims)
   [-0.057737976, -0.024317201, 0.010416603, 0.019773517, 0.01422782, -0.044006996, 0.002627371, -0.0036363723, -0.036899183, -0.03198545]...
 - 

In [148]:
def job_description_parser(job_description) : 
    
    job_prompt = f"""
    You are an **Information Extraction Assistant**.

    Your task:
    - Parse the provided **job description**.
    - Extract **explicit information only** — ***do not infer, invent, or assume***.
    - Output a **valid JSON object** that matches the schema shown below.

    ### JSON schema:
    {{
    "skills": [ "list of required skills as short strings" ],
    "work_experience": "explicit description of required work experience, as a string",
    "education": "explicit education or qualification requirements, as a string",
    "projects": [ "list of explicitly mentioned types of projects or domains" ]
    }}

    ### Rules:
    - If a field is not present in the job description, use:
    - an empty list `[]` for list fields,
    - or `null` for string fields.
    - Do **not** add any extra text outside the JSON.
    - Do **not** add markdown or explanations.
    - Preserve the original wording of the job description when filling fields.

    ### Job Description:
    {job_description}
    ### Output(matches the schema):
 """

    return LLM_call(job_prompt)

In [149]:
job_description = input("***Please Enter the Job Description***")
parsed__job_description = job_description_parser(job_description)

{
    "name": "Junior AI Engineer",
    "skills": ["Python", "NumPy", "Pandas", "TensorFlow", "PyTorch", "Git", "data structures", "algorithms"],
    "work_experience": ["Prior experience with Git and version control.", "Previous internship or project experience in machine learning."],
    "projects": ["Please share links to GitHub, Kaggle, or any relevant work that showcases your ML projects."],
    "filepath": "job_description.json"
}


In [150]:
# # generating embeddings of the query input 
# embedding_vectors = {}
# for field in weights:
#     vec = embedding_model.encode([query_texts[field]])[0]
#     embedding_vectors[field] = vec * weights[field]

In [151]:
# query_named_vectors = {
#     "skills": embedding_vectors["skills"],
#     "education": embedding_vectors["education"],
#     "work_experience": embedding_vectors["work_experience"],
#     "projects": embedding_vectors["projects"],
# }

# query_named_vectors = {
#     k: v.tolist() if hasattr(v, "tolist") else list(v)
#     for k, v in query_named_vectors.items()
# }

In [152]:
# for name, vec in query_named_vectors.items():
#     print(f"{name}: {len(vec)} dimensions")

In [153]:

# search_skills = client.query_points(
#     collection_name="cv_data",
#     query=query_named_vectors['skills'],
#     using="skills",
#     limit=100,
#     with_payload=True
# )

# search_work = client.query_points(
#     collection_name="cv_data",
#     query=query_named_vectors['work_experience'],
#     using="work_experience",
#     limit=100,
#     with_payload=True
# )

# search_edu = client.query_points(
#     collection_name="cv_data",
#     query=query_named_vectors['education'],
#     using="education",
#     limit=100,
#     with_payload=True
# )

# search_projects = client.query_points(
#     collection_name="cv_data",
#     query=query_named_vectors['projects'],
#     using="projects",
#     limit=100,
#     with_payload=True
# )


# combined_scores = {}

# def accumulate_scores(results):
#     for point in results.points:
#         cid = point.id
#         combined_scores.setdefault(
#             cid,
#             {"name": point.payload.get('name', 'N/A'), "score": 0.0}
#         )
#         combined_scores[cid]["score"] += point.score

# accumulate_scores(search_skills)
# accumulate_scores(search_work)
# accumulate_scores(search_edu)
# accumulate_scores(search_projects)


# sorted_candidates = sorted(
#     combined_scores.values(),
#     key=lambda x: x['score'],
#     reverse=True
# )

# # Print top-K
# top_k = 5
# print(f"\nTop {top_k} Candidates:\n")
# for cand in sorted_candidates[:top_k]:
#     print(f"{cand['name']} | Score: {cand['score']:.4f}")
