In [1]:
import pymupdf
import fitz 
from pydantic import BaseModel
import os 
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import time 
from openai import OpenAI
import json 
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, Distance, VectorParams
import numpy as np 
from sentence_transformers import SentenceTransformer
from collections import defaultdict

In [None]:
from unstructured.partition.pdf import partition_pdf

def pdf_to_text(pdf_path: str) -> str:
    elements = partition_pdf(pdf_path, strategy="fast")
    print("\n".join([el.text.strip() for el in elements if el.text.strip()]))
    return "\n".join([el.text.strip() for el in elements if el.text.strip()])


In [3]:
# defining schema for input 

class ResumeInfo(BaseModel):
    name: str
    skills: list = []
    education: list = []
    work_experience: list = []
    projects: list = []
    filepath : str 


json_schema = ResumeInfo.model_json_schema()

In [4]:
def LLM_call(prompt):
    client = OpenAI(
    base_url="http://172.16.2.214:8000/v1", 
    api_key="-" 
    )
    response = client.chat.completions.create(
    model="Qwen/Qwen2.5-32B-Instruct-AWQ",
    messages=[
        {"role": "user", "content": prompt}
    ],
   
   extra_body={"guided_json": json_schema}
    )

    print(response.choices[0].message.content)

    return json.loads(response.choices[0].message.content)


In [5]:
# writing the prompt as an input to the LLM 
def parsing_helper(markdown_text,filepath):
  prompt = f"""
  You are a precise and strict **Information Extraction Assistant**.

  Your task is to extract structured data from **unstructured CV text**, strictly following the provided JSON schema.
  add this filepath as well {filepath}

  You are to only pick key words not the whole sentence, 
  Further more you are also supposed to normalize words capitals and smalls 
  and abbrivations such as bscs or some may have written it as Bachelors in Computer science or NLP and Natural Language Processing or Numpy and numpy etc normalize all the words and just extract the key words from the text not the whole thing 

  json schema as follow : 
        name: str
        skills: list = []
        education: list = []
        work_experience: list = []
        projects: list = []
        filepath : str 
    


  ---

  ### Rules:
  - Only extract information that is **explicitly stated** in the CV text.
  - If a field is **missing**, use:'
    - `null` for missing strings
    - `[]` for missing lists
  - Do **not** hallucinate, infer, summarize, or rewrite content.
  - Preserve original text exactly as it appears.
  - Return a **valid JSON object only** — no markdown, no extra explanation.
  ### CV Text: 

  {markdown_text}

  ### Output(matches the schema):
"""
  
  return LLM_call(prompt)

In [6]:
def cv_parser_pipeline(path):
    candidates = []
    if not os.path.isdir(path):
        raise ValueError(f"{path} is not a valid directory!")
    print("Converting CV to text")
    for filename in os.listdir(path):
        file_path = os.path.join(path, filename)
        if os.path.isdir(file_path):
            continue

        text =  pdf_to_text(file_path)
        
        ## passing it to the LLM 
        structured_output = parsing_helper(text,file_path)

        candidates.append(structured_output)
    return candidates

In [7]:

client = QdrantClient("localhost", port=6333)
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")  
client.recreate_collection(
collection_name="cv_data",
vectors_config={
    "skills": VectorParams(size=384, distance=Distance.COSINE),
    "education": VectorParams(size=384, distance=Distance.COSINE),
    "work_experience": VectorParams(size=384, distance=Distance.COSINE),
    "projects": VectorParams(size=384, distance=Distance.COSINE),
    }
)

  client.recreate_collection(


True

In [8]:
required_fields = ["skills", "education", "work_experience", "projects"]

###-------------------------------------------------------------------------
def zero_vector(dim=384):
    return [0.0] * dim
###-------------------------------------------------------------------------
def join_and_embed(field_list,embedding_model):
    if not field_list:
        return zero_vector()  
    pieces = []
    for item in field_list:
        if isinstance(item, dict):
         
            pieces.append(", ".join(f"{k}: {v}" for k, v in item.items()))
        elif isinstance(item, str):
            pieces.append(item)
        else:
       
            pieces.append(str(item))
    text = " ".join(pieces)
    return embedding_model.encode([text])[0].tolist()

###-----------------------------------------------------------------------
def insert_candidate(candidate, collection_name="cv_data"):
    vector_data = {
        field: join_and_embed(candidate.get(field, []),embedding_model)
        for field in required_fields
    }
    
    payload = {}
    if "name" in candidate:
        payload["name"] = candidate["name"]
    if "filepath" in candidate: 
        payload["filepath"] = candidate["filepath"]

    # Point ID
    point_id = candidate.get("id", hash(candidate.get("name", "unknown")) & 0xFFFFFFFFFFFFFFFF)

    point = PointStruct(
        id=point_id,
        vector=vector_data,
        payload=payload
    )
    
    client.upsert(collection_name=collection_name, points=[point])
    print(f"Inserted: {payload.get('name', 'Unnamed')}")

In [9]:
## creating VEC DB 
def create_vec_db(candidates):
    for i, candidate in enumerate(candidates): 
        print("Inserting candidate")
        insert_candidate(candidate)
    scroll_result = client.scroll(
    collection_name="cv_data",
    with_payload=True,
    with_vectors=True,
    limit=100
)

    for point in scroll_result[0]:
        print(f"\nCandidate ID: {point.id}")
        print(f"Name: {point.payload.get('name', 'N/A')}")
        print(f"Filepath: {point.payload.get('filepath', 'N/A')}")  # ✅ This line shows the path
        print("Vectors:")
        for vector_name, vector_values in point.vector.items():
            print(f" - {vector_name} ({len(vector_values)} dims)")
            print(f"   {vector_values[:10]}...")

In [10]:
def job_description_parser(job_description) : 
    
   job_prompt = f"""
   You are a strict and precise Information Extraction Assistant.

   Your task:
   - Parse the provided **job description**.
   - Extract **explicitly stated** information only — do **not** infer, invent, or assume.
   - Extract only **keywords** or **short phrases** (no full sentences).
   - Normalize all extracted data:
   - Convert all text to lowercase.
   - Normalize abbreviations and variants (e.g., `Bachelors in Computer Science`, `BSCS`, `BS in CS` → `bscs`; `Natural Language Processing` → `nlp`; `NumPy` → `numpy`).

   Output a **valid JSON object only**, matching the schema below.

   ---

   ### JSON schema:
   {{
   "skills": ["list of normalized, lowercase keywords for required skills"],
   "work_experience": ["list of normalized keywords from explicit experience requirements"],
   "education": ["list of normalized degree or qualification keywords"],
   "projects": ["list of normalized keywords for required or preferred project domains"]
   }}

   ---

   ### Rules:
   - If a field is not present in the job description:
   - Use an empty list `[]`.
   - Preserve original content as **normalized keywords only**.
   - Do **not** output any text outside the JSON.
   - Do **not** use markdown, formatting, or explanations.

   ---

   ### Job Description:
   {job_description}

   ---

   ### Output (must strictly match the schema):
   """


   return LLM_call(job_prompt)

In [11]:

def Searching_Qdrant(parsed__job_description,top_k) : 

    job_vectors = {
        field: join_and_embed(parsed__job_description.get(field, []), embedding_model)
        for field in required_fields
    }

    fields = ["skills", "education", "work_experience", "projects"]
    user_weights_raw = {}

    print("Please enter weight for each field. Total should sum to 1 (e.g. 0.4, 0.2, etc.)")

    for field in fields:
        while True:
            try:
                weight = float(input(f"Enter weight for '{field}': "))
                if weight < 0:
                    raise ValueError
                user_weights_raw[field] = weight
                break
            except ValueError:
                print("Invalid input. Please enter a non-negative number.")

    total_weight = sum(user_weights_raw.values())

    if abs(total_weight - 1.0) > 1e-6:
        print(f"\n Total weight entered is {total_weight:.3f}, normalizing to 1.")
        user_weights = {k: v / total_weight for k, v in user_weights_raw.items()}
    else:
        user_weights = user_weights_raw

    print("\n Normalized Weights:")
    for field, weight in user_weights.items():
        print(f"  {field}: {weight:.3f}")


    results = {}

    for field in required_fields:
        hits = client.search(
            collection_name="cv_data",
            query_vector=(field, job_vectors[field]), 
            limit=top_k,
            with_payload=True,
            with_vectors=False  
        )
        results[field] = hits


    score_board = defaultdict(float)

    for field in results:
        weight = user_weights.get(field, 0)
        for hit in results[field]:
            score_board[hit.id] += hit.score * weight
    return score_board


In [12]:



def sorting_candidates(score_board,top_k): 

    ranked = sorted(score_board.items(), key=lambda x: x[1], reverse=True)

    top_candidates = []  
    shown = 0

    for candidate_id, total_score in ranked:
        point = next(
            (pt for pt in client.scroll(
                collection_name="cv_data",
                with_payload=True,
                with_vectors=False,
                limit=100
            )[0] if pt.id == candidate_id),
            None
        )
        if point:
            candidate_info = {
                "name": point.payload.get("name"),
                "filepath": point.payload.get("filepath"),
                "score": round(total_score, 4),
                "id": candidate_id
            }
            top_candidates.append(candidate_info)

            # Display
            print(f"Name: {candidate_info['name']}")
            print(f"Filepath: {candidate_info['filepath']}")
            print(f"Score: {candidate_info['score']}\n")

            shown += 1
        if shown >= top_k:
            break
    
    print(top_candidates)
    return top_candidates

In [13]:
def analysis(job_description,top_candidates,candidates,top_k):
   prompt_3 = f""" You are an expert technical recruiter and AI career advisor. Use the information provided below to perform an in-depth candidate evaluation using semantic embeddings retrieved from a vector database (Qdrant).

---

### 🧾 Job Description

{job_description}
---

###Top {top_k} Candidates from Vector Similarity Search : {top_candidates}

These candidates were retrieved from the Qdrant vector database based on semantic similarity to the job description. Each candidate includes their resume filepath,a vector similarity score and id.

### 🎯 Task

Analyze the candidates above with respect to the job description and perform the following:

1. **Compare** each candidate's qualifications with the job description in terms of:
   - Skills
   - Work experience
   - Relevant projects
   - Educational background

2. **Rank** the candidates from most to least suitable based on the job description and vector match scores.

3. **Justify** your top 1–2 recommendations with detailed reasoning, focusing on fit for the role. Also the decision should not be eccentric to the match scores

4. **Highlight Gaps**:
   - Are there any key missing skills or misalignments?
   - Are there strengths that go beyond the role?

5. **Provide Insights**:
   - Strengths and weaknesses of each candidate
   - A comparison table showing each candidate's match to the job requirements
   - Visualization : skill coverage bar chart or score comparison chart
   - Suggest which candidate fits which kind of sub-role (e.g., research-focused, full-stack AI, deployment)

6. Write in a **professional tone** suitable for HR and technical hiring managers.

---
Further More you should take in account every aspect of each candidate before making a decision 
Your goal is to aid the hiring manager in making a well-informed and confident decision based on both semantic similarity and practical job fit. Dont disclose any scores to the user in the output 

 """
   
   client = OpenAI(
   base_url="http://172.16.2.214:8000/v1", 
   api_key="-" 
   )
   response = client.chat.completions.create(
   model="Qwen/Qwen2.5-32B-Instruct-AWQ",
   messages=[
      {"role": "user", "content": prompt_3}
      ],
      )

   print(response.choices[0].message.content)

In [14]:
### writing the main function for everything

def main(): 
    
    candidates = cv_parser_pipeline("Resumes")

    ## creating vec db 
    create_vec_db(candidates)

    job_description = input("***Please Enter the Job Description***")
    parsed__job_description = job_description_parser(job_description)

    top_k = int(input("Enter number of top candidates to display: "))
    score_board = Searching_Qdrant(parsed__job_description,top_k)

    top_candidates = sorting_candidates(score_board,top_k)

    return analysis(job_description,top_candidates,candidates,top_k)



In [15]:
if __name__ == "__main__": 
    main()

Converting CV to text


APIConnectionError: Connection error.