#### installing the necessary libraries needed to make the schema 

In [1]:
import pymupdf
import fitz # pymupdf (fitz is a module inside pymupdf that we are using)
from pydantic import BaseModel
import os 

In [2]:

# creating a function for parsing pdfs (resumes)
def pdf_to_markdown(pdf_path):
    doc = fitz.open(pdf_path)
    full_text = ""
    for page in doc:
        text = page.get_text()
        # Remove characters that can't be encoded in UTF-8
        cleaned_text = text.encode("utf-8", errors="ignore").decode("utf-8", errors="ignore")
        full_text += cleaned_text

    return full_text



In [3]:
markdown_text = pdf_to_markdown(r"C:\Users\Azeem\Documents\CARE\Intelligent-Resume-Filtering\Resumes\Ahmad Rashad (7).pdf")

In [4]:
print(markdown_text)

HOBBIES
CONTACT
SKILLS
+92-348-5416025
ahmadrashadmojeeb@gmail
.com
NV-16, Isra Residence,
Sector H-12, NUST,
Islamabad
Project Management: Hands-
on experience with Jira for
tracking tasks and managing
sprints
Frameworks & Tools: Spring
Boot, .NET, React, Git,
MySQLL
Development Areas: Full-
Stack Development, Game
Development, and Operating
Systems Simulation
Deployment: Docker
(Containerisation), Website
Deployment
Programming Languages:
Java, Python, C#, SQL,
Assembly, C++, C
Firebase, mongoDB
AHMAD RASHAD MOJEEB
PROJECTS
EDUCATION
SUMMARY
A Computer Science student at FAST with hands-on experience in full-
stack development, DevOps, and Agile methodologies through academic
projects and teamwork.
Proficient in Java, C++, React, Spring Boot, SQL, and Docker, with a strong
foundation in data structures, algorithms, and operating systems.
2022-2026
Bachelor of Computer Science
FAST | nuces
2020-2022
Alevels  -  1Astar 2As
Supernova school
i221175@nu.edu.pk
SportSync – Full-Stack Sport

In [5]:
# defining schema for input 

class ResumeInfo(BaseModel):
    name: str
    skills: list = []
    education: list = []
    work_experience: list = []
    projects: list = []

### Loading LLM

In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import time 

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
#downloading the model locally 

model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
save_path = "./models/TinyLlama-1.1B-Chat"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

if os.path.exists(save_path) and os.path.isdir(save_path):

    tokenizer = AutoTokenizer.from_pretrained(save_path)
    model = AutoModelForCausalLM.from_pretrained(save_path)

else:
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id)

    tokenizer.save_pretrained(save_path)
    model.save_pretrained(save_path)

model.to(device)

Using device: cuda


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (rotary_emb): 

In [8]:
# writing the prompt as an input to the LLM 

prompt = f"""
You are a precise and strict **Information Extraction Assistant**.

Your task is to extract structured data from **unstructured CV text**, strictly following the provided JSON schema.

---

### Rules:
- Only extract information that is **explicitly stated** in the CV text.
- If a field is **missing**, use:
  - `null` for missing strings
  - `[]` for missing lists
- Do **not** hallucinate, infer, summarize, or rewrite content.
- Preserve original text exactly as it appears.
- Return a **valid JSON object only** — no markdown, no extra explanation.

---

### JSON Schema (strict):
```json
{{
  "name": string,
  "skills": list of strings,
  "education": list of strings,
  "work_experience": list of strings,
  "projects": list of strings
}}

### CV Text: 

{markdown_text}

### Output(matches the schema):
"""

In [9]:
## tokenizing the input prompt 

start_tokenize = time.time()
inputs = tokenizer(prompt, return_tensors="pt", truncation=True)

inputs = {k: v.to(device) for k, v in inputs.items()}

end_tokenize = time.time()

print(f"Tokenization completed, time taken {end_tokenize - start_tokenize}")

start_generate = time.time()
## passing the tokenized input into the model 
with torch.no_grad():  
    outputs = model.generate(
        **inputs,
        max_new_tokens=400,       
        do_sample=False,          
        temperature=0.01,          
        pad_token_id=tokenizer.eos_token_id
    )


end_generate = time.time()

print(f"Generation completed, Time taken : {end_generate - start_generate}")

response = tokenizer.decode(outputs[0], skip_special_tokens=True)


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Tokenization completed, time taken 0.129241943359375
Generation completed, Time taken : 317.1555962562561


In [10]:
print(response.replace(prompt, "").strip())

{
  "name": "Ahmad Rashad Mojeeb",
  "skills": ["Java", "C++", "React", "Spring Boot", "SQL", "Docker", "Firebase", "MongoDB", "AHMAD RASHAD MOJEEB"],
  "education": ["Bachelor of Computer Science", "Alevels 1Astar 2As", "Supernova school", "i221175@nu.edu.pk", "SportSync", "i221175@nu.edu.pk", "SportSync", "i221175@nu.edu.pk", "SportSync", "i221175@nu.edu.pk", "SportSync", "i221175@nu.edu.pk", "SportSync", "i221175@nu.edu.pk", "SportSync", "i221175@nu.edu.pk", "SportSync", "i221175@nu.edu.pk", "SportSync", "i221175@nu.edu.pk", "SportSync", "i221175@nu.edu.pk", "SportSync", "i221175@nu.edu.pk", "SportSync", "i221175@nu.edu.pk", "SportSync", "i221175@nu.edu.pk", "SportSync", "i221175@nu.edu.pk", "SportSync", "i221175@nu.edu.pk", "SportSync",


In [11]:
resume_data = response.replace(prompt,"").strip()

### Qdrant

In [12]:
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, Distance, VectorParams

In [13]:
#connecting locally 
client = QdrantClient("localhost", port=6333)

In [14]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")  

In [15]:
sample_text = "Sample text for embedding"
embedding = embedding_model.encode(sample_text)
embedding_size = len(embedding)
print(f"Embedding size: {embedding_size}")

Embedding size: 384


In [28]:
client.recreate_collection(
    collection_name="cv_data",
    vectors_config={
        "skills": VectorParams(size=384, distance=Distance.COSINE),
        "education": VectorParams(size=384, distance=Distance.COSINE),
        "work_experience": VectorParams(size=384, distance=Distance.COSINE),
        "projects": VectorParams(size=384, distance=Distance.COSINE),
    }
)

def zero_vector(dim=384):
    return [0.0] * dim

def join_and_embed(field_list):
    if not field_list:
        return zero_vector()  # Field exists but empty
    text = " ".join(field_list)
    return embedding_model.encode([text])[0].tolist()


  client.recreate_collection(


In [29]:
required_fields = ["skills", "education", "work_experience", "projects"]
def insert_candidate(candidate: dict,collection_name = "cv_data"):
   
    vector_data = {
        field: join_and_embed(candidate.get(field, []))
        for field in required_fields
    }

    payload = {}
    if "name" in candidate:
        payload["name"] = candidate["name"]

    # Point ID: fallback if missing
    point_id = candidate.get("id", hash(candidate.get("name", "unknown")))

    point = PointStruct(
        id=point_id,
        vector=vector_data,
        payload=payload
    )
    client.upsert(collection_name=collection_name, points=[point])
    print(f"Inserted: {payload.get('name', 'Unnamed')}")

In [30]:
candidates = [
    {
        "name": "Sara Qureshi",
        "skills": ["Python", "TensorFlow", "Pandas", "Scikit-learn", "SQL", "Docker"],
        "education": [
            "BS in Artificial Intelligence - NUST (2019–2023)",
            "Intermediate - Punjab College (2017–2019)"
        ],
        "work_experience": [
            "Machine Learning Intern at VisionX (2022)",
            "Data Analyst at Upwork (2021–Present)"
        ],
        "projects": [
            "Skin Cancer Classifier using CNN",
            "Resume Parser with SpaCy and Streamlit"
        ]
    },
    {
        "name": "Ali Raza",
        "skills": ["Java", "Spring Boot", "MySQL"],
        "education": [],
        "work_experience": [],
        "projects": []
    },
    {
        "name": "Hira Khalid",
        "skills": ["C++", "React", "Firebase", "Node.js"],
        "education": ["BSCS - FAST NUCES", "O & A Levels - Roots International"],
        "work_experience": ["Frontend Developer Intern at Arbisoft"],
        "projects": ["BookMyHall (React + Firebase)", "Portfolio website"]
    },
    {
        "name": "Muhammad Azeem",
        "skills": ["Python", "Pandas", "Streamlit", "Vector Databases", "Qdrant", "Coqui TTS"],
        "education": ["BSCS - FAST (2022–2026)"],
        "work_experience": ["AI Intern at Atomcamp", "RAG System Developer at Vectorshift"],
        "projects": [
            "Roman Urdu to Urdu TTS System using Coqui + ESPnet",
            "Resume Filter using Pydantic + Qdrant"
        ]
    }
]


for i, candidate in enumerate(candidates):
    insert_candidate(candidate)

Inserted: Sara Qureshi
Inserted: Ali Raza
Inserted: Hira Khalid
Inserted: Muhammad Azeem


In [None]:
scroll_result = client.scroll(
    collection_name="cv_data",
    with_payload=True,
    with_vectors=True,
    limit=100
)

for point in scroll_result[0]:
    print(f"\nCandidate ID: {point.id}")
    print(f"Name: {point.payload.get('name', 'N/A')}")
    print("Vectors:")
    for vector_name, vector_values in point.vector.items():
        print(f" - {vector_name} ({len(vector_values)} dims)")
        print(f"   {vector_values[:10]}...") 


Candidate ID: 218321759453493964
Name: Sara Qureshi
Vectors:
 - education (384 dims)
   [-0.05388639, -0.018306209, 0.0031778235, 0.01587062, 0.019572804, -0.008160581, -0.07396886, -0.03875141, -0.08189774, 0.033437066]...
 - work_experience (384 dims)
   [-0.08474719, -0.037158173, 0.08516825, -0.010177444, -0.0019727254, -0.0562636, 0.004107879, -0.04346316, -0.12105826, -0.031767443]...
 - skills (384 dims)
   [0.011941659, -0.09691213, -0.050752, 0.015720889, 0.015734889, -0.1109149, 0.031121796, -0.046576813, -0.09108185, 0.017807573]...
 - projects (384 dims)
   [-0.040890027, -0.017174944, 0.030739445, 0.013700359, 0.0492771, -0.010928533, 0.060590174, -0.044393994, -0.101692274, -0.09218805]...

Candidate ID: 239531732132939857
Name: Ali Raza
Vectors:
 - projects (384 dims)
   [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]...
 - work_experience (384 dims)
   [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]...
 - education (384 dims)
   [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0

## Generation 

In [None]:

def get_user_requirements():
    print("Please enter your ideal candidate profile requirements.")
    fields = ["skills", "experience", "education", "projects"]

    query_texts = {}
    weights = {}

    for field in fields:
        value = input(f"\n🔹 Enter desired {field.replace('_', ' ')} (comma-separated or free text):\n> ")
        query_texts[field] = value.strip()

    print("\nNow assign a weight to each field (between 0.0 and 1.0). Total doesn't need to be 1.")
    for field in fields:
        while True:
            try:
                w = float(input(f"Weight for {field}: "))
                if 0 <= w <= 1:
                    weights[field] = w
                    break
                else:
                    print(" Must be between 0 and 1.")
            except ValueError:
                print(" Invalid number, try again.")

    return query_texts, weights

# Get dynamic user input
query_texts, weights = get_user_requirements()

print("\n✅ Final Input Summary:")
print("User Prompt:", query_texts)
print("Weights:", weights)
