In [2]:
#Install Libraries
!pip install sentence-transformers
!pip install pypdf
!pip install numpy
!pip install scikit-learn





In [3]:
#CLONE ENDEE REPOSITORY
!git clone https://github.com/EndeeLabs/endee.git


Cloning into 'endee'...
remote: Enumerating objects: 1681, done.
remote: Counting objects: 100% (51/51), done.
remote: Compressing objects: 100% (33/33), done.
remote: Total 1681 (delta 32), reused 20 (delta 18), pack-reused 1630 (from 2)
Receiving objects: 100% (1681/1681), 2.35 MiB | 6.93 MiB/s, done.
Resolving deltas: 100% (796/796), done.


In [4]:
# Add the cloned Endee checkout directory to Python's module search path.
# The membership check keeps sys.path free of duplicate entries when this cell
# is executed more than once in the same kernel session.
# NOTE(review): confirm the checkout actually contains importable Python
# modules; no later cell in this notebook imports anything from it.
import sys

ENDEE_DIR = "/content/endee"
if ENDEE_DIR not in sys.path:
  sys.path.append(ENDEE_DIR)


In [5]:
# Sanity check that the clone exists: list the top-level entries of the Endee
# checkout. The list is the cell's last expression, so it is displayed.
import os
os.listdir("/content/endee")


['docker-compose.yml',
 'README.md',
 'third_party',
 'install.sh',
 'LICENSE',
 'CMakeLists.txt',
 'run.sh',
 '.gitignore',
 '.clang-format',
 'src',
 'CONTRIBUTING.md',
 '.git',
 'infra']

In [7]:
# List the entries of /content; the resume-loading cell later scans this same
# directory for files ending in ".pdf".
import os
os.listdir("/content")


['.config',
 'Data Scientist.pdf',
 'Machine Learning Engineer.pdf',
 'Web Developer.pdf',
 'endee',
 'AI Engineer.pdf',
 'Software Developer.pdf',
 'sample_data']

In [9]:
#EXTRACTING TEXT FROM RESUMES
# PDF Text Extractor
from pypdf import PdfReader

def extract_text_from_pdf(pdf_path):
  """Return the text of every page of a PDF, concatenated in page order.

  Args:
    pdf_path: Location of the PDF; passed unchanged to ``PdfReader``.

  Returns:
    One string holding the extracted text of all pages, joined without any
    separator. Pages whose extraction yields ``None`` or an empty string
    contribute nothing, so a PDF with no extractable text returns "".
  """
  reader = PdfReader(pdf_path)
  # Call extract_text() only once per page and reuse the result for both the
  # emptiness check and the concatenation.
  page_texts = (page.extract_text() for page in reader.pages)
  return "".join(chunk for chunk in page_texts if chunk)


In [10]:
# Gather every PDF found directly in /content. resume_names holds the file
# names and resume_texts the extracted text of each file, index-aligned and in
# the order os.listdir returns them.
resume_names = [entry for entry in os.listdir("/content") if entry.endswith(".pdf")]
resume_texts = [extract_text_from_pdf("/content/" + pdf_name) for pdf_name in resume_names]

print("Resumes loaded:")
for loaded_name in resume_names:
  print("-", loaded_name)


Resumes loaded:
- Data Scientist.pdf
- Machine Learning Engineer.pdf
- Web Developer.pdf
- AI Engineer.pdf
- Software Developer.pdf


In [11]:
# Loading Embedding Model
# Instantiate the "all-MiniLM-L6-v2" sentence-embedding model. `model` is
# reused by later cells to encode both the resumes and the job description.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [12]:
# Embed all resume texts in a single encode() call, then report how many
# vectors came back and the shape of one of them.
resume_embeddings = model.encode(resume_texts)

embedding_count = len(resume_embeddings)
embedding_shape = resume_embeddings[0].shape
print("Number of resumes:", embedding_count)
print("Embedding vector size:", embedding_shape)


Number of resumes: 5
Embedding vector size: (384,)


In [13]:
# Assemble one record per resume (file name, embedding vector, raw text) in a
# plain Python list held in memory. Nothing from the cloned Endee repository
# is imported or called in this cell.
resume_db = [
    {
        "resume_name": resume_names[idx],
        "vector": embedding,
        "text": resume_texts[idx],
    }
    for idx, embedding in enumerate(resume_embeddings)
]

print("Resumes stored in Endee vector database structure:", len(resume_db))


Resumes stored in Endee vector database structure: 5


In [18]:
# Verify Storage
# Spot-check the first record: display its file name together with the shape
# of its embedding vector (the tuple is the cell's displayed value).
resume_db[0]["resume_name"], resume_db[0]["vector"].shape


('Data Scientist.pdf', (384,))

In [15]:
# JOB DESCRIPTION INPUT
# Free-text description of the role. It is embedded in the next cell and then
# compared against every resume vector to produce the ranking.
job_description = """
Looking for an AI Engineer with strong Python skills,
Machine Learning, Deep Learning, NLP,
and experience in data analysis and model deployment.
"""


In [16]:
# Converting Job Description to Vector
# The description is wrapped in a one-element list, so encode() returns a 2-D
# array (the recorded output shows shape (1, 384)); the matching cell passes
# this array directly to cosine_similarity.
job_vector = model.encode([job_description])
print("Job vector shape:", job_vector.shape)


Job vector shape: (1, 384)


In [17]:
# Score every stored resume against the job description.
# For each record, cosine_similarity is called with the resume vector wrapped
# in a one-row list and the 2-D job vector; element [0][0] of the result is
# the single similarity value for that pair.
from sklearn.metrics.pairwise import cosine_similarity

scores = [
    {
        "resume_name": entry["resume_name"],
        "score": cosine_similarity([entry["vector"]], job_vector)[0][0],
    }
    for entry in resume_db
]


In [19]:
# Rank Resumes
# Order the score records from the highest similarity to the lowest.
ranked_resumes = sorted(
    scores,
    key=lambda x: x["score"],
    reverse=True
)

print("===== RESUME RANKING FOR JOB ROLE =====\n")

for idx, res in enumerate(ranked_resumes, start=1):
  print(f"{idx}. {res['resume_name']}")
  # Convert to a Python float and format with exactly two decimals.
  # round() on a NumPy float32 score returns another float32, which printed
  # with representation noise (e.g. "70.36000061035156%") in the recorded run.
  print(f"   Match Score: {float(res['score']) * 100:.2f}%\n")



===== RESUME RANKING FOR JOB ROLE =====

1. AI Engineer.pdf
   Match Score: 70.36000061035156%

2. Machine Learning Engineer.pdf
   Match Score: 65.61000061035156%

3. Data Scientist.pdf
   Match Score: 48.279998779296875%

4. Software Developer.pdf
   Match Score: 46.2599983215332%

5. Web Developer.pdf
   Match Score: 31.549999237060547%



In [20]:
# SKILL GAP ANALYSIS
# Define Required Skills for the Job
# Entries are lowercase because the analysis cell looks each one up as a
# substring of the lowercased resume text.
required_skills = [

    "python",
    "machine learning",
    "deep learning",
    "nlp",
    "data analysis",
    "model deployment"
]



In [21]:
# Analyze Each Resume
# For every stored resume, report which required skills appear in its text and
# which do not. Matching is a case-insensitive substring test.
print("===== SKILL GAP ANALYSIS =====\n")

for record in resume_db:
  # Lowercase the text and collapse every run of whitespace (including line
  # breaks left by PDF extraction) into a single space, so a multi-word skill
  # such as "machine learning" still matches when its words are separated by
  # a newline or several spaces.
  resume_text = " ".join(record["text"].lower().split())

  # Classify each skill exactly once, preserving the order of required_skills.
  present_skills = []
  missing_skills = []
  for skill in required_skills:
    if skill in resume_text:
      present_skills.append(skill)
    else:
      missing_skills.append(skill)

  print(f"Resume: {record['resume_name']}")
  print("  Strong Skills :", present_skills)
  print("  Missing Skills:", missing_skills)
  print()


===== SKILL GAP ANALYSIS =====

Resume: Data Scientist.pdf
  Strong Skills : ['python', 'machine learning']
  Missing Skills: ['deep learning', 'nlp', 'data analysis', 'model deployment']

Resume: Machine Learning Engineer.pdf
  Strong Skills : ['python', 'machine learning', 'deep learning', 'nlp']
  Missing Skills: ['data analysis', 'model deployment']

Resume: Web Developer.pdf
  Strong Skills : []
  Missing Skills: ['python', 'machine learning', 'deep learning', 'nlp', 'data analysis', 'model deployment']

Resume: AI Engineer.pdf
  Strong Skills : ['python', 'machine learning', 'deep learning', 'nlp', 'data analysis']
  Missing Skills: ['model deployment']

Resume: Software Developer.pdf
  Strong Skills : ['python']
  Missing Skills: ['machine learning', 'deep learning', 'nlp', 'data analysis', 'model deployment']

