#### Helper Functions Testing

In [None]:
import fitz
import os

# extracting data from pdfs
def extract_text_from_pdfs(pdf_dir):
    """Extract the plain text of every PDF file located directly in ``pdf_dir``.

    Args:
        pdf_dir: Path of the directory to scan. Sub-directories are not
            searched.

    Returns:
        dict mapping each PDF's file name (not its full path) to the text of
        all of its pages joined with newlines. Entries are inserted in sorted
        file-name order so repeated runs produce the same ordering.
    """
    extracted_data = {}
    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for file in sorted(os.listdir(pdf_dir)):
        # Compare the extension case-insensitively so "CV.PDF" is not skipped.
        if not file.lower().endswith('.pdf'):
            continue
        path = os.path.join(pdf_dir, file)
        # A directory whose name happens to end in ".pdf" is not a document.
        if not os.path.isfile(path):
            continue
        with fitz.open(path) as doc:
            text = "\n".join(page.get_text() for page in doc)
        extracted_data[file] = text
    return extracted_data


In [None]:

import re
import spacy

# spaCy English pipeline, loaded once at module level; preprocess_text() runs
# its input through this object to obtain tokens and their lemmas.
nlp = spacy.load("en_core_web_sm")

# text preprocessing
def preprocess_text(text):
    """Normalise raw text: lowercase, strip punctuation, drop stop words, lemmatise.

    Args:
        text: Raw input string.

    Returns:
        One string made of the lemmas of the surviving tokens, separated by
        single spaces.
    """
    # Every character that is neither a word character nor whitespace is
    # deleted outright (nothing is inserted in its place).
    normalised = re.sub(r'[^\w\s]', '', text.lower())

    lemmas = []
    for tok in nlp(normalised):
        if tok.is_stop or tok.is_punct:
            continue
        lemmas.append(tok.lemma_)
    return " ".join(lemmas)


In [None]:

from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
import numpy as np

# Sentence-embedding model, loaded once and shared: detect_semantic_sections()
# calls model.encode() on resume lines, and the main cell aliases it as `embedder`.
model = SentenceTransformer('all-MiniLM-L6-v2')
# paragraph to lines
def split_into_sections(text, min_length=10):
    """Split ``text`` into stripped lines, discarding lines that are too short.

    Args:
        text: Text whose lines are separated by newline characters.
        min_length: A line is kept only when its stripped length is strictly
            greater than this value. Defaults to 10, the threshold that used
            to be hard-coded.

    Returns:
        list of the surviving lines, with surrounding whitespace removed, in
        their original order.
    """
    stripped_lines = (line.strip() for line in text.split('\n'))
    return [line for line in stripped_lines if len(line) > min_length]

# clustering similar sections using agglomerative (hierarchical) clustering
def detect_semantic_sections(text):
    """Group the lines of ``text`` into clusters of semantically similar lines.

    Every line kept by ``split_into_sections`` is embedded with the
    module-level ``model`` and the embeddings are clustered with
    ``AgglomerativeClustering`` (no fixed cluster count; merging is governed by
    ``distance_threshold=1.5``).

    Args:
        text: Newline-separated text.

    Returns:
        dict mapping ``"Section_<label>"`` to the newline-joined lines that
        belong to that cluster, in ascending label order. An empty dict is
        returned when there are no usable lines, and a single ``"Section_0"``
        entry when there is exactly one.
    """
    lines = split_into_sections(text)

    # AgglomerativeClustering cannot be fitted on fewer than two samples, so
    # texts with zero or one usable line (e.g. a scanned PDF with no text
    # layer) are handled here instead of raising inside fit().
    if not lines:
        return {}
    if len(lines) == 1:
        return {"Section_0": lines[0]}

    embeddings = model.encode(lines)
    clustering = AgglomerativeClustering(n_clusters=None, distance_threshold=1.5).fit(embeddings)

    sections = {}
    # np.unique yields the labels sorted, which keeps the dict order stable.
    for label in np.unique(clustering.labels_):
        indices = np.where(clustering.labels_ == label)[0]
        sections[f"Section_{label}"] = "\n".join(lines[i] for i in indices)
    return sections


  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [None]:

from transformers import pipeline

# NER pipeline (BERT-based), loaded once and used by extract_named_entities().
# That function reads the 'word' and 'entity_group' keys of each result —
# presumably the shape produced by aggregation_strategy="simple"; confirm
# against the transformers documentation.
ner_pipe = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")

def extract_named_entities(text, entity_groups=("ORG", "MISC", "SKILL")):
    """Extract candidate skill terms from ``text`` with the module-level NER pipeline.

    Args:
        text: Text to run through ``ner_pipe``.
        entity_groups: Entity labels to keep. The default (organisations,
            miscellaneous / tech-stack terms, domain-specific skill tags) is
            the set that used to be hard-coded; pass another iterable to tune
            it per job role, since relevant skills differ for every role.

    Returns:
        Alphabetically sorted list of the distinct ``'word'`` values of the
        entities whose ``'entity_group'`` is in ``entity_groups``.
    """
    wanted = set(entity_groups)
    entities = ner_pipe(text)
    skills = {e['word'] for e in entities if e['entity_group'] in wanted}
    # Set iteration order can differ between interpreter runs; sorting keeps
    # the returned list (and anything stored from it) reproducible.
    return sorted(skills)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# need to finetune the weights based on observations
def score_resume(resume_embedding, jd_embedding, skill_matches, experience_factor,
                 alpha=0.5, beta=0.3, gamma=0.2, max_skills=None):
    """Combine semantic similarity, skill count and experience into one score.

    The result is ``alpha * semantic + beta * skill + gamma * experience_factor``.

    Args:
        resume_embedding: 1-D embedding vector of the resume.
        jd_embedding: 1-D embedding vector of the job description.
        skill_matches: Sized collection of skills; only its length is used.
        experience_factor: Numeric experience term supplied by the caller.
        alpha: Weight of the cosine similarity between the two embeddings.
        beta: Weight of the skill term.
        gamma: Weight of the experience term.
        max_skills: Optional positive number. When ``None`` (default) the
            skill term is the raw count ``len(skill_matches)``, which is
            unbounded and can dominate the other terms. When given, the count
            is capped at ``max_skills`` and divided by it so the skill term
            lies in ``[0, 1]``.

    Returns:
        The weighted score as a float-like number.

    Raises:
        ValueError: If ``max_skills`` is given but is not positive.
    """
    semantic_score = cosine_similarity([resume_embedding], [jd_embedding])[0][0]

    skill_count = len(skill_matches)
    if max_skills is None:
        # Legacy behaviour: raw, unnormalised count.
        skill_score = skill_count
    else:
        if max_skills <= 0:
            raise ValueError("max_skills must be a positive number")
        skill_score = min(skill_count, max_skills) / max_skills

    return alpha * semantic_score + beta * skill_score + gamma * experience_factor


## Main Execution

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer

# provide job description
JD_TEXT = """We are hiring a Data Science Intern with experience in NLP, Machine Learning and Deep Learning with knowledge of data preprocessing and building end-to-end models."""
# reuse the SentenceTransformer instance created in the helper cell
embedder = model

# create its embedding
JD_EMBEDDING = embedder.encode(JD_TEXT)

pdf_texts = extract_text_from_pdfs('./data/')

# Substrings that mark a resume as "experienced" (crude heuristic).
EXPERIENCE_KEYWORDS = ("senior", "lead", "5+ years")

ranking_results = []

for filename, raw_text in pdf_texts.items():
    cleaned_text = preprocess_text(raw_text)
    sections = detect_semantic_sections(cleaned_text)
    combined_section_text = " ".join(sections.values())

    resume_embedding = embedder.encode(combined_section_text)
    # NOTE(review): NER runs on the lowercased, punctuation-stripped text here.
    # The printed scores are consistent with no skills being extracted —
    # confirm whether the raw text should be passed instead.
    extracted_skills = extract_named_entities(combined_section_text)

    # experience factor (trial, can be inefficient).
    # preprocess_text() deletes punctuation, so "5+ years" can never occur in
    # the cleaned text; search the raw text as well so that keyword can match.
    raw_lower = raw_text.lower()
    cleaned_lower = combined_section_text.lower()
    is_experienced = any(
        kw in raw_lower or kw in cleaned_lower for kw in EXPERIENCE_KEYWORDS
    )
    experience_factor = 1 if is_experienced else 0.5

    score = score_resume(resume_embedding, JD_EMBEDDING, extracted_skills, experience_factor)

    ranking_results.append({
        "filename": filename,
        "score": round(score, 3),
        "skills": extracted_skills
    })

# results
# Declaring the columns keeps sort_values() working when no PDF was found
# (an empty list would otherwise give a frame without a "score" column).
df = (
    pd.DataFrame(ranking_results, columns=["filename", "score", "skills"])
    .sort_values(by="score", ascending=False)
    .reset_index(drop=True)
)
print(df[["filename", "score"]])


        filename  score
0  resume-03.pdf  0.283
1  resume-01.pdf  0.274
2  resume-02.pdf  0.261
