In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer

# sample job description
JD_TEXT = """
    We are hiring a Data Science Intern with experience in NLP, Machine Learning and Deep Learning with knowledge of data preprocessing and building end-to-end models.
"""


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# provide resumes
import fitz  #PyMuPDF's new library
import os

#storing key-pair value (key-file name, pair-extracted text)
def extract_text(directory):
    extracted_data = {}
    for file in os.listdir(directory):
        if file.endswith('.pdf'):
            path = os.path.join(directory, file)
            with fitz.open(path) as doc:
                text = "\n".join([page.get_text() for page in doc])
            extracted_data[file] = text
    return extracted_data


In [3]:
pdfs_text = extract_text('./data/')


In [6]:
pdfs_text


{'resume-01.pdf': 'Ayush Uttarwar\n+91-9359635906 | ayushuttarwar086@gmail.com | Linkedin | Github | Tableau Public\nSUMMARY\nI’m a ﬁnal year B.Tech student studying AI and Data Science. I’m really interested in using my skills to solve real-world\nproblems. I have a good grasp of Data Structures and Machine Learning, and I’ve been working on projects related to Data\nScience and Generative AI. I’m looking for a chance to learn gain some industry experience, and contribute to a team and\nget exposure to the industry.\nTECHNICAL SKILLS\nProgramming Languages: C++, JavaScript, Python\nML/DS: scikit-learn, NumPy, Pandas, XGBoost, Plotly\nData Visualization: Power BI, Matplotlib Tableau, Seaborn\nTools: Git, GitHub, MySQL, Excel\nPROJECTS\nDuplicate Question Detection for Social Platforms\nkaggle-url\nNLP, Random Forest, XGBoost, CountVectorizer\n• Cleaned and normalized large text datasets by removing HTML tags, expanding contractions, etc.\n• Engineered features including word counts, sh

In [8]:
# text preprocessing
import re
import spacy 

nlp = spacy.load('en_core_web_sm')

preprocessed_text = []
for file, text in pdfs_text.items():
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    doc = nlp(text)
    
    tokens = []
    for token in doc:
        # skip stopword
        if token.is_stop:
            continue
        # filters out punctuation/numbers
        if not token.is_alpha:
            continue
        
        tokens.append(token.lemma_)
    
    cleaned_text = " ".join(tokens)

    
print(cleaned_text)


ayush uttarwar linkedin github leetcode summary finalyear btech student specialize ai data science strong foundation data structure algorithm machine learning practical experience nlp generative ai project m seek entrylevel opportunity learn contribute begin career technical skill programming language c javascript python langchain framework nodejs expressjs react tailwind css redux library mongoose numpy panda scikitlearn matplotlib tool git github mongodb mysql postman project mental health chatbot dec rag chatbot langchain astradb groqs llm build llmpowere chatbot retrievalaugmented generation rag accurate contextaware response integrate astradb vector store hug face embedding efﬁcient document retrieval develop gradiobased ui optimize inference speed realtime interaction codexity aug feb web development react mongodb tailwind css nodejs etc fully functional edtech platform mern stack enable course creation consumption rating design seamless interactive learning experience student sh

In [9]:
# extract different sections from the resume
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
import numpy as np

embedder = SentenceTransformer('all-MiniLM-L6-v2')

def sections_split(text):
    lines = text.split('\n')
    filtered_lines = []

    for line in lines:
        stripped = line.strip()
        
        # consider only if it's important
        if len(stripped) > 10:
            filtered_lines.append(stripped)
            
    return filtered_lines


lines = sections_split(cleaned_text)

print(lines)


['ayush uttarwar linkedin github leetcode summary finalyear btech student specialize ai data science strong foundation data structure algorithm machine learning practical experience nlp generative ai project m seek entrylevel opportunity learn contribute begin career technical skill programming language c javascript python langchain framework nodejs expressjs react tailwind css redux library mongoose numpy panda scikitlearn matplotlib tool git github mongodb mysql postman project mental health chatbot dec rag chatbot langchain astradb groqs llm build llmpowere chatbot retrievalaugmented generation rag accurate contextaware response integrate astradb vector store hug face embedding efﬁcient document retrieval develop gradiobased ui optimize inference speed realtime interaction codexity aug feb web development react mongodb tailwind css nodejs etc fully functional edtech platform mern stack enable course creation consumption rating design seamless interactive learning experience student 

In [None]:
# form clusters of semantically similar embeddings

embeddings = embedder.encode(lines)

# cluster similar embeddings for better retrieval
clustering = AgglomerativeClustering(n_clusters=None, distance_threshold=1.5).fit(embeddings)
