In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
pip install transformers

In [None]:
pip install PyPDF2

In [None]:
pip install pdfplumber


In [1]:

!python -m nltk.downloader stopwords punkt

!python -m spacy download en_core_web_sm

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Collecting en-core-web-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m72.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [5]:
# from sklearn.metrics.pairwise import cosine_similarity
# import numpy as np

# def find_best_matches(resume_embeddings, job_embedding):
#     similarities = cosine_similarity(job_embedding, resume_embeddings)
#     top_k_indices = np.argsort(similarities[0])[::-1][:5] 
#     return top_k_indices, similarities[0][top_k_indices]

In [14]:
# from sklearn.metrics.pairwise import cosine_similarity
# import numpy as np

# def find_best_matches(resume_embeddings, job_embedding):
#     # Ensure job_embedding is 2D
#     if job_embedding.ndim == 1:
#         job_embedding = job_embedding.reshape(1, -1)
   
    
#     similarities = cosine_similarity(job_embedding, resume_embeddings)
#     top_k_indices = np.argsort(similarities[0])[::-1][:5] 
#     return top_k_indices, similarities[0][top_k_indices]

# def filter_and_rank_candidates(parsed_query, resume_entities, resume_embeddings, job_embedding):
#     matched_indices, similarity_scores = find_best_matches(resume_embeddings, job_embedding)
#     filtered_candidates = []
    
#     for i in matched_indices:
#         candidate = resume_entities[i]
#         print(f"Candidate degrees: {candidate.get('degree', 'No degree found')}")  # Use get to avoid KeyError
        
#         # Check if candidate degree is not empty and matches parsed query
#         if candidate.get('degree') and parsed_query['degree'] in candidate['degree']:
#             filtered_candidates.append((i, similarity_scores[i], candidate))
    
#     filtered_candidates = sorted(filtered_candidates, key=lambda x: x[1], reverse=True)
#     return filtered_candidates




## Without Error 

In [1]:
import os
import pandas as pd
from pypdf import PdfReader
from nltk import pos_tag, word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import re
from transformers import AutoModel, AutoTokenizer
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np



In [2]:

def extract_text_from_pdf(file_path):
    """Extract text from the provided PDF file."""
    reader = PdfReader(file_path)
    text = "".join(page.extract_text() for page in reader.pages)
    return text

In [3]:
def preprocess_text(text):
    """Preprocess the text: lowercasing, removing punctuation, stop words, etc."""
    text = text.lower()
    text = re.sub('[^a-zA-Z]', ' ', text) 
    sentences = sent_tokenize(text)
    features = ''
    
    stop_words = set(stopwords.words("english"))
    for sent in sentences:
        words = word_tokenize(sent)
        words = [word for word in words if word not in stop_words]
        tagged_words = pos_tag(words)
        filtered_words = [word for word, tag in tagged_words if tag not in ['DT', 'IN', 'TO', 'PRP', 'WP']]
        features += " ".join(filtered_words) + " "
    return features


In [4]:

def process_resume_data(resume_file_path):
    """Process resumes by extracting text and preprocessing it."""
    text = extract_text_from_pdf(resume_file_path)
    features = preprocess_text(text)
    return features

In [5]:
def get_embeddings(text, model, tokenizer, device):
    """Generate embeddings for the given text using the model."""
    inputs = tokenizer(str(text), return_tensors="pt", truncation=True, padding=True).to(device)
    with torch.no_grad(): 
        outputs = model(**inputs)
    
    
    embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
    
    return embeddings

In [6]:

def contextual_search(query, resume_data, model, tokenizer, device, top_k=5):
    """Perform contextual search using the query against resume data."""
   
    query_features = preprocess_text(query)
    query_embedding = get_embeddings(query_features, model, tokenizer, device)
    
   
    resume_embeddings = np.vstack([get_embeddings(text, model, tokenizer, device) for text in resume_data['Feature']])
    
    
    similarities = cosine_similarity(query_embedding, resume_embeddings)[0]
    
    
    top_indices = np.argsort(similarities)[::-1][:top_k]
   
    return resume_data.iloc[top_indices], similarities[top_indices]

In [9]:

def main():
    resume_dir = "/kaggle/input/resume-dataset/data/data/INFORMATION-TECHNOLOGY"  
    
    resume_data = pd.DataFrame(columns=["ID", "Feature"])
    
    for idx, file_name in enumerate(os.listdir(resume_dir)):
        if file_name.endswith(".pdf"):
            resume_file_path = os.path.join(resume_dir, file_name)
            features = process_resume_data(resume_file_path)
            resume_data = pd.concat([resume_data, pd.DataFrame({"ID": [file_name], "Feature": [features]})], ignore_index=True)

    
    model_name = "bert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    model.to(device)
    
    hr_query = input("Enter the query: ")
    
    
    top_resumes, scores = contextual_search(hr_query, resume_data, model, tokenizer, device, top_k=5)
    
    
    for idx, score in enumerate(scores):
        print(f"\nCandidate {idx+1}:")
        print(f"Resume File: {top_resumes['ID'].iloc[idx]}")
        print(f"Similarity Score: {score:.4f}")
   
if __name__ == "__main__":
    main()








Enter the query:  8+ years of full stack development experience with exposure to node.js and azure, excellent verbal and written communication skills



Candidate 1:
Resume File: 40018190.pdf
Similarity Score: 0.7056

Candidate 2:
Resume File: 24083609.pdf
Similarity Score: 0.7004

Candidate 3:
Resume File: 68460556.pdf
Similarity Score: 0.6991

Candidate 4:
Resume File: 20024870.pdf
Similarity Score: 0.6935

Candidate 5:
Resume File: 12635195.pdf
Similarity Score: 0.6882


## Transformer Approach 