In [1]:
print("Ali")

Ali


In [1]:
!pip install python-docx pdfminer.six scikit-learn gensim spacy nltk sentence-transformers
!python -m spacy download en_core_web_sm

Collecting pdfminer.six
  Using cached pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting scikit-learn
  Using cached scikit_learn-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting gensim
  Using cached gensim-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.2 kB)
Collecting spacy
  Using cached spacy-3.8.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting sentence-transformers
  Using cached sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Collecting charset-normalizer>=2.0.0 (from pdfminer.six)
  Using cached charset_normalizer-3.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (35 kB)
Collecting cryptography>=36.0.0 (from pdfminer.six)
  Using cached cryptography-44.0.3-cp39-abi3-manylinux_2_34_x86_64.whl.metadata (5.7 kB)
Collecting numpy>=1.19.5 (

In [2]:
import os
import re
from typing import List, Dict, Union
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import PyPDF2
from docx import Document

# Initialize NLP tools
nltk.download('punkt')
nltk.download('stopwords')
nlp = spacy.load("en_core_web_sm")

class TextExtractor:
    """Handles text extraction from different file formats"""
    
    @staticmethod
    def extract_text_from_pdf(file_path: str) -> str:
        text = ""
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                text += page.extract_text()
        return text
    
    @staticmethod
    def extract_text_from_docx(file_path: str) -> str:
        doc = Document(file_path)
        return "\n".join([para.text for para in doc.paragraphs])
    
    @staticmethod
    def extract_text_from_txt(file_path: str) -> str:
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    
    @staticmethod
    def extract_text(file_path: str) -> str:
        if file_path.endswith('.pdf'):
            return TextExtractor.extract_text_from_pdf(file_path)
        elif file_path.endswith('.docx'):
            return TextExtractor.extract_text_from_docx(file_path)
        elif file_path.endswith('.txt'):
            return TextExtractor.extract_text_from_txt(file_path)
        else:
            raise ValueError(f"Unsupported file format: {file_path}")

class TextPreprocessor:
    """Handles text cleaning and preprocessing"""
    
    @staticmethod
    def clean_text(text: str) -> str:
        # Remove special characters and numbers
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        # Convert to lowercase
        text = text.lower()
        return text
    
    @staticmethod
    def remove_stopwords(text: str) -> str:
        stop_words = set(stopwords.words('english'))
        word_tokens = word_tokenize(text)
        filtered_text = [word for word in word_tokens if word not in stop_words]
        return ' '.join(filtered_text)
    
    @staticmethod
    def lemmatize_text(text: str) -> str:
        doc = nlp(text)
        lemmatized = [token.lemma_ for token in doc]
        return ' '.join(lemmatized)
    
    @staticmethod
    def preprocess(text: str) -> str:
        text = TextPreprocessor.clean_text(text)
        text = TextPreprocessor.remove_stopwords(text)
        text = TextPreprocessor.lemmatize_text(text)
        return text

[nltk_data] Downloading package punkt to /home/ali-
[nltk_data]     suleman/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /home/ali-
[nltk_data]     suleman/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
from sentence_transformers import SentenceTransformer
import pickle
import os

class ResumeEmbedder:
    """Converts resumes to embeddings and stores them"""
    
    def __init__(self, model_name='all-MiniLM-L6-v2'):
        self.model = SentenceTransformer(model_name)
        self.embeddings = {}
        self.resume_texts = {}
    
    def process_resume_folder(self, folder_path: str):
        """Process all resumes in a folder"""
        for filename in os.listdir(folder_path):
            file_path = os.path.join(folder_path, filename)
            try:
                text = TextExtractor.extract_text(file_path)
                processed_text = TextPreprocessor.preprocess(text)
                embedding = self.model.encode(processed_text)
                self.embeddings[filename] = embedding
                self.resume_texts[filename] = processed_text
            except Exception as e:
                print(f"Error processing {filename}: {str(e)}")
    
    def save_embeddings(self, file_path: str):
        """Save embeddings to disk"""
        with open(file_path, 'wb') as f:
            pickle.dump({'embeddings': self.embeddings, 'texts': self.resume_texts}, f)
    
    def load_embeddings(self, file_path: str):
        """Load embeddings from disk"""
        with open(file_path, 'rb') as f:
            data = pickle.load(f)
            self.embeddings = data['embeddings']
            self.resume_texts = data['texts']

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
class JobDescriptionEmbedder:
    """Converts job descriptions to embeddings and stores them"""
    
    def __init__(self, model_name='all-MiniLM-L6-v2'):
        self.model = SentenceTransformer(model_name)
        self.embeddings = {}
        self.jd_texts = {}
    
    def process_jd_folder(self, folder_path: str):
        """Process all job descriptions in a folder"""
        for filename in os.listdir(folder_path):
            file_path = os.path.join(folder_path, filename)
            try:
                text = TextExtractor.extract_text(file_path)
                processed_text = TextPreprocessor.preprocess(text)
                embedding = self.model.encode(processed_text)
                self.embeddings[filename] = embedding
                self.jd_texts[filename] = processed_text
            except Exception as e:
                print(f"Error processing {filename}: {str(e)}")
    
    def save_embeddings(self, file_path: str):
        """Save embeddings to disk"""
        with open(file_path, 'wb') as f:
            pickle.dump({'embeddings': self.embeddings, 'texts': self.jd_texts}, f)
    
    def load_embeddings(self, file_path: str):
        """Load embeddings from disk"""
        with open(file_path, 'rb') as f:
            data = pickle.load(f)
            self.embeddings = data['embeddings']
            self.jd_texts = data['texts']

In [5]:
class ResumeJobMatcher:
    """Matches resumes to job descriptions based on cosine similarity"""
    
    def __init__(self, resume_embedder: ResumeEmbedder, jd_embedder: JobDescriptionEmbedder):
        self.resume_embedder = resume_embedder
        self.jd_embedder = jd_embedder
    
    def get_top_matches(self, jd_filename: str, top_n: int = 5) -> List[Dict]:
        """
        Get top N matching resumes for a given job description
        Returns list of dicts with resume filename and similarity score
        """
        if jd_filename not in self.jd_embedder.embeddings:
            raise ValueError(f"Job description {jd_filename} not found in embeddings")
        
        jd_embedding = self.jd_embedder.embeddings[jd_filename]
        similarities = []
        
        for resume_filename, resume_embedding in self.resume_embedder.embeddings.items():
            sim = cosine_similarity(
                jd_embedding.reshape(1, -1),
                resume_embedding.reshape(1, -1)
            )[0][0]
            similarities.append((resume_filename, sim))
        
        # Sort by similarity score in descending order
        similarities.sort(key=lambda x: x[1], reverse=True)
        
        return [{'resume': name, 'similarity': score} for name, score in similarities[:top_n]]
    
    def get_all_matches(self, jd_filename: str) -> List[Dict]:
        """
        Get all matches for a given job description
        Returns list of dicts with resume filename and similarity score
        """
        return self.get_top_matches(jd_filename, top_n=len(self.resume_embedder.embeddings))

In [6]:
def main():
    # Initialize embedders
    resume_embedder = ResumeEmbedder()
    jd_embedder = JobDescriptionEmbedder()
    
    # Process files (only need to do this once, then can save/load embeddings)
    resume_embedder.process_resume_folder("Resumes")
    jd_embedder.process_jd_folder("JD")
    
    # Save embeddings for future use
    resume_embedder.save_embeddings("resume_embeddings.pkl")
    jd_embedder.save_embeddings("jd_embeddings.pkl")
    
    # Later, you can load the saved embeddings
    # resume_embedder.load_embeddings("resume_embeddings.pkl")
    # jd_embedder.load_embeddings("jd_embeddings.pkl")
    
    # Initialize matcher
    matcher = ResumeJobMatcher(resume_embedder, jd_embedder)
    
    # Example: Get top 5 matches for a specific job description
    jd_filename = "example_jd.pdf"  # replace with actual filename
    top_matches = matcher.get_top_matches(jd_filename, top_n=5)
    
    print(f"Top matches for {jd_filename}:")
    for match in top_matches:
        print(f"- {match['resume']}: {match['similarity']:.4f}")

if __name__ == "__main__":
    main()

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Error processing ADNAN AHMED - CV for Research Assistants - Lab Instructors.txt: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/home/ali-suleman/nltk_data'
    - '/home/ali-suleman/Desktop/resume-parser-app/env/nltk_data'
    - '/home/ali-suleman/Desktop/resume-parser-app/env/share/nltk_data'
    - '/home/ali-suleman/Desktop/resume-parser-app/env/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************

Error processing Haris Ahmed - CV for Research Assistant.txt: 
***********************************************

ValueError: Job description example_jd.pdf not found in embeddings