In [1]:
import os
import re
import nltk
import docx
import string
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from pdfminer.high_level import extract_text

nltk.download('stopwords')
from nltk.corpus import stopwords

def clean_text(text):
    """Normalise raw text into a space-separated string of content words.

    The text is lower-cased, URLs, digits and ASCII punctuation are removed,
    and English stop words are dropped.

    Args:
        text: Raw text to clean.

    Returns:
        The remaining tokens joined by single spaces ("" if none remain).
    """
    # One translation table removes every ASCII punctuation character without
    # having to embed (and correctly escape) them in a regex character class.
    strip_punct = str.maketrans('', '', string.punctuation)

    text = text.lower()
    # URLs are removed first, while their punctuation is still intact.
    text = re.sub(r'https?:\S+|www\.\S+', '', text)
    text = re.sub(r'\d+', '', text)
    text = text.translate(strip_punct)

    # split() with no argument splits on any run of whitespace (newlines
    # included) and ignores leading/trailing whitespace, so no separate
    # whitespace-collapsing pass is needed.
    tokens = text.split()

    # The stop-word list contains apostrophised forms (e.g. "don't"). The
    # tokens have already lost their punctuation, so strip it from the stop
    # words as well; otherwise "dont" would never be recognised as one.
    stop_words = {
        word.translate(strip_punct) for word in stopwords.words('english')
    }
    return ' '.join(t for t in tokens if t not in stop_words)

def extract_from_pdf(file_path):
    """Return the text of a PDF file, or "" if it cannot be extracted.

    Args:
        file_path: Path of the PDF, passed straight to ``extract_text``.

    Returns:
        The extracted text, or an empty string when extraction raises.
    """
    try:
        return extract_text(file_path)
    except Exception:
        # Best-effort: an unreadable file yields "" instead of aborting.
        # Catching Exception rather than using a bare ``except`` lets
        # KeyboardInterrupt and SystemExit propagate.
        return ""

def extract_from_docx(file_path):
    """Return the paragraph text of a .docx file, or "" if it cannot be read.

    Args:
        file_path: Path of the document, passed to ``docx.Document``.

    Returns:
        The text of every paragraph in ``doc.paragraphs`` joined with
        newlines, or an empty string when opening/reading raises.
    """
    try:
        doc = docx.Document(file_path)
        return "\n".join(para.text for para in doc.paragraphs)
    except Exception:
        # Best-effort: an unreadable file yields "" instead of aborting.
        # Catching Exception rather than using a bare ``except`` lets
        # KeyboardInterrupt and SystemExit propagate.
        return ""

def load_resumes(resume_folder):
    """Load and clean every readable PDF/DOCX resume in a folder.

    Args:
        resume_folder: Directory whose entries are scanned (not recursive).

    Returns:
        A ``(resumes, file_names)`` tuple of parallel lists: the cleaned text
        of each resume and the name of the file it came from. Entries with
        any other extension, or whose extraction yields no text, are omitted.
    """
    resumes = []
    file_names = []
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # output (and the order of equally-scored resumes later) reproducible.
    for filename in sorted(os.listdir(resume_folder)):
        path = os.path.join(resume_folder, filename)
        # Compare extensions case-insensitively so "CV.PDF" or "Resume.Docx"
        # are not silently skipped.
        lowered = filename.lower()
        if lowered.endswith('.pdf'):
            text = extract_from_pdf(path)
        elif lowered.endswith('.docx'):
            text = extract_from_docx(path)
        else:
            continue
        # The extractors return "" on failure; skip those files.
        if text:
            resumes.append(clean_text(text))
            file_names.append(filename)
    return resumes, file_names

def load_job_description(jd_path):
    """Read the UTF-8 text file at ``jd_path`` and return its cleaned text."""
    with open(jd_path, encoding='utf-8') as handle:
        raw_text = handle.read()
    return clean_text(raw_text)

def rank_resumes(jd, resumes, file_names):
    """Rank resumes by TF-IDF cosine similarity to the job description.

    Args:
        jd: Job description text.
        resumes: List of resume texts.
        file_names: Names paired positionally with ``resumes``.

    Returns:
        A list of ``(file_name, score)`` tuples sorted by score, highest
        first. Returns an empty list when there are no resumes.
    """
    # With no resumes the similarity matrix would have zero rows, which
    # cosine_similarity rejects; there is simply nothing to rank.
    if not resumes:
        return []

    # Fit on the job description and resumes together so they share one
    # vocabulary; row 0 is the job description, the rest are the resumes.
    vectors = TfidfVectorizer().fit_transform([jd] + resumes)
    scores = cosine_similarity(vectors[0], vectors[1:]).flatten()
    return sorted(zip(file_names, scores), key=lambda x: x[1], reverse=True)

if __name__ == "__main__":
    # Input locations, resolved relative to the current working directory.
    jd_path = 'job_description.txt'
    resume_folder = 'resumes'

    # Step 1: read and clean the job description.
    print(" Loading job description...")
    jd = load_job_description(jd_path)

    # Step 2: read and clean every resume found in the folder.
    print(" Loading resumes...")
    resumes, file_names = load_resumes(resume_folder)
    print(f" {len(resumes)} resumes loaded.")

    # Step 3: score each resume against the job description.
    print(" Ranking resumes...")
    ranked_resumes = rank_resumes(jd, resumes, file_names)

    # Step 4: report the results, best match first, numbered from 1.
    print("\n Ranked Resumes:")
    for rank, (name, score) in enumerate(ranked_resumes, start=1):
        print(f"{rank}. {name} (Score: {score:.4f})")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


 Loading job description...
 Loading resumes...
 5 resumes loaded.
 Ranking resumes...

 Ranked Resumes:
1. Charlie_Lee.docx (Score: 0.1918)
2. Alice_Johnson.docx (Score: 0.1653)
3. Diana_Patel.docx (Score: 0.1079)
4. Ethan_Zhang.docx (Score: 0.1006)
5. Bob_Smith.docx (Score: 0.0651)
