In [1]:
pip install pymupdf
pip install nltk


SyntaxError: invalid syntax (746932054.py, line 1)

In [9]:
import fitz  # PyMuPDF for PDF extraction
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import joblib


# Download necessary NLTK data
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    try:
        doc = fitz.open(pdf_path)
        text = ""
        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)
            text += page.get_text()
        return text
    except Exception as e:
        print(f"Error reading PDF: {e}")
        return ""

# Initialize Lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Text preprocessing function
def preprocess_text(text):
    try:
        text = str(text).lower()  # Convert to lowercase
        text = re.sub(r'\W', ' ', text)  # Remove non-alphabetic characters
        text = re.sub(r'\d+', '', text)  # Remove digits
        tokens = word_tokenize(text)
        tokens = [word for word in tokens if word not in stop_words]
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
        return " ".join(tokens)
    except Exception as e:
        print(f"Error processing text: {e}")
        return ""

# Load pre-trained model and vectorizer
svm_model = joblib.load("svm_model.pkl")
tfidf_vectorizer = joblib.load("tfidf_vectorizer.pkl")

# Function to classify resumes
def classify_resumes(job_description, resumes):
    # Preprocess the job description
    job_desc_cleaned = preprocess_text(job_description)

    # Preprocess and vectorize the job description
    job_desc_tfidf = tfidf_vectorizer.transform([job_desc_cleaned])

    # Create a list to hold predictions
    best_fit = None
    best_similarity = -1

    for resume in resumes:
        # Extract text from each resume (assumed to be in PDF format)
        resume_text = extract_text_from_pdf(resume)
        
        if not resume_text:
            print(f"Could not extract text from {resume}. Skipping.")
            continue
        
        # Preprocess the resume text
        resume_cleaned = preprocess_text(resume_text)

        # Vectorize the resume
        resume_tfidf = tfidf_vectorizer.transform([resume_cleaned])

        # Get the similarity score between job description and resume
        similarity = cosine_similarity(job_desc_tfidf, resume_tfidf)

        if similarity > best_similarity:
            best_similarity = similarity
            best_fit = resume

    return best_fit, best_similarity

# Example usage
job_description = "We are looking for a passionate and driven Full Stack Software Engineer to join our growing engineering team. You will be responsible for developing scalable and maintainable web applications across the stack â€” from designing robust APIs to crafting responsive front-end interfaces."
resumes = ["Test1 (1).pdf", "Test1 (2).pdf", "Test1 (3).pdf", "Test2 (1).pdf", "Test2 (2).pdf", "Test2 (3).pdf", "TESTX.pdf"]  # Add paths to your PDF resumes

best_resume, similarity_score = classify_resumes(job_description, resumes)

print(f"Best Fit Resume: {best_resume}")
print(f"Similarity Score: {similarity_score}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\VE00YM679\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\VE00YM679\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\VE00YM679\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Best Fit Resume: TESTX.pdf
Similarity Score: [[0.44422579]]
