In [2]:
import os
import re
import pdfplumber
import docx
import pandas as pd
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
import win32com.client

# Load Hugging Face models for classification and NER
def load_classification_pipeline(model_name):
    return pipeline("zero-shot-classification", model=model_name)

def load_ner_pipeline(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(model_name)
    return pipeline("ner", model=model, tokenizer=tokenizer)

# Define model names and initialize pipelines
classification_model_name = "distilbert-base-uncased"  # Replace with your preferred classification model
classifier = load_classification_pipeline(classification_model_name)

ner_model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"  # Replace with your preferred NER model
ner_pipeline = load_ner_pipeline(ner_model_name)

# Function to get text from various file formats
def input_pdf_text(file_path):
    with pdfplumber.open(file_path) as pdf:
        return ''.join([page.extract_text() for page in pdf.pages])

def input_docx_text(file_path):
    try:
        doc = docx.Document(file_path)
        return '\n'.join([para.text for para in doc.paragraphs])
    except Exception as e:
        print(f"Error processing DOCX file {file_path}: {e}")
        return None

def input_doc_text(file_path):
    try:
        word = win32com.client.Dispatch("Word.Application")
        doc = word.Documents.Open(file_path)
        text = doc.Content.Text
        doc.Close(False)
        word.Quit()
        return text
    except Exception as e:
        print(f"Error processing DOC file {file_path}: {e}")
        return None

def extract_text(file_path):
    if file_path.endswith(".pdf"):
        return input_pdf_text(file_path)
    elif file_path.endswith(".docx"):
        return input_docx_text(file_path)
    elif file_path.endswith(".doc"):
        return input_doc_text(file_path)
    return None

# Function to classify text into dynamic categories
def classify_text(text, categories):
    result = classifier(text, candidate_labels=categories)
    return result['labels'][0], result['scores'][0]  # return top label and score

# Function to extract named entities based on dynamic labels
def extract_entities(text, entity_labels):
    entities = ner_pipeline(text)
    return [(ent['word'], ent['entity']) for ent in entities if ent['entity'] in entity_labels]

# Function to annotate text based on dynamic categories and entities
def annotate_text(text, categories, entity_labels):
    category, confidence = classify_text(text, categories)
    entities = extract_entities(text, entity_labels)
    return {
        "category": category,
        "confidence": confidence,
        "entities": entities
    }

# Function to dynamically extract skills (can be improved with pre-defined skill lists)
def extract_skills(text, skills_model=None):
    # Use a TF-IDF Vectorizer to identify potential skills from the text
    vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
    tfidf_matrix = vectorizer.fit_transform([text])
    feature_names = vectorizer.get_feature_names_out()
    scores = tfidf_matrix.sum(axis=0).A1
    skills = [feature_names[i] for i in scores.argsort()[::-1] if scores[i] > 0.1]  # threshold can be adjusted
    return skills

# Function to parse a single resume
def parse_resume(file_path, categories, entity_labels):
    text = extract_text(file_path)
    if not text:
        print(f"Failed to extract text from {file_path}")
        return None

    # Extract relevant information
    skills = extract_skills(text, None)  # skills_model can be None if using TF-IDF

    # Dynamically annotate the text based on categories and entities
    annotations = annotate_text(text, categories, entity_labels)

    # Parse results
    return {
        "text": text,
        "skills": ", ".join(skills) if skills else None,
        "annotations": annotations
    }

# Function to load and parse all resumes in a directory
def load_resumes(directory, categories, entity_labels):
    resumes = []
    for filename in os.listdir(directory):
        if filename.endswith(('.pdf', '.docx', '.doc')):
            file_path = os.path.join(directory, filename)
            print(f"Processing file: {file_path}")
            parsed_resume = parse_resume(file_path, categories, entity_labels)
            if parsed_resume:
                resumes.append(parsed_resume)
    return resumes

# Function to match resumes with job descriptions using TF-IDF and cosine similarity
def match_resume_with_jd(resume_text, job_description):
    documents = [resume_text, job_description]
    tfidf = TfidfVectorizer().fit_transform(documents)
    similarity_matrix = cosine_similarity(tfidf[0:1], tfidf[1:2])
    return similarity_matrix[0][0]  # Return the similarity score

# Function to segregate files by job descriptions
def segregate_files_by_jd(resumes, job_descriptions, base_directory):
    if not os.path.exists(base_directory):
        os.makedirs(base_directory)

    for resume in resumes:
        highest_match = 0
        best_jd = None
        
        for jd_title, jd_text in job_descriptions.items():
            match_score = match_resume_with_jd(resume['text'], jd_text)
            if match_score > highest_match:
                highest_match = match_score
                best_jd = jd_title

        if best_jd:
            jd_directory = os.path.join(base_directory, best_jd)
            if not os.path.exists(jd_directory):
                os.makedirs(jd_directory)

            # Sanitize the resume name to avoid illegal characters
            resume_filename = f"resume_{resumes.index(resume)}.txt"
            file_path = os.path.join(jd_directory, resume_filename)

            try:
                with open(file_path, 'w', encoding='utf-8') as file:
                    file.write(resume['text'])
                print(f"Moved resume {resumes.index(resume)} to {jd_directory}")
            except Exception as e:
                print(f"Error writing file {file_path}: {e}")

# Main execution
if __name__ == "__main__":
    # Define paths and job descriptions
    directory = "C:/Users/ananya/Downloads/resumes1"  # Directory containing resumes
    base_directory = "C:/Users/ananya/Downloads/RESULT_MAX"  # Directory to save categorized resumes
    job_descriptions = {
        "AI engineer": "The AI Engineer designs and implements artificial intelligence models and algorithms...",
        "Business development executive": "The Business Development Executive is responsible for identifying...",
        "Subject matter expert": "The Subject Matter Expert provides expert knowledge and insights..."
    }
    
    categories = ["education", "skill", "experience", "interest", "personal information"]
    entity_labels = ["PER", "ORG", "LOC", "MISC"]  # Adjust according to the NER model used
    
    resumes = load_resumes(directory, categories, entity_labels)
    segregate_files_by_jd(resumes, job_descriptions, base_directory)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS

Processing file: C:/Users/ananya/Downloads/resumes1\1707727659390_Gudi Madhu Latha-CV.pdf
Processing file: C:/Users/ananya/Downloads/resumes1\Aakash_Muthreja_CV1 (1).pdf
Processing file: C:/Users/ananya/Downloads/resumes1\Aakash_Muthreja_CV1.pdf
Processing file: C:/Users/ananya/Downloads/resumes1\Ajay _Resume_2024_PM.pdf
Processing file: C:/Users/ananya/Downloads/resumes1\ALOK NATH (1).pdf
Processing file: C:/Users/ananya/Downloads/resumes1\Aswathi p_Mcc (4).pdf
Processing file: C:/Users/ananya/Downloads/resumes1\Atul Narayan.pdf
Processing file: C:/Users/ananya/Downloads/resumes1\Avkash-Resume.pdf
Processing file: C:/Users/ananya/Downloads/resumes1\Brototi Banerjee_Final_CV_Updated (2).docx
Processing file: C:/Users/ananya/Downloads/resumes1\CURRICULUM VITAE.pdf
Processing file: C:/Users/ananya/Downloads/resumes1\cvnd24 (3).pdf
Processing file: C:/Users/ananya/Downloads/resumes1\CV_2023070509122292.pdf
Processing file: C:/Users/ananya/Downloads/resumes1\CV_Resume.pdf
Processing file: 