In [None]:
!pip install transformers torch accelerate huggingface_hub gradio fpdf shap fitz PyMuPDF PyPDF2 pdfplumber

In [1]:
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModel, pipeline
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import shap
from PyPDF2 import PdfReader
import os
import re
import traceback
from fpdf import FPDF
import pdfplumber

def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Passed straight to ``pdfplumber.open``; the caller in this
            file hands over an open binary file object.

    Returns:
        A single string with the page texts joined without a separator.
        Pages for which no text could be extracted contribute nothing.
    """
    with pdfplumber.open(file_path) as pdf:
        # extract_text() may yield None for a page without extractable text
        # (e.g. a scanned image); "or ''" keeps the join from failing on it.
        return "".join(page.extract_text() or "" for page in pdf.pages)



class JobDescriptionGenerator:
    """Build a sectioned, plain-text job description with a FLAN-T5 model."""

    def __init__(self):
        """Load the tokenizer and seq2seq model and move the model to a device."""
        self.model_name = "google/flan-t5-base"
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name)

        # Use the first CUDA device when one is available, otherwise the CPU.
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        self.model.eval()

    def generate_section(self, section_name, context):
        """Generate the text of one section from a section-specific prompt.

        Args:
            section_name: One of "overview", "responsibilities",
                "requirements", "benefits" or "skills". Any other name falls
                back to an empty prompt.
            context: Dict with the keys "job_title", "industry",
                "experience_level", "key_skills", "company_info" and
                "location".

        Returns:
            The decoded model output. Sampling is enabled, so repeated calls
            with the same arguments can return different text.
        """
        prompts = {
            "overview": f"Write a professional overview paragraph for a {context['job_title']} position in {context['industry']}. Mention the company is {context['company_info']}.",
            "responsibilities": f"List 5-7 key responsibilities for a {context['job_title']} with {context['experience_level']} experience. Focus on {context['key_skills']}.",
            "requirements": f"List 5-7 key requirements for a {context['job_title']} position. Include technical skills like {context['key_skills']} and experience requirements of {context['experience_level']}.",
            "benefits": f"List 4-5 competitive benefits and perks for a {context['job_title']} position at a company that is {context['company_info']}. Location: {context['location']}",
            "skills": f"Generate a list of specific technical and soft skills required for a {context['job_title']} in the {context['industry']} industry. Consider experience level: {context['experience_level']} and company description: {context['company_info']}."
        }

        prompt = prompts.get(section_name, "")

        # Prompts longer than 512 tokens are truncated before generation.
        inputs = self.tokenizer(prompt,
                                return_tensors="pt",
                                max_length=512,
                                truncation=True).to(self.device)

        with torch.no_grad():
            outputs = self.model.generate(
                inputs["input_ids"],
                max_length=200,
                min_length=50,
                num_beams=4,
                temperature=0.8,
                top_k=50,
                top_p=0.9,
                do_sample=True,
                no_repeat_ngram_size=2,
                early_stopping=True
            )

        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

    def format_text_to_bullets(self, text):
        """Split *text* into sentences and return them as bullet lines.

        Sentences are split on whitespace that follows ".", "!" or "?".
        Sentences of 20 characters or fewer are dropped.
        """
        # Raw string: "\s" in a normal literal is an invalid escape sequence.
        sentences = [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]
        return [f"\u2022 {sentence}" for sentence in sentences if len(sentence) > 20]

    def generate_job_description(self, job_title, industry, experience_level, key_skills,
                               company_info=None, location=None):
        """Assemble the full job description as one newline-joined string.

        Args:
            job_title: Title shown (upper-cased) in the header and used in prompts.
            industry: Industry name used in the prompts.
            experience_level: Experience requirement used in the prompts.
            key_skills: Skills text used in the prompts.
            company_info: Optional company description; defaults to
                "a leading company in the industry".
            location: Optional location; defaults to "Flexible".

        Returns:
            Header, overview paragraph, and bulleted responsibilities,
            requirements, skills and benefits sections.
        """
        context = {
            "job_title": job_title,
            "industry": industry,
            "experience_level": experience_level,
            "key_skills": key_skills,
            "company_info": company_info or "a leading company in the industry",
            "location": location or "Flexible"
        }

        # Header. The defaulted location is used so a missing argument is not
        # rendered as "Location: None".
        job_description = [
            "="*50,
            f"\nPOSITION: {job_title.upper()}",
            f"Location: {context['location']}",
            "="*50,
            "\n",
        ]

        # Overview is kept as a paragraph rather than bullets.
        job_description.extend([
            "ABOUT THE ROLE",
            "-"*15,
            self.generate_section("overview", context),
            "\n",
        ])

        # (title, underline width, prompt key) for each bulleted section, in
        # output order.
        bulleted_sections = [
            ("KEY RESPONSIBILITIES", 20, "responsibilities"),
            ("REQUIREMENTS & QUALIFICATIONS", 27, "requirements"),
            ("SPECIFIC SKILLS", 15, "skills"),
            ("BENEFITS & PERKS", 15, "benefits"),
        ]
        for title, rule_width, section_key in bulleted_sections:
            section_text = self.generate_section(section_key, context)
            job_description.extend([
                title,
                "-"*rule_width,
                *self.format_text_to_bullets(section_text),
                "\n",
            ])

        return "\n".join(job_description)


def extract_keywords(text):
    """Return the distinct lower-cased words of *text* longer than 3 characters.

    Words are maximal runs of word characters. The result is a list of
    unique entries in no particular order.
    """
    lowered = text.lower()
    unique_terms = set()
    for token in re.findall(r'\b\w+\b', lowered):
        # Very short tokens are discarded.
        if len(token) > 3:
            unique_terms.add(token)
    return list(unique_terms)

def embed_text(text, model, tokenizer, max_length=512):
    """Embed *text* as the mean of the model's last-layer token vectors.

    Args:
        text: The text to embed.
        model: Callable accepting the tokenizer output as keyword arguments
            and returning an object with a ``last_hidden_state`` tensor.
        tokenizer: Callable turning text into model inputs.
        max_length: Inputs longer than this many tokens are truncated.

    Returns:
        A tensor with one row per input text holding the pooled embedding.

    Raises:
        Exception: Any failure, re-raised with an "Embedding Error" message
            and the original exception attached as the cause.
    """
    try:
        # No padding is requested: a single text needs none, and padding
        # positions must not take part in the average.
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length)

        # Inference only, so skip building the autograd graph.
        with torch.no_grad():
            hidden = model(**inputs).last_hidden_state

        attention_mask = inputs.get("attention_mask")
        if attention_mask is None:
            return hidden.mean(dim=1)

        # Mask-weighted mean: average only over real tokens, so the result is
        # correct even if the tokenizer padded the sequence.
        weights = attention_mask.unsqueeze(-1).to(hidden.dtype)
        token_sum = (hidden * weights).sum(dim=1)
        token_count = weights.sum(dim=1).clamp(min=1.0)
        return token_sum / token_count
    except Exception as e:
        raise Exception(f"Embedding Error: {str(e)}") from e

def calculate_similarity(jd_text, cv_texts):
    """Score each CV against the job description.

    Args:
        jd_text: Job description text.
        cv_texts: Sequence of CV texts.

    Returns:
        A pair ``(similarity_scores, matched_keywords)`` aligned with
        *cv_texts*: the cosine similarity between the JD and CV embeddings as
        a float, and the alphabetically sorted keywords found in both texts.

    Raises:
        Exception: Any failure, re-raised with a "Similarity Calculation
            Error" message and the original exception attached as the cause.
    """
    try:
        # Nothing to score: avoid loading the model at all.
        if not cv_texts:
            return [], []

        tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
        model = AutoModel.from_pretrained("distilbert-base-uncased")

        # The JD embedding and keyword set are the same for every CV, so they
        # are computed once up front.
        jd_vector = embed_text(jd_text, model, tokenizer).detach().numpy()
        jd_keywords = set(extract_keywords(jd_text))

        similarity_scores = []
        matched_keywords = []
        for cv_text in cv_texts:
            cv_vector = embed_text(cv_text, model, tokenizer).detach().numpy()
            score = cosine_similarity(jd_vector, cv_vector)[0][0]
            similarity_scores.append(float(score))

            # Sorted so the displayed keyword list is stable between runs.
            shared = jd_keywords.intersection(extract_keywords(cv_text))
            matched_keywords.append(sorted(shared))

        return similarity_scores, matched_keywords
    except Exception as e:
        raise Exception(f"Similarity Calculation Error: {str(e)}") from e

def explain_xai(jd_text, shortlisted_cvs, is_generated_jd=False):
    """Build the plain-text explanation shown next to the shortlist.

    Args:
        jd_text: Job description text (not used in the explanation itself).
        shortlisted_cvs: Iterable of sequences where index 0 is the CV name,
            index 1 the similarity score and index 3 the matched keywords.
        is_generated_jd: When true, a paragraph describing how the job
            description was generated is included.

    Returns:
        The explanation text. If anything fails, a string starting with
        "XAI Error:" followed by the traceback is returned instead of raising.
    """
    try:
        parts = ["CV Shortlisting Explanation:\n\n"]

        # Describe where the JD came from. The generator in this file loads
        # "google/flan-t5-base", so the model is credited to Google.
        if is_generated_jd:
            parts.append("Job Description Generation:\n")
            parts.append("- Generated using Google's FLAN-T5 Base model\n")
            parts.append("- Model processes structured prompts for different JD sections\n")
            parts.append("- Uses beam search and temperature control for balanced output\n\n")

        parts.append("Matching Process Explanation:\n")
        parts.append("1. Embedding Generation:\n")
        parts.append("   - Using DistilBERT model for creating text embeddings\n")
        parts.append("   - Both JD and CVs are converted to 768-dimensional vectors\n\n")

        parts.append("2. Similarity Calculation:\n")
        parts.append("   - Cosine similarity used to measure document similarity\n")
        parts.append("   - Scores range from 0 to 1 (1 being perfect match)\n\n")

        parts.append("Individual CV Analysis:\n")
        for cv in shortlisted_cvs:
            score = cv[1]
            # Bucket the score into a human-readable strength label.
            if score > 0.8:
                strength = "Very Strong Match"
            elif score > 0.6:
                strength = "Strong Match"
            else:
                strength = "Moderate Match"

            parts.append(f"\nCV: {cv[0]}\n")
            parts.append(f"Similarity Score: {score:.4f}\n")
            parts.append(f"Matched Keywords: {', '.join(cv[3])}\n")
            parts.append(f"Matching Strength: {strength}\n")

        return "".join(parts)
    except Exception as e:
        return f"XAI Error: {str(e)}\n{traceback.format_exc()}"

def process_cv_matching(jd_text, cv_files, is_generated_jd=False):
    """Read the CV PDFs, rank them against the JD and explain the ranking.

    Args:
        jd_text: Job description text.
        cv_files: Uploaded CV PDFs, either objects exposing the file path as
            ``.name`` or plain path strings.
        is_generated_jd: Forwarded to ``explain_xai``.

    Returns:
        On success ``(shortlisted_cvs, explanation)``, where each shortlisted
        entry is ``(file name, score, cv text, matched keywords)`` and at most
        five entries are returned, best score first. On failure
        ``("Processing Error: ...", None)``.
    """
    try:
        if not cv_files:
            return "Processing Error: No CV files were provided.", None

        cv_texts = []
        # Extract text from CV PDFs
        for cv_file in cv_files:
            # NOTE(review): whether Gradio delivers file wrappers or path
            # strings appears to depend on its version; both are accepted.
            cv_path = getattr(cv_file, "name", cv_file)
            with open(cv_path, 'rb') as f:
                pdf_reader = PdfReader(f)
                # Guard against pages that yield no text at all.
                text = "".join(page.extract_text() or "" for page in pdf_reader.pages)
            cv_texts.append((os.path.basename(cv_path), text))

        # Calculate similarities and get matched keywords
        similarity_scores, matched_keywords = calculate_similarity(
            jd_text, [cv_text for _, cv_text in cv_texts]
        )

        # Shortlist top 5 CVs (argsort is ascending, so reverse first)
        top_indices = np.argsort(similarity_scores)[::-1][:5]
        shortlisted_cvs = [(cv_texts[i][0], similarity_scores[i], cv_texts[i][1], matched_keywords[i])
                          for i in top_indices]

        # Generate explanation
        explanation = explain_xai(jd_text, shortlisted_cvs, is_generated_jd)

        return shortlisted_cvs, explanation
    except Exception as e:
        return f"Processing Error: {str(e)}", None

def gradio_app():
    """Build the Gradio Blocks UI and wire the Process button.

    The "Upload Data" tab collects the JD source, an optional JD PDF or
    prompt, and the CV PDFs. The "Results" tab shows the shortlist, the
    explanation text and the job description that was used.

    Returns:
        The constructed ``gr.Blocks`` app (not yet launched).
    """
    with gr.Blocks() as app:
        # Create tabs
        with gr.Tabs():
            with gr.Tab("Upload Data"):
                with gr.Row():
                    jd_source = gr.Radio(["Upload JD", "Create JD via Prompt"],
                                       label="Job Description Source")
                with gr.Row():
                    jd_file = gr.File(label="Upload JD PDF")
                    jd_prompt = gr.Textbox(label="Enter JD Prompt",
                                         placeholder="Enter job details to generate JD...")
                with gr.Row():
                    cv_folder_path = gr.File(label="Upload CVs (PDFs Only)",
                                           file_types=[".pdf"],
                                           file_count="multiple")
                with gr.Row():
                    run_button = gr.Button("Process")

            with gr.Tab("Results"):
                with gr.Row():
                    shortlisted_cvs = gr.Dataframe(
                        headers=["CV Name", "Match Score", "Matched Keywords"],
                        label="Shortlisted CVs"
                    )
                with gr.Row():
                    explanation_text = gr.Textbox(
                        label="Analysis Explanation",
                        lines=10
                    )
                with gr.Row():
                    jd_text_display = gr.Textbox(
                        label="Job Description",
                        lines=5
                    )

        def process_and_display(jd_source, jd_file, jd_prompt, cv_files):
            """Resolve the JD text, rank the CVs and return the three outputs.

            Returns ``(table rows, explanation or error message, JD text)``.
            """
            try:
                # Get JD text based on source
                is_generated_jd = False
                if jd_source == "Upload JD":
                    if jd_file is None:
                        return [], "Error: Please upload a JD PDF.", ""
                    # Accept either an object exposing .name or a plain path.
                    jd_path = getattr(jd_file, "name", jd_file)
                    with open(jd_path, 'rb') as f:
                        jd_text = extract_text_from_pdf(f)
                else:
                    generator = JobDescriptionGenerator()
                    jd_text = generator.generate_job_description(
                        job_title="Role from Prompt",
                        industry="General",
                        experience_level="Not Specified",
                        key_skills=jd_prompt,
                        location="Not Specified"
                    )
                    is_generated_jd = True

                # Process CVs
                shortlisted, explanation = process_cv_matching(jd_text, cv_files, is_generated_jd)

                # process_cv_matching reports failure as (message, None);
                # show that message instead of treating it as a CV list.
                if explanation is None or isinstance(shortlisted, str):
                    return [], str(shortlisted), jd_text

                # Format data for display
                cv_display_data = [
                    [cv[0], f"{cv[1]:.4f}", ", ".join(cv[3])]
                    for cv in shortlisted
                ]

                return cv_display_data, explanation, jd_text

            except Exception as e:
                return [], f"Error: {str(e)}", ""

        run_button.click(
            process_and_display,
            inputs=[jd_source, jd_file, jd_prompt, cv_folder_path],
            outputs=[shortlisted_cvs, explanation_text, jd_text_display]
        )

    return app

# Launch the app

# Build the Blocks UI and start serving it. According to the launch output
# recorded with this cell, debug=True keeps the cell running so errors and
# logs stay visible, and share=True publishes a temporary public URL.
app = gradio_app()
app.launch(debug=True, share=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://f336f4be75ec13332b.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://f336f4be75ec13332b.gradio.live


