In [12]:
!pip install gradio sentence-transformers faiss-cpu PyPDF2 docx2txt requests PyMuPDF
with gr.Blocks() as app:
    gr.Markdown("## 📄 CV Matcher App")



In [13]:
!pip install torch



In [14]:
import os
import gradio as gr
import numpy as np
import torch
import requests
import faiss
import docx2txt
import PyPDF2
from transformers import AutoTokenizer, AutoModel

In [15]:
def extract_text_from_file(file_path):
    """Extract plain text from a PDF or DOCX file.

    Args:
        file_path: Path to the file. The format is chosen from the file
            extension, compared case-insensitively.

    Returns:
        The extracted text. Failures are reported in-band instead of raised:
        "[Error extracting PDF text]" / "[Error extracting DOCX text]" when
        the file cannot be opened or parsed, and "[Unsupported file type]"
        for any other extension.
    """
    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".pdf":
        try:
            with open(file_path, "rb") as f:
                reader = PyPDF2.PdfReader(f)
                # `or ""` keeps the join working for pages that yield no text.
                return " ".join(page.extract_text() or "" for page in reader.pages)
        except Exception:
            # `except Exception` (not a bare `except:`) so KeyboardInterrupt
            # and SystemExit still propagate.
            return "[Error extracting PDF text]"

    elif ext == ".docx":
        try:
            return docx2txt.process(file_path)
        except Exception:
            return "[Error extracting DOCX text]"

    else:
        return "[Unsupported file type]"


In [16]:

# Checkpoint identifier passed to both from_pretrained() calls below.
MODEL_NAME = "thenlper/gte-small"
# Tokenizer and model are created once at module level; get_embeddings() reads
# them as globals.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

def get_embeddings(texts, max_length=512):
    """Embed one or more texts with the module-level ``tokenizer`` and ``model``.

    Each text is tokenized and split into chunks so that a chunk plus its
    special tokens is at most ``max_length`` ids long. Every chunk is run
    through the model, its ``last_hidden_state`` is mean-pooled over the token
    axis, and the chunk embeddings are averaged with equal weight.

    Args:
        texts: A single string or an iterable of strings. ``None`` entries are
            treated as empty text.
        max_length: Maximum number of input ids per model call, including the
            special tokens added around each chunk.

    Returns:
        A NumPy array with one row per input text. A text that produces no
        tokens is represented by an all-zero vector of
        ``model.config.hidden_size`` elements.
    """
    if isinstance(texts, str):
        texts = [texts]

    # Wrap every chunk in the tokenizer's [CLS]/[SEP]-style special tokens (when
    # it defines them) so the model sees the same input layout it gets from a
    # normal `tokenizer(text)` call.
    prefix = [tokenizer.cls_token_id] if tokenizer.cls_token_id is not None else []
    suffix = [tokenizer.sep_token_id] if tokenizer.sep_token_id is not None else []
    # Reserve room for the special tokens; never let the chunk size drop below 1.
    chunk_size = max(1, max_length - len(prefix) - len(suffix))

    final_embeddings = []
    for text in texts:
        tokens = tokenizer.tokenize(text or "")
        chunk_embeddings = []
        for start in range(0, len(tokens), chunk_size):
            chunk = tokens[start:start + chunk_size]
            input_ids = prefix + tokenizer.convert_tokens_to_ids(chunk) + suffix
            input_ids = torch.tensor([input_ids])
            with torch.no_grad():
                output = model(input_ids=input_ids)
            # Mean over the token axis -> one vector per chunk (batch size 1).
            chunk_embeddings.append(output.last_hidden_state.mean(dim=1))
        if chunk_embeddings:
            avg_embedding = torch.stack(chunk_embeddings).mean(dim=0)
            final_embeddings.append(avg_embedding.squeeze(0).numpy())
        else:
            # float32 so one empty text does not upcast the whole result array.
            final_embeddings.append(
                np.zeros(model.config.hidden_size, dtype=np.float32)
            )
    return np.array(final_embeddings)


In [17]:

def create_faiss_index(vectors):
    """Build a flat L2 FAISS index containing ``vectors``.

    The dimensionality is read from the first vector, and the data is
    converted to a float32 array before being added to the index.

    Returns:
        The populated index, or ``None`` if anything fails (the error is
        printed rather than raised).
    """
    try:
        n_dims = vectors[0].shape[0]
        l2_index = faiss.IndexFlatL2(n_dims)
        matrix = np.array(vectors).astype("float32")
        l2_index.add(matrix)
    except Exception as e:
        print(f"❌ Error creating FAISS index: {e}")
        return None
    return l2_index

def search_similar_cvs(query_vector, index, k=3):
    """Return the positions of the ``k`` vectors in ``index`` nearest to the query.

    Args:
        query_vector: A single query embedding (1-D, or already shaped (1, d)).
        index: An object exposing ``search(queries, k)``; if it also exposes
            ``ntotal``, ``k`` is capped to that size.
        k: Maximum number of neighbours to return.

    Returns:
        A list of non-negative integer positions, best match first. It may be
        shorter than ``k``. On any error the problem is printed and an empty
        list is returned.
    """
    try:
        # Exactly one query row, whether the caller passed (d,) or (1, d).
        query = np.asarray(query_vector, dtype="float32").reshape(1, -1)

        # Never ask for more neighbours than the index holds.
        total = getattr(index, "ntotal", None)
        if total is not None:
            k = min(k, total)
        if k <= 0:
            return []

        _, indices = index.search(query, k)
        # FAISS pads missing neighbours with -1; a negative position would
        # silently alias the last element when used to index a Python list.
        return [int(i) for i in indices[0] if i >= 0]
    except Exception as e:
        print(f"❌ Error searching index: {e}")
        return []


In [18]:
# Read the Groq API key from the environment rather than committing it to the
# notebook. SECURITY: a literal key used to be hard-coded on this line; treat
# that key as leaked and revoke it. When the variable is unset this is "" and
# summarize_match() reports that the key is not set.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "").strip()

def summarize_match(job_description, cv_names, cv_snippets):
    """Ask the Groq chat-completions API to assess candidate CVs against a job.

    Inputs are trimmed before being placed in the prompt: the job description
    to 800 characters, each CV snippet to 700, each CV name to 60, and the
    assembled prompt to 8000.

    Args:
        job_description: Job description text.
        cv_names: Display names of the candidate CVs.
        cv_snippets: CV texts, paired positionally with ``cv_names``.

    Returns:
        The assistant message content from the API response, or a string
        starting with "❌" describing why no summary could be produced
        (missing key, request failure, or any other error).
    """
    if not GROQ_API_KEY:
        return "❌ GROQ_API_KEY not set."

    try:
        jd_excerpt = job_description.strip()[:800] or "[No description provided]"
        snippet_excerpts = [(raw.strip()[:700] or "[No content]") for raw in cv_snippets]
        short_names = [label[:60] for label in cv_names]

        # Numbered "name: snippet" entries, one per candidate.
        cv_section = "".join(
            f"\n{pos}. {label}:\n{excerpt}\n"
            for pos, (label, excerpt) in enumerate(
                zip(short_names, snippet_excerpts), start=1
            )
        )

        n_candidates = len(short_names)
        # Only ask for a ranking when there is more than one candidate.
        if n_candidates > 1:
            rank_instruction = "- Rank the candidates based on fit if more than one is provided."
        else:
            rank_instruction = ""

        prompt = f"""
You are an AI recruitment assistant helping to evaluate candidate CVs for a job opening.

Below is the job description, followed by {n_candidates} candidate CV summaries.

Your job is to:
- Analyze each candidate's relevance based on their technical skills, tools, and experience
- Be honest: clearly state if a candidate is a good fit or not
- Avoid generic praise unless it's supported by actual content
{rank_instruction}

### Job Description:
{jd_excerpt}

### Candidate CVs:{cv_section}
""".strip()

        # Hard cap on prompt size (a no-op for shorter prompts).
        prompt = prompt[:8000]

        request_headers = {
            "Authorization": f"Bearer {GROQ_API_KEY}",
            "Content-Type": "application/json"
        }
        request_payload = {
            "model": "llama3-8b-8192",
            "messages": [
                {"role": "system", "content": "You are a helpful recruitment assistant."},
                {"role": "user", "content": prompt}
            ],
            "temperature": 0.4
        }
        response = requests.post(
            url="https://api.groq.com/openai/v1/chat/completions",
            headers=request_headers,
            json=request_payload,
            timeout=30
        )
        response.raise_for_status()
        reply = response.json()
        return reply["choices"][0]["message"]["content"]

    except requests.exceptions.RequestException as e:
        return f"❌ Groq API error: {e}"
    except Exception as e:
        return f"❌ Unexpected error: {e}"




In [22]:
# Session state shared by upload_cvs(), match_jd() and clear_data().
# cv_texts and cv_names are parallel lists built from the same uploaded files;
# cv_vectors holds the embeddings computed from cv_texts.
cv_texts, cv_names, cv_vectors = [], [], []
# Search index over cv_vectors; None until upload_cvs() has built one.
faiss_index = None

def upload_cvs(files):
    """Extract, embed and index the uploaded CV files.

    The module-level session state (``cv_texts``, ``cv_names``, ``cv_vectors``,
    ``faiss_index``) is replaced only when every step succeeds, so a failed
    upload leaves the previous session untouched.

    Args:
        files: Uploaded files, each either a path (``str`` / path-like) or an
            object whose ``name`` attribute is the path. May be ``None`` or
            empty when nothing was selected.

    Returns:
        A status message for the UI.
    """
    global cv_texts, cv_names, cv_vectors, faiss_index

    # Nothing selected: `files` may be None, which has no len().
    if not files:
        return "❌ No valid CVs."
    if len(files) > 10:
        return "❌ Limit exceeded: Max 10 CVs."

    # Resolve each upload to its path and drop duplicates, keeping first-seen order.
    paths = []
    seen = set()
    for f in files:
        if isinstance(f, (str, os.PathLike)):
            path = os.fspath(f)
        else:
            path = getattr(f, "name", f)
        if path not in seen:
            seen.add(path)
            paths.append(path)

    texts = [extract_text_from_file(path) for path in paths]
    vectors = get_embeddings(texts)

    if vectors is None or np.array(vectors).size == 0:
        return "❌ No valid CVs."

    index = create_faiss_index(vectors)
    if index is None:
        # create_faiss_index() already printed the cause; don't report success.
        return "❌ Failed to build the search index."

    # Commit the new session only after every step succeeded.
    cv_texts, cv_names, cv_vectors, faiss_index = texts, paths, vectors, index
    return f"✅ Uploaded and indexed {len(paths)} CV(s)."

def match_jd(jd_text, match_mode):
    """Match a job description against the indexed CVs and summarize the result.

    Args:
        jd_text: Job description text from the UI (may be ``None`` or blank).
        match_mode: "Top 3 Matches" to search the index for the nearest CVs;
            any other value selects every uploaded CV.

    Returns:
        A formatted result string listing the matched CV names followed by the
        summary from ``summarize_match``, or a short status message when there
        is nothing to match.
    """
    if faiss_index is None:
        return "❌ Please upload CVs first."
    if not jd_text or not jd_text.strip():
        return "⚠️ Job description is empty."

    jd_vector = get_embeddings([jd_text])[0]
    if match_mode == "Top 3 Matches":
        indices = search_similar_cvs(jd_vector, faiss_index, k=3)
    else:
        indices = list(range(len(cv_names)))

    seen = set()
    unique_indices = []
    for i in indices:
        # Skip positions outside the list. FAISS pads with -1 when the index
        # holds fewer than k vectors, and a negative position would silently
        # alias the last CV.
        if not 0 <= i < len(cv_names):
            continue
        if cv_names[i] not in seen:
            seen.add(cv_names[i])
            unique_indices.append(i)

    if not unique_indices:
        # Don't spend an API call on an empty candidate list.
        return "❌ No matching CVs found."

    matched = [cv_names[i] for i in unique_indices]
    texts = [cv_texts[i] for i in unique_indices]
    summary = summarize_match(jd_text, matched, texts)
    title = f"✅ Matching {len(matched)} CVs:"
    return f"{title}\n\n" + "\n".join(matched) + f"\n\n📝 Summary:\n{summary}"

def clear_data():
    """Reset the session: empty the CV lists and drop the search index.

    Returns:
        A confirmation message for the UI.
    """
    global cv_texts, cv_names, cv_vectors, faiss_index
    cv_texts = []
    cv_names = []
    cv_vectors = []
    faiss_index = None
    return "🧹 Cleared."

import gradio as gr

# Declare the Gradio UI: an upload section, a job-description box with a
# matching-mode selector, a results box, and a control to clear the session.
with gr.Blocks() as app:
    gr.Markdown("## 📄 CV Matcher App")

    # Upload section: only .pdf and .docx files are offered, several at once.
    file_input = gr.File(file_types=[".pdf", ".docx"], file_count="multiple", label="📤 Upload CVs")
    upload_button = gr.Button("📁 Upload & Index")
    upload_status = gr.Textbox(label="Upload Status")

    # Job description input
    jd_input = gr.Textbox(label="📋 Paste Job Description", lines=6, placeholder="Paste JD here...")

    # Matching mode (Top 3 or All); these labels are the values match_jd() receives.
    match_mode = gr.Radio(["Top 3 Matches", "All Uploaded CVs"], value="Top 3 Matches", label="Matching Mode")

    match_button = gr.Button("🔍 Match CVs")
    result_output = gr.Textbox(label="Results", lines=20)

    # Clear session
    clear_button = gr.Button("🧹 Clear All")
    clear_status = gr.Textbox(label="Clear Status")

    # Click actions: each button calls one handler and shows its returned
    # string in the paired textbox.
    upload_button.click(upload_cvs, inputs=[file_input], outputs=[upload_status])
    match_button.click(match_jd, inputs=[jd_input, match_mode], outputs=[result_output])
    clear_button.click(clear_data, inputs=[], outputs=[clear_status])

# Launch the app
app.launch()


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://9b57660317b88342d5.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


