In [26]:
# Core Python
import os
import re
import json
from typing import List, Tuple, Dict

# Data Handling
import pandas as pd
import numpy as np

# Progress & Logging
from tqdm import tqdm
from rich import print

# Document Loaders & Parsing
from pypdf import PdfReader
import docx2txt

# NLP & Embedding
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# LLM & RAG tools
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PyPDFLoader, Docx2txtLoader
from langchain.text_splitter import CharacterTextSplitter

# Optional: if using cloud later
# from dotenv import load_dotenv
# import boto3


In [28]:
import os
from PyPDF2 import PdfReader

# Folder containing the CVs to load.
pdf_folder = r"C:\Users\user\OneDrive\Desktop\CVs"  # Use raw string to avoid escape issues

# One entry per PDF found in pdf_folder: the text of all its pages joined together.
pdf_texts = []

# os.listdir() returns entries in arbitrary order; sorting keeps pdf_texts
# (and everything built from it) in the same order on every run.
for filename in sorted(os.listdir(pdf_folder)):
    # Compare the extension case-insensitively so "CV.PDF" is not silently skipped.
    if not filename.lower().endswith(".pdf"):
        continue
    reader = PdfReader(os.path.join(pdf_folder, filename))
    # `or ""` keeps a page whose extraction yields nothing from breaking the join.
    pdf_texts.append("".join(page.extract_text() or "" for page in reader.pages))


In [29]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

# Splitter settings: chunk_size=1000 with chunk_overlap=200.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=200, chunk_size=1000)

# One Document per chunk, in the same order as the texts in pdf_texts.
# NOTE(review): only page_content is set, so a chunk carries no record of which
# CV it came from.
documents = [
    Document(page_content=piece)
    for cv_text in pdf_texts
    for piece in text_splitter.split_text(cv_text)
]


In [None]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.vectorstores import FAISS

# Gemini API key: never hardcode credentials in a notebook. The key is taken
# from the GOOGLE_API_KEY environment variable; if it is not set, it is asked
# for interactively (getpass does not echo the input).
# SECURITY: an API key was previously committed in this cell. Treat it as
# leaked and revoke/rotate it.
import os
from getpass import getpass

if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass("Enter your Google (Gemini) API key: ")

# Fail early with a clear message instead of an obscure error while building
# the index when no text chunks were produced (e.g. empty folder, scanned PDFs).
if not documents:
    raise ValueError("No document chunks to index: check that the CV folder contains PDFs with extractable text.")

# Create embeddings and store in FAISS
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
vectorstore = FAISS.from_documents(documents, embeddings)


In [54]:
from langchain.prompts import PromptTemplate

# Prompt text for the CV evaluation. The {job_role}, {requirements} and
# {context} placeholders are the input variables declared below.
cv_evaluation_template = """
You are an expert CV evaluator.

Job Role: {job_role}
Requirements:
{requirements}

Candidate CV Content:
{context}

Please respond in the following format:

Strong Points:
- ...

Weak Points:
- ...

Final Decision:
Accepted or Rejected (with reason)
"""

custom_prompt = PromptTemplate(
    template=cv_evaluation_template,
    input_variables=["context", "job_role", "requirements"],
)


In [58]:
from langchain.chains import LLMChain
from langchain_google_genai import ChatGoogleGenerativeAI

# Gemini chat model used to evaluate the CV content
llm = ChatGoogleGenerativeAI(
    temperature=0.3,
    model="gemini-2.0-flash",
)

# Chain that combines custom_prompt with the model above
llm_chain = LLMChain(prompt=custom_prompt, llm=llm)


In [59]:
# Query the vector store for the job role, requesting k=5 results
retrieved_docs = vectorstore.similarity_search("Data Analyst", k=5)

# Merge the text of the retrieved chunks into a single context string,
# separated by blank lines.
# NOTE(review): the search runs over chunks from every loaded CV, so the
# context may mix content from different candidates — confirm this is intended.
chunk_texts = (hit.page_content for hit in retrieved_docs)
context = "\n\n".join(chunk_texts)


In [60]:
# rich's shared console, used below to print the answer without markup parsing.
from rich import get_console

# Fill the prompt placeholders and run the chain; the result is the model's answer text.
response = llm_chain.run({
    "context": context,
    "job_role": "Data Analyst",
    "requirements": "Proficiency in SQL, data visualization tools like Tableau, and experience with Python."
})

# `print` in this notebook is rich.print (see the imports cell), which interprets
# square-bracket console markup. Model output is arbitrary text, so bracketed
# words could be swallowed as style tags or trigger a markup error. Printing
# through the same console with markup disabled shows the answer verbatim.
get_console().print(response, markup=False)

