In [1]:
import os
import json
import pickle
import faiss
import numpy as np
import pytesseract
from pdf2image import convert_from_path
from sentence_transformers import SentenceTransformer
import requests
import re

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Path to Tesseract executable
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Directory and file paths
RESUME_DIR = r'C:\Users\AbhinavKasubojula\OneDrive - Kenall Inc\Desktop\code\docs'
STORED_DATA_DIR = r'C:\Users\AbhinavKasubojula\OneDrive - Kenall Inc\Desktop\code\stored_data'
os.makedirs(STORED_DATA_DIR, exist_ok=True)

INDEX_PATH = os.path.join(STORED_DATA_DIR, 'faiss_index.bin')
METADATA_PATH = os.path.join(STORED_DATA_DIR, 'metadata.pkl')

# Initialize FAISS and embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')
DIMENSION = model.get_sentence_embedding_dimension()
index = faiss.IndexFlatL2(DIMENSION) if not os.path.exists(INDEX_PATH) else faiss.read_index(INDEX_PATH)



In [3]:
# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    try:
        images = convert_from_path(pdf_path)
        text = ""
        for img in images:
            text += pytesseract.image_to_string(img)
        return text
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
        return ""

In [4]:



# Function to print metadata and vectors
def print_metadata_and_vectors():
    # Load and print metadata
    if os.path.exists(METADATA_PATH):
        with open(METADATA_PATH, 'rb') as f:
            metadata = pickle.load(f)
        print("Metadata:")
        for idx, entry in enumerate(metadata, start=1):
            print(f"{idx}. {entry}")
    else:
        print("No metadata found.")

    # Print vectors stored in the FAISS index
    if index.ntotal > 0:
        print("\nVectors:")
        for i in range(index.ntotal):
            try:
                vector = index.reconstruct(i)  # Retrieve vector by index
                print(f"Vector {i}: {vector}")
            except Exception as e:
                print(f"Error reconstructing vector {i}: {e}")
    else:
        print("No vectors found in the FAISS index.")

# Call the function to display metadata and vectors
print_metadata_and_vectors()


Metadata:
1. {'doc_name': 'Name:** Kris D. Prasad, PE', 'summary': "Here is a summary of Kris D. Prasad's resume:\n\n**Name:** Kris D. Prasad, PE\n**Role:** Program Manager/Senior Structural Engineer\n**Years Experience:** 29 years\n\n**Licenses:**\n\n* Professional Engineer (PE) - TX #91952, LA #34186, NM #20737, PR# 28316\n* MS, Civil Engineering, 1992\n\n**Qualifications:**\n\n* Over 29 years of professional engineering and management experience with federal, state, municipal, and local programs\n* Currently serving as Program Manager for USACE Fort Worth and Tulsa Districts on various civil works and military projects\n* Experience includes:\n\t+ 1000 projects with a value of over $100M\n\t+ Excellent ratings for all criteria on two NAVFAC Southeast projects in Meridian Base\n\n**Experience:**\n\n* Program Manager/Senior Structural Engineer, USACE Fort Worth District (various years)\n* Task Order Manager, NAVFAC Southeast (Meridian Base)\n\n**Projects:**\n\n* Renovation of Building

In [5]:
# Function to summarize text using Ollama API
def summarize_text_with_llm(text):
    url = "http://localhost:11434/api/generate"
    headers = {"Content-Type": "application/json"}
    prompt = f"""
    Summarize the following resume:
    - Name
    - Role
    - Licenses
    - Qualifications
    - Experience
    - Projects
    - Roles and Responsibilities

    Resume Text:
    {text}
    """
    data = {"model": "llama3.2-vision:latest", "prompt": prompt, "stream": False}
    response = requests.post(url, headers=headers, json=data)

    if response.status_code == 200:
        try:
            response_json = json.loads(response.text)
            return response
        except json.JSONDecodeError:
            print(f"Failed to decode JSON: {response.text}")
            return {}
    else:
        print(f"Error in LLM API: {response.status_code} - {response.text}")
        return {}

In [6]:
# Function to load metadata
def load_metadata():
    if os.path.exists(METADATA_PATH):
        try:
            with open(METADATA_PATH, 'rb') as f:
                metadata = pickle.load(f)
            print("Metadata loaded successfully.")
            return metadata
        except (EOFError, pickle.UnpicklingError) as e:
            print(f"Error loading metadata: {e}. Initializing empty metadata.")
            return []
    else:
        print("No metadata found. Initializing empty metadata.")
        return []

# Function to load FAISS index
def load_faiss_index(index_path):
    if os.path.exists(index_path):
        try:
            index = faiss.read_index(index_path)
            print(f"FAISS index loaded from {index_path}.")
            return index
        except Exception as e:
            print(f"Error loading FAISS index: {e}")
            return None
    else:
        print(f"Index file {index_path} not found.")
        return None

# Function to save FAISS index
def save_faiss_index(index, index_path):
    try:
        faiss.write_index(index, index_path)
        print(f"FAISS index saved to {index_path}.")
    except Exception as e:
        print(f"Error saving FAISS index: {e}")

In [7]:
def process_resumes():
    metadata_list = load_metadata()  # Load existing metadata
    processed_names = {meta.get('name') for meta in metadata_list if 'name' in meta}  # Track processed names

    # No need for rebuilding here, just add new data
    for file_name in os.listdir(RESUME_DIR):
        file_path = os.path.join(RESUME_DIR, file_name)

        if not file_name.endswith('.pdf'):
            continue

        print(f"Processing: {file_name}")

        # Extract text from the PDF
        text = extract_text_from_pdf(file_path)
        if not text:
            print(f"No text extracted from {file_name}. Skipping.")
            continue

        # Summarize text using LLM
        summary = summarize_text_with_llm(text)

        # Ensure summary is a string first, and then convert to dictionary
        if hasattr(summary, "text"):
            summary = summary.text  # If summary is a response object with 'text'

        # Try to convert summary into a dictionary (assuming JSON format)
        try:
            summary_dict = json.loads(summary)  # Convert string to dictionary
        except json.JSONDecodeError:
            summary_dict = {}  # In case conversion fails, fallback to empty dictionary
            print(f"Failed to parse JSON for {file_name}. Using empty metadata.")

        print(f"Summary dict for {file_name}: {summary_dict}")  # Now you have a structured dictionary

        # Extract name and other details from the summary (dictionary format)
        name = summary_dict.get('name', 'Unknown Name')

        # Check if this resume has been processed
        if name in processed_names:
            # Update metadata if name already exists
            for meta in metadata_list:
                if meta.get('name') == name:  # Use .get() to avoid KeyError
                    meta['projects'] += summary_dict.get('projects', [])
                    meta['experience'] += summary_dict.get('experience', [])
                    break
        else:
            # Append new summary if it's a new person
            metadata_list.append(summary_dict)
            processed_names.add(name)

        # Generate embedding text from structured dictionary
        embedding_text = f"{summary_dict.get('name', '')} {summary_dict.get('role', '')} " \
                         f"{' '.join(summary_dict.get('experience', []))} " \
                         f"{' '.join(summary_dict.get('projects', []))}"

        # Generate embeddings (Assuming 'model' is your embedding model)
        embedding = model.encode([embedding_text], convert_to_tensor=False)
        embedding = np.array(embedding, dtype='float32')

        # Handle dimensionality if needed
        if embedding.ndim == 1:
            embedding = np.expand_dims(embedding, axis=0)

        # Add embedding to FAISS index
        index.add(embedding)
        print(f"Added embedding for {name}.")

    # Save the FAISS index and metadata
    faiss.write_index(index, INDEX_PATH)
    save_metadata(metadata_list)
    print("Processing completed. Index and metadata saved.")


In [8]:
def query_resumes(query_text, top_k=5):
    # Convert the query text into an embedding
    query_embedding = model.encode([query_text], convert_to_tensor=False)
    query_embedding = np.array(query_embedding, dtype='float32')

    # Search for the most similar embeddings
    distances, indices = index.search(query_embedding, top_k)
    
    # Load metadata to get details of the results
    metadata_list = load_metadata()

    # Debug: Check if metadata and FAISS index lengths match
    print(f"FAISS index size: {index.ntotal}, Metadata list size: {len(metadata_list)}")

    # Retrieve metadata for the top results
    results = []
    for idx in indices[0]:
        # Ensure the index is within the bounds of metadata_list
        if 0 <= idx < len(metadata_list):  # Ensure the index is valid
            results.append(metadata_list[idx])
        else:
            print(f"Warning: Index {idx} is out of range for metadata list.")

    return results, distances[0]  # Return results and their distances


In [9]:
# Run the process and test query
if __name__ == "__main__":
    # Process and index all resumes
    process_resumes()

    # Query the FAISS index
    query = "Software engineer with experience in Python and machine learning"
    results, distances = query_resumes(query)

    print("Top matches:")
    for i, (result, distance) in enumerate(zip(results, distances)):
        print(f"Result {i+1}:")
        print(f"Name: {result.get('name', 'Unknown')}")
        print(f"Role: {result.get('role', 'Unknown')}")
        print(f"Experience: {result.get('experience', 'N/A')}")
        print(f"Projects: {result.get('projects', 'N/A')}")
        print(f"Distance: {distance}")
        print("-" * 40)

Metadata loaded successfully.
Processing: page_1.pdf
Summary dict for page_1.pdf: {'model': 'llama3.2-vision:latest', 'created_at': '2024-12-05T16:46:15.2518875Z', 'response': "Here is a summary of Kris D. Prasad's resume:\n\n**Name:** Kris D. Prasad, PE\n**Role:** Program Manager/Senior Structural Engineer\n**Years of Experience:** 29 years\n\n**Licenses:**\n\n* PE (Civil): TX #91952 (2003), LA #34186, NM #20737, PR# 28316\n\n**Qualifications:**\n\n* Over 29 years of professional engineering and management experience with various federal, state, municipal, and local programs\n* Currently serving as Program Manager for USACE Fort Worth and Tulsa Districts on various civil works and military projects\n* Previously served as Task Order Manager for two NAVFAC Southeast projects in Meridian Base, which received excellent ratings\n\n**Experience:**\n\n* Program Manager/Senior Structural Engineer with SAT-Kennall JV (Houston, TX)\n\t+ Managed contracts worth over $100M\n\t+ Coordinated with 

NameError: name 'save_metadata' is not defined