In [2]:
import os
import pytesseract
from pdf2image import convert_from_path
import requests
import json
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import pickle


# Sentence-embedding model shared by every cell that calls generate_embeddings().
# It is loaded once here, at module level, when this cell runs.
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

# Set the directory path containing the PDFs
# NOTE(review): absolute, user-specific path — the notebook cannot run on
# another machine without editing this line.
pdf_directory = "C:/Users/AbhinavKasubojula/OneDrive - Kenall Inc/Desktop/code/docs/"

# Ollama API endpoint and headers
# Both are read by summarize_text_with_ollama() when it posts a prompt.
url = "http://localhost:11434/api/generate"
headers = {"Content-Type": "application/json"}


def loading_faiss(summary_embeddings, file_names, sum,
                  storage_directory=r'C:\Users\AbhinavKasubojula\OneDrive - Kenall Inc\Desktop\code\stored_data'):
    """Build a flat L2 FAISS index from the summary embeddings and persist it.

    Two files are written into ``storage_directory``:

    * ``faiss_index.bin`` - the FAISS index holding one vector per document.
    * ``metadata.pkl``    - a pickled list of ``{"doc_name", "summary"}`` dicts
      whose position ``i`` describes vector ``i`` of the index.

    Parameters
    ----------
    summary_embeddings : array-like, shape (n_documents, dim)
        One embedding per document. The index dimension is taken from this
        array, so any embedding model can be used.
    file_names : sequence
        Document names, in the same order as ``summary_embeddings``.
    sum : sequence
        Document summaries, in the same order as ``summary_embeddings``.
        (The name shadows the built-in ``sum`` inside this function; it is
        kept so existing callers keep working.)
    storage_directory : str, optional
        Destination directory; created if missing. Defaults to the location
        this notebook has always used.

    Raises
    ------
    ValueError
        If the embeddings are not a non-empty 2-D array, or if the number of
        embeddings, names and summaries differ (the metadata would otherwise
        be misaligned with the vectors).
    """
    # FAISS requires a C-contiguous float32 matrix.
    vectors = np.ascontiguousarray(summary_embeddings, dtype='float32')
    if vectors.ndim != 2 or vectors.shape[0] == 0:
        raise ValueError(
            f"summary_embeddings must be a non-empty 2-D array, got shape {vectors.shape}"
        )
    if not (len(file_names) == len(sum) == vectors.shape[0]):
        raise ValueError(
            "summary_embeddings, file_names and sum must have the same length "
            f"(got {vectors.shape[0]}, {len(file_names)} and {len(sum)})"
        )

    # Derive the dimension from the data instead of hard-coding it.
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(storage_directory, exist_ok=True)

    file_metadata = [
        {"doc_name": doc_name, "summary": summary}
        for doc_name, summary in zip(file_names, sum)
    ]

    # Save FAISS index
    faiss.write_index(index, os.path.join(storage_directory, 'faiss_index.bin'))

    # Save the metadata next to the index; list position == vector id.
    with open(os.path.join(storage_directory, 'metadata.pkl'), 'wb') as f:
        pickle.dump(file_metadata, f)

    print("Metadata saved:")


# OCR helper: rasterise a PDF and read the text off every page with Tesseract
def extract_text_from_pdf(pdf_path):
    """Return the OCR text of all pages of ``pdf_path``, concatenated in page order.

    A progress line is printed after each page. If anything raises while the
    document is being converted or recognised, the error is printed and
    ``None`` is returned instead of a string.
    """
    try:
        # pdf2image yields one image per page
        page_images = convert_from_path(pdf_path)

        page_texts = []
        for page_number, page_image in enumerate(page_images, start=1):
            page_texts.append(pytesseract.image_to_string(page_image))
            print(f"Extracted text from page {page_number} of {os.path.basename(pdf_path)}...")

        # Pages are glued together with no separator between them
        return "".join(page_texts)
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
        return None

# Ask the Ollama server for a structured summary of one resume
def summarize_text_with_ollama(text):
    """Send ``text`` to the Ollama generate endpoint and return the model's summary.

    The request goes to the module-level ``url`` with the module-level
    ``headers`` and asks for a non-streamed completion. The ``"response"``
    field of the JSON reply is returned; when that field is absent the
    placeholder string ``"No response found"`` is returned instead.

    NOTE(review): the request is sent without a timeout, so this call blocks
    for as long as the server takes to answer.
    """
    prompt = f"""
    Summarize the following resume:
    - Name
    - Role
    - Licenses
    - Qualifications
    - Experience
    - Projects
    - Roles and Responsibilities

    Resume Text:
    {text}
    """
    payload = {
        "model": "llama3.2-vision:latest",
        "prompt": prompt,
        "stream": False,
    }

    reply = requests.post(url, headers=headers, data=json.dumps(payload))
    return reply.json().get("response", "No response found")

# Encode text with the shared sentence-embedding model
def generate_embeddings(texts):
    """Encode ``texts`` with the module-level ``model`` and return the result.

    ``convert_to_tensor=False`` is passed through to ``model.encode``; the
    return value is whatever that call produces for the given input.
    """
    embeddings = model.encode(texts, convert_to_tensor=False)
    return embeddings




# Function to extract key points (simple placeholder function for demo)
def extract_key_point(summary, key):
    """Return the rest of the line of ``summary`` starting at the first ``key``.

    The returned text begins with ``key`` itself and runs up to (not
    including) the next newline, with surrounding whitespace stripped. For a
    summary line such as ``**Name:** Jane Doe`` and key ``"Name"`` the result
    is ``"Name:** Jane Doe"``.

    Parameters
    ----------
    summary : str
        Text to search.
    key : str
        Substring marking the start of the wanted line fragment.

    Returns
    -------
    str
        The matched fragment, or ``""`` when ``key`` does not occur.
    """
    start = summary.find(key)
    if start == -1:
        return ""

    end = summary.find("\n", start)
    if end == -1:
        # The key is on the last line and there is no trailing newline.
        # Slicing with -1 would silently drop the final character.
        end = len(summary)

    return summary[start:end].strip()

# Process each PDF file: OCR it, summarize it, and collect the summary and name
def process(sum, file_names):
    """OCR and summarize every ``.pdf`` in ``pdf_directory``.

    For each PDF whose text could be extracted, the LLM summary is appended
    to ``sum`` and the document's name (the "Name" line of the summary, or the
    PDF file name when the summary has no such line) is appended to
    ``file_names``. The two lists therefore always stay the same length and
    aligned by position.

    PDFs for which ``extract_text_from_pdf`` returns ``None`` or an empty
    string are reported and skipped.

    Parameters
    ----------
    sum : list
        Accumulator for summaries; mutated in place. (The name shadows the
        built-in ``sum`` inside this function; kept for existing callers.)
    file_names : list
        Accumulator for document names; mutated in place.

    Returns
    -------
    tuple
        ``(sum, file_names)`` - the same two list objects that were passed in.
    """
    # Sorted so the vector ids of the resulting index are reproducible;
    # os.listdir makes no ordering promise.
    pdf_files = sorted(f for f in os.listdir(pdf_directory) if f.endswith(".pdf"))

    for pdf_file in pdf_files:
        pdf_path = os.path.join(pdf_directory, pdf_file)
        print(f"Processing: {pdf_file}")

        # Extract text from the PDF using OCR
        text = extract_text_from_pdf(pdf_path)
        if not text:
            # Appending here would either fail (nothing summarized yet) or
            # duplicate the previous document's summary under a stale name.
            print(f"Skipping {pdf_file}: no text could be extracted")
            continue

        # Summarize the extracted text
        summary = summarize_text_with_ollama(text)
        # Fall back to the file name so no document is stored under "".
        filename = extract_key_point(summary, "Name") or pdf_file
        print(f"Summary of {filename}: {summary[:100]}...")  # Limit printed summary length

        sum.append(summary)
        file_names.append(filename)

    return (sum, file_names)


# Accumulators filled by process(). The list is called `summaries` rather than
# `sum` so the built-in sum() is not shadowed for the rest of the session.
summaries = []
file_names = []

# Run the process
summaries, file_names = process(summaries, file_names)

if summaries:
    # Embed every summary and persist the index together with its metadata.
    summary_embeddings = generate_embeddings(summaries)
    loading_faiss(summary_embeddings, file_names, summaries)
else:
    # Nothing was summarized (no PDFs found, or OCR failed for all of them);
    # there is nothing to embed or index.
    print("No summaries were produced; FAISS index not written.")


Summary of Name:** Brett Witte, PE: Here is a summary of the resume:

**Name:** Brett Witte, PE
**Role:** Project Manager / Civil Engine...
Processing: page_4.pdf
Extracted text from page 1 of page_4.pdf...
Summary of Name:** Robert Morris: Here is a summary of the resume:

**Name:** Robert Morris
**Role:** DQC Manager

**Licenses and Qual...
Processing: page_5.pdf
Extracted text from page 1 of page_5.pdf...
Summary of Name:** Robert Gaylord, PE: Here is a summary of the resume:

**Name:** Robert Gaylord, PE
**Role:** Sr. Civil Engineer
**Licens...
Metadata saved:


In [3]:
import faiss
import pickle
import os

# Location of the persisted FAISS index and its pickled metadata
storage_directory = r'C:\Users\AbhinavKasubojula\OneDrive - Kenall Inc\Desktop\code\stored_data'
faiss_index_path = os.path.join(storage_directory, 'faiss_index.bin')
metadata_path = os.path.join(storage_directory, 'metadata.pkl')

# Restore the index from disk
index = faiss.read_index(faiss_index_path)

# Restore the metadata list.
# NOTE(review): pickle.load can execute arbitrary code from the file, so only
# ever point this at a metadata.pkl that this notebook wrote itself.
with open(metadata_path, 'rb') as f:
    metadata = pickle.load(f)

# Report how many vectors the index holds
total_vectors = index.ntotal
print(f"Total Vectors in FAISS: {total_vectors}")

# Preview the first five components of every stored vector
print("\nStored Vectors:")
for i, vector in enumerate(map(index.reconstruct, range(total_vectors))):
    print(f"Vector {i}: {vector[:5]}...")

# Dump each metadata record with its position in the list
print("\nMetadata:")
for i in range(len(metadata)):
    meta = metadata[i]
    print(f"Metadata {i}: {meta}")


Total Vectors in FAISS: 5

Stored Vectors:
Vector 0: [-0.01065707 -0.03517349  0.03286987 -0.01445845  0.00735596]...
Vector 1: [-0.00201408 -0.03097828 -0.0607443  -0.03417638  0.05095847]...
Vector 2: [-0.01048094 -0.01979299  0.00982213  0.04763299  0.07968677]...
Vector 3: [-0.01953927 -0.00663608 -0.03171891  0.01063996  0.04692906]...
Vector 4: [-0.05252428  0.02563553  0.03139726 -0.01500012  0.03395924]...

Metadata:
Metadata 0: {'doc_name': 'Name:** Kris D. Prasad, PE', 'summary': "Here is a summary of Kris D. Prasad's resume:\n\n**Name:** Kris D. Prasad, PE\n**Role:** Program Manager/Senior Structural Engineer\n**Years Experience:** 29 years\n\n**Licenses:**\n\n* Professional Engineer (PE) - TX #91952, LA #34186, NM #20737, PR# 28316\n* MS, Civil Engineering, 1992\n\n**Qualifications:**\n\n* Over 29 years of professional engineering and management experience with federal, state, municipal, and local programs\n* Currently serving as Program Manager for USACE Fort Worth and Tul

In [4]:
# Show every metadata record next to the vector stored at the same position
for i, meta in enumerate(metadata):
    vector = index.reconstruct(i)  # vector id i pairs with metadata[i]

    # One call, three lines: vector preview, metadata record, separator rule
    print(
        f"Vector {i}: {vector[:5]}...",
        f"Metadata {i}: {meta}",
        "-" * 50,
        sep="\n",
    )


Vector 0: [-0.01065707 -0.03517349  0.03286987 -0.01445845  0.00735596]...
Metadata 0: {'doc_name': 'Name:** Kris D. Prasad, PE', 'summary': "Here is a summary of Kris D. Prasad's resume:\n\n**Name:** Kris D. Prasad, PE\n**Role:** Program Manager/Senior Structural Engineer\n**Years Experience:** 29 years\n\n**Licenses:**\n\n* Professional Engineer (PE) - TX #91952, LA #34186, NM #20737, PR# 28316\n* MS, Civil Engineering, 1992\n\n**Qualifications:**\n\n* Over 29 years of professional engineering and management experience with federal, state, municipal, and local programs\n* Currently serving as Program Manager for USACE Fort Worth and Tulsa Districts on various civil works and military projects\n* Experience includes:\n\t+ 1000 projects with a value of over $100M\n\t+ Excellent ratings for all criteria on two NAVFAC Southeast projects in Meridian Base\n\n**Experience:**\n\n* Program Manager/Senior Structural Engineer, USACE Fort Worth District (various years)\n* Task Order Manager, NAV

In [5]:
import numpy as np

# Embed a natural-language question and look up its nearest summaries
query_text = "Who has more than 30 years of experience?"
query_vector = generate_embeddings(query_text)

# index.search expects a 2-D float32 array with one row per query, so the
# embedding of the single query string is reshaped into exactly one row.
query_vector = np.array(query_vector).astype('float32').reshape(1, -1)

# Perform a search in FAISS
top_k = 4  # Number of nearest neighbors to retrieve
distances, indices = index.search(query_vector, top_k)

# Display Results
print("\nQuery Results:")
for i in range(top_k):
    idx = int(indices[0][i])  # Position of the neighbour in the index / metadata list
    if idx == -1:
        # FAISS pads the result with -1 when the index holds fewer than top_k
        # vectors. metadata[-1] would silently show the LAST document and
        # reconstruct(-1) would fail, so padded slots are skipped.
        continue
    distance = distances[0][i]  # Distance to the query vector
    print(f"\nResult {i + 1}:")
    print(f"  Distance: {distance}")
    print(f"  Metadata: {metadata[idx]}")
    vector = index.reconstruct(idx)  # Retrieve the corresponding vector (optional)
    print(f"  Vector: {vector[:5]}...")  # Print first 5 values of the vector for brevity



Query Results:

Result 1:
  Distance: 1.3242015838623047
  Metadata: {'doc_name': 'Name:** Robert Morris', 'summary': 'Here is a summary of the resume:\n\n**Name:** Robert Morris\n**Role:** DQC Manager\n\n**Licenses and Qualifications:**\n\n* Registered Architect: TX #12112 (1987), NM\n* Registered Interior Designer: TX #1985 (1993)\n* AIA, TSA, AIA Lubbock, SAME member\n* Secretary, AIA Lubbock\n\n**Qualifications:**\n\n* 40 years of experience in architecture and interior design\n* Experienced with various building projects and systems\n* Skilled in master planning, programming, and design\n* Performed over 100 Facility Condition Assessments (FCAs) in the past 15 years\n\n**Experience:**\n\n* DQC Manager for various projects, including:\n\t+ USACE Fort Worth District - Renovation of Buildings 16, 44, 615 (2002-2022)\n\t+ USACE Fort Worth District - Design of Tactical Equipment Maintenance Facility (TEMF) Renovations at Fort Hood, TX (2019-2022)\n\t+ Repair Building B499 Randolph Air

In [10]:
def _years_of_experience(meta):
    """Best-effort number of years of experience for one metadata record.

    An explicit ``years_experience`` entry wins when present. Otherwise the
    value is guessed from the free-text ``summary`` by looking for phrases
    such as "Years Experience: 29" or "40 years of experience" and taking the
    largest number found. Returns 0 when nothing can be determined.

    NOTE(review): the text parsing is a heuristic over LLM-written summaries;
    storing a structured ``years_experience`` field at indexing time would be
    more reliable.
    """
    import re

    explicit = meta.get('years_experience')
    if explicit is not None:
        return explicit

    summary = meta.get('summary') or ''
    patterns = (
        # "Years Experience:** 29 years", "years of experience: 12"
        r'years[ \t]+(?:of[ \t]+)?experience\W{0,6}(\d{1,2})\b',
        # "40 years of experience", "29 years of professional ... experience"
        r'\b(\d{1,2})\+?[ \t]*years?[ \t]+(?:of[ \t]+)?(?:[\w/&,-]+[ \t]+){0,6}?experience',
    )
    found = [
        int(number)
        for pattern in patterns
        for number in re.findall(pattern, summary, flags=re.IGNORECASE)
    ]
    return max(found, default=0)


def filter_condition(meta):
    """Keep records with between 10 and 20 years of experience (inclusive).

    The stored metadata records only carry ``doc_name`` and ``summary``, so
    reading ``years_experience`` alone would reject every record; the years
    are therefore derived from the summary text when the key is absent.

    NOTE(review): the 10-20 range does not match the query text used in this
    cell ("more than 30 years") - confirm which one is intended.
    """
    return 10 <= _years_of_experience(meta) <= 20
def search_with_filter(query_vector, top_k, filter_condition):
    """Search the module-level ``index`` and keep only hits accepted by a predicate.

    The ``top_k`` nearest neighbours of ``query_vector`` are fetched first;
    each hit's record from the module-level ``metadata`` list is printed and
    then passed to ``filter_condition``. Slots whose id is ``-1`` are skipped.
    Because filtering happens after the search, fewer than ``top_k`` results
    may come back.

    Returns a list of ``{"index", "distance", "metadata"}`` dicts in the order
    the index returned them.
    """
    distances, indices = index.search(query_vector, top_k)

    results = []
    # Only one query row is searched, so row 0 holds all of its hits
    for idx, distance in zip(indices[0], distances[0]):
        if idx == -1:
            continue

        metadata_item = metadata[idx]
        print(metadata_item)

        if not filter_condition(metadata_item):
            continue

        hit = {
            "index": idx,
            "distance": distance,
            "metadata": metadata_item,
        }
        results.append(hit)

    return results

# Embed the question as a single float32 row and run the filtered search
query = "Who has more than 30 years of experience?"
query_vector = np.array(generate_embeddings(query)).astype('float32').reshape(1, -1)

results = search_with_filter(
    query_vector,
    top_k=10,
    filter_condition=filter_condition,
)
print(results)


{'doc_name': 'Name:** Robert Morris', 'summary': 'Here is a summary of the resume:\n\n**Name:** Robert Morris\n**Role:** DQC Manager\n\n**Licenses and Qualifications:**\n\n* Registered Architect: TX #12112 (1987), NM\n* Registered Interior Designer: TX #1985 (1993)\n* AIA, TSA, AIA Lubbock, SAME member\n* Secretary, AIA Lubbock\n\n**Qualifications:**\n\n* 40 years of experience in architecture and interior design\n* Experienced with various building projects and systems\n* Skilled in master planning, programming, and design\n* Performed over 100 Facility Condition Assessments (FCAs) in the past 15 years\n\n**Experience:**\n\n* DQC Manager for various projects, including:\n\t+ USACE Fort Worth District - Renovation of Buildings 16, 44, 615 (2002-2022)\n\t+ USACE Fort Worth District - Design of Tactical Equipment Maintenance Facility (TEMF) Renovations at Fort Hood, TX (2019-2022)\n\t+ Repair Building B499 Randolph Air Force Base, TX (2019-2022)\n\t+ Canadian River Wagon Bridge Deck Repl

In [None]:
# Scratch check: embed a few arbitrary strings and print the raw vectors
a = [
    'eghwrgwrgww',
    'fdjcfj',
    'yfcf',
]
sample_embeddings = generate_embeddings(a)
print(sample_embeddings)