In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
import numpy as np
import os
import uuid
import pytesseract
from pdf2image import convert_from_path
from PIL import Image

# Load the sentence-embedding model once at module level; process_new_file()
# reads this name when it encodes the text of a newly detected PDF.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

  from tqdm.autonotebook import tqdm, trange


In [2]:

def read_pdf_file(file_path, dpi=300, lang='eng'):
    """Extract the text of a PDF by rasterising every page and running OCR on it.

    Parameters
    ----------
    file_path : str
        Path of the PDF to read.
    dpi : int, optional
        Resolution passed to ``convert_from_path``. Defaults to 300, the value
        that used to be hard-coded.
    lang : str, optional
        Language code passed to ``pytesseract.image_to_string``. Defaults to
        ``'eng'``, the value that used to be hard-coded.

    Returns
    -------
    str
        The OCR text of all pages joined with newlines, or ``""`` if any step
        raised (the error is printed, not re-raised).
    """
    try:
        pages = convert_from_path(file_path, dpi=dpi)
        # Combine text from all pages, one OCR call per page image
        return '\n'.join(
            pytesseract.image_to_string(page, lang=lang) for page in pages
        )
    except Exception as e:
        # Deliberate best-effort: one unreadable PDF must not abort a directory scan
        print(f"Error reading PDF file {file_path}: {e}")
        return ""


In [3]:


def read_doc(directory):
    """Read every PDF in ``directory`` and return the extracted texts.

    Parameters
    ----------
    directory : str
        Folder to scan (not recursive). Only names ending in ``.pdf``
        (case-insensitive) are read.

    Returns
    -------
    list of str
        Non-empty texts returned by ``read_pdf_file``, in sorted filename
        order. PDFs that yield no text are skipped. If the directory cannot
        be listed the error is printed and the texts collected so far
        (possibly none) are returned.
    """
    docs = []
    try:
        # os.listdir() returns entries in arbitrary order; sort so that the
        # position of each document (used later for "Doc N" labels and as the
        # FAISS row id) is the same on every run.
        for filename in sorted(os.listdir(directory)):
            if not filename.lower().endswith('.pdf'):
                continue
            text = read_pdf_file(os.path.join(directory, filename))
            if text:
                docs.append(text)
    except Exception as e:
        print(f"Error processing directory {directory}: {e}")
    return docs


In [4]:
def compute_embeddings(documents, model=None):
    """Encode ``documents`` into a float32 NumPy array of embeddings.

    Parameters
    ----------
    documents : list of str
        Texts to embed.
    model : object with an ``encode(documents, convert_to_tensor=...)`` method, optional
        Encoder to use. When omitted, the module-level ``embedding_model``
        loaded in the setup cell is reused instead of instantiating a new
        ``SentenceTransformer`` on every call.

    Returns
    -------
    numpy.ndarray
        The encoder output converted to ``float32``, the dtype FAISS expects.
    """
    if model is None:
        # Reuse the already-loaded model; reloading it per call is wasted work
        model = embedding_model
    embeddings = model.encode(documents, convert_to_tensor=False)
    return np.asarray(embeddings, dtype='float32')



In [105]:
directory_path = 'docs/'
documents = read_doc(directory_path)

# Fail early with a clear message: with no documents there is nothing to embed,
# and `v[0]` below would otherwise fail with an opaque indexing error.
if not documents:
    raise ValueError(f"No readable PDF documents found in {directory_path!r}")

# Compute embeddings
v = compute_embeddings(documents)

# Embedding dimensionality, taken from the first vector
d = len(v[0])



In [35]:
# Display the embedding dimensionality `d` (length of the first vector in `v`).
d

384

In [116]:
import faiss
import numpy as np
import faiss
import pickle

# Document embeddings as a float32 matrix (the dtype FAISS requires)
embeddings = np.array(v, dtype='float32')

# FAISS index initialization: take the dimension from the data instead of
# hard-coding 384, so switching the embedding model does not break this cell
index = faiss.IndexFlatL2(embeddings.shape[1])

# Add the document embeddings to the index; row i corresponds to documents[i]
index.add(embeddings)
assert index.ntotal == len(documents), "FAISS index and documents are out of sync"

storage_directory = r'C:\Users\AbhinavKasubojula\OneDrive - Kenall Inc\Desktop\code\stored_data'

# Ensure the directory exists (exist_ok avoids a check-then-create race)
os.makedirs(storage_directory, exist_ok=True)

# One human-readable label per document: its 1-based number and first ten words
file_metadata = [
    f"Doc {i + 1}: {' '.join(doc.split()[:10])}"
    for i, doc in enumerate(documents)
]

# Save FAISS index
faiss.write_index(index, os.path.join(storage_directory, 'faiss_index.bin'))

# Save document texts
with open(os.path.join(storage_directory, 'documents.pkl'), 'wb') as f:
    pickle.dump(documents, f)

# Optionally save metadata
with open(os.path.join(storage_directory, 'metadata.pkl'), 'wb') as f:
    pickle.dump(file_metadata, f)

print(embeddings.shape)

print("Metadata saved:", file_metadata)

(7, 384)
Metadata saved: ['Doc 1: Synopsis Architect-Engineer (A-E) Services Indefinite Delivery/Indefinite Quantity (IDIQ) Contract For', 'Doc 2: Green underlined is add-in, Red strike through is deleted This', 'Doc 3: = An official website of the United States government Here’s', 'Doc 4: An official website of the United States government Here’s how', 'Doc 5: INDEFINITE DELIVERY CONTRACT (IDC) A-E SERVICES FOR USE WITHIN SOUTHWESTERN', 'Doc 6: C -- ARCHITECT AND ENGINEERING SERVICES W912DQ24R4002 SYNOPSIS Architect-Engineer (A-E)', 'Doc 7: Indefinite-Delivery (MATOC) for Multi-Discipline Miscellaneous Works Design and Other Architect']


In [89]:
# Inspect the per-document labels ("Doc N: <first ten words>") held in file_metadata.
print(file_metadata)

['Doc 1: Synopsis Architect-Engineer (A-E) Services Indefinite Delivery/Indefinite Quantity (IDIQ) Contract For', 'Doc 2: Green underlined is add-in, Red strike through is deleted This', 'Doc 3: = An official website of the United States government Here’s', 'Doc 4: An official website of the United States government Here’s how', 'Doc 5: INDEFINITE DELIVERY CONTRACT (IDC) A-E SERVICES FOR USE WITHIN SOUTHWESTERN', 'Doc 6: C -- ARCHITECT AND ENGINEERING SERVICES W912DQ24R4002 SYNOPSIS Architect-Engineer (A-E)', 'Doc 7: Indefinite-Delivery (MATOC) for Multi-Discipline Miscellaneous Works Design and Other Architect']


In [117]:
import faiss
import os
import pickle
import numpy as np
from sentence_transformers import SentenceTransformer

# Define storage directory: the same hard-coded absolute path the index-building
# cell writes to; process_new_file() reads and rewrites faiss_index.bin,
# documents.pkl and metadata.pkl inside it.
storage_directory = r'C:\Users\AbhinavKasubojula\OneDrive - Kenall Inc\Desktop\code\stored_data'

# Function to process new files and update FAISS, documents, and metadata
def _load_pickled_list(path):
    """Return the list pickled at ``path``, or an empty list if the file is missing."""
    # NOTE: pickle.load executes arbitrary code from the file; only use it on
    # files written by this notebook, never on untrusted input.
    try:
        with open(path, 'rb') as f:
            return pickle.load(f)
    except FileNotFoundError:
        return []


def process_new_file(file_path):
    """OCR a newly detected PDF and append it to the persisted FAISS store.

    Loads the FAISS index, the document list and the metadata list from
    ``storage_directory`` (starting empty for any that do not exist yet),
    appends the new document's embedding, text and label, and writes all
    three back to disk.

    Parameters
    ----------
    file_path : str
        Path of the PDF to process.

    Returns
    -------
    None
        Results are persisted to disk and progress is printed. Module-level
        variables such as ``documents`` are not modified; the lists updated
        here are local copies loaded from the pickle files.

    Notes
    -----
    Relies on the module-level names ``ocr_processor``, ``embedding_model``
    and ``storage_directory``.
    """
    text = ocr_processor.process_pdf(file_path)
    if not text:
        # Previously a silent no-op; report it so skipped files show up in the log
        print(f"No text extracted from '{file_path}'; nothing added to FAISS.")
        return

    index_path = os.path.join(storage_directory, 'faiss_index.bin')
    documents_path = os.path.join(storage_directory, 'documents.pkl')
    metadata_path = os.path.join(storage_directory, 'metadata.pkl')

    # Convert new text to vector embedding, shaped (1, dim) float32 for FAISS
    vector = embedding_model.encode([text])[0]
    new_row = np.array([vector]).astype('float32')

    # Load FAISS index, or create one sized to this embedding if none exists
    # (instead of assuming a fixed dimension of 384)
    if os.path.exists(index_path):
        index = faiss.read_index(index_path)
    else:
        index = faiss.IndexFlatL2(new_row.shape[1])

    # Load existing documents and metadata
    documents = _load_pickled_list(documents_path)
    metadata = _load_pickled_list(metadata_path)

    # Row i of the index is expected to be documents[i] / metadata[i]; if a file
    # was lost or edited by hand the three drift apart, so make that visible.
    if not (index.ntotal == len(documents) == len(metadata)):
        print(
            "Warning: stored data out of sync before update: "
            f"index.ntotal={index.ntotal}, len(documents)={len(documents)}, "
            f"len(metadata)={len(metadata)}"
        )

    # Append the new vector, text and label
    index.add(new_row)
    documents.append(text)

    # Generate and append new metadata (document number and first few words).
    # NOTE(review): the index-building cell stores labels without the trailing
    # "..."; kept here as-is to avoid changing the persisted format.
    doc_number = f"Doc {len(documents)}"  # Make sure document number matches documents list
    first_few_words = ' '.join(text.split()[:10])
    metadata.append(f"{doc_number}: {first_few_words}...")

    # Debugging: Print length after update
    print(f"After update: len(documents)={len(documents)}, len(metadata)={len(metadata)}")

    # The branch above supports a first run with no index, so the directory may
    # not exist yet either
    os.makedirs(storage_directory, exist_ok=True)

    # Save the updated FAISS index
    faiss.write_index(index, index_path)

    # Save the updated documents
    with open(documents_path, 'wb') as f:
        pickle.dump(documents, f)

    # Save the updated metadata
    with open(metadata_path, 'wb') as f:
        pickle.dump(metadata, f)

    print(f"New file '{file_path}' successfully processed and added to FAISS.")


# Initialize components
# NOTE(review): OCRProcessor and FileMonitor are not defined or imported in any
# cell visible here — confirm they exist in the kernel before this cell runs.
tesseract_path = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
ocr_processor = OCRProcessor(tesseract_path)
# Rebinds the module-level embedding_model that process_new_file() encodes with
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Start monitoring
# process_new_file is passed to FileMonitor as a callback; presumably it is
# invoked once per newly detected PDF — TODO confirm against FileMonitor.
directory_to_watch = 'docs/'
file_monitor = FileMonitor(directory_to_watch, process_new_file)
file_monitor.start()




Monitoring directory: docs/
New PDF detected: docs/week6.pdf
Processing PDF for OCR: docs/week6.pdf
After update: len(documents)=8, len(metadata)=8
New file 'docs/week6.pdf' successfully processed and added to FAISS.
New PDF detected: docs/week7.pdf
Processing PDF for OCR: docs/week7.pdf
After update: len(documents)=9, len(metadata)=9
New file 'docs/week7.pdf' successfully processed and added to FAISS.


In [123]:
# NOTE(review): no visible cell assigns a module-level `metadata` (the name is
# local to process_new_file), so this depends on kernel state from elsewhere — confirm.
print(len(metadata))


10


In [120]:
# Length of the in-memory `documents` list built by read_doc(); process_new_file()
# appends to its own local list loaded from documents.pkl, not to this one.
print(len(documents))


7
