In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
import numpy as np
import os
import uuid
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
from unstructured.partition.auto import partition
import magic

import logging
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
from sentence_transformers import SentenceTransformer
import time


  from tqdm.autonotebook import tqdm, trange


In [4]:

def read_file(file_path):
    try:
        # Use Unstructured's partition method to handle all supported file types
        elements = partition(file_path)
        extracted_text = "\n".join([str(element) for element in elements])
        return extracted_text
    except Exception as e:
        print(f"Error reading file {file_path}: {e}")
        return ""

In [5]:
def read_doc(directory):
    docs = []
    try:
        for filename in os.listdir(directory):
            file_path = os.path.join(directory, filename)
            
            # Just a log for debugging
            print(f"Processing file: {file_path}")
            
            text = read_file(file_path)
            if text:
                docs.append(text)
    except Exception as e:
        print(f"Error processing directory {directory}: {e}")
    return docs

In [6]:
def compute_embeddings(documents):
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(documents, convert_to_tensor=False)
    return embeddings





[-0.03502822667360306,
 -0.061370596289634705,
 -0.06487809866666794,
 -0.10317597538232803,
 0.05656087398529053,
 -0.039374515414237976,
 0.0045332289300858974,
 -0.01872977986931801,
 0.0673169195652008,
 -0.01203030813485384,
 0.024705583229660988,
 -0.1389947384595871,
 -0.028534933924674988,
 -0.06784308701753616,
 0.06698959320783615,
 0.03491327166557312,
 -0.006042407359927893,
 0.011355974711477757,
 -0.036314621567726135,
 -0.012291446328163147,
 -0.0033638228196650743,
 0.005186763592064381,
 0.0025869079399853945,
 -0.018876023590564728,
 -0.06674593687057495,
 -0.020897457376122475,
 0.04364779219031334,
 -0.044121790677309036,
 -0.03957903012633324,
 -0.004937487654387951,
 0.026632657274603844,
 -0.12499779462814331,
 0.041241228580474854,
 0.015724308788776398,
 -0.005520887207239866,
 -0.043687932193279266,
 -0.06279928237199783,
 -0.07055669277906418,
 -0.07633178681135178,
 0.02122650295495987,
 0.016601521521806717,
 0.0029815812595188618,
 -0.03803899139165878,
 -

In [7]:
directory_path = 'docs/'
documents = read_doc(directory_path)

# Compute embeddings
v = compute_embeddings(documents)
len(v[0])



Processing file: docs/KC USACE_B_01_03_2024_AFCS_AE_Synopsis_SB_Set_Aside_W912DQ24R4019 (1) (1).pdf
Processing file: docs/Revised+Synposis++-+6.14.23 (1).pdf
Processing file: docs/Solicitation Vicksburg.pdf
Processing file: docs/USACE Jacksonville SB AE.pdf
Processing file: docs/W912BV24R0001_Synopsis_99M_General_Services_IDC.pdf
Processing file: docs/W912DQ24R4002_2024_Military_SB_AE_Synopsis.pdf
Processing file: docs/W912EP22R0042 Singhofen-Kenall JV-SF 330-I.pdf




384

In [8]:

import faiss

import pickle

# FAISS index initialization
index = faiss.IndexFlatL2(384)  # d = 384

# Add your vectors to the index
index.add(v)  # v contains the document embeddings

storage_directory = r'C:\Users\AbhinavKasubojula\OneDrive - Kenall Inc\Desktop\code\stored_data'

# Ensure the directory exists
if not os.path.exists(storage_directory):
    os.makedirs(storage_directory)

file_metadata = []
for i, doc in enumerate(documents):
    doc_number = f"Doc {i + 1}"
    first_few_words = ' '.join(doc.split()[:10])  
    file_metadata.append(f"{doc_number}: {first_few_words}")

# Save FAISS index
faiss.write_index(index, os.path.join(storage_directory, 'faiss_index.bin'))

# Save document texts
with open(os.path.join(storage_directory, 'documents.pkl'), 'wb') as f:
    pickle.dump(documents, f)

# Optionally save metadata
with open(os.path.join(storage_directory, 'metadata.pkl'), 'wb') as f:
    pickle.dump(file_metadata, f)

embeddings = np.array(v)
print(embeddings.shape)

print("Metadata saved:", documents)

(7, 384)


In [22]:
# File Monitor class
class FileMonitor(FileSystemEventHandler):
    def __init__(self, directory_to_watch, callback):
        self.directory_to_watch = directory_to_watch
        self.callback = callback
    
    def on_created(self, event):
        if event.is_directory:
            return
        self.callback(event.src_path)

        _, file_extension = os.path.splitext(event.src_path)
        
        # List of allowed file extensions
        allowed_extensions = ['.pdf', '.docx', '.pptx']

        # Process only if the file extension is in the allowed list
        if file_extension.lower() in allowed_extensions:
            self.callback(event.src_path)
    
    def start(self):
        observer = Observer()
        observer.schedule(self, self.directory_to_watch, recursive=False)
        observer.start()
        try:
            while True:
                time.sleep(1)
        except KeyboardInterrupt:
            observer.stop()
        observer.join()



In [28]:


logging.basicConfig(level=logging.DEBUG)

# Define storage directories and paths
storage_directory = r'C:\Users\AbhinavKasubojula\OneDrive - Kenall Inc\Desktop\code\stored_data'
faiss_index_path = os.path.join(storage_directory, 'faiss_index.bin')
documents_path = os.path.join(storage_directory, 'documents.pkl')
metadata_path = os.path.join(storage_directory, 'metadata.pkl')

# Initialize embedding model (Same dimensions as FAISS index)
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
d = 384  # Embedding dimension from the model

# Load or initialize FAISS index, documents, and metadata
if os.path.exists(faiss_index_path):
    index = faiss.read_index(faiss_index_path)
else:
    index = faiss.IndexFlatL2(d)

if os.path.exists(documents_path):
    with open(documents_path, 'rb') as f:
        documents = pickle.load(f)
else:
    documents = []

if os.path.exists(metadata_path):
    with open(metadata_path, 'rb') as f:
        file_metadata = pickle.load(f)
else:
    file_metadata = []

# Function to generate embeddings
def compute_embeddings(text):
    return embedding_model.encode([text])[0]

# Function to process new files and update FAISS, documents, and metadata
def process_new_file(file_path):
    global index, documents, file_metadata

    try:
        logging.info(f"Attempting to process new file: {file_path}")

        # Extract text using Unstructured's partition method
        elements = partition(file_path)
        extracted_text = "\n".join([str(element) for element in elements])

        # Check if extraction returned valid text
        if extracted_text:
            documents.append(extracted_text)

            # Update metadata
            doc_number = f'Doc {len(documents)}'
            first_few_words = ' '.join(extracted_text.split()[:10])
            file_metadata.append(f"{doc_number}: {first_few_words}...")

            # Save the updated documents and metadata
            with open(documents_path, 'wb') as f:
                pickle.dump(documents, f)
            with open(metadata_path, 'wb') as f:
                pickle.dump(file_metadata, f)

            # Load FAISS index, add new embedding
            new_embedding = compute_embeddings(extracted_text)
            index.add(np.array([new_embedding]).astype('float32'))
            faiss.write_index(index, faiss_index_path)

            logging.info(f"New file '{file_path}' successfully processed and added to FAISS.")
            logging.info(f"After update: len(documents)={len(documents)}, len(metadata)={len(file_metadata)}")
        else:
            logging.warning(f"Failed to process text extraction for {file_path}. No valid text found.")

    except PermissionError:
        logging.error(f"Permission denied when trying to process {file_path}")
    except Exception as e:
        logging.error(f"Error processing {file_path}: {str(e)}")

# Start monitoring
directory_to_watch = r'C:\Users\AbhinavKasubojula\OneDrive - Kenall Inc\Desktop\code\docs'
file_monitor = FileMonitor(directory_to_watch, process_new_file)
file_monitor.start()


ERROR:root:Permission denied when trying to process C:\Users\AbhinavKasubojula\OneDrive - Kenall Inc\Desktop\code\docs\week6.pdf
ERROR:root:Permission denied when trying to process C:\Users\AbhinavKasubojula\OneDrive - Kenall Inc\Desktop\code\docs\week6.pdf


In [21]:
import time
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler




def process_new_file(file_path):

    global documents, file_metadata, v, index

    print(f"Processing the new file: {file_path}")

    # Read the file and extract text
    text = read_file(file_path)
    documents.append(text)

    # Create metadata for the document
    doc_number = f"Doc {len(documents)}"  # Since you're adding, len(documents) reflects the new size
    first_few_words = ' '.join(text.split()[:10])  
    file_metadata.append(f"{doc_number}: {first_few_words}")

    # Compute embeddings and add to FAISS index
    index.add(v.append(compute_embeddings(text).tolist()))
    # Save the updated FAISS index
    faiss.write_index(index, os.path.join(storage_directory, 'faiss_index.bin'))

    # Save updated documents and metadata
    with open(os.path.join(storage_directory, 'documents.pkl'), 'wb') as f:
        pickle.dump(documents, f)

    with open(os.path.join(storage_directory, 'metadata.pkl'), 'wb') as f:
        pickle.dump(file_metadata, f)

    print(f"Successfully processed {file_path}")






class FileEventHandler(FileSystemEventHandler):
    def __init__(self, process_function):
        super().__init__()
        self.process_function = process_function  # Pass in the function you want to run when a file is added

    def on_created(self, event):
        if not event.is_directory:
            print(f"New file added: {event.src_path}")
            self.process_function(event.src_path)  # Call the provided function

def monitor_folder(path, process_function):
    event_handler = FileEventHandler(process_function)
    observer = Observer()
    observer.schedule(event_handler, path, recursive=False)
    observer.start()
    
    try:
        while True:
            time.sleep(1)  # Keep the script running
    except KeyboardInterrupt:
        observer.stop()
    observer.join()

if __name__ == "__main__":
    folder_to_monitor = r"C:\Users\AbhinavKasubojula\OneDrive - Kenall Inc\Desktop\code\docs"
    
    # Choose which function you want to execute when a new file is added
    monitor_folder(folder_to_monitor, process_new_file)  # This will run the `process_new_file` function


Exception in thread Thread-54:
Traceback (most recent call last):
  File "C:\Users\AbhinavKasubojula\AppData\Local\Programs\Python\Python312\Lib\threading.py", line 1073, in _bootstrap_inner
    self.run()
  File "c:\Users\AbhinavKasubojula\OneDrive - Kenall Inc\Desktop\code\.venv\Lib\site-packages\watchdog\observers\api.py", line 213, in run
    self.dispatch_events(self.event_queue)
  File "c:\Users\AbhinavKasubojula\OneDrive - Kenall Inc\Desktop\code\.venv\Lib\site-packages\watchdog\observers\api.py", line 391, in dispatch_events
    handler.dispatch(event)
  File "c:\Users\AbhinavKasubojula\OneDrive - Kenall Inc\Desktop\code\.venv\Lib\site-packages\watchdog\events.py", line 217, in dispatch
    getattr(self, f"on_{event.event_type}")(event)
  File "C:\Users\AbhinavKasubojula\AppData\Local\Temp\ipykernel_3344\2071368452.py", line 50, in on_created
  File "C:\Users\AbhinavKasubojula\AppData\Local\Temp\ipykernel_3344\2071368452.py", line 24, in process_new_file
AttributeError: 'numpy.

New file added: C:\Users\AbhinavKasubojula\OneDrive - Kenall Inc\Desktop\code\docs\Resume_Abhinav .pdf
Processing the new file: C:\Users\AbhinavKasubojula\OneDrive - Kenall Inc\Desktop\code\docs\Resume_Abhinav .pdf
Error reading file C:\Users\AbhinavKasubojula\OneDrive - Kenall Inc\Desktop\code\docs\Resume_Abhinav .pdf: [Errno 13] Permission denied: 'C:\\Users\\AbhinavKasubojula\\OneDrive - Kenall Inc\\Desktop\\code\\docs\\Resume_Abhinav .pdf'
