In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
import numpy as np
import os
import uuid
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
from unstructured.partition.auto import partition
import magic

import logging
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
from sentence_transformers import SentenceTransformer
import time


  from tqdm.autonotebook import tqdm, trange


In [2]:

def read_file(file_path):
    try:
        # Use Unstructured's partition method to handle all supported file types
        elements = partition(file_path)
        extracted_text = "\n".join([str(element) for element in elements])
        return extracted_text
    except Exception as e:
        print(f"Error reading file {file_path}: {e}")
        return ""

In [3]:
def read_doc(directory):
    docs = []
    doc_names = []
    
    try:
        for filename in os.listdir(directory):
            doc_names.append(filename)
            file_path = os.path.join(directory,filename)
            
            # Just a log for debugging
            print(f"Processing file: {file_path}")
            
            text = read_file(file_path)
            if text:
                docs.append(text)
    except Exception as e:
        print(f"Error processing directory {directory}: {e}")
    return docs, doc_names

In [4]:
def compute_embeddings(documents):
    model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
    embeddings = model.encode(documents, convert_to_tensor=False)
    return embeddings



In [5]:
directory_path = 'docs/'
documents, doc_names = read_doc(directory_path)

# Compute embeddings
v = compute_embeddings(documents)
len(v[0])



Processing file: docs/page_1.pdf
Processing file: docs/page_2.pdf
Processing file: docs/page_3.pdf
Processing file: docs/page_4.pdf


384

In [6]:
v

array([[ 0.03079841,  0.01256095,  0.00396155, ..., -0.04093738,
        -0.01922737,  0.01460585],
       [ 0.00672184,  0.00303666, -0.03385414, ..., -0.04045713,
        -0.00080454,  0.00728154],
       [-0.00998642,  0.0072642 , -0.010931  , ..., -0.04368636,
         0.04215321,  0.02219727],
       [-0.01511251,  0.01407608, -0.03394361, ..., -0.05967831,
        -0.02801795, -0.00423528]], dtype=float32)

In [None]:

import faiss

import pickle

# FAISS index initialization
index = faiss.IndexFlatL2(384)  # d = 384

# Add your vectors to the index
index.add(v)  # v contains the document embeddings

storage_directory = r'C:\Users\AbhinavKasubojula\OneDrive - Kenall Inc\Desktop\code\stored_data'

# Ensure the directory exists
if not os.path.exists(storage_directory):
    os.makedirs(storage_directory)

file_metadata = [{"doc_number": i+1, "doc_name": name} for i, name in enumerate(doc_names)]


# Save FAISS index
faiss.write_index(index, os.path.join(storage_directory, 'faiss_index.bin'))

# Save document texts
with open(os.path.join(storage_directory, 'documents.pkl'), 'wb') as f:
    pickle.dump(documents, f)

# Optionally save metadata
with open(os.path.join(storage_directory, 'metadata.pkl'), 'wb') as f:
    pickle.dump(file_metadata, f)


print("Metadata saved:", documents)

In [None]:
print(file_metadata[2])

In [None]:
import time
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler


def update_faiss_index(new_embedding):
    global index  # Ensure we're using the global index variable
    print("Updating FAISS index...")

    # Load existing index if not already loaded
    if index is None:
        index_path = os.path.join(storage_directory, 'faiss_index.bin')
        if os.path.exists(index_path):
            index = faiss.read_index(index_path)
        else:
            # If no index exists, create a new one with the correct dimension
            index = faiss.IndexFlatL2(new_embedding.shape[1])  # Assuming new_embedding is a 2D array

    # Add the new embedding to the existing index
    index.add(np.array([new_embedding]).astype('float32'))  # Wrap in another array to add a single embedding

    # Save the updated index
    faiss.write_index(index, os.path.join(storage_directory, 'faiss_index.bin'))
    print("FAISS index updated successfully.")

def process_new_file(file_path):

    global documents, file_metadata, v, index, storage_directory

    print(f"Processing the new file: {file_path}")
    time.sleep(5)

    # Read the file and extract text
    t = read_file(file_path)
    documents.append(t)
    with open(os.path.join(storage_directory, 'documents.pkl'), 'wb') as f:
        pickle.dump(documents, f)

    doc_number = f"Doc {len(documents)}"
    file_name = os.path.basename(file_path)
    file_metadata.append(f"{doc_number}:{file_name}")
    with open(os.path.join(storage_directory, 'metadata.pkl'), 'wb') as f:
        pickle.dump(file_metadata, f)

    em = compute_embeddings(t)
    update_faiss_index(em)

class FileEventHandler(FileSystemEventHandler):
    def __init__(self, process_function):
        super().__init__()
        self.process_function = process_function  # Pass in the function you want to run when a file is added

    def on_created(self, event):
        if not event.is_directory:
            print(f"New file added: {event.src_path}")
            self.process_function(event.src_path)  # Call the provided function

def monitor_folder(path, process_function):
    print("started monitoring...",path)
    event_handler = FileEventHandler(process_function)
    observer = Observer()
    observer.schedule(event_handler, path, recursive=False)
    observer.start()
    
    try:
        while True:
            time.sleep(1)  # Keep the script running
    except KeyboardInterrupt:
        observer.stop()
    observer.join()

if __name__ == "__main__":
    folder_to_monitor = "C:\\Users\\AbhinavKasubojula\\OneDrive - Kenall Inc\\Desktop\\code\\docs"
    
    # Choose which function you want to execute when a new file is added
    monitor_folder(folder_to_monitor, process_new_file)  # This will run the `process_new_file` function


In [16]:
def print_metadata_and_vectors():
    # Load and print metadata
    if os.path.exists(METADATA_PATH):
        with open(METADATA_PATH, 'rb') as f:
            metadata = pickle.load(f)
        print("Metadata:")
        for idx, entry in enumerate(metadata, start=1):
            print(f"{idx}. {entry}")
    else:
        print("No metadata found.")

    # Print vectors stored in the FAISS index
    if index.ntotal > 0:
        print("\nVectors:")
        for i in range(index.ntotal):
            vector = index.reconstruct(i)  # Retrieve vector by index
            print(f"Vector {i}: {vector}")
    else:
        print("No vectors found in the FAISS index.")

# Call the function to display metadata and vectors
METADATA_PATH = R'C:\Users\AbhinavKasubojula\OneDrive - Kenall Inc\Desktop\code'
print_metadata_and_vectors()

PermissionError: [Errno 13] Permission denied: 'C:\\Users\\AbhinavKasubojula\\OneDrive - Kenall Inc\\Desktop\\code'