In [11]:
import os
import shutil
from pathlib import Path
from typing import List, Dict
from langchain_community.document_loaders import PyPDFLoader, TextLoader, CSVLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain.schema import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores import FAISS

dev_mode = "True" 

In [12]:
def load_document(file_path):
    """
    Load a document from the given file path.
    This is a placeholder function and should be replaced with actual document loading logic.
    """
    with open(file_path, 'r') as file:
        return file.read()

In [13]:
def load_documents_with_ac(docs_path: str, document_tags: Dict[str, List[str]], exclude_patterns=None) -> List[
    Document]:
    """
    Load documents and add access control metadata.
    
    Args:
        docs_path: Path to documents
        document_tags: Dictionary mapping document paths to list of allowed roles
        exclude_patterns: Patterns to exclude from loading
    """
    if exclude_patterns is None:
        exclude_patterns = [".DS_Store", ".ipynb_checkpoints"]

    base_path = Path(docs_path)
    all_files = base_path.rglob("*")

    documents = []
    for file in all_files:
        if not file.is_file():
            continue

        if any(file.match(pattern) for pattern in exclude_patterns):
            continue

        # Get document tags for this file
        file_path = str(file.relative_to(base_path))  # Get relative path
        file_tags = document_tags.get(file_path, ["employee"])  # Default to employee access if not specified
        print(f"Loading {file_path} with access roles: {file_tags}")

        # Decide loader by extension
        if file.suffix.lower() == ".pdf":
            loader = PyPDFLoader(str(file.absolute()))
        elif file.suffix.lower() == ".csv":
            loader = CSVLoader(str(file.absolute()), encoding='utf-8')
        else:
            loader = TextLoader(str(file.absolute()), encoding='utf-8')

        # Load and add metadata
        docs = loader.load()
        for doc in docs:
            # Convert list of roles to comma-separated string
            access_roles_str = ",".join(file_tags)
            doc.metadata.update({
                "access_roles": access_roles_str,
                "source": str(file)
            })
        documents.extend(docs)

    print(f"Total documents loaded: {len(documents)}")
    return documents

In [14]:
def chunk_documents_with_ac(documents: List[Document], chunk_size: int = 1000, chunk_overlap: int = 200) -> List[
    Document]:
    """
    Split documents into chunks while preserving access control metadata.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )

    chunked_docs = []
    for doc in documents:
        # Preserve original metadata
        original_metadata = doc.metadata.copy()
        doc_chunks = splitter.split_documents([doc])

        # Add chunk-specific metadata
        for i, chunk in enumerate(doc_chunks):
            chunk.metadata.update(original_metadata)
            chunk.metadata["chunk_id"] = f"{original_metadata['source']}_{i}"

        chunked_docs.extend(doc_chunks)

    print(f"Total documents after chunking: {len(chunked_docs)}")
    return chunked_docs

In [15]:
def build_faiss_vectorstore_ac(docs: list, embeddings, persist_dir: str):
    vectorstore = FAISS.from_documents(
        documents=docs,
        embedding=embeddings
    )
    vectorstore.save_local(persist_dir)
    print(f"FAISS index saved at: {persist_dir}")
    return vectorstore

In [16]:
def get_embedding_model(model_name: str = 'llama3.2:3b'):
    print(f"Loading embedding model: {model_name}")
    if dev_mode:
        return HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2')
    else:
        return OllamaEmbeddings(model=model_name)

In [19]:
def main():
    docs_path = "./a_files"
    persist_dir = "faiss_index_ac"

    # Define document tags
    document_tags = {
        "Electric_Inc_Onboarding.pdf": ["manager", "employee"],
        "Strategy_2025.pdf": ["manager"],
        "Report_25.pdf": ["manager"],
    }

    # Load documents with access control
    raw_documents = load_documents_with_ac(docs_path, document_tags)

    # Chunk documents while preserving access control metadata
    chunked_documents = chunk_documents_with_ac(raw_documents)

    # Get embedding model
    embeddings = get_embedding_model()

    # Build and save FAISS vector store
    vectorstore = build_faiss_vectorstore_ac(chunked_documents, embeddings, persist_dir)

In [20]:
if __name__ == "__main__":
    main()

Loading Electric_Inc_Onboarding.pdf with access roles: ['manager', 'employee']
Loading Report_25.pdf with access roles: ['manager']
Loading Strategy_2025.pdf with access roles: ['manager']
Total documents loaded: 5
Total documents after chunking: 11
Loading embedding model: llama3.2:3b
FAISS index saved at: faiss_index_ac


In [10]:
def retrive_ac(query: str, vectorstore: FAISS, access_roles: List[str]) -> List[Document]:
    """
    Retrieve documents based on query and access control roles.
    
    Args:
        query: The search query
        vectorstore: The FAISS vector store
        access_roles: List of roles that are allowed to access the documents
    
    Returns:
        List of documents that match the query and access control
    """
    results = vectorstore.similarity_search(query)
    filtered_results = []
    
    for doc in results:
        roles = doc.metadata.get("access_roles", "").split(",")
        if any(role in access_roles for role in roles):
            filtered_results.append(doc)
    
    return filtered_results

In [18]:
# Example usage of retrieval function
query = "What is the strategy for 2025?"
access_roles = ["manager"] 
vectorstore = FAISS.load_local("./faiss_index_ac", embeddings=get_embedding_model(), allow_dangerous_deserialization=True)
results = retrive_ac(query, vectorstore, access_roles)
for doc in results:
    print(f"Document: {doc.metadata['source']}, Access Roles: {doc.metadata['access_roles']}")

Loading embedding model: llama3.2:3b
Document: a_files\Report_25.pdf, Access Roles: manager
Document: a_files\Strategy_2025.pdf, Access Roles: manager
Document: a_files\Report_25.pdf, Access Roles: manager
Document: a_files\Strategy_2025.pdf, Access Roles: manager


In [19]:
query = "What is the strategy for 2025?"
access_roles = ["employee"] 
results = retrive_ac(query, vectorstore, access_roles)
for doc in results:
    print(f"Document: {doc.metadata['source']}, Access Roles: {doc.metadata['access_roles']}")

Loading embedding model: llama3.2:3b


In [17]:
# Example usage of retrieval function
query = "What is the onboarding in Electronic Inc?"
access_roles = ["employee"] 
results = retrive_ac(query, vectorstore, access_roles)
for doc in results:
    print(f"Document: {doc.metadata['source']}, Access Roles: {doc.metadata['access_roles']}")

Document: a_files\Electric_Inc_Onboarding.pdf, Access Roles: manager,employee
Document: a_files\Electric_Inc_Onboarding.pdf, Access Roles: manager,employee
Document: a_files\Electric_Inc_Onboarding.pdf, Access Roles: manager,employee
Document: a_files\Electric_Inc_Onboarding.pdf, Access Roles: manager,employee
