In [25]:
import copy
import nltk
import os
import re
from pathlib import Path
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.document_loaders import DirectoryLoader, PyPDFLoader, CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [2]:
# Model identifiers. embedding_model is passed to HuggingFaceEmbeddings;
# llm_model is not referenced by the cells shown in this notebook section.
embedding_model = 'sentence-transformers/all-MiniLM-L6-v2'
llm_model = "llama3.2"

# Build every path from separate components instead of embedding '\' inside a
# string literal: '\w', '\d' and '\p' are invalid escape sequences in Python,
# and a backslash only acts as a path separator on Windows.
root_path = Path(os.getcwd())
# pdf_path = root_path.joinpath('apra_standards', 'pdf')
word_path = root_path.joinpath('apra_standards', 'word')
db_path = root_path.joinpath('vectorstore', 'db_faiss')

# Make sure the vector store folder (and its parents) exists.
db_path.mkdir(parents=True, exist_ok=True)

In [5]:
# Fetch the NLTK data packages this notebook relies on
for nltk_resource in ("punkt", "wordnet", "stopwords"):
    nltk.download(nltk_resource)

# Embedding model used when building and loading the vector store
embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\crowl\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\crowl\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\crowl\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
# English stopword set and the shared lemmatizer used by preprocess_text
stop_words = {*stopwords.words("english")}
lemmatizer = WordNetLemmatizer()

In [21]:
# Define text preprocessing function with lemmatization
def preprocess_text(text):
    """Clean a piece of text: strip, expand "No.", drop stopwords, lemmatize.

    Args:
        text: The raw string to clean.

    Returns:
        A single string made of the lower-cased, lemmatized tokens of
        ``text`` that are not English stopwords, joined by single spaces.
    """
    # 1. Strip whitespace
    text = text.strip()

    # 2. Replace the abbreviation "No." with "number". Left as-is, its
    #    lower-cased token "no" would be removed together with the stopwords.
    #    re.sub needs the input string as its third argument, and the pattern
    #    is a plain Python regex (no /.../ delimiters) with the dot escaped.
    text = re.sub(r'\bNo\.', 'number', text)

    # 3. Tokenize the text
    tokens = nltk.word_tokenize(text)

    # 4. Remove stopwords and apply lemmatization (both on the lower-cased token)
    lemmatized_tokens = [
        lemmatizer.lemmatize(token.lower())
        for token in tokens
        if token.lower() not in stop_words
    ]

    # 5. Join the tokens back into a string
    return " ".join(lemmatized_tokens)

# Function to preprocess the page_content of each document in a list of Document objects
def preprocess_documents(documents):
    """Run preprocess_text over the page_content of every document.

    The documents are modified in place; the same list object that was
    passed in is returned.
    """
    for document in documents:
        cleaned_content = preprocess_text(document.page_content)
        document.page_content = cleaned_content
    return documents

In [None]:
def prepare_and_split_docs(directory, apply_preprocessing=False, chunk_size=512, chunk_overlap=256):
    """Load the PDF, DOCX and CSV files under a directory and split them into passages.

    Args:
        directory: Folder that is searched recursively for ``*.pdf``,
            ``*.docx`` and ``*.csv`` files.
        apply_preprocessing: When truthy, every loaded document is passed
            through preprocess_documents before splitting.
        chunk_size: Maximum size of a passage, handed to the
            tiktoken-based text splitter. Defaults to 512.
        chunk_overlap: Overlap between consecutive passages, handed to the
            text splitter. Defaults to 256.

    Returns:
        The list of split documents produced by the text splitter.
    """
    # One loader per supported file type
    loaders = [
        DirectoryLoader(directory, glob="**/*.pdf", show_progress=True, loader_cls=PyPDFLoader),
        DirectoryLoader(directory, glob="**/*.docx", show_progress=True),
        DirectoryLoader(directory, glob="**/*.csv", loader_cls=CSVLoader)
    ]

    # Load the documents from every loader into one flat list
    documents = []
    for loader in loaders:
        documents.extend(loader.load())

    if apply_preprocessing:
        documents = preprocess_documents(documents)

    # Initialize a text splitter
    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        disallowed_special=(),
        separators=["\n\n", "\n", " "]
    )

    # Split the documents and keep metadata
    split_docs = splitter.split_documents(documents)

    print(f"Documents are split into {len(split_docs)} passages")
    return split_docs


def ingest_into_vectordb(split_docs, db_path, force_rebuild=False):
    """
    Build the FAISS vector store from the documents, or load the saved one.

    The store is built from split_docs and saved to db_path when force_rebuild
    is True or when no saved index is found in db_path. Otherwise the saved
    index is loaded and split_docs is not used.

    Args:
        split_docs: Document chunks to embed when the store is (re)built.
        db_path: Folder in which the FAISS index is saved.
        force_rebuild: Rebuild even when a saved index already exists.

    Returns:
        The FAISS vector store.

    Raises:
        ValueError: If the store has to be built but split_docs is empty.
    """
    # The folder existing is not enough: it may have been created empty before
    # anything was saved. Look for the index file that save_local() writes
    # (default index name "index").
    db_exists = os.path.isfile(os.path.join(db_path, "index.faiss"))

    # Rebuild if force_rebuild is True or no saved index was found
    if not db_exists or force_rebuild:
        if not split_docs:
            raise ValueError("Cannot build the vector database: no documents were provided.")
        print("Building/rebuilding the vector database...")
        db = FAISS.from_documents(split_docs, embeddings)
        db.save_local(db_path)
        print("Documents are inserted into FAISS vectorstore.")
    else:
        print("Vector database already exists, skipping rebuild.")
        # NOTE(security): load_local unpickles the stored docstore, which is why
        # LangChain requires this explicit opt-in. Only load an index that was
        # written by this notebook; never point db_path at an untrusted folder.
        db = FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)

    return db


In [None]:
# Load and split everything found under the Word documents folder
split_docs = prepare_and_split_docs(directory=word_path)

In [None]:
# force_rebuild=True always rebuilds the vector DB; False builds it only when it does not exist yet
db = ingest_into_vectordb(
    split_docs=split_docs,
    db_path=db_path,
    force_rebuild=False,
)