## Set Up AstraDB to store docs

I use Astra DB as the document store for this project because the source documents are large.

In [3]:
from astrapy import DataAPIClient
from dotenv import load_dotenv
import os
# Pull credentials from a local .env file so they never appear in the notebook itself.
load_dotenv()

ENDPOINT = os.getenv("ASTRA_DB_API_ENDPOINT")
TOKEN = os.getenv("ASTRA_DB_APPLICATION_TOKEN")

# os.getenv returns None for unset variables; stop here with an actionable
# message instead of letting the client fail later with an unclear error.
_missing_env = [
    var_name
    for var_name, var_value in (
        ("ASTRA_DB_API_ENDPOINT", ENDPOINT),
        ("ASTRA_DB_APPLICATION_TOKEN", TOKEN),
    )
    if not var_value
]
if _missing_env:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_env)
    )

# Initialize the client
client = DataAPIClient()
db = client.get_database(
    ENDPOINT,
    token=TOKEN,
)
print(f"Connected to Astra DB: {db.name()}")
print(f"Collections: {db.list_collection_names()}")


Connected to Astra DB: stat_rag_docs
Collections: []


# Data Processor Construction

In [1]:
from logging import raiseExceptions
from langchain_astradb import AstraDBLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os

# Relative path to the folder of source documents; the same location is passed to DocIngestion further down.
doc_path = '../RAG_Docs'

In [2]:
def path_validate(path):
    """Report whether ``path`` exists on the filesystem.

    Returns:
        bool: the result of ``os.path.exists(path)``.
    """
    return os.path.exists(path)


class DocIngestion:
    """Load PDF files from a directory and split them into text chunks."""

    # Chunks whose text is not longer than this many characters are dropped by ``chunking``.
    MIN_CHUNK_CHARS = 50

    def __init__(self, docs_path: str, chunk_size: int = 100, chunk_overlap: int = 20):
        """Create an ingestor bound to ``docs_path``.

        Args:
            docs_path: Directory containing the files to ingest.
            chunk_size: Maximum chunk length handed to the text splitter.
            chunk_overlap: Overlap between consecutive chunks; must be smaller
                than ``chunk_size``.

        Raises:
            FileNotFoundError: If ``docs_path`` is not an existing directory.
        """
        # Validate before storing: batch_ingest() lists this path with os.listdir,
        # so it has to be a real directory, not just any existing entry.
        if not os.path.isdir(docs_path):
            raise FileNotFoundError(
                f"Document directory does not exist or is not a directory: {docs_path!r}"
            )
        self.docs_path = docs_path
        # -1 marks "not configured yet"; update_splitter() fills these in.
        self.chunk_size = -1
        self.chunk_overlap = -1
        self.splitter = None
        self.update_splitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    def batch_ingest(self):
        """Ingest every non-hidden entry of ``docs_path``.

        Returns:
            list: Chunks from all files that could be loaded, visited in sorted
            filename order. Entries that are not regular files are skipped.
        """
        all_chunks = []
        for filename in sorted(os.listdir(self.docs_path)):
            if filename.startswith('.'):
                continue  # hidden files are ignored
            file_chunks = self.individual_ingest(filename)
            if file_chunks:
                all_chunks.extend(file_chunks)
        return all_chunks

    def individual_ingest(self, filename: str):
        """Load one PDF from ``docs_path`` and split it into chunks.

        Returns:
            The list of chunks kept by ``chunking``, or ``None`` when
            ``filename`` is not a regular file inside ``docs_path``.
        """
        document = self.get_whole_pdf(filename)
        if document is None:
            return None
        return self.chunking(document)

    def chunking(self, doc):
        """Split loaded documents and drop chunks that are too short.

        Args:
            doc: The documents returned by the PDF loader.

        Returns:
            list: Chunks whose ``page_content`` is longer than
            ``MIN_CHUNK_CHARS`` characters.

        Raises:
            RuntimeError: If no splitter has been configured.
        """
        if self.splitter is None:
            raise RuntimeError(
                "No text splitter configured; call update_splitter() with a valid "
                "chunk_size/chunk_overlap first."
            )
        chunks = self.splitter.split_documents(doc)
        return [chunk for chunk in chunks if len(chunk.page_content) > self.MIN_CHUNK_CHARS]

    def get_whole_pdf(self, filename: str):
        """Load one PDF from ``docs_path`` without chunking it.

        Returns:
            Whatever ``PyPDFLoader.load()`` returns for the file, or ``None``
            when ``filename`` is not a regular file inside ``docs_path``.
        """
        file_path = os.path.join(self.docs_path, filename)
        # Test the path itself: passing the bool from os.path.isfile() on to
        # os.path.exists() would treat it as a file descriptor number.
        if not os.path.isfile(file_path):
            return None
        return PyPDFLoader(file_path).load()

    def update_splitter(self, chunk_size: int = -1, chunk_overlap: int = -1):
        """Rebuild the text splitter with new size and/or overlap settings.

        A value of ``-1`` means "keep the current setting". Problems are
        reported with ``print`` and leave the existing splitter and settings
        untouched.
        """
        if chunk_size == -1 and chunk_overlap == -1:
            print("Nothing to update")
            return

        new_size = chunk_size if chunk_size != -1 else self.chunk_size
        new_overlap = chunk_overlap if chunk_overlap != -1 else self.chunk_overlap

        if new_size <= new_overlap:
            print(f"Error: Chunk size ({new_size}) must be greater than chunk overlap ({new_overlap}). Update aborted.")
            return

        # Build first and commit afterwards, so a failed construction cannot
        # leave chunk_size/chunk_overlap out of sync with the active splitter.
        try:
            new_splitter = RecursiveCharacterTextSplitter(
                separators=["\n\n", "\n", " ", ""],
                chunk_size=new_size,
                chunk_overlap=new_overlap,
            )
        except Exception as e:
            print(f"Unexpected error updating splitter: {e}")
            return

        self.splitter = new_splitter
        self.chunk_size = new_size
        self.chunk_overlap = new_overlap
        print(f"Current splitter: name={self.splitter.__class__}, chunk_size={self.chunk_size}, chunk_overlap={self.chunk_overlap}")




In [3]:
import re
# Taken from https://medium.com/@gnkbhuvan/how-to-clean-text-data-for-rag-models-a-beginners-guide-8f62559f259c
def remove_headers_footers(text, header_patterns=None, footer_patterns=None):
    """Blank out lines that match any header or footer pattern.

    Each pattern is applied with ``re.MULTILINE``, so ``^`` and ``$`` anchor
    to individual lines. When a pattern list is not supplied, a single
    default pattern matching any line containing "Header" (respectively
    "Footer") is used. The result is stripped of surrounding whitespace.
    """
    headers = [r'^.*Header.*$'] if header_patterns is None else header_patterns
    footers = [r'^.*Footer.*$'] if footer_patterns is None else footer_patterns

    cleaned = text
    for line_regex in headers + footers:
        cleaned = re.sub(line_regex, '', cleaned, flags=re.MULTILINE)
    return cleaned.strip()

def remove_special_characters(text, special_chars=None):
    """Delete every match of ``special_chars`` from ``text`` and strip the result.

    With no pattern supplied, everything except ASCII letters, digits,
    whitespace and the punctuation ``. , ; : ' " ? ! -`` is removed.
    """
    pattern = special_chars if special_chars is not None else r'[^A-Za-z0-9\s\.,;:\'\"\?\!\-]'
    return re.sub(pattern, '', text).strip()

def remove_repeated_substrings(text, pattern=r'\.{2,}'):
    """Replace each match of ``pattern`` with a single '.' and strip the result.

    The default pattern matches runs of two or more consecutive dots.
    """
    return re.sub(pattern, '.', text).strip()

def remove_extra_spaces(text, preserve_paragraphs=False):
    """Collapse runs of whitespace in ``text``.

    Args:
        text: The string to normalise.
        preserve_paragraphs: When False (the default), every run of
            whitespace, newlines included, becomes a single space, so the
            result is one line. When True, blank-line separated paragraphs
            are kept apart by exactly one empty line and whitespace is
            collapsed only inside each paragraph.

    Returns:
        str: The normalised text without leading or trailing whitespace.
    """
    if preserve_paragraphs:
        # Split on blank lines, tidy each paragraph, and drop empty ones.
        paragraphs = (
            re.sub(r'\s+', ' ', paragraph).strip()
            for paragraph in re.split(r'\n\s*\n', text)
        )
        return '\n\n'.join(paragraph for paragraph in paragraphs if paragraph)

    # A single pass is enough: collapsing every whitespace run to one space
    # also absorbs any blank lines, so no separate newline step is needed.
    return re.sub(r'\s+', ' ', text).strip()

def preprocess_text(text):
    """Run ``text`` through the full cleaning pipeline and return the result.

    The steps run in a fixed order, each with its default arguments:
    header/footer removal, special-character removal, collapsing of
    repeated substrings such as dots, and whitespace normalisation.
    """
    cleaning_steps = (
        remove_headers_footers,
        remove_special_characters,
        remove_repeated_substrings,
        remove_extra_spaces,
        # Additional cleaning steps can be added here
    )

    cleaned = text
    for step in cleaning_steps:
        cleaned = step(cleaned)

    return cleaned.strip()


# Smoke test for the cleaning pipeline: a word followed by a long run of dots.
# The recorded output of this cell is "Hello ." (the dot run is collapsed to one).
example_text = """Hello ................"""

cleaned_text = preprocess_text(example_text)
print(cleaned_text)

Hello .


In [4]:
# Use the doc_path constant defined alongside the imports rather than repeating the literal path.
ingestor = DocIngestion(doc_path, chunk_size=600, chunk_overlap=100)
# chunks = ingestor.individual_ingest("assumptions.pdf")
book = ingestor.get_whole_pdf("assumptions.pdf")
# get_whole_pdf can return None; stop here rather than at the first book[...] lookup.
assert book is not None, "assumptions.pdf was not found in the documents folder"


Current splitter: name=<class 'langchain_text_splitters.character.RecursiveCharacterTextSplitter'>, chunk_size=600, chunk_overlap=100




In [12]:
# Spot-check the cleaning pipeline on one entry of the loaded PDF (list index 30;
# the recorded output shows this is the page labelled "Page 31").
preprocess_text(book[30].page_content)

"TESTING STATISTICAL ASSUMPTIONS 2012 Edition Copyright c 2012 by G. David Garson and Statistical Associates Publishing Page 31 multivariate normality exists. This approach does no t assure correct conclusions. Bivariate screening for multivariate normality A bivariate scatterplot for any pair of variables in the model should yield and oval -shaped array of points if both variables are linearly related and normally distributed. While a step up, this is considered an exploratory approach. Residuals test. One approach is to regress each variable in the model on all other variables in the model, then save the residuals. If all the residual variables are normally distributed ex., by Q -Q plots, acceptable skew and kurtosis, etc., then it is assumed that the data are multivariate normal. OUTLIERS Outlying observations can radically alter the outcome of analysis and are also violations of normality. Outliers arise from four different causes, requiring different courses of action: Errors of d

[Document(metadata={'producer': 'Adobe PDF Library 10.0', 'creator': 'Acrobat PDFMaker 10.1 for Word', 'creationdate': '2012-08-23T11:47:02-04:00', 'author': 'G. David Garson', 'category': 'Statistics', 'comments': '@c 2012 by G. David Garson and Statistical Associates Publishers. All rights reserved. No permission is granted to republish this material in any form or to post it on a web server.\nAll statistical procedures have underlying assumptions, some more stringent than others. In some cases, violation of these assumptions will not change substantive research conclusions. In other cases, violation of assumptions will undermine meaningful research. Establishing that one\'s data meet the assumptions of the procedure one is using is an expected component of all quantitatively-based journal articles, theses, and dissertations. \nFor all volumes in the Statistical Associates "blue book" series, the assumptions of each statistical procedure are indicated in an "Assumptions" section. Thi