## Imports

In [51]:
import os
import fitz  # PyMuPDF
import tiktoken
import chromadb
import numpy as np
from openai import OpenAI
from docx import Document
from chromadb.config import Settings
from langchain_openai import OpenAIEmbeddings
from chromadb.utils import embedding_functions
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores import Chroma

## Microservices

### Dynamic file reader

In [43]:
def detect_paragraph_delimiter(text_sample, possible_delimiters=("\n\n", "\n \n", "\n\t\n")):
    """
    Detects the most common paragraph delimiter in a text sample.

    Parameters:
    - text_sample (str): A sample of the text to analyze.
    - possible_delimiters (iterable of str): Candidate paragraph delimiters.
      Earlier candidates win ties.

    Returns:
    - The candidate that occurs most often in the sample, or None when no
      candidate occurs at all (or no candidates were supplied).
    """
    delimiter_counts = {
        delimiter: text_sample.count(delimiter) for delimiter in possible_delimiters
    }
    if not delimiter_counts:
        return None

    # max() returns the first maximal key, so ties resolve in candidate order.
    most_common = max(delimiter_counts, key=delimiter_counts.get)

    # A candidate that never appears has not been "detected"; let the caller
    # apply its own fallback instead of reporting a delimiter that is absent.
    if delimiter_counts[most_common] == 0:
        return None
    return most_common

def split_paragraph_into_overlapping_chunks(paragraph, token_limit=28000, overlap_size=1000):
    """
    Splits a paragraph into chunks of at most token_limit characters, where
    each chunk after the first starts overlap_size characters before the end
    of the previous one.

    Note: despite its name, token_limit is measured in characters, not tokens.

    Parameters:
    - paragraph (str): The paragraph to be split.
    - token_limit (int): The maximum number of characters per chunk.
    - overlap_size (int): The number of characters to overlap between chunks.

    Returns:
    - list: A list of overlapping text chunks (empty for an empty paragraph).

    Raises:
    - ValueError: If token_limit is not positive, or overlap_size is not
      smaller than token_limit. Either would stop the window from advancing.
    """
    if token_limit <= 0:
        raise ValueError("token_limit must be a positive number of characters")

    # Each new chunk starts this many characters after the previous one.
    stride = token_limit - overlap_size
    if stride <= 0:
        raise ValueError("overlap_size must be smaller than token_limit")

    chunks = []
    for start_index in range(0, len(paragraph), stride):
        end_index = start_index + token_limit
        chunks.append(paragraph[start_index:end_index])

        # This chunk already reaches the end of the paragraph; a further
        # chunk would only repeat the tail.
        if end_index >= len(paragraph):
            break

    return chunks

def split_text_into_paragraphs_and_chunks(text, token_limit=28000, overlap_size=1000):
    """
    Breaks text into paragraphs and returns them as a flat list of chunks.

    The paragraph delimiter is detected from the first 1000 characters of the
    text, falling back to "\n\n" when detection yields nothing. Paragraphs of
    at most token_limit characters are kept whole; longer ones are replaced by
    their overlapping sub-chunks.

    Parameters:
    - text (str): The full text to split.
    - token_limit (int): The maximum number of characters per chunk.
    - overlap_size (int): Characters shared between consecutive sub-chunks of
      an oversized paragraph.

    Returns:
    - list: The chunks, in document order.
    """
    # Only the head of the document is sampled for delimiter detection.
    delimiter = detect_paragraph_delimiter(text[:1000]) or "\n\n"

    all_chunks = []
    for paragraph in text.split(delimiter):
        if len(paragraph) <= token_limit:
            all_chunks.append(paragraph)
            continue
        # Oversized paragraph: cut it into overlapping windows.
        all_chunks.extend(
            split_paragraph_into_overlapping_chunks(paragraph, token_limit, overlap_size)
        )
    return all_chunks

def read_text_from_file(file_path):
    """
    Reads the text content of a .pdf, .docx or .txt file.

    The file type is chosen from the file extension, ignoring case.

    Parameters:
    - file_path (str or os.PathLike): The path to the file.

    Returns:
    - full_text (str): File content. PDF pages are concatenated as-is, DOCX
      paragraphs are joined with single spaces, and text files are returned
      verbatim.

    Raises:
    - ValueError: If the extension is not .pdf, .docx or .txt.
    """
    # Lower-case the path so that "REPORT.PDF" is treated like "report.pdf".
    normalized_path = os.fspath(file_path).lower()

    if normalized_path.endswith('.pdf'):
        with fitz.open(file_path) as doc:
            return ''.join(page.get_text() for page in doc)

    if normalized_path.endswith('.docx'):
        doc = Document(file_path)
        return ' '.join(paragraph.text for paragraph in doc.paragraphs)

    if normalized_path.endswith('.txt'):
        # Decode explicitly as UTF-8 rather than the platform default, so the
        # same file reads identically on every operating system.
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()

    raise ValueError("Unsupported file type")

def split_into_chunks(text, token_limit=28000):
    """
    Splits the given text into whitespace-normalized chunks of at most
    token_limit characters, breaking only between words.

    Despite its name, token_limit is a character count. The default is a
    character-based estimate, which is why it is larger than the 8191-token
    limit of the text-embedding-3-large model.

    Parameters:
    - text (str): The text to be split.
    - token_limit (int): The maximum number of characters per chunk.

    Returns:
    - list: A list of text chunks. A single word longer than token_limit is
      kept whole in a chunk of its own. Empty or whitespace-only text gives [].
    """
    chunks = []
    current_chunk = []
    current_length = 0  # length of ' '.join(current_chunk), kept incrementally

    for word in text.split():
        # One extra character for the joining space, except for the first word.
        candidate_length = current_length + len(word) + (1 if current_chunk else 0)

        # Only close the chunk when it holds something; otherwise an oversized
        # first word would produce an empty chunk.
        if current_chunk and candidate_length > token_limit:
            chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_length = len(word)
        else:
            current_chunk.append(word)
            current_length = candidate_length

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

### Tokenize and vector embedding

In [31]:
def num_tokens_from_string(string: str, encoding_name="cl100k_base") -> int:
    """
    Counts the tokens produced when a string is encoded with a tiktoken encoding.

    Parameters:
    - string (str): The text to measure.
    - encoding_name (str): Name of the tiktoken encoding to use.

    Returns:
    - int: The number of tokens in the encoded string.
    """
    return len(tiktoken.get_encoding(encoding_name).encode(string))

def get_embedding(text, model="text-embedding-3-large"):
    """
    Requests the embedding vector of a piece of text from the OpenAI API.

    Newlines in the text are replaced by spaces before the request is sent.
    A new OpenAI client is built on every call, using the API key stored in
    the OPENAI_API_KEY environment variable.

    Parameters:
    - text (str): The chunk of text to embed.
    - model (str): The embedding model to use.

    Returns:
    - The embedding of the first (and only) input in the API response.
    """
    openai_client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])
    single_line_text = text.replace("\n", " ")
    response = openai_client.embeddings.create(input=[single_line_text], model=model)
    first_result = response.data[0]
    return first_result.embedding

def populate_embeddings_to_chromadb(folder_path):
    """
    Embeds every readable file in a folder and stores the chunks in ChromaDB.

    Each regular file in folder_path is read, split into paragraph-based
    chunks, and each chunk is embedded and stored under the id
    "<filename>_<index>", with the index starting at 1. Sub-directories and
    files that cannot be read as text are skipped with a message, so one bad
    entry does not abort the rest of the folder. Progress is printed: the file
    name, the token count of every chunk, then the number and type of chunks.

    Parameters:
    - folder_path (str): Folder containing the files to ingest.

    Returns:
    - None
    """
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)

        # os.listdir also yields sub-directories; only regular files are read.
        if not os.path.isfile(file_path):
            continue

        # read_text_from_file opens the file itself, so no handle is opened
        # here. It raises ValueError for unsupported file types.
        try:
            file_content = read_text_from_file(file_path)
        except ValueError as e:
            print("Skipping file that could not be read as text: ", filename)
            print(e)
            continue

        text_chunks = split_text_into_paragraphs_and_chunks(file_content)

        print(filename)
        for index, chunk in enumerate(text_chunks, start=1):
            unique_id = f"{filename}_{index}"
            try:
                vector = get_embedding(chunk)
                store_vector_in_chromadb(vector, filename, chunk, unique_id)
            except Exception as e:
                # Best effort: report the failing chunk and carry on with the
                # remaining chunks of this file.
                print("Could not embed the text chunk for file (check token limit): ", filename)
                print(e)
            print(num_tokens_from_string(chunk))

        print(len(text_chunks))
        print(type(text_chunks))


### Save vector to Chromadb

In [32]:
def store_vector_in_chromadb(vector, filename, text, unique_id):
    """
    Adds one text chunk and its embedding to the "policy_files" collection.

    A persistent ChromaDB client is opened on ".chromadb/" on every call, and
    the collection is created with cosine distance if it does not exist yet.

    Parameters:
    - vector: The embedding of the chunk, passed to the collection unchanged.
    - filename (str): Stored as the "source" metadata of the chunk.
    - text (str): The chunk text, stored as the document.
    - unique_id (str): The id under which the chunk is stored.
    """
    db_settings = Settings(allow_reset=True)
    db_client = chromadb.PersistentClient(path=".chromadb/", settings=db_settings)

    policy_collection = db_client.get_or_create_collection(
        name="policy_files",
        metadata={"hnsw:space": "cosine"},
    )

    policy_collection.add(
        documents=[text],
        embeddings=vector,
        metadatas=[{"source": filename}],
        ids=[unique_id],
    )

### Vector query

In [41]:
def read_vector_in_chromadb(query):
    """
    Looks up the two stored chunks closest to a query.

    The query is embedded with get_embedding and the embedding is used to
    search the existing "policy_files" collection in the persistent ChromaDB
    store at ".chromadb/".

    Parameters:
    - query (str): The question or search text.

    Returns:
    - The result of the collection query, limited to 2 results.
    """
    db_settings = Settings(allow_reset=True)
    db_client = chromadb.PersistentClient(path=".chromadb/", settings=db_settings)
    policy_collection = db_client.get_collection(name="policy_files")

    query_embedding = get_embedding(query)
    return policy_collection.query(query_embeddings=query_embedding, n_results=2)


### OpenAI GPT-4 LLM

In [None]:
# Module-level OpenAI client and model names.
# The client is built when this cell runs; os.environ[...] raises KeyError if
# OPENAI_API_KEY is not set.
# NOTE(review): EMBEDDING_MODEL differs from the "text-embedding-3-large"
# default of get_embedding(), and none of these three names is referenced in
# the visible part of this file. Confirm which embedding model is intended
# before using this constant against the existing collection.
GPT_CLIENT = OpenAI(api_key=os.environ['OPENAI_API_KEY'])
EMBEDDING_MODEL = "text-embedding-ada-002"
GPT_MODEL = "gpt-4"

### `if __name__ == "__main__"`

In [50]:
# Driver cell: ingest every file in the data folder, then answer one query.
# Data files folder path (absolute Windows path; adjust for your machine)
folder_path = "D:\\My Projects\\Policy Chatbot\\rag-model\\files"

# NOTE(review): populate_embeddings_to_chromadb has no return statement, so
# text_chunks is always None here; the call is made for its side effects
# (embedding the files and storing them in ChromaDB).
text_chunks = populate_embeddings_to_chromadb(folder_path)
# Ask the user for a question and print the raw ChromaDB query result.
query = input("What would you like to know?")
print(read_vector_in_chromadb(query))


Interactions-with-HCPs-Policy-Field.pdf
337
695
181
130
278
125
181
310
192
145
118
7
235
152
308
575
652
366
1106
289
1516
208
931
13
328
675
30
34
155
276
54
204
35
62
220
77
68
1109
38
<class 'list'>


KeyboardInterrupt: Interrupted by user

#### Reset vector database

In [49]:
# Maintenance cell: wipes the persistent vector store at ".chromadb/".
# allow_reset=True is passed in the settings of the client that calls reset().
client = chromadb.PersistentClient(path=".chromadb/", settings=Settings(allow_reset=True))
client.heartbeat() # returns a nanosecond heartbeat. Useful for making sure the client remains connected.
client.reset() # Empties and completely resets the database. ⚠️ This is destructive and not reversible.

True