In [2]:
from embedding.embedding_models import EmbeddingModels
from utils.logger import logger

# Build the BGE embedding model once; later cells pass this same object to
# every vector store they create.
_embedding_models = EmbeddingModels()
bgeEmbedding = _embedding_models.get_bge_embedding()

[32m2025-01-12 @ 17:44:38[0m | [1mINFO    [0m | [36mutils.logger[0m:[36m<module>[0m:[36m56[0m - [1mLogger initialized successfully[0m
[32m2025-01-12 @ 17:44:48[0m | [1mINFO    [0m | [36membedding.embedding_models[0m:[36mget_bge_embedding[0m:[36m28[0m - [1mUsing device: mps[0m
[32m2025-01-12 @ 17:44:48[0m | [1mINFO    [0m | [36membedding.embedding_models[0m:[36mget_bge_embedding[0m:[36m29[0m - [1mLoading BGE embedding model: BAAI/bge-m3[0m
[32m2025-01-12 @ 17:44:53[0m | [1mINFO    [0m | [36membedding.embedding_models[0m:[36mget_bge_embedding[0m:[36m41[0m - [1mSuccessfully loaded BGE embedding model[0m


### Load Aging Care forum and Alzconnect forum data

*The Aging Care forum data needs to be cleaned first (text columns cast to strings, duplicate question/answer pairs removed) before it can be chunked.*

---

In [3]:
import pandas as pd

# Aging Care forum: read the raw JSON records, coerce the text columns to plain
# strings (stripping the surrounding list brackets from the stringified tags),
# and keep only the first occurrence of each (question, answer) pair.
agingcare_df = (
    pd.read_json(
        "../../data/raw_content/knowledge/aging-care-forum.json", orient="records"
    )
    .assign(
        question=lambda frame: frame["question"].astype(str),
        answer=lambda frame: frame["answer"].astype(str),
        tags=lambda frame: frame["tags"].astype(str).str.strip("[]"),
    )
    .drop_duplicates(subset=["answer", "question"], keep="first")
)

# Quick look at the cleaned columns that feed the chunking step.
print(agingcare_df[["title", "question", "answer", "tags"]].head())

# Alzconnect forum: already stored as parquet, loaded as-is.
alzconnect_df = pd.read_parquet(
    "../../data/raw_content/knowledge/alz-connect-forum.parquet"
)

# print(alzconnect_df.info())

                                               title  \
0           Should Someone With Dementia Be Driving?   
1           Should Someone With Dementia Be Driving?   
2           Should Someone With Dementia Be Driving?   
3  Early Diagnosis of Alzheimer’s Is Crucial for ...   
4  Early Diagnosis of Alzheimer’s Is Crucial for ...   

                                            question  \
0    When Should Someone With Dementia Stop Driving?   
1  Why is letting a senior with dementia drive da...   
2   How to Stop a Person With Dementia from Driving?   
3  How is caring for someone with Alzheimer’s dis...   
4  What does the Alzheimer's treatment journey lo...   

                                              answer  \
0  Driving is one of the most difficult issues th...   
1  Wandering or “elopement” is a common behavior ...   
2  Car keys are a symbol of independence for Amer...   
3  AD poses real challenges not only for the peop...   
4  While there are no treatments available tha

### Recursively split the documents into chunks

---

In [4]:
from langchain.text_splitter import (
    # CharacterTextSplitter,
    # MarkdownHeaderTextSplitter,
    # MarkdownTextSplitter,
    RecursiveCharacterTextSplitter,
)
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document


# Shared splitter: chunks of up to 1000 characters with 200 characters of overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)


def chunk_forum_posts(posts_df, text_columns, source_column, source_tag):
    """Split every row of a forum DataFrame into LangChain ``Document`` chunks.

    For each row, the values of ``text_columns`` are joined with a single space
    and passed through the notebook-level ``text_splitter``; every resulting
    chunk becomes one ``Document``.

    Args:
        posts_df: DataFrame with one forum post per row. It must provide a
            ``title`` column, ``source_column`` and every column named in
            ``text_columns``.
        text_columns: Names of the string columns whose values are joined
            (in order) to form the text that is split.
        source_column: Name of the column stored under the ``"source"``
            metadata key.
        source_tag: Constant stored under the ``"source-tag"`` metadata key to
            identify which forum the chunk came from.

    Returns:
        list[Document]: All chunks, in row order.

    Raises:
        AttributeError: If a required column is missing from ``posts_df``.
        TypeError: If one of the ``text_columns`` values is not a string.
    """
    chunks = []
    for post in posts_df.itertuples():
        combined_text = " ".join(getattr(post, column) for column in text_columns)
        for chunk in text_splitter.split_text(combined_text):
            chunks.append(
                Document(
                    page_content=chunk,
                    metadata={
                        "source": getattr(post, source_column),
                        "title": post.title,
                        # Optional columns: fall back to "Unknown" when the
                        # DataFrame has no such column.
                        "author": getattr(post, "author", "Unknown"),
                        "tag": getattr(post, "tags", "Unknown"),
                        "source-tag": source_tag,
                    },
                )
            )
    return chunks


logger.info("Chunking Aging Care forum data")
agingcare_document_chunks = chunk_forum_posts(
    agingcare_df,
    text_columns=("question", "answer"),
    source_column="url",
    source_tag="agingcare",
)

logger.info("Chunking Alzconnect forum data")
alzconnect_document_chunks = chunk_forum_posts(
    alzconnect_df,
    text_columns=("title", "text"),
    source_column="source",
    source_tag="alzconnect",
)

# Merge all chunks (Aging Care first, then Alzconnect).
peer_support_document_chunks = agingcare_document_chunks + alzconnect_document_chunks

print(len(agingcare_document_chunks))

[32m2025-01-12 @ 17:44:59[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m20[0m - [1mChunking Aging Care forum data[0m
[32m2025-01-12 @ 17:44:59[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mChunking Alzconnect forum data[0m
5014


### Generate vector store for peer support knowledgebase

In [5]:
# Create a Chroma vector store
# TODO Import function from vector_store.py that builds a Chroma vector store

import os
import shutil
from embedding.vector_store import VectorStore

vectorstore_path = "../../data/vector_database/peer_kb"
force_rebuild = True


def build_peer_support_vectorstore(
    vectorstore_path: str,
    force_rebuild: bool,
    documents=None,
    embedding=None,
):
    """Build (or reuse) the persisted Chroma store for the peer-support chunks.

    Args:
        vectorstore_path: Directory in which Chroma persists the store.
        force_rebuild: When True, an existing directory at ``vectorstore_path``
            is deleted and the store is rebuilt from ``documents``. When False
            and a non-empty directory already exists, that store is opened and
            returned without embedding anything again.
        documents: Chunks to index. Defaults to the notebook-level
            ``peer_support_document_chunks``.
        embedding: Embedding model to use. Defaults to the notebook-level
            ``bgeEmbedding``.

    Returns:
        The Chroma vector store, either freshly built or loaded from disk.

    Raises:
        Exception: Any error raised while building the store is logged and
            re-raised unchanged.
    """
    if documents is None:
        documents = peer_support_document_chunks
    if embedding is None:
        embedding = bgeEmbedding

    store_exists = os.path.isdir(vectorstore_path) and bool(os.listdir(vectorstore_path))

    if store_exists and not force_rebuild:
        # Calling from_documents on an existing persist directory would add
        # every chunk a second time, so reuse what is already on disk instead.
        logger.info(f"Vector store {vectorstore_path} already exists and force_rebuild is False. Loading existing store...")
        return Chroma(
            persist_directory=vectorstore_path,
            embedding_function=embedding,
        )

    if os.path.isdir(vectorstore_path) and force_rebuild:
        # Log before deleting so the message is accurate at the time it is written.
        logger.info(f"Vector store {vectorstore_path} already exists and force_rebuild is True. Rebuilding...")
        shutil.rmtree(vectorstore_path)

    try:
        vectorstore = Chroma.from_documents(
            documents=documents,
            embedding=embedding,
            persist_directory=vectorstore_path,
        )
    except Exception as e:
        logger.error(f"Failed to build vector store: {str(e)}")
        raise

    logger.info(f"Vector store built successfully at {vectorstore_path}")
    return vectorstore

# build_peer_support_vectorstore(vectorstore_path, force_rebuild)

# peer_support_kb =VectorStore().build_chroma_vectorstore(
#     docs=agingcare_document_chunks,
#     embedding_model=bgeEmbedding,
#     collection_name="peer_support_knowledgebase",
#     vectorstore_path="../../data/vector_database/peer_kb",
#     force_rebuild=True
# )


--- 

### Process research paper data

Load the PubMed data and process it. For each document, we flatten the authors array and convert it to a string, since an array is not a valid LangChain document metadata type.

And then we chunk it. 

In [None]:
# Read pubmed data
import json
from utils.tools import ToolKits

# Project helper; used below to turn each record's author list into a string.
tools = ToolKits()

pubmed_path = "../../data/raw_content/knowledge/pubmed-central-delirium-family-caregiving.json"
# Read as UTF-8 explicitly: without it, open() falls back to the platform's
# locale encoding and non-ASCII characters can be mis-decoded or raise.
with open(pubmed_path, 'r', encoding="utf-8") as file:
    pubmed_data = json.load(file)

# Chunk pubmed data
pubmed_document_chunks = []

def remove_square_brackets(text: str) -> str:
    """Return ``text`` with every ``[`` and ``]`` character removed."""
    bracket_remover = str.maketrans("", "", "[]")
    return text.translate(bracket_remover)

# Chunk each PubMed record independently: a malformed record is logged and
# skipped instead of aborting the loop and silently dropping every record
# that comes after it.
skipped_pubmed_docs = 0
for doc in pubmed_data:
    try:
        abstract = doc['abstract']
        # Skip records without an abstract (empty string or missing value).
        if not abstract:
            continue

        # Metadata is the same for every chunk of a record, so compute it once.
        doc_metadata = {
            "source": doc['url'],
            "title": doc['title'],
            "author": tools.straight_array_to_string(doc['authors']),
            "tag": '',
            "source-tag": "pubmed",
        }

        # Collect into a local list first so a failure part-way through a
        # record never leaves a partial set of its chunks in the result.
        doc_chunks = [
            Document(page_content=chunk, metadata=dict(doc_metadata))
            for chunk in text_splitter.split_text(doc['title'] + " " + abstract)
            if chunk != ""
        ]
    except Exception as e:
        skipped_pubmed_docs += 1
        logger.error(f"Failed to chunk pubmed data: {str(e)}")
        continue

    pubmed_document_chunks.extend(doc_chunks)

if skipped_pubmed_docs:
    logger.error(f"Skipped {skipped_pubmed_docs} pubmed document(s) that could not be chunked")

print(len(pubmed_document_chunks))
# Only show a sample when there is one; indexing an empty list would raise.
if pubmed_document_chunks:
    print(pubmed_document_chunks[0])

### Build vector store for pubmed data

Use the keyword "delirium" to test the vector store.

In [None]:

from embedding.vector_store import VectorStore

# Instantiate the project's vector-store helper and print its repr as a quick
# sanity check that it was created.
vs = VectorStore()
print(vs)

# Build the research knowledge base from the PubMed chunks with the shared BGE
# embedding model. No collection_name is passed (the "research_knowledgebase"
# override stays disabled), so the helper's own default applies.
pubmed_kb = vs.build_chroma_vectorstore(
    docs=pubmed_document_chunks,
    embedding_model=bgeEmbedding,
    # collection_name="research_knowledgebase",
    vectorstore_path="../../data/vector_database/research_kb",
    force_rebuild=True,
)

# Smoke test: query the new store with a keyword from the corpus topic.
print(pubmed_kb.similarity_search("delirium"))