In [1]:
# Install required packages
%pip install pandas langchain langchain-community langchain-openai chromadb openai

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import os
from langchain_community.document_loaders import CSVLoader
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
import warnings
warnings.filterwarnings('ignore')

In [None]:

# Set your OpenAI API key
import os
os.environ["OPENAI_API_KEY"] = "OPENAI-API-KEY"  # Replace with your actual API key

# Define the path to your CSV files
dataset_dir = "../Dataset/diseases/"
csv_files = [f for f in os.listdir(dataset_dir) if f.endswith('.csv')]

In [6]:

# Initialize the OpenAI embeddings
embeddings = OpenAIEmbeddings()

# Process each CSV file
for csv_file in csv_files:
    file_path = os.path.join(dataset_dir, csv_file)
    print(f"Processing {csv_file}...")
    
    # Load the CSV file
    loader = CSVLoader(file_path=file_path)
    documents = loader.load()
    
    # Split the documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100
    )
    chunks = text_splitter.split_documents(documents)
    
    # Create a vector store
    db_name = f"chroma_{csv_file.split('.')[0]}"
    vectordb = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory=f"./chroma_db/{db_name}"
    )
    
    # Persist the vector store
    vectordb.persist()
    print(f"Created vector store for {csv_file} with {len(chunks)} chunks")

print("All CSV files processed and embedded successfully!")


Processing diseases_guideline - Sheet1.csv...
Created vector store for diseases_guideline - Sheet1.csv with 20 chunks
All CSV files processed and embedded successfully!


In [11]:
from langchain.schema import Document
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import CSVLoader
import os

# Define the path to your disease CSV files
disease_dataset_dir = "../Dataset/diseases/"
disease_csv_files = [f for f in os.listdir(disease_dataset_dir) if f.endswith('.csv')]

# Check where disease embeddings are stored
print("Disease embeddings are stored in the following directories:")
for csv_file in disease_csv_files:
    db_name = f"chroma_{csv_file.split('.')[0]}"
    db_path = f"./chroma_db/{db_name}"
    if os.path.exists(db_path):
        print(f"- {db_path}")
    else:
        print(f"- {db_path} (not created yet)")


Disease embeddings are stored in the following directories:


In [8]:
# Merge all three ChromaDB collections into a single collection
from langchain_community.vectorstores import Chroma
import shutil

# Define the source ChromaDB directories
source_dbs = [
    "chroma_dhan - Aman",
    "chroma_dhan - Aus",
    "chroma_dhan - Boro"
]

# Define the target merged ChromaDB directory
merged_db_dir = "./chroma_db/chroma_dhan_merged"

# Create a new merged ChromaDB
print("Merging ChromaDB collections...")

# Check if the merged directory already exists and remove it if it does
if os.path.exists(merged_db_dir):
    shutil.rmtree(merged_db_dir)
    print(f"Removed existing merged database at {merged_db_dir}")

# Initialize a new empty ChromaDB for the merged data
merged_db = Chroma(
    persist_directory=merged_db_dir,
    embedding_function=embeddings
)

# Iterate through each source database and add its documents to the merged database
total_docs = 0
for db_name in source_dbs:
    source_db_path = f"./chroma_db/{db_name}"
    
    # Check if the source database exists
    if not os.path.exists(source_db_path):
        print(f"Warning: Source database {source_db_path} does not exist. Skipping.")
        continue
    
    # Load the source database
    source_db = Chroma(
        persist_directory=source_db_path,
        embedding_function=embeddings
    )
    
    # Get all documents from the source database
    source_docs = source_db.get()
    
    if source_docs and len(source_docs['documents']) > 0:
        # Add documents to the merged database
        merged_db.add_documents(
            documents=[Document(page_content=doc) for doc in source_docs['documents']],
            embeddings=source_docs['embeddings'],
            metadatas=source_docs['metadatas'],
            ids=source_docs['ids']
        )
        
        doc_count = len(source_docs['documents'])
        total_docs += doc_count
        print(f"Added {doc_count} documents from {db_name}")
    else:
        print(f"No documents found in {db_name}")

# Persist the merged database
merged_db.persist()
print(f"Successfully merged {len(source_dbs)} databases into {merged_db_dir} with a total of {total_docs} documents")


Merging ChromaDB collections...


NameError: name 'Document' is not defined