In [None]:

import os

import libs.io
from libs.agents import Agent, AgentConfig, create_agent, invoke_agent
from libs.ai import *
from libs.base import Directories
from libs.graphs import run_team_workflow
from libs.io import read_text, write_text
from security.apis import APIS

# Resolve the project's directory layout and derive the log file path from it.
dirs = Directories()
log_file_path = os.path.join(dirs.logs, 'log.py')

# print_heading("Available APIs and Models",'green')
# eprint(MODELS)
# print("\n")

# Show the names of the registered agent personas.
# NOTE(review): print_heading, eprint, print_dict and AGENTS are not imported by
# name in this cell - presumably they arrive via `from libs.ai import *`; confirm.
print_heading("Available Agent Personas", 'green')
eprint(list(AGENTS.keys()))
print("\n")

# Dump every attribute stored on the Directories instance.
print_heading("Project Directories", 'green')
print_dict(vars(dirs), 'green')



# from openai import OpenAI
# from langchain_core.prompts import PromptTemplate
# from langchain_openai import OpenAI
# import langgraph


# from langchain_openai import ChatOpenAI
# from langchain_core.messages import HumanMessage
# from langgraph.graph import END, MessageGraph

# from langchain.vectorstores import FAISS
import chromadb
# from chromadb import Client as VectorDBClient
from langchain_community.vectorstores import Chroma

from langchain_community.document_loaders import TextLoader,PyMuPDFLoader
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_text_splitters import CharacterTextSplitter


from langchain_nvidia_ai_endpoints import ChatNVIDIA
from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings





In [None]:

# Load the model registry (models.json) and list the models configured per API.
config_root = "ELH/code/configs/"

# NOTE(review): this call needs the package name `libs` to be bound in the
# notebook namespace (i.e. `import libs.io`), not just names imported *from* it.
# NOTE(review): `file_open` is assumed to return a dict-like object keyed by API
# name, each entry carrying a 'models' field - confirm against libs.io.
model_config = libs.io.file_open(f"{config_root}models.json")

print("Available APIs and Models")
print("--------------------------")
# _=[print(x) for x in list(model_config.keys())]
# A plain loop replaces the former list comprehension, which was evaluated only
# for its print side effect and overwrote IPython's `_` with a list of None.
for api_name in model_config.keys():
    print(f"{api_name}:{model_config[api_name]['models']}")


In [None]:
#### CONFIGURE THE RAG SYSTEM

# Provider whose entry in `model_config` supplies the API key.
api = 'nvidia'
# api_key = model_config[api]['key']
# Index the provider directly so an unknown `api` fails with a KeyError naming
# it (instead of an AttributeError on None); a missing 'key' still yields None.
api_key = model_config[api].get('key')


# Chat model name passed to the LLM client, and the embedding model name.
model_name = "mistralai/mixtral-8x7b-instruct-v0.1"
# embedder_name = "NV-Embed-QA"
embedder_name = "all-MiniLM-L6-v2"


# Where the vector database is persisted, and where source documents are read from.
dir_vector_db = "data/vector_db"
dir_project = "dawgpyl"
# The separator after the project directory is required: without it the path
# collapsed to "dawgpyldata/docs/".
dir_filestore = f"{dir_project}/data/docs/"
DOCS_DIR = os.path.abspath(dir_filestore)



In [None]:
# Smoke test: send one question to the configured chat model and print the
# reply, to confirm the endpoint and API key work before building the RAG flow.
from langchain_nvidia_ai_endpoints import ChatNVIDIA

llm = ChatNVIDIA(
    model=model_name,
    nvidia_api_key=api_key,
    max_tokens=1024,
)

smoke_test_question = "Can I run multiple docker containers on the NVIDIA API?"
result = llm.invoke(smoke_test_question)
print(result.content)

#### Test the vector embeddings in chromadb using the langchain wrapper

In [None]:
# Build the two helpers the vector database needs: an embedding function and a
# splitter that cuts loaded documents into chunks.

# Chunking parameters, kept in a dict so they can be inspected or reused later.
text_splitter_config = dict(
    chunk_size=2500,
    chunk_overlap=250,
    separator=" ",
)
text_splitter = CharacterTextSplitter(**text_splitter_config)

# Open-source sentence-transformer embedding function, selected by `embedder_name`.
document_embedder = SentenceTransformerEmbeddings(model_name=embedder_name)


In [None]:

### Spare attempts at creating a chromadb client

# vector_db_client = chromadb.PersistentClient(path=vector_db_dir)
# print(type(vector_db_client))
# vector_db_client

# chroma_client = VectorDBClient()
# collection_name = 'science'
# collection = chroma_client.create_collection(collection_name)


In [None]:

######## LOAD OR BUILD THE VECTOR DATABASE
# If `dir_vector_db` already exists it is reopened as a persisted Chroma store;
# otherwise every PDF under `dir_filestore` is loaded, chunked, embedded and
# written to a new store at that location.
if os.path.exists(dir_vector_db):
    ######## LOAD THE VECTOR DATABASE
    print("VECTOR DB ALREADY EXISTS")
    print(f"LOADING: {dir_vector_db}")
    vector_db = Chroma(persist_directory=dir_vector_db, embedding_function=document_embedder)

else:
    ####### CREATE THE VECTOR DATABASE IF IT DOESN'T ALREADY EXIST

    # Stays None until the first PDF has been embedded; this replaces the old
    # `idx == 0` test, which broke when the first directory entry was not a PDF.
    vector_db = None

    # Sorted so the ingestion order does not depend on os.listdir's ordering.
    knowledge_dir_files = sorted(os.path.join(dir_filestore, x) for x in os.listdir(dir_filestore))
    for document_filepath in knowledge_dir_files:
        # Only PDFs are supported. Skip everything else so a stale (or undefined)
        # `pages` from a previous iteration is never embedded a second time.
        if not document_filepath.lower().endswith('.pdf'):
            continue

        document_loader = PyMuPDFLoader(document_filepath)
        pages = document_loader.load()

        # Split the text into chunks smaller than a maximum size
        pages_split = text_splitter.split_documents(pages)
        if not pages_split:
            # Nothing to embed for this file (e.g. no extractable text).
            continue

        # Embed the chunks and save them in the Chroma vector database
        if vector_db is None:
            vector_db = Chroma.from_documents(pages_split, document_embedder, persist_directory=dir_vector_db)
        else:
            # add_documents is an instance method and returns the new ids, so it
            # is called on the existing store and its result is not assigned.
            vector_db.add_documents(pages_split)

    if vector_db is None:
        raise FileNotFoundError(f"No PDF documents with extractable text found in: {dir_filestore}")
    # print(type(vector_db))


# List the top-level fields returned by the store's `get()` call.
for field_name in vector_db.get().keys():
    print(field_name)


The `CharacterTextSplitter` appears to split only lines that are longer than 400 characters.  <br>
I'm not sure what the implications of this are, but that is okay for now.

In [None]:
#### Apparently I re-wrote the 'text_splitter.split_documents' function... OH BOY

# all_page_text = []
# page_nums = []


# for idx,page in enumerate(pages):
#     page_dict = page.dict()
#     page_text = [page_dict['page_content']]
#     all_page_text.extend(page_text)
#     page_nums.extend([str(page_dict['metadata']['page'])])

# del pages

# text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=0, separator=" ")
# pages_split = []
# pages_split_metadata = []

# for idx, page_text in enumerate(all_page_text):
#     splits = text_splitter.split_text(page_text)
#     pages_split.extend(splits)
#     pages_split_metadata.extend([{"page_num": page_nums[idx]}] * len(splits))




In [None]:
######## QUERY THE VECTOR DATABASE

# Earlier example questions, kept for reference:
# query = "Who is Donald Hebb?"
# query = "What are some symptoms of borderline personality disorder?"
query = "What is a machine learning model?"

# query_results = chroma_db.similarity_search(query)
# Retrieve the top five matching chunks, each paired with the score returned by
# the vector store.
query_results = vector_db.similarity_search_with_score(query, k=5)

# Header: which database is searched and with what query.
print(
    f"SEARCHING VECTOR DATABASE: {dir_vector_db}",
    "",
    f"QUERY: {query}",
    "",
    sep="\n",
)

# One report per hit: score, source file, page number, then the chunk text.
for result, score in query_results:
    result = result.dict()
    result_meta = result['metadata']

    print(
        "==================================================",
        "",
        f"SIMILARITY SCORE: {score}",
        f"Source: {result_meta['source']}",
        f"Page: {result_meta['page']}",
        "",
        result['page_content'],
        "",
        sep="\n",
    )



### **END RAG**