# chroma_db
> ChromaDB application for embedding text

In [1]:
# |default_exp chroma_db

In [2]:
# | hide
from nbdev.showdoc import *
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"

## Install dependencies

## Make an app with Gradio

In [3]:
# |export
import chromadb
from chromadb.config import Settings
from dotenv import load_dotenv
import os
import ollama
import numpy as np
import hashlib

This is a simple RAG chatbot built on top of Llama Index and Gradio. It allows you to upload any text or PDF files and ask questions about them!
Before running this, make sure you have exported your OpenAI API key as an environment variable:

```bash
export OPENAI_API_KEY="mykey"
```

In [4]:
# |export
load_dotenv()

True

In [5]:
oai_key = os.getenv("GEMINI_API_KEY")
print(oai_key)

AIzaSyDuuJSGLUjITEB_w4JQkhmBg10j6nHUj5I


In [6]:
http_proxy = os.getenv("HTTP_PROXY")
https_proxy = os.getenv("HTTPS_PROXY")
print(http_proxy, https_proxy)

# os.environ["HTTP_PROXY"] = ''
# os.environ["HTTPS_PROXY"] = ''
# http_proxy = os.getenv("HTTP_PROXY")
# https_proxy = os.getenv("HTTPS_PROXY")
# print(http_proxy, https_proxy)

http://127.0.0.1:20171 http://127.0.0.1:20171


In [15]:
# |export
# client = chromadb.Client(Settings(chroma_db_impl="duckdb+parquet",persist_directory="db/"))
documents = [
  "Llamas are members of the camelid family meaning they're pretty closely related to vicuñas and camels",
  "Llamas were first domesticated and used as pack animals 4,000 to 5,000 years ago in the Peruvian highlands",
  "Llamas can grow as much as 6 feet tall though the average llama between 5 feet 6 inches and 5 feet 9 inches tall",
  "Llamas weigh between 280 and 450 pounds and can carry 25 to 30 percent of their body weight",
  "Llamas are vegetarians and have very efficient digestive systems",
  "Llamas live to be about 20 years old, though some only live for 15 years and others live to be 30 years old",
]
# client = chromadb.PersistentClient(path="../db", settings=Settings(allow_reset=True))
client = chromadb.EphemeralClient()
try:
  collection = client.get_or_create_collection(name="test_collection")
  # client.reset()
  # client.delete_collection(name="test_collection2")
except Exception as e:
  print(e)

# collection = client.create_collection(name="test_collection")
client.list_collections()

['test_collection']

In [12]:
# |export
student_info = """
Alexandra Thompson, a 19-year-old computer science sophomore with a 3.7 GPA,
is a member of the programming and chess clubs who enjoys pizza, swimming, and hiking
in her free time in hopes of working at a tech company after graduating from the University of Washington.
"""

club_info = """
The university chess club provides an outlet for students to come together and enjoy playing
the classic strategy game of chess. Members of all skill levels are welcome, from beginners learning
the rules to experienced tournament players. The club typically meets a few times per week to play casual games,
participate in tournaments, analyze famous chess matches, and improve members' skills.
"""

university_info = """
The University of Washington, founded in 1861 in Seattle, is a public research university
with over 45,000 students across three campuses in Seattle, Tacoma, and Bothell.
As the flagship institution of the six public universities in Washington state,
UW encompasses over 500 buildings and 20 million square feet of space,
including one of the largest library systems in the world.
"""

documents.extend([student_info, club_info, university_info])

In [16]:
#| export
for i, d in enumerate(documents):
    response = ollama.embed(model='bge-m3',input=d)
    embeddings = response.embeddings
    print(np.linalg.norm(response.embeddings),response.embeddings)
    collection.add(
        ids=[hashlib.sha256(d.encode()).hexdigest()],
        metadatas=[{"text": d[:15]}],
        embeddings=embeddings,  # type: ignore
        documents=[d],
    )

1.000000270653921 [[-0.03653153, 0.0049295872, -0.051388804, 0.00925592, -0.022587107, -0.07063023, 0.02079605, 0.03403494, -0.028162213, 0.0056333444, -0.048977707, 0.014629725, -0.041148596, 0.003836173, -0.015667222, -0.0013165281, -0.008164036, -0.006001229, -0.00097603904, -0.029270856, -0.02428818, 0.014002893, -0.03326469, -0.028342912, -0.017684218, 0.019690303, -0.036894836, 0.016331632, 0.0014821631, -0.041890513, 0.018319702, -0.009291583, -0.005093488, -0.0071026795, -0.041731633, -0.019618254, -0.0059820926, 0.051778756, -0.019840999, -0.0111655975, -0.03327436, -0.027209193, -0.011235171, -0.028525027, 0.0045002243, 0.014080402, -0.03373001, -0.034068793, -0.021335745, -0.02243444, 0.008387012, -0.015821738, 0.040808845, 0.013840043, -0.035178285, -0.0091792, -0.021001507, -0.029622288, -0.0654441, 0.030479422, -0.009204984, 0.02971195, -0.01964115, -0.042733982, -0.0015723933, 0.0742553, -0.040040866, -0.00017652386, -0.029360749, -0.007705113, -0.004696732, -0.002355852

In [17]:
# |export
# an example input
# request = "What animals are llamas related to?"
request = "What is the club name?"

# generate an embedding for the input and retrieve the most relevant doc
response = ollama.embed(
  model="bge-m3",
  input=request
)
results = collection.query(
  query_embeddings=response["embeddings"],
  n_results=6
)
# data = results['documents'][0][0]

In [18]:
results['documents'][0][0] if results['documents'] else None
results['ids'][0][0] if results['ids'] else None
results['metadatas'][0][0] if results['metadatas'] else None

"Llamas are members of the camelid family meaning they're pretty closely related to vicuñas and camels"

'4b9fdd5d73ae31dacf894e7c21df9efc39eacccc03b1527ecfa44e5128d774b2'

{'text': 'Llamas are memb'}

In [19]:

results['documents'][0][1] if results['documents'] else None
results['ids'][0][1] if results['ids'] else None
results['metadatas'][0][1] if results['metadatas'] else None

'Llamas live to be about 20 years old, though some only live for 15 years and others live to be 30 years old'

'd35d12d03c2a83f66d63e018cf2644d2862588352d641279a535e9fbedef6f13'

{'text': 'Llamas live to '}

In [None]:
# | hide
import nbdev

nbdev.nbdev_export()