In [1]:
import chromadb
chroma_client = chromadb.Client()

Running Chroma using direct local API.
Using DuckDB in-memory for database. Data will be transient.


In [9]:
import pandas as pd

df = pd.read_csv('./../data/top-1k-repos-with-embeddings.csv')

## Importing existing embeddings into Chroma

I already had embeddings stored in a CSV. Can easily bring them into Chroma.

In [17]:
import ast

repos = chroma_client.get_collection(name="my_repos") or chroma_client.create_collection(name="my_repos")

repos.add(
  embeddings=[ast.literal_eval(e) for e in df['embedding'].values.tolist()],
  metadatas=[{'description': d} for d in df['description'].values.tolist()],
  ids=df['github_id'].values.tolist()
)



## Using Chroma to embed my entries 

Creating a new collection that also takes care of embedding for me 

In [18]:
from openai.embeddings_utils import get_embedding
import tiktoken

embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # this the encoding for text-embedding-ada-002
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191

def embedding_function(text):
  encoding = tiktoken.get_encoding(embedding_encoding)
  if len(encoding.encode(text)) >= max_tokens:
    print("Text too long. Text: %s", text)
    return []
    
  return get_embedding(text, engine=embedding_model)


collection = chroma_client.create_collection(name="repos_embedder", embedding_function=embedding_function)

collection.add(
  documents=df['description'].values.tolist(),
  ids=df['github_id'].values.tolist(),
  metadatas=[{ 'score': row['Score'], 'id': row['id'], 'full_name': row['full_name'] } for _, row in df.iterrows()]
)  



TypeError: LocalAPI.create_collection() got an unexpected keyword argument 'embedding_function'