## Embedding the data and persisting it in a Chroma vector database

In [None]:
# imports
import pandas as pd
import chromadb
from openai import OpenAI
from dotenv import load_dotenv

<h4 style="color:#3f79ffff; margin:0; font-weight:600;">First, I set up the clients I need: OpenAI for creating the embeddings and Chroma for persisting them.</h4>

In [None]:
# Load variables from a .env file into the environment before any client is
# created (presumably this is where the OpenAI API key comes from — confirm).
load_dotenv()

# Clients used by the rest of the notebook:
#   client        - OpenAI client, used further down to create embeddings
#   chroma_client - Chroma client with on-disk persistence
# NOTE(review): no path is passed to PersistentClient, so Chroma's default
# storage location applies — confirm where that ends up relative to this notebook.
client = OpenAI()
chroma_client = chromadb.PersistentClient()

<h4 style="color:#3f79ffff; margin:0; font-weight:600;">Next, I load the previously created CSV with the cleaned data and create a collection in ChromaDB where the embeddings will be stored.</h4>

In [5]:
# Load the chunked transcripts. The embedding loop later in the notebook reads
# the columns chunk_id, transcript, call_id, date, duration and speaker.
# The path is relative, so it depends on the notebook's working directory.
df = pd.read_csv("../data/phone_transcripts_v0.1_chunks.csv")

In [6]:
# Get the collection that will hold the embeddings, creating it on first run;
# get_or_create keeps this cell safe to re-run against an existing database.
collection = chroma_client.get_or_create_collection("phone_transcripts_v0.1")

<h4 style="color:#3f79ffff; margin:0; font-weight:600;">Now I loop through all CSV rows and create the embeddings with the OpenAI "text-embedding-3-large" model. Within the loop, I add the transcripts to the ChromaDB collection and attach the other columns as metadata, which will help with more advanced searches later.</h4>

In [7]:
# Embed every transcript chunk with OpenAI and persist it in Chroma.
EMBEDDING_MODEL = "text-embedding-3-large"
# Rows per embeddings request / Chroma write. Batching avoids one network
# round-trip per row; lower this if individual chunks are very long.
BATCH_SIZE = 100

# Rows without usable text cannot be embedded, so filter them out up front
# instead of failing half-way through after earlier rows were already paid for.
has_text = df["transcript"].notna() & df["transcript"].astype(str).str.strip().ne("")
chunks = df[has_text]
skipped = len(df) - len(chunks)
if skipped:
    print(f"Skipping {skipped} row(s) with an empty transcript")

# Chroma requires string IDs; pandas may have parsed chunk_id as a number.
chunk_ids = chunks["chunk_id"].astype(str)
if not chunk_ids.is_unique:
    duplicates = chunk_ids[chunk_ids.duplicated()].unique().tolist()
    raise ValueError(f"Duplicate chunk_id values in CSV: {duplicates[:10]}")

for start in range(0, len(chunks), BATCH_SIZE):
    batch = chunks.iloc[start:start + BATCH_SIZE]
    texts = batch["transcript"].astype(str).tolist()
    ids = chunk_ids.iloc[start:start + BATCH_SIZE].tolist()

    # Remaining columns become metadata for filtered searches later on.
    metadatas = [
        {
            "call_id": str(row["call_id"]),
            "date": str(row["date"]),
            "duration": str(row["duration"]),
            "speaker": row["speaker"],
        }
        for _, row in batch.iterrows()
    ]

    # One request embeds the whole batch; sort by `index` so each embedding
    # is guaranteed to line up with the text it was created from.
    response = client.embeddings.create(model=EMBEDDING_MODEL, input=texts)
    embeddings = [
        item.embedding
        for item in sorted(response.data, key=lambda item: item.index)
    ]

    # upsert (rather than add) keeps the cell idempotent: re-running it
    # refreshes existing chunks instead of leaving stale entries in place.
    collection.upsert(
        ids=ids,
        embeddings=embeddings,
        documents=texts,
        metadatas=metadatas,
    )

print("All chunks saved in Chroma")


All chunks saved in Chroma
