In [1]:
import os
from pyprojroot import here
import pandas as pd
import chromadb
from openai import AzureOpenAI
import warnings
warnings.filterwarnings("ignore")
from dotenv import load_dotenv
print(load_dotenv())

True


In [2]:
azure_openai_api_key = os.environ["OPENAI_API_KEY"]
azure_openai_endpoint = os.environ["OPENAI_API_BASE"]

In [3]:
azure_client = AzureOpenAI(
  api_key = azure_openai_api_key,  
  api_version = "2023-07-01-preview",
  azure_endpoint = azure_openai_endpoint
)
chroma_client = chromadb.PersistentClient(path=str(here("data/chroma")))

**Create a collection for data injection**

Throws an error if the table already exists

In [7]:
collection = chroma_client.create_collection(name="titanic_small")

UniqueConstraintError: Collection titanic_small already exists

In [8]:
file_dir = here("data/for_upload/titanic_small.csv")
df = pd.read_csv(file_dir, nrows=5)

In [9]:
df

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35,0,0,8.05


NOTE: Process in chunks if dataset is big.

In [10]:
docs = []
metadatas = []
ids = []
embeddings = []
for index, row in df.iterrows():
    output_str = ""
    # Treat each row as a separate chunk
    for col in df.columns:
        output_str += f"{col}: {row[col]},\n"
    response = azure_client.embeddings.create(
        input = output_str,
        model= "text-embedding-ada-002"
    )
    embeddings.append(response.data[0].embedding)
    docs.append(output_str)
    metadatas.append({"source": "titanic_small"})
    ids.append(f"id{index}")

In [11]:
docs

['Survived: 0,\nPclass: 3,\nName: Mr. Owen Harris Braund,\nSex: male,\nAge: 22,\nSiblings/Spouses Aboard: 1,\nParents/Children Aboard: 0,\nFare: 7.25,\n',
 'Survived: 1,\nPclass: 1,\nName: Mrs. John Bradley (Florence Briggs Thayer) Cumings,\nSex: female,\nAge: 38,\nSiblings/Spouses Aboard: 1,\nParents/Children Aboard: 0,\nFare: 71.2833,\n',
 'Survived: 1,\nPclass: 3,\nName: Miss. Laina Heikkinen,\nSex: female,\nAge: 26,\nSiblings/Spouses Aboard: 0,\nParents/Children Aboard: 0,\nFare: 7.925,\n',
 'Survived: 1,\nPclass: 1,\nName: Mrs. Jacques Heath (Lily May Peel) Futrelle,\nSex: female,\nAge: 35,\nSiblings/Spouses Aboard: 1,\nParents/Children Aboard: 0,\nFare: 53.1,\n',
 'Survived: 0,\nPclass: 3,\nName: Mr. William Henry Allen,\nSex: male,\nAge: 35,\nSiblings/Spouses Aboard: 0,\nParents/Children Aboard: 0,\nFare: 8.05,\n']

In [12]:
print(metadatas)
print(ids)

[{'source': 'titanic_small'}, {'source': 'titanic_small'}, {'source': 'titanic_small'}, {'source': 'titanic_small'}, {'source': 'titanic_small'}]
['id0', 'id1', 'id2', 'id3', 'id4']


In [13]:
embeddings[0][:10]

[-0.005509279202669859,
 -0.019893482327461243,
 -0.017736192792654037,
 -0.025068243965506554,
 0.010083277709782124,
 0.034653160721063614,
 -0.013264596462249756,
 -0.0008772520232014358,
 -0.024890746921300888,
 -0.0176952313631773]

In [14]:
collection.add(
    documents=docs,
    metadatas=metadatas,
    embeddings=embeddings,
    ids=ids
)

NameError: name 'collection' is not defined

Verify the vectorDB creation

In [15]:
print("Number of vectors in vectordb:", collection.count())

NameError: name 'collection' is not defined

### RAG

In [16]:
from openai import AzureOpenAI

In [17]:
model_name = "gpt-35-turbo"
azure_openai_api_key = os.environ["OPENAI_API_KEY"]
azure_openai_endpoint = os.environ["OPENAI_API_BASE"]

In [18]:
azure_client = AzureOpenAI(
    api_version=os.getenv("OPENAI_API_VERSION"),
    azure_endpoint=os.getenv("OPENAI_API_BASE"),
    api_key=os.getenv("OPENAI_API_KEY"),
)

**Perform similarity search**

In [19]:
query_texts = "what's the average age of survivors"
response = azure_client.embeddings.create(
        input = query_texts,
        model= "text-embedding-ada-002"
    )
query_embeddings = response.data[0].embedding

**Load the chromaDB collection for vector search**

In [None]:
vectordb = chroma_client.get_collection(name="titanic_small")
vectordb.count()

30

In [26]:
results = vectordb.query(
    query_embeddings = query_embeddings,
    n_results=1 #top_k
)

results

NameError: name 'vectordb' is not defined

Pass the results to an LLM

In [22]:
system_role = "You will recieve the user's question along with the search results of that question over a database. Give the user the proper answer."
prompt = f"User's question: {query_texts} \n\n Search results:\n {results}"

messages = [
    {"role": "system", "content": str(
        system_role
        )},
    {"role": "user", "content": prompt}
]

NameError: name 'results' is not defined

In [23]:
response = azure_client.chat.completions.create(
    model=os.getenv("gpt_deployment_name"),
    messages=messages
)

NameError: name 'messages' is not defined

In [24]:
response.choices[0].message.content

AttributeError: 'CreateEmbeddingResponse' object has no attribute 'choices'

**Fact check**

In [25]:
df

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35,0,0,8.05
