In [None]:
import os
from openai import OpenAI

token = os.environ["GITHUB_TOKEN"]
# print(token)
# print(os.environ["OPENAI_API_KEY"])
endpoint = "https://models.inference.ai.azure.com"
model_name = "gpt-4o-mini"

client = OpenAI(
    base_url=endpoint,
    api_key=token,
)

response = client.chat.completions.create(
    messages=[
        {
            "role": "system",
            "content": "You are a helpful assistant.",
        },
        {
            "role": "user",
            "content": "What is the capital of England?",
        }
    ],
    temperature=0,
    top_p=1.0,
    max_tokens=1000,
    model=model_name
)


In [3]:
from langchain.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

In [20]:
persist_directory = "docs/chroma2"

In [27]:
embedding = OpenAIEmbeddings(model="text-embedding-3-small")
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)

In [28]:
print(vectordb._collection.count())

150


In [29]:
texts = [
    """The Amanita phalloides has a large and imposing epigeous (aboveground) fruiting body (basidiocarp).""",
    """A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.""",
    """A. phalloides, a.k.a Death Cap, is one of the most poisonous of all known mushrooms.""",
]

In [30]:
smalldb = Chroma.from_texts(texts, embedding=embedding)

In [31]:
question = "Tell me about all-white mushrooms with large fruiting bodies"

In [32]:
smalldb.similarity_search(question, k=2)

[Document(metadata={}, page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.'),
 Document(metadata={}, page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.')]

In [33]:
smalldb.max_marginal_relevance_search(question,k=2, fetch_k=3)

[Document(metadata={}, page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.'),
 Document(metadata={}, page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.')]

### Addressing Diversity: Maximum marginal relevance

In [34]:
question = "what did they say about matlab?"
docs_ss = vectordb.similarity_search(question,k=3)

In [35]:
docs_ss[0].page_content[:100]

'those homeworks will be done in either MATLAB or in Octave, which is sort of — I \nknow some people c'

In [36]:
docs_ss[1].page_content[:100]

'into his office and he said, "Oh, professor, professor, thank you so much for your \nmachine learning'

In [37]:
docs_mmr = vectordb.max_marginal_relevance_search(question,k=3)

In [38]:
docs_mmr[0].page_content[:100]

'those homeworks will be done in either MATLAB or in Octave, which is sort of — I \nknow some people c'

In [39]:
docs_mmr[1].page_content[:100]

'reasonable choice on many problems, but you can actually plug in other functions as \nwell. Did I men'

### Addressing Specificity: working with metadata

In [40]:
question = "what did they say about regression in the third lecture?"

In [46]:
docs = vectordb.similarity_search(
    question,
    k=5,
    filter={"source":"MachineLearning-Lecture03.pdf"}
)

In [47]:
for d in docs:
    print(d.metadata)

{'page': 0, 'source': 'MachineLearning-Lecture03.pdf'}
{'page': 14, 'source': 'MachineLearning-Lecture03.pdf'}
{'page': 6, 'source': 'MachineLearning-Lecture03.pdf'}
{'page': 2, 'source': 'MachineLearning-Lecture03.pdf'}
{'page': 13, 'source': 'MachineLearning-Lecture03.pdf'}


### Addressing Specificity: working with metadata using self-query retriever

But we have an interesting challenge: we often want to infer the metadata from the query itself.

To address this, we can use `SelfQueryRetriever`, which uses an LLM to extract:
 
1. The `query` string to use for vector search
2. A metadata filter to pass in as well

Most vector databases support metadata filters, so this doesn't require any new databases or indexes.