In [1]:
from langchain_ollama import OllamaEmbeddings
from langchain.evaluation import load_evaluator
from scipy.spatial.distance import cosine

# https://python.langchain.com/docs/integrations/text_embedding/ollama/
# https://python.langchain.com/api_reference/ollama/embeddings/langchain_ollama.embeddings.OllamaEmbeddings.html

# https://github.com/ollama/ollama#model-library

In [2]:
# ollama pull nomic-embed-text
embeddings = OllamaEmbeddings(model='nomic-embed-text')


In [3]:
# https://python.langchain.com/docs/integrations/text_embedding/ollama/
# Create a vector store with a sample text
from langchain_core.vectorstores import InMemoryVectorStore

text = "LangChain is the framework for building context-aware reasoning applications"

vectorstore = InMemoryVectorStore.from_texts(
    [text],
    embedding=embeddings,
)

# Use the vectorstore as a retriever
retriever = vectorstore.as_retriever()

# Retrieve the most similar text
retrieved_documents = retriever.invoke("What is LangChain?")

# show the retrieved document's content
retrieved_documents[0].page_content

'LangChain is the framework for building context-aware reasoning applications'

In [10]:
# test embedding
# https://python.langchain.com/api_reference/ollama/embeddings/langchain_ollama.embeddings.OllamaEmbeddings.html


# single
input_text = "Queen king man women park"
vector = embeddings.embed_query(input_text)
# print(vector[:3])
print(vector[:10])
print(len(vector))
# old
# [-0.00017720745, 0.00987124, 0.0049231793]

[0.025141606, 0.057847306, -0.16881683, 0.0029497452, -0.021047872, 0.07405089, -0.02485957, -0.028323729, -0.019682856, 0.010503033]
768


In [11]:
# # multiple

# input_texts = ["Document 1...", "Document 2..."]
# vectors = embeddings.embed_documents(input_texts)
# print(len(vectors))
# # The first 3 coordinates for the first vector
# print(vectors[0][:3])

In [13]:
# how chroma work and store, how the sizes calcualted and varies, is the size scalabable with speed and memory 


from langchain_chroma import Chroma

vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db",  # Where to save data locally, remove if not necessary
)

In [14]:
import chromadb

# init from client

persistent_client = chromadb.PersistentClient()
collection = persistent_client.get_or_create_collection("collection_name")
collection.add(ids=["1", "2", "3"], documents=["a", "b", "c"])

vector_store_from_client = Chroma(
    client=persistent_client,
    collection_name="collection_name",
    embedding_function=embeddings,
)

C:\Users\y3kma\.cache\chroma\onnx_models\all-MiniLM-L6-v2\onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:19<00:00, 4.26MiB/s]


In [15]:
# add to vector store

from uuid import uuid4

from langchain_core.documents import Document

document_1 = Document(
    page_content="I had chocolate chip pancakes and scrambled eggs for breakfast this morning.",
    metadata={"source": "tweet"},
    id=1,
)

document_2 = Document(
    page_content="The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.",
    metadata={"source": "news"},
    id=2,
)

document_3 = Document(
    page_content="Building an exciting new project with LangChain - come check it out!",
    metadata={"source": "tweet"},
    id=3,
)

document_4 = Document(
    page_content="Robbers broke into the city bank and stole $1 million in cash.",
    metadata={"source": "news"},
    id=4,
)

document_5 = Document(
    page_content="Wow! That was an amazing movie. I can't wait to see it again.",
    metadata={"source": "tweet"},
    id=5,
)

document_6 = Document(
    page_content="Is the new iPhone worth the price? Read this review to find out.",
    metadata={"source": "website"},
    id=6,
)

document_7 = Document(
    page_content="The top 10 soccer players in the world right now.",
    metadata={"source": "website"},
    id=7,
)

document_8 = Document(
    page_content="LangGraph is the best framework for building stateful, agentic applications!",
    metadata={"source": "tweet"},
    id=8,
)

document_9 = Document(
    page_content="The stock market is down 500 points today due to fears of a recession.",
    metadata={"source": "news"},
    id=9,
)

document_10 = Document(
    page_content="I have a bad feeling I am going to get deleted :(",
    metadata={"source": "tweet"},
    id=10,
)

documents = [
    document_1,
    document_2,
    document_3,
    document_4,
    document_5,
    document_6,
    document_7,
    document_8,
    document_9,
    document_10,
]
uuids = [str(uuid4()) for _ in range(len(documents))]

vector_store.add_documents(documents=documents, ids=uuids)

['32c41076-7500-49bc-abfd-b5d9cfbe5038',
 'b39a983d-22d1-4c45-8a06-c5c5ac1e1100',
 'bdee71ad-d91f-4468-924f-eaa6c8462e8f',
 '7d367f21-f500-497a-82cd-a1ca637bfa49',
 'a1fe53e3-6c1f-4e29-a6a8-103a118a8db3',
 'fa3266b6-11ef-4d7d-8987-da7ef6234790',
 '2624912b-5f02-4b8a-a64c-b8a0a78ef32e',
 '091e0ae1-09c3-4233-8819-d1933bd59dc5',
 '64b6a91b-81f4-47ed-a89c-9e849899653d',
 '5babe17c-cd1a-47a1-9550-83942e2f2c9b']

In [16]:
# update vector store

updated_document_1 = Document(
    page_content="I had chocolate chip pancakes and fried eggs for breakfast this morning.",
    metadata={"source": "tweet"},
    id=1,
)

updated_document_2 = Document(
    page_content="The weather forecast for tomorrow is sunny and warm, with a high of 82 degrees.",
    metadata={"source": "news"},
    id=2,
)

vector_store.update_document(document_id=uuids[0], document=updated_document_1)
# You can also update multiple documents at once
vector_store.update_documents(
    ids=uuids[:2], documents=[updated_document_1, updated_document_2]
)

In [18]:
# delete vector store

vector_store.delete(ids=uuids[-1])

In [None]:
# query

results = vector_store.similarity_search(
    "What is langchain?",
    k=4,
    filter={"source": "tweet"},
)
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

# results is quite shit


* Building an exciting new project with LangChain - come check it out! [{'source': 'tweet'}]
* LangGraph is the best framework for building stateful, agentic applications! [{'source': 'tweet'}]
* Wow! That was an amazing movie. I can't wait to see it again. [{'source': 'tweet'}]
* I had chocolate chip pancakes and fried eggs for breakfast this morning. [{'source': 'tweet'}]


In [38]:
# similarity score

results = vector_store.similarity_search_with_score(
    "Will it be hot tomorrow?", k=4, 
    # filter={"source": "news"}
)
for res, score in results:
    print(f"* [SIM={score:3f}] {res.page_content} [{res.metadata}]")

* [SIM=0.360257] The weather forecast for tomorrow is sunny and warm, with a high of 82 degrees. [{'source': 'news'}]
* [SIM=1.089755] The stock market is down 500 points today due to fears of a recession. [{'source': 'news'}]
* [SIM=1.111669] Wow! That was an amazing movie. I can't wait to see it again. [{'source': 'tweet'}]
* [SIM=1.158021] I had chocolate chip pancakes and fried eggs for breakfast this morning. [{'source': 'tweet'}]


In [39]:
# search vector
results = vector_store.similarity_search_by_vector(
    embedding=embeddings.embed_query("I love green eggs and ham!"), k=1
)
for doc in results:
    print(f"* {doc.page_content} [{doc.metadata}]")

* I had chocolate chip pancakes and fried eggs for breakfast this morning. [{'source': 'tweet'}]


In [None]:
# https://python.langchain.com/docs/how_to/#qa-with-rag

measure the quality of embeddings, how can eval between them


add chunk to the embedding so it knows and is more determisitic

update the chunk accurately

if id in the chunk, then update the chunk, if not then add it 

evaluate response with pytest correct/wrong assert


https://graphacademy.neo4j.com/
