In [3]:
%%capture
!pip install llama-index==0.10.37 llama-index-embeddings-cohere--0.2.0

In [4]:
import os

from getpass import getpass
import nest_asyncio

from dotenv import load_dotenv

# Apply the nest_asyncio patch up front, before any library in this kernel uses asyncio.
nest_asyncio.apply()

# Read a local .env file (if present) so its variables are visible through os.environ.
load_dotenv()

True

In [6]:
# Use the key from the environment when it is set and non-empty; otherwise prompt for it.
# `.get()` is required here: `os.environ[...]` raises KeyError for a missing variable,
# which would prevent the getpass fallback from ever being reached.
CO_API_KEY = os.environ.get('API_KEY_COHERE') or getpass("API Key:")

In [8]:
from llama_index.embeddings.cohere import CohereEmbedding

# One Cohere embedding client per model under comparison, all sharing the same API key.
# Clients are created in the same order as the names on the left-hand side.
embed_v3, embed_v3_light, embed_v2 = (
    CohereEmbedding(model_name=cohere_model, cohere_api_key=CO_API_KEY)
    for cohere_model in (
        "embed-english-v3.0",
        "embed-english-light-v3.0",
        "embed-english-v2.0",
    )
)

In [9]:
# Three sample inputs of increasing length, used below to compare embedding sizes.

# Shortest case: a single character.
string = "A"

# One short sentence.
string_2 = "This is a complete sentence."

# A multi-sentence paragraph; the literal spans several lines and ends with a newline.
string_3 = """In the pursuit of a life well-lived, one must recognize the transient nature of the
material world and the enduring value of virtue. The Sikh Gurus taught us that the Divine Light
resides within all, and thus, we are united in our essence beyond the superficial distinctions of
caste, creed, or status. Similarly, the Stoics emphasized the cultivation of inner virtues such as courage,
temperance, and wisdom, understanding that true freedom lies in mastery over one's own perceptions and actions.
As we navigate the vicissitudes of life, let us remember that our choices are our own, and in choosing virtue,
we align ourselves with the cosmic order and the teachings of the Gurus. It is through selfless service,
compassion, and the relentless pursuit of truth that we may attain a state of inner peace and contribute
to the harmony of the world, embodying the principles of both Sikhism and Stoicism in our daily lives
"""

In [10]:
# Embed the one-character string with the light v3 model so the result can be inspected.
example_embedding = embed_v3_light.get_text_embedding(string)

In [11]:
# Number of components in the embedding returned by the light model.
len(example_embedding)

384

In [12]:
def get_embedding_dimensions(embed_model, list_of_strings):
    """
    Embeds a batch of texts and reports the length of each resulting vector.

    Parameters:
    - embed_model: An object exposing `get_text_embedding_batch(list_of_strings)`.
    - list_of_strings: The texts to embed in a single batch call.

    Returns:
    - A list with one integer per returned embedding, in the order returned.
    """
    vectors = embed_model.get_text_embedding_batch(list_of_strings)
    return [len(vector) for vector in vectors]

In [14]:
# Compare vector sizes for inputs of very different lengths using the full v3 model.
get_embedding_dimensions(embed_v3, [string, string_2, string_3])

[1024, 1024, 1024]

In [16]:
# Cosine similarity between a shorter related passage and string_3, both embedded with embed_v3.
# The first literal spans three source lines, so the text sent for embedding includes the
# newlines and the leading indentation of its continuation lines.
embed_v3.similarity(
    embed_v3.get_text_embedding("""In embracing both the wisdom of the Sikh Gurus and the Stoic philosophers,
                              we find a path to tranquility by accepting what is beyond our control and focusing
                              our efforts on living virtuously and with purpose."""),
    embed_v3.get_text_embedding(string_3),
    mode="cosine"
    )

0.7476348651610869

In [30]:
import requests

def load_text_from_url(url: str, timeout: float = 30.0) -> str:
    """
    Fetches and returns the text content from the specified URL.

    Parameters:
    - url: The URL of the text file to fetch.
    - timeout: Seconds to wait for the server before giving up (default 30).

    Returns:
    - The text content of the file if the request is successful; otherwise, an error
      message starting with "Failed to load content from". Callers that need to tell
      the two apart must check for that prefix, since no exception is raised.
    """
    try:
        # Pass an explicit timeout: without one, requests can wait indefinitely on an
        # unresponsive server and stall the whole notebook run.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raises HTTPError for 4xx/5xx responses
        return response.text
    except requests.RequestException as e:
        # Timeouts and HTTP errors are RequestException subclasses, so both end up here.
        # Best-effort contract: report the failure as text instead of raising.
        return f"Failed to load content from {url}. Error: {e}"

# Plain-text file on gutenberg.org to download.
# NOTE(review): the recorded output of the following cell shows this URL answering with
# "404 Client Error: Not Found"; in that case text_content holds the error message, not the
# file's text. Confirm the current location of the file before relying on text_content.
url = "https://www.gutenberg.org/files/10763/10763.txt"

text_content = load_text_from_url(url)

In [31]:
# Preview the first 500 characters to check what was actually loaded.
print(text_content[:500])

Failed to load content from https://www.gutenberg.org/files/10763/10763.txt. Error: 404 Client Error: Not Found for url: https://www.gutenberg.org/files/10763/10763.txt


In [33]:
from llama_index.core import Document, VectorStoreIndex

# Small in-memory document with fixed placeholder text; it is the input for the indexes built below.
partial_document = Document(
    doc_id="doc1",  # explicit document ID (optional)
    text="This is the content of the document, ensure it is not empty.",
)

In [35]:
# Build a vector index straight from the document, embedding it with embed_v3.
index = VectorStoreIndex.from_documents(
    # remember, you must pass a list of documents!
    [partial_document],
    embed_model=embed_v3,
    show_progress=True)

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

In [37]:
# Show the concrete class of the object returned by from_documents.
type(index)

llama_index.core.indices.vector_store.base.VectorStoreIndex

In [38]:
from llama_index.core.node_parser import SentenceSplitter

# Alternative route: split the document into nodes explicitly, then index the nodes.

# instantiate a node parser
# NOTE(review): chunk_size / chunk_overlap are presumably measured in tokens — confirm
# against the SentenceSplitter documentation.
splitter = SentenceSplitter(
    chunk_size=512,
    chunk_overlap=16,
    paragraph_separator="\n\n\n\n",
)

# pass a list of documents to the node parser
nodes = splitter.get_nodes_from_documents([partial_document])

# create the index from the nodes, embedding them with embed_v3
index_from_nodes = VectorStoreIndex(
    nodes,
    embed_model=embed_v3,
    show_progress=True
    )

Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]