In [1]:
#! pip install langchain_community tiktoken langchainhub chromadb langchain bs4 sentence-transformers
#! pip install ipywidgets --upgrade jupyter --upgrade huggingface_hub[hf_xet] -U langchain-huggingface


In [None]:
import bs4
import numpy as np
import os
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_huggingface import HuggingFaceEmbeddings

os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"#elimante a warning from HF
os.environ["USER_AGENT"] = "my-langchain-app/0.1"
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
os.environ['LANGCHAIN_API_KEY'] = "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"

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [4]:
#### INDEXING ####

# Documents
question = "What kinds of pets do I like?"
document = "My favorite pet is a cat."

In [5]:

import tiktoken

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

num_tokens_from_string(question, "cl100k_base")

8

In [6]:
model_name = "sentence-transformers/all-mpnet-base-v2"# the choice of the model is crucial and depends on
# resources, task, performances, language supported
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
embd = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)
query_result = embd.embed_query(question)
document_result = embd.embed_query(document)
len(query_result)

768

In [7]:
def cosine_similarity(vec1,vec2):
    dot_prod = np.dot(vec1,vec2)
    norm_dot_prd = dot_prod/(np.linalg.norm(vec1)*np.linalg.norm(vec2))

    return norm_dot_prd

similarity = cosine_similarity(query_result, document_result)
print(f"Cosine Similarity: {np.round(similarity,2)}")

Cosine Similarity: 0.56


In [8]:
#### INDEXING ####

# Load blog
import bs4
from langchain_community.document_loaders import WebBaseLoader
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
blog_docs = loader.load()

In [9]:
# Split
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=300, 
    chunk_overlap=50)

# Make splits
splits = text_splitter.split_documents(blog_docs)

In [10]:
from langchain_community.vectorstores import Chroma
vectorstore = Chroma.from_documents(documents=splits, 
                                    embedding=embd)

retriever = vectorstore.as_retriever(search_kwargs={"k":1})

In [11]:
docs = retriever.invoke("What is Task Decomposition?")

In [12]:
len(docs)

1