In [None]:
pip install -U langchain langchain_community langchain_groq langchain_experimental langchain_core tiktoken rank_bm25 pypdf langchain_chroma langchain_huggingface

# **Load LIB**

In [2]:
import os
import torch
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain.chains import RetrievalQA
from langchain_chroma import Chroma
from langchain_groq import ChatGroq
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.schema import Document

from dotenv import load_dotenv

# **Load Model**

In [3]:
load_dotenv(".env")
api_key = os.getenv("API_KEY")

model_llm = ChatGroq(groq_api_key=api_key,model_name="llama-3.1-8b-instant")
model_llm

ChatGroq(client=<groq.resources.chat.completions.Completions object at 0x7d7ce05511d0>, async_client=<groq.resources.chat.completions.AsyncCompletions object at 0x7d7ce054e150>, model_name='llama-3.1-8b-instant', model_kwargs={}, groq_api_key=SecretStr('**********'))

# **Load Embedding Model**

In [4]:
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"

model_kwargs = {"device": "cuda"}
embeddings = HuggingFaceEmbeddings(model_name= embedding_model_name, model_kwargs=model_kwargs)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# **Load Docs**

In [7]:
folder_path = 'contoh.pdf'
loader = PyPDFLoader(folder_path)
documents = loader.load()

        # Split the document into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=150
)
texts = text_splitter.split_documents(documents)

In [8]:
texts

[Document(metadata={'source': 'contoh.pdf', 'page': 0, 'page_label': '1'}, page_content='Embeddings  \n& Vector Stores\nAuthors: Anant Nawalgaria  \nand Xiaoqi Ren'),
 Document(metadata={'source': 'contoh.pdf', 'page': 1, 'page_label': '2'}, page_content='Embeddings & Vector Stores\n2\nSeptember 2024\nReviewers and Contributors\nAntonio Gulli\nGrace Mollison\nRuiqi Guo\nIftekhar Naim\nJinhyuk Lee\nAlan Li\nPatricia Florissi\nAndrew Brook\nOmid Fatemieh\nZhuyun Dai\nLee Boonstra\nPer Jacobsson\nSiddhartha Reddy Jonnalagadda\nXi Cheng\nRaphael Hoffmann\nCurators and Editors\nAntonio Gulli\nAnant Nawalgaria\nGrace Mollison \nTechnical Writer\nJoey Haymaker\nDesigner\nMichael Lanning \nAcknowledgements'),
 Document(metadata={'source': 'contoh.pdf', 'page': 2, 'page_label': '3'}, page_content='Introduction 5\nWhy embeddings are important 6\n Types of embeddings 9\n  Text embeddings 9\n   Word embeddings 11\n   Document embeddings 15\n    Shallow BoW models 16\n    Deeper pretrained large la

# **Insert VectorDB**

In [9]:
vector_store = Chroma(
    collection_name="pdf_data",
    embedding_function=embeddings,
    persist_directory="./chroma_db",  # Where to save data locally, remove if not necessary
)

In [10]:
vector_store.add_documents(documents=texts)

['f0cdbd9f-3e53-4e1f-9ff2-20f02703077f',
 'bbb0522a-ba61-49f0-805c-59b6e0b65170',
 'a98a44a9-69f3-41e7-b937-771e30fab06a',
 '0fb59854-2e73-486e-b9c3-5632d90e8dff',
 '1eab5c21-319c-4620-89a5-679d37729bf5',
 'cddb1701-9b7d-4d32-9953-fa15da8163ee',
 'dcdc89f8-b32e-4c8f-a185-9f1b493532ee',
 '86b7c75f-95bc-4d19-87f1-83902b02ef2c',
 'b7d5934e-763b-4b55-b6f4-77d671cf6eff',
 '4b4ae806-a0e0-460d-ad11-651398a00444',
 '068fca3d-ba40-4160-9267-22c9ade98df7',
 '3d779163-aff7-4aa6-8958-f0d4dca7883e',
 'ec5cf3d9-9eaa-498d-9a6c-2cb70adbab5f',
 '1ace247d-2eae-4b1f-8fcb-539ca213cb00',
 'ccbbe882-a3d6-44b6-bc6b-fe9b2b9e375a',
 '5f75e36d-291a-4311-9dfd-91ffdc6b4b99',
 '2498bf95-e983-4238-a711-e21acbb75296',
 'd0b4dd14-7bf8-4530-a0fc-99eb915a4d38',
 '092984b2-ebb6-412d-8d4e-2c4abafe583b',
 'fa8182e5-60f6-4174-a67c-7efc1963018c',
 'b09ab025-f993-4336-84c7-699adbf2c394',
 '1731b62b-9d21-4429-be26-ef0e164c15ea',
 '1289976e-1032-454f-91fe-17f8909dc059',
 'b170bf4f-b43c-4eba-8af9-8d2544dc09c6',
 '23e515bc-f00a-

# **Set Up BM25**

In [11]:
bm25_retriever = BM25Retriever.from_documents(texts)

# **Retriever**

In [12]:
retriever = vector_store.as_retriever()
bm25_retriever.k = 2  # Retrieve top 2 results

print("type of bm25", type(bm25_retriever))

type of bm25 <class 'langchain_community.retrievers.bm25.BM25Retriever'>


In [13]:
# Initialize the ensemble retriever
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, retriever], weights=[0.2, 0.8]
)

# **Chain - Query**

In [14]:
# Example customer query
query = " Why embeddings are important ?"


# Retrieve relevant documents/products
docs = ensemble_retriever.get_relevant_documents(query)

docs

  docs = ensemble_retriever.get_relevant_documents(query)


[Document(metadata={'source': 'contoh.pdf', 'page': 5, 'page_label': '6'}, page_content='Embeddings & Vector Stores\n6\nSeptember 2024\n• Efficient Management: Techniques for storing, retrieving, and searching vast collections \nof embeddings.\n• Vector Databases: Specialized systems for managing and querying embeddings, \nincluding practical considerations for production deployment.\n• Real-World Applications: Concrete examples of how embeddings and vector databases \nare combined with large language models (LLMs) to solve real-world problems.\nThroughout the whitepaper, code snippets provide hands-on illustrations of key concepts.\nWhy embeddings are important\nIn essence, embeddings are numerical representations of real-world data such as text, \nspeech, image, or videos. They are expressed as low-dimensional vectors where the \ngeometric distances of two vectors in the vector space is a projection of the relationships \nbetween the two real-world objects that the vectors represent.

In [18]:
def generate_answers(query: str, llm = model_llm, retriever = ensemble_retriever):

    qa_chain = RetrievalQA.from_llm(
                llm,
                retriever = retriever
            )
    response = qa_chain.invoke(query)

    # return response["result"]
    return response["result"]

In [21]:
from pprint import pprint

In [22]:
pprint(generate_answers('Why embeddings are important'))

('Embeddings are important because they provide a numerical representation of '
 'real-world data, such as text, speech, images, or videos, as low-dimensional '
 'vectors. These vectors capture the relationships between the objects they '
 'represent, allowing for a compact and informative projection of the original '
 "object's characteristics.\n"
 '\n'
 'Ideally, embeddings are created to place objects with similar semantic '
 'properties closer together in the embedding space, making them a condensed '
 'and meaningful input for downstream applications. This representation '
 'preserves the semantic meanings for a specific task or across various tasks, '
 'enabling the generation of different embeddings for the same object, '
 'optimized for the task at hand.\n'
 '\n'
 'Embeddings are crucial because they facilitate a range of applications, '
 'including:\n'
 '\n'
 '1. **Feature extraction**: Embeddings can be used as features for machine '
 'learning models, allowing for more effic

In [23]:
pprint(generate_answers('what is types of embeddings'))

('According to the provided context, embeddings can be of various forms and '
 'are used for different types of data. Some standard techniques used for '
 'different types of data include:\n'
 '\n'
 '1. **Text embeddings**: These are used for natural language processing (NLP) '
 'and are often used to embed the meaning of natural language in machine '
 'learning for processing in various downstream applications, such as text '
 'generation, classification, sentiment analysis, and more.\n'
 '\n'
 '   - **Token/word embeddings**: These are used to represent words or tokens '
 'in a vector space.\n'
 '   - **Document embeddings**: These are used to represent entire documents '
 'in a vector space.\n'
 '\n'
 '2. **Image embeddings**: These are used for image processing and can be used '
 'for tasks such as image classification, object detection, and image '
 'generation.\n'
 '\n'
 "It's worth noting that these are not the only types of embeddings, and the "
 'context mentions that embeddin

In [24]:
pprint(generate_answers('what is vectorsearch and the important thing about it'))

('Based on the provided context, Vector Search refers to a technique used in '
 'machine learning and natural language processing (NLP) to efficiently query '
 'and retrieve relevant data based on its semantic meaning. This is achieved '
 'by representing data as numerical vectors, known as embeddings, which can be '
 'compared and searched using various algorithms.\n'
 '\n'
 'The important thing about Vector Search is that it allows for:\n'
 '\n'
 '1. **Efficient querying**: Vector Search enables fast and efficient querying '
 'of large datasets, making it suitable for real-time applications and complex '
 'analysis.\n'
 '2. **Semantic search**: By capturing the semantic meaning of data, Vector '
 'Search can retrieve relevant data even when the query is not an exact '
 'match.\n'
 '3. **Scalability**: Vector Search can handle large datasets and scale to '
 'meet the demands of modern applications.\n'
 '4. **Flexibility**: Vector Search can be used in various domains, including '
 'bu