In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [None]:
documents = [
    "This is a list which containig sample documents.",
    "Keywords are important for keyword-based search.",
    "Document analysis involves extracting keywords.",
    "Keyword-based search relies on sparse embeddings."
]


In [None]:
query = "keyword-based search"

In [None]:
import re
def preprocess_text(text):
    """Normalise a string for keyword matching.

    The text is lower-cased, then every character that is neither a word
    character (letter, digit or underscore) nor whitespace is deleted.
    Punctuation is removed rather than replaced by a space, so
    "keyword-based" becomes "keywordbased".
    """
    return re.sub(r'[^\w\s]', '', text.lower())

In [None]:
preprocess_document = [preprocess_text(doc) for doc in documents]

In [None]:
preprocess_document

['this is a list which containig sample documents',
 'keywords are important for keywordbased search',
 'document analysis involves extracting keywords',
 'keywordbased search relies on sparse embeddings']

In [None]:
# The query is still in its raw form here (it is only passed through
# preprocess_text in the following cell), so label it accordingly.
print("Original Query")
print(query)

Preprocessed Query
keyword-based search


In [None]:
preprocessed_query = preprocess_text(query)
preprocessed_query

'keywordbased search'

In [None]:
vector = TfidfVectorizer()

In [None]:
x = vector.fit_transform(preprocess_document)

In [None]:
x.toarray()

array([[0.        , 0.        , 0.37796447, 0.        , 0.37796447,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.37796447, 0.        , 0.        , 0.37796447, 0.        ,
        0.        , 0.37796447, 0.        , 0.        , 0.37796447,
        0.37796447],
       [0.        , 0.4533864 , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.4533864 , 0.4533864 , 0.        ,
        0.        , 0.35745504, 0.35745504, 0.        , 0.        ,
        0.        , 0.        , 0.35745504, 0.        , 0.        ,
        0.        ],
       [0.46516193, 0.        , 0.        , 0.46516193, 0.        ,
        0.        , 0.46516193, 0.        , 0.        , 0.46516193,
        0.        , 0.        , 0.36673901, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.43671931, 0.        , 0.        , 0.       

In [None]:
x.toarray()[0]

array([0.        , 0.        , 0.37796447, 0.        , 0.37796447,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.37796447, 0.        , 0.        , 0.37796447, 0.        ,
       0.        , 0.37796447, 0.        , 0.        , 0.37796447,
       0.37796447])

In [None]:
query_embedding = vector.transform([preprocessed_query])

In [None]:
query_embedding.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.70710678, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.70710678, 0.        , 0.        ,
        0.        ]])

In [None]:
similarities = cosine_similarity(x, query_embedding)
similarities

array([[0.        ],
       [0.50551777],
       [0.        ],
       [0.48693426]])

In [None]:
np.argsort(similarities,axis = 0)

array([[0],
       [2],
       [3],
       [1]])

In [None]:
# Ranking: argsort orders ascending, so flip it along the document axis to get
# indices from the most similar document to the least, then flatten to 1-D.
rancked_indeces = np.flip(np.argsort(similarities, axis=0), axis=0).flatten()

In [None]:
rancked_indeces

array([1, 3, 2, 0])

In [None]:
rancked_doc = [documents[i] for i in rancked_indeces]

In [None]:
rancked_doc

['Keywords are important for keyword-based search.',
 'Keyword-based search relies on sparse embeddings.',
 'Document analysis involves extracting keywords.',
 'This is a list which containig sample documents.']

In [None]:
for i,doc in enumerate(rancked_doc):
    print(f"Rank {i+1}. {doc}")

Rank 1. Keywords are important for keyword-based search.
Rank 2. Keyword-based search relies on sparse embeddings.
Rank 3. Document analysis involves extracting keywords.
Rank 4. This is a list which containig sample documents.


In [None]:
query

'keyword-based search'

In [None]:
document_embeddings = np.array([
    [0.634, 0.234, 0.867, 0.042, 0.249],
    [0.123, 0.456, 0.789, 0.321, 0.654],
    [0.987, 0.654, 0.321, 0.123, 0.456]
])


In [None]:
# Sample search query (represented as a dense vector)
query_embedding = np.array([[0.789, 0.321, 0.654, 0.987, 0.123]])

In [None]:
# Calculate cosine similarity between query and documents
similarity = cosine_similarity(document_embeddings, query_embedding)

In [None]:
similarity

array([[0.73558979],
       [0.67357898],
       [0.71517305]])

In [None]:
# argsort orders ascending; flip along the document axis so the best match
# comes first, then flatten the column of indices to 1-D.
rancked_index = np.flip(np.argsort(similarity, axis=0), axis=0).flatten()

In [None]:
rancked_index

array([0, 2, 1])

In [None]:
# Output the ranked documents.
# Ranks and document numbers are reported 1-based. The number is interpolated
# directly instead of through a one-element list: a list is rendered with
# repr(), and under NumPy >= 2 the repr of an integer scalar is
# "np.int64(1)", which would print "document_embeddings[np.int64(1)]".
for rank, idx in enumerate(rancked_index, start=1):
    print(f"Rank {rank}: document_embeddings[{idx + 1}]")

Rank 1: document_embeddings[1]
Rank 2: document_embeddings[3]
Rank 3: document_embeddings[2]


In [None]:
doc_path = "/content/Retrieval Augumented Generation Paper.pdf"

In [None]:
!pip install pypdf

Collecting pypdf
  Downloading pypdf-4.2.0-py3-none-any.whl (290 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.4/290.4 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pypdf
Successfully installed pypdf-4.2.0


In [None]:
!pip install langchain_community

Collecting langchain_community
  Downloading langchain_community-0.2.5-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl (28 kB)
Collecting langchain<0.3.0,>=0.2.5 (from langchain_community)
  Downloading langchain-0.2.5-py3-none-any.whl (974 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m974.6/974.6 kB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-core<0.3.0,>=0.2.7 (from langchain_community)
  Downloading langchain_core-0.2.9-py3-none-any.whl (321 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m321.8/321.8 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langsmith<0.2.0,>=0.1.0 (from langchain_community)
  Downloading langsmith-0.1.81-py3-none-any.whl (127 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━

In [None]:
from langchain_community.document_loaders import PyPDFLoader

In [None]:
loader = PyPDFLoader(doc_path)

In [None]:
docs = loader.load()

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
splitter = RecursiveCharacterTextSplitter(chunk_size = 200, chunk_overlap = 30)

In [None]:
chunks = splitter.split_documents(docs)

In [None]:
#chunks

In [None]:
from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings

In [None]:
# SECURITY: never store an access token in the notebook itself. The token that
# was previously hard-coded on this line has been exposed and should be revoked.
# Read the token from the environment, falling back to an interactive prompt
# whose input is not echoed or saved in the notebook.
import os
from getpass import getpass

HF_TOKEN = os.environ.get("HF_TOKEN") or getpass("Hugging Face API token: ")

In [None]:
embeddings = HuggingFaceInferenceAPIEmbeddings(api_key = HF_TOKEN, model_name = "BAAI/bge-base-en-v1.5")

In [None]:
!pip install chromadb

Collecting chromadb
  Downloading chromadb-0.5.3-py3-none-any.whl (559 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m559.5/559.5 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Collecting chroma-hnswlib==0.7.3 (from chromadb)
  Downloading chroma_hnswlib-0.7.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.111.0-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.0/92.0 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting uvicorn[standard]>=0.18.3 (from chromadb)
  Downloading uvicorn-0.30.1-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.4/62.4 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.5.0-py2

In [None]:
from langchain.vectorstores import Chroma

In [None]:
vector_store = Chroma.from_documents(chunks, embeddings)

In [None]:
vectorstore_retriever = vector_store.as_retriever(search_kwargs = {"k":3})

In [None]:
vectorstore_retriever

VectorStoreRetriever(tags=['Chroma', 'HuggingFaceInferenceAPIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x790b3e441480>, search_kwargs={'k': 3})

In [None]:
!pip install rank_bm25 # updated version of TF-IDF

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [None]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever

In [None]:
keyword_retriever = BM25Retriever.from_documents(chunks)

In [None]:
keyword_retriever.k = 3

In [None]:
ensemble_retriever = EnsembleRetriever(retrievers = [vectorstore_retriever, keyword_retriever], weights = [0.3,0.7])

In [None]:
model_name = "HuggingFaceH4/zephyr-7b-beta"

In [None]:
!pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->bitsandbytes)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->bitsandbytes)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->bitsandbytes)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->bitsandbytes)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->bitsandbytes)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (41

In [None]:
!pip install accelerate

Collecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/309.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━[0m [32m204.8/309.4 kB[0m [31m6.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.31.0


In [None]:
import torch
from transformers import (AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline)
from langchain import HuggingFacePipeline

In [None]:
def load_quantized_model(model_name:str):
  """
  Load a causal language model with 4-bit bitsandbytes quantization.

  model_name : Name or path of the model to be loaded
  return : Quantized model
  """
  # 4-bit NF4 weights with double quantization; compute dtype is bfloat16.
  quantization_settings = BitsAndBytesConfig(
      bnb_4bit_compute_dtype=torch.bfloat16,
      bnb_4bit_quant_type="nf4",
      bnb_4bit_use_double_quant=True,
      load_in_4bit=True,
  )

  return AutoModelForCausalLM.from_pretrained(
      model_name,
      quantization_config=quantization_settings,
      torch_dtype=torch.bfloat16,
  )

In [None]:
# initializing tokenizer
def initialize_tokenizer(model_name:str):
  """
  Create the tokenizer for the given model and pin its BOS token id.

  model_name : Name or path of the model to be loaded
  return : Initialized Tokenizer
  """
  tok = AutoTokenizer.from_pretrained(model_name, return_token_type_ids = False)
  # The beginning-of-sequence id is unconditionally set to 1.
  tok.bos_token_id = 1
  return tok

In [None]:
tokenizer = initialize_tokenizer(model_name)

tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

In [None]:
model = load_quantized_model(model_name)

config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [None]:
# Build the text-generation pipeline.
# The factory is called through the `transformers` module instead of the bare
# imported name `pipeline`: this cell rebinds `pipeline` to the resulting
# object (the following cell reads it under that name), so calling the bare
# name would only behave as intended the first time the cell is run.
import transformers

pipeline = transformers.pipeline("text-generation",
                    model = model,
                    tokenizer = tokenizer,
                    use_cache = True,
                    device_map = "auto",
                    max_length = 2048,
                    do_sample = True,
                    top_k = 5,
                    num_return_sequences = 1,
                    eos_token_id=tokenizer.eos_token_id,
                    pad_token_id=tokenizer.pad_token_id,
)
# NOTE(review): the chain results recorded later in this notebook begin with
# the prompt itself; passing return_full_text=False here would presumably drop
# that echo -- confirm against the installed LangChain wrapper before changing.

In [None]:
llm = HuggingFacePipeline(pipeline = pipeline)

  warn_deprecated(


In [None]:
from langchain.chains import RetrievalQA

In [None]:
# Baseline chain: dense (vector-store) retrieval only, so its answers can be
# compared with the hybrid chain, which uses the ensemble retriever.
# Previously both chains were built on ensemble_retriever and were identical.
normal_chain = RetrievalQA.from_chain_type(llm = llm, chain_type = "stuff",
                                           retriever = vectorstore_retriever)

In [None]:
hybrid_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff",
                                           retriever=ensemble_retriever)

In [None]:
response1 = normal_chain.invoke("What is RAG token model?")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
response1

{'query': 'What is RAG token model?',
 'result': "Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\nSun Also Rises”. Similarly, document 1 dominates the posterior when “A Farewell to Arms” is\ngenerated. Intriguingly, after the ﬁrst token of each book is generated, the document posterior ﬂattens.\n\nthe parameters of a language model? arXiv e-prints , 2020. URL https://arxiv.org/abs/\n2002.08910 .\n[53] Stephen Robertson and Hugo Zaragoza. The probabilistic relevance framework: Bm25 and\n\nretriever, and then the generator produces a distribution for the next output token for each document,\n\nRAG-Token The RAG-Token model can be seen as a standard, autoregressive seq2seq genera-\ntor with transition probability: p′\nθ(yi|x,y 1:i−1) =∑\nz∈top-k(p(·|x))pη(zi|x)pθ(yi|x,zi,y1:i−1)To\n\ndistribution over generated text. In one approach, RAG-Sequence , the model uses the same 

In [None]:
response1.get('result')

"Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\nSun Also Rises”. Similarly, document 1 dominates the posterior when “A Farewell to Arms” is\ngenerated. Intriguingly, after the ﬁrst token of each book is generated, the document posterior ﬂattens.\n\nthe parameters of a language model? arXiv e-prints , 2020. URL https://arxiv.org/abs/\n2002.08910 .\n[53] Stephen Robertson and Hugo Zaragoza. The probabilistic relevance framework: Bm25 and\n\nretriever, and then the generator produces a distribution for the next output token for each document,\n\nRAG-Token The RAG-Token model can be seen as a standard, autoregressive seq2seq genera-\ntor with transition probability: p′\nθ(yi|x,y 1:i−1) =∑\nz∈top-k(p(·|x))pη(zi|x)pθ(yi|x,zi,y1:i−1)To\n\ndistribution over generated text. In one approach, RAG-Sequence , the model uses the same document\nto predict each target token. The secon

In [None]:
response2 = hybrid_chain.invoke("What is RAG token model?")

In [None]:
response2

{'query': 'What is RAG token model?',
 'result': "Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\nSun Also Rises”. Similarly, document 1 dominates the posterior when “A Farewell to Arms” is\ngenerated. Intriguingly, after the ﬁrst token of each book is generated, the document posterior ﬂattens.\n\nthe parameters of a language model? arXiv e-prints , 2020. URL https://arxiv.org/abs/\n2002.08910 .\n[53] Stephen Robertson and Hugo Zaragoza. The probabilistic relevance framework: Bm25 and\n\nretriever, and then the generator produces a distribution for the next output token for each document,\n\nRAG-Token The RAG-Token model can be seen as a standard, autoregressive seq2seq genera-\ntor with transition probability: p′\nθ(yi|x,y 1:i−1) =∑\nz∈top-k(p(·|x))pη(zi|x)pθ(yi|x,zi,y1:i−1)To\n\ndistribution over generated text. In one approach, RAG-Sequence , the model uses the same 

In [None]:
print(response2.get('result'))

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

Sun Also Rises”. Similarly, document 1 dominates the posterior when “A Farewell to Arms” is
generated. Intriguingly, after the ﬁrst token of each book is generated, the document posterior ﬂattens.

the parameters of a language model? arXiv e-prints , 2020. URL https://arxiv.org/abs/
2002.08910 .
[53] Stephen Robertson and Hugo Zaragoza. The probabilistic relevance framework: Bm25 and

retriever, and then the generator produces a distribution for the next output token for each document,

RAG-Token The RAG-Token model can be seen as a standard, autoregressive seq2seq genera-
tor with transition probability: p′
θ(yi|x,y 1:i−1) =∑
z∈top-k(p(·|x))pη(zi|x)pθ(yi|x,zi,y1:i−1)To

distribution over generated text. In one approach, RAG-Sequence , the model uses the same document
to predict each target token. The second approach, RAG-To