In [None]:
import random
import torch
import numpy as np
import pandas as pd

# Run on the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the chunk text together with its stored embeddings.
text_chunks_and_embeddings_df = pd.read_csv("text_chunks_and_embeddings_df.csv")


def _parse_embedding(raw_embedding):
    """Turn a "[v1 v2 ...]" string (as written to the CSV) back into a numpy array."""
    return np.fromstring(raw_embedding.strip("[]"), sep=" ")


# The embedding column was stringified by the CSV round-trip; parse it back.
text_chunks_and_embeddings_df["embedding"] = text_chunks_and_embeddings_df["embedding"].apply(_parse_embedding)

# One dict per row, keyed by column name.
pages_and_chunks = text_chunks_and_embeddings_df.to_dict(orient="records")

# Stack the per-row arrays into one matrix and move it to the chosen device as float32.
embedding_matrix = np.array(text_chunks_and_embeddings_df["embedding"].tolist())
embeddings = torch.tensor(embedding_matrix, dtype=torch.float32).to(device)
embeddings.shape

torch.Size([657, 768])

In [None]:
from sentence_transformers import SentenceTransformer, util

# Load the "all-mpnet-base-v2" sentence-embedding model onto `device`; it is the
# default model used to embed queries in the retrieval function below.
# NOTE(review): presumably the same model that produced the stored embeddings — confirm.
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", device=device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

MPNetModel LOAD REPORT from: sentence-transformers/all-mpnet-base-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

### Functionizing the pipeline

In [None]:
import time

In [None]:
import textwrap

def print_wrapped(text, wrap_length=80):
    """Print `text` after re-flowing it with `textwrap.fill`.

    Args:
        text: The string to print.
        wrap_length: Maximum line width passed to `textwrap.fill` (default 80).
    """
    print(textwrap.fill(text, width=wrap_length))

In [None]:
def retrieve_relevent_resources(query:str, embeddings:torch.Tensor, model: SentenceTransformer=embedding_model, top_k:int=5, print_time:bool = True):

  """Embed `query` with `model` and return the top-k dot-product matches in `embeddings`.

  Args:
    query: Text to search for.
    embeddings: Tensor of stored chunk embeddings, one row per chunk
      (the notebook's tensor is [657, 768]).
    model: SentenceTransformer used to embed the query.
    top_k: Maximum number of results to return. It is capped at the number of
      rows in `embeddings`, so asking for more results than exist no longer raises.
    print_time: When True, print how long the dot-product scoring call took.

  Returns:
    A `(scores, indices)` tuple as produced by `torch.topk`: the highest dot-product
    scores and the row indices in `embeddings` they belong to.
  """

  #embed the query
  query_embedding = model.encode(query, convert_to_tensor = True)
  #make sure both operands of the dot product live on the same device
  query_embedding = query_embedding.to(embeddings.device)

  #get the dot scores on embeddings
  #NOTE(review): CUDA ops are launched asynchronously, so on GPU this interval may
  #only cover the kernel launch rather than the full computation — confirm.
  start_time = time.perf_counter()
  dot_scores = util.dot_score(query_embedding, embeddings)[0]
  end_time = time.perf_counter()

  if print_time:
    print(f"[INFO] time taken to get scores on ({len(embeddings)}) embeddings: {end_time-start_time:.5f} seconds")

  #torch.topk raises if k is larger than the number of scores, so clamp it
  k = min(top_k, dot_scores.shape[0])
  scores, indices = torch.topk(input = dot_scores, k = k)

  return scores, indices

def print_top_results(query:str,
                      embeddings: torch.tensor,
                      pages_and_chunks: list[dict]=pages_and_chunks,
                      topk: int = 5):
  """Retrieve the best matches for `query` and print each one's score, text and page number.

  Args:
    query: Text to search for.
    embeddings: Stored chunk embeddings to score against.
    pages_and_chunks: Chunk records; each must provide "sentence_chunk" and "page_number".
    topk: Number of results requested from `retrieve_relevent_resources`.
  """
  top_scores, top_indices = retrieve_relevent_resources(query=query, embeddings=embeddings, top_k=topk)

  for rank in range(len(top_scores)):
    # the retrieved index selects the matching chunk record
    chunk = pages_and_chunks[top_indices[rank]]

    print(f"score: {top_scores[rank]:.5f}")
    print("text: \n")
    print_wrapped(f"page: {chunk['sentence_chunk']}")
    print(f"page number: {chunk['page_number']}")
    print("\n")

In [None]:
# Sanity-check retrieval: print the top-scoring chunks for a sample query.
print_top_results(query = "attention mechanism", embeddings=embeddings)

[INFO] time taken to get scores on (657) embeddings: 0.00008 seconds
score: 0.72871
text: 

page: Figure 3-15. A simplified framing of attention: an input sequence and a
current position being processed. As we’re mainly concerned with this position,
the figure shows an input vector and an output vector that incorporates
information from the previous elements in the sequence according to the
attention mechanism. Two main steps are involved in the attention mechanism: 1.
A way to score how relevant each of the previous input tokens are to the current
1.token being processed (in the pink arrow).2. Using those scores, we combine
the information from the various positions into a 2.single output vector. Figure
3-16 shows these two steps.
page number: 88


score: 0.64558
text: 

page: To demonstrate different kinds of attention, review Figure 3-23, which
shows how different attention mechanisms work. Each figure shows which previous
tokens (light blue) can be attended to when processing the c

In [None]:
import torch

# Total memory reported for CUDA device 0, converted from bytes to whole GiB.
# Note this is the device's total memory, not what is currently free.
gpu_properties = torch.cuda.get_device_properties(0)
gpu_memory_bytes = gpu_properties.total_memory
gpu_memory_gb = round(gpu_memory_bytes / (1024**3))
print(f"Available GPU memory: {gpu_memory_gb} GB")

Available GPU memory: 15 GB


In [None]:
# Note: the following is Gemma focused, however, there are more and more LLMs of the 2B and 7B size appearing for local use.
# Pick a Gemma checkpoint and decide whether to load it in 4-bit, based on the GPU
# memory (in whole GB) measured above. Every branch sets both `use_quantization_config`
# and `model_id`, so the summary prints below never hit an undefined name.
if gpu_memory_gb < 5.1:
    print(f"Your available GPU memory is {gpu_memory_gb}GB, you may not have enough memory to run a Gemma LLM locally without quantization.")
    # Best-effort fallback: the smallest model with 4-bit quantization.
    use_quantization_config = True
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb < 8.1:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in 4-bit precision.")
    use_quantization_config = True
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb < 19.0:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in float16 or Gemma 7B in 4-bit precision.")
    use_quantization_config = False
    model_id = "google/gemma-2b-it"
else:
    # `else` rather than `> 19.0`: gpu_memory_gb is a rounded integer, so exactly 19
    # previously matched no branch at all.
    print(f"GPU memory: {gpu_memory_gb} | Recommend model: Gemma 7B in 4-bit or float16 precision.")
    use_quantization_config = False
    model_id = "google/gemma-7b-it"

print(f"use_quantization_config set to: {use_quantization_config}")
print(f"model_id set to: {model_id}")



GPU memory: 15 | Recommended model: Gemma 2B in float16 or Gemma 7B in 4-bit precision.
use_quantization_config set to: False
model_id set to: google/gemma-2b-it


In [None]:
from huggingface_hub import login
# Authenticate with the Hugging Face Hub (renders an interactive login widget).
# NOTE(review): presumably required because the Gemma checkpoints are gated — confirm.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig
from transformers.utils import is_flash_attn_2_available

# 1. Create quantization config for smaller model loading.
#    It is only handed to the model when `use_quantization_config` is True.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)

# 2. Use Flash Attention 2 only when the package is available and GPU 0 reports a
#    compute capability major version of at least 8; otherwise use "sdpa".
if (is_flash_attn_2_available()) and (torch.cuda.get_device_capability(0)[0] >=8):
  attn_implentation = "flash_attention_2"
else:
  attn_implentation = "sdpa"
print(f"[INFO] Using attention implementation: {attn_implentation}")

# 3. `model_id` comes from the model-selection cell above.
print(f"[INFO] Using model: {model_id}")

#tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

#model
llm_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path = model_id,
                                                 dtype = torch.float16,
                                                 quantization_config = quantization_config if use_quantization_config else None,
                                                 low_cpu_mem_usage = False,
                                                 attn_implementation = attn_implentation)
# The model is only moved explicitly when it was loaded without quantization.
# Use the notebook-wide `device` instead of a hard-coded "cuda".
if not use_quantization_config:
  llm_model.to(device)


[INFO] Using attention implementation: sdpa
[INFO] Using model: google/gemma-2b-it


config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--google--gemma-2b-it/snapshots/96988410cbdaeb8d5093d1ebdc5a8fb563e02bad/config.json
Model config GemmaConfig {
  "architectures": [
    "GemmaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 2,
  "dtype": "bfloat16",
  "eos_token_id": 1,
  "head_dim": 256,
  "hidden_act": "gelu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 16384,
  "max_position_embeddings": 8192,
  "model_type": "gemma",
  "num_attention_heads": 8,
  "num_hidden_layers": 18,
  "num_key_value_heads": 1,
  "pad_token_id": 0,
  "rms_norm_eps": 1e-06,
  "rope_parameters": {
    "rope_theta": 10000.0,
    "rope_type": "default"
  },
  "tie_word_embeddings": true,
  "transformers_version": "5.0.0",
  "use_bidirectional_attention": null,
  "use_cache": true,
  "vocab_size": 256000
}



tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--google--gemma-2b-it/snapshots/96988410cbdaeb8d5093d1ebdc5a8fb563e02bad/config.json
Model config GemmaConfig {
  "architectures": [
    "GemmaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 2,
  "dtype": "float16",
  "eos_token_id": 1,
  "head_dim": 256,
  "hidden_act": "gelu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 16384,
  "max_position_embeddings": 8192,
  "model_type": "gemma",
  "num_attention_heads": 8,
  "num_hidden_layers": 18,
  "num_key_value_heads": 1,
  "pad_token_id": 0,
  "rms_norm_eps": 1e-06,
  "rope_parameters": {
    "rope_theta": 10000.0,
    "rope_type": "default"
  },
  "tie_word_embeddings": true,
  "transformers_version": "5.0.0",
  "use_bidirectional_attention": null,
  "use_cache": true,
  "vocab_size": 256000
}



model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--google--gemma-2b-it/snapshots/96988410cbdaeb8d5093d1ebdc5a8fb563e02bad/model.safetensors.index.json


Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Generate config GenerationConfig {
  "bos_token_id": 2,
  "eos_token_id": 1,
  "output_attentions": false,
  "output_hidden_states": false,
  "pad_token_id": 0,
  "use_cache": true
}



Loading weights:   0%|          | 0/164 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--google--gemma-2b-it/snapshots/96988410cbdaeb8d5093d1ebdc5a8fb563e02bad/generation_config.json
Generate config GenerationConfig {
  "bos_token_id": 2,
  "eos_token_id": 1,
  "pad_token_id": 0
}



In [None]:
# Display the loaded model's module structure.
llm_model

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear(in_features=16384, out_features=2048, bias=False)
          (act_fn): GELUActivation()
        )
        (input_layernorm): GemmaRMSNorm((2048,), eps=1e-06)
        (post_attention_layernorm): GemmaRMSNorm((2048,), eps=1e-06)
      )
    )
    (norm): GemmaRMSNorm((2048,), 

In [None]:
def get_model_num_params(model: torch.nn.Module):
  """Return the total number of elements across all parameters of `model`."""
  total = 0
  for param in model.parameters():
    total += param.numel()
  return total

# Report the model's parameter count in billions.
print(f"{get_model_num_params(llm_model)/(10**9)} B ")

2.506172416 B 


In [None]:
def get_model_mem_size(model: torch.nn.Module):
  """Compute how many bytes the parameters and buffers of `model` occupy.

  Returns:
    A dict with "model_mem_bytes" (exact byte count) plus "model_mem_mb" and
    "model_mem_gb", the same amount in MiB / GiB rounded to two decimals.
  """

  def _num_bytes(tensors):
    # element count times bytes per element, summed over every tensor
    return sum(t.nelement() * t.element_size() for t in tensors)

  total_bytes = _num_bytes(model.parameters()) + _num_bytes(model.buffers())

  return {"model_mem_bytes": total_bytes,
          "model_mem_mb": round(total_bytes / (1024**2), 2),
          "model_mem_gb": round(total_bytes / (1024**3), 2)}

# Show the memory footprint of the loaded LLM's parameters and buffers.
get_model_mem_size(llm_model)

{'model_mem_bytes': 5012345856, 'model_mem_mb': 4780.15, 'model_mem_gb': 4.67}

### Generating text with LLM

In [None]:
input_text = "What is Self Attention mechanism?"
print(f"Input text: \n{input_text}")

#prompt template: a single user turn containing the question
dialogue_template = [
    {"role":"user",
     "content":input_text
    }
]

#apply the chat template (returns the formatted prompt as a string, with the generation prompt appended)
prompt = tokenizer.apply_chat_template(conversation=dialogue_template, tokenize=False, add_generation_prompt=True)
print(f"\n Prompt:\n{prompt}")

#The templated prompt already starts with <bos>; add_special_tokens=False stops the
#tokenizer from prepending a second one (previously the decoded output began with <bos><bos>).
#The encoded inputs are moved to whichever device the model lives on.
input_ids = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to(llm_model.device)
#generate up to 256 new tokens and decode the full sequence (prompt + completion)
outputs = tokenizer.decode(llm_model.generate(**input_ids, max_new_tokens = 256)[0])
print(outputs)


Input text: 
What is Self Attention mechanism?

 Prompt:
<bos><start_of_turn>user
What is Self Attention mechanism?<end_of_turn>
<start_of_turn>model

<bos><bos><start_of_turn>user
What is Self Attention mechanism?<end_of_turn>
<start_of_turn>model
Sure, here's a breakdown of the Self Attention mechanism:

**What it is:**

* Self-attention is a mechanism in artificial intelligence (AI) that allows a model to attend to different parts of its input sequence in a more efficient and accurate way than traditional recurrent neural networks (RNNs).
* It is a type of neural network architecture that can be used for various tasks, including natural language processing (NLP), machine translation, and image recognition.

**Key features:**

* **Self-attention mechanism:** This mechanism allows each element in the input sequence to attend to all other elements in the sequence, regardless of their distance.
* **Multi-head attention:** This is a variation of self-attention that uses multiple attentio

In [None]:
# Sample questions used in the cells below to exercise retrieval and generation.
query_list = [
    "What is a Large Language Model (LLM), and why is it called 'large'?",
    "What is the difference between traditional machine learning models and LLMs?",
    "What is a transformer architecture, and why is it important for LLMs?",
    "What is the attention mechanism in simple terms?",
    "What is the difference between training, fine-tuning, and inference?",
    "What are embeddings, and why are they used in semantic search?",
    "What is the difference between cosine similarity and dot product?",
    "Why do we chunk documents before creating embeddings in RAG?",
    "What is the difference between an instruction-tuned model and a base model?",
    "What are tokens, and why do they matter in LLMs?"
]

In [None]:
import random
# Pick one sample query at random; no seed is set in this cell, so the choice can differ between runs.
query = random.choice(query_list)

print(f"Query: {query}")
# Embed the query and fetch the scores and row indices of the best-matching chunks (top_k defaults to 5).
scores, indices = retrieve_relevent_resources(query=query, embeddings=embeddings)
# Bare expression so the notebook displays the (scores, indices) tuple.
scores, indices


Query: What are tokens, and why do they matter in LLMs?
[INFO] time taken to get scores on (657) embeddings: 0.00008 seconds


(tensor([0.6653, 0.5463, 0.5445, 0.5362, 0.5329], device='cuda:0'),
 tensor([120, 149, 118, 186,  94], device='cuda:0'))

### Augmenting the prompt with context items

In [None]:
def prompt_formatter(query: str, context_items: list[dict])->str:
  """Build the chat-formatted RAG prompt for `query`.

  Args:
    query: The user's question.
    context_items: Retrieved chunk records; each must provide "sentence_chunk".

  Returns:
    The string produced by the tokenizer's chat template for a single user turn
    that contains the bulleted context followed by the question.
  """
  # one "- " bullet per retrieved chunk
  bullet_body = "\n- ".join(chunk["sentence_chunk"] for chunk in context_items)
  context = "- " + bullet_body

  base_prompt = (
      "\n"
      "Answer the question using only the context below.\n"
      "\n"
      "Context:\n"
      f"{context}\n"
      "\n"
      "Question:\n"
      f"{query}\n"
      "\n"
      "Answer:\n"
  )

  messages = [{"role": "user", "content": base_prompt}]
  return tokenizer.apply_chat_template(conversation=messages, tokenize=False, add_generation_prompt=True)

In [None]:
# Pick another sample query at random.
query = random.choice(query_list)

print(f"Query: {query}")
# Retrieve the scores and row indices of the best-matching chunks for the query.
scores, indices = retrieve_relevent_resources(query=query, embeddings=embeddings)

# Look up the chunk record for each returned index.
context_items = [pages_and_chunks[i] for i in indices]

# Build the chat-formatted prompt that embeds the retrieved context, then show it.
prompt = prompt_formatter(query=query, context_items=context_items)
print(prompt)

Query: What are embeddings, and why are they used in semantic search?
[INFO] time taken to get scores on (657) embeddings: 0.00007 seconds
<bos><start_of_turn>user
Answer the question using only the context below.

Context:
- Embeddings are tremendously helpful as they allow us to measure the semantic similarity between two words. Using various distance metrics, we can judge how close one word is to another. As illustrated in Figure 1-9, if we were to compress these embeddings into a two-dimensional representation, you would notice that words with similar meaning tend to be closer. In Chapter 5, we will explore how to compress these embeddings into n-dimensional space. Figure 1-9. Embeddings of words that are similar will be close to each other in dimen‐ sional space. Types of Embeddings There are many types of embeddings, like word embeddings and sentence embed‐ dings that are used to indicate different levels of abstractions (word versus sentence), as illustrated in Figure 1-10. Bag-

In [None]:
# The chat-templated prompt already begins with <bos>; add_special_tokens=False keeps the
# tokenizer from adding a second one. Inputs are moved to the device the model lives on.
input_ids = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to(llm_model.device)

outputs = llm_model.generate(**input_ids, max_new_tokens = 256)[0]

# The generated sequence starts with the prompt tokens. Slice them off and decode only
# the completion, dropping special tokens, instead of string-replacing the prompt
# (which left stray <bos>/<eos> markers in the printed answer).
prompt_token_count = input_ids["input_ids"].shape[1]
output_text = tokenizer.decode(outputs[prompt_token_count:], skip_special_tokens=True)

print(f"query: {query}")
print(f"rag answer: \n{output_text}")



query: What are embeddings, and why are they used in semantic search?
rag answer: 
<bos>Embeddings are numerical representations of words that indicate their semantic similarity. They are used in semantic search to help language models understand the meaning of a query by projecting it into the same space as the text archive. This allows the model to compare the semantic similarity between queries and documents, enabling it to rank documents that are most relevant to the query.<eos>
