In [None]:
!pip install pypdf # 📦 Install `pypdf` for reading and extracting text from PDFs

Collecting pypdf
  Downloading pypdf-5.7.0-py3-none-any.whl.metadata (7.2 kB)
Downloading pypdf-5.7.0-py3-none-any.whl (305 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/305.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m297.0/305.5 kB[0m [31m12.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m305.5/305.5 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-5.7.0


| **Category**                    | **Details**                                                                                                                                                 |      |                |           |                                      |
| ------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- | ---- | -------------- | --------- | ------------------------------------ |
| 🎯 **Project Goal**             | To build a **Retrieval-Augmented Generation (RAG)** system using LlamaIndex and Hugging Face's **Llama2** model to answer questions based on PDF documents. |      |                |           |                                      |
| 📥 **Data Source**              | PDF file(s) loaded from `/content/Data`, such as `attention.pdf`, using `SimpleDirectoryReader`.                                                            |      |                |           |                                      |
| 🔍 **Retrieval**                | Text data is chunked, embedded using `MiniLM` (sentence-transformers), and stored in a **VectorStoreIndex** for semantic search.                            |      |                |           |                                      |
| 🤖 **LLM**                      | **Meta’s Llama2-7B-Chat model** is loaded using Hugging Face Transformers. It generates final answers based on retrieved chunks.                            |      |                |           |                                      |
| ⚙️ **Embedding Model**          | `sentence-transformers/all-MiniLM-L6-v2` is used for fast and efficient semantic similarity embedding.                                                      |      |                |           |                                      |
| 🔁 **RAG Workflow Implemented** | ✅ Document Ingestion → ✅ Chunking & Embedding → ✅ Vector Index Creation → ✅ Semantic Query → ✅ Contextual Answer Generation                                 |      |                |           |                                      |
| 🧠 **Prompt Format**            | Custom prompt format defined: `<\|USER\|>{query_str}<\|ASSISTANT\|>` to simulate dialogue-style Q&A.                                                         |      |                |           |                                      |
| 💬 **Query Examples Tested**    | 1. *“What is transformer in NLP?”*<br>2. *“Explain the concept of positional encoding?”*<br>3. *“What is attention mechanism and how it works?”*            |      |                |           |                                      |
| 📤 **Output**                   | For each query, a relevant, contextual answer is generated using the LLM, grounded in document knowledge.                                                   |      |                |           |                                      |


In [None]:
!pip install -q transformers einops accelerate langchain bitsandbytes  # 📦 Install essential ML and LLM packages

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 MB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m79.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m68.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m46.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
## Embedding
!pip install sentence-transformers # 📦 Install sentence-transformers for dense semantic embeddings



In [None]:
!pip install llama_index # 📦 Install LlamaIndex (formerly GPT Index) for data indexing and retrieval

Collecting llama_index
  Downloading llama_index-0.12.45-py3-none-any.whl.metadata (12 kB)
Collecting llama-index-agent-openai<0.5,>=0.4.0 (from llama_index)
  Downloading llama_index_agent_openai-0.4.12-py3-none-any.whl.metadata (439 bytes)
Collecting llama-index-cli<0.5,>=0.4.2 (from llama_index)
  Downloading llama_index_cli-0.4.3-py3-none-any.whl.metadata (1.4 kB)
Collecting llama-index-core<0.13,>=0.12.45 (from llama_index)
  Downloading llama_index_core-0.12.45-py3-none-any.whl.metadata (2.5 kB)
Collecting llama-index-embeddings-openai<0.4,>=0.3.0 (from llama_index)
  Downloading llama_index_embeddings_openai-0.3.1-py3-none-any.whl.metadata (684 bytes)
Collecting llama-index-indices-managed-llama-cloud>=0.4.0 (from llama_index)
  Downloading llama_index_indices_managed_llama_cloud-0.7.7-py3-none-any.whl.metadata (3.3 kB)
Collecting llama-index-llms-openai<0.5,>=0.4.0 (from llama_index)
  Downloading llama_index_llms_openai-0.4.7-py3-none-any.whl.metadata (3.0 kB)
Collecting llama

In [None]:
!pip show llama-index # ✅ Verify llama-index installation

Name: llama-index
Version: 0.12.45
Summary: Interface between LLMs and your data
Home-page: https://llamaindex.ai
Author: 
Author-email: Jerry Liu <jerry@llamaindex.ai>
License: 
Location: /usr/local/lib/python3.11/dist-packages
Requires: llama-index-agent-openai, llama-index-cli, llama-index-core, llama-index-embeddings-openai, llama-index-indices-managed-llama-cloud, llama-index-llms-openai, llama-index-multi-modal-llms-openai, llama-index-program-openai, llama-index-question-gen-openai, llama-index-readers-file, llama-index-readers-llama-parse, nltk
Required-by: 


In [None]:
!pip install llama-index[llms] --upgrade # 📦 Upgrades llama-index with LLM support



In [None]:
!pip install llama-index-llms-huggingface # 📦 Adds HuggingFace LLM support for llama-index

Collecting llama-index-llms-huggingface
  Downloading llama_index_llms_huggingface-0.5.0-py3-none-any.whl.metadata (2.8 kB)
Downloading llama_index_llms_huggingface-0.5.0-py3-none-any.whl (7.8 kB)
Installing collected packages: llama-index-llms-huggingface
Successfully installed llama-index-llms-huggingface-0.5.0


In [None]:
from llama_index.core.prompts import PromptTemplate # 📦 Importing prompt templating support

In [None]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, ServiceContext # 📦 Core LlamaIndex components
from llama_index.llms.huggingface import HuggingFaceLLM # 📦 Import LLM interface
from llama_index.core.prompts import PromptTemplate  # 📦 Prompt support (already imported above)

In [None]:
# 🧠 Load every file found in the data folder into LlamaIndex `Document` objects.
documents = SimpleDirectoryReader("/content/Data").load_data()

# Show a compact summary instead of dumping the full text of every document,
# which bloats the saved notebook and hides the narrative.
print(f"Loaded {len(documents)} document(s)")
documents[0].metadata if documents else None

[Document(id_='d6310ea4-3f36-4831-8b38-eed7c4e9fe48', embedding=None, metadata={'page_label': '1', 'file_name': 'attention.pdf', 'file_path': '/content/Data/attention.pdf', 'file_type': 'application/pdf', 'file_size': 569417, 'creation_date': '2025-07-01', 'last_modified_date': '2025-07-01'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='Attention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗†\nUniversity of

In [None]:
from llama_index.core.prompts import PromptTemplate  # prompt template class

# System instruction handed to the LLM wrapper alongside every query.
system_prompt = (
    "\n"
    "You are a Q&A assistant. Your goal is to answer questions as\n"
    "accurately as possible based on the instructions and context provided.\n"
)

# Template that wraps the raw user query before it is sent to the LLM;
# `{query_str}` is the placeholder for the question text.
QUERY_WRAPPER_TEMPLATE = "<|USER|>{query_str}<|ASSISTANT|>"
query_wrapper_prompt = PromptTemplate(QUERY_WRAPPER_TEMPLATE)

In [None]:
# Authenticate with the Hugging Face Hub. The command prompts interactively for an access
# token (the input is not echoed), so no token is ever written into the notebook.
# The model IDs used later in this notebook are downloaded from the Hub.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) Y
Token is valid (permission: read).
The token `llama-index-project` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re

In [None]:
import torch
from transformers import BitsAndBytesConfig

# 🧠 Configure the Hugging Face LLM wrapper around Llama-2-7B-Chat.
llm = HuggingFaceLLM(
    context_window=4096,  # maximum number of tokens the model can consider at once
    max_new_tokens=256,  # upper bound on the number of tokens generated per answer
    # Greedy decoding gives deterministic output. `temperature` is only consulted when
    # sampling is enabled, so it is omitted rather than set to 0.0.
    generate_kwargs={"do_sample": False},
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name="meta-llama/Llama-2-7b-chat-hf",  # which tokenizer to use
    model_name="meta-llama/Llama-2-7b-chat-hf",  # HF model name
    device_map="auto",  # let Hugging Face choose device placement (CPU/GPU)
    # 8-bit quantization (requires CUDA + bitsandbytes) to reduce GPU memory usage.
    # Passed as a `BitsAndBytesConfig` because the bare `load_in_8bit` argument is deprecated.
    model_kwargs={
        "torch_dtype": torch.float16,
        "quantization_config": BitsAndBytesConfig(load_in_8bit=True),
    },
)

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [None]:
!pip install -U langchain-community  # 📦 Update LangChain integrations

Collecting langchain-community
  Downloading langchain_community-0.3.26-py3-none-any.whl.metadata (2.9 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.10.1-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Downloading langchain_community-0.3.26-py3-none-any.whl (2.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m38.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpx_sse-0.4.1-py3-none-any.whl (8.1 kB)
Downloading pydantic_settings-2.10.1-py3-none-any.whl (45 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.2/45.2 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: httpx-sse, pydantic-settings, langchain-community
Successfully installed httpx-sse-0.4.1 langchain-community-0.3.26 pydantic-settings-2.10.1


In [None]:
!pip install -U llama-index  # 📦 Upgrade llama-index (safe re-install)



In [None]:
!pip install -U llama-index langchain langchain-community # 📦 Upgrade core LLM/RAG libraries



In [None]:
!pip install -U llama-index langchain langchain-community sentence-transformers  # 📦 Same as above + embedding model support



In [None]:
# Embedding backend: a LangChain Hugging Face embedder adapted for use by LlamaIndex.
from langchain_community.embeddings import HuggingFaceEmbeddings
from llama_index.embeddings.langchain import LangchainEmbedding
from llama_index.core import Settings  # global configuration object

EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# Build the LangChain embedder, then wrap it in LlamaIndex's adapter class.
langchain_embed_model = HuggingFaceEmbeddings(model_name=EMBED_MODEL_NAME)
embedding_model = LangchainEmbedding(langchain_embed_model)

# Register the embedder and chunk size globally via Settings (no ServiceContext needed).
Settings.embed_model = embedding_model
Settings.chunk_size = 1024

print("✅ Settings configured — ready to use embeddings!")

✅ Settings configured — ready to use embeddings!


In [None]:
from llama_index.core import SimpleDirectoryReader

# Re-load only the paper that will be indexed, replacing the earlier `documents` list.
pdf_paths = ["/content/Data/attention.pdf"]
pdf_reader = SimpleDirectoryReader(input_files=pdf_paths)
documents = pdf_reader.load_data()

In [None]:
# Build the vector index over the loaded documents. This is the `index` object that
# every query engine below is created from.
index = VectorStoreIndex.from_documents(documents)

In [None]:
from llama_index.core import Settings
# Explicitly disable the global LLM. LlamaIndex reports "Using MockLLM" when this is
# set, which makes the next cells a retrieval-only sanity check.
Settings.llm = None  # ❌ Make sure NO default LLM is set

LLM is explicitly disabled. Using MockLLM.


In [None]:
# Query engine created with `llm=None`, i.e. without a real LLM behind it. It is meant
# for inspecting what the retriever returns, not for generating answers.
query_engine = index.as_query_engine(llm=None)

In [None]:
# Run a test question through the current query engine and print the text it returns.
question = "What is self-attention?"
response = query_engine.query(question)
print(response.response)

Context information is below.
---------------------
page_label: 7
file_path: /content/Data/attention.pdf

the input sequence centered around the respective output position. This would increase the maximum
path length to O(n/r). We plan to investigate this approach further in future work.
A single convolutional layer with kernel width k<n does not connect all pairs of input and output
positions. Doing so requires a stack of O(n/k) convolutional layers in the case of contiguous kernels,
or O(logk(n)) in the case of dilated convolutions [ 15], increasing the length of the longest paths
between any two positions in the network. Convolutional layers are generally more expensive than
recurrent layers, by a factor of k. Separable convolutions [ 6], however, decrease the complexity
considerably, to O(k·n·d+ n·d2). Even with k = n, however, the complexity of a separable
convolution is equal to the combination of a self-attention layer and a point-wise feed-forward layer,
the approach we take in

In [None]:
import os
from getpass import getpass

# SECURITY: never hardcode API keys in a notebook, because they leak through version
# control and shared copies. The key that was previously embedded in this cell must be
# treated as compromised and revoked in the OpenAI dashboard.
#
# Use the key already present in the environment if there is one; otherwise prompt for
# it without echoing the input, so it is never stored in the notebook file.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")


In [None]:
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI

# Make an OpenAI chat model the global LLM ("gpt-4" is a possible alternative), then
# rebuild the query engine so that it picks up the new setting.
openai_llm = OpenAI(model="gpt-3.5-turbo")
Settings.llm = openai_llm
query_engine = index.as_query_engine()


In [None]:
!pip install llama-index transformers accelerate bitsandbytes sentence-transformers




In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig  # HF tokenizer, model and quantization config
from llama_index.llms.huggingface import HuggingFaceLLM  # LlamaIndex wrapper for HF causal LLMs
from llama_index.core import Settings  # global settings manager for LlamaIndex

# 🧠 Model to use (TinyLlama is fast and lightweight for testing).
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# 🔠 Tokenizer matching the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 🧠 The language model itself, quantized to 8-bit to save GPU memory.
# The quantization is requested through `quantization_config` because passing
# `load_in_8bit=True` directly to `from_pretrained` is deprecated.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",  # let Hugging Face choose device placement (GPU if available)
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)

# Wrap the already-loaded model and tokenizer for use by LlamaIndex.
llm = HuggingFaceLLM(
    model=model,  # the HF causal model loaded above
    tokenizer=tokenizer,  # tokenizer for text processing
    tokenizer_name=model_name,
    model_name=model_name,
    device_map="auto",
    max_new_tokens=512,  # limit response length to 512 generated tokens
)

# Make this model the global LLM, so query engines created afterwards use it by default.
Settings.llm = llm


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


In [None]:
# Rebuild the query engine so that it uses the currently configured global LLM,
# then ask the test question and print the generated answer text.
question = "What is self-attention?"
query_engine = index.as_query_engine()
response = query_engine.query(question)
print(response.response)



Self-attention is a type of attention mechanism that can be used in sequence-to-sequence models, such as encoder-decoder models, to learn long-range dependencies between input and output sequences. It is a type of attention mechanism that can be used in sequence-to-sequence models, such as encoder-decoder models, to learn long-range dependencies between input and output sequences. Self-attention layers are faster than recurrent layers when the sequence length n is smaller than the representation dimensionality d, which is most often the case with sentence representations used by state-of-the-art models in machine translations, such as word-piece [31] and byte-pair [25] representations. Self-attention can be restricted to consider only a neighborhood of size rin 6, which improves computational performance for tasks involving very long sequences.
