In [1]:
# ==========================================================
# TOKEN EMBEDDINGS – Contextual Representation Demo
# ==========================================================
# This notebook demonstrates:
# 1. Token embeddings = smallest unit representation
# 2. Contextual embeddings (same word, different meaning)
# 3. "Apple" in fruit vs tech context
# ==========================================================

# Step 1: Install dependencies (Run in Colab once)
!pip -q install transformers torch

# Step 2: Import libraries
import torch
from transformers import AutoTokenizer, AutoModel

# Step 3: Load a pretrained transformer model
# We use BERT because it generates contextual token embeddings
model_name = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Step 4: Define two sentences with same word in different contexts
sentence_fruit = "I ate an apple after lunch."
sentence_tech = "Apple released a new iPhone."

# Step 5: Tokenize the sentences
inputs_fruit = tokenizer(sentence_fruit, return_tensors="pt")
inputs_tech = tokenizer(sentence_tech, return_tensors="pt")

# Step 6: Get contextual embeddings from the model
with torch.no_grad():
    outputs_fruit = model(**inputs_fruit)
    outputs_tech = model(**inputs_tech)

# The last_hidden_state contains token embeddings
embeddings_fruit = outputs_fruit.last_hidden_state
embeddings_tech = outputs_tech.last_hidden_state

# Step 7: Identify index of the word "apple"
tokens_fruit = tokenizer.convert_ids_to_tokens(inputs_fruit["input_ids"][0])
tokens_tech = tokenizer.convert_ids_to_tokens(inputs_tech["input_ids"][0])

apple_index_fruit = tokens_fruit.index("apple")
apple_index_tech = tokens_tech.index("apple")

# Step 8: Extract embedding vectors for "apple"
apple_embedding_fruit = embeddings_fruit[0][apple_index_fruit]
apple_embedding_tech = embeddings_tech[0][apple_index_tech]

# Step 9: Compare embeddings using cosine similarity
cos = torch.nn.CosineSimilarity(dim=0)

similarity = cos(apple_embedding_fruit, apple_embedding_tech)

print("Tokens (Fruit Context):", tokens_fruit)
print("Tokens (Tech Context):", tokens_tech)

print("\nEmbedding size:", apple_embedding_fruit.shape)

print("\nCosine similarity between 'apple' in both contexts:")
print(similarity.item())

# ==========================================================
# Teaching Explanation:
# ----------------------------------------------------------
# - Each token gets a vector (embedding).
# - BERT produces CONTEXTUAL embeddings.
# - Even though the word is the same, embeddings differ.
# - Similarity will NOT be 1.0 (not identical vectors).
# ==========================================================

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: bert-base-uncased
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Tokens (Fruit Context): ['[CLS]', 'i', 'ate', 'an', 'apple', 'after', 'lunch', '.', '[SEP]']
Tokens (Tech Context): ['[CLS]', 'apple', 'released', 'a', 'new', 'iphone', '.', '[SEP]']

Embedding size: torch.Size([768])

Cosine similarity between 'apple' in both contexts:
0.18383117020130157


In [2]:
# ==========================================================
# SENTENCE EMBEDDINGS – Semantic Representation Demo
# ==========================================================
# This notebook demonstrates:
# 1. Sentence embedding = one vector per sentence
# 2. Captures overall semantic meaning
# 3. Used for semantic similarity search
# ==========================================================



# Step 2: Import library
from sentence_transformers import SentenceTransformer, util
import torch

# Step 3: Load a pretrained sentence embedding model
# This model directly gives sentence-level embeddings
model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 4: Define example sentences
query = "Best shoes for marathon training"
doc1 = "Top running shoes designed for long-distance marathon runners"
doc2 = "Comfortable office chairs for long working hours"

sentences = [query, doc1, doc2]

# Step 5: Generate sentence embeddings
embeddings = model.encode(sentences, convert_to_tensor=True)

# Each sentence is now represented as ONE fixed-length vector
print("Embedding shape:", embeddings.shape)
print("Vector size for one sentence:", embeddings[0].shape)

# Step 6: Compute semantic similarity (cosine similarity)
similarity_with_doc1 = util.cos_sim(embeddings[0], embeddings[1])
similarity_with_doc2 = util.cos_sim(embeddings[0], embeddings[2])

print("\nSimilarity with running shoes sentence:",
      similarity_with_doc1.item())

print("Similarity with office chair sentence:",
      similarity_with_doc2.item())

# ==========================================================
# Teaching Explanation:
# ----------------------------------------------------------
# - Each sentence becomes ONE fixed-length vector.
# - Similar meaning → higher cosine similarity.
# - Used in semantic search, RAG, recommendation systems.
# - Perfect for search queries like:
#   "Best shoes for marathon training"
# ==========================================================

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding shape: torch.Size([3, 384])
Vector size for one sentence: torch.Size([384])

Similarity with running shoes sentence: 0.8602098226547241
Similarity with office chair sentence: 0.19003574550151825


In [None]:
# ==========================================================
# UNDERSTANDING VECTOR SPACE – Intuition Demo
# ==========================================================
# What this cell shows:
# 1. Embeddings are points in a high-dimensional space (384 dims here)
# 2. Each dimension is a learned, abstract feature
# 3. A sentence's meaning is given by its position in that space
# 4. Projecting to 2D lets us look at the "semantic map"
# ==========================================================

# Step 1: Install libraries (left commented out; run once in Colab if needed)
#!pip -q install sentence-transformers scikit-learn matplotlib

# Step 2: Import libraries
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Step 3: Load a sentence embedding model (384-dimensional)
model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 4: Example sentences covering a few different topics
sentences = [
    "Best shoes for marathon training",
    "Running shoes for long distance athletes",
    "Comfortable office chairs",
    "Ergonomic desk furniture",
    "Fresh organic apples and bananas"
]

# Step 5: Encode the sentences into high-dimensional vectors
embeddings = model.encode(sentences)

print("Embedding shape (num_sentences, dimensions):", embeddings.shape)
print("Each sentence lives in a", embeddings.shape[1], "dimensional space")

# Step 6: Project the vectors onto their first two principal components
pca = PCA(n_components=2)
reduced_embeddings = pca.fit_transform(embeddings)

# Step 7: Draw the 2D semantic map, one labelled point per sentence
fig, ax = plt.subplots()
ax.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1])

# Each row of reduced_embeddings is the (x, y) position of one sentence
for sentence, (x_pos, y_pos) in zip(sentences, reduced_embeddings):
    ax.annotate(sentence, (x_pos, y_pos))

ax.set_title("2D Projection of High-Dimensional Semantic Space")
ax.set_xlabel("Dimension 1")
ax.set_ylabel("Dimension 2")
plt.show()

# ==========================================================
# Teaching Explanation:
# ----------------------------------------------------------
# - The original vectors have 384 dimensions.
# - 384D cannot be visualized directly.
# - PCA compresses the vectors down to 2D for plotting.
# - Sentences with similar meaning end up close together.
# - Think of cities on a map:
#     Close cities → geographically related
#     Close vectors → semantically related
# ==========================================================

In [4]:
# ==========================================================
# SEMANTIC SIMILARITY – Meaning as Distance in Vector Space
# ==========================================================
# This notebook demonstrates:
# 1. Similar meaning → nearby vectors
# 2. Different meaning → far apart vectors
# 3. Cosine similarity as a similarity metric
# ==========================================================

# Step 1: Install library (Run once in Colab)
!pip -q install sentence-transformers

# Step 2: Import required modules
from sentence_transformers import SentenceTransformer, util

# Step 3: Load sentence embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 4: Define example words/sentences
text1 = "Laptop"
text2 = "Notebook computer"
text3 = "Banana"

sentences = [text1, text2, text3]

# Step 5: Generate embeddings (fixed-length vectors)
embeddings = model.encode(sentences, convert_to_tensor=True)

# Step 6: Compute cosine similarities
similarity_laptop_notebook = util.cos_sim(embeddings[0], embeddings[1])
similarity_laptop_banana = util.cos_sim(embeddings[0], embeddings[2])

# Step 7: Print results
print("Similarity between 'Laptop' and 'Notebook computer':",
      similarity_laptop_notebook.item())

print("Similarity between 'Laptop' and 'Banana':",
      similarity_laptop_banana.item())

# ==========================================================
# Teaching Explanation:
# ----------------------------------------------------------
# - Cosine similarity ranges from -1 to 1
# - Closer to 1 → very similar meaning
# - Closer to 0 → unrelated meaning
# - Similar words occupy nearby regions in vector space
# ==========================================================

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Similarity between 'Laptop' and 'Notebook computer': 0.7891231179237366
Similarity between 'Laptop' and 'Banana': 0.27611950039863586


In [7]:
import os
from openai import OpenAI

os.environ["OPENAI_API_KEY"] = ""

client = OpenAI()

response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "user", "content": "Give me a short definition of DNN."}
    ]
)

print(response.choices[0].message.content)

DNN, or Deep Neural Network, is a type of artificial neural network characterized by multiple layers of nodes (neurons) between the input and output layers. These layers enable the model to learn complex patterns and representations in data, making DNNs particularly effective for tasks like image recognition, natural language processing, and other machine learning applications.


In [8]:
# ==========================================================
# FULL RAG PIPELINE DEMO (PDF → Chunk → Embed → Store → Retrieve → Generate)
# ==========================================================
# Embedding Type: Sentence Embeddings
# Model Used: all-MiniLM-L6-v2
# Embedding Dimension: 384
# Vector Database: FAISS
# Chunk Size: Hyperparameter (adjustable)
# ==========================================================


# ==========================================================
# 1️⃣ INSTALL DEPENDENCIES
# ==========================================================
!pip -q install sentence-transformers faiss-cpu pypdf openai


# ==========================================================
# 2️⃣ IMPORT LIBRARIES
# ==========================================================
import os
import faiss
import numpy as np
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
from openai import OpenAI


# ==========================================================
# 3️⃣ USER QUERY INPUT
# ==========================================================
user_query = "What is the main topic discussed in the document?"
print("User Query:", user_query)


# ==========================================================
# 4️⃣ LOAD PDF DOCUMENT
# ==========================================================
pdf_path = "sample.pdf"   # Upload your PDF in Colab

reader = PdfReader(pdf_path)
full_text = ""

for page in reader.pages:
    full_text += page.extract_text()

print("\nPDF Loaded Successfully.")
print("Total characters in document:", len(full_text))


# ==========================================================
# 5️⃣ CHUNKING (HYPERPARAMETER)
# ==========================================================
CHUNK_SIZE = 500   # <-- Hyperparameter (adjust this)
CHUNK_OVERLAP = 50

chunks = []
for i in range(0, len(full_text), CHUNK_SIZE - CHUNK_OVERLAP):
    chunk = full_text[i:i+CHUNK_SIZE]
    chunks.append(chunk)

print("\nNumber of Chunks Created:", len(chunks))


# ==========================================================
# 6️⃣ EMBEDDING MODEL
# ==========================================================
# Type: Sentence Embedding Model
# Dimension: 384
# Not word-level, not token-level → sentence-level embedding

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

embeddings = embedding_model.encode(chunks)

print("\nEmbedding Type: Sentence Embeddings")
print("Embedding Dimension:", embeddings.shape[1])
print("Total Embeddings Generated:", embeddings.shape[0])

print("\nSample Embedding Vector (first 10 values):")
print(embeddings[0][:10])


# ==========================================================
# 7️⃣ VECTOR DATABASE (FAISS)
# ==========================================================
dimension = embeddings.shape[1]

index = faiss.IndexFlatL2(dimension)   # L2 similarity
index.add(np.array(embeddings))

print("\nVector Database Used: FAISS")
print("Total vectors stored:", index.ntotal)


# ==========================================================
# 8️⃣ METADATA STORAGE
# ==========================================================
metadata_store = []

for i, chunk in enumerate(chunks):
    metadata_store.append({
        "chunk_id": i,
        "source": pdf_path,
        "chunk_size": CHUNK_SIZE,
        "text_preview": chunk[:80]
    })

print("\nSample Metadata:")
print(metadata_store[0])


# ==========================================================
# 9️⃣ QUERY EMBEDDING + RETRIEVAL
# ==========================================================
query_embedding = embedding_model.encode([user_query])

k = 3  # top results
distances, indices = index.search(np.array(query_embedding), k)

retrieved_chunks = [chunks[i] for i in indices[0]]

print("\nTop Retrieved Chunks:")
for i, idx in enumerate(indices[0]):
    print(f"\nRank {i+1} | Chunk ID:", idx)
    print("Metadata:", metadata_store[idx])
    print("Distance:", distances[0][i])


# ==========================================================
# 🔟 GENERATE FINAL ANSWER (LLM)
# ==========================================================
client = OpenAI()   # Requires OPENAI_API_KEY

context = "\n\n".join(retrieved_chunks)

prompt = f"""
Answer the question based only on the context below.

Context:
{context}

Question:
{user_query}
"""

response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": prompt}]
)

print("\n================ FINAL ANSWER ================\n")
print(response.choices[0].message.content)

User Query: What is the main topic discussed in the document?

PDF Loaded Successfully.
Total characters in document: 42953

Number of Chunks Created: 96


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.



Embedding Type: Sentence Embeddings
Embedding Dimension: 384
Total Embeddings Generated: 96

Sample Embedding Vector (first 10 values):
[-0.0270275  -0.03641296  0.00648821 -0.07945357  0.09919923  0.03288389
  0.04146029 -0.02413676 -0.07331552 -0.01886204]

Vector Database Used: FAISS
Total vectors stored: 96

Sample Metadata:
{'chunk_id': 0, 'source': 'sample.pdf', 'chunk_size': 500, 'text_preview': '1\nATLANTIC COUNCIL\nE-COMMERCE POLICY FOR A NEW DIGITAL INDIAISSUE BRIEF\nI\nndia’s'}

Top Retrieved Chunks:

Rank 1 | Chunk ID: 85
Metadata: {'chunk_id': 85, 'source': 'sample.pdf', 'chunk_size': 500, 'text_preview': 'ic \nCouncil’s US-India Digital Economy Task Force, for their time \nand expertise'}
Distance: 1.3448017

Rank 2 | Chunk ID: 95
Metadata: {'chunk_id': 95, 'source': 'sample.pdf', 'chunk_size': 500, 'text_preview': '\nquotations in news articles, critical articles, or reviews. Please \ndirect inqu'}
Distance: 1.3994248

Rank 3 | Chunk ID: 4
Metadata: {'chunk_id': 4, 'sou