In [2]:

# ============================================================================
# 1. TOKENS AND TOKENIZATION
# ============================================================================


import tiktoken

text = "Hello! How are you doing today?"

# cl100k_base is the encoding this notebook uses as the GPT-4 tokenizer
encoding = tiktoken.get_encoding("cl100k_base")

# Text -> list of integer token IDs
tokens = encoding.encode(text)

print(f"\nOriginal text: '{text}'")
print(f"\nToken IDs: {tokens}")
print(f"Number of tokens: {len(tokens)}")

# Decode every ID on its own (as a one-element list) so each token can be
# shown next to the piece of text it stands for
decoded_pieces = [encoding.decode([tid]) for tid in tokens]

print("\nDecoding each token:")
for tid, piece in zip(tokens, decoded_pieces):
    print(f"  Token ID {tid:5d} -> '{piece}'")


Original text: 'Hello! How are you doing today?'

Token IDs: [9906, 0, 2650, 527, 499, 3815, 3432, 30]
Number of tokens: 8

Decoding each token:
  Token ID  9906 -> 'Hello'
  Token ID     0 -> '!'
  Token ID  2650 -> ' How'
  Token ID   527 -> ' are'
  Token ID   499 -> ' you'
  Token ID  3815 -> ' doing'
  Token ID  3432 -> ' today'
  Token ID    30 -> '?'


In [5]:

# ============================================================================
# 2. EMBEDDINGS
# ============================================================================

from sentence_transformers import SentenceTransformer

# all-MiniLM-L6-v2 is chosen here as a small, fast sentence-embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Three short sentences to embed; the similarity section later compares them
sentences = [
    "The cat sits on the mat",
    "A feline rests on the rug",
    "The dog runs in the park",
]

# One embedding vector per input sentence
embeddings = model.encode(sentences)

embedding_dim = embeddings[0].shape[0]
print(f"\nGenerated embeddings for {len(sentences)} sentences")
print(f"Embedding dimensions: {embedding_dim}")

# Print only the head and tail of each vector to keep the output readable
for number, (sentence, vector) in enumerate(zip(sentences, embeddings), start=1):
    print(f"\nSentence {number}: '{sentence}'")
    print(f"Embedding (first 10 values): {vector[:10]}")
    print(f"Embedding (last 10 values): {vector[-10:]}")

  from .autonotebook import tqdm as notebook_tqdm



Generated embeddings for 3 sentences
Embedding dimensions: 384

Sentence 1: 'The cat sits on the mat'
Embedding (first 10 values): [ 0.13489066 -0.03206333 -0.02033523  0.03590099 -0.0283331   0.04150213
  0.03315875  0.03660566  0.00861661  0.03763952]
Embedding (last 10 values): [-0.08110048 -0.04863552  0.01041568  0.00716836  0.03275092  0.05027731
  0.00980353  0.04674229  0.01492449  0.05863348]

Sentence 2: 'A feline rests on the rug'
Embedding (first 10 values): [ 0.06828602  0.02979378  0.03781996  0.11503891 -0.02664034  0.07539926
  0.01991902  0.00916963  0.01979519  0.02939497]
Embedding (last 10 values): [-0.02178581 -0.02646917 -0.00078678 -0.01127525  0.08196999  0.09736849
  0.04664826 -0.00084484  0.01191917  0.03106434]

Sentence 3: 'The dog runs in the park'
Embedding (first 10 values): [ 0.05794052 -0.03488497  0.05711472 -0.00158803  0.05946509 -0.02963705
 -0.01222521  0.01785607  0.06413977  0.03801326]
Embedding (last 10 values): [ 0.02910725  0.02868083 -0.06

In [10]:
# ============================================================================
# 3. VECTOR SIMILARITY (Cosine Similarity)
# ============================================================================

import numpy as np

def cosine_similarity(vec1, vec2):
    """Calculate cosine similarity between two vectors.

    Args:
        vec1: Array-like of numbers (intended for 1-D vectors).
        vec2: Array-like of numbers with the same length as ``vec1``.

    Returns:
        The dot product of the inputs divided by the product of their
        Euclidean norms. If either input has zero norm the direction is
        undefined, so 0.0 is returned instead of dividing by zero
        (which would yield ``nan`` and a RuntimeWarning).
    """
    # asarray avoids copying inputs that are already NumPy arrays
    vec1 = np.asarray(vec1)
    vec2 = np.asarray(vec2)

    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denominator == 0:
        return 0.0

    return np.dot(vec1, vec2) / denominator

# Using embeddings from previous step.
# The guard covers both names this cell reads (`embeddings` and `sentences`);
# when they are missing, the else branch says so instead of silently printing
# nothing.
if 'embeddings' in locals() and 'sentences' in locals() and len(embeddings) >= 3:
    print("\nComparing sentence similarities:")
    print(f"\nSentence 1: '{sentences[0]}'")
    print(f"Sentence 2: '{sentences[1]}'")
    print(f"Sentence 3: '{sentences[2]}'")

    # Pairwise cosine similarity between the first three sentence embeddings
    sim_1_2 = cosine_similarity(embeddings[0], embeddings[1])
    sim_1_3 = cosine_similarity(embeddings[0], embeddings[2])
    sim_2_3 = cosine_similarity(embeddings[1], embeddings[2])

    print(f"\nSimilarity (Sentence 1 ↔ Sentence 2): {sim_1_2:.4f}")
    print(f"Similarity (Sentence 1 ↔ Sentence 3): {sim_1_3:.4f}")
    print(f"Similarity (Sentence 2 ↔ Sentence 3): {sim_2_3:.4f}")

    print("\nInterpretation:")
    print("- Sentences 1 & 2 should be MOST similar (same meaning, different words)")
    print("- Sentences 1 & 3 should be LEAST similar (different meanings)")
else:
    print("\nNeed `sentences` and at least 3 `embeddings` - run the EMBEDDINGS cell (section 2) first.")



Comparing sentence similarities:

Sentence 1: 'The cat sits on the mat'
Sentence 2: 'A feline rests on the rug'
Sentence 3: 'The dog runs in the park'

Similarity (Sentence 1 ↔ Sentence 2): 0.5607
Similarity (Sentence 1 ↔ Sentence 3): 0.0949
Similarity (Sentence 2 ↔ Sentence 3): 0.0973

Interpretation:
- Sentences 1 & 2 should be MOST similar (same meaning, different words)
- Sentences 1 & 3 should be LEAST similar (different meanings)


In [11]:

# ============================================================================
# 4. VECTOR SEARCH (Finding Most Similar Document)
# ============================================================================

# Simulate a simple vector search

from sentence_transformers import SentenceTransformer

# Same sentence-embedding model name as in the embeddings section
model = SentenceTransformer('all-MiniLM-L6-v2')

# Small in-memory corpus standing in for a "knowledge base"
documents = [
    "Python is a programming language",
    "Machine learning uses algorithms to learn from data",
    "The Eiffel Tower is in Paris",
    "Deep learning is a subset of machine learning",
    "JavaScript is used for web development",
]

# One embedding per document
doc_embeddings = model.encode(documents)

# Embed the user's question with the same model; encode() is given a
# one-element list, so take the single result back out
query = "What is deep learning?"
query_embedding = model.encode([query])[0]

print(f"\nQuery: '{query}'")
print(f"\nSearching through {len(documents)} documents...\n")

# Score each document against the query as (document index, similarity),
# ordered from the highest similarity to the lowest
similarities = sorted(
    (
        (position, cosine_similarity(query_embedding, doc_vector))
        for position, doc_vector in enumerate(doc_embeddings)
    ),
    key=lambda pair: pair[1],
    reverse=True,
)

print("Results (ranked by relevance):")
for rank, (doc_idx, score) in enumerate(similarities, start=1):
    print(f"\n{rank}. [Score: {score:.4f}] {documents[doc_idx]}")



Query: 'What is deep learning?'

Searching through 5 documents...

Results (ranked by relevance):

1. [Score: 0.7540] Deep learning is a subset of machine learning

2. [Score: 0.4643] Machine learning uses algorithms to learn from data

3. [Score: 0.2547] JavaScript is used for web development

4. [Score: 0.2431] Python is a programming language

5. [Score: -0.0161] The Eiffel Tower is in Paris


In [13]:

# ============================================================================
# 5. VECTOR DATABASE (Chromadb - Simple Example)
# ============================================================================


import chromadb
from chromadb.utils import embedding_functions

# Create a simple in-memory database
client = chromadb.Client()

# Create or get collection. On a re-run of this cell the existing collection
# (with whatever it already holds) is returned rather than a fresh one.
collection = client.get_or_create_collection(
    name="my_documents",
    embedding_function=embedding_functions.DefaultEmbeddingFunction()
)

# Documents to store in the database
documents = [
    "Python is a high-level programming language",
    "Machine learning enables computers to learn",
    "Paris is the capital of France",
    "Neural networks are inspired by the brain",
    "JavaScript runs in web browsers"
]

# Store documents with IDs and metadata.
# upsert (instead of add) keeps the cell idempotent: because the collection is
# reused on re-run, the same doc_<i> IDs would otherwise be added a second time.
collection.upsert(
    documents=documents,
    ids=[f"doc_{i}" for i in range(len(documents))],
    metadatas=[{"source": f"document_{i}.txt"} for i in range(len(documents))]
)

print(f"\nAdded {len(documents)} documents to vector database")

# Query the database
query = "Tell me about artificial intelligence"
n_results = 3  # Number of top matches to return; also drives the header below

print(f"\nQuery: '{query}'")
print("\nSearching vector database...\n")

results = collection.query(
    query_texts=[query],
    n_results=n_results
)

# Each field in `results` holds one list per query text; a single query was
# sent, so index [0] selects its matches.
# NOTE(review): the values are distances, so smaller should mean a closer
# match - confirm against the distance metric configured for this collection.
print(f"Top {n_results} Results:")
for i, (doc, distance, metadata) in enumerate(zip(
    results['documents'][0],
    results['distances'][0],
    results['metadatas'][0]
), 1):
    print(f"\n{i}. [Distance: {distance:.4f}]")
    print(f"   Document: {doc}")
    print(f"   Source: {metadata['source']}")

/Users/saurabhbhardwaj/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:16<00:00, 4.96MiB/s]



Added 5 documents to vector database

Query: 'Tell me about artificial intelligence'

Searching vector database...

Top 3 Results:

1. [Distance: 1.0995]
   Document: Neural networks are inspired by the brain
   Source: document_3.txt

2. [Distance: 1.1080]
   Document: Machine learning enables computers to learn
   Source: document_1.txt

3. [Distance: 1.3073]
   Document: Python is a high-level programming language
   Source: document_0.txt
