# Notebook 03: Embedding Generation & FAISS Index Creation


### Convert text to vectors and build semantic search infrastructure

In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from tqdm import tqdm
import pickle
import time


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ----------------------------------------------------------------
# Section 1 - Load cleaned data
# Reads the cleaned papers CSV into `df`; the abstract and title
# columns of this frame are used by the cells further down.
# ----------------------------------------------------------------

clean_csv_path = '../data/arxiv_papers_clean.csv'

print("\n📂 Loading cleaned dataset...")
df = pd.read_csv(clean_csv_path)

paper_total = len(df)
print(f"✓ Loaded {paper_total} papers")



📂 Loading cleaned dataset...
✓ Loaded 2130 papers


In [4]:
# ----------------------------------------------------------------
# Section 2 - Model selection & loading
# Prints a short description of the chosen encoder, then creates
# the notebook-level `model` object used for all encoding below.
# ----------------------------------------------------------------

model_banner = (
    "\n Loading embedding model...",
    "Model: sentence-transformers/all-MiniLM-L6-v2",
    "  - Dimension: 384",
    "  - Fast and efficient for semantic search",
    "  - Good balance of quality and speed",
)
for banner_line in model_banner:
    print(banner_line)

model = SentenceTransformer('all-MiniLM-L6-v2')
print("✓ Model loaded successfully")


 Loading embedding model...
Model: sentence-transformers/all-MiniLM-L6-v2
  - Dimension: 384
  - Fast and efficient for semantic search
  - Good balance of quality and speed
✓ Model loaded successfully


In [5]:
# ----------------------------------------------------------------
# Section 3 - Embedding generation
# Encodes every entry of df['abstract_clean'] with `model` and
# reports how long the encoding took.
# ----------------------------------------------------------------

section_rule = "=" * 60
print("\n" + section_rule)
print("GENERATING EMBEDDINGS")
print(section_rule)

# Abstracts are embedded rather than titles: they carry more semantic content.
texts_to_embed = df['abstract_clean'].tolist()
num_texts = len(texts_to_embed)

print(f"\nGenerating embeddings for {num_texts} abstracts...")
print("This may take a few minutes...")

start_time = time.time()

# Encode in mini-batches; the options are collected once and unpacked below.
batch_size = 32
encode_options = dict(
    batch_size=batch_size,
    show_progress_bar=True,
    convert_to_numpy=True,
)
embeddings = model.encode(texts_to_embed, **encode_options)

elapsed_time = time.time() - start_time

print(f"\n✓ Embedding generation complete!")
print(f"  Time taken: {elapsed_time:.2f} seconds")
print(f"  Average time per paper: {elapsed_time / num_texts:.3f} seconds")
print(f"  Embedding shape: {embeddings.shape}")


GENERATING EMBEDDINGS

Generating embeddings for 2130 abstracts...
This may take a few minutes...


Batches: 100%|██████████| 67/67 [00:56<00:00,  1.19it/s]


✓ Embedding generation complete!
  Time taken: 56.48 seconds
  Average time per paper: 0.027 seconds
  Embedding shape: (2130, 384)





In [6]:

# ----------------------------------------------------------------
# Section 4 - Validate embeddings
# Prints summary statistics and NaN / inf counts for `embeddings`,
# then the header for the pairwise similarity check that follows.
# ----------------------------------------------------------------

banner_rule = "=" * 60
print("\n" + banner_rule)
print("EMBEDDING VALIDATION")
print(banner_rule)

# Statistics are taken over every component of every embedding.
print("\nEmbedding statistics:")
embedding_stats = (
    ("Mean", embeddings.mean()),
    ("Std", embeddings.std()),
    ("Min", embeddings.min()),
    ("Max", embeddings.max()),
)
for stat_name, stat_value in embedding_stats:
    print(f"  {stat_name}: {stat_value:.4f}")

# Count non-finite entries (reported only; nothing is raised here).
nan_count = np.isnan(embeddings).sum()
inf_count = np.isinf(embeddings).sum()
print(f"\nData quality checks:")
print(f"  NaN values: {nan_count}")
print(f"  Infinite values: {inf_count}")

# Header for the semantic similarity test
print("\n" + banner_rule)
print("SEMANTIC SIMILARITY TEST")
print(banner_rule)

def cosine_similarity(vec1, vec2):
    """Calculate cosine similarity between two vectors.

    Args:
        vec1: 1-D array-like of numbers.
        vec2: 1-D array-like with the same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (||vec1|| * ||vec2||)``. When either vector has
        zero norm the cosine is undefined; 0.0 is returned instead of letting
        the division produce NaN.
    """
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denominator == 0:
        # Zero-length input: avoid 0/0 -> NaN (and the RuntimeWarning with it).
        return 0.0
    return np.dot(vec1, vec2) / denominator

# Sanity check on the first three rows: show their titles, then every
# ordered pair's cosine similarity (the diagonal should read 1.0000).
test_indices = [0, 1, 2]
print("\nTesting semantic similarity between first 3 papers:")

for paper_pos in test_indices:
    print(f"\n📄 Paper {paper_pos}: {df.iloc[paper_pos]['title'][:60]}...")

print("\nSimilarity matrix:")
index_pairs = [(left, right) for left in test_indices for right in test_indices]
for left, right in index_pairs:
    sim = cosine_similarity(embeddings[left], embeddings[right])
    print(f"  Paper {left} <-> Paper {right}: {sim:.4f}")


EMBEDDING VALIDATION

Embedding statistics:
  Mean: 0.0010
  Std: 0.0510
  Min: -0.2439
  Max: 0.2378

Data quality checks:
  NaN values: 0
  Infinite values: 0

SEMANTIC SIMILARITY TEST

Testing semantic similarity between first 3 papers:

📄 Paper 0: An Optimal Control View of Adversarial Machine Learning...

📄 Paper 1: Machine Learning for Clinical Predictive Analytics...

📄 Paper 2: Towards Modular Machine Learning Solution Development: Benef...

Similarity matrix:
  Paper 0 <-> Paper 0: 1.0000
  Paper 0 <-> Paper 1: 0.3258
  Paper 0 <-> Paper 2: 0.3027
  Paper 1 <-> Paper 0: 0.3258
  Paper 1 <-> Paper 1: 1.0000
  Paper 1 <-> Paper 2: 0.4468
  Paper 2 <-> Paper 0: 0.3027
  Paper 2 <-> Paper 1: 0.4468
  Paper 2 <-> Paper 2: 1.0000


In [7]:
# ================================
# 5. BUILD FAISS INDEX
# ================================
# Produces `embeddings_normalized` (row-wise unit-length copy of `embeddings`),
# `dimension`, and `index` (an IndexFlatIP holding the normalized vectors).

print("\n" + "="*60)
print("BUILDING FAISS INDEX")
print("="*60)

# Normalize embeddings for cosine similarity.
# A row whose norm is exactly 0 would otherwise become NaN (0/0) and poison
# both the index and the saved array; such rows are left as zero vectors.
row_norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
zero_norm_rows = int((row_norms == 0).sum())
safe_norms = row_norms.copy()          # copy keeps the original dtype
safe_norms[safe_norms == 0] = 1.0
embeddings_normalized = embeddings / safe_norms

if zero_norm_rows:
    # Only printed in the abnormal case, so normal runs keep the same output.
    print(f"⚠ {zero_norm_rows} embedding(s) had zero norm and were left as zero vectors")

# Create FAISS index (IndexFlatIP for inner product = cosine similarity with normalized vectors)
dimension = embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)

# Add embeddings to index (cast to float32 before adding)
index.add(embeddings_normalized.astype('float32'))

print(f"✓ FAISS index created")
print(f"  Index type: IndexFlatIP (Inner Product)")
print(f"  Total vectors: {index.ntotal}")
print(f"  Dimension: {dimension}")


BUILDING FAISS INDEX
✓ FAISS index created
  Index type: IndexFlatIP (Inner Product)
  Total vectors: 2130
  Dimension: 384


In [8]:
# ----------------------------------------------------------------
# Section 6 - Test semantic search
# ----------------------------------------------------------------

search_rule = "=" * 60
print("\n" + search_rule, "TESTING SEMANTIC SEARCH", search_rule, sep="\n")

def search_papers(query, top_k=5, encoder=None, search_index=None):
    """Search for papers similar to query.

    Args:
        query: Free-text query string.
        top_k: Maximum number of results to return.
        encoder: Object with an ``encode(list_of_str, convert_to_numpy=True)``
            method returning one row per input string. Defaults to the
            notebook-level ``model``.
        search_index: Object exposing ``ntotal`` and ``search(vectors, k)``
            (the FAISS index interface used here). Defaults to the
            notebook-level ``index``.

    Returns:
        ``(indices, scores)``: two 1-D arrays of equal length taken from the
        first row of the index's search result. At most
        ``min(top_k, search_index.ntotal)`` entries are returned and padding
        ids (-1) are removed, so every index is safe to pass to ``df.iloc``.
    """
    encoder = model if encoder is None else encoder
    search_index = index if search_index is None else search_index

    # Never ask for more neighbours than the index holds: FAISS pads missing
    # results with id -1, and df.iloc[-1] would silently return the last paper.
    k = min(int(top_k), int(search_index.ntotal))
    if k <= 0:
        return np.empty(0, dtype='int64'), np.empty(0, dtype='float32')

    # Encode query
    query_embedding = np.asarray(encoder.encode([query], convert_to_numpy=True))

    # Row-wise L2 normalisation (identical to the whole-array norm for the
    # single-row case); a zero vector is left unchanged instead of becoming NaN.
    norms = np.linalg.norm(query_embedding, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    query_normalized = query_embedding / norms

    # Search
    distances, indices = search_index.search(query_normalized.astype('float32'), k)

    # Defensive: drop any remaining -1 padding.
    valid = indices[0] >= 0
    return indices[0][valid], distances[0][valid]

# Sample queries used to eyeball retrieval quality
test_queries = [
    "deep learning for computer vision",
    "natural language processing transformers",
    "reinforcement learning robotics",
]

query_divider = "-" * 60

for query in test_queries:
    print(f"\n🔍 Query: '{query}'")
    print(query_divider)

    # Top three hits for this query: row positions into df plus their scores
    indices, scores = search_papers(query, top_k=3)
    ranked_hits = enumerate(zip(indices, scores), start=1)

    for rank, (idx, score) in ranked_hits:
        paper = df.iloc[idx]
        print(f"\n  {rank}. Score: {score:.4f}")
        print(f"     Title: {paper['title'][:80]}...")
        print(f"     Abstract preview: {paper['abstract_clean'][:120]}...")


TESTING SEMANTIC SEARCH

🔍 Query: 'deep learning for computer vision'
------------------------------------------------------------

  1. Score: 0.7082
     Title: Deep Learning vs. Traditional Computer Vision...
     Abstract preview: Deep Learning has pushed the limits of what was possible in the domain of Digital Image Processing. However, that is not...

  2. Score: 0.6987
     Title: Residual Quantity in Percentage of Factory Machines Using Computer Vision and Ma...
     Abstract preview: Computer vision has been thriving since AI development was gaining thrust. Using deep learning techniques has been the m...

  3. Score: 0.6893
     Title: Integration and Performance Analysis of Artificial Intelligence and Computer Vis...
     Abstract preview: This paper focuses on the analysis of the application effectiveness of the integration of deep learning and computer vis...

🔍 Query: 'natural language processing transformers'
------------------------------------------------------------


In [9]:
# ----------------------------------------------------------------
# Section 7 - Save artifacts
# Writes three files under ../data: the normalized embeddings (.npy),
# the FAISS index (.bin) and a small pickled metadata dict (.pkl).
# ----------------------------------------------------------------

artifact_rule = "=" * 60
print("\n" + artifact_rule)
print("SAVING ARTIFACTS")
print(artifact_rule)

# 1) normalized embedding matrix
np.save('../data/embeddings.npy', embeddings_normalized)
print("✓ Saved embeddings to ../data/embeddings.npy")

# 2) FAISS index
faiss.write_index(index, '../data/faiss_index.bin')
print("✓ Saved FAISS index to ../data/faiss_index.bin")

# 3) metadata describing how the index was built
metadata = dict(
    model_name='all-MiniLM-L6-v2',
    embedding_dim=dimension,
    num_papers=len(df),
    index_type='IndexFlatIP',
    creation_date=pd.Timestamp.now().isoformat(),
)

with open('../data/index_metadata.pkl', 'wb') as metadata_file:
    pickle.dump(metadata, metadata_file)
print("✓ Saved metadata to ../data/index_metadata.pkl")


SAVING ARTIFACTS
✓ Saved embeddings to ../data/embeddings.npy
✓ Saved FAISS index to ../data/faiss_index.bin
✓ Saved metadata to ../data/index_metadata.pkl


In [13]:
# ================================
# 8. PERFORMANCE BENCHMARKING
# ================================
# Measures the average wall-clock time of search_papers() over repeated calls.
# Each call encodes the query and then searches the index, so the reported
# figure is end-to-end latency rather than the index lookup alone.

print("\n" + "="*60)
print("SEARCH PERFORMANCE BENCHMARK")
print("="*60)

# Test search latency
num_queries = 100
query = "machine learning"

# One untimed warm-up call so any first-call overhead is not included.
search_papers(query, top_k=10)

# perf_counter() is monotonic and high-resolution; time.time() can jump with
# system clock adjustments and is too coarse on some platforms.
start = time.perf_counter()
for _ in range(num_queries):
    search_papers(query, top_k=10)
end = time.perf_counter()

avg_latency = (end - start) / num_queries

# Guard the reciprocal in case the measured interval rounds to zero.
queries_per_second = 1 / avg_latency if avg_latency > 0 else float('inf')

print(f"\nAverage search latency: {avg_latency*1000:.2f} ms")
print(f"Queries per second: {queries_per_second:.1f}")


SEARCH PERFORMANCE BENCHMARK

Average search latency: 10.52 ms
Queries per second: 95.1


In [14]:
# ----------------------------------------------------------------
# Section 9 - Summary
# Prints the key figures from the cells above as "label: value" lines.
# ----------------------------------------------------------------

closing_rule = "=" * 60
print("\n" + closing_rule)
print("EMBEDDING & INDEX CREATION COMPLETE!")
print(closing_rule)

# Derived figures: latency in milliseconds, in-memory array size in MiB
latency_ms = avg_latency * 1000
embeddings_mb = embeddings_normalized.nbytes / 1024**2

summary = {
    'Total Papers': len(df),
    'Embedding Dimension': dimension,
    'Model': 'all-MiniLM-L6-v2',
    'Index Type': 'FAISS IndexFlatIP',
    'Avg Search Latency': f"{latency_ms:.2f} ms",
    'Storage Size (embeddings)': f"{embeddings_mb:.2f} MB",
}

for key in summary:
    value = summary[key]
    print(f"{key}: {value}")

print("\n✅ Ready for RAG system development in notebook 04")
print(closing_rule)


EMBEDDING & INDEX CREATION COMPLETE!
Total Papers: 2130
Embedding Dimension: 384
Model: all-MiniLM-L6-v2
Index Type: FAISS IndexFlatIP
Avg Search Latency: 10.52 ms
Storage Size (embeddings): 3.12 MB

✅ Ready for RAG system development in notebook 04
