# 🚀 Vector Database Setup - Mana Knight Digital

This notebook demonstrates the setup and configuration of the vector database for product similarity search.

## Features Covered:
- Pinecone vector database initialization
- Product vectorization using TF-IDF
- Vector storage and retrieval
- Similarity search implementation
- Local fallback configuration

In [None]:
# Import required libraries
import sys
import os
sys.path.append('..')

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import json
from datetime import datetime

# Import our services
from services.vector_db import VectorDatabase
from services.database import DatabaseService
from services.recommendation import RecommendationEngine

print("✅ All imports successful!")

## 1. Initialize Vector Database Service

In [None]:
# Initialize vector database (will use local fallback if Pinecone not configured)
# NOTE(review): the fallback logic lives in services.vector_db and is not visible from
# this notebook — confirm there which backend is actually selected.
vector_db = VectorDatabase()

# Check connection status by printing whatever get_index_stats() reports.
# The structure of the returned value is defined by VectorDatabase; it is only displayed here.
stats = vector_db.get_index_stats()
print(f"Vector Database Stats: {stats}")

# Initialize database service
# NOTE(review): db_service is not referenced by any later cell visible in this notebook;
# presumably it is constructed here only to verify that the service starts — confirm.
db_service = DatabaseService()
print("✅ Services initialized successfully!")

## 2. Load Sample Product Data

In [None]:
# Sample product data for demonstration.
# Each record carries the fields that later cells read: 'id' (used to build the vector id),
# 'description' (the text that gets vectorized) and the remaining keys (stored as metadata).
sample_products = [
    {
        "id": 1,
        "stock_code": "LAPTOP001",
        "description": "High Performance Gaming Laptop with RTX Graphics",
        "unit_price": 1299.99,
        "country": "USA"
    },
    {
        "id": 2,
        "stock_code": "PHONE001",
        "description": "Smartphone with Advanced Camera and 5G Connectivity",
        "unit_price": 899.99,
        "country": "South Korea"
    },
    {
        "id": 3,
        "stock_code": "HEADPHONE001",
        "description": "Wireless Noise Cancelling Headphones Premium Audio",
        "unit_price": 299.99,
        "country": "Germany"
    },
    {
        "id": 4,
        "stock_code": "WATCH001",
        "description": "Smartwatch with Fitness Tracking and Health Monitoring",
        "unit_price": 399.99,
        "country": "USA"
    },
    {
        "id": 5,
        "stock_code": "TABLET001",
        "description": "Professional Tablet for Digital Art and Design Work",
        "unit_price": 699.99,
        "country": "Japan"
    }
]


def shorten_description(text, limit=50):
    """Return `text` cut down to at most `limit` characters for display.

    The trailing "..." is added only when characters were actually dropped, so a
    description that already fits is shown unchanged instead of looking truncated.

    Args:
        text: The string to display.
        limit: Maximum number of characters of `text` to keep.

    Returns:
        `text` itself when it fits within `limit`, otherwise its first `limit`
        characters followed by "...".
    """
    if len(text) <= limit:
        return text
    return f"{text[:limit]}..."


print(f"Loaded {len(sample_products)} sample products")
for product in sample_products:
    print(f"- {product['stock_code']}: {shorten_description(product['description'])}")

## 3. Create Product Vectors using TF-IDF

In [None]:
# The corpus is simply the description text of every sample product, in list order,
# so row i of the resulting matrix corresponds to sample_products[i].
descriptions = [item['description'] for item in sample_products]

# Configure TF-IDF. max_features is an upper bound on the vocabulary size: the vectors
# have at most 512 dimensions, and fewer when the corpus yields fewer distinct terms.
# NOTE(review): if the target index expects a fixed dimension, confirm it matches the
# "Vector Dimensions" value printed below.
vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 2),  # single words and two-word phrases
    max_features=512,
)

# Learn the vocabulary from the descriptions and encode them in a single pass
tfidf_matrix = vectorizer.fit_transform(descriptions)
print(
    f"TF-IDF Matrix Shape: {tfidf_matrix.shape}",
    f"Vector Dimensions: {tfidf_matrix.shape[1]}",
    sep="\n",
)

# Densify the sparse matrix so each row can later be turned into a plain list
product_vectors = tfidf_matrix.toarray()
print(f"Product Vectors Shape: {product_vectors.shape}")

## 4. Store Vectors in Vector Database

In [None]:
# Build one (vector_id, vector, metadata) tuple per product.
# Row `idx` of product_vectors belongs to the product at position `idx`; the id is
# derived from the product's own 'id' field, and the metadata is every other field.
vectors_to_store = [
    (
        f"product_{item['id']}",
        product_vectors[idx].tolist(),
        {key: item[key] for key in ('stock_code', 'description', 'unit_price', 'country')},
    )
    for idx, item in enumerate(sample_products)
]

# Hand the whole batch to the vector database and report what it returned
success = vector_db.upsert_vectors(vectors_to_store)
print(f"Vector storage successful: {success}")

# Re-read the index statistics so the effect of the upsert can be inspected
updated_stats = vector_db.get_index_stats()
print(f"Updated Vector Database Stats: {updated_stats}")

## 5. Test Similarity Search

In [None]:
# Free-text query used to exercise the index
test_query = "gaming laptop with graphics card"
print(f"Test Query: '{test_query}'")

# Encode the query with the already-fitted vectorizer. transform() is given a
# one-element list, so row 0 of the dense result is the query's vector.
query_vector = vectorizer.transform([test_query]).toarray()[0]
print(f"Query Vector Shape: {query_vector.shape}")

# Ask the vector database for its top three matches
similar_products = vector_db.query_vectors(
    query_vector=query_vector.tolist(),
    top_k=3
)

print(f"\nFound {len(similar_products)} similar products:")
for rank, match in enumerate(similar_products, start=1):
    # Each match is read as a mapping with 'metadata' and 'score' entries
    meta = match['metadata']
    print(f"{rank}. {meta['stock_code']}: {meta['description']}")
    print(f"   Similarity Score: {match['score']:.4f}")
    print(f"   Price: ${meta['unit_price']}")
    print()

## 6. Test Recommendation Engine Integration

In [None]:
# Initialize recommendation engine
rec_engine = RecommendationEngine()

# Queries chosen to line up with the sample products loaded earlier in the notebook
test_queries = [
    "wireless headphones for music",
    "smartwatch for fitness tracking",
    "tablet for digital art",
    "smartphone with good camera"
]

print("Testing Recommendation Engine:")
print("=" * 50)

for query in test_queries:
    print(f"\nQuery: '{query}'")
    try:
        result = rec_engine.get_recommendations(query, top_k=2)
        # `or []` also covers a 'products' key that is present but set to None
        recommendations = result.get('products') or []
        response = result.get('response', 'No response generated')

        print(f"Response: {response}")
        print(f"Recommendations ({len(recommendations)}):")

        for i, product in enumerate(recommendations, 1):
            # Fall back when the key is missing *or* holds None/empty, so slicing
            # and float formatting below cannot fail on a None value.
            description = product.get('description') or 'N/A'
            if len(description) > 60:
                # Only mark the text as truncated when something was actually cut
                description = description[:60] + "..."
            similarity = product.get('similarity_score') or 0

            print(f"  {i}. {product.get('stock_code', 'N/A')}: {description}")
            print(f"     Similarity: {similarity:.3f}")
    except Exception as e:
        # Deliberate best-effort: one failing query must not stop the remaining ones.
        # The exception type is included because some messages (e.g. a KeyError's)
        # are just a bare value and unreadable on their own.
        print(f"Error: {type(e).__name__}: {e}")

    print("-" * 30)

## 7. Performance Analysis

In [None]:
import time

# Measure query performance
def measure_query_performance(query, num_iterations=10):
    """Time repeated vectorize-and-search round trips for a single query.

    Every iteration encodes `query` with the notebook-level `vectorizer` and then
    calls `vector_db.query_vectors(..., top_k=5)`; both steps are inside the timed
    region, so the figures cover the full round trip rather than the search alone.

    Args:
        query: Text to search for.
        num_iterations: Number of timed repetitions; must be at least 1.

    Returns:
        dict with 'avg_time', 'min_time', 'max_time' and 'std_time', all in seconds.

    Raises:
        ValueError: If `num_iterations` is less than 1 (there would be nothing to
            aggregate).
    """
    if num_iterations < 1:
        raise ValueError("num_iterations must be at least 1")

    times = []

    for _ in range(num_iterations):
        # perf_counter() is monotonic and high-resolution; time.time() can jump with
        # system clock adjustments and is too coarse for millisecond-scale intervals.
        start_time = time.perf_counter()

        # Vectorize query
        query_vector = vectorizer.transform([query]).toarray()[0]

        # Search (the result is discarded; only the elapsed time matters here)
        vector_db.query_vectors(query_vector.tolist(), top_k=5)

        times.append(time.perf_counter() - start_time)

    return {
        'avg_time': np.mean(times),
        'min_time': np.min(times),
        'max_time': np.max(times),
        'std_time': np.std(times)
    }

# Benchmark one representative query using the default iteration count
test_query = "laptop computer for programming"
perf_stats = measure_query_performance(test_query)

print("Query Performance Analysis:")
print(f"Query: '{test_query}'")

# The statistics are multiplied by 1000 for display in milliseconds
for label, key in (
    ("Average Time", "avg_time"),
    ("Min Time", "min_time"),
    ("Max Time", "max_time"),
    ("Std Deviation", "std_time"),
):
    print(f"{label}: {perf_stats[key]*1000:.2f} ms")

## 8. Save Configuration and Vectorizer

In [None]:
import pickle

# Save vectorizer for future use.
# Paths are defined once so the files written and the paths reported below cannot diverge.
models_dir = '../models'
vectorizer_path = f'{models_dir}/tfidf_vectorizer.pkl'
product_vectors_path = f'{models_dir}/product_vectors.pkl'

os.makedirs(models_dir, exist_ok=True)

# SECURITY: unpickling executes arbitrary code, so these files must only ever be
# loaded from a trusted location.
with open(vectorizer_path, 'wb') as f:
    pickle.dump(vectorizer, f)

# Save product vectors together with the products they were built from.
# The recorded configuration is read back from the fitted vectorizer instead of being
# re-typed, so it always describes the object that was actually pickled above.
with open(product_vectors_path, 'wb') as f:
    pickle.dump({
        'vectors': product_vectors,
        'products': sample_products,
        'vectorizer_config': {
            'max_features': vectorizer.max_features,
            'ngram_range': vectorizer.ngram_range,
            'stop_words': vectorizer.stop_words
        },
        'created_at': datetime.now().isoformat()
    }, f)

print("✅ Vectorizer and product vectors saved successfully!")
print("Files saved:")
print(f"- {vectorizer_path}")
print(f"- {product_vectors_path}")

## 🎉 Summary

This notebook successfully demonstrated:

1. ✅ **Vector Database Setup**: Initialized the `VectorDatabase` service (Pinecone, with a local fallback when Pinecone is not configured)
2. ✅ **Product Vectorization**: Created TF-IDF vectors for product descriptions
3. ✅ **Vector Storage**: Stored product vectors with metadata
4. ✅ **Similarity Search**: Implemented and tested vector similarity search
5. ✅ **Integration Testing**: Verified recommendation engine integration
6. ✅ **Performance Analysis**: Measured query response times
7. ✅ **Model Persistence**: Saved vectorizer and vectors for production use

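The vector search pipeline now works end to end on the five sample products. Before using it in production with the e-commerce recommendation system, fit the vectorizer on the full product catalog and confirm that the resulting vector dimensionality matches the vector index configuration.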