In [1]:
#Step 1: Environment Setup - Complete Implementation

In [2]:
#Check Python Environment

In [3]:
import sys
import platform

print(f"Python version: {sys.version}")
print(f"Platform: {platform.platform()}")
print(f"Architecture: {platform.architecture()}")

# Check if we're in a virtual environment (recommended)
if hasattr(sys, 'real_prefix') or (hasattr(sys, 'base_prefix') and sys.base_prefix != sys.prefix):
    print("✓ Running in virtual environment")
else:
    print("⚠️  Consider using a virtual environment for this project")

Python version: 3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 13:17:27) [MSC v.1929 64 bit (AMD64)]
Platform: Windows-11-10.0.26100-SP0
Architecture: ('64bit', 'WindowsPE')
⚠️  Consider using a virtual environment for this project


In [4]:
#Install Core Packages

In [5]:
# Install all required packages for the RAG system
import subprocess
import sys

def install_package(package):
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        print(f"✓ Successfully installed {package}")
    except subprocess.CalledProcessError as e:
        print(f"✗ Failed to install {package}: {e}")

# Core packages for RAG system
packages = [
    "sentence-transformers",  # For local embeddings
    "faiss-cpu",             # Vector database (CPU version)
    "chromadb",              # Alternative vector database
    "pandas",                # Data manipulation
    "numpy",                 # Numerical operations
    "scikit-learn",          # ML utilities
    "nltk",                  # Natural language processing
    "tqdm",                  # Progress bars
    "matplotlib",            # Plotting
    "seaborn",               # Better plotting
    "jupyter",               # Jupyter notebook support
    "ipywidgets",            # Interactive widgets
]

print("Installing packages for RAG system...")
for package in packages:
    install_package(package)

Installing packages for RAG system...
✓ Successfully installed sentence-transformers
✓ Successfully installed faiss-cpu
✓ Successfully installed chromadb
✓ Successfully installed pandas
✓ Successfully installed numpy
✓ Successfully installed scikit-learn
✓ Successfully installed nltk
✓ Successfully installed tqdm
✓ Successfully installed matplotlib
✓ Successfully installed seaborn
✓ Successfully installed jupyter
✓ Successfully installed ipywidgets


In [7]:
#Install Optional Packages (GPU Support)

In [8]:
# Optional: Install GPU-accelerated packages if you have CUDA
import torch

def check_gpu_support():
    if torch.cuda.is_available():
        print(f"✓ CUDA available: {torch.cuda.get_device_name()}")
        return True
    else:
        print("ℹ️  CUDA not available, using CPU versions")
        return False

has_gpu = check_gpu_support()

# Install GPU versions if available
if has_gpu:
    gpu_packages = [
        "faiss-gpu",  # GPU version of FAISS (will replace faiss-cpu)
    ]
    
    print("\nInstalling GPU-accelerated packages...")
    for package in gpu_packages:
        install_package(package)

ℹ️  CUDA not available, using CPU versions


In [9]:
#Download and Setup NLTK Data

In [10]:
# Download required NLTK data for text processing
import nltk

nltk_downloads = [
    'punkt',        # Sentence tokenizer
    'stopwords',    # Stop words
    'wordnet',      # WordNet lemmatizer
    'averaged_perceptron_tagger',  # POS tagger
]

print("Downloading NLTK data...")
for item in nltk_downloads:
    try:
        nltk.download(item, quiet=True)
        print(f"✓ Downloaded {item}")
    except Exception as e:
        print(f"✗ Failed to download {item}: {e}")

Downloading NLTK data...
✓ Downloaded punkt
✓ Downloaded stopwords
✓ Downloaded wordnet
✓ Downloaded averaged_perceptron_tagger


In [11]:
# Import and Test All Libraries

In [12]:
# Import all libraries and test they work correctly
import warnings
warnings.filterwarnings('ignore')  # Suppress warnings for cleaner output

# Core imports
try:
    import pandas as pd
    import numpy as np
    import json
    import os
    import re
    from pathlib import Path
    from typing import List, Dict, Tuple, Optional
    print("✓ Core libraries imported")
except ImportError as e:
    print(f"✗ Core import error: {e}")

# ML and NLP imports
try:
    from sentence_transformers import SentenceTransformer
    import faiss
    import chromadb
    from sklearn.metrics.pairwise import cosine_similarity
    import nltk
    from nltk.tokenize import sent_tokenize, word_tokenize
    from nltk.corpus import stopwords
    print("✓ ML/NLP libraries imported")
except ImportError as e:
    print(f"✗ ML/NLP import error: {e}")

# Utility imports
try:
    from tqdm.notebook import tqdm  # Progress bars for Jupyter
    import matplotlib.pyplot as plt
    import seaborn as sns
    print("✓ Utility libraries imported")
except ImportError as e:
    print(f"✗ Utility import error: {e}")

✓ Core libraries imported
✓ ML/NLP libraries imported
✓ Utility libraries imported


In [13]:
#System Resource Check

In [15]:
# Check system resources for optimal configuration
import psutil
import gc
from pathlib import Path
import shutil

def _pretty_gb(x): 
    return f"{x/(1024**3):.2f} GB"

def safe_disk_usage(path="."):
    """Robust disk usage that avoids psutil edge cases on Windows."""
    try:
        # Try psutil with the resolved path first
        return psutil.disk_usage(str(Path(path).resolve()))
    except Exception:
        # Fall back to drive root (Windows) or "/" (POSIX)
        anchor = Path(path).resolve().anchor or (Path(path).anchor or "/")
        try:
            return psutil.disk_usage(anchor)
        except Exception:
            # Final fallback: shutil (cross-platform)
            total, used, free = shutil.disk_usage(anchor or "/")
            # Mimic psutil result
            from collections import namedtuple
            DiskUsage = namedtuple("sdiskusage", ["total", "used", "free", "percent"])
            percent = (used / total * 100) if total else 0.0
            return DiskUsage(total, used, free, percent)

def check_system_resources():
    # Memory info
    memory = psutil.virtual_memory()
    print(f"Total RAM: {_pretty_gb(memory.total)}")
    print(f"Available RAM: {_pretty_gb(memory.available)}")
    print(f"RAM Usage: {memory.percent}%")
    
    # CPU info
    print(f"CPU Cores: {psutil.cpu_count(logical=False)} physical, {psutil.cpu_count(logical=True)} logical")
    
    # Disk space (robust)
    disk = safe_disk_usage(".")
    print(f"Disk Space: {_pretty_gb(disk.free)} free of {_pretty_gb(disk.total)} ({disk.percent:.1f}% used)")
    
    # GPU info
    try:
        import torch
        if torch.cuda.is_available():
            gpu_count = torch.cuda.device_count()
            print(f"GPUs available: {gpu_count}")
            for i in range(gpu_count):
                props = torch.cuda.get_device_properties(i)
                print(f"  GPU {i}: {props.name} ({_pretty_gb(props.total_memory)})")
        else:
            print("No GPU available")
    except Exception:
        print("PyTorch not available for GPU check")

check_system_resources()

# Recommend configuration based on resources
def recommend_config():
    memory_gb = psutil.virtual_memory().total / (1024**3)
    print("\n=== Recommended Configuration ===")
    if memory_gb >= 32:
        print("High-end setup: Use large embedding models and ChromaDB")
        return {"embedding_model": "all-mpnet-base-v2", "vector_db": "chromadb", "batch_size": 100}
    elif memory_gb >= 16:
        print("Mid-range setup: Use medium embedding models and FAISS")
        return {"embedding_model": "all-MiniLM-L6-v2", "vector_db": "faiss", "batch_size": 50}
    elif memory_gb >= 8:
        print("Budget setup: Use lightweight models and small batches")
        return {"embedding_model": "all-MiniLM-L6-v2", "vector_db": "faiss", "batch_size": 25}
    else:
        print("Low memory: Use very lightweight setup")
        return {"embedding_model": "paraphrase-MiniLM-L3-v2", "vector_db": "faiss", "batch_size": 10}

config = recommend_config()


Total RAM: 15.86 GB
Available RAM: 3.98 GB
RAM Usage: 74.9%
CPU Cores: 4 physical, 8 logical
Disk Space: 13.02 GB free of 236.95 GB (94.5% used)
No GPU available

=== Recommended Configuration ===
Budget setup: Use lightweight models and small batches


In [16]:
#Create Project Directory Structure

In [17]:
# Create organized directory structure for your RAG project
import os
from pathlib import Path

def create_project_structure():
    directories = [
        "data/raw",           # Your original text files and JSON
        "data/processed",     # Cleaned and chunked data
        "data/embeddings",    # Saved embeddings
        "models",            # Downloaded models
        "vector_db",         # Vector database files
        "outputs",           # Query results and logs
        "notebooks",         # Additional notebooks
        "utils",             # Helper functions
    ]
    
    for directory in directories:
        Path(directory).mkdir(parents=True, exist_ok=True)
        print(f"✓ Created directory: {directory}")
    
    # Create .gitignore for the project
    gitignore_content = """
# Models and embeddings (too large for git)
models/
data/embeddings/
vector_db/

# Jupyter notebook checkpoints
.ipynb_checkpoints/

# Python cache
__pycache__/
*.pyc

# Environment variables
.env

# Large data files
*.bin
*.gguf
*.model
"""
    
    with open('.gitignore', 'w') as f:
        f.write(gitignore_content)
    print("✓ Created .gitignore file")

create_project_structure()

✓ Created directory: data/raw
✓ Created directory: data/processed
✓ Created directory: data/embeddings
✓ Created directory: models
✓ Created directory: vector_db
✓ Created directory: outputs
✓ Created directory: notebooks
✓ Created directory: utils
✓ Created .gitignore file


In [18]:
#Setup Configuration File

In [19]:
# Create configuration file for your RAG system
import json

# Configuration based on your system capabilities
rag_config = {
    "embedding": {
        "model_name": config["embedding_model"],
        "device": "cuda" if torch.cuda.is_available() else "cpu",
        "batch_size": config["batch_size"],
        "max_length": 512
    },
    "chunking": {
        "chunk_size": 500,
        "overlap": 50,
        "min_chunk_size": 100
    },
    "vector_db": {
        "type": config["vector_db"],
        "index_type": "IVFFlat" if config["vector_db"] == "faiss" else None,
        "similarity_threshold": 0.7
    },
    "retrieval": {
        "top_k": 5,
        "rerank": True
    },
    "data": {
        "supported_formats": [".txt", ".json", ".md"],
        "encoding": "utf-8"
    }
}

# Save configuration
with open('rag_config.json', 'w') as f:
    json.dump(rag_config, f, indent=2)

print("Configuration saved to rag_config.json")
print("Current configuration:")
print(json.dumps(rag_config, indent=2))

Configuration saved to rag_config.json
Current configuration:
{
  "embedding": {
    "model_name": "all-MiniLM-L6-v2",
    "device": "cpu",
    "batch_size": 25,
    "max_length": 512
  },
  "chunking": {
    "chunk_size": 500,
    "overlap": 50,
    "min_chunk_size": 100
  },
  "vector_db": {
    "type": "faiss",
    "index_type": "IVFFlat",
    "similarity_threshold": 0.7
  },
  "retrieval": {
    "top_k": 5,
    "rerank": true
  },
  "data": {
    "supported_formats": [
      ".txt",
      ".json",
      ".md"
    ],
    "encoding": "utf-8"
  }
}


In [20]:
#Test Data Loading

In [21]:
# Test function to check if we can access your data files
def check_data_files(data_directory="data/raw"):
    """
    Check what data files are available for processing
    """
    data_path = Path(data_directory)
    
    if not data_path.exists():
        print(f"Data directory {data_directory} doesn't exist. Please upload your files there.")
        return None
    
    supported_extensions = ['.txt', '.json', '.md']
    files_found = []
    
    for ext in supported_extensions:
        files = list(data_path.glob(f"*{ext}"))
        files_found.extend(files)
        if files:
            print(f"Found {len(files)} {ext} files")
    
    if files_found:
        print(f"\nTotal files found: {len(files_found)}")
        print("Sample files:")
        for file in files_found[:5]:  # Show first 5 files
            print(f"  - {file.name}")
        if len(files_found) > 5:
            print(f"  ... and {len(files_found) - 5} more")
    else:
        print("No data files found. Please upload your text files and JSON files to data/raw/")
    
    return files_found

# Check for your data files
available_files = check_data_files()

No data files found. Please upload your text files and JSON files to data/raw/


In [22]:
#Environment Verification Summary

In [23]:
# Final verification that everything is set up correctly
def verify_environment():
    print("=== Environment Setup Verification ===")
    
    # Check imports
    try:
        from sentence_transformers import SentenceTransformer
        print("✓ SentenceTransformers ready")
    except:
        print("✗ SentenceTransformers not available")
    
    try:
        import faiss
        print("✓ FAISS ready")
    except:
        print("✗ FAISS not available")
    
    try:
        import chromadb
        print("✓ ChromaDB ready")
    except:
        print("✗ ChromaDB not available")
    
    # Check directories
    required_dirs = ["data/raw", "data/processed", "models", "vector_db"]
    for directory in required_dirs:
        if Path(directory).exists():
            print(f"✓ Directory {directory} exists")
        else:
            print(f"✗ Directory {directory} missing")
    
    # Check config file
    if Path('rag_config.json').exists():
        print("✓ Configuration file ready")
    else:
        print("✗ Configuration file missing")
    
    print("\n=== Next Steps ===")
    print("1. Upload your text files and JSON files to data/raw/")
    print("2. Ready to proceed to Step 2: Data Preprocessing")
    
    return True

verify_environment()

=== Environment Setup Verification ===
✓ SentenceTransformers ready
✓ FAISS ready
✓ ChromaDB ready
✓ Directory data/raw exists
✓ Directory data/processed exists
✓ Directory models exists
✓ Directory vector_db exists
✓ Configuration file ready

=== Next Steps ===
1. Upload your text files and JSON files to data/raw/
2. Ready to proceed to Step 2: Data Preprocessing


True

In [24]:
#Step 2: Data Preprocessing Pipeline - Complete Implementation

In [25]:
#Load Configuration and Setup

In [26]:
# Load configuration and set up preprocessing pipeline
import json
import pandas as pd
import numpy as np
from pathlib import Path
import re
from typing import List, Dict, Tuple, Optional
from tqdm.notebook import tqdm
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# Load configuration
with open('rag_config.json', 'r') as f:
    config = json.load(f)

print("Loaded configuration:")
print(f"Chunk size: {config['chunking']['chunk_size']}")
print(f"Overlap: {config['chunking']['overlap']}")
print(f"Supported formats: {config['data']['supported_formats']}")

# Global variables for processing
processed_chunks = []
metadata_store = {}

Loaded configuration:
Chunk size: 500
Overlap: 50
Supported formats: ['.txt', '.json', '.md']


In [27]:
#Data Loading Functions

In [28]:
def load_text_files(directory="data/raw") -> Dict[str, str]:
    """Load all text files from directory"""
    text_files = {}
    data_path = Path(directory)
    
    for file_path in data_path.glob("*.txt"):
        try:
            with open(file_path, 'r', encoding=config['data']['encoding']) as f:
                content = f.read()
                text_files[file_path.name] = content
                print(f"✓ Loaded {file_path.name}: {len(content)} characters")
        except Exception as e:
            print(f"✗ Error loading {file_path.name}: {e}")
    
    return text_files

def load_json_files(directory="data/raw") -> Dict[str, List[Dict]]:
    """Load and extract text from JSON files"""
    json_files = {}
    data_path = Path(directory)
    
    for file_path in data_path.glob("*.json"):
        try:
            with open(file_path, 'r', encoding=config['data']['encoding']) as f:
                data = json.load(f)
                json_files[file_path.name] = data
                print(f"✓ Loaded {file_path.name}: {type(data)} with {len(str(data))} characters")
        except Exception as e:
            print(f"✗ Error loading {file_path.name}: {e}")
    
    return json_files

def extract_text_from_json(json_data: Dict) -> str:
    """Extract text content from JSON structure"""
    text_content = []
    
    def extract_recursive(obj):
        if isinstance(obj, dict):
            for key, value in obj.items():
                # Skip purely numeric or metadata fields
                if isinstance(value, str) and len(value) > 10:
                    text_content.append(value)
                elif isinstance(value, (dict, list)):
                    extract_recursive(value)
        elif isinstance(obj, list):
            for item in obj:
                extract_recursive(item)
        elif isinstance(obj, str) and len(obj) > 10:
            text_content.append(obj)
    
    extract_recursive(json_data)
    return " ".join(text_content)

# Load all data files
print("Loading data files...")
text_data = load_text_files()
json_data = load_json_files()

# Extract text from JSON files
json_text_data = {}
for filename, data in json_data.items():
    extracted_text = extract_text_from_json(data)
    if extracted_text:
        json_text_data[filename] = extracted_text
        print(f"✓ Extracted {len(extracted_text)} characters from {filename}")

# Combine all text data
all_text_data = {**text_data, **json_text_data}
print(f"\nTotal files processed: {len(all_text_data)}")

Loading data files...
✓ Loaded EURUSD_BTMM_story.txt: 18406 characters
✓ Loaded EURUSD_SMC_story.txt: 19120 characters
✓ Loaded EURUSD_Wyckoff_story.txt: 21042 characters
✓ Loaded GBPUSD_BTMM_story.txt: 18338 characters
✓ Loaded GBPUSD_SMC_story.txt: 19061 characters
✓ Loaded GBPUSD_Wyckoff_story.txt: 21041 characters
✓ Loaded Symbol_1000_BTMM_story.txt: 18480 characters
✓ Loaded Symbol_1000_SMC_story.txt: 19199 characters
✓ Loaded Symbol_1000_Wyckoff_story.txt: 21134 characters
✓ Loaded Symbol_1001_BTMM_story.txt: 18613 characters
✓ Loaded Symbol_1001_SMC_story.txt: 19182 characters
✓ Loaded Symbol_1001_Wyckoff_story.txt: 20890 characters
✓ Loaded Symbol_1002_BTMM_story.txt: 18545 characters
✓ Loaded Symbol_1002_SMC_story.txt: 19212 characters
✓ Loaded Symbol_1002_Wyckoff_story.txt: 21222 characters
✓ Loaded Symbol_1003_BTMM_story.txt: 18561 characters
✓ Loaded Symbol_1003_SMC_story.txt: 19169 characters
✓ Loaded Symbol_1003_Wyckoff_story.txt: 21009 characters
✓ Loaded Symbol_1004_BTM

In [29]:
#Cleaning and Normalisation

In [30]:
def clean_financial_text(text: str) -> str:
    """Clean and normalize financial market analysis text"""
    
    # Remove excessive whitespace but preserve paragraph structure
    text = re.sub(r'\n\s*\n', '\n\n', text)  # Normalize paragraph breaks
    text = re.sub(r'[ \t]+', ' ', text)      # Multiple spaces to single space
    
    # Fix common formatting issues
    text = re.sub(r'\.(\w)', r'. \1', text)  # Add space after periods
    text = re.sub(r',(\w)', r', \1', text)   # Add space after commas
    text = re.sub(r';(\w)', r'; \1', text)   # Add space after semicolons
    
    # Clean up percentage formatting
    text = re.sub(r'\(\s*([+-]?\d+\.?\d*)\s*%\s*\)', r'(\1%)', text)
    
    # Normalize financial terms (keep consistent capitalization)
    financial_terms = {
        'smart money': 'smart money',
        'order flow': 'order flow',
        'institutional': 'institutional',
        'liquidity': 'liquidity',
        'consolidation': 'consolidation',
        'accumulation': 'accumulation',
        'distribution': 'distribution',
        'bullish': 'bullish',
        'bearish': 'bearish',
    }
    
    for term, normalized in financial_terms.items():
        text = re.sub(term, normalized, text, flags=re.IGNORECASE)
    
    # Remove extra whitespace
    text = text.strip()
    
    return text

def preprocess_all_text(text_data: Dict[str, str]) -> Dict[str, str]:
    """Apply cleaning to all loaded text"""
    cleaned_data = {}
    
    for filename, content in tqdm(text_data.items(), desc="Cleaning text files"):
        cleaned_content = clean_financial_text(content)
        cleaned_data[filename] = cleaned_content
        
        # Show cleaning stats
        original_length = len(content)
        cleaned_length = len(cleaned_content)
        print(f"{filename}: {original_length} → {cleaned_length} characters")
    
    return cleaned_data

# Clean all text data
cleaned_text_data = preprocess_all_text(all_text_data)

# Show sample of cleaned text
if cleaned_text_data:
    sample_filename = list(cleaned_text_data.keys())[0]
    sample_text = cleaned_text_data[sample_filename]
    print(f"\nSample cleaned text from {sample_filename}:")
    print(sample_text[:500] + "..." if len(sample_text) > 500 else sample_text)

Cleaning text files:   0%|          | 0/17352 [00:00<?, ?it/s]

EURUSD_BTMM_story.txt: 18406 → 18331 characters
EURUSD_SMC_story.txt: 19120 → 19045 characters
EURUSD_Wyckoff_story.txt: 21042 → 20967 characters
GBPUSD_BTMM_story.txt: 18338 → 18263 characters
GBPUSD_SMC_story.txt: 19061 → 18986 characters
GBPUSD_Wyckoff_story.txt: 21041 → 20966 characters
Symbol_1000_BTMM_story.txt: 18480 → 18405 characters
Symbol_1000_SMC_story.txt: 19199 → 19124 characters
Symbol_1000_Wyckoff_story.txt: 21134 → 21059 characters
Symbol_1001_BTMM_story.txt: 18613 → 18538 characters
Symbol_1001_SMC_story.txt: 19182 → 19107 characters
Symbol_1001_Wyckoff_story.txt: 20890 → 20815 characters
Symbol_1002_BTMM_story.txt: 18545 → 18470 characters
Symbol_1002_SMC_story.txt: 19212 → 19137 characters
Symbol_1002_Wyckoff_story.txt: 21222 → 21147 characters
Symbol_1003_BTMM_story.txt: 18561 → 18486 characters
Symbol_1003_SMC_story.txt: 19169 → 19094 characters
Symbol_1003_Wyckoff_story.txt: 21009 → 20934 characters
Symbol_1004_BTMM_story.txt: 18359 → 18284 characters
Symbol_1004

In [32]:
#Intelligent Chunking for Financial Data

In [37]:
#Installing Missing Dependency
!pip install --upgrade nltk


Defaulting to user installation because normal site-packages is not writeable


In [40]:
import nltk
nltk.download('punkt_tab')

def create_financial_chunks(text: str, chunk_size: int = 500, overlap: int = 50) -> List[Dict]:
    """
    Create intelligent chunks for financial market analysis text
    Preserves context around financial concepts
    """
    
    # Financial market indicators that should stay together
    financial_indicators = [
        'smart money', 'order flow', 'institutional', 'liquidity',
        'consolidation', 'accumulation', 'distribution', 
        'bullish move', 'bearish move', 'premium levels', 'discount levels',
        'order blocks', 'displacement', 'market selling', 'market orders'
    ]
    
    # Split into sentences first
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = ""
    current_length = 0
    
    for i, sentence in enumerate(sentences):
        sentence_length = len(sentence)
        
        # Check if adding this sentence would exceed chunk size
        if current_length + sentence_length > chunk_size and current_chunk:
            # Before finalizing chunk, check if we're breaking important context
            should_extend = False
            
            # Check if current sentence contains financial indicators
            for indicator in financial_indicators:
                if indicator in sentence.lower():
                    # Check if previous sentence also contains related terms
                    prev_sentence = sentences[i-1] if i > 0 else ""
                    if any(term in prev_sentence.lower() for term in financial_indicators):
                        should_extend = True
                        break
            
            if not should_extend or current_length + sentence_length > chunk_size * 1.2:
                # Finalize current chunk
                chunks.append({
                    'text': current_chunk.strip(),
                    'length': current_length,
                    'sentence_count': current_chunk.count('.'),
                    'start_sentence': max(0, i - current_chunk.count('.')),
                    'end_sentence': i - 1
                })
                
                # Start new chunk with overlap
                if overlap > 0 and chunks:
                    # Get last few sentences for overlap
                    overlap_text = ""
                    for j in range(max(0, len(sentences) - 3), i):
                        if j < len(sentences):
                            overlap_text += sentences[j] + " "
                    
                    if len(overlap_text) <= overlap:
                        current_chunk = overlap_text + sentence + " "
                        current_length = len(current_chunk)
                    else:
                        current_chunk = sentence + " "
                        current_length = sentence_length + 1
                else:
                    current_chunk = sentence + " "
                    current_length = sentence_length + 1
            else:
                current_chunk += sentence + " "
                current_length += sentence_length + 1
        else:
            current_chunk += sentence + " "
            current_length += sentence_length + 1
    
    # Add final chunk if there's remaining content
    if current_chunk.strip():
        chunks.append({
            'text': current_chunk.strip(),
            'length': len(current_chunk),
            'sentence_count': current_chunk.count('.'),
            'start_sentence': len(sentences) - current_chunk.count('.'),
            'end_sentence': len(sentences) - 1
        })
    
    return chunks

def chunk_all_documents(cleaned_data: Dict[str, str]) -> List[Dict]:
    """Apply chunking to all documents"""
    all_chunks = []
    
    for filename, content in tqdm(cleaned_data.items(), desc="Chunking documents"):
        if len(content) < config['chunking']['min_chunk_size']:
            print(f"Skipping {filename}: too short ({len(content)} chars)")
            continue
        
        chunks = create_financial_chunks(
            content,
            config['chunking']['chunk_size'],
            config['chunking']['overlap']
        )
        
        # Add metadata to each chunk
        for i, chunk in enumerate(chunks):
            chunk_with_metadata = {
                'chunk_id': f"{filename}_{i:03d}",
                'source_file': filename,
                'chunk_index': i,
                'text': chunk['text'],
                'length': chunk['length'],
                'sentence_count': chunk['sentence_count'],
                'financial_terms': extract_financial_terms(chunk['text'])
            }
            all_chunks.append(chunk_with_metadata)
        
        print(f"✓ {filename}: {len(chunks)} chunks created")
    
    return all_chunks

def extract_financial_terms(text: str) -> List[str]:
    """Extract relevant financial terms from text"""
    financial_vocabulary = [
        'smart money', 'order flow', 'institutional', 'liquidity',
        'consolidation', 'accumulation', 'distribution', 'displacement',
        'bullish', 'bearish', 'premium', 'discount', 'order blocks',
        'market selling', 'market orders', 'supply', 'demand'
    ]
    
    found_terms = []
    text_lower = text.lower()
    
    for term in financial_vocabulary:
        if term in text_lower:
            found_terms.append(term)
    
    return found_terms

# Create chunks for all documents
print("Creating intelligent chunks...")
document_chunks = chunk_all_documents(cleaned_text_data)

print(f"\nTotal chunks created: {len(document_chunks)}")

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...


Creating intelligent chunks...


[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


Chunking documents:   0%|          | 0/17352 [00:00<?, ?it/s]

✓ EURUSD_BTMM_story.txt: 37 chunks created
✓ EURUSD_SMC_story.txt: 37 chunks created
✓ EURUSD_Wyckoff_story.txt: 46 chunks created
✓ GBPUSD_BTMM_story.txt: 36 chunks created
✓ GBPUSD_SMC_story.txt: 37 chunks created
✓ GBPUSD_Wyckoff_story.txt: 47 chunks created
✓ Symbol_1000_BTMM_story.txt: 37 chunks created
✓ Symbol_1000_SMC_story.txt: 38 chunks created
✓ Symbol_1000_Wyckoff_story.txt: 47 chunks created
✓ Symbol_1001_BTMM_story.txt: 37 chunks created
✓ Symbol_1001_SMC_story.txt: 38 chunks created
✓ Symbol_1001_Wyckoff_story.txt: 46 chunks created
✓ Symbol_1002_BTMM_story.txt: 36 chunks created
✓ Symbol_1002_SMC_story.txt: 37 chunks created
✓ Symbol_1002_Wyckoff_story.txt: 47 chunks created
✓ Symbol_1003_BTMM_story.txt: 36 chunks created
✓ Symbol_1003_SMC_story.txt: 37 chunks created
✓ Symbol_1003_Wyckoff_story.txt: 46 chunks created
✓ Symbol_1004_BTMM_story.txt: 36 chunks created
✓ Symbol_1004_SMC_story.txt: 37 chunks created
✓ Symbol_1004_Wyckoff_story.txt: 47 chunks created
✓ Symbol

In [45]:
#Data Quality Analysis

In [46]:
def analyze_chunk_quality(chunks: List[Dict]) -> Dict:
    """Analyze the quality and characteristics of created chunks"""
    
    if not chunks:
        return {"error": "No chunks to analyze"}
    
    # Basic statistics
    lengths = [chunk['length'] for chunk in chunks]
    sentence_counts = [chunk['sentence_count'] for chunk in chunks]
    
    analysis = {
        'total_chunks': len(chunks),
        'avg_length': np.mean(lengths),
        'std_length': np.std(lengths),
        'min_length': min(lengths),
        'max_length': max(lengths),
        'avg_sentences': np.mean(sentence_counts),
        'total_text_length': sum(lengths)
    }
    
    # Financial terms analysis
    all_terms = []
    for chunk in chunks:
        all_terms.extend(chunk['financial_terms'])
    
    from collections import Counter
    term_counts = Counter(all_terms)
    analysis['top_financial_terms'] = term_counts.most_common(10)
    analysis['unique_terms'] = len(term_counts)
    
    # Source file distribution
    source_counts = Counter([chunk['source_file'] for chunk in chunks])
    analysis['chunks_per_file'] = dict(source_counts)
    
    return analysis

def display_chunk_analysis(analysis: Dict):
    """Display chunk analysis in readable format"""
    print("=== Chunk Quality Analysis ===")
    print(f"Total chunks: {analysis['total_chunks']:,}")
    print(f"Average chunk length: {analysis['avg_length']:.1f} ± {analysis['std_length']:.1f} chars")
    print(f"Length range: {analysis['min_length']} - {analysis['max_length']} chars")
    print(f"Average sentences per chunk: {analysis['avg_sentences']:.1f}")
    print(f"Total text processed: {analysis['total_text_length']:,} characters")
    
    print(f"\n=== Financial Terms Coverage ===")
    print(f"Unique financial terms found: {analysis['unique_terms']}")
    print("Most common terms:")
    for term, count in analysis['top_financial_terms']:
        print(f"  {term}: {count} chunks")
    
    print(f"\n=== Source File Distribution ===")
    for filename, count in analysis['chunks_per_file'].items():
        print(f"  {filename}: {count} chunks")

# Analyze chunk quality
chunk_analysis = analyze_chunk_quality(document_chunks)
display_chunk_analysis(chunk_analysis)

=== Chunk Quality Analysis ===
Total chunks: 695,377
Average chunk length: 485.9 ± 71.5 chars
Length range: 15 - 601 chars
Average sentences per chunk: 6.0
Total text processed: 337,904,189 characters

=== Financial Terms Coverage ===
Unique financial terms found: 14
Most common terms:
  liquidity: 422887 chunks
  accumulation: 317593 chunks
  consolidation: 244394 chunks
  premium: 153703 chunks
  bullish: 147361 chunks
  discount: 146987 chunks
  order blocks: 141427 chunks
  displacement: 131149 chunks
  institutional: 122892 chunks
  order flow: 16792 chunks

=== Source File Distribution ===
  EURUSD_BTMM_story.txt: 37 chunks
  EURUSD_SMC_story.txt: 37 chunks
  EURUSD_Wyckoff_story.txt: 46 chunks
  GBPUSD_BTMM_story.txt: 36 chunks
  GBPUSD_SMC_story.txt: 37 chunks
  GBPUSD_Wyckoff_story.txt: 47 chunks
  Symbol_1000_BTMM_story.txt: 37 chunks
  Symbol_1000_SMC_story.txt: 38 chunks
  Symbol_1000_Wyckoff_story.txt: 47 chunks
  Symbol_1001_BTMM_story.txt: 37 chunks
  Symbol_1001_SMC_sto

In [41]:
#Sample Chunks Preview

In [42]:
def preview_chunks(chunks: List[Dict], num_samples: int = 5):
    """Preview sample chunks to verify quality"""
    
    if not chunks:
        print("No chunks to preview")
        return
    
    print(f"=== Preview of {min(num_samples, len(chunks))} Sample Chunks ===")
    
    # Show variety: short, medium, long chunks
    sample_indices = []
    lengths = [chunk['length'] for chunk in chunks]
    
    # Get chunks of different lengths
    sorted_by_length = sorted(enumerate(chunks), key=lambda x: x[1]['length'])
    
    # Short chunk
    sample_indices.append(sorted_by_length[len(sorted_by_length)//4][0])
    # Medium chunk  
    sample_indices.append(sorted_by_length[len(sorted_by_length)//2][0])
    # Long chunk
    sample_indices.append(sorted_by_length[3*len(sorted_by_length)//4][0])
    
    # Add random samples
    import random
    random_indices = random.sample(range(len(chunks)), min(2, len(chunks)))
    sample_indices.extend(random_indices)
    
    # Remove duplicates and limit
    sample_indices = list(set(sample_indices))[:num_samples]
    
    for i, idx in enumerate(sample_indices, 1):
        chunk = chunks[idx]
        print(f"\n--- Sample {i}: {chunk['chunk_id']} ---")
        print(f"Source: {chunk['source_file']}")
        print(f"Length: {chunk['length']} chars, Sentences: {chunk['sentence_count']}")
        print(f"Financial terms: {', '.join(chunk['financial_terms']) if chunk['financial_terms'] else 'None'}")
        print(f"Text preview:")
        preview_text = chunk['text']
        if len(preview_text) > 300:
            print(f"{preview_text[:300]}...")
        else:
            print(preview_text)
        print("-" * 50)

# Preview sample chunks
preview_chunks(document_chunks)

=== Preview of 5 Sample Chunks ===

--- Sample 1: Symbol_1788_SMC_story.txt_010 ---
Source: Symbol_1788_SMC_story.txt
Length: 538 chars, Sentences: 7
Financial terms: liquidity, consolidation, discount, supply
Text preview:
Price action consolidated, building equal highs and lows for future liquidity runs. The sequence ended with a drop into discount levels. In response, Market structure showed supply zone respected with standard market flow. Price action consolidated, building equal highs and lows for future liquidity...
--------------------------------------------------

--- Sample 2: Symbol_4319_Wyckoff_story.txt_043 ---
Source: Symbol_4319_Wyckoff_story.txt
Length: 482 chars, Sentences: 5
Financial terms: consolidation, accumulation
Text preview:
Transitioning into, The composite operator's intentions revealed composite operator accumulating with average volume test. accumulation schematic developed with signs of strength emerging from the trading range. The phase completed with ma

In [47]:
#Save Processed data

In [48]:
def save_processed_chunks(chunks: List[Dict], output_path: str = "data/processed"):
    """Save processed chunks to files for next steps"""
    
    output_dir = Path(output_path)
    output_dir.mkdir(exist_ok=True)
    
    # Save as JSON for easy loading
    chunks_file = output_dir / "processed_chunks.json"
    with open(chunks_file, 'w', encoding='utf-8') as f:
        json.dump(chunks, f, indent=2, ensure_ascii=False)
    
    print(f"✓ Saved {len(chunks)} chunks to {chunks_file}")
    
    # Save as CSV for analysis
    csv_data = []
    for chunk in chunks:
        csv_data.append({
            'chunk_id': chunk['chunk_id'],
            'source_file': chunk['source_file'],
            'chunk_index': chunk['chunk_index'],
            'length': chunk['length'],
            'sentence_count': chunk['sentence_count'],
            'financial_terms_count': len(chunk['financial_terms']),
            'financial_terms': ', '.join(chunk['financial_terms']),
            'text_preview': chunk['text'][:100] + '...' if len(chunk['text']) > 100 else chunk['text']
        })
    
    df = pd.DataFrame(csv_data)
    csv_file = output_dir / "chunks_metadata.csv"
    df.to_csv(csv_file, index=False)
    print(f"✓ Saved metadata to {csv_file}")
    
    # Save processing summary
    summary = {
        'processing_date': pd.Timestamp.now().isoformat(),
        'total_chunks': len(chunks),
        'source_files': list(set([chunk['source_file'] for chunk in chunks])),
        'config_used': config,
        'quality_analysis': chunk_analysis
    }
    
    summary_file = output_dir / "processing_summary.json"
    with open(summary_file, 'w', encoding='utf-8') as f:
        json.dump(summary, f, indent=2, default=str)
    
    print(f"✓ Saved processing summary to {summary_file}")
    
    return chunks_file, csv_file, summary_file

# Save all processed data
if document_chunks:
    saved_files = save_processed_chunks(document_chunks)
    print(f"\nProcessed data saved successfully!")
    print(f"Files created:")
    for file_path in saved_files:
        print(f"  - {file_path}")
else:
    print("No chunks to save. Please check your input data.")

✓ Saved 695377 chunks to data\processed\processed_chunks.json
✓ Saved metadata to data\processed\chunks_metadata.csv
✓ Saved processing summary to data\processed\processing_summary.json

Processed data saved successfully!
Files created:
  - data\processed\processed_chunks.json
  - data\processed\chunks_metadata.csv
  - data\processed\processing_summary.json


In [49]:
#Data Validation and Quality Checks

In [50]:
def validate_processed_data(chunks: List[Dict]) -> bool:
    """Perform final validation on processed chunks"""
    
    print("=== Data Validation ===")
    validation_passed = True
    
    # Check 1: All chunks have required fields
    required_fields = ['chunk_id', 'source_file', 'text', 'length', 'financial_terms']
    for i, chunk in enumerate(chunks[:10]):  # Check first 10
        for field in required_fields:
            if field not in chunk:
                print(f"✗ Chunk {i} missing field: {field}")
                validation_passed = False
    
    if validation_passed:
        print("✓ All chunks have required fields")
    
    # Check 2: No empty chunks
    empty_chunks = [chunk for chunk in chunks if not chunk['text'].strip()]
    if empty_chunks:
        print(f"✗ Found {len(empty_chunks)} empty chunks")
        validation_passed = False
    else:
        print("✓ No empty chunks found")
    
    # Check 3: Reasonable length distribution
    lengths = [chunk['length'] for chunk in chunks]
    too_short = sum(1 for length in lengths if length < 50)
    too_long = sum(1 for length in lengths if length > 1000)
    
    if too_short > len(chunks) * 0.1:  # More than 10% too short
        print(f"⚠️  {too_short} chunks might be too short (< 50 chars)")
    
    if too_long > len(chunks) * 0.1:  # More than 10% too long
        print(f"⚠️  {too_long} chunks might be too long (> 1000 chars)")
    
    # Check 4: Financial terms coverage
    chunks_with_terms = sum(1 for chunk in chunks if chunk['financial_terms'])
    coverage = chunks_with_terms / len(chunks) * 100
    
    if coverage < 30:
        print(f"⚠️  Low financial terms coverage: {coverage:.1f}% of chunks")
    else:
        print(f"✓ Good financial terms coverage: {coverage:.1f}% of chunks")
    
    print(f"\nValidation {'PASSED' if validation_passed else 'FAILED'}")
    return validation_passed

# Validate the processed data
validation_result = validate_processed_data(document_chunks)

# Final summary
print("\n=== Step 2 Complete ===")
print(f"✓ Processed {len(all_text_data)} input files")
print(f"✓ Created {len(document_chunks)} text chunks")
print(f"✓ Saved processed data to data/processed/")
print(f"✓ Data validation: {'PASSED' if validation_result else 'NEEDS ATTENTION'}")
print("\nReady for Step 3: Local Embedding Generation")

=== Data Validation ===
✓ All chunks have required fields
✓ No empty chunks found
✓ Good financial terms coverage: 92.7% of chunks

Validation PASSED

=== Step 2 Complete ===
✓ Processed 17352 input files
✓ Created 695377 text chunks
✓ Saved processed data to data/processed/
✓ Data validation: PASSED

Ready for Step 3: Local Embedding Generation


In [51]:
#Step 3: Local Embedding Generation - Complete Implementation

In [52]:
#Load Configuration and Processed Data

In [53]:
# Load configuration and processed chunks from Step 2
import json
import numpy as np
import pandas as pd
from pathlib import Path
import pickle
from typing import List, Dict, Tuple, Optional
from tqdm.notebook import tqdm
import torch

# Load configuration
with open('rag_config.json', 'r') as f:
    config = json.load(f)

# Load processed chunks from Step 2
def load_processed_chunks(file_path="data/processed/processed_chunks.json"):
    """Load the chunks created in Step 2"""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            chunks = json.load(f)
        print(f"✓ Loaded {len(chunks)} processed chunks")
        return chunks
    except FileNotFoundError:
        print(f"✗ File not found: {file_path}")
        print("Please run Step 2 first to create processed chunks")
        return None
    except Exception as e:
        print(f"✗ Error loading chunks: {e}")
        return None

# Load chunks
document_chunks = load_processed_chunks()

if document_chunks:
    print(f"Sample chunk keys: {list(document_chunks[0].keys())}")
    print(f"Total text to embed: {sum(len(chunk['text']) for chunk in document_chunks):,} characters")
else:
    print("Cannot proceed without processed chunks. Please run Step 2 first.")

✓ Loaded 695377 processed chunks
Sample chunk keys: ['chunk_id', 'source_file', 'chunk_index', 'text', 'length', 'sentence_count', 'financial_terms']
Total text to embed: 337,208,812 characters


In [54]:
#Setup Embedding Model

In [58]:
#Missing dependency
!pip install huggingface_hub[hf_xet]

Defaulting to user installation because normal site-packages is not writeable
Collecting hf-xet<2.0.0,>=1.1.2 (from huggingface_hub[hf_xet])
  Downloading hf_xet-1.1.9-cp37-abi3-win_amd64.whl.metadata (4.7 kB)
Downloading hf_xet-1.1.9-cp37-abi3-win_amd64.whl (2.8 MB)
   ---------------------------------------- 0.0/2.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.8 MB ? eta -:--:--
   --- ------------------------------------ 0.3/2.8 MB ? eta -:--:--
   ------- -------------------------------- 0.5/2.8 MB 1.4 MB/s eta 0:00:02
   ------------------ --------------------- 1.3/2.8 MB 2.5 MB/s eta 0:00:01
   ------------------------------------- -- 2.6/2.8 MB 3.8 MB/s eta 0:00:01
   ---------------------------------------- 2.8/2.8 MB 3.4 MB/s eta 0:00:00
Installing collected packages: hf-xet
Successfully installed hf-xet-1.1.9


In [59]:
# Setup local embedding model using sentence-transformers
from sentence_transformers import SentenceTransformer
import torch

def setup_embedding_model(model_name=None):
    """Setup and test the embedding model"""
    
    # Use model from config if not specified
    if model_name is None:
        model_name = config['embedding']['model_name']
    
    print(f"Setting up embedding model: {model_name}")
    print(f"Device: {config['embedding']['device']}")
    
    # Check available device
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    if device != config['embedding']['device']:
        print(f"⚠️  Config says {config['embedding']['device']}, but using {device}")
        config['embedding']['device'] = device
    
    try:
        # Load the model
        model = SentenceTransformer(model_name, device=device)
        
        # Test the model with financial text
        test_text = "Smart money building liquidity for next move with bullish order flow"
        test_embedding = model.encode([test_text])
        
        print(f"✓ Model loaded successfully")
        print(f"✓ Embedding dimension: {test_embedding.shape[1]}")
        print(f"✓ Model max sequence length: {model.max_seq_length}")
        
        # Update config with actual embedding dimension
        config['embedding']['dimension'] = int(test_embedding.shape[1])
        
        return model
        
    except Exception as e:
        print(f"✗ Error loading model: {e}")
        print("Trying alternative model...")
        
        # Fallback to a smaller model
        fallback_model = "paraphrase-MiniLM-L3-v2"
        try:
            model = SentenceTransformer(fallback_model, device=device)
            test_embedding = model.encode([test_text])
            print(f"✓ Fallback model {fallback_model} loaded successfully")
            print(f"✓ Embedding dimension: {test_embedding.shape[1]}")
            config['embedding']['dimension'] = int(test_embedding.shape[1])
            return model
        except Exception as e2:
            print(f"✗ Fallback model also failed: {e2}")
            return None

# Setup embedding model
embedding_model = setup_embedding_model()

if embedding_model:
    print(f"\n=== Model Information ===")
    print(f"Model name: {embedding_model._modules['0'].auto_model.config.name_or_path}")
    print(f"Embedding dimension: {config['embedding']['dimension']}")
    print(f"Max sequence length: {embedding_model.max_seq_length}")
else:
    print("Cannot proceed without embedding model")

Setting up embedding model: all-MiniLM-L6-v2
Device: cpu
✓ Model loaded successfully
✓ Embedding dimension: 384
✓ Model max sequence length: 256

=== Model Information ===
Model name: sentence-transformers/all-MiniLM-L6-v2
Embedding dimension: 384
Max sequence length: 256


In [None]:
#Text Processing For Embeddings

In [60]:
def preprocess_for_embedding(text: str, max_length: int = 512) -> str:
    """
    Preprocess text specifically for embedding generation
    Focuses on preserving financial terminology and context
    """
    
    # Remove excessive whitespace
    text = " ".join(text.split())
    
    # Truncate if too long, but try to preserve complete sentences
    if len(text) > max_length:
        # Find the last complete sentence within the limit
        truncated = text[:max_length]
        last_period = truncated.rfind('.')
        last_semicolon = truncated.rfind(';')
        
        # Use the later sentence ending
        cut_point = max(last_period, last_semicolon)
        
        if cut_point > max_length * 0.7:  # If we found a good cut point
            text = text[:cut_point + 1]
        else:
            # Fallback: cut at max_length
            text = text[:max_length]
    
    return text.strip()

def prepare_texts_for_embedding(chunks: List[Dict]) -> List[Dict]:
    """Prepare all chunk texts for embedding generation"""
    
    prepared_chunks = []
    
    for chunk in tqdm(chunks, desc="Preparing texts"):
        original_text = chunk['text']
        processed_text = preprocess_for_embedding(
            original_text, 
            config['embedding']['max_length']
        )
        
        # Create prepared chunk with embedding-ready text
        prepared_chunk = chunk.copy()
        prepared_chunk['embedding_text'] = processed_text
        prepared_chunk['text_truncated'] = len(processed_text) < len(original_text)
        prepared_chunk['embedding_length'] = len(processed_text)
        
        prepared_chunks.append(prepared_chunk)
    
    # Statistics
    truncated_count = sum(1 for chunk in prepared_chunks if chunk['text_truncated'])
    avg_length = np.mean([chunk['embedding_length'] for chunk in prepared_chunks])
    
    print(f"✓ Prepared {len(prepared_chunks)} texts for embedding")
    print(f"✓ Average embedding text length: {avg_length:.1f} characters")
    print(f"✓ Truncated texts: {truncated_count} ({truncated_count/len(prepared_chunks)*100:.1f}%)")
    
    return prepared_chunks

# Prepare texts for embedding
if document_chunks and embedding_model:
    prepared_chunks = prepare_texts_for_embedding(document_chunks)
else:
    print("Skipping text preparation - missing chunks or model")

Preparing texts:   0%|          | 0/695377 [00:00<?, ?it/s]

✓ Prepared 695377 texts for embedding
✓ Average embedding text length: 455.1 characters
✓ Truncated texts: 239836 (34.5%)


In [61]:
#Batch Embedding Generation

In [None]:
def generate_embeddings_batch(chunks: List[Dict], model: SentenceTransformer, batch_size: int = None) -> Tuple[np.ndarray, List[str]]:
    """
    Generate embeddings in batches to manage memory efficiently
    """
    
    if batch_size is None:
        batch_size = config['embedding']['batch_size']
    
    print(f"Generating embeddings with batch size: {batch_size}")
    
    # Extract texts for embedding
    texts = [chunk['embedding_text'] for chunk in chunks]
    chunk_ids = [chunk['chunk_id'] for chunk in chunks]
    
    all_embeddings = []
    
    # Process in batches
    for i in tqdm(range(0, len(texts), batch_size), desc="Generating embeddings"):
        batch_texts = texts[i:i + batch_size]
        
        try:
            # Generate embeddings for this batch
            batch_embeddings = model.encode(
                batch_texts,
                convert_to_numpy=True,
                normalize_embeddings=True,  # L2 normalize for better similarity
                show_progress_bar=False
            )
            
            all_embeddings.append(batch_embeddings)
            
            # Memory management
            if i % (batch_size * 10) == 0 and i > 0:
                print(f"  Processed {i + batch_size}/{len(texts)} chunks")
                
        except RuntimeError as e:
            if "out of memory" in str(e).lower():
                print(f"⚠️  GPU memory error at batch {i//batch_size + 1}")
                print("Trying with smaller batch size...")
                
                # Reduce batch size and retry
                smaller_batch_size = max(1, batch_size // 2)
                batch_embeddings = []
                
                for j in range(0, len(batch_texts), smaller_batch_size):
                    mini_batch = batch_texts[j:j + smaller_batch_size]
                    mini_embeddings = model.encode(
                        mini_batch,
                        convert_to_numpy=True,
                        normalize_embeddings=True,
                        show_progress_bar=False
                    )
                    batch_embeddings.append(mini_embeddings)
                
                batch_embeddings = np.vstack(batch_embeddings)
                all_embeddings.append(batch_embeddings)
            else:
                raise e
    
    # Combine all batches
    final_embeddings = np.vstack(all_embeddings)
    
    print(f"✓ Generated embeddings shape: {final_embeddings.shape}")
    print(f"✓ Embedding dimension: {final_embeddings.shape[1]}")
    
    return final_embeddings, chunk_ids

# Generate embeddings
if prepared_chunks and embedding_model:
    print("Starting embedding generation...")
    embeddings, embedding_chunk_ids = generate_embeddings_batch(prepared_chunks, embedding_model)
    
    # Verify embeddings
    print(f"\n=== Embedding Verification ===")
    print(f"Embeddings shape: {embeddings.shape}")
    print(f"Number of chunks: {len(prepared_chunks)}")
    print(f"Embedding dimension: {embeddings.shape[1]}")
    print(f"Data type: {embeddings.dtype}")
    
    # Check for any invalid embeddings
    invalid_embeddings = np.isnan(embeddings).any(axis=1).sum()
    if invalid_embeddings > 0:
        print(f"⚠️  Found {invalid_embeddings} invalid embeddings")
    else:
        print("✓ All embeddings are valid")
        
else:
    print("Skipping embedding generation - missing prerequisites")

Starting embedding generation...
Generating embeddings with batch size: 25


Generating embeddings:   0%|          | 0/27816 [00:00<?, ?it/s]

  Processed 275/695377 chunks
  Processed 525/695377 chunks
  Processed 775/695377 chunks
  Processed 1025/695377 chunks
  Processed 1275/695377 chunks
  Processed 1525/695377 chunks
  Processed 1775/695377 chunks
  Processed 2025/695377 chunks
  Processed 2275/695377 chunks
  Processed 2525/695377 chunks
  Processed 2775/695377 chunks
  Processed 3025/695377 chunks
  Processed 3275/695377 chunks
  Processed 3525/695377 chunks
  Processed 3775/695377 chunks
  Processed 4025/695377 chunks
  Processed 4275/695377 chunks
  Processed 4525/695377 chunks
  Processed 4775/695377 chunks
  Processed 5025/695377 chunks
  Processed 5275/695377 chunks
  Processed 5525/695377 chunks
  Processed 5775/695377 chunks
  Processed 6025/695377 chunks
  Processed 6275/695377 chunks
  Processed 6525/695377 chunks
  Processed 6775/695377 chunks
  Processed 7025/695377 chunks
  Processed 7275/695377 chunks
  Processed 7525/695377 chunks
  Processed 7775/695377 chunks
  Processed 8025/695377 chunks
  Processed

In [None]:
#Embedding Quality Analysis

In [None]:
def analyze_embedding_quality(embeddings: np.ndarray, chunks: List[Dict]) -> Dict:
    """Analyze the quality and characteristics of generated embeddings"""
    
    from sklearn.metrics.pairwise import cosine_similarity
    from sklearn.decomposition import PCA
    import matplotlib.pyplot as plt
    import seaborn as sns
    
    analysis = {}
    
    # Basic statistics
    analysis['shape'] = embeddings.shape
    analysis['mean_norm'] = np.mean(np.linalg.norm(embeddings, axis=1))
    analysis['std_norm'] = np.std(np.linalg.norm(embeddings, axis=1))
    
    # Similarity analysis
    print("Computing pairwise similarities...")
    sample_size = min(100, embeddings.shape[0])  # Sample for efficiency
    sample_indices = np.random.choice(embeddings.shape[0], sample_size, replace=False)
    sample_embeddings = embeddings[sample_indices]
    
    similarity_matrix = cosine_similarity(sample_embeddings)
    
    # Remove diagonal (self-similarity)
    mask = np.eye(similarity_matrix.shape[0], dtype=bool)
    similarities = similarity_matrix[~mask]
    
    analysis['avg_similarity'] = np.mean(similarities)
    analysis['std_similarity'] = np.std(similarities)
    analysis['min_similarity'] = np.min(similarities)
    analysis['max_similarity'] = np.max(similarities)
    
    # Find most similar and dissimilar pairs
    idx = np.unravel_index(np.argmax(similarity_matrix * ~np.eye(sample_size, dtype=bool)), similarity_matrix.shape)
    analysis['most_similar_score'] = similarity_matrix[idx]
    analysis['most_similar_chunks'] = (sample_indices[idx[0]], sample_indices[idx[1]])
    
    # Cluster analysis using financial terms
    financial_chunks = [chunk for chunk in chunks if chunk['financial_terms']]
    if len(financial_chunks) > 10:
        analysis['financial_term_coverage'] = len(financial_chunks) / len(chunks)
    
    return analysis

def visualize_embeddings(embeddings: np.ndarray, chunks: List[Dict], sample_size: int = 500):
    """Create visualizations of the embedding space"""
    
    # Sample for visualization if too many
    if embeddings.shape[0] > sample_size:
        indices = np.random.choice(embeddings.shape[0], sample_size, replace=False)
        sample_embeddings = embeddings[indices]
        sample_chunks = [chunks[i] for i in indices]
    else:
        sample_embeddings = embeddings
        sample_chunks = chunks
    
    # Reduce dimensions using PCA
    print("Reducing dimensions for visualization...")
    pca = PCA(n_components=2, random_state=42)
    embeddings_2d = pca.fit_transform(sample_embeddings)
    
    # Create visualization
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    
    # Plot 1: Basic scatter plot
    axes[0, 0].scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], alpha=0.6, s=20)
    axes[0, 0].set_title('Embedding Space (2D PCA)')
    axes[0, 0].set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)')
    axes[0, 0].set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)')
    
    # Plot 2: Color by financial term count
    term_counts = [len(chunk['financial_terms']) for chunk in sample_chunks]
    scatter = axes[0, 1].scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], 
                               c=term_counts, cmap='viridis', alpha=0.7, s=20)
    axes[0, 1].set_title('Embeddings by Financial Term Count')
    axes[0, 1].set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)')
    axes[0, 1].set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)')
    plt.colorbar(scatter, ax=axes[0, 1])
    
    # Plot 3: Color by chunk length
    lengths = [chunk['length'] for chunk in sample_chunks]
    scatter = axes[1, 0].scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], 
                               c=lengths, cmap='plasma', alpha=0.7, s=20)
    axes[1, 0].set_title('Embeddings by Chunk Length')
    axes[1, 0].set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)')
    axes[1, 0].set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)')
    plt.colorbar(scatter, ax=axes[1, 0])
    
    # Plot 4: Color by source file
    source_files = [chunk['source_file'] for chunk in sample_chunks]
    unique_sources = list(set(source_files))
    source_colors = [unique_sources.index(source) for source in source_files]
    
    scatter = axes[1, 1].scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], 
                               c=source_colors, cmap='tab10', alpha=0.7, s=20)
    axes[1, 1].set_title('Embeddings by Source File')
    axes[1, 1].set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)')
    axes[1, 1].set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)')
    
    plt.tight_layout()
    plt.show()
    
    print(f"✓ Visualization complete for {len(sample_embeddings)} embeddings")
    print(f"✓ PCA explained variance: {pca.explained_variance_ratio_.sum():.1%}")

# Analyze embedding quality
if 'embeddings' in locals() and 'prepared_chunks' in locals():
    print("Analyzing embedding quality...")
    quality_analysis = analyze_embedding_quality(embeddings, prepared_chunks)
    
    print("\n=== Embedding Quality Analysis ===")
    print(f"Shape: {quality_analysis['shape']}")
    print(f"Average norm: {quality_analysis['mean_norm']:.3f} ± {quality_analysis['std_norm']:.3f}")
    print(f"Average similarity: {quality_analysis['avg_similarity']:.3f} ± {quality_analysis['std_similarity']:.3f}")
    print(f"Similarity range: {quality_analysis['min_similarity']:.3f} to {quality_analysis['max_similarity']:.3f}")
    
    if 'financial_term_coverage' in quality_analysis:
        print(f"Financial term coverage: {quality_analysis['financial_term_coverage']:.1%}")
    
    # Create visualizations
    visualize_embeddings(embeddings, prepared_chunks)
else:
    print("Skipping quality analysis - embeddings not generated yet")

In [None]:
#Embedding Similarity Testing

In [None]:
def test_embedding_similarity(embeddings: np.ndarray, chunks: List[Dict], embedding_model: SentenceTransformer):
    """Test embedding quality with financial market queries"""
    
    from sklearn.metrics.pairwise import cosine_similarity
    
    # Test queries related to your financial data
    test_queries = [
        "smart money accumulation at discount levels",
        "bullish order flow with demand overwhelming supply",
        "institutional distribution at premium levels",
        "bearish market selling with strong displacement",
        "consolidation phase with equal highs and lows",
        "liquidity building for next market move"
    ]
    
    print("=== Embedding Similarity Testing ===")
    
    for i, query in enumerate(test_queries, 1):
        print(f"\nTest {i}: '{query}'")
        
        # Generate embedding for query
        query_embedding = embedding_model.encode([query], normalize_embeddings=True)
        
        # Find most similar chunks
        similarities = cosine_similarity(query_embedding, embeddings)[0]
        top_indices = np.argsort(similarities)[-5:][::-1]  # Top 5 most similar
        
        print("Most similar chunks:")
        for j, idx in enumerate(top_indices):
            chunk = chunks[idx]
            score = similarities[idx]
            preview = chunk['text'][:100] + "..." if len(chunk['text']) > 100 else chunk['text']
            
            print(f"  {j+1}. Score: {score:.3f} | {chunk['chunk_id']}")
            print(f"     Terms: {', '.join(chunk['financial_terms']) if chunk['financial_terms'] else 'None'}")
            print(f"     Text: {preview}")
            print()

def find_similar_chunks(query_text: str, embeddings: np.ndarray, chunks: List[Dict], 
                       embedding_model: SentenceTransformer, top_k: int = 3) -> List[Dict]:
    """
    Find chunks most similar to a given query
    This is a preview of what your RAG system will do
    """
    
    # Generate query embedding
    query_embedding = embedding_model.encode([query_text], normalize_embeddings=True)
    
    # Calculate similarities
    similarities = cosine_similarity(query_embedding, embeddings)[0]
    
    # Get top k most similar
    top_indices = np.argsort(similarities)[-top_k:][::-1]
    
    results = []
    for idx in top_indices:
        result = {
            'chunk': chunks[idx],
            'similarity_score': float(similarities[idx]),
            'rank': len(results) + 1
        }
        results.append(result)
    
    return results

# Run similarity tests
if 'embeddings' in locals() and 'prepared_chunks' in locals() and embedding_model:
    test_embedding_similarity(embeddings, prepared_chunks, embedding_model)
    
    # Interactive similarity search
    print("\n=== Interactive Similarity Search ===")
    print("Try your own queries to test the embeddings:")
    
    # Example searches
    example_queries = [
        "What is smart money doing in the market?",
        "Describe bullish accumulation patterns",
        "How does institutional distribution work?"
    ]
    
    for query in example_queries:
        print(f"\nQuery: '{query}'")
        results = find_similar_chunks(query, embeddings, prepared_chunks, embedding_model)
        
        for result in results:
            chunk = result['chunk']
            score = result['similarity_score']
            print(f"  Rank {result['rank']}: {score:.3f} | {chunk['chunk_id']}")
            preview = chunk['text'][:150] + "..." if len(chunk['text']) > 150 else chunk['text']
            print(f"    {preview}")
        print("-" * 60)
else:
    print("Skipping similarity tests - missing prerequisites")

In [None]:
#Save Embeddings and Meta Data

In [None]:
def save_embeddings(embeddings: np.ndarray, chunks: List[Dict], chunk_ids: List[str], 
                   model_info: Dict, output_dir: str = "data/embeddings"):
    """Save embeddings and associated metadata"""
    
    output_path = Path(output_dir)
    output_path.mkdir(exist_ok=True)
    
    # Save embeddings as numpy array
    embeddings_file = output_path / "embeddings.npy"
    np.save(embeddings_file, embeddings)
    print(f"✓ Saved embeddings to {embeddings_file}")
    
    # Save chunk IDs
    chunk_ids_file = output_path / "chunk_ids.json"
    with open(chunk_ids_file, 'w') as f:
        json.dump(chunk_ids, f, indent=2)
    print(f"✓ Saved chunk IDs to {chunk_ids_file}")
    
    # Save chunks with embedding metadata
    chunks_with_embedding_info = []
    for i, chunk in enumerate(chunks):
        chunk_with_info = chunk.copy()
        chunk_with_info['embedding_index'] = i
        chunk_with_info['embedding_norm'] = float(np.linalg.norm(embeddings[i]))
        chunks_with_embedding_info.append(chunk_with_info)
    
    chunks_file = output_path / "chunks_with_embeddings.json"
    with open(chunks_file, 'w', encoding='utf-8') as f:
        json.dump(chunks_with_embedding_info, f, indent=2, ensure_ascii=False)
    print(f"✓ Saved enhanced chunks to {chunks_file}")
    
    # Save model and processing metadata
    metadata = {
        'model_name': embedding_model._modules['0'].auto_model.config.name_or_path,
        'embedding_dimension': int(embeddings.shape[1]),
        'num_chunks': int(embeddings.shape[0]),
        'creation_date': pd.Timestamp.now().isoformat(),
        'config_used': config,
        'quality_analysis': quality_analysis if 'quality_analysis' in locals() else None,
        'preprocessing_stats': {
            'total_chunks': len(chunks),
            'avg_embedding_length': np.mean([chunk.get('embedding_length', 0) for chunk in chunks]),
            'truncated_chunks': sum(1 for chunk in chunks if chunk.get('text_truncated', False))
        }
    }
    
    metadata_file = output_path / "embedding_metadata.json"
    with open(metadata_file, 'w') as f:
        json.dump(metadata, f, indent=2, default=str)
    print(f"✓ Saved metadata to {metadata_file}")
    
    # Create index mapping for quick lookup
    index_mapping = {chunk_id: i for i, chunk_id in enumerate(chunk_ids)}
    mapping_file = output_path / "index_mapping.json"
    with open(mapping_file, 'w') as f:
        json.dump(index_mapping, f, indent=2)
    print(f"✓ Saved index mapping to {mapping_file}")
    
    return embeddings_file, chunks_file, metadata_file

def create_embedding_summary():
    """Create a comprehensive summary of the embedding generation process"""
    
    summary = {
        'step_3_completion': {
            'status': 'completed',
            'timestamp': pd.Timestamp.now().isoformat()
        },
        'model_info': {
            'name': embedding_model._modules['0'].auto_model.config.name_or_path if embedding_model else 'N/A',
            'dimension': config.get('embedding', {}).get('dimension', 'N/A'),
            'device': config.get('embedding', {}).get('device', 'N/A')
        },
        'processing_stats': {
            'total_chunks_processed': len(prepared_chunks) if 'prepared_chunks' in locals() else 0,
            'embeddings_generated': embeddings.shape[0] if 'embeddings' in locals() else 0,
            'batch_size_used': config.get('embedding', {}).get('batch_size', 'N/A')
        },
        'quality_metrics': quality_analysis if 'quality_analysis' in locals() else {},
        'files_created': [
            'data/embeddings/embeddings.npy',
            'data/embeddings/chunk_ids.json',
            'data/embeddings/chunks_with_embeddings.json',
            'data/embeddings/embedding_metadata.json',
            'data/embeddings/index_mapping.json'
        ],
        'next_step': 'Step 4: Vector Database Setup'
    }
    
    summary_file = Path("data/embeddings") / "step3_summary.json"
    with open(summary_file, 'w') as f:
        json.dump(summary, f, indent=2, default=str)
    
    return summary

# Save all embedding data
if 'embeddings' in locals() and 'prepared_chunks' in locals():
    print("Saving embeddings and metadata...")
    
    model_info = {
        'name': embedding_model._modules['0'].auto_model.config.name_or_path,
        'dimension': embeddings.shape[1]
    }
    
    saved_files = save_embeddings(embeddings, prepared_chunks, embedding_chunk_ids, model_info)
    
    # Create summary
    summary = create_embedding_summary()
    print(f"\n✓ Step 3 summary saved")
    
    print(f"\n=== Embedding Generation Complete ===")
    print(f"✓ Generated {embeddings.shape[0]} embeddings")
    print(f"✓ Embedding dimension: {embeddings.shape[1]}")
    print(f"✓ Saved to data/embeddings/")
    print(f"✓ Ready for Step 4: Vector Database Setup")
    
else:
    print("Cannot save embeddings - generation incomplete")

In [None]:
#Load Embeddings for Future Use

In [None]:
def load_embeddings(embeddings_dir="data/embeddings"):
    """
    Utility function to load embeddings in future steps
    This will be used in Step 4 and beyond
    """
    
    embeddings_path = Path(embeddings_dir)
    
    if not embeddings_path.exists():
        print(f"Embeddings directory not found: {embeddings_dir}")
        return None, None, None
    
    try:
        # Load embeddings
        embeddings = np.load(embeddings_path / "embeddings.npy")
        
        # Load chunk IDs
        with open(embeddings_path / "chunk_ids.json", 'r') as f:
            chunk_ids = json.load(f)
        
        # Load chunks with embedding info
        with open(embeddings_path / "chunks_with_embeddings.json", 'r') as f:
            chunks = json.load(f)
        
        # Load metadata
        with open(embeddings_path / "embedding_metadata.json", 'r') as f:
            metadata = json.load(f)
        
        print(f"✓ Loaded {embeddings.shape[0]} embeddings")
        print(f"✓ Embedding dimension: {embeddings.shape[1]}")
        print(f"✓ Model used: {metadata.get('model_name', 'Unknown')}")
        
        return embeddings, chunks, metadata
        
    except Exception as e:
        print(f"Error loading embeddings: {e}")
        return None, None, None

# Test the loading function
print("Testing embedding loading function...")
test_embeddings, test_chunks, test_metadata = load_embeddings()

if test_embeddings is not None:
    print("✓ Embedding loading function works correctly")
else:
    print("✗ Embedding loading function failed - check if embeddings were saved correctly")

In [None]:
#Step 4: Vector Database Setup - Complete Implementation

In [None]:
#Load Configuration and Embeddings from Step 3

In [None]:
# Load configuration and embeddings generated in Step 3
import json
import numpy as np
import pandas as pd
from pathlib import Path
from typing import List, Dict, Tuple, Optional, Union
from tqdm.notebook import tqdm
import pickle

# Load configuration
with open('rag_config.json', 'r') as f:
    config = json.load(f)

def load_embeddings_and_chunks(embeddings_dir="data/embeddings"):
    """Load embeddings and chunks from Step 3"""
    
    embeddings_path = Path(embeddings_dir)
    
    if not embeddings_path.exists():
        print(f"✗ Embeddings directory not found: {embeddings_dir}")
        print("Please complete Step 3 first")
        return None, None, None, None
    
    try:
        # Load embeddings
        print("Loading embeddings...")
        embeddings = np.load(embeddings_path / "embeddings.npy")
        
        # Load chunk IDs
        with open(embeddings_path / "chunk_ids.json", 'r') as f:
            chunk_ids = json.load(f)
        
        # Load chunks with embedding info
        with open(embeddings_path / "chunks_with_embeddings.json", 'r', encoding='utf-8') as f:
            chunks = json.load(f)
        
        # Load metadata
        with open(embeddings_path / "embedding_metadata.json", 'r') as f:
            metadata = json.load(f)
        
        print(f"✓ Loaded {embeddings.shape[0]} embeddings")
        print(f"✓ Embedding dimension: {embeddings.shape[1]}")
        print(f"✓ Model used: {metadata.get('model_name', 'Unknown')}")
        print(f"✓ Creation date: {metadata.get('creation_date', 'Unknown')}")
        
        return embeddings, chunks, chunk_ids, metadata
        
    except Exception as e:
        print(f"✗ Error loading embeddings: {e}")
        return None, None, None, None

# Load embeddings and data
embeddings, chunks, chunk_ids, embedding_metadata = load_embeddings_and_chunks()

if embeddings is not None:
    print(f"\n=== Data Summary ===")
    print(f"Embeddings shape: {embeddings.shape}")
    print(f"Number of chunks: {len(chunks)}")
    print(f"Embedding dimension: {embeddings.shape[1]}")
    print(f"Vector database type: {config['vector_db']['type']}")
else:
    print("Cannot proceed without embeddings. Please complete Step 3 first.")

In [None]:
#Setup FAISS Vector Database

In [None]:
# Setup FAISS vector database (recommended for speed and efficiency)
try:
    import faiss
    FAISS_AVAILABLE = True
except ImportError:
    print("⚠️  FAISS not available. Install with: pip install faiss-cpu")
    FAISS_AVAILABLE = False

class FAISSSVectorDB:
    def __init__(self, embedding_dimension: int):
        self.dimension = embedding_dimension
        self.index = None
        self.chunk_ids = []
        self.chunks = []
        
    def create_index(self, index_type: str = "IVFFlat", nlist: int = None):
        """Create FAISS index"""
        
        if nlist is None:
            # Rule of thumb: sqrt(n) clusters, but at least 10
            nlist = max(10, int(np.sqrt(len(self.chunks))))
        
        print(f"Creating {index_type} index with {nlist} clusters...")
        
        if index_type == "IVFFlat":
            # Inverted file with flat (exhaustive) distances
            quantizer = faiss.IndexFlatIP(self.dimension)  # Inner Product for cosine similarity
            self.index = faiss.IndexIVFFlat(quantizer, self.dimension, nlist)
        elif index_type == "Flat":
            # Brute force exact search (slower but exact)
            self.index = faiss.IndexFlatIP(self.dimension)
        elif index_type == "HNSW":
            # Hierarchical Navigable Small World (good for high precision)
            self.index = faiss.IndexHNSWFlat(self.dimension, 32)
        else:
            raise ValueError(f"Unsupported index type: {index_type}")
        
        print(f"✓ Created {index_type} index")
        return self.index
    
    def train_and_add_vectors(self, embeddings: np.ndarray, chunk_ids: List[str], chunks: List[Dict]):
        """Train index (if needed) and add vectors"""
        
        print(f"Adding {len(embeddings)} vectors to index...")
        
        # Ensure embeddings are in correct format
        embeddings = embeddings.astype(np.float32)
        
        # Train index if it requires training
        if hasattr(self.index, 'is_trained') and not self.index.is_trained:
            print("Training index...")
            self.index.train(embeddings)
            print("✓ Index trained")
        
        # Add vectors to index
        self.index.add(embeddings)
        
        # Store metadata
        self.chunk_ids = chunk_ids
        self.chunks = chunks
        
        print(f"✓ Added vectors to index. Total: {self.index.ntotal}")
    
    def search(self, query_embedding: np.ndarray, k: int = 5, nprobe: int = None) -> List[Dict]:
        """Search for similar vectors"""
        
        if self.index is None:
            raise ValueError("Index not created. Call create_index() first.")
        
        # Set search parameters
        if nprobe is not None and hasattr(self.index, 'nprobe'):
            self.index.nprobe = nprobe
        elif hasattr(self.index, 'nprobe'):
            # Default nprobe (number of clusters to search)
            self.index.nprobe = min(10, max(1, self.index.nlist // 10))
        
        # Ensure query is in correct format
        query_embedding = query_embedding.astype(np.float32)
        if query_embedding.ndim == 1:
            query_embedding = query_embedding.reshape(1, -1)
        
        # Search
        similarities, indices = self.index.search(query_embedding, k)
        
        # Convert to results
        results = []
        for i, (similarity, idx) in enumerate(zip(similarities[0], indices[0])):
            if idx >= 0:  # Valid result
                results.append({
                    'chunk_id': self.chunk_ids[idx],
                    'chunk': self.chunks[idx],
                    'similarity_score': float(similarity),
                    'rank': i + 1
                })
        
        return results
    
    def save_index(self, file_path: str):
        """Save FAISS index to disk"""
        faiss.write_index(self.index, file_path)
        print(f"✓ Saved FAISS index to {file_path}")
    
    def load_index(self, file_path: str):
        """Load FAISS index from disk"""
        self.index = faiss.read_index(file_path)
        print(f"✓ Loaded FAISS index from {file_path}")

def setup_faiss_database(embeddings: np.ndarray, chunks: List[Dict], chunk_ids: List[str]):
    """Setup and populate FAISS vector database"""
    
    if not FAISS_AVAILABLE:
        print("FAISS not available")
        return None
    
    print("=== Setting up FAISS Vector Database ===")
    
    # Create FAISS database
    vector_db = FAISSSVectorDB(embeddings.shape[1])
    
    # Choose index type based on data size
    if len(embeddings) < 1000:
        index_type = "Flat"  # Exact search for small datasets
    elif len(embeddings) < 10000:
        index_type = "IVFFlat"  # Good balance
    else:
        index_type = "IVFFlat"  # Scalable for large datasets
    
    # Create and populate index
    vector_db.create_index(index_type)
    vector_db.train_and_add_vectors(embeddings, chunk_ids, chunks)
    
    return vector_db

# Setup FAISS if embeddings are available
if embeddings is not None and FAISS_AVAILABLE:
    faiss_db = setup_faiss_database(embeddings, chunks, chunk_ids)
else:
    faiss_db = None
    print("Skipping FAISS setup")

In [None]:
#Setup ChromaDBVector Database

In [None]:
# Setup ChromaDB vector database (alternative to FAISS, more features)
try:
    import chromadb
    from chromadb.config import Settings
    CHROMADB_AVAILABLE = True
except ImportError:
    print("⚠️  ChromaDB not available. Install with: pip install chromadb")
    CHROMADB_AVAILABLE = False

class ChromaVectorDB:
    def __init__(self, collection_name: str = "financial_analysis", persist_directory: str = "vector_db/chroma"):
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        
    def create_client(self):
        """Create ChromaDB client with persistence"""
        
        # Create directory for persistence
        Path(self.persist_directory).mkdir(parents=True, exist_ok=True)
        
        # Create client with persistent storage
        self.client = chromadb.PersistentClient(path=self.persist_directory)
        
        print(f"✓ Created ChromaDB client with persistence at {self.persist_directory}")
        return self.client
    
    def create_collection(self, embedding_function=None):
        """Create or get collection"""
        
        if self.client is None:
            self.create_client()
        
        # Try to get existing collection, create if doesn't exist
        try:
            self.collection = self.client.get_collection(
                name=self.collection_name,
                embedding_function=embedding_function
            )
            print(f"✓ Retrieved existing collection: {self.collection_name}")
        except:
            self.collection = self.client.create_collection(
                name=self.collection_name,
                embedding_function=embedding_function,
                metadata={"description": "Financial market analysis chunks"}
            )
            print(f"✓ Created new collection: {self.collection_name}")
        
        return self.collection
    
    def add_documents(self, embeddings: np.ndarray, chunks: List[Dict], chunk_ids: List[str]):
        """Add documents to ChromaDB collection"""
        
        if self.collection is None:
            self.create_collection()
        
        print(f"Adding {len(chunks)} documents to ChromaDB...")
        
        # Prepare documents for ChromaDB
        documents = []
        metadatas = []
        ids = []
        embeddings_list = []
        
        for i, (chunk, chunk_id, embedding) in enumerate(zip(chunks, chunk_ids, embeddings)):
            # Document text
            documents.append(chunk['text'])
            
            # Metadata
            metadata = {
                'source_file': chunk['source_file'],
                'chunk_index': chunk['chunk_index'],
                'length': chunk['length'],
                'financial_terms': ', '.join(chunk.get('financial_terms', [])),
                'sentence_count': chunk.get('sentence_count', 0)
            }
            metadatas.append(metadata)
            
            # IDs and embeddings
            ids.append(chunk_id)
            embeddings_list.append(embedding.tolist())
        
        # Add to collection in batches to avoid memory issues
        batch_size = 1000
        
        for i in tqdm(range(0, len(documents), batch_size), desc="Adding to ChromaDB"):
            end_idx = min(i + batch_size, len(documents))
            
            self.collection.add(
                documents=documents[i:end_idx],
                metadatas=metadatas[i:end_idx],
                ids=ids[i:end_idx],
                embeddings=embeddings_list[i:end_idx]
            )
        
        print(f"✓ Added {len(documents)} documents to ChromaDB")
        
        # Verify collection
        count = self.collection.count()
        print(f"✓ Collection now contains {count} documents")
    
    def search(self, query_embedding: np.ndarray, k: int = 5, 
               where: Dict = None, where_document: Dict = None) -> List[Dict]:
        """Search ChromaDB collection"""
        
        if self.collection is None:
            raise ValueError("Collection not created. Call create_collection() first.")
        
        # Convert query embedding to list
        if isinstance(query_embedding, np.ndarray):
            if query_embedding.ndim == 1:
                query_embeddings = [query_embedding.tolist()]
            else:
                query_embeddings = query_embedding.tolist()
        else:
            query_embeddings = [query_embedding]
        
        # Search
        results = self.collection.query(
            query_embeddings=query_embeddings,
            n_results=k,
            where=where,
            where_document=where_document,
            include=['documents', 'metadatas', 'distances']
        )
        
        # Convert to standardized format
        formatted_results = []
        
        for i, (doc, metadata, distance) in enumerate(zip(
            results['documents'][0], 
            results['metadatas'][0], 
            results['distances'][0]
        )):
            # Convert distance to similarity (ChromaDB returns distances, lower is better)
            similarity = 1.0 - distance  # Simple conversion for cosine distance
            
            formatted_results.append({
                'chunk_id': results['ids'][0][i],
                'chunk': {
                    'text': doc,
                    'source_file': metadata.get('source_file'),
                    'chunk_index': metadata.get('chunk_index'),
                    'length': metadata.get('length'),
                    'financial_terms': metadata.get('financial_terms', '').split(', ') if metadata.get('financial_terms') else [],
                    'sentence_count': metadata.get('sentence_count')
                },
                'similarity_score': float(similarity),
                'distance': float(distance),
                'rank': i + 1
            })
        
        return formatted_results
    
    def get_collection_stats(self):
        """Get statistics about the collection"""
        if self.collection is None:
            return None
        
        count = self.collection.count()
        
        # Sample some documents for analysis
        sample = self.collection.get(
            limit=min(100, count),
            include=['metadatas']
        )
        
        stats = {
            'total_documents': count,
            'collection_name': self.collection_name,
            'persist_directory': self.persist_directory
        }
        
        if sample['metadatas']:
            # Analyze source files
            source_files = [meta.get('source_file', 'Unknown') for meta in sample['metadatas']]
            from collections import Counter
            stats['source_file_distribution'] = dict(Counter(source_files))
        
        return stats

def setup_chromadb_database(embeddings: np.ndarray, chunks: List[Dict], chunk_ids: List[str]):
    """Setup and populate ChromaDB vector database"""
    
    if not CHROMADB_AVAILABLE:
        print("ChromaDB not available")
        return None
    
    print("=== Setting up ChromaDB Vector Database ===")
    
    # Create ChromaDB database
    vector_db = ChromaVectorDB()
    
    # Create collection and add documents
    vector_db.create_collection()
    vector_db.add_documents(embeddings, chunks, chunk_ids)
    
    # Get stats
    stats = vector_db.get_collection_stats()
    if stats:
        print(f"✓ Collection stats: {stats['total_documents']} documents")
        if 'source_file_distribution' in stats:
            print("Source file distribution:")
            for file, count in stats['source_file_distribution'].items():
                print(f"  {file}: {count} chunks")
    
    return vector_db

# Setup ChromaDB if embeddings are available
if embeddings is not None and CHROMADB_AVAILABLE:
    chroma_db = setup_chromadb_database(embeddings, chunks, chunk_ids)
else:
    chroma_db = None
    print("Skipping ChromaDB setup")

In [None]:
#Unified Vector Database Interface

In [None]:
# Create unified interface for both vector databases
class UnifiedVectorDB:
    def __init__(self, faiss_db=None, chroma_db=None, embedding_model=None):
        self.faiss_db = faiss_db
        self.chroma_db = chroma_db
        self.embedding_model = embedding_model
        self.preferred_db = None
        
        # Determine preferred database
        if faiss_db is not None:
            self.preferred_db = "faiss"
            print("✓ Using FAISS as primary vector database")
        elif chroma_db is not None:
            self.preferred_db = "chroma"
            print("✓ Using ChromaDB as primary vector database")
        else:
            print("⚠️  No vector database available")
    
    def encode_query(self, query_text: str) -> np.ndarray:
        """Encode query text to embedding"""
        if self.embedding_model is None:
            raise ValueError("No embedding model available. Please load embedding model.")
        
        # Generate query embedding (should match the embeddings used to create the database)
        query_embedding = self.embedding_model.encode([query_text], normalize_embeddings=True)
        return query_embedding[0]
    
    def search(self, query: Union[str, np.ndarray], k: int = 5, db_type: str = None) -> List[Dict]:
        """
        Search vector database with text query or embedding
        
        Args:
            query: Either text string or embedding vector
            k: Number of results to return
            db_type: Force specific database ('faiss' or 'chroma')
        """
        
        # Determine which database to use
        if db_type is None:
            db_type = self.preferred_db
        
        if db_type is None:
            raise ValueError("No vector database available")
        
        # Convert text query to embedding if needed
        if isinstance(query, str):
            if self.embedding_model is None:
                raise ValueError("Cannot encode text query - no embedding model loaded")
            query_embedding = self.encode_query(query)
        else:
            query_embedding = query
        
        # Search using specified database
        if db_type == "faiss" and self.faiss_db is not None:
            return self.faiss_db.search(query_embedding, k)
        elif db_type == "chroma" and self.chroma_db is not None:
            return self.chroma_db.search(query_embedding, k)
        else:
            raise ValueError(f"Database type '{db_type}' not available")
    
    def compare_databases(self, query: str, k: int = 3):
        """Compare results from both databases"""
        
        print(f"Comparing database results for: '{query}'")
        print("=" * 60)
        
        results = {}
        
        # Test FAISS
        if self.faiss_db is not None:
            try:
                faiss_results = self.search(query, k, "faiss")
                results['faiss'] = faiss_results
                print(f"\nFAISS Results:")
                for result in faiss_results:
                    print(f"  {result['rank']}. Score: {result['similarity_score']:.3f} | {result['chunk_id']}")
                    preview = result['chunk']['text'][:100] + "..."
                    print(f"     {preview}")
            except Exception as e:
                print(f"\nFAISS Error: {e}")
        
        # Test ChromaDB
        if self.chroma_db is not None:
            try:
                chroma_results = self.search(query, k, "chroma")
                results['chroma'] = chroma_results
                print(f"\nChromaDB Results:")
                for result in chroma_results:
                    print(f"  {result['rank']}. Score: {result['similarity_score']:.3f} | {result['chunk_id']}")
                    preview = result['chunk']['text'][:100] + "..."
                    print(f"     {preview}")
            except Exception as e:
                print(f"\nChromaDB Error: {e}")
        
        return results

# Load embedding model for query encoding
def load_embedding_model_for_queries():
    """Load the same embedding model used in Step 3"""
    from sentence_transformers import SentenceTransformer
    
    if embedding_metadata is not None:
        model_name = embedding_metadata.get('model_name')
        if model_name:
            try:
                print(f"Loading embedding model: {model_name}")
                model = SentenceTransformer(model_name)
                print("✓ Embedding model loaded for queries")
                return model
            except Exception as e:
                print(f"Error loading model {model_name}: {e}")
    
    # Fallback to config model
    try:
        model_name = config['embedding']['model_name']
        print(f"Loading fallback model: {model_name}")
        model = SentenceTransformer(model_name)
        print("✓ Fallback embedding model loaded")
        return model
    except Exception as e:
        print(f"Error loading fallback model: {e}")
        return None

# Setup unified interface
query_embedding_model = load_embedding_model_for_queries()

if (faiss_db is not None or chroma_db is not None) and query_embedding_model is not None:
    unified_db = UnifiedVectorDB(faiss_db, chroma_db, query_embedding_model)
    print("\n✓ Unified vector database interface ready")
else:
    unified_db = None
    print("\n⚠️  Cannot create unified interface - missing components")

In [None]:
#Test Vector Database with Financial Queries

In [None]:
def test_vector_database(vector_db: UnifiedVectorDB, test_queries: List[str] = None):
    """Test vector database with financial market queries"""
    
    if test_queries is None:
        test_queries = [
            "smart money accumulation at discount levels",
            "bullish order flow with strong demand",
            "institutional distribution at premium",
            "bearish market selling and displacement",
            "consolidation phase building liquidity",
            "equal highs and lows liquidity engineering",
            "order blocks waiting for displacement",
            "aggressive accumulation through market orders",
            "supply overwhelming demand bearish flow",
            "strong bullish move percentage gain"
        ]
    
    print("=== Vector Database Testing ===")
    print(f"Testing with {len(test_queries)} financial queries...")
    
    for i, query in enumerate(test_queries, 1):
        print(f"\n--- Test {i}: '{query}' ---")
        
        try:
            # Search using primary database
            results = vector_db.search(query, k=3)
            
            if results:
                print("Top 3 results:")
                for result in results:
                    chunk = result['chunk']
                    score = result['similarity_score']
                    
                    # Show preview
                    preview = chunk['text'][:120] + "..." if len(chunk['text']) > 120 else chunk['text']
                    
                    print(f"  {result['rank']}. Score: {score:.3f} | {result['chunk_id']}")
                    print(f"     Source: {chunk['source_file']}")
                    print(f"     Terms: {', '.join(chunk.get('financial_terms', [])) if chunk.get('financial_terms') else 'None'}")
                    print(f"     Text: {preview}")
                    print()
            else:
                print("  No results found")
                
        except Exception as e:
            print(f"  Error: {e}")
    
    print("Vector database testing complete!")

def interactive_query_interface(vector_db: UnifiedVectorDB):
    """Interactive interface for testing queries"""
    
    print("\n=== Interactive Query Interface ===")
    print("Enter financial market queries to test the vector database")
    print("Type 'quit' to exit")
    
    example_queries = [
        "What patterns indicate smart money accumulation?",
        "How does institutional distribution affect price?",
        "What is order flow analysis?",
        "Describe bullish market structure",
        "How to identify liquidity grabs?"
    ]
    
    print("\nExample queries you can try:")
    for i, example in enumerate(example_queries, 1):
        print(f"  {i}. {example}")
    
    while True:
        try:
            query = input("\nEnter your query (or 'quit'): ").strip()
            
            if query.lower() in ['quit', 'exit', 'q']:
                print("Goodbye!")
                break
            
            if not query:
                continue
            
            print(f"\nSearching for: '{query}'")
            print("-" * 50)
            
            # Search
            results = vector_db.search(query, k=5)
            
            if results:
                for result in results:
                    chunk = result['chunk']
                    score = result['similarity_score']
                    
                    print(f"\nRank {result['rank']} (Score: {score:.3f})")
                    print(f"Source: {chunk['source_file']} | ID: {result['chunk_id']}")
                    
                    if chunk.get('financial_terms'):
                        print(f"Financial terms: {', '.join(chunk['financial_terms'])}")
                    
                    # Show full text or preview
                    text = chunk['text']
                    if len(text) > 300:
                        print(f"Text: {text[:300]}...")
                        show_full = input("Show full text? (y/n): ").strip().lower()
                        if show_full == 'y':
                            print(f"\nFull text:\n{text}")
                    else:
                        print(f"Text: {text}")
                    
                    print("-" * 50)
            else:
                print("No results found for your query.")
                
        except KeyboardInterrupt:
            print("\nGoodbye!")
            break
        except Exception as e:
            print(f"Error: {e}")

# Run tests if vector database is available
if unified_db is not None:
    # Automated testing
    test_vector_database(unified_db)
    
    # Compare databases if both are available
    if unified_db.faiss_db is not None and unified_db.chroma_db is not None:
        print("\n=== Database Comparison ===")
        unified_db.compare_databases("smart money accumulation patterns")
    
    # Interactive interface option
    print("\n=== Ready for Interactive Testing ===")
    print("Uncomment the line below to start interactive query interface:")
    print("# interactive_query_interface(unified_db)")
    
    # Uncomment to run interactive interface
    # interactive_query_interface(unified_db)
    
else:
    print("Vector database testing skipped - no database available")

In [None]:
#Save Vector Database Configuration

In [None]:
def save_vector_database_config(output_dir="vector_db"):
    """Save vector database configuration and metadata"""
    
    output_path = Path(output_dir)
    output_path.mkdir(exist_ok=True)
    
    # Create comprehensive configuration
    vector_db_config = {
        'creation_date': pd.Timestamp.now().isoformat(),
        'source_embeddings': {
            'embeddings_file': 'data/embeddings/embeddings.npy',
            'chunks_file': 'data/embeddings/chunks_with_embeddings.json',
            'metadata_file': 'data/embeddings/embedding_metadata.json'
        },
        'databases': {},
        'query_model': {
            'model_name': embedding_metadata.get('model_name') if embedding_metadata else 'Unknown',
            'dimension': embedding_metadata.get('embedding_dimension') if embedding_metadata else 'Unknown'
        },
        'performance_stats': {}
    }
    
    # FAISS configuration
    if faiss_db is not None:
        faiss_config = {
            'available': True,
            'index_type': type(faiss_db.index).__name__ if hasattr(faiss_db, 'index') else 'Unknown',
            'total_vectors': faiss_db.index.ntotal if hasattr(faiss_db, 'index') else 0,
            'index_file': f'{output_dir}/faiss_index.bin',
            'metadata_file': f'{output_dir}/faiss_metadata.pkl'
        }
        
        # Save FAISS index
        try:
            faiss_db.save_index(str(output_path / "faiss_index.bin"))
            
            # Save FAISS metadata
            faiss_metadata = {
                'chunk_ids': faiss_db.chunk_ids,
                'dimension': faiss_db.dimension,
                'total_vectors': faiss_db.index.ntotal,
                'index_type': type(faiss_db.index).__name__
            }
            
            with open(output_path / "faiss_metadata.pkl", 'wb') as f:
                pickle.dump(faiss_metadata, f)
            
            print("✓ Saved FAISS index and metadata")
            
        except Exception as e:
            print(f"Error saving FAISS: {e}")
            faiss_config['available'] = False
        
        vector_db_config['databases']['faiss'] = faiss_config
    
    # ChromaDB configuration
    if chroma_db is not None:
        chroma_stats = chroma_db.get_collection_stats()
        chroma_config = {
            'available': True,
            'collection_name': chroma_db.collection_name,
            'persist_directory': chroma_db.persist_directory,
            'total_documents': chroma_stats['total_documents'] if chroma_stats else 0
        }
        vector_db_config['databases']['chromadb'] = chroma_config
        print("✓ ChromaDB configuration recorded (auto-persisted)")
    
    # Save main configuration
    config_file = output_path / "vector_db_config.json"
    with open(config_file, 'w') as f:
        json.dump(vector_db_config, f, indent=2, default=str)
    
    # Create loading instructions
    instructions = """
# Vector Database Loading Instructions

## To reload your vector databases in a new session:

### FAISS Database:
```python
import faiss
import pickle
import numpy as np

# Load FAISS index
faiss_index = faiss.read_index('vector_db/faiss_index.bin')

# Load metadata
with open('vector_db/faiss_metadata.pkl', 'rb') as f:
    faiss_metadata = pickle.load(f)

print(f"FAISS index loaded: {faiss_index.ntotal} vectors")

In [None]:
#ChromaDB Database

In [None]:
import chromadb

# ChromaDB automatically persists, just reconnect
client = chromadb.PersistentClient(path="vector_db/chroma")
collection = client.get_collection("financial_analysis")

print(f"ChromaDB loaded: {collection.count()} documents")

In [None]:
#Full Reload

In [None]:
# Run the loading function created in this notebook
vector_db = load_vector_database_complete()

In [None]:
# Complete Vector Database Reload Code - Copy this entire block

import json
import numpy as np
import pandas as pd
from pathlib import Path
from typing import List, Dict, Tuple, Optional, Union
import pickle

# Import vector database libraries
try:
    import faiss
    FAISS_AVAILABLE = True
except ImportError:
    FAISS_AVAILABLE = False
    print("FAISS not available")

try:
    import chromadb
    CHROMADB_AVAILABLE = True
except ImportError:
    CHROMADB_AVAILABLE = False
    print("ChromaDB not available")

try:
    from sentence_transformers import SentenceTransformer
    SENTENCE_TRANSFORMERS_AVAILABLE = True
except ImportError:
    SENTENCE_TRANSFORMERS_AVAILABLE = False
    print("SentenceTransformers not available")

# FAISS Vector Database Class
class FAISSSVectorDB:
    def __init__(self, embedding_dimension: int):
        self.dimension = embedding_dimension
        self.index = None
        self.chunk_ids = []
        self.chunks = []
        
    def search(self, query_embedding: np.ndarray, k: int = 5, nprobe: int = None) -> List[Dict]:
        """Search for similar vectors"""
        if self.index is None:
            raise ValueError("Index not created. Call create_index() first.")
        
        # Set search parameters
        if nprobe is not None and hasattr(self.index, 'nprobe'):
            self.index.nprobe = nprobe
        elif hasattr(self.index, 'nprobe'):
            self.index.nprobe = min(10, max(1, self.index.nlist // 10))
        
        # Ensure query is in correct format
        query_embedding = query_embedding.astype(np.float32)
        if query_embedding.ndim == 1:
            query_embedding = query_embedding.reshape(1, -1)
        
        # Search
        similarities, indices = self.index.search(query_embedding, k)
        
        # Convert to results
        results = []
        for i, (similarity, idx) in enumerate(zip(similarities[0], indices[0])):
            if idx >= 0:  # Valid result
                results.append({
                    'chunk_id': self.chunk_ids[idx],
                    'chunk': self.chunks[idx],
                    'similarity_score': float(similarity),
                    'rank': i + 1
                })
        
        return results
    
    def save_index(self, file_path: str):
        """Save FAISS index to disk"""
        if FAISS_AVAILABLE:
            faiss.write_index(self.index, file_path)
            print(f"✓ Saved FAISS index to {file_path}")
    
    def load_index(self, file_path: str):
        """Load FAISS index from disk"""
        if FAISS_AVAILABLE:
            self.index = faiss.read_index(file_path)
            print(f"✓ Loaded FAISS index from {file_path}")

# ChromaDB Vector Database Class
class ChromaVectorDB:
    def __init__(self, collection_name: str = "financial_analysis", persist_directory: str = "vector_db/chroma"):
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        
    def create_client(self):
        """Create ChromaDB client with persistence"""
        if not CHROMADB_AVAILABLE:
            return None
            
        Path(self.persist_directory).mkdir(parents=True, exist_ok=True)
        self.client = chromadb.PersistentClient(path=self.persist_directory)
        print(f"✓ Created ChromaDB client with persistence at {self.persist_directory}")
        return self.client
    
    def create_collection(self, embedding_function=None):
        """Create or get collection"""
        if self.client is None:
            self.create_client()
        
        try:
            self.collection = self.client.get_collection(
                name=self.collection_name,
                embedding_function=embedding_function
            )
            print(f"✓ Retrieved existing collection: {self.collection_name}")
        except:
            self.collection = self.client.create_collection(
                name=self.collection_name,
                embedding_function=embedding_function,
                metadata={"description": "Financial market analysis chunks"}
            )
            print(f"✓ Created new collection: {self.collection_name}")
        
        return self.collection
    
    def search(self, query_embedding: np.ndarray, k: int = 5, 
               where: Dict = None, where_document: Dict = None) -> List[Dict]:
        """Search ChromaDB collection"""
        if self.collection is None:
            raise ValueError("Collection not created. Call create_collection() first.")
        
        # Convert query embedding to list
        if isinstance(query_embedding, np.ndarray):
            if query_embedding.ndim == 1:
                query_embeddings = [query_embedding.tolist()]
            else:
                query_embeddings = query_embedding.tolist()
        else:
            query_embeddings = [query_embedding]
        
        # Search
        results = self.collection.query(
            query_embeddings=query_embeddings,
            n_results=k,
            where=where,
            where_document=where_document,
            include=['documents', 'metadatas', 'distances']
        )
        
        # Convert to standardized format
        formatted_results = []
        
        for i, (doc, metadata, distance) in enumerate(zip(
            results['documents'][0], 
            results['metadatas'][0], 
            results['distances'][0]
        )):
            similarity = 1.0 - distance
            
            formatted_results.append({
                'chunk_id': results['ids'][0][i],
                'chunk': {
                    'text': doc,
                    'source_file': metadata.get('source_file'),
                    'chunk_index': metadata.get('chunk_index'),
                    'length': metadata.get('length'),
                    'financial_terms': metadata.get('financial_terms', '').split(', ') if metadata.get('financial_terms') else [],
                    'sentence_count': metadata.get('sentence_count')
                },
                'similarity_score': float(similarity),
                'distance': float(distance),
                'rank': i + 1
            })
        
        return formatted_results
    
    def get_collection_stats(self):
        """Get statistics about the collection"""
        if self.collection is None:
            return None
        
        count = self.collection.count()
        return {
            'total_documents': count,
            'collection_name': self.collection_name,
            'persist_directory': self.persist_directory
        }

# Unified Vector Database Interface Class
class UnifiedVectorDB:
    def __init__(self, faiss_db=None, chroma_db=None, embedding_model=None):
        self.faiss_db = faiss_db
        self.chroma_db = chroma_db
        self.embedding_model = embedding_model
        self.preferred_db = None
        
        if faiss_db is not None:
            self.preferred_db = "faiss"
            print("✓ Using FAISS as primary vector database")
        elif chroma_db is not None:
            self.preferred_db = "chroma"
            print("✓ Using ChromaDB as primary vector database")
        else:
            print("⚠️  No vector database available")
    
    def encode_query(self, query_text: str) -> np.ndarray:
        """Encode query text to embedding"""
        if self.embedding_model is None:
            raise ValueError("No embedding model available. Please load embedding model.")
        
        query_embedding = self.embedding_model.encode([query_text], normalize_embeddings=True)
        return query_embedding[0]
    
    def search(self, query: Union[str, np.ndarray], k: int = 5, db_type: str = None) -> List[Dict]:
        """Search vector database with text query or embedding"""
        
        if db_type is None:
            db_type = self.preferred_db
        
        if db_type is None:
            raise ValueError("No vector database available")
        
        # Convert text query to embedding if needed
        if isinstance(query, str):
            if self.embedding_model is None:
                raise ValueError("Cannot encode text query - no embedding model loaded")
            query_embedding = self.encode_query(query)
        else:
            query_embedding = query
        
        # Search using specified database
        if db_type == "faiss" and self.faiss_db is not None:
            return self.faiss_db.search(query_embedding, k)
        elif db_type == "chroma" and self.chroma_db is not None:
            return self.chroma_db.search(query_embedding, k)
        else:
            raise ValueError(f"Database type '{db_type}' not available")

# Complete Reload Function
def load_vector_database_complete():
    """
    Complete function to reload vector databases and create unified interface
    Returns a UnifiedVectorDB object ready for querying
    """
    
    print("=== Loading Vector Database System ===")
    
    # Check if required files exist
    required_files = [
        'vector_db/vector_db_config.json',
        'data/embeddings/chunks_with_embeddings.json'
    ]
    
    for file_path in required_files:
        if not Path(file_path).exists():
            print(f"✗ Required file missing: {file_path}")
            print("Please run Steps 1-4 first to create the vector database")
            return None
    
    # Load configuration
    try:
        with open('vector_db/vector_db_config.json', 'r') as f:
            config = json.load(f)
        print("✓ Loaded vector database configuration")
    except Exception as e:
        print(f"✗ Error loading configuration: {e}")
        return None
    
    # Load embedding model
    embedding_model = None
    if SENTENCE_TRANSFORMERS_AVAILABLE:
        try:
            model_name = config['query_model']['model_name']
            print(f"Loading embedding model: {model_name}")
            embedding_model = SentenceTransformer(model_name)
            print("✓ Embedding model loaded")
        except Exception as e:
            print(f"⚠️  Error loading embedding model: {e}")
            print("Queries will not work without embedding model")
    else:
        print("⚠️  SentenceTransformers not available")
    
    # Load chunks
    try:
        with open('data/embeddings/chunks_with_embeddings.json', 'r', encoding='utf-8') as f:
            chunks = json.load(f)
        print(f"✓ Loaded {len(chunks)} chunks")
    except Exception as e:
        print(f"✗ Error loading chunks: {e}")
        return None
    
    # Load FAISS database if available
    faiss_db = None
    if FAISS_AVAILABLE and config['databases'].get('faiss', {}).get('available'):
        try:
            print("Loading FAISS database...")
            
            # Load index
            index = faiss.read_index('vector_db/faiss_index.bin')
            
            # Load metadata
            with open('vector_db/faiss_metadata.pkl', 'rb') as f:
                metadata = pickle.load(f)
            
            # Recreate FAISS wrapper
            faiss_db = FAISSSVectorDB(metadata['dimension'])
            faiss_db.index = index
            faiss_db.chunk_ids = metadata['chunk_ids']
            faiss_db.chunks = chunks
            
            print(f"✓ FAISS database loaded: {faiss_db.index.ntotal} vectors")
            
        except Exception as e:
            print(f"⚠️  Error loading FAISS: {e}")
            faiss_db = None
    else:
        if not FAISS_AVAILABLE:
            print("⚠️  FAISS not available (not installed)")
        else:
            print("⚠️  FAISS database not found in configuration")
    
    # Load ChromaDB database if available
    chroma_db = None
    if CHROMADB_AVAILABLE and config['databases'].get('chromadb', {}).get('available'):
        try:
            print("Loading ChromaDB database...")
            
            chroma_config = config['databases']['chromadb']
            chroma_db = ChromaVectorDB(
                collection_name=chroma_config['collection_name'],
                persist_directory=chroma_config['persist_directory']
            )
            chroma_db.create_client()
            chroma_db.create_collection()
            
            stats = chroma_db.get_collection_stats()
            print(f"✓ ChromaDB database loaded: {stats['total_documents']} documents")
            
        except Exception as e:
            print(f"⚠️  Error loading ChromaDB: {e}")
            chroma_db = None
    else:
        if not CHROMADB_AVAILABLE:
            print("⚠️  ChromaDB not available (not installed)")
        else:
            print("⚠️  ChromaDB database not found in configuration")
    
    # Create unified interface
    if faiss_db is not None or chroma_db is not None:
        unified_db = UnifiedVectorDB(faiss_db, chroma_db, embedding_model)
        print("\n✓ Vector database system loaded successfully!")
        
        # Test with a simple query
        if embedding_model is not None:
            try:
                test_results = unified_db.search("smart money accumulation", k=1)
                if test_results:
                    print(f"✓ Test query successful: found {len(test_results)} results")
                else:
                    print("⚠️  Test query returned no results")
            except Exception as e:
                print(f"⚠️  Test query failed: {e}")
        
        return unified_db
    else:
        print("✗ No vector databases could be loaded")
        return None

# Quick test function
def test_loaded_database(unified_db):
    """Quick test of the loaded database"""
    if unified_db is None:
        print("No database to test")
        return
    
    test_queries = [
        "smart money accumulation patterns",
        "bullish order flow analysis",
        "institutional distribution signals"
    ]
    
    print("\n=== Testing Loaded Database ===")
    
    for query in test_queries:
        try:
            results = unified_db.search(query, k=2)
            print(f"\nQuery: '{query}'")
            if results:
                for result in results:
                    score = result['similarity_score']
                    chunk_id = result['chunk_id']
                    preview = result['chunk']['text'][:100] + "..."
                    print(f"  {score:.3f} | {chunk_id} | {preview}")
            else:
                print("  No results found")
        except Exception as e:
            print(f"  Error: {e}")

# Usage Instructions
print("""
=== USAGE INSTRUCTIONS ===

1. Run this entire code block to define all classes and functions

2. Load your vector database:
   vector_db = load_vector_database_complete()

3. Test the loaded database:
   test_loaded_database(vector_db)

4. Use for queries:
   results = vector_db.search("your financial query here", k=5)

5. Interactive testing:
   while True:
       query = input("Enter query (or 'quit'): ")
       if query.lower() == 'quit': break
       results = vector_db.search(query, k=3)
       for r in results:
           print(f"{r['similarity_score']:.3f}: {r['chunk']['text'][:200]}...")
""")

In [None]:
vector_db = load_vector_database_complete()

In [None]:
test_loaded_database(vector_db)

In [None]:
#Step 5: RAG Pipeline Implementation - Complete Implementation

In [None]:
#Load All Components and Setup