NexusFlow: Complete Tutorial & Demo
Multi-Transformer Framework for Tabular Ecosystems

In [1]:
"""
This notebook provides a comprehensive walkthrough of NexusFlow, demonstrating:
1. Project initialization and setup
2. Data preparation and configuration
3. Model training with both synthetic and real data
4. Model evaluation and inference
5. Advanced features and introspection

NexusFlow is inspired by AlphaFold 2's collaborative intelligence approach,
using multiple specialized transformers that communicate via cross-attention.
"""

"\nThis notebook provides a comprehensive walkthrough of NexusFlow, demonstrating:\n1. Project initialization and setup\n2. Data preparation and configuration\n3. Model training with both synthetic and real data\n4. Model evaluation and inference\n5. Advanced features and introspection\n\nNexusFlow is inspired by AlphaFold 2's collaborative intelligence approach,\nusing multiple specialized transformers that communicate via cross-attention.\n"

In [2]:
import sys
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML, Markdown
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Setup styling
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

In [4]:
print("🔗 NexusFlow - Multi-Transformer Framework for Tabular Data")
print("=" * 60)

🔗 NexusFlow - Multi-Transformer Framework for Tabular Data


# 🚀 Section 1: Environment Setup & Installation

In [5]:
# First, let's verify our NexusFlow installation and setup
try:
    # Add the src directory to path if running from project root
    if 'src' not in sys.path:
        src_path = Path.cwd() / 'src'
        if src_path.exists():
            sys.path.insert(0, str(src_path))
        else:
            print("⚠️  Running from non-project directory. Ensure nexusflow is installed.")
    
    # Import core NexusFlow modules
    from src.nexusflow import __version__
    from src.nexusflow.config import load_config_from_file, ConfigModel
    from src.nexusflow.project_manager import ProjectManager
    from src.nexusflow.trainer.trainer import Trainer
    from src.nexusflow.api.model_api import load_model, ModelAPI
    from src.nexusflow.data.ingestion import load_datasets, align_datasets
    
    print(f"✅ NexusFlow v{__version__} loaded successfully!")
except ImportError as e:
    print(f"❌ Import error: {e}")
    print("💡 Make sure you're running from the project root or have installed nexusflow")
    print("   Try: pip install -e .")

✅ NexusFlow v0.1.0 loaded successfully!


# 🏗️ Section 2: Project Initialization

In [6]:
def demo_project_initialization():
    """Demonstrate NexusFlow project structure creation."""
    print("\n" + "="*60)
    print("🏗️  PROJECT INITIALIZATION DEMO")
    print("="*60)
    
    # Initialize project manager
    pm = ProjectManager()
    
    # Create a demo project (use force to overwrite if exists)
    project_name = "nexusflow_demo"
    
    try:
        project_path = pm.init_project(project_name, force=True)
        print(f"✅ Demo project created at: {project_path}")
        
        # Show the created directory structure
        demo_path = Path(project_name)
        if demo_path.exists():
            print("\n📁 Project Structure:")
            for item in sorted(demo_path.rglob("*")):
                if item.is_dir():
                    level = len(item.relative_to(demo_path).parts)
                    indent = "  " * (level - 1)
                    print(f"{indent}📁 {item.name}/")
                elif item.name.endswith(('.yaml', '.yml', '.md', '.txt')):
                    level = len(item.relative_to(demo_path).parts)
                    indent = "  " * (level - 1)
                    print(f"{indent}📄 {item.name}")
        
        return demo_path
    except Exception as e:
        print(f"❌ Project creation failed: {e}")
        return None

In [7]:
# Run the demo
demo_project = demo_project_initialization()

[32m2025-08-25 03:29:32.172[0m | [34m[1mDEBUG   [0m | [36msrc.nexusflow.project_manager[0m:[36minit_project[0m:[36m32[0m - [34m[1mDirectory created: C:\Users\arkch\OneDrive\Documents\VS Code Workspace\python_files\ML_projects\Evoformer\nexusflow_demo\configs[0m
[32m2025-08-25 03:29:32.175[0m | [34m[1mDEBUG   [0m | [36msrc.nexusflow.project_manager[0m:[36minit_project[0m:[36m32[0m - [34m[1mDirectory created: C:\Users\arkch\OneDrive\Documents\VS Code Workspace\python_files\ML_projects\Evoformer\nexusflow_demo\datasets[0m
[32m2025-08-25 03:29:32.181[0m | [34m[1mDEBUG   [0m | [36msrc.nexusflow.project_manager[0m:[36minit_project[0m:[36m32[0m - [34m[1mDirectory created: C:\Users\arkch\OneDrive\Documents\VS Code Workspace\python_files\ML_projects\Evoformer\nexusflow_demo\models[0m
[32m2025-08-25 03:29:32.187[0m | [34m[1mDEBUG   [0m | [36msrc.nexusflow.project_manager[0m:[36minit_project[0m:[36m32[0m - [34m[1mDirectory created: C:\Users\a


🏗️  PROJECT INITIALIZATION DEMO
✅ Demo project created at: C:\Users\arkch\OneDrive\Documents\VS Code Workspace\python_files\ML_projects\Evoformer\nexusflow_demo

📁 Project Structure:
📁 configs/
📁 datasets/
📁 models/
📁 notebooks/
📁 results/
📁 src/


# 📊 Section 3: Synthetic Data Training Demo

In [8]:
def run_synthetic_training_demo():
    """Demonstrate training with synthetic data."""
    print("\n" + "="*60)
    print("📊 SYNTHETIC DATA TRAINING DEMO")
    print("="*60)
    
    # Create a synthetic data configuration
    synthetic_config = {
        'project_name': 'synthetic_demo',
        'primary_key': 'id',
        'target': {
            'target_table': 'table_a.csv',
            'target_column': 'label'
        },
        'architecture': {
            'refinement_iterations': 2,
            'global_embed_dim': 32
        },
        'datasets': [
            {'name': 'table_a.csv', 'transformer_type': 'standard', 'complexity': 'small'},
            {'name': 'table_b.csv', 'transformer_type': 'standard', 'complexity': 'small'}
        ],
        'training': {
            'use_synthetic': True,
            'synthetic': {
                'n_samples': 512,
                'feature_dim': 8
            },
            'batch_size': 32,
            'epochs': 3,
            'optimizer': {'name': 'adam', 'lr': 1e-3},
            'split_config': {
                'test_size': 0.2,
                'validation_size': 0.2,
                'randomize': True
            }
        },
        'mlops': {
            'logging_provider': 'stdout',
            'experiment_name': 'synthetic_demo'
        }
    }
    
    # Create config object
    config = ConfigModel.model_validate(synthetic_config)
    print("✅ Configuration created for synthetic data training")
    
    # Initialize trainer
    try:
        trainer = Trainer(config, work_dir=".")
        print("✅ Trainer initialized")
        
        # Run sanity check
        print("\n🔍 Running sanity check...")
        trainer.sanity_check()
        print("✅ Sanity check passed")
        
        # Train the model
        print("\n🚀 Starting training...")
        trainer.train()
        print("✅ Training completed!")
        
        # Evaluate the model
        print("\n📈 Evaluating model...")
        metrics = trainer.evaluate()
        
        if metrics:
            print("\n📊 Final Evaluation Results:")
            for metric, value in metrics.items():
                print(f"   {metric}: {value:.4f}")
        
        return config, trainer
        
    except Exception as e:
        print(f"❌ Training failed: {e}")
        return None, None

In [9]:
# Run synthetic training demo
synthetic_config, synthetic_trainer = run_synthetic_training_demo()

[32m2025-08-25 03:32:40.500[0m | [1mINFO    [0m | [36msrc.nexusflow.trainer.trainer[0m:[36m_setup_file_logging[0m:[36m171[0m - [1mFile logging enabled: results\logs\training_synthetic_demo.log[0m
[32m2025-08-25 03:32:40.506[0m | [1mINFO    [0m | [36msrc.nexusflow.trainer.trainer[0m:[36m__init__[0m:[36m105[0m - [1mEnhanced Trainer initialized (device=cpu)[0m
[32m2025-08-25 03:32:40.512[0m | [1mINFO    [0m | [36msrc.nexusflow.trainer.trainer[0m:[36m_setup_data[0m:[36m178[0m - [1mUsing synthetic data mode[0m
[32m2025-08-25 03:32:40.518[0m | [1mINFO    [0m | [36msrc.nexusflow.trainer.trainer[0m:[36m_setup_data[0m:[36m187[0m - [1mSynthetic data: 2 datasets × 8 features[0m
[32m2025-08-25 03:32:40.541[0m | [1mINFO    [0m | [36mnexusflow.model.nexus_former[0m:[36m__init__[0m:[36m178[0m - [1mNexusFormer initialized: 2 encoders, 2 iterations[0m



📊 SYNTHETIC DATA TRAINING DEMO
✅ Configuration created for synthetic data training


[32m2025-08-25 03:32:54.997[0m | [1mINFO    [0m | [36msrc.nexusflow.trainer.trainer[0m:[36mlog_params[0m:[36m66[0m - [1mLogging parameters: {'model_name': 'NexusFormer', 'input_dims': [8, 8], 'embed_dim': 32, 'refinement_iterations': 2, 'num_parameters': 46145, 'learning_rate': 0.001, 'optimizer': 'adam', 'batch_size': 32, 'epochs': 3}[0m
[32m2025-08-25 03:32:54.998[0m | [1mINFO    [0m | [36msrc.nexusflow.trainer.trainer[0m:[36m__init__[0m:[36m155[0m - [1mModel initialized: 46145 parameters[0m
[32m2025-08-25 03:32:54.998[0m | [1mINFO    [0m | [36msrc.nexusflow.trainer.trainer[0m:[36msanity_check[0m:[36m342[0m - [1mRunning comprehensive sanity checks...[0m
[32m2025-08-25 03:32:55.008[0m | [34m[1mDEBUG   [0m | [36mnexusflow.model.nexus_former[0m:[36mforward[0m:[36m62[0m - [34m[1mStandardTabularEncoder output: shape=torch.Size([2, 32]) mean=0.0000[0m
[32m2025-08-25 03:32:55.009[0m | [34m[1mDEBUG   [0m | [36mnexusflow.model.nexus_form

✅ Trainer initialized

🔍 Running sanity check...


[32m2025-08-25 03:32:55.178[0m | [34m[1mDEBUG   [0m | [36mnexusflow.model.nexus_former[0m:[36mforward[0m:[36m203[0m - [34m[1mRefinement iteration 1/2[0m
[32m2025-08-25 03:32:55.183[0m | [34m[1mDEBUG   [0m | [36mnexusflow.model.nexus_former[0m:[36mforward[0m:[36m132[0m - [34m[1mCrossContextualAttention output: shape=torch.Size([12, 32])[0m
[32m2025-08-25 03:32:55.185[0m | [34m[1mDEBUG   [0m | [36mnexusflow.model.nexus_former[0m:[36mforward[0m:[36m218[0m - [34m[1mEncoder 0 attention change: 1.537369[0m
[32m2025-08-25 03:32:55.188[0m | [34m[1mDEBUG   [0m | [36mnexusflow.model.nexus_former[0m:[36mforward[0m:[36m132[0m - [34m[1mCrossContextualAttention output: shape=torch.Size([12, 32])[0m
[32m2025-08-25 03:32:55.190[0m | [34m[1mDEBUG   [0m | [36mnexusflow.model.nexus_former[0m:[36mforward[0m:[36m218[0m - [34m[1mEncoder 1 attention change: 2.113461[0m
[32m2025-08-25 03:32:55.192[0m | [34m[1mDEBUG   [0m | [36mnexusfl

✅ Sanity check passed

🚀 Starting training...


[32m2025-08-25 03:32:55.409[0m | [34m[1mDEBUG   [0m | [36mnexusflow.model.nexus_former[0m:[36mforward[0m:[36m199[0m - [34m[1mInitial encoding 0: shape=torch.Size([32, 32])[0m
[32m2025-08-25 03:32:55.414[0m | [34m[1mDEBUG   [0m | [36mnexusflow.model.nexus_former[0m:[36mforward[0m:[36m62[0m - [34m[1mStandardTabularEncoder output: shape=torch.Size([32, 32]) mean=-0.0005[0m
[32m2025-08-25 03:32:55.415[0m | [34m[1mDEBUG   [0m | [36mnexusflow.model.nexus_former[0m:[36mforward[0m:[36m199[0m - [34m[1mInitial encoding 1: shape=torch.Size([32, 32])[0m
[32m2025-08-25 03:32:55.416[0m | [34m[1mDEBUG   [0m | [36mnexusflow.model.nexus_former[0m:[36mforward[0m:[36m203[0m - [34m[1mRefinement iteration 1/2[0m
[32m2025-08-25 03:32:55.419[0m | [34m[1mDEBUG   [0m | [36mnexusflow.model.nexus_former[0m:[36mforward[0m:[36m132[0m - [34m[1mCrossContextualAttention output: shape=torch.Size([32, 32])[0m
[32m2025-08-25 03:32:55.421[0m | [34m

✅ Training completed!

📈 Evaluating model...

📊 Final Evaluation Results:
   accuracy: 0.4744
   test_loss: 0.7115
   num_test_samples: 78.0000
   num_test_batches: 3.0000


# 🗂️ Section 4: Real Data Preparation & Training

In [15]:
def create_sample_datasets():
    """Create sample CSV files to demonstrate real data training."""
    print("\n" + "="*60)
    print("🗂️  SAMPLE DATA CREATION")
    print("="*60)
    
    # Ensure datasets directory exists
    datasets_dir = Path("datasets")
    datasets_dir.mkdir(exist_ok=True)
    
    np.random.seed(42)  # For reproducibility
    n_customers = 1000
    
    # 1. Create customers table (demographics + target)
    customers_data = {
        'customer_id': range(1, n_customers + 1),
        'age': np.random.normal(35, 12, n_customers).clip(18, 80).astype(int),
        'income': np.random.lognormal(10.5, 0.5, n_customers).clip(20000, 200000).astype(int),
        'tenure_months': np.random.exponential(24, n_customers).clip(1, 120).astype(int),
        'region': np.random.choice(['North', 'South', 'East', 'West'], n_customers),
        'has_churned': np.random.binomial(1, 0.15, n_customers)  # 15% churn rate
    }
    
    customers_df = pd.DataFrame(customers_data)
    customers_df.to_csv("datasets/customers.csv", index=False)
    print(f"✅ Created customers.csv: {customers_df.shape}")
    
    # 2. Create transactions table (behavioral data)
    # Generate multiple transactions per customer
    transactions_data = []
    for customer_id in range(1, n_customers + 1):
        n_transactions = np.random.poisson(8)  # Average 8 transactions per customer
        for _ in range(n_transactions):
            transactions_data.append({
                'customer_id': customer_id,
                'transaction_amount': np.clip(np.random.lognormal(3, 1), 5, 1000),
                'days_since_last': np.clip(np.random.exponential(15), 1, 90),
                'transaction_type': np.random.choice(['online', 'store', 'phone'], p=[0.6, 0.3, 0.1])
            })
    
    transactions_df = pd.DataFrame(transactions_data)
    
    # Aggregate transactions by customer for this demo
    agg_transactions = transactions_df.groupby('customer_id').agg({
        'transaction_amount': ['mean', 'std', 'count'],
        'days_since_last': ['mean', 'min']
    }).round(2)
    
    # Flatten column names
    agg_transactions.columns = ['_'.join(col).strip() for col in agg_transactions.columns]
    agg_transactions = agg_transactions.reset_index()
    
    # Add transaction type preferences
    type_counts = transactions_df.groupby('customer_id')['transaction_type'].value_counts().unstack(fill_value=0)
    type_proportions = type_counts.div(type_counts.sum(axis=1), axis=0).round(3)
    type_proportions = type_proportions.reset_index()
    
    transactions_final = agg_transactions.merge(type_proportions, on='customer_id')
    transactions_final.to_csv("datasets/transactions.csv", index=False)
    print(f"✅ Created transactions.csv: {transactions_final.shape}")
    
    # 3. Create support tickets table (text-like features)
    support_data = []
    for customer_id in range(1, n_customers + 1):
        if np.random.random() < 0.3:  # 30% of customers have support tickets
            n_tickets = np.random.poisson(2) + 1
            for _ in range(n_tickets):
                support_data.append({
                    'customer_id': customer_id,
                    'ticket_priority': np.random.choice(['low', 'medium', 'high'], p=[0.5, 0.4, 0.1]),
                    'resolution_days': np.clip(np.random.exponential(3), 1, 30),
                    'satisfaction_score': np.clip(np.random.normal(3.5, 1), 1, 5)
                })
    
    if support_data:
        support_df = pd.DataFrame(support_data)
        
        # Aggregate support tickets by customer
        agg_support = support_df.groupby('customer_id').agg({
            'resolution_days': ['mean', 'max'],
            'satisfaction_score': ['mean', 'min']
        }).round(2)
        
        agg_support.columns = ['_'.join(col).strip() for col in agg_support.columns]
        agg_support = agg_support.reset_index()
        
        # Add priority distributions
        priority_counts = support_df.groupby('customer_id')['ticket_priority'].value_counts().unstack(fill_value=0)
        priority_props = priority_counts.div(priority_counts.sum(axis=1), axis=0).round(3)
        priority_props.columns = [f'priority_{col}_pct' for col in priority_props.columns]
        priority_props = priority_props.reset_index()
        
        support_final = agg_support.merge(priority_props, on='customer_id', how='left').fillna(0)
        support_final.to_csv("datasets/support.csv", index=False)
        print(f"✅ Created support.csv: {support_final.shape}")
    
    print(f"\n📋 Dataset Summary:")
    print(f"   Customers: {len(customers_df)} records")
    print(f"   Transactions: {len(transactions_final)} aggregated records") 
    print(f"   Support: {len(support_final) if 'support_final' in locals() else 0} records")
    
    return True

In [16]:
def demo_real_data_training():
    """Demonstrate training with real CSV data."""
    print("\n" + "="*60)
    print("📈 REAL DATA TRAINING DEMO")
    print("="*60)
    
    # First create sample data
    create_sample_datasets()
    
    # Create configuration for real data
    real_data_config = {
        'project_name': 'churn_prediction',
        'primary_key': 'customer_id',
        'target': {
            'target_table': 'customers.csv',
            'target_column': 'has_churned'
        },
        'architecture': {
            'refinement_iterations': 3,
            'global_embed_dim': 64
        },
        'datasets': [
            {'name': 'customers.csv', 'transformer_type': 'standard', 'complexity': 'medium'},
            {'name': 'transactions.csv', 'transformer_type': 'standard', 'complexity': 'medium'},
            {'name': 'support.csv', 'transformer_type': 'standard', 'complexity': 'small'}
        ],
        'training': {
            'use_synthetic': False,  # Use real data
            'batch_size': 64,
            'epochs': 5,
            'optimizer': {'name': 'adam', 'lr': 5e-4},
            'split_config': {
                'test_size': 0.15,
                'validation_size': 0.15,
                'randomize': True
            }
        },
        'mlops': {
            'logging_provider': 'stdout',
            'experiment_name': 'churn_prediction_v1'
        }
    }
    
    config = ConfigModel.model_validate(real_data_config)
    print("✅ Configuration created for real data training")
    
    # Initialize and train
    try:
        trainer = Trainer(config, work_dir=".")
        print("✅ Trainer initialized with real data")
        
        # Show data summary
        print("\n📊 Data Summary:")
        if trainer.datasets:
            for name, df in trainer.datasets.items():
                missing_pct = df.isnull().sum().sum() / df.size * 100
                print(f"   {name}: {df.shape} ({missing_pct:.1f}% missing)")
        print(f"   Input dimensions: {trainer.input_dims}")
        
        # Run sanity check
        print("\n🔍 Running comprehensive data validation...")
        trainer.sanity_check()
        print("✅ Data validation passed")
        
        # Train the model
        print("\n🚀 Training on real data...")
        trainer.train()
        print("✅ Real data training completed!")
        
        # Evaluate
        print("\n📈 Final evaluation...")
        metrics = trainer.evaluate()
        
        if metrics:
            print("\n🎯 Model Performance:")
            for metric, value in metrics.items():
                if 'accuracy' in metric:
                    print(f"   {metric}: {value:.2%}")
                else:
                    print(f"   {metric}: {value:.4f}")
        
        return config, trainer
        
    except Exception as e:
        print(f"❌ Real data training failed: {e}")
        import traceback
        traceback.print_exc()
        return None, None

In [17]:
# Run real data training demo
real_config, real_trainer = demo_real_data_training()


📈 REAL DATA TRAINING DEMO

🗂️  SAMPLE DATA CREATION
✅ Created customers.csv: (1000, 6)
✅ Created transactions.csv: (1000, 9)
✅ Created support.csv: (307, 8)

📋 Dataset Summary:
   Customers: 1000 records
   Transactions: 1000 aggregated records
   Support: 307 records
✅ Configuration created for real data training


[32m2025-08-25 19:43:38.615[0m | [1mINFO    [0m | [36msrc.nexusflow.trainer.trainer[0m:[36m_setup_file_logging[0m:[36m171[0m - [1mFile logging enabled: results\logs\training_churn_prediction_v1.log[0m
[32m2025-08-25 19:43:38.618[0m | [1mINFO    [0m | [36msrc.nexusflow.trainer.trainer[0m:[36m__init__[0m:[36m105[0m - [1mEnhanced Trainer initialized (device=cpu)[0m
[32m2025-08-25 19:43:38.619[0m | [1mINFO    [0m | [36msrc.nexusflow.trainer.trainer[0m:[36m_setup_data[0m:[36m190[0m - [1mLoading and aligning real datasets...[0m
[32m2025-08-25 19:43:38.645[0m | [1mINFO    [0m | [36mnexusflow.data.ingestion[0m:[36mload_table[0m:[36m24[0m - [1mLoaded table: datasets/customers.csv rows=1000 cols=6[0m
[32m2025-08-25 19:43:38.646[0m | [1mINFO    [0m | [36mnexusflow.data.ingestion[0m:[36mload_table[0m:[36m25[0m - [1mMissing values: total=0 (0.0%)[0m
[32m2025-08-25 19:43:38.648[0m | [34m[1mDEBUG   [0m | [36mnexusflow.data.ingestion[0

✅ Trainer initialized with real data

📊 Data Summary:
   customers.csv: (307, 6) (0.0% missing)
   transactions.csv: (307, 9) (0.0% missing)
   support.csv: (307, 8) (0.0% missing)
   Input dimensions: [4, 8, 7]

🔍 Running comprehensive data validation...
❌ Real data training failed: could not convert string to float: 'South'


# 🔍 Section 5: Model Inspection & Inference

In [None]:
def demonstrate_model_inference():
    """Show how to load and use trained models for inference."""
    print("\n" + "="*60)
    print("🔍 MODEL INFERENCE & INSPECTION")
    print("="*60)
    
    # Look for trained model files
    model_files = list(Path(".").glob("*.nxf"))
    
    if not model_files:
        print("⚠️  No .nxf model files found. Train a model first.")
        return
    
    # Use the most recent model file
    latest_model = max(model_files, key=lambda x: x.stat().st_mtime)
    print(f"📁 Loading model: {latest_model}")
    
    try:
        # Load the model artifact
        model = load_model(str(latest_model))
        print("✅ Model loaded successfully!")
        
        # Show model summary
        print("\n📋 Model Summary:")
        print(model.summary())
        
        # Show model parameters
        print("\n⚙️  Model Parameters:")
        params = model.get_params()
        key_params = ['total_parameters', 'input_dimensions', 'architecture', 'training_info']
        for key in key_params:
            if key in params:
                print(f"   {key}: {params[key]}")
        
        # Demonstrate inference with sample data
        print("\n🔮 Sample Inference:")
        
        # Create sample input data matching the model's expected format
        input_dims = params['input_dimensions']
        batch_size = 5
        
        # Generate random sample data
        sample_data = []
        for dim in input_dims:
            sample_table = pd.DataFrame(
                np.random.randn(batch_size, dim),
                columns=[f'feature_{i}' for i in range(dim)]
            )
            sample_data.append(sample_table)
        
        # Make predictions
        predictions = model.predict(sample_data)
        print(f"   Input: {len(sample_data)} tables with dims {input_dims}")
        print(f"   Output: {predictions.shape} predictions")
        print(f"   Sample predictions: {predictions[:3].round(4)}")
        
        # Try the visualization (placeholder)
        print("\n🎨 Model Visualization:")
        viz_info = model.visualize_flow()
        if viz_info:
            print(f"   Architecture: {viz_info.get('architecture')}")
            print(f"   Encoders: {viz_info.get('num_encoders')}")
            print(f"   Refinement iterations: {viz_info.get('refinement_iterations')}")
        
        return model
        
    except Exception as e:
        print(f"❌ Model loading/inference failed: {e}")
        import traceback
        traceback.print_exc()
        return None

In [None]:
# Run inference demo
loaded_model = demonstrate_model_inference()

# 📊 Section 6: Advanced Features & Analysis

In [None]:
def analyze_training_results():
    """Analyze and visualize training results."""
    print("\n" + "="*60)
    print("📊 TRAINING RESULTS ANALYSIS")
    print("="*60)
    
    # Look for results and logs
    results_dir = Path("results")
    if not results_dir.exists():
        print("⚠️  No results directory found")
        return
    
    # Check for metrics files
    metrics_file = results_dir / "metrics.json"
    if metrics_file.exists():
        import json
        with open(metrics_file) as f:
            metrics_data = json.load(f)
        
        print(f"✅ Found training metrics: {len(metrics_data)} logged steps")
        
        # Extract metrics for plotting
        steps = []
        train_losses = []
        val_losses = []
        
        for entry in metrics_data:
            if 'metrics' in entry and entry['step']:
                steps.append(entry['step'])
                train_losses.append(entry['metrics'].get('train_loss'))
                val_losses.append(entry['metrics'].get('val_loss'))
        
        # Create training curve plot
        if steps and train_losses:
            plt.figure(figsize=(12, 4))
            
            plt.subplot(1, 2, 1)
            plt.plot(steps, train_losses, 'b-', label='Training Loss', linewidth=2)
            if any(v is not None for v in val_losses):
                val_losses_clean = [v for v in val_losses if v is not None]
                val_steps = [s for s, v in zip(steps, val_losses) if v is not None]
                plt.plot(val_steps, val_losses_clean, 'r-', label='Validation Loss', linewidth=2)
            plt.xlabel('Epoch')
            plt.ylabel('Loss')
            plt.title('Training Curves')
            plt.legend()
            plt.grid(True, alpha=0.3)
            
            # Model architecture visualization
            plt.subplot(1, 2, 2)
            if loaded_model:
                params = loaded_model.get_params()
                input_dims = params.get('input_dimensions', [])
                
                # Create a simple bar chart of input dimensions
                if input_dims:
                    plt.bar(range(len(input_dims)), input_dims, 
                           color=sns.color_palette("husl", len(input_dims)))
                    plt.xlabel('Encoder Index')
                    plt.ylabel('Input Features')
                    plt.title('Model Input Dimensions')
                    for i, dim in enumerate(input_dims):
                        plt.text(i, dim + max(input_dims)*0.01, str(dim), 
                               ha='center', va='bottom', fontweight='bold')
            
            plt.tight_layout()
            plt.show()
    
    # Look for checkpoint files
    checkpoint_files = list(Path(".").glob("model_epoch_*.pt"))
    if checkpoint_files:
        print(f"\n📁 Found {len(checkpoint_files)} training checkpoints")
        latest_checkpoint = max(checkpoint_files, key=lambda x: x.stat().st_mtime)
        print(f"   Latest: {latest_checkpoint.name}")
    
    # Check for best model
    best_model = Path("best_model.pt")
    if best_model.exists():
        print(f"✅ Best model checkpoint available: {best_model}")

In [None]:
def demonstrate_config_variations():
    """Show different configuration options and their effects."""
    print("\n" + "="*60)
    print("⚙️  CONFIGURATION VARIATIONS")
    print("="*60)
    
    configs = {
        "Minimal Setup": {
            'project_name': 'minimal_test',
            'primary_key': 'id',
            'target': {'target_table': 'data.csv', 'target_column': 'label'},
            'datasets': [{'name': 'data.csv'}],
            'architecture': {'global_embed_dim': 32},
            'training': {'use_synthetic': True, 'epochs': 2}
        },
        
        "Complex Multi-Table": {
            'project_name': 'complex_system',
            'primary_key': 'user_id', 
            'target': {'target_table': 'users.csv', 'target_column': 'conversion'},
            'datasets': [
                {'name': 'users.csv', 'transformer_type': 'standard', 'complexity': 'large'},
                {'name': 'events.csv', 'transformer_type': 'timeseries', 'complexity': 'medium'},
                {'name': 'content.csv', 'transformer_type': 'text', 'complexity': 'medium'}
            ],
            'architecture': {'global_embed_dim': 128, 'refinement_iterations': 5},
            'training': {'batch_size': 128, 'epochs': 50, 'optimizer': {'name': 'adam', 'lr': 1e-4}}
        },
        
        "High Performance": {
            'project_name': 'high_performance',
            'primary_key': 'sample_id',
            'target': {'target_table': 'outcomes.csv', 'target_column': 'target'},
            'datasets': [
                {'name': f'features_{i}.csv', 'complexity': 'medium'} for i in range(5)
            ],
            'architecture': {'global_embed_dim': 256, 'refinement_iterations': 8},
            'training': {
                'batch_size': 256, 
                'epochs': 100,
                'optimizer': {'name': 'adam', 'lr': 2e-4},
                'split_config': {'test_size': 0.1, 'validation_size': 0.1}
            }
        }
    }
    
    for name, config in configs.items():
        print(f"\n📋 {name}:")
        print(f"   Datasets: {len(config['datasets'])}")
        print(f"   Embedding dim: {config.get('architecture', {}).get('global_embed_dim', 64)}")
        print(f"   Epochs: {config.get('training', {}).get('epochs', 10)}")
        if 'refinement_iterations' in config.get('architecture', {}):
            print(f"   Refinement loops: {config['architecture']['refinement_iterations']}")

In [None]:
# Run analysis functions
analyze_training_results()
demonstrate_config_variations()

# 🎯 Section 7: Production Usage Patterns

In [None]:
def show_production_patterns():
    """Demonstrate production usage patterns and best practices."""
    print("\n" + "="*60)
    print("🎯 PRODUCTION USAGE PATTERNS")
    print("="*60)
    
    print("🔧 1. Batch Inference Pipeline:")
    print("""
    # Load trained model
    model = nexusflow.load_model('models/production_model.nxf')
    
    # Process new data in batches
    batch_size = 1000
    predictions = []
    
    for batch_data in data_loader:
        batch_pred = model.predict(batch_data)
        predictions.extend(batch_pred)
    """)
    
    print("\n🔧 2. API Endpoint Integration:")
    print("""
    from flask import Flask, request, jsonify
    import nexusflow
    
    app = Flask(__name__)
    model = nexusflow.load_model('production_model.nxf')
    
    @app.route('/predict', methods=['POST'])
    def predict():
        data = request.json
        # Convert JSON to required DataFrame format
        predictions = model.predict(data)
        return jsonify({'predictions': predictions.tolist()})
    """)
    
    print("\n🔧 3. Model Monitoring & Updates:")
    print("""
    # Performance monitoring
    def monitor_model_drift(model, new_data, reference_stats):
        current_stats = compute_data_statistics(new_data)
        drift_score = calculate_drift(current_stats, reference_stats)
        
        if drift_score > DRIFT_THRESHOLD:
            trigger_retraining_pipeline()
    
    # A/B testing framework
    def ab_test_models(model_a, model_b, test_data):
        predictions_a = model_a.predict(test_data)
        predictions_b = model_b.predict(test_data)
        
        # Compare performance metrics
        return compare_performance(predictions_a, predictions_b)
    """)
    
    print("\n🔧 4. MLOps Best Practices:")
    best_practices = [
        "Version control your config.yaml files",
        "Use experiment tracking (W&B, MLflow)",
        "Implement automated model validation",
        "Set up continuous integration for model training",
        "Monitor feature distributions in production",
        "Implement model rollback mechanisms",
        "Use feature stores for consistent data access",
        "Implement proper error handling and logging"
    ]
    
    for i, practice in enumerate(best_practices, 1):
        print(f"   {i}. {practice}")

In [None]:
def create_deployment_checklist():
    """Create a deployment readiness checklist."""
    print("\n📋 DEPLOYMENT READINESS CHECKLIST:")
    print("="*50)
    
    checklist_items = [
        ("✅", "Model trained and validated on representative data"),
        ("✅", "Configuration files version controlled"),
        ("✅", "Performance benchmarks established"),
        ("⚠️", "Data drift monitoring implemented"),
        ("⚠️", "A/B testing framework ready"),
        ("⚠️", "Model rollback procedure tested"),
        ("❌", "Production API endpoints implemented"),
        ("❌", "Monitoring and alerting configured"),
        ("❌", "Documentation and runbooks created"),
        ("❌", "Security review completed")
    ]
    
    for status, item in checklist_items:
        print(f"{status} {item}")

In [None]:
show_production_patterns()
create_deployment_checklist()

# 🎓 Section 8: Summary & Next Steps

In [None]:
def create_summary():
    """Create a comprehensive summary of what we've covered."""
    print("\n" + "="*60)
    print("🎓 TUTORIAL SUMMARY & NEXT STEPS")
    print("="*60)
    
    print("🏆 What You've Learned:")
    achievements = [
        "Project initialization and structure",
        "Configuration management with YAML",
        "Training with both synthetic and real data",
        "Multi-table data alignment and processing", 
        "Model evaluation and performance analysis",
        "Model artifact creation and inference",
        "Production deployment considerations"
    ]
    
    for achievement in achievements:
        print(f"   ✅ {achievement}")
    
    print("\n🚀 Next Steps to Explore:")
    next_steps = [
        "Experiment with different architectures (embed_dim, refinement_iterations)",
        "Try heterogeneous data types (text, timeseries)",
        "Implement custom encoders for domain-specific data",
        "Set up MLOps integration (W&B, MLflow)",
        "Build production API endpoints",
        "Implement model monitoring and drift detection",
        "Explore the visualization capabilities",
        "Contribute to the NexusFlow project"
    ]
    
    for i, step in enumerate(next_steps, 1):
        print(f"   {i}. {step}")
    
    print("\n📚 Key Resources:")
    resources = [
        "NexusFlow Documentation: Check project README",
        "Configuration Examples: See configs/ directory", 
        "API Reference: nexusflow.api.model_api module",
        "Model Architecture: nexusflow.model.nexus_former module",
        "Training Pipeline: nexusflow.trainer.trainer module"
    ]
    
    for resource in resources:
        print(f"   📖 {resource}")
    
    print("\n💡 Pro Tips:")
    tips = [
        "Start with synthetic data to validate your setup",
        "Use smaller embed_dim for faster prototyping",
        "Monitor validation loss to prevent overfitting", 
        "Experiment with different refinement_iterations",
        "Save your successful configurations for reuse",
        "Use the CLI for production training workflows"
    ]
    
    for tip in tips:
        print(f"   💡 {tip}")

In [None]:
create_summary()

In [None]:
print("\n" + "="*60)
print("🎉 Tutorial Complete!")
print("="*60)
print("Thank you for exploring NexusFlow!")
print("🔗 Ready to build your own multi-transformer tabular ecosystems!")

# 🛠️ Section 9: Interactive Utilities & Helper Functions

In [None]:
def create_interactive_model_explorer():
    """Create an interactive model exploration widget."""
    print("\n" + "="*60)
    print("🛠️  INTERACTIVE MODEL EXPLORER")
    print("="*60)
    
    def explore_model_architecture():
        """Interactive function to explore different architectures."""
        print("🏗️  Architecture Explorer")
        print("-" * 30)
        
        # Interactive parameter selection
        embed_dims = [32, 64, 128, 256]
        refinement_iterations = [1, 2, 3, 5, 8]
        
        print("Available embedding dimensions:", embed_dims)
        print("Available refinement iterations:", refinement_iterations)
        
        # Calculate parameter counts for different configurations
        print("\n📊 Parameter Count Estimates:")
        for embed_dim in [64, 128]:
            for ref_iter in [2, 5]:
                # Rough parameter estimate (simplified)
                encoder_params = embed_dim * 50  # Rough estimate per encoder
                attention_params = embed_dim * embed_dim * 4  # Q,K,V,O projections
                fusion_params = embed_dim * 3 * 32  # Fusion layers
                
                total_params = 2 * encoder_params + 2 * attention_params + fusion_params
                print(f"   embed_dim={embed_dim}, iterations={ref_iter}: ~{total_params:,} parameters")
        
        return embed_dims, refinement_iterations
    
    def model_performance_predictor():
        """Predict approximate training time and memory usage."""
        print("\n⏱️  Performance Predictor")
        print("-" * 30)
        
        configs = [
            {"name": "Small", "embed_dim": 32, "iterations": 2, "datasets": 2},
            {"name": "Medium", "embed_dim": 64, "iterations": 3, "datasets": 3}, 
            {"name": "Large", "embed_dim": 128, "iterations": 5, "datasets": 4},
            {"name": "XLarge", "embed_dim": 256, "iterations": 8, "datasets": 5}
        ]
        
        for config in configs:
            # Rough estimates based on typical hardware
            train_time_per_epoch = config["embed_dim"] * config["iterations"] * config["datasets"] * 0.001
            memory_mb = config["embed_dim"] * config["datasets"] * 0.5
            
            print(f"   {config['name']:>7}: ~{train_time_per_epoch:.1f}s/epoch, ~{memory_mb:.0f}MB memory")
    
    explore_model_architecture()
    model_performance_predictor()

In [None]:
def create_data_quality_analyzer():
    """Create tools for analyzing data quality and preparation."""
    print("\n" + "="*60)
    print("🔍 DATA QUALITY ANALYZER")
    print("="*60)
    
    def analyze_dataset_compatibility():
        """Check if datasets are compatible for NexusFlow training."""
        print("🔗 Dataset Compatibility Checker")
        print("-" * 35)
        
        # Check if our sample datasets exist
        dataset_files = ["customers.csv", "transactions.csv", "support.csv"]
        datasets_dir = Path("datasets")
        
        compatibility_report = {}
        
        for filename in dataset_files:
            filepath = datasets_dir / filename
            if filepath.exists():
                try:
                    df = pd.read_csv(filepath)
                    
                    # Analyze dataset characteristics
                    analysis = {
                        "shape": df.shape,
                        "missing_values": df.isnull().sum().sum(),
                        "missing_percentage": (df.isnull().sum().sum() / df.size) * 100,
                        "numeric_columns": len(df.select_dtypes(include=[np.number]).columns),
                        "categorical_columns": len(df.select_dtypes(include=['object']).columns),
                        "has_customer_id": 'customer_id' in df.columns,
                        "duplicate_ids": df['customer_id'].duplicated().sum() if 'customer_id' in df.columns else 0
                    }
                    
                    compatibility_report[filename] = analysis
                    
                    # Print summary
                    status = "✅" if analysis["missing_percentage"] < 10 and analysis["has_customer_id"] else "⚠️"
                    print(f"   {status} {filename}: {df.shape[0]:,} rows × {df.shape[1]} cols, {analysis['missing_percentage']:.1f}% missing")
                    
                except Exception as e:
                    print(f"   ❌ {filename}: Error reading file - {e}")
            else:
                print(f"   ❌ {filename}: File not found")
        
        return compatibility_report
    
    def suggest_data_improvements():
        """Suggest improvements for better model performance."""
        print("\n💡 Data Improvement Suggestions")
        print("-" * 35)
        
        suggestions = [
            "🔹 Feature Engineering: Create interaction features between tables",
            "🔹 Temporal Features: Add time-based aggregations (recent vs. historical)",
            "🔹 Categorical Encoding: Use target encoding for high-cardinality categories",
            "🔹 Missing Value Strategy: Consider domain-specific imputation methods",
            "🔹 Feature Scaling: Normalize numerical features within each table",
            "🔹 Outlier Treatment: Cap extreme values or use robust transformations",
            "🔹 Data Validation: Implement schema validation for production data",
            "🔹 Feature Selection: Remove highly correlated or low-variance features"
        ]
        
        for suggestion in suggestions:
            print(f"   {suggestion}")
    
    report = analyze_dataset_compatibility()
    suggest_data_improvements()
    
    return report

In [None]:
def create_experiment_tracker():
    """Create a simple experiment tracking utility."""
    print("\n" + "="*60)
    print("🧪 EXPERIMENT TRACKER")
    print("="*60)
    
    experiments_log = []
    
    def log_experiment(name, config, results=None):
        """Log an experiment configuration and results."""
        experiment = {
            "name": name,
            "timestamp": pd.Timestamp.now().isoformat(),
            "config": config,
            "results": results or {}
        }
        experiments_log.append(experiment)
        
        print(f"📝 Logged experiment: {name}")
        return len(experiments_log) - 1  # Return experiment ID
    
    def compare_experiments(exp_ids=None):
        """Compare multiple experiments."""
        if not experiments_log:
            print("No experiments logged yet")
            return
        
        if exp_ids is None:
            exp_ids = list(range(len(experiments_log)))
        
        print(f"🔍 Comparing {len(exp_ids)} experiments:")
        print("-" * 50)
        
        comparison_data = []
        for exp_id in exp_ids:
            if exp_id < len(experiments_log):
                exp = experiments_log[exp_id]
                
                # Extract key metrics
                config = exp.get("config", {})
                results = exp.get("results", {})
                
                row = {
                    "Name": exp["name"],
                    "Embed_Dim": config.get("architecture", {}).get("global_embed_dim", "N/A"),
                    "Refinements": config.get("architecture", {}).get("refinement_iterations", "N/A"),
                    "Epochs": config.get("training", {}).get("epochs", "N/A"),
                    "Batch_Size": config.get("training", {}).get("batch_size", "N/A"),
                    "Accuracy": results.get("accuracy", "N/A"),
                    "Loss": results.get("test_loss", "N/A")
                }
                comparison_data.append(row)
        
        if comparison_data:
            comparison_df = pd.DataFrame(comparison_data)
            display(comparison_df)
        
        return comparison_data
    
    # Log our demo experiments
    if 'synthetic_config' in locals() and synthetic_config:
        log_experiment("Synthetic Demo", synthetic_config.model_dump() if hasattr(synthetic_config, 'model_dump') else dict(synthetic_config))
    
    if 'real_config' in locals() and real_config:
        log_experiment("Real Data Demo", real_config.model_dump() if hasattr(real_config, 'model_dump') else dict(real_config))
    
    def save_experiments_log():
        """Save experiments to file."""
        if experiments_log:
            log_file = Path("experiments_log.json")
            import json
            with open(log_file, 'w') as f:
                json.dump(experiments_log, f, indent=2, default=str)
            print(f"💾 Saved experiments log to: {log_file}")
    
    print(f"📊 Current experiments logged: {len(experiments_log)}")
    if experiments_log:
        compare_experiments()
    
    return log_experiment, compare_experiments, save_experiments_log

In [None]:
# Run interactive utilities
create_interactive_model_explorer()
data_quality_report = create_data_quality_analyzer()
log_exp, compare_exp, save_exp_log = create_experiment_tracker()

# 🎨 Section 10: Advanced Visualization & Analysis

In [None]:
def create_advanced_visualizations():
    """Create advanced visualization tools for model analysis."""
    print("\n" + "="*60)
    print("🎨 ADVANCED VISUALIZATIONS")
    print("="*60)
    
    def plot_architecture_diagram():
        """Create a visual representation of the NexusFormer architecture."""
        print("🏗️  Architecture Visualization")
        print("-" * 35)
        
        if loaded_model:
            params = loaded_model.get_params()
            input_dims = params.get('input_dimensions', [64, 32, 16])
            embed_dim = params.get('architecture', {}).get('embed_dim', 64)
            refinement_iterations = params.get('architecture', {}).get('refinement_iterations', 3)
        else:
            # Use example values
            input_dims = [64, 32, 16]
            embed_dim = 64
            refinement_iterations = 3
        
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
        
        # 1. Input Dimensions
        ax1 = axes[0, 0]
        colors = sns.color_palette("viridis", len(input_dims))
        bars = ax1.bar(range(len(input_dims)), input_dims, color=colors)
        ax1.set_title('Input Dimensions per Encoder')
        ax1.set_xlabel('Encoder Index')
        ax1.set_ylabel('Feature Count')
        
        # Add value labels on bars
        for bar, dim in zip(bars, input_dims):
            height = bar.get_height()
            ax1.text(bar.get_x() + bar.get_width()/2., height,
                    f'{dim}', ha='center', va='bottom', fontweight='bold')
        
        # 2. Architecture Flow
        ax2 = axes[0, 1]
        ax2.set_title('NexusFormer Information Flow')
        
        # Create a simplified flow diagram
        y_positions = np.arange(len(input_dims))
        x_encoder = [0] * len(input_dims)
        x_attention = [2] * len(input_dims)
        x_fusion = [4]
        
        # Plot encoders
        ax2.scatter(x_encoder, y_positions, s=200, c=colors, alpha=0.7, label='Encoders')
        
        # Plot attention mechanisms
        ax2.scatter(x_attention, y_positions, s=150, c='red', marker='s', alpha=0.7, label='Cross-Attention')
        
        # Plot fusion
        ax2.scatter(x_fusion, [len(input_dims)//2], s=300, c='gold', marker='D', label='Fusion Layer')
        
        # Draw connections
        for i in y_positions:
            # Encoder to attention
            ax2.arrow(0.1, i, 1.7, 0, head_width=0.1, head_length=0.1, fc='gray', ec='gray', alpha=0.5)
            # Attention to fusion
            ax2.arrow(2.1, i, 1.5, len(input_dims)//2 - i - 0.1, head_width=0.1, head_length=0.1, fc='gray', ec='gray', alpha=0.5)
        
        ax2.set_xlim(-0.5, 5)
        ax2.set_ylim(-0.5, len(input_dims) + 0.5)
        ax2.set_xticks([0, 2, 4])
        ax2.set_xticklabels(['Encoders', 'Cross-Attention', 'Fusion'])
        ax2.set_yticks(range(len(input_dims)))
        ax2.set_yticklabels([f'Table {i+1}' for i in range(len(input_dims))])
        ax2.legend()
        ax2.grid(True, alpha=0.3)
        
        # 3. Parameter Distribution
        ax3 = axes[1, 0]
        
        # Estimate parameter distribution
        encoder_params = sum(dim * embed_dim for dim in input_dims)
        attention_params = len(input_dims) * embed_dim * embed_dim * 4  # Q,K,V,O
        fusion_params = len(input_dims) * embed_dim * 64  # Rough estimate
        
        param_categories = ['Encoders', 'Cross-Attention', 'Fusion']
        param_counts = [encoder_params, attention_params, fusion_params]
        
        wedges, texts, autotexts = ax3.pie(param_counts, labels=param_categories, autopct='%1.1f%%', startangle=90)
        ax3.set_title('Parameter Distribution (Estimated)')
        
        # 4. Refinement Process
        ax4 = axes[1, 1]
        iterations = list(range(refinement_iterations + 1))
        
        # Simulate attention convergence (for illustration)
        np.random.seed(42)
        attention_changes = [1.0] + [np.exp(-i * 0.5) + np.random.normal(0, 0.1) for i in range(1, refinement_iterations + 1)]
        attention_changes = [max(0, x) for x in attention_changes]  # Ensure non-negative
        
        ax4.plot(iterations, attention_changes, 'o-', linewidth=2, markersize=8)
        ax4.set_title('Attention Refinement Process')
        ax4.set_xlabel('Refinement Iteration')
        ax4.set_ylabel('Attention Change Magnitude')
        ax4.grid(True, alpha=0.3)
        ax4.set_ylim(0, max(attention_changes) * 1.1)
        
        plt.tight_layout()
        plt.show()
        
        return fig
    
    def plot_data_quality_heatmap():
        """Create a heatmap showing data quality metrics."""
        print("\n📊 Data Quality Heatmap")
        print("-" * 30)
        
        if data_quality_report:
            # Convert quality report to matrix form
            metrics = ['missing_percentage', 'numeric_columns', 'categorical_columns']
            datasets = list(data_quality_report.keys())
            
            quality_matrix = []
            for dataset in datasets:
                row = []
                for metric in metrics:
                    value = data_quality_report[dataset].get(metric, 0)
                    if metric == 'missing_percentage':
                        # Invert missing percentage (lower is better)
                        value = max(0, 100 - value)
                    row.append(value)
                quality_matrix.append(row)
            
            quality_df = pd.DataFrame(quality_matrix, index=datasets, columns=metrics)
            
            plt.figure(figsize=(10, 6))
            sns.heatmap(quality_df, annot=True, fmt='.1f', cmap='RdYlGn', 
                       cbar_kws={'label': 'Quality Score'})
            plt.title('Dataset Quality Overview')
            plt.xlabel('Quality Metrics')
            plt.ylabel('Datasets')
            plt.tight_layout()
            plt.show()
    
    def create_feature_importance_plot():
        """Simulate and plot feature importance analysis."""
        print("\n🎯 Feature Importance Analysis")
        print("-" * 35)
        
        # Simulate feature importance scores
        np.random.seed(42)
        
        if loaded_model:
            params = loaded_model.get_params()
            input_dims = params.get('input_dimensions', [])
        else:
            input_dims = [10, 8, 6]
        
        # Create mock feature importance data
        all_features = []
        all_importance = []
        
        for table_idx, dim in enumerate(input_dims):
            table_name = f"Table_{table_idx + 1}"
            for feat_idx in range(dim):
                feature_name = f"{table_name}_feat_{feat_idx + 1}"
                # Simulate decreasing importance with some randomness
                importance = np.random.beta(2, 5) * (1 - feat_idx / dim)
                
                all_features.append(feature_name)
                all_importance.append(importance)
        
        # Sort by importance
        importance_data = sorted(zip(all_features, all_importance), key=lambda x: x[1], reverse=True)
        features, importance = zip(*importance_data[:15])  # Top 15 features
        
        # Create the plot
        plt.figure(figsize=(12, 8))
        colors = []
        for feat in features:
            if 'Table_1' in feat:
                colors.append('skyblue')
            elif 'Table_2' in feat:
                colors.append('lightgreen')
            else:
                colors.append('lightcoral')
        
        bars = plt.barh(range(len(features)), importance, color=colors)
        plt.yticks(range(len(features)), features)
        plt.xlabel('Importance Score')
        plt.title('Top 15 Feature Importance (Simulated)')
        plt.grid(axis='x', alpha=0.3)
        
        # Add legend
        handles = [plt.Rectangle((0,0),1,1, color='skyblue', label='Table 1'),
                  plt.Rectangle((0,0),1,1, color='lightgreen', label='Table 2'),
                  plt.Rectangle((0,0),1,1, color='lightcoral', label='Table 3')]
        plt.legend(handles=handles, loc='lower right')
        
        plt.tight_layout()
        plt.show()
        
        return list(zip(features, importance))
    
    # Create the visualizations
    if 'matplotlib' in sys.modules:
        arch_fig = plot_architecture_diagram()
        plot_data_quality_heatmap()
        feature_importance = create_feature_importance_plot()
    else:
        print("⚠️  Matplotlib not available for visualizations")

In [None]:
# Run advanced visualizations
create_advanced_visualizations()

# 🔧 Section 11: Troubleshooting & FAQ

In [None]:
def create_troubleshooting_guide():
    """Create a comprehensive troubleshooting guide."""
    print("\n" + "="*60)
    print("🔧 TROUBLESHOOTING GUIDE")
    print("="*60)
    
    problems_and_solutions = {
        "🚨 Training Errors": [
            {
                "problem": "CUDA out of memory",
                "solution": "Reduce batch_size in config, use smaller embed_dim, or train on CPU",
                "example": "training: { batch_size: 16 }  # Instead of 64"
            },
            {
                "problem": "Model convergence issues",
                "solution": "Lower learning rate, increase epochs, or reduce model complexity",
                "example": "optimizer: { name: adam, lr: 1e-4 }  # Instead of 1e-3"
            },
            {
                "problem": "Validation loss not improving",
                "solution": "Check for data leakage, reduce overfitting, or increase dataset size",
                "example": "Add dropout, reduce refinement_iterations, or use early stopping"
            }
        ],
        
        "🗂️  Data Issues": [
            {
                "problem": "Primary key alignment fails",
                "solution": "Ensure all datasets have the same primary key column with matching values",
                "example": "All CSVs must have 'customer_id' column with identical IDs"
            },
            {
                "problem": "Missing values causing errors", 
                "solution": "Handle missing values before training or configure proper imputation",
                "example": "df.fillna(df.mean()) or use more sophisticated imputation"
            },
            {
                "problem": "Inconsistent data types",
                "solution": "Ensure numerical features are properly typed and categorical features encoded",
                "example": "df['numeric_col'] = pd.to_numeric(df['numeric_col'], errors='coerce')"
            }
        ],
        
        "⚙️  Configuration Problems": [
            {
                "problem": "YAML parsing errors",
                "solution": "Check indentation, quotes, and YAML syntax",
                "example": "Use spaces (not tabs) for indentation, quote string values"
            },
            {
                "problem": "Model architecture mismatch",
                "solution": "Ensure input_dims match your actual data dimensions",
                "example": "Count features correctly after excluding primary key and target"
            },
            {
                "problem": "Resource constraints",
                "solution": "Scale down model size for local development",
                "example": "Use embed_dim: 32, refinement_iterations: 2 for testing"
            }
        ]
    }
    
    for category, issues in problems_and_solutions.items():
        print(f"\n{category}")
        print("-" * 40)
        
        for i, issue in enumerate(issues, 1):
            print(f"\n{i}. ❓ Problem: {issue['problem']}")
            print(f"   💡 Solution: {issue['solution']}")
            print(f"   📝 Example: {issue['example']}")

In [None]:
def create_faq_section():
    """Create a frequently asked questions section."""
    print("\n" + "="*60)
    print("❓ FREQUENTLY ASKED QUESTIONS")
    print("="*60)
    
    faqs = [
        {
            "q": "How does NexusFlow differ from traditional ML approaches?",
            "a": "NexusFlow processes multiple tables simultaneously without flattening, using specialized transformers that communicate via cross-attention. This preserves relationships between tables and captures complex multi-hop dependencies."
        },
        {
            "q": "When should I use NexusFlow vs. standard approaches?",
            "a": "Use NexusFlow when you have multiple related tables, complex relationships between entities, or when traditional feature engineering becomes too complex/lossy."
        },
        {
            "q": "How do I choose the right architecture parameters?",
            "a": "Start with embed_dim=64 and refinement_iterations=3. Increase embed_dim for more complex data, increase refinement_iterations for stronger table interactions. Monitor validation performance."
        },
        {
            "q": "Can I use different encoder types for different tables?",
            "a": "Yes! Set transformer_type to 'standard', 'text', or 'timeseries' in your dataset configuration. Each table can have its own specialized encoder."
        },
        {
            "q": "How much data do I need for good results?",
            "a": "Generally 1000+ aligned records minimum. More complex architectures need more data. The quality of relationships between tables is often more important than raw size."
        },
        {
            "q": "Is NexusFlow suitable for production use?",
            "a": "Yes, but ensure proper testing, monitoring, and MLOps practices. Start with smaller models in production and scale up based on performance requirements."
        },
        {
            "q": "How do I interpret the model's predictions?",
            "a": "Use the visualize_flow() method to understand information flow between tables. Feature importance analysis and attention visualization help explain predictions."
        },
        {
            "q": "Can I add new tables to an existing model?",
            "a": "Currently, you need to retrain with the new architecture. Future versions may support incremental table addition with transfer learning."
        }
    ]
    
    for i, faq in enumerate(faqs, 1):
        print(f"\n{i}. ❓ {faq['q']}")
        print(f"   💬 {faq['a']}")

In [None]:
def create_performance_optimization_tips():
    """Provide performance optimization tips."""
    print("\n" + "="*60)
    print("⚡ PERFORMANCE OPTIMIZATION TIPS")
    print("="*60)
    
    optimization_categories = {
        "🚀 Training Speed": [
            "Use GPU acceleration (CUDA) when available",
            "Optimize batch_size for your hardware (try 32, 64, 128)",
            "Use smaller embed_dim during development (32-64)",
            "Reduce refinement_iterations for faster prototyping (1-2)",
            "Use mixed precision training for larger models",
            "Profile your code to identify bottlenecks"
        ],
        
        "💾 Memory Usage": [
            "Reduce batch_size if getting OOM errors",
            "Use gradient accumulation for effective larger batches",
            "Clear unnecessary variables and use torch.cuda.empty_cache()",
            "Process data in chunks if datasets are very large",
            "Use appropriate data types (float32 instead of float64)",
            "Monitor memory usage throughout training"
        ],
        
        "🎯 Model Quality": [
            "Start with synthetic data to validate architecture",
            "Use cross-validation for robust evaluation",
            "Implement early stopping to prevent overfitting",
            "Experiment with different learning rates (1e-4 to 1e-2)",
            "Try different optimizers (Adam, AdamW, SGD)",
            "Use learning rate scheduling for better convergence"
        ],
        
        "📊 Data Efficiency": [
            "Ensure high-quality primary key alignment",
            "Remove highly correlated or low-variance features",
            "Use appropriate feature scaling within each table",
            "Handle missing values strategically (not just fillna(0))",
            "Consider feature selection techniques",
            "Validate data quality before training"
        ]
    }
    
    for category, tips in optimization_categories.items():
        print(f"\n{category}")
        print("-" * 35)
        for tip in tips:
            print(f"   • {tip}")

In [None]:
# Run troubleshooting sections
create_troubleshooting_guide()
create_faq_section()
create_performance_optimization_tips()

# 🎉 Section 12: Final Wrap-up & Resources

In [None]:
def create_final_summary():
    """Create the final comprehensive summary."""
    print("\n" + "="*80)
    print("🎉 NEXUSFLOW TUTORIAL COMPLETE!")
    print("="*80)
    
    print("\n🏆 What You've Mastered:")
    mastery_areas = [
        "✅ Project structure and initialization",
        "✅ Configuration management and best practices", 
        "✅ Multi-table data preparation and alignment",
        "✅ Model training with both synthetic and real data",
        "✅ Architecture understanding and parameter tuning",
        "✅ Model evaluation and performance analysis",
        "✅ Inference and production deployment patterns",
        "✅ Troubleshooting and optimization techniques",
        "✅ Advanced visualization and model introspection",
        "✅ MLOps integration and experiment tracking"
    ]
    
    for area in mastery_areas:
        print(f"   {area}")
    
    print(f"\n🗂️  Files Created in This Tutorial:")
    created_files = [
        "📁 nexusflow_demo/ - Sample project structure",
        "📄 datasets/customers.csv - Sample customer data",
        "📄 datasets/transactions.csv - Sample transaction data", 
        "📄 datasets/support.csv - Sample support data",
        "🧠 *.nxf - Trained model artifacts",
        "📈 results/metrics.json - Training metrics log",
        "📋 experiments_log.json - Experiment tracking data"
    ]
    
    for file_info in created_files:
        print(f"   {file_info}")
    
    print(f"\n🚀 Ready for Production:")
    production_checklist = [
        "🔧 CLI commands: nexusflow init, train, validate",
        "🏗️  Architecture design principles and trade-offs",
        "📊 Data quality assessment and improvement strategies",
        "🎯 Model evaluation and performance monitoring",
        "🔍 Debugging and troubleshooting techniques",
        "⚡ Performance optimization for different scenarios",
        "🛡️  Production deployment considerations and best practices"
    ]
    
    for item in production_checklist:
        print(f"   {item}")
    
    print(f"\n🌟 Key Takeaways:")
    takeaways = [
        "NexusFlow excels at learning from multiple related tables without data flattening",
        "Cross-attention mechanisms enable sophisticated inter-table relationships",
        "Start simple (synthetic data, small models) then scale up systematically",
        "Configuration-driven approach enables reproducible experiments",
        "Built-in MLOps integration supports production deployment",
        "Visualization capabilities provide model interpretability",
        "Community-focused design encourages contribution and extension"
    ]
    
    for takeaway in takeaways:
        print(f"   💡 {takeaway}")
    
    print(f"\n🔗 Connect with the Community:")
    community_info = [
        "📖 Documentation: Explore the full project documentation",
        "🐛 Issues: Report bugs or request features on GitHub",
        "💬 Discussions: Join the community discussions",
        "🤝 Contributing: Submit pull requests and improvements",
        "📧 Contact: Reach out to the development team",
        "🎓 Learning: Share your success stories and use cases"
    ]
    
    for info in community_info:
        print(f"   {info}")
    
    print("\n" + "="*80)
    print("Thank you for exploring NexusFlow! 🚀")
    print("Build amazing multi-transformer tabular ecosystems! 🌟")
    print("="*80)

In [None]:
# Final summary
create_final_summary()

Save this notebook's state for future reference