# DISTILLED MONITORING SYSTEM

## Predictive monitoring with local caching and fallback support

### 🚀 QUICK WORKFLOW:
1. `setup()` - Initialize system and fallbacks
2. `generate_datasets()` - Generate training data (resumable)
3. `train()` - Train the distilled model
4. `test()` - Test model inference
5. `demo()` - Run monitoring demo

### 📊 MONITORING:
- `status()` - Check system status
- `show_progress()` - Check dataset generation progress

### 🔧 RECOVERY:
- `retry_failed()` - Retry failed generations
- `reset_progress()` - Start fresh

**Data Sources:** Splunk, Jira, Confluence, IBM Spectrum Conductor, VEMKD logs from Red Hat Linux

**Fallback Order:** Remote API → Ollama → Local Model → Static Responses

In [None]:
# Import the system
from main_notebook import *
from config import CONFIG

# Use the locally cached model instead of attempting a download.
CONFIG['model_name'] = "bert-base-uncased"

# Startup banner; the cache-directory line is printed separately because it
# reads from CONFIG.
print(
    "🚀 Distilled Monitoring System",
    "📊 Ready for predictive monitoring with local caching",
    sep="\n",
)
print(f"📁 Cache directory: {CONFIG['hf_cache_dir']}")

In [None]:
# 1. Setup system with fallback chain
print(
    "🚀 Setting up Distilled Monitoring System...",
    "This includes: directories, fallback systems, and progress tracking",
    sep="\n",
)

setup_success = setup()

# Report the outcome: remedies on failure, the next workflow step on success.
if not setup_success:
    print("\n❌ Setup failed. Check error messages above.")
    print("You may need to install dependencies or setup Ollama.")
else:
    print("\n✅ System setup complete!")
    print("\nNext: generate_datasets() to create training data")

In [None]:
# 2. Check system status
# status() is not defined in this cell; presumably it is provided by the
# `from main_notebook import *` star import in the first cell — confirm.
status()

In [None]:
# 3. Check dataset generation progress (if any)
print("📊 Current dataset generation progress:")
show_progress()

# Static usage hints, one per output line.
print(
    "\n💡 TIPS:",
    "• First run: Will show new session",
    "• Resuming: Will show existing progress",
    "• Use reset_progress() to start fresh",
    "• Use retry_failed() to retry failed items",
    sep="\n",
)

In [None]:
# Use with caution: reset_progress() deletes saved progress and starts dataset generation fresh.
# reset_progress()

In [None]:
# 4. Generate training datasets with dynamic calculation
#
# Prints the per-type language targets, reports how much of each target
# already exists, then calls generate_datasets() to produce what is missing.
print("📊 DATASET GENERATION")
print("=" * 50)

# Calculate dynamic targets based on YAML content
# NOTE(review): this import brings in OptimizedDatasetGenerator, but the code
# below instantiates DatasetGenerator, which this cell does not import
# (presumably it comes from `from main_notebook import *` — confirm, then
# align the import with the class that is actually used).
from dataset_generator import OptimizedDatasetGenerator

# Calculate more realistic targets
temp_generator = DatasetGenerator()  # Changed from OptimizedDatasetGenerator
targets, total_language_target = temp_generator._calculate_dynamic_targets()
existing = temp_generator._analyze_existing_dataset()

print("🧮 Realistic Target Calculation:")
print(f"  Technical explanations: {targets.get('technical_explanation', 0)}")
print(f"  Error interpretations: {targets.get('error_interpretation', 0)}")
print(f"  Conversational samples: {targets.get('conversational_samples', 0)}")
print(f"  Total language target: {total_language_target}")
print(f"  Metrics target: {CONFIG['metrics_samples']}")
print(f"  Models per question: {CONFIG.get('models_per_question', 2)}")
print("")

# Single pass over the targets: the report used to be printed twice, with two
# different definitions of total_existing. Only sample types that have a
# target are counted, so the total is comparable with total_language_target.
print("📊 Current Progress:")
total_existing = 0
total_needed = 0

for sample_type, target_count in targets.items():
    existing_count = existing.get(sample_type, 0)
    needed = max(0, target_count - existing_count)
    total_existing += existing_count
    total_needed += needed
    # Deliberately not named `status`: notebook cells share one namespace, and
    # rebinding `status` to a string would break later status() calls.
    marker = "✅" if needed == 0 else "🔄"
    print(f"  {marker} {sample_type}: {existing_count}/{target_count} (need {needed})")

# Avoid ZeroDivisionError when no language samples are targeted at all.
if total_language_target:
    progress_pct = total_existing / total_language_target * 100
else:
    progress_pct = 0.0

print("")
print(f"📈 Overall Progress: {total_existing}/{total_language_target} ({progress_pct:.1f}%)")
print(f"🎯 Remaining: {total_needed} language samples to generate")
print("")

print("⚠️  IMPORTANT:")
print("• Generation time based on actual YAML content")
print("• Safe to interrupt with Ctrl+C (progress saved)")
print("• Progress saved every 50 items")
print("• Automatically resumes from last checkpoint")
print("⚠️ Requires Network access to Huggingface or manual download into the hf_cache directory.")
print("")

if total_needed > 0:
    print(f"🚀 Starting generation of {total_needed} remaining samples...")
else:
    print("✅ All language samples already complete! Checking metrics...")

# Generate datasets using dynamic calculation
try:
    language_data, metrics_data = generate_datasets(
        language_count=None,  # Use dynamic calculation
        metrics_count=CONFIG['metrics_samples']
    )

    if language_data is not None or metrics_data is not None:
        print("\n✅ Dataset generation completed!")
        if language_data:
            print(f"Language samples: {len(language_data)} new samples generated")
        # metrics_data may be None even when language_data is not; only report
        # the metrics count when there is something to inspect.
        if metrics_data is not None:
            print(f"Metrics samples: {len(metrics_data.get('training_samples', []))} total")
        print("\nNext: train() to train the model")
    else:
        print("\n⚠️  Generation interrupted or no new samples needed")
        print("Run this cell again to continue if needed")

except KeyboardInterrupt:
    print("\n⏸️  Generation interrupted by user")
    print("Progress saved. Run this cell again to continue.")
except Exception as e:
    # Notebook-level boundary: report the failure instead of a raw traceback.
    print(f"\n❌ Generation error: {e}")
    print("Check error and use retry_failed() if needed.")

In [None]:
# 5. Train the distilled model
print("🏋️ TRAINING DISTILLED MODEL", "=" * 40, sep="\n")
print(f"Environment: {detect_training_environment()}")

# Echo the training-related settings, looked up one at a time.
for _label, _key in (
    ("Model", "model_name"),
    ("Batch size", "batch_size"),
    ("Epochs", "epochs"),
):
    print(f"{_label}: {CONFIG[_key]}")

print("", "⚠️  Training can take way long depending on hardware", "", sep="\n")

try:
    success = train()
    if not success:
        print("\n❌ Training failed - check if datasets exist")
    else:
        print("\n✅ Training completed!")
        print("Next: test() to test the model")
except Exception as err:
    print(f"\n❌ Training error: {err}")
    print("Check logs for detailed error information")

In [None]:
# 6. Test model inference
print(
    "🧪 TESTING MODEL INFERENCE",
    "=" * 40,
    "Testing with scenarios:",
    "• Normal operation",
    "• CPU spike",
    "• Memory pressure",
    "",
    sep="\n",
)

test_success = test()

# Report the outcome and name the next workflow step on success.
if not test_success:
    print("\n❌ Testing failed - ensure model is trained")
else:
    print("\n✅ Model testing successful!")
    print("Next: demo() to run monitoring demo")

In [None]:
# 7. Run monitoring demo
print(
    "🎭 MONITORING DEMO",
    "=" * 30,
    "Features:",
    "• Real-time metric processing",
    "• Anomaly detection",
    "• Alert generation",
    "• Recommendation engine",
    "• Dashboard display",
    "",
    sep="\n",
)

# Customize demo duration
DEMO_MINUTES = 3

print(
    f"Running {DEMO_MINUTES}-minute demo...",
    "Will inject anomalies to demonstrate detection",
    "",
    sep="\n",
)

try:
    demo(minutes=DEMO_MINUTES)
    print("\n✅ Demo completed!", "Check exported metrics history for results", sep="\n")
except KeyboardInterrupt:
    # Stopping the demo by hand is reported as a normal stop, not an error.
    print("\n⏹️  Demo stopped by user")
except Exception as err:
    print(f"\n❌ Demo error: {err}")

In [None]:
# 8. Final system status
print("📋 FINAL SYSTEM STATUS", "=" * 40, sep="\n")

status()

# Closing summary and suggested follow-up work (static text).
print(
    "\n🎉 SYSTEM COMPLETE!",
    "=" * 30,
    "Your distilled monitoring system includes:",
    "  ✅ Multi-model fallback system",
    "  ✅ Local caching for portability",
    "  ✅ Progress tracking and resume",
    "  ✅ Trained monitoring model",
    "  ✅ Real-time anomaly detection",
    "  ✅ Actionable recommendations",
    "",
    "🔧 NEXT STEPS:",
    "  • Integrate with your actual data sources",
    "  • Customize thresholds and alerts",
    "  • Set up continuous monitoring",
    "  • Implement feedback loops for learning",
    sep="\n",
)

## 🛠️ Troubleshooting & Recovery

### Common Commands:
- `status()` - Complete system status
- `show_progress()` - Dataset generation progress
- `retry_failed()` - Retry failed generation items
- `reset_progress()` - Start dataset generation fresh

### Common Issues:
- **Generation interrupted:** Just run the generation cell again
- **Failed generations:** Use `retry_failed()`
- **Want to start over:** Use `reset_progress()`
- **Memory issues:** Reduce `CONFIG['batch_size']`
- **No models available:** Check Ollama is running

### Configuration:
Modify `CONFIG` in `config.py` or use:
```python
CONFIG['language_samples'] = 2000  # Increase dataset size
CONFIG['batch_size'] = 8           # Reduce for less memory
CONFIG['epochs'] = 5               # More training epochs
```

In [1]:
# Standalone training cell: builds a DistilledModelTrainer from CONFIG and runs it.
import os
# These flags are set before the imports below — presumably so the libraries
# see them when first loaded (confirm against the Triton / PyTorch docs).
os.environ['TRITON_DISABLE_LINE_INFO'] = '1'
os.environ['TORCH_COMPILE_DISABLE'] = '1'
# NOTE(review): AutoTokenizer is imported here but not used in this cell.
from transformers import AutoTokenizer
from config import CONFIG
from distilled_model_trainer import DistilledModelTrainer

# resume_training=True is passed to the trainer; see the trailing comment on
# train() for the intended effect.
trainer = DistilledModelTrainer(CONFIG, resume_training=True)
trainer.train()  # Updates latest model or creates new if none found

INFO:config:📋 Batch discovered 19 Ollama models
INFO:config:📁 Efficiently discovered 4 local models
INFO:config:📋 Discovered 22 total models
INFO:config:🎯 Built rotation pool: 18 models
INFO:config:   ollama: 15 models
INFO:config:   local: 2 models
INFO:config:   static: 1 models
INFO:config:✅ Enhanced model chain initialized
INFO:config:   Total models: 22
INFO:config:   Rotation pool: 18
INFO:distilled_model_trainer:🎮 CUDA GPU: NVIDIA GeForce RTX 4090
INFO:distilled_model_trainer:🚀 CUDA optimizations enabled
INFO:distilled_model_trainer:🆕 No valid checkpoint found, starting fresh
INFO:distilled_model_trainer:🏋️ Starting model training...
INFO:distilled_model_trainer:📁 Loading from local: pretrained\bert-base-uncased
INFO:distilled_model_trainer:✅ Loaded local model: bert-base-uncased
INFO:distilled_model_trainer:🚀 Model compilation enabled
INFO:distilled_model_trainer:📊 Discovering and loading datasets...
INFO:distilled_model_trainer:📁 Discovered 4 dataset files
INFO:distilled_model

True

In [None]:
# Test if the moved model will work for training
import torch
from pathlib import Path
from transformers import AutoTokenizer, AutoModel, AutoConfig

def test_training_compatibility(model_path: str = "./pretrained/bert-base-uncased/") -> bool:
    """Check that a locally stored model loads offline and supports a backward pass.

    Loads the tokenizer, config and model from ``model_path`` with
    ``local_files_only=True``, switches the model to training mode, runs one
    forward/backward pass on a dummy input, and prints a short report.

    Args:
        model_path: Directory containing the model files. Defaults to the
            location this notebook uses for the moved model.

    Returns:
        True when every step succeeds, False otherwise. Failures are printed
        rather than raised.
    """
    print("🧪 TESTING TRAINING COMPATIBILITY")
    print("=" * 40)

    # Report a missing directory explicitly instead of leaving it to whatever
    # error the loaders produce for an unknown path.
    if not Path(model_path).is_dir():
        print(f"❌ Training compatibility test failed: model directory not found: {model_path}")
        return False

    try:
        # Load from local files only (intended to mirror how the training code
        # loads the model — confirm against distilled_model_trainer.py).
        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            local_files_only=True
        )

        print("Loading config...")
        config = AutoConfig.from_pretrained(
            model_path,
            local_files_only=True
        )

        print("Loading model...")
        model = AutoModel.from_pretrained(
            model_path,
            config=config,
            local_files_only=True,
            torch_dtype=torch.float32
        )

        print("✅ All components loaded successfully!")

        # Test training-specific functionality
        print("\nTesting training features...")

        # Check if model can be put in training mode
        model.train()
        print("✅ Model can enter training mode")

        # Test gradient computation. The model is left in training mode here:
        # switching to eval() first would exercise the inference configuration
        # rather than the one training uses.
        test_input = tokenizer("test", return_tensors="pt", max_length=128, truncation=True)

        # Enable gradients
        for param in model.parameters():
            param.requires_grad = True

        output = model(**test_input)
        loss = output.last_hidden_state.mean()  # Dummy loss
        loss.backward()

        # backward() returning is not proof that anything was computed; make
        # sure at least one parameter actually received a gradient.
        if not any(param.grad is not None for param in model.parameters()):
            raise RuntimeError("backward pass produced no gradients")

        print("✅ Gradient computation works")

        # Check model size
        param_count = sum(p.numel() for p in model.parameters())
        print(f"✅ Model parameters: {param_count:,} ({param_count/1e6:.1f}M)")

        # Check config details
        print(f"✅ Hidden size: {config.hidden_size}")
        print(f"✅ Vocab size: {config.vocab_size}")

        print("\n🎉 MODEL IS READY FOR TRAINING!")
        return True

    except Exception as e:
        # Diagnostic boundary: any failure is reported and turned into False.
        print(f"❌ Training compatibility test failed: {e}")
        return False

# Run the compatibility check and summarise the result.
success = test_training_compatibility()

if not success:
    print("\n❌ There may still be issues with the moved model.")
else:
    print("\n✅ Your moved model in ./pretrained/bert-base-uncased/ is ready!")
    print("The distilled_model_trainer.py should now work without internet.")

In [None]:
# Recovery and troubleshooting commands
# This cell only prints a reference card; it does not run any of the commands.
print(
    "🔧 RECOVERY COMMANDS",
    "=" * 30,
    "\n📊 Progress Management:",
    "show_progress()    # Check current progress",
    "retry_failed()     # Retry failed items",
    "reset_progress()   # Start completely fresh",
    "\n🔍 Diagnostics:",
    "status()           # Complete system status",
    "\n⚙️  Current Configuration:",
    sep="\n",
)

# Current settings, looked up one at a time.
for _label, _key in (
    ("Language samples", "language_samples"),
    ("Metrics samples", "metrics_samples"),
    ("Batch size", "batch_size"),
    ("Model", "model_name"),
    ("Cache dir", "hf_cache_dir"),
):
    print(f"{_label}: {CONFIG[_key]}")

print(
    "\n💡 To modify configuration:",
    "CONFIG['language_samples'] = 2000",
    "CONFIG['batch_size'] = 8",
    sep="\n",
)

In [None]:
# Optional: Run individual recovery commands
# Uncomment as needed:

# show_progress()      # Check progress
# retry_failed()       # Retry failed items
# reset_progress()     # Start fresh (WARNING: deletes progress)

# Always printed when this cell runs; the commands above only run once they
# have been uncommented.
print("Uncomment commands above as needed for recovery")