# DISTILLED MONITORING SYSTEM

## Predictive monitoring with local caching and fallback support

### 🚀 QUICK WORKFLOW:
1. `setup()` - Initialize system and fallbacks
2. `generate_datasets()` - Generate training data (resumable)
3. `train()` - Train the distilled model
4. `test()` - Test model inference
5. `demo()` - Run monitoring demo

### 📊 MONITORING:
- `status()` - Check system status
- `show_progress()` - Check dataset generation progress

### 🔧 RECOVERY:
- `retry_failed()` - Retry failed generations
- `reset_progress()` - Start fresh

**Data Sources:** Splunk, Jira, Confluence, IBM Spectrum Conductor, VEMKD logs from Red Hat Linux

**Fallback Order:** Remote API → Ollama → Local Model → Static Responses

In [None]:
# Import the system
from main_notebook import *
from config import CONFIG

# Point the config at the locally cached model rather than attempting a download.
CONFIG['model_name'] = "bert-base-uncased"

# Startup banner, written with a single print call (one argument per line).
print(
    "üöÄ Distilled Monitoring System",
    "üìä Ready for predictive monitoring with local caching",
    f"üìÅ Cache directory: {CONFIG['hf_cache_dir']}",
    sep="\n",
)

In [None]:
# Step 1: run setup() and report whether it succeeded.
print(
    "üöÄ Setting up Distilled Monitoring System...",
    "This includes: directories, fallback systems, and progress tracking",
    sep="\n",
)

setup_success = setup()

# Failure branch first: point the user at the likely remedies.
if not setup_success:
    print("\n‚ùå Setup failed. Check error messages above.")
    print("You may need to install dependencies or setup Ollama.")
else:
    print("\n‚úÖ System setup complete!")
    print("\nNext: generate_datasets() to create training data")

In [None]:
# 2. Check system status
# status is not defined in this notebook; presumably it is provided by the
# `from main_notebook import *` in the first cell, so run that cell first.
# The return value is not used here.
status()

In [None]:
# Step 3: show dataset generation progress, then a few usage hints.
print("üìä Current dataset generation progress:")
show_progress()

print("\nüí° TIPS:")
# One bullet per hint; the bullet prefix is added when printing.
for _tip in (
    "First run: Will show new session",
    "Resuming: Will show existing progress",
    "Use reset_progress() to start fresh",
    "Use retry_failed() to retry failed items",
):
    print(f"‚Ä¢ {_tip}")

In [None]:
# Use with caution: reset_progress() starts dataset generation over from scratch.
# reset_progress()

In [None]:
# 4. Generate training datasets with dynamic calculation
#
# Prints the per-type language-sample targets and current progress, then calls
# generate_datasets() to produce whatever is still missing.
print("üìä DATASET GENERATION")
print("=" * 50)

# The class instantiated below is `DatasetGenerator`, so import that name
# explicitly; fall back to the older `OptimizedDatasetGenerator` name when it
# is the only one the module exports.
# NOTE(review): confirm which of the two names dataset_generator.py defines.
try:
    from dataset_generator import DatasetGenerator
except ImportError:
    from dataset_generator import OptimizedDatasetGenerator as DatasetGenerator

# Calculate dynamic targets based on YAML content
temp_generator = DatasetGenerator()
targets, total_language_target = temp_generator._calculate_dynamic_targets()
existing = temp_generator._analyze_existing_dataset()

print("üßÆ Realistic Target Calculation:")
print(f"  Technical explanations: {targets.get('technical_explanation', 0)}")
print(f"  Error interpretations: {targets.get('error_interpretation', 0)}")
print(f"  Conversational samples: {targets.get('conversational_samples', 0)}")
print(f"  Total language target: {total_language_target}")
print(f"  Models per question: {CONFIG.get('models_per_question', 2)}")
print("")

# Per-type progress, reported once. The marker lives in `status_icon` (not
# `status`) so the status() helper called by other cells is not overwritten
# in the notebook namespace.
print("üìä Current Progress:")
total_existing = 0
total_needed = 0

for sample_type, target_count in targets.items():
    existing_count = existing.get(sample_type, 0)
    needed = max(0, target_count - existing_count)
    total_existing += existing_count
    total_needed += needed
    status_icon = "‚úÖ" if needed == 0 else "üîÑ"
    print(f"  {status_icon} {sample_type}: {existing_count}/{target_count} (need {needed})")

# Guard the percentage: a zero target total would otherwise raise
# ZeroDivisionError before generation even starts.
if total_language_target:
    progress_pct = total_existing / total_language_target * 100
else:
    progress_pct = 0.0

print("")
print(f"üìà Overall Progress: {total_existing}/{total_language_target} ({progress_pct:.1f}%)")
print(f"üéØ Remaining: {total_needed} language samples to generate")
print(f"  Metrics target: {CONFIG['metrics_samples']}")
print("")

print("‚ö†Ô∏è  IMPORTANT:")
print("‚Ä¢ Generation time based on actual YAML content")
print("‚Ä¢ Safe to interrupt with Ctrl+C (progress saved)")
print("‚Ä¢ Progress saved every 50 items")
print("‚Ä¢ Automatically resumes from last checkpoint")
print("‚ö†Ô∏è Requires Network access to Huggingface or manual download into the hf_cache directory.")
print("")

if total_needed > 0:
    print(f"üöÄ Starting generation of {total_needed} remaining samples...")
else:
    print("‚úÖ All language samples already complete! Checking metrics...")

# Generate datasets using dynamic calculation
try:
    language_data, metrics_data = generate_datasets(
        language_count=None,  # Use dynamic calculation
        metrics_count=CONFIG['metrics_samples']
    )

    if language_data is not None or metrics_data is not None:
        print("\n‚úÖ Dataset generation completed!")
        if language_data:
            print(f"Language samples: {len(language_data)} new samples generated")
        # Either result may be None on its own, so check before calling .get().
        if metrics_data is not None:
            print(f"Metrics samples: {len(metrics_data.get('training_samples', []))} total")
        print("\nNext: train() to train the model")
    else:
        print("\n‚ö†Ô∏è  Generation interrupted or no new samples needed")
        print("Run this cell again to continue if needed")

except KeyboardInterrupt:
    print("\n‚è∏Ô∏è  Generation interrupted by user")
    print("Progress saved. Run this cell again to continue.")
except Exception as e:
    # Notebook-level boundary: report the error and leave the kernel usable.
    print(f"\n‚ùå Generation error: {e}")
    print("Check error and use retry_failed() if needed.")

In [None]:
# Step 5: train the distilled model and report the outcome.
print("üèãÔ∏è TRAINING DISTILLED MODEL")
print("=" * 40)
print("Environment:", detect_training_environment())
print("Model:", CONFIG['model_name'])
print("Batch size:", CONFIG['batch_size'])
print("Epochs:", CONFIG['epochs'])
print()
print("‚ö†Ô∏è  Training can take way long depending on hardware")
print()

try:
    success = train()
    # Choose the message lines by outcome, then print them in order.
    _outcome_lines = (
        ("\n‚úÖ Training completed!", "Next: test() to test the model")
        if success
        else ("\n‚ùå Training failed - check if datasets exist",)
    )
    for _line in _outcome_lines:
        print(_line)
except Exception as exc:
    print(f"\n‚ùå Training error: {exc}")
    print("Check logs for detailed error information")

In [None]:
# Step 6: run the model inference test and report the outcome.
print("üß™ TESTING MODEL INFERENCE")
print("=" * 40)
print("Testing with scenarios:")
for _scenario in ("Normal operation", "CPU spike", "Memory pressure"):
    print(f"‚Ä¢ {_scenario}")
print()

test_success = test()

# Failure branch first; success points at the next step.
if not test_success:
    print("\n‚ùå Testing failed - ensure model is trained")
else:
    print("\n‚úÖ Model testing successful!")
    print("Next: demo() to run monitoring demo")

In [None]:
# Step 7: run the monitoring demo for a configurable number of minutes.
print("üé≠ MONITORING DEMO")
print("=" * 30)
print("Features:")
for _feature in (
    "Real-time metric processing",
    "Anomaly detection",
    "Alert generation",
    "Recommendation engine",
    "Dashboard display",
):
    print(f"‚Ä¢ {_feature}")
print()

# Customize demo duration
DEMO_MINUTES = 3

print(
    f"Running {DEMO_MINUTES}-minute demo...",
    "Will inject anomalies to demonstrate detection",
    "",
    sep="\n",
)

try:
    demo(minutes=DEMO_MINUTES)
    print("\n‚úÖ Demo completed!")
    print("Check exported metrics history for results")
except KeyboardInterrupt:
    # A manual stop is an expected way to end the demo.
    print("\n‚èπÔ∏è  Demo stopped by user")
except Exception as exc:
    print(f"\n‚ùå Demo error: {exc}")

In [None]:
# Step 8: print the final status report followed by a closing summary.
print("üìã FINAL SYSTEM STATUS")
print("=" * 40)

status()

print("\nüéâ SYSTEM COMPLETE!")
print("=" * 30)
print("Your distilled monitoring system includes:")
for _component in (
    "Multi-model fallback system",
    "Local caching for portability",
    "Progress tracking and resume",
    "Trained monitoring model",
    "Real-time anomaly detection",
    "Actionable recommendations",
):
    print(f"  ‚úÖ {_component}")
print()
print("üîß NEXT STEPS:")
for _step in (
    "Integrate with your actual data sources",
    "Customize thresholds and alerts",
    "Set up continuous monitoring",
    "Implement feedback loops for learning",
):
    print(f"  ‚Ä¢ {_step}")

## 🛠️ Troubleshooting & Recovery

### Common Commands:
- `status()` - Complete system status
- `show_progress()` - Dataset generation progress
- `retry_failed()` - Retry failed generation items
- `reset_progress()` - Start dataset generation fresh

### Common Issues:
- **Generation interrupted:** Just run the generation cell again
- **Failed generations:** Use `retry_failed()`
- **Want to start over:** Use `reset_progress()`
- **Memory issues:** Reduce `CONFIG['batch_size']`
- **No models available:** Check Ollama is running

### Configuration:
Modify `CONFIG` in `config.py` or use:
```python
CONFIG['language_samples'] = 2000  # Increase dataset size
CONFIG['batch_size'] = 8           # Reduce for less memory
CONFIG['epochs'] = 5               # More training epochs
```

In [1]:
# Stand-alone cell: resume (or start) training directly through DistilledModelTrainer.
# The two environment variables are set before the remaining imports.
# NOTE(review): presumably so the libraries loaded below see them at import
# time - confirm before reordering these lines.
import os
os.environ['TRITON_DISABLE_LINE_INFO'] = '1'
os.environ['TORCH_COMPILE_DISABLE'] = '1'
# NOTE(review): AutoTokenizer is imported but not used in this cell.
from transformers import AutoTokenizer
from config import CONFIG
from distilled_model_trainer import DistilledModelTrainer

# Build the trainer from the shared CONFIG with resume_training=True, then train.
trainer = DistilledModelTrainer(CONFIG, resume_training=True)
trainer.train()  # Updates latest model or creates new if none found

INFO:config:📋 Batch discovered 19 Ollama models
INFO:config:📁 Efficiently discovered 4 local models
INFO:config:📋 Discovered 22 total models
INFO:config:🎯 Built rotation pool: 18 models
INFO:config:   ollama: 15 models
INFO:config:   local: 2 models
INFO:config:   static: 1 models
INFO:config:✅ Enhanced model chain initialized
INFO:config:   Total models: 22
INFO:config:   Rotation pool: 18
INFO:distilled_model_trainer:🎮 CUDA GPU: NVIDIA GeForce RTX 4090
INFO:distilled_model_trainer:🚀 CUDA optimizations enabled
INFO:distilled_model_trainer:🆕 No valid checkpoint found, starting fresh
INFO:distilled_model_trainer:🏋️ Starting model training...
INFO:distilled_model_trainer:📁 Loading from local: pretrained\bert-base-uncased
INFO:distilled_model_trainer:✅ Loaded local model: bert-base-uncased
INFO:distilled_model_trainer:🚀 Model compilation enabled
INFO:distilled_model_trainer:📊 Discovering and loading datasets...
INFO:distilled_model_trainer:📁 Disco

True

In [None]:
# Test if the moved model will work for training
import torch
from pathlib import Path
from transformers import AutoTokenizer, AutoModel, AutoConfig

def test_training_compatibility(model_path: str = "./pretrained/bert-base-uncased/") -> bool:
    """Check that a locally stored model loads and supports a backward pass.

    Loads the tokenizer, config and model from ``model_path`` with
    ``local_files_only=True``, switches the model to training mode, runs one
    forward/backward pass on a dummy input and prints a short report.

    Args:
        model_path: Directory holding the model files. Defaults to the
            location this notebook has always checked.

    Returns:
        True when every step succeeds. False when any step raises; the error
        is printed rather than re-raised so the notebook keeps running.
    """
    print("üß™ TESTING TRAINING COMPATIBILITY")
    print("=" * 40)

    try:
        # Fail fast with a clear message when the directory is missing,
        # instead of relying on the loader's error text.
        if not Path(model_path).is_dir():
            raise FileNotFoundError(f"model directory not found: {model_path}")

        # Load exactly as the training code will
        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            local_files_only=True
        )

        print("Loading config...")
        config = AutoConfig.from_pretrained(
            model_path,
            local_files_only=True
        )

        print("Loading model...")
        model = AutoModel.from_pretrained(
            model_path,
            config=config,
            local_files_only=True,
            torch_dtype=torch.float32
        )

        print("‚úÖ All components loaded successfully!")

        # Test training-specific functionality
        print("\nTesting training features...")

        # Check if model can be put in training mode
        model.train()
        print("‚úÖ Model can enter training mode")

        # Gradient check. The model stays in training mode here so the
        # backward pass runs under the same mode real training uses.
        test_input = tokenizer("test", return_tensors="pt", max_length=128, truncation=True)

        # Enable gradients
        for param in model.parameters():
            param.requires_grad = True

        output = model(**test_input)
        loss = output.last_hidden_state.mean()  # Dummy loss
        loss.backward()

        # Only report success if the backward pass actually produced gradients.
        if not any(p.grad is not None for p in model.parameters()):
            raise RuntimeError("backward pass produced no gradients")

        print("‚úÖ Gradient computation works")

        # Check model size
        param_count = sum(p.numel() for p in model.parameters())
        print(f"‚úÖ Model parameters: {param_count:,} ({param_count/1e6:.1f}M)")

        # Check config details
        print(f"‚úÖ Hidden size: {config.hidden_size}")
        print(f"‚úÖ Vocab size: {config.vocab_size}")

        print("\nüéâ MODEL IS READY FOR TRAINING!")
        return True

    except Exception as e:
        # Diagnostic boundary: any failure means "not ready", so report and return False.
        print(f"‚ùå Training compatibility test failed: {e}")
        return False

# Run the compatibility check and summarise the result.
success = test_training_compatibility()

# Failure branch first; the success message names the directory that was checked.
if not success:
    print("\n‚ùå There may still be issues with the moved model.")
else:
    print("\n‚úÖ Your moved model in ./pretrained/bert-base-uncased/ is ready!")
    print("The distilled_model_trainer.py should now work without internet.")

In [None]:
# Print a cheat sheet of recovery commands plus the current configuration values.
print("üîß RECOVERY COMMANDS")
print("=" * 30)

print("\nüìä Progress Management:")
for _command_help in (
    "show_progress()    # Check current progress",
    "retry_failed()     # Retry failed items",
    "reset_progress()   # Start completely fresh",
):
    print(_command_help)

print("\nüîç Diagnostics:")
print("status()           # Complete system status")

print("\n‚öôÔ∏è  Current Configuration:")
# (label, CONFIG key) pairs, looked up one at a time in display order.
for _label, _key in (
    ("Language samples", 'language_samples'),
    ("Metrics samples", 'metrics_samples'),
    ("Batch size", 'batch_size'),
    ("Model", 'model_name'),
    ("Cache dir", 'hf_cache_dir'),
):
    print(f"{_label}: {CONFIG[_key]}")

print("\nüí° To modify configuration:")
for _example in ("CONFIG['language_samples'] = 2000", "CONFIG['batch_size'] = 8"):
    print(_example)

In [None]:
# Optional: Run individual recovery commands
# Uncomment as needed:
# As written, this cell only prints the reminder below; the three commands
# stay disabled until one is uncommented.

# show_progress()      # Check progress
# retry_failed()       # Retry failed items
# reset_progress()     # Start fresh (WARNING: deletes progress)

print("Uncomment commands above as needed for recovery")