# AWS API Label Propagation

In [None]:
# Import required modules
import json
import config
from data_manager import DataManager
from label_propagation import LabelPropagator
from evaluator import Evaluator
from animation import create_animation
from IPython.display import display
from visualization import (
    print_evaluation_summary,
    create_propagation_dashboard,
    plot_cross_service_comparison
)

In [None]:
# Notebook-level settings; adjust these before running the cells below.

# Passed as `verbose=` to every propagation call; set to False for quiet runs.
VERBOSE = True

# Embedding format handed to each propagation mode. The formats named in this
# notebook are "with_params", "method_only" and "with_service_params"; the
# author's notes suggest "method_only" is the typical choice for the
# cross-service and all-to-all modes.
WITHIN_SERVICE_EMBEDDING_FORMAT = "with_params"
CROSS_SERVICE_EMBEDDING_FORMAT = "with_params"
ALL_TO_ALL_EMBEDDING_FORMAT = "with_params"

# Passed as `save_history=` to the within-service run so that the propagation
# history is kept for the animation step.
SAVE_HISTORY = True

## Step 1: Initialize and Load Data

In [None]:
# Create the data manager and load embeddings for the labeled services using
# the within-service embedding format.
data_manager = DataManager()
data_manager.load_method_embeddings(config.LABELED_SERVICES, WITHIN_SERVICE_EMBEDDING_FORMAT)

# Saved indexes are reused only when the index directory exists and contains
# at least one entry; otherwise one index is built per loaded service and the
# result is saved.
index_dir = config.ANNOY_INDEXES_DIR
if not (index_dir.exists() and any(index_dir.iterdir())):
    print(f"\nüìÅ Creating new Annoy indexes directory: {index_dir}")
    index_dir.mkdir(exist_ok=True, parents=True)

    print("\nüîß Building Annoy indexes for each service...")
    for service in data_manager.service_methods:
        print(f"   Building index for {service}...")
        data_manager.build_service_index(service)

    print("\nüíæ Saving Annoy indexes...")
    data_manager.save_indexes()
else:
    print(f"\nüìÇ Loading existing Annoy indexes from {index_dir}")
    data_manager.load_indexes()

## Step 2: Data Analysis

In [None]:
# Initialize the evaluator on top of the data manager created in Step 1.
# The same `evaluator` object is reused by later cells (k search, evaluation,
# cross-service comparison and summary generation).
evaluator = Evaluator(data_manager)

# Analyze data distribution and coverage.
# Both calls are made for their side effects only; their return values are
# discarded here (presumably they print or plot a report - confirm in
# evaluator.py).
evaluator.analyze_label_distribution()
evaluator.analyze_method_coverage()

## Step 3: Model Evaluation and Parameter Tuning

In [None]:
# Find the k value to use for neighbor lookups; `best_k` is passed as `k=`
# to every propagation call later in this notebook and recorded in the saved
# metadata as 'k_neighbors'.
best_k = evaluator.find_optimal_k()

# Evaluate propagation performance at the chosen k. The result is kept for
# the summary cells (generate_summary and print_evaluation_summary).
evaluation_results = evaluator.evaluate_propagation(k=best_k)

## Step 4: Within-Service Label Propagation

In [None]:
# Initialize the propagator on the shared data manager. This single instance
# runs the within-service, group cross-service and all-to-all propagation
# steps below.
propagator = LabelPropagator(data_manager)

In [None]:
# Run adaptive within-service propagation.
# Thresholds, iteration cap and minimum confidence come from `config`
# defaults; k comes from the tuning step. The result is used below as a
# mapping whose values are sized collections of predictions (it is summed with
# len() over .values() when the metadata is built) - presumably keyed by
# service; confirm against LabelPropagator.propagate_all_services.
final_predictions = propagator.propagate_all_services(
    k=best_k,
    threshold=config.DEFAULT_SERVICE_THRESHOLD,
    max_iterations=config.DEFAULT_MAX_ITERATIONS,
    min_confidence=config.DEFAULT_MIN_CONFIDENCE,
    min_threshold=config.DEFAULT_MIN_THRESHOLD,
    save_history=SAVE_HISTORY,
    verbose=VERBOSE,
    embedding_format=WITHIN_SERVICE_EMBEDDING_FORMAT
)

In [None]:
# Record how the within-service predictions were produced (settings and
# counts), then hand predictions, target file and metadata to the data
# manager for saving.
within_service_metadata = dict(
    type='within_service_propagation',
    embedding_format=WITHIN_SERVICE_EMBEDDING_FORMAT,
    k_neighbors=best_k,
    initial_threshold=config.DEFAULT_SERVICE_THRESHOLD,
    minimum_threshold=config.DEFAULT_MIN_THRESHOLD,
    adaptive_threshold=True,
    labeled_services=config.LABELED_SERVICES,
    total_labeled_methods=len(data_manager.method_labels),
    # Sum of the per-entry prediction counts across the whole result.
    total_predictions=sum(map(len, final_predictions.values())),
)

data_manager.save_predictions(
    final_predictions,
    config.WITHIN_SERVICE_PREDICTIONS_FILE,
    within_service_metadata,
)

## Step 5: Cross-Service Propagation

In [None]:
# Every sub-directory of <EMBEDDINGS_DIR>/methods is treated as a service.
# Services whose lowercased name is already in data_manager.service_methods
# are skipped; the rest are loaded with the cross-service embedding format.
methods_dir = config.EMBEDDINGS_DIR / 'methods'
all_services = [entry.name for entry in methods_dir.iterdir() if entry.is_dir()]
services_to_load = [
    name for name in all_services
    if name.lower() not in data_manager.service_methods
]

if services_to_load:
    print(f"üîÑ Loading {len(services_to_load)} additional services...")
    data_manager.load_method_embeddings(services_to_load, CROSS_SERVICE_EMBEDDING_FORMAT)

    # Only sub-directories of the index directory count as existing indexes.
    existing_indexes = [
        entry.name for entry in config.ANNOY_INDEXES_DIR.iterdir() if entry.is_dir()
    ]
    services_to_build = [
        name for name in services_to_load
        if name.lower() not in existing_indexes
    ]

    # NOTE(review): when at least one service needs a new index, this cell
    # builds and saves but never calls load_indexes(), so newly loaded
    # services that already had an index on disk rely on whatever
    # build/save does for them - confirm against DataManager.
    if not services_to_build:
        data_manager.load_indexes()
    else:
        print(f"üîß Building indexes for {len(services_to_build)} new services...")
        for service in services_to_build:
            data_manager.build_service_index(service)
        data_manager.save_indexes()

print(f"üìä Total services available: {len(data_manager.service_methods)}")

In [None]:
# Read back the JSON file that the within-service step saved, so the
# cross-service steps work from the persisted predictions.
with open(config.WITHIN_SERVICE_PREDICTIONS_FILE, 'r') as predictions_file:
    within_service_data = json.load(predictions_file)

# Fall back to an empty mapping when the file has no 'predictions' entry.
loaded_within_service_predictions = within_service_data.get('predictions', {})

loaded_count = sum(map(len, loaded_within_service_predictions.values()))
print(f"‚úÖ Loaded {loaded_count} within-service predictions")

### Group-Based Cross-Service Propagation (Only initial prelabeled data)

In [None]:
# Group-based cross-service propagation, variant 1: seeded only with the
# initial prelabeled methods (within_service_predictions is None).
group_cross_service_predictions_v1 = propagator.propagate_all_groups_cross_service(
    k=best_k,
    threshold=config.DEFAULT_SERVICE_THRESHOLD,
    min_threshold=config.DEFAULT_MIN_THRESHOLD,
    min_confidence=config.DEFAULT_MIN_CONFIDENCE,
    within_service_predictions=None,
    verbose=VERBOSE,
    embedding_format=CROSS_SERVICE_EMBEDDING_FORMAT,
)

# Nothing is written when the propagation produced no predictions.
if group_cross_service_predictions_v1:
    group_metadata = dict(
        type='group_cross_service_propagation_initial_only',
        embedding_format=CROSS_SERVICE_EMBEDDING_FORMAT,
        k_neighbors=best_k,
        initial_threshold=config.DEFAULT_SERVICE_THRESHOLD,
        minimum_threshold=config.DEFAULT_MIN_THRESHOLD,
        min_confidence=config.DEFAULT_MIN_CONFIDENCE,
        groups_processed=list(group_cross_service_predictions_v1),
        # The result is nested group -> service -> predictions, so the total
        # is the sum of per-service counts inside every group.
        total_predictions=sum(
            sum(map(len, per_service.values()))
            for per_service in group_cross_service_predictions_v1.values()
        ),
        group_configuration=config.CROSS_SERVICE_GROUPS,
        # Marks that this run used only the initial labels.
        uses_within_service_predictions=False,
    )

    data_manager.save_predictions(
        group_cross_service_predictions_v1,
        config.GROUP_CROSS_SERVICE_PREDICTIONS_FILE_V1,
        group_metadata,
    )

### Group-Based Cross-Service Propagation (With within-service predictions)

In [None]:
# Group-based cross-service propagation, variant 2: the predictions reloaded
# from the within-service file are passed in alongside the initial labels.
group_cross_service_predictions_v2 = propagator.propagate_all_groups_cross_service(
    k=best_k,
    threshold=config.DEFAULT_SERVICE_THRESHOLD,
    min_threshold=config.DEFAULT_MIN_THRESHOLD,
    min_confidence=config.DEFAULT_MIN_CONFIDENCE,
    within_service_predictions=loaded_within_service_predictions,
    verbose=VERBOSE,
    embedding_format=CROSS_SERVICE_EMBEDDING_FORMAT,
)

# Nothing is written when the propagation produced no predictions.
if group_cross_service_predictions_v2:
    group_metadata = dict(
        type='group_cross_service_propagation',
        embedding_format=CROSS_SERVICE_EMBEDDING_FORMAT,
        k_neighbors=best_k,
        initial_threshold=config.DEFAULT_SERVICE_THRESHOLD,
        minimum_threshold=config.DEFAULT_MIN_THRESHOLD,
        min_confidence=config.DEFAULT_MIN_CONFIDENCE,
        groups_processed=list(group_cross_service_predictions_v2),
        # The result is nested group -> service -> predictions, so the total
        # is the sum of per-service counts inside every group.
        total_predictions=sum(
            sum(map(len, per_service.values()))
            for per_service in group_cross_service_predictions_v2.values()
        ),
        group_configuration=config.CROSS_SERVICE_GROUPS,
    )

    data_manager.save_predictions(
        group_cross_service_predictions_v2,
        config.GROUP_CROSS_SERVICE_PREDICTIONS_FILE_V2,
        group_metadata,
    )

### All-to-All Cross-Service Propagation

In [None]:
# All-to-all cross-service propagation. It uses the same config defaults for
# threshold, minimum threshold and minimum confidence as the group-based
# runs, and is given the reloaded within-service predictions.
all_to_all_predictions = propagator.propagate_all_to_all_cross_service(
    k=best_k,
    threshold=config.DEFAULT_SERVICE_THRESHOLD,
    min_threshold=config.DEFAULT_MIN_THRESHOLD,
    min_confidence=config.DEFAULT_MIN_CONFIDENCE,
    within_service_predictions=loaded_within_service_predictions,
    verbose=VERBOSE,
    embedding_format=ALL_TO_ALL_EMBEDDING_FORMAT,
)

# Nothing is written when the propagation produced no predictions.
if all_to_all_predictions:
    all_to_all_metadata = dict(
        type='all_to_all_cross_service_propagation',
        embedding_format=ALL_TO_ALL_EMBEDDING_FORMAT,
        k_neighbors=best_k,
        initial_threshold=config.DEFAULT_SERVICE_THRESHOLD,
        minimum_threshold=config.DEFAULT_MIN_THRESHOLD,
        min_confidence=config.DEFAULT_MIN_CONFIDENCE,
        # This result is counted one level deep, unlike the nested group
        # results.
        total_predictions=sum(map(len, all_to_all_predictions.values())),
    )

    data_manager.save_predictions(
        all_to_all_predictions,
        config.ALL_TO_ALL_CROSS_SERVICE_PREDICTIONS_FILE,
        all_to_all_metadata,
    )

### Compare Cross-Service Methods

In [None]:
# Compare the group-based (variant 2) predictions with the all-to-all
# predictions.
comparison_results = evaluator.compare_cross_service_predictions(
    group_cross_service_predictions_v2,
    all_to_all_predictions,
)

# Persist and plot only when the comparison returned something truthy.
if comparison_results:
    # default=str stringifies any value json cannot serialize natively.
    with open(config.CROSS_SERVICE_COMPARISON_FILE, 'w') as comparison_file:
        json.dump(comparison_results, comparison_file, indent=2, default=str)

    print(f"üìä Comparison saved to: {config.CROSS_SERVICE_COMPARISON_FILE}")

    plot_cross_service_comparison(comparison_results)

## Step 6: Results Visualization

### Within-Service Visualization

In [None]:
# Number of methods per service; reused by every dashboard cell below.
total_methods = {
    name: len(methods)
    for name, methods in data_manager.service_methods.items()
}

# Dashboard for the within-service predictions reloaded from file.
create_propagation_dashboard(
    loaded_within_service_predictions,
    total_methods,
    data_manager.method_labels,
    title_prefix="Within-Service",
)

### Group-Based Cross-Service Visualization (Only initial prelabeled data)

In [None]:
# Collapse the group -> service -> predictions nesting into a single
# service -> predictions mapping before plotting. When a service appears in
# several groups its predictions are merged, with later groups overwriting
# earlier entries for the same method.
if group_cross_service_predictions_v1:
    group_flat = {}
    for group_data in group_cross_service_predictions_v1.values():
        for service, service_predictions in group_data.items():
            group_flat.setdefault(service, {}).update(service_predictions)

    create_propagation_dashboard(
        group_flat,
        total_methods,
        data_manager.method_labels,
        title_prefix="Group Cross-Service (Only initial prelabeled data)",
    )

### Group-Based Cross-Service Visualization (With within-service predictions)

In [None]:
# Collapse the group -> service -> predictions nesting into a single
# service -> predictions mapping before plotting. When a service appears in
# several groups its predictions are merged, with later groups overwriting
# earlier entries for the same method.
if group_cross_service_predictions_v2:
    group_flat = {}
    for group_data in group_cross_service_predictions_v2.values():
        for service, service_predictions in group_data.items():
            group_flat.setdefault(service, {}).update(service_predictions)

    create_propagation_dashboard(
        group_flat,
        total_methods,
        data_manager.method_labels,
        title_prefix="Group Cross-Service (With within-service predictions)",
    )

### All-to-All Cross-Service Visualization


In [None]:
# Dashboard for the all-to-all predictions. Coverage is measured against the
# manual labels plus the within-service predictions, because both were
# available as input labels to the all-to-all run.
if all_to_all_predictions:
    # Work on a copy so data_manager.method_labels itself is left untouched.
    combined_labels = data_manager.method_labels.copy()

    for service, service_predictions in loaded_within_service_predictions.items():
        service_key = service.lower()
        for method_name, pred_data in service_predictions.items():
            # A prediction is either a dict carrying a 'label' entry or the
            # bare label value itself.
            has_label_field = isinstance(pred_data, dict) and 'label' in pred_data
            combined_labels[(service_key, method_name)] = (
                pred_data['label'] if has_label_field else pred_data
            )

    create_propagation_dashboard(
        all_to_all_predictions,
        total_methods,
        combined_labels,
        title_prefix="All-to-All Cross-Service",
    )

### Animation (Optional)


In [None]:
# Create animation for a specific service.
# The service is chosen by config.SERVICE_TO_ANIMATE; save_gif=True requests
# a GIF as well (output location is decided inside create_animation).
# NOTE(review): presumably this relies on the history kept by the
# within-service run (SAVE_HISTORY = True) - confirm in animation.py.
anim = create_animation(
    data_manager, 
    config.SERVICE_TO_ANIMATE, 
    save_gif=True
)

# Render the returned animation object inline in the notebook.
display(anim)

## Step 7: Save Results Summary

In [None]:
# Build the overall summary from the reloaded within-service predictions, the
# group-based (variant 2) predictions, the evaluation results and the chosen k.
summary = evaluator.generate_summary(
    loaded_within_service_predictions,
    group_cross_service_predictions_v2,
    evaluation_results,
    best_k,
)

# Attach the comparison's 'summary' section when a comparison was produced.
if comparison_results:
    summary['cross_service_comparison'] = comparison_results['summary']

# default=str stringifies any value json cannot serialize natively.
with open(config.PROPAGATION_SUMMARY_FILE, 'w') as summary_file:
    json.dump(summary, summary_file, indent=2, default=str)

print(f"\nüíæ Summary saved to: {config.PROPAGATION_SUMMARY_FILE}")

## Step 8: Final Summary

In [None]:
# Print the evaluation summary, the prediction counts for each propagation
# mode, and the list of output files written by the earlier cells.

# Counted once and reused by both the evaluation summary and the results list.
within_service_total = sum(len(pred) for pred in loaded_within_service_predictions.values())

print_evaluation_summary(
    evaluation_results,
    best_k,
    len(data_manager.method_labels),
    within_service_total,
)

print("\nüéä Label Propagation Analysis Complete!")
print("=" * 60)

print("\nüìä RESULTS SUMMARY:")
print(f"   Within-service: {within_service_total} predictions")

# The group figure reports variant 2 (the run seeded with within-service
# predictions), matching what the saved summary uses.
if group_cross_service_predictions_v2:
    total_group = sum(
        len(service_preds)
        for group_data in group_cross_service_predictions_v2.values()
        for service_preds in group_data.values()
    )
    print(f"   Group cross-service: {total_group} predictions")

if all_to_all_predictions:
    total_all = sum(len(pred) for pred in all_to_all_predictions.values())
    print(f"   All-to-all cross-service: {total_all} predictions")

if comparison_results:
    print(f"   Cross-method agreement: {comparison_results['summary']['overall_agreement_rate']:.1%}")

# Each file is listed under the same condition that guarded its write in the
# earlier cells: the within-service and summary files are always written,
# the others only when their result was non-empty. Listing them
# unconditionally would advertise files this run never produced.
generated_files = [
    (True, config.WITHIN_SERVICE_PREDICTIONS_FILE),
    (bool(group_cross_service_predictions_v1), config.GROUP_CROSS_SERVICE_PREDICTIONS_FILE_V1),
    (bool(group_cross_service_predictions_v2), config.GROUP_CROSS_SERVICE_PREDICTIONS_FILE_V2),
    (bool(all_to_all_predictions), config.ALL_TO_ALL_CROSS_SERVICE_PREDICTIONS_FILE),
    (bool(comparison_results), config.CROSS_SERVICE_COMPARISON_FILE),
    (True, config.PROPAGATION_SUMMARY_FILE),
]

print("\nüìÅ Generated Files:")
for was_written, output_path in generated_files:
    if was_written:
        print(f"   ‚Ä¢ {output_path}")

print("\n‚úÖ Process complete!")