# AWS Services Embeddings Creation

In [None]:
import json
import os
from typing import Dict, List
from pathlib import Path

In [None]:
def _read_json_file(path: Path):
    """
    Parse the JSON document stored at ``path``.

    The file is always decoded as UTF-8 instead of the platform's locale
    encoding. The ``utf-8-sig`` codec additionally skips a leading byte-order
    mark, which ``json.load`` would otherwise reject.
    """
    with open(path, 'r', encoding='utf-8-sig') as f:
        return json.load(f)


def load_aws_api_documentation(base_path: str, services: List[str]) -> Dict[str, Dict[str, Dict]]:
    """
    Load AWS API documentation from a directory of JSON files.

    For every requested service, ``<base_path>/<service>.json`` is read and the
    method names listed under its ``client.methods_names`` key are collected.
    Each method is then read from ``<base_path>/<method_name>.json``.

    Loading is best-effort: problems are reported with ``print`` and the
    affected service or method is left out of the result instead of raising.

    Args:
        base_path: Path to the directory containing service JSON files
        services: List of service names to load

    Returns:
        Dictionary with service -> method -> documentation mapping. Each
        method entry holds the keys ``description``, ``parameters``,
        ``return_structure`` and ``url``; keys absent from the method file
        default to ``''`` (description, url) or ``[]`` (the other two).
        Services whose file is missing or cannot be loaded are not included.
    """
    # Failures that are reported and skipped rather than aborting the whole
    # load: unreadable file, invalid JSON / bad encoding, unexpected JSON shape.
    recoverable = (OSError, ValueError, AttributeError, TypeError)

    docs_dir = Path(base_path)
    api_docs = {}

    for service in services:
        service_file = docs_dir / f"{service}.json"

        if not service_file.exists():
            print(f"Warning: {service}.json not found, skipping...")
            continue

        try:
            service_data = _read_json_file(service_file)

            # `or` fallbacks: a JSON null for either key means "no methods",
            # not a reason to discard the whole service.
            client_section = service_data.get('client') or {}
            method_names = client_section.get('methods_names') or []

            service_docs = {}
            missing_files = 0

            # NOTE(review): method files are looked up directly under base_path
            # by method name only, so two services sharing a method name would
            # resolve to the same file - confirm against the on-disk layout.
            for method_name in method_names:
                method_file = docs_dir / f"{method_name}.json"

                if not method_file.exists():
                    missing_files += 1
                    continue

                try:
                    method_data = _read_json_file(method_file)

                    # Keep only the fields used downstream, with stable defaults
                    service_docs[method_name] = {
                        'description': method_data.get('description', ''),
                        'parameters': method_data.get('parameters', []),
                        'return_structure': method_data.get('return_structure', []),
                        'url': method_data.get('url', '')
                    }
                except recoverable as e:
                    print(f"Error loading {method_name}.json: {e}")

            api_docs[service] = service_docs
            print(f"Loaded {service}: {len(service_docs)} methods")
            if missing_files:
                print(f"Warning: {missing_files} method file(s) listed in {service}.json not found")

        except recoverable as e:
            print(f"Error loading {service}.json: {e}")

    return api_docs



In [None]:
def get_priority_services_for_security_analysis():
    """
    Build the tiered catalogue of AWS services to cover in security analysis.

    The tiers are ordered by priority, based on common attack vectors and
    data flow patterns.

    Returns:
        Dictionary mapping each tier key (``tier1_critical`` through
        ``tier5_monitoring``, in that order) to its list of service names.
        A new dictionary with new lists is built on every call.
    """
    # Tier 1: identity, data stores and secrets
    critical = [
        'IAM',             # Identity and access management
        'S3',              # Object storage - common data leaks
        'DynamoDB',        # NoSQL database
        'RDS',             # Relational databases
        'Lambda',          # Serverless compute
        'SSM',             # Systems Manager - parameter/secret storage
        'SecretsManager',  # Secrets management
        'KMS',             # Key management
    ]

    # Tier 2: compute platforms
    compute = [
        'EC2',        # Compute instances
        'ECS',        # Container service
        'EKS',        # Kubernetes service
        'Batch',      # Batch computing
        'SageMaker',  # ML platform
    ]

    # Tier 3: network edge and routing
    networking = [
        'APIGateway',  # API management
        'CloudFront',  # CDN
        'Route53',     # DNS
        'VPC',         # Virtual private cloud
        'ELB',         # Load balancing
    ]

    # Tier 4: queues, topics and streams
    messaging = [
        'SQS',          # Queue service
        'SNS',          # Notification service
        'EventBridge',  # Event bus
        'Kinesis',      # Streaming
        'MSK',          # Managed Kafka
    ]

    # Tier 5: observability and detection
    monitoring = [
        'CloudWatch',  # Monitoring
        'CloudTrail',  # Audit logs
        'GuardDuty',   # Threat detection
        'Inspector',   # Vulnerability assessment
        'Macie',       # Data discovery
    ]

    return {
        'tier1_critical': critical,
        'tier2_compute': compute,
        'tier3_networking': networking,
        'tier4_messaging': messaging,
        'tier5_monitoring': monitoring,
    }





In [None]:
def create_labeled_data_from_pdf_annotations():
    """
    Return the hand-labeled API method annotations as a list of flat records.

    The annotations are transcribed from the PDF and kept in a compact
    per-service table; each table row is expanded into a dictionary with the
    keys ``service``, ``method``, ``label``, ``resource_type`` and ``action``
    (in that order). ``label`` is one of ``sink``, ``source`` or ``neither``.
    Records are emitted service by service, in table order, and a new list is
    built on every call.
    """
    # Column names for the row tuples below (the service comes from the group)
    columns = ("method", "label", "resource_type", "action")

    annotations = (
        ("S3", (
            ("put_object", "sink", "object", "put"),
            ("get_object", "source", "object", "get"),
            ("create_bucket", "neither", "bucket", "create"),
            ("upload_file", "sink", "file", "upload"),
            ("download_file", "source", "file", "download"),
            ("list_objects_v2", "source", "object", "list"),
            ("put_bucket_policy", "sink", "bucket/policy", "put"),
            ("restore_object", "neither", "object", "restore"),
            ("get_bucket_encryption", "source", "bucket", "get"),
            ("put_bucket_cors", "sink", "bucket", "put"),
        )),
        ("DynamoDB", (
            ("put_item", "sink", "item", "put"),
            ("scan", "source", "table", "scan"),
            ("query", "source", "table", "query"),
            ("get_item", "source", "item", "get"),
            ("update_item", "sink", "item", "update"),
            ("create_table", "neither", "table", "create"),
            ("batch_write_item", "sink", "item/table", "batch/write"),
            ("list_backups", "source", "backup", "list"),
            ("describe_table", "source", "table", "describe"),
            ("tag_resource", "neither", "resource", "tag"),
            ("update_time_to_live", "neither", "table", "update"),
        )),
        ("Lambda", (
            ("invoke", "sink", "function", "invoke"),
            ("add_permission", "sink", "function", "add"),
            ("list_functions", "source", "function", "list"),
            ("get_policy", "source", "policy/function", "get"),
            ("get_function", "source", "function", "get"),
            ("list_tags", "source", "function", "list"),
            ("update_event_source_mapping", "sink", "event source mapping", "update"),
            ("update_function_url_config", "sink", "function/url", "update"),
            ("update_function_code", "sink", "function", "update"),
            ("delete_function", "neither", "function", "delete"),
            ("publish_layer_version", "sink", "layer", "publish"),
            ("update_alias", "neither", "alias", "update"),
        )),
        ("EC2", (
            ("associate_iam_instance_profile", "sink", "instance", "associate"),
            ("run_instances", "sink", "instance", "run"),
            ("create_security_group", "sink", "sec group", "create"),
            ("create_volume", "neither", "volume", "create"),
            ("authorize_security_group_ingress", "sink", "sec group/ingress", "authorize"),
            ("create_tags", "neither", "?", "create"),
            ("get_password_data", "source", "instance", "get"),
            ("import_key_pair", "sink", "key pair/key", "import"),
            ("modify_instance_attribute", "sink", "instance", "modify"),
            ("revoke_client_vpn_ingress", "sink", "ingress", "revoke"),
        )),
        ("IAM", (
            ("create_user", "sink", "user", "create"),
            ("detach_role_policy", "sink", "role", "detach"),
            ("delete_group", "neither", "group", "delete"),
            ("delete_group_policy", "sink", "group", "delete"),
            ("create_role", "sink", "role", "create"),
            ("update_login_profile", "sink", "profile", "update"),
            ("upload_ssh_public_key", "sink", "key", "upload"),
            ("upload_signing_certificate", "sink", "certificate", "upload"),
            ("list_users", "source", "user/?", "list"),
            ("get_access_key_last_used", "source", "key", "get"),
            ("change_password", "sink", "user", "change"),
        )),
        ("SSM", (
            ("get_parameter", "source", "parameter", "get"),
            ("start_session", "sink", "session", "start"),
            ("list_commands", "source", "command/?", "list"),
            ("describe_instance_information", "source", "instance", "describe"),
            ("get_parameters_by_path", "source", "parameter", "get"),
            ("get_document", "source", "document", "get"),
            ("get_inventory", "source", "inventory", "get"),
            ("create_association", "sink", "association", "create"),
            ("delete_parameter", "neither", "parameter", "delete"),
            ("get_command_invocation", "source", "invocation/command", "get"),
        )),
        ("SQS", (
            ("send_message", "sink", "message", "send"),
            ("get_queue_url", "source", "url/queue", "get"),
            ("delete_message", "neither", "message", "delete"),
            ("create_queue", "neither", "queue", "create"),
            ("receive_message", "source", "message", "receive"),
            ("send_message_batch", "sink", "message", "send"),
            ("delete_queue", "neither", "queue", "delete"),
            ("list_queues", "source", "queue", "list"),
            ("set_queue_attributes", "sink", "queue", "set"),
            ("tag_queue", "neither", "queue", "tag"),
        )),
        ("SNS", (
            ("publish", "sink", "message/topic", "publish"),
            ("create_topic", "neither", "topic", "create"),
            ("subscribe", "sink", "topic", "subscribe"),
            ("list_subscriptions", "source", "topic", "list"),
            ("list_topics", "source", "topic", "list"),
            ("opt_in_phone_number", "sink", "phone number", "opt"),
            ("check_if_phone_number_is_opted_out", "source", "phone number", "check"),
            ("delete_endpoint", "neither", "endpoint", "delete"),
            ("remove_permission", "sink", "permission/topic", "remove"),
            ("delete_sms_sandbox_phone_number", "neither", "phone number", "delete"),
            ("put_data_protection_policy", "sink", "policy/topic", "put"),
        )),
    )

    labeled_data = []
    for service_name, rows in annotations:
        for row in rows:
            # "service" goes first so the key order matches the CSV column order
            record = {"service": service_name}
            record.update(zip(columns, row))
            labeled_data.append(record)

    return labeled_data

In [None]:
# Directory holding the per-service and per-method JSON documentation files
api_docs_path = "./aws_api_docs"

# Tiered service catalogue; only the head of the critical tier is loaded for now
service_tiers = get_priority_services_for_security_analysis()
initial_services = service_tiers['tier1_critical'][:5]

print(f"Loading API documentation for: {initial_services}")
api_docs = load_aws_api_documentation(api_docs_path, initial_services)

# Hand-labeled source/sink/neither annotations and the services they span
labeled_data = create_labeled_data_from_pdf_annotations()
labeled_services = {item['service'] for item in labeled_data}

print(f"\nLabeled dataset size: {len(labeled_data)} examples")
print(f"Services covered: {len(labeled_services)}")

# Persist the annotations as CSV for the main classifier
import pandas as pd
labeled_df = pd.DataFrame(labeled_data)
labeled_df.to_csv('aws_security_labeled_data.csv', index=False)
print("\nLabeled data saved to aws_security_labeled_data.csv")