# Automated XGBoost Training for Threat Detection
This notebook automatically trains and deploys the threat detection model.

In [None]:
import sagemaker
import boto3
from sagemaker.xgboost.estimator import XGBoost
from sagemaker.inputs import TrainingInput
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import CSVDeserializer
import os

In [None]:
# Set up the SageMaker session and resolve the execution role and AWS region
# that the rest of the notebook relies on.
sess = sagemaker.Session()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

# The processed-data bucket is taken from PROCESSED_BUCKET; when the variable
# is unset the placeholder name below is used instead.
processed_bucket = os.getenv('PROCESSED_BUCKET', 'cybersec-processed-data-xxxxxxxx')

# Echo the resolved configuration, one item per line.
print(
    f"SageMaker role: {role}",
    f"Region: {region}",
    f"Processed bucket: {processed_bucket}",
    sep="\n",
)

In [None]:
# S3 locations inside the processed bucket: training data, validation data,
# and the prefix that receives the trained model artifacts.
train_path, validation_path, output_path = (
    f's3://{processed_bucket}/train/',
    f's3://{processed_bucket}/validation/',
    f's3://{processed_bucket}/model-output/',
)

# Echo the resolved locations, one per line.
print(
    f"Training data: {train_path}",
    f"Validation data: {validation_path}",
    f"Model output: {output_path}",
    sep="\n",
)

In [None]:
# Look up the URI of the XGBoost 1.5-1 container image for the current region.
container = sagemaker.image_uris.retrieve(
    framework='xgboost',
    region=region,
    version='1.5-1',
)
print(f"XGBoost container: {container}")

In [None]:
# Create the estimator for the built-in XGBoost algorithm.
#
# The generic `Estimator` is used rather than the `XGBoost` framework class:
# the framework class is the script-mode estimator and requires `entry_point`
# and `framework_version`, so constructing it with only an image URI raises a
# TypeError. The hyperparameters below are passed straight to the algorithm
# container identified by `container`.
xgb_estimator = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    output_path=output_path,  # S3 prefix that receives the model artifacts
    sagemaker_session=sess,
    hyperparameters={
        # Binary classifier; evaluation metric is AUC.
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        # Boosting schedule.
        'num_round': 100,
        'max_depth': 6,
        'eta': 0.1,
        # Row / column subsampling per tree.
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        # Regularisation and split constraints.
        'min_child_weight': 3,
        'gamma': 0.1,
        'reg_alpha': 0.1,
        'reg_lambda': 1,
        # 1 means no re-weighting of the positive class.
        'scale_pos_weight': 1,
        'early_stopping_rounds': 10,
        'verbosity': 1,
    },
)

print("XGBoost estimator created!")

In [None]:
# Describe the two CSV data channels consumed by the training job.
train_input = TrainingInput(s3_data=train_path, content_type='text/csv')
validation_input = TrainingInput(s3_data=validation_path, content_type='text/csv')

# Channel name -> input definition, as expected by `fit`.
data_channels = {
    'train': train_input,
    'validation': validation_input,
}

print("Starting model training...")
xgb_estimator.fit(data_channels)
print("Model training completed!")

In [None]:
# Host the trained model behind a real-time endpoint that accepts and returns CSV.
print("Deploying model to endpoint...")

# Collected separately so the hosting configuration is visible at a glance.
deploy_options = {
    'initial_instance_count': 1,
    'instance_type': 'ml.t2.medium',
    'endpoint_name': 'threat-detection-endpoint',
    'serializer': CSVSerializer(),
    'deserializer': CSVDeserializer(),
}
predictor = xgb_estimator.deploy(**deploy_options)

print(f"Model deployed to endpoint: {predictor.endpoint_name}")

In [None]:
# Smoke-test the endpoint with a single hand-written record.
print("Testing the endpoint...")

# One row of 41 feature values (normalized features).
test_sample = [
    [0, 1, 0, 0, 0.1, 0.2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0.5, 0.5, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.3, 0.3, 1.0, 0.0, 0.0,
     0.0, 0.0, 0.0, 0.0, 0.0]
]

result = predictor.predict(test_sample)
print(f"Prediction result: {result}")

# The first field of the first returned row is read as the model's score.
prediction = float(result[0][0])

# Scores above 0.5 are labelled as attacks; confidence is the score's distance
# from the opposite class (p for "Attack", 1 - p for "Normal").
if prediction > 0.5:
    threat_detected, confidence = "Attack", prediction
else:
    threat_detected, confidence = "Normal", 1 - prediction

print(f"Traffic Classification: {threat_detected}")
print(f"Confidence: {confidence:.4f}")

In [None]:
# Persist a small JSON descriptor of the deployed endpoint to the processed
# bucket under the key `endpoint_info.json`.
import json

# NOTE(review): 'instance_type' and 'status' are recorded as constants here,
# not queried from SageMaker — confirm they match the actual deployment.
endpoint_info = {
    'endpoint_name': predictor.endpoint_name,
    'instance_type': 'ml.t2.medium',
    'status': 'InService'
}

s3 = boto3.client('s3')
s3.put_object(
    Bucket=processed_bucket,
    Key='endpoint_info.json',
    Body=json.dumps(endpoint_info, indent=2),
    # Label the object as JSON so it is not stored under a generic default type.
    ContentType='application/json',
)

# The check mark was previously mis-encoded (UTF-8 bytes decoded as cp1252).
print("✅ Training and deployment completed successfully!")
print(f"Endpoint Name: {predictor.endpoint_name}")
print("Model is ready for real-time threat detection!")