# NSL-KDD Threat Detection - Complete Data Pipeline
## Data Preprocessing, Model Training & Deployment

In [None]:
# Install (or upgrade) the notebook's dependencies using the pip that belongs
# to the interpreter running this kernel.
import subprocess
import sys

packages = ['pandas', 'numpy', 'scikit-learn', 'sagemaker', 'boto3']
pip_upgrade = [sys.executable, "-m", "pip", "install", "--upgrade"]
for package in packages:
    # One pip invocation per package; check_call raises CalledProcessError
    # when pip exits with a non-zero status.
    subprocess.check_call(pip_upgrade + [package])

print("‚úÖ Packages installed successfully")

In [None]:
import pandas as pd
import numpy as np
import boto3
import sagemaker
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
import os
import urllib.request
import json

# Notebook-wide AWS context:
#   sess   - SageMaker session object (handed to the estimator later on)
#   role   - execution role reported by the SageMaker SDK
#   region - region name of the default boto3 session
sess = sagemaker.Session()
role = sagemaker.get_execution_role()

boto_session = boto3.Session()
region = boto_session.region_name

print(f"SageMaker role: {role}")
print(f"Region: {region}")

In [None]:
# Resolve the S3 bucket names.
# Each name is taken from an environment variable when that variable is set
# to a non-empty value, and falls back to the project default otherwise:
#   RAW_DATA_BUCKET        -> raw_bucket
#   PROCESSED_DATA_BUCKET  -> processed_bucket
# `or` (rather than a .get() default) also covers a variable that is set but
# empty, which would otherwise produce an unusable empty bucket name.
raw_bucket = os.environ.get('RAW_DATA_BUCKET') or 'cybersec-raw-data-plh92c1q'
processed_bucket = os.environ.get('PROCESSED_DATA_BUCKET') or 'cybersec-processed-data-plh92c1q'

print(f"Raw data bucket: {raw_bucket}")
print(f"Processed data bucket: {processed_bucket}")

In [None]:
# NSL-KDD column names, in file order: 41 feature columns followed by the
# 'label' and 'difficulty' columns (43 names in total).
columns = """
    duration protocol_type service flag src_bytes dst_bytes
    land wrong_fragment urgent hot num_failed_logins logged_in
    num_compromised root_shell su_attempted num_root num_file_creations
    num_shells num_access_files num_outbound_cmds is_host_login
    is_guest_login count srv_count serror_rate srv_serror_rate
    rerror_rate srv_rerror_rate same_srv_rate diff_srv_rate
    srv_diff_host_rate dst_host_count dst_host_srv_count
    dst_host_same_srv_rate dst_host_diff_srv_rate dst_host_same_src_port_rate
    dst_host_srv_diff_host_rate dst_host_serror_rate dst_host_srv_serror_rate
    dst_host_rerror_rate dst_host_srv_rerror_rate label difficulty
""".split()  # whitespace-separated names -> plain list of str

print(f"Total columns: {len(columns)}")

In [None]:
# Fetch the NSL-KDD train and test files from GitHub into /tmp.
print("üì• Downloading NSL-KDD dataset...")

train_url = "https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTrain%2B.txt"
test_url = "https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTest%2B.txt"

# (source URL, local destination) pairs; "%2B" in the URLs is the
# percent-encoded "+" that appears in the local file names.
for source_url, local_path in (
    (train_url, '/tmp/KDDTrain+.txt'),
    (test_url, '/tmp/KDDTest+.txt'),
):
    urllib.request.urlretrieve(source_url, local_path)

print("‚úÖ Dataset downloaded successfully")

In [None]:
# Read the two downloaded files into DataFrames and print a quick overview
# (shapes plus the label values found in the training file).
print("üìä Loading and examining data...")

# Both files are read with header=None, so the column names come from
# `columns` instead of from the first row.
read_options = {'names': columns, 'header': None}
train_df = pd.read_csv('/tmp/KDDTrain+.txt', **read_options)
test_df = pd.read_csv('/tmp/KDDTest+.txt', **read_options)

print(f"Training data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")
print(f"\nUnique labels in training data: {train_df['label'].unique()}")
print(f"Label distribution:\n{train_df['label'].value_counts()}")

In [None]:
# Data Preprocessing Function
def preprocess_nsl_kdd(train_df, test_df):
    """Turn the raw NSL-KDD frames into model-ready train and test frames.

    Processing steps:
      1. Derive a binary target ``label_binary`` (0 when ``label`` is
         ``'normal'``, 1 for anything else).
      2. Integer-encode the categorical columns ``protocol_type``,
         ``service`` and ``flag`` with one ``LabelEncoder`` per column.
      3. Standardize every remaining feature column with a
         ``StandardScaler`` whose statistics come from the training rows
         only, so no test-set information leaks into the features or into
         the saved scaler.
      4. Drop ``label`` and ``difficulty`` and move ``label_binary`` to the
         first column (label first for XGBoost).

    Args:
        train_df: Raw training DataFrame containing the NSL-KDD columns.
        test_df: Raw test DataFrame with the same columns.

    Returns:
        Tuple ``(train_processed, test_processed, label_encoders, scaler)``:
        the two processed DataFrames, a dict mapping each categorical
        column name to its fitted ``LabelEncoder``, and the fitted
        ``StandardScaler``.
    """
    print("üîß Starting data preprocessing...")

    n_train = len(train_df)

    # Combine datasets so every transformation is applied identically to both
    # splits; rows [0, n_train) are the training rows.
    combined_df = pd.concat([train_df, test_df], ignore_index=True)

    # 1. Convert labels to binary (0=normal, 1=attack)
    print("Converting labels to binary classification...")
    combined_df['label_binary'] = (combined_df['label'] != 'normal').astype(int)

    # 2. Encode categorical features.
    # The encoders are fit on train+test together on purpose: LabelEncoder
    # raises on values it has not seen, and only the set of category names
    # (no target or distribution information) is learned here.
    print("Encoding categorical features...")
    categorical_features = ['protocol_type', 'service', 'flag']
    label_encoders = {}

    for feature in categorical_features:
        le = LabelEncoder()
        combined_df[feature] = le.fit_transform(combined_df[feature])
        label_encoders[feature] = le
        print(f"  - {feature}: {len(le.classes_)} unique values")

    # 3. Identify continuous features (exclude categorical and target)
    excluded = set(categorical_features) | {'label', 'label_binary', 'difficulty'}
    continuous_features = [col for col in combined_df.columns if col not in excluded]

    print(f"Continuous features: {len(continuous_features)}")

    # 4. Normalize continuous features using StandardScaler.
    # Mean/variance are estimated from the training rows only and then applied
    # to all rows. If there are no training rows, fall back to all rows so
    # the call still succeeds.
    print("Normalizing continuous features...")
    scaler = StandardScaler()
    fit_rows = combined_df.iloc[:n_train] if n_train else combined_df
    scaler.fit(fit_rows[continuous_features])
    combined_df[continuous_features] = scaler.transform(combined_df[continuous_features])

    # 5. Drop unnecessary columns
    combined_df = combined_df.drop(['label', 'difficulty'], axis=1)

    # 6. Reorder columns (label first for XGBoost)
    feature_cols = [col for col in combined_df.columns if col != 'label_binary']
    combined_df = combined_df[['label_binary'] + feature_cols]

    # 7. Split back into train and test
    train_processed = combined_df.iloc[:n_train].copy()
    test_processed = combined_df.iloc[n_train:].copy()

    print("‚úÖ Preprocessing completed!")
    print(f"Final training shape: {train_processed.shape}")
    print(f"Final test shape: {test_processed.shape}")
    print(f"Binary label distribution: {train_processed['label_binary'].value_counts().to_dict()}")

    return train_processed, test_processed, label_encoders, scaler

# Execute preprocessing on the loaded frames. Besides the processed train and
# test DataFrames, keep the fitted encoders and scaler returned by the call.
train_processed, test_processed, encoders, scaler = preprocess_nsl_kdd(train_df, test_df)

In [None]:
# Carve a validation set out of the processed training data.
print("üìä Creating train/validation split...")

# 80/20 split, stratified on the binary label so both parts keep the same
# class proportions; the fixed random_state makes the split reproducible.
validation_fraction = 0.2
train_data, val_data = train_test_split(
    train_processed,
    test_size=validation_fraction,
    random_state=42,
    stratify=train_processed['label_binary'],
)

print(f"Training set: {train_data.shape}")
print(f"Validation set: {val_data.shape}")
print(f"Test set: {test_processed.shape}")

In [None]:
# Persist the processed splits and the fitted preprocessing objects to the
# processed-data bucket. Everything is written under /tmp first and then
# uploaded with the S3 client.
import pickle

print("üíæ Saving processed data to S3...")

# (DataFrame, local file, S3 key) for each data split.
csv_exports = (
    (train_data, '/tmp/train_processed.csv', 'train/train.csv'),
    (val_data, '/tmp/validation_processed.csv', 'validation/validation.csv'),
    (test_processed, '/tmp/test_processed.csv', 'test/test.csv'),
)

# Written without index and without a header row.
for frame, local_path, s3_key in csv_exports:
    frame.to_csv(local_path, index=False, header=False)

s3 = boto3.client('s3')

for frame, local_path, s3_key in csv_exports:
    s3.upload_file(local_path, processed_bucket, s3_key)

# (object, local file, S3 key) for each preprocessing artifact.
pickled_artifacts = (
    (encoders, '/tmp/encoders.pkl', 'artifacts/encoders.pkl'),
    (scaler, '/tmp/scaler.pkl', 'artifacts/scaler.pkl'),
)

for artifact, local_path, s3_key in pickled_artifacts:
    with open(local_path, 'wb') as f:
        pickle.dump(artifact, f)

for artifact, local_path, s3_key in pickled_artifacts:
    s3.upload_file(local_path, processed_bucket, s3_key)

print("‚úÖ Data saved to S3 successfully!")

In [None]:
# Configure XGBoost training: S3 locations, the built-in XGBoost container
# image for this region, and the estimator with its hyperparameters.
print("üöÄ Configuring XGBoost model training...")

# Define S3 paths (prefixes inside the processed-data bucket)
train_path = f's3://{processed_bucket}/train/'
validation_path = f's3://{processed_bucket}/validation/'
output_path = f's3://{processed_bucket}/model-output/'

# Get XGBoost container
container = sagemaker.image_uris.retrieve('xgboost', region, version='1.5-1')

# Create XGBoost estimator
xgb_estimator = Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    output_path=output_path,
    sagemaker_session=sess,
    hyperparameters={
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'num_round': 100,
        'max_depth': 6,
        'eta': 0.1,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'min_child_weight': 3,
        'gamma': 0.1,
        # L1 / L2 regularization. The built-in SageMaker XGBoost algorithm
        # names these 'alpha' and 'lambda'; the scikit-learn style names
        # 'reg_alpha' / 'reg_lambda' are not among its documented
        # hyperparameters.
        'alpha': 0.1,
        'lambda': 1,
        'scale_pos_weight': 1,
        'early_stopping_rounds': 10,
        'verbosity': 1
    }
)

print("‚úÖ XGBoost estimator configured!")

In [None]:
# Launch the training job with a 'train' and a 'validation' channel.
print("üîÑ Starting model training...")

# Both channels read CSV objects from their S3 prefix.
csv_content_type = 'text/csv'
train_input = TrainingInput(train_path, content_type=csv_content_type)
validation_input = TrainingInput(validation_path, content_type=csv_content_type)

data_channels = {
    'train': train_input,
    'validation': validation_input,
}
xgb_estimator.fit(data_channels)

print("‚úÖ Model training completed!")

In [None]:
# Deploy the trained model to a real-time endpoint.
print("üöÄ Deploying model to endpoint...")

import time

from sagemaker.serializers import CSVSerializer

# The Unix timestamp suffix gives each run of this cell its own endpoint name.
endpoint_name = f'threat-detection-endpoint-{int(time.time())}'

predictor = xgb_estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.t2.medium',
    endpoint_name=endpoint_name,
    # Requests are sent as comma-separated feature strings. Without an
    # explicit serializer the predictor would use the SDK default content
    # type instead of 'text/csv', which the XGBoost container expects.
    serializer=CSVSerializer(),
)

print(f"‚úÖ Model deployed to endpoint: {predictor.endpoint_name}")

In [None]:
# Smoke-test the endpoint with the first row of the processed test set.
print("üß™ Testing the endpoint...")

# Column 0 is the label, so the feature values start at column 1.
test_sample = test_processed.iloc[0, 1:].values
csv_input = ','.join(str(value) for value in test_sample)

# Make prediction; the response is parsed as a single float score and
# compared against a 0.5 threshold below.
result = predictor.predict(csv_input)
prediction = float(result)

print(f"Sample input shape: {len(test_sample)}")
print(f"Raw prediction: {prediction}")
print(f"Binary prediction: {1 if prediction > 0.5 else 0}")
print(f"Confidence: {prediction if prediction > 0.5 else 1-prediction:.4f}")
print(f"Status: {'Attack Detected' if prediction > 0.5 else 'Normal Traffic'}")

In [None]:
# Record the endpoint details as a JSON object in the processed-data bucket.
print("üíæ Saving endpoint information...")

# 'instance_type' and 'status' are fixed values written as-is; they are not
# queried from the endpoint. 'created_at' is the local time of this cell run.
endpoint_info = dict(
    endpoint_name=predictor.endpoint_name,
    instance_type='ml.t2.medium',
    status='InService',
    model_features=len(test_sample),
    created_at=time.strftime('%Y-%m-%d %H:%M:%S'),
)

# Save to S3
endpoint_info_json = json.dumps(endpoint_info, indent=2)
s3.put_object(
    Bucket=processed_bucket,
    Key='endpoint_info.json',
    Body=endpoint_info_json,
)

print(f"‚úÖ Endpoint info saved: {endpoint_info}")

In [None]:
# Point the prediction Lambda at the new endpoint by setting its
# ENDPOINT_NAME environment variable. This step is best-effort: any failure
# is reported and the notebook continues.
print("üîß Updating Lambda function...")

try:
    lambda_client = boto3.client('lambda')

    # Find the first function whose name contains 'threat-detection-predict'.
    # list_functions returns results in pages, so walk all pages instead of
    # only looking at the first response.
    paginator = lambda_client.get_paginator('list_functions')
    matching_names = (
        func['FunctionName']
        for page in paginator.paginate()
        for func in page['Functions']
        if 'threat-detection-predict' in func['FunctionName']
    )
    lambda_function = next(matching_names, None)

    if lambda_function:
        # update_function_configuration replaces the whole environment, so
        # start from the function's current variables and only set
        # ENDPOINT_NAME on top of them.
        current_config = lambda_client.get_function_configuration(
            FunctionName=lambda_function
        )
        env_vars = dict(current_config.get('Environment', {}).get('Variables', {}))
        env_vars['ENDPOINT_NAME'] = predictor.endpoint_name

        lambda_client.update_function_configuration(
            FunctionName=lambda_function,
            Environment={'Variables': env_vars},
        )
        print(f"‚úÖ Lambda function {lambda_function} updated with endpoint: {predictor.endpoint_name}")
    else:
        print("‚ö†Ô∏è Lambda function not found")

except Exception as e:
    # Deliberately broad: a Lambda problem must not abort the notebook.
    print(f"‚ö†Ô∏è Could not update Lambda: {e}")

In [None]:
# Print a recap of the run: dataset sizes, feature count, endpoint name and
# the two S3 buckets.
summary_lines = [
    "\nüéâ DEPLOYMENT SUMMARY",
    "=" * 50,
    f"‚úÖ Dataset: NSL-KDD processed successfully",
    f"‚úÖ Training samples: {len(train_data):,}",
    f"‚úÖ Validation samples: {len(val_data):,}",
    f"‚úÖ Test samples: {len(test_processed):,}",
    f"‚úÖ Features: {len(test_sample)} (normalized)",
    f"‚úÖ Model: XGBoost binary classifier",
    f"‚úÖ Endpoint: {predictor.endpoint_name}",
    f"‚úÖ Status: Ready for predictions",
    "\nüåê Frontend can now make real predictions!",
    f"üìä S3 Buckets:",
    f"   - Raw data: s3://{raw_bucket}",
    f"   - Processed data: s3://{processed_bucket}",
]
# One print call per entry, in order.
for summary_line in summary_lines:
    print(summary_line)