# NSL-KDD Data Preprocessing for Cybersecurity Threat Detection

This notebook preprocesses the NSL-KDD dataset for training an XGBoost model on AWS SageMaker.

In [None]:
import pandas as pd
import numpy as np
import boto3
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import sagemaker
import io

In [None]:
# Buckets holding the raw and the processed datasets.
# These values are placeholders: substitute your own bucket names.
raw_bucket = 'cybersec-raw-data-xxxxxxxx'
processed_bucket = 'cybersec-processed-data-xxxxxxxx'

# SageMaker session, its default bucket, and the notebook's execution role
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

# Low-level boto3 S3 client for direct object uploads
s3 = boto3.client('s3')

In [None]:
# Column names applied to the header-less NSL-KDD files, in file order
# (43 names in total; the position comments are 1-based).
columns = [
    # 1-9
    'duration', 'protocol_type', 'service', 'flag',
    'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent',
    # 10-22
    'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
    'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
    'num_shells', 'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login',
    # 23-31
    'count', 'srv_count',
    'serror_rate', 'srv_serror_rate',
    'rerror_rate', 'srv_rerror_rate',
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
    # 32-41: every name here carries the dst_host_ prefix
    'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
    # 42-43: target column and the difficulty column
    'label', 'difficulty',
]

In [None]:
# Read the NSL-KDD train and test files directly from the raw-data bucket.
# Both files must already have been uploaded to that bucket. Because
# `names=` is supplied, the first row of each file is read as data.
train_data = pd.read_csv(f's3://{raw_bucket}/KDDTrain+.txt', names=columns)
test_data = pd.read_csv(f's3://{raw_bucket}/KDDTest+.txt', names=columns)

# Quick sanity summary: frame shapes, then the raw label frequencies
for split_name, frame in (("Training", train_data), ("Test", test_data)):
    print(f"{split_name} data shape: {frame.shape}")

print("\nLabel distribution in training data:")
print(train_data['label'].value_counts())

In [None]:
def preprocess_data(df, scaler=None, encoders=None):
    """Turn a raw NSL-KDD frame into a scaled feature matrix and binary target.

    Steps: drop the ``difficulty`` column if present, map ``label`` to
    0 (``'normal'``) or 1 (anything else), integer-encode the three
    categorical columns, then standardize every feature column.

    The encoders and the scaler must be fitted on the training data only and
    then reused for any other split; otherwise the same category can receive
    different integer codes, and features end up on different scales, in
    train and test.

    Args:
        df: Raw frame with a ``label`` column plus the feature columns.
            The caller's frame is not modified; a copy is processed.
        scaler: A ``StandardScaler`` already fitted on training data. When
            given it is applied unchanged; when ``None`` a new scaler is
            fitted on ``df``.
        encoders: Dict mapping categorical column name to a fitted
            ``LabelEncoder``. A column found in the dict is transformed with
            the stored encoder; a column missing from it gets a new encoder
            fitted on ``df``, which is stored back into the dict. Pass the
            same dict to the training call and to later calls so that all
            splits share one encoding. When ``None``, fresh encoders are
            fitted and discarded.

    Returns:
        Tuple ``(X_scaled, y, scaler)``: the standardized features as a
        DataFrame that keeps the column names and index of ``df``, the binary
        target Series, and the scaler that was fitted or applied.
    """
    df = df.copy()

    # Remove difficulty column
    if 'difficulty' in df.columns:
        df = df.drop('difficulty', axis=1)

    # Convert label to binary (0: normal, 1: attack)
    df['label'] = (df['label'] != 'normal').astype(int)

    # `is None` rather than truthiness: an empty dict passed by the caller
    # must be filled in place so the fitted encoders can be reused.
    if encoders is None:
        encoders = {}

    # Encode categorical features
    for feature in ('protocol_type', 'service', 'flag'):
        values = df[feature]
        if feature not in encoders:
            encoders[feature] = LabelEncoder().fit(values)
        le = encoders[feature]

        # LabelEncoder.transform raises on categories it was not fitted on,
        # so unseen values are given the sentinel code -1 instead.
        known = values.isin(le.classes_).to_numpy()
        codes = np.full(len(values), -1, dtype=np.int64)
        if known.any():
            codes[known] = le.transform(values[known])
        df[feature] = codes

    # Separate features and target
    X = df.drop('label', axis=1)
    y = df['label']

    # Normalize features: fit only when no fitted scaler was supplied
    if scaler is None:
        scaler = StandardScaler()
        scaled = scaler.fit_transform(X)
    else:
        scaled = scaler.transform(X)

    # Keep the original index so X_scaled and y stay row-aligned in pd.concat
    X_scaled = pd.DataFrame(scaled, columns=X.columns, index=X.index)

    return X_scaled, y, scaler

# Fit the encoders and the scaler on the training data only, then reuse them
# for the test data so both sets share the same encoding and scale.
encoders = {}
X_train, y_train, scaler = preprocess_data(train_data, encoders=encoders)
X_test, y_test, _ = preprocess_data(test_data, scaler=scaler, encoders=encoders)

print(f"Preprocessed training data shape: {X_train.shape}")
print(f"Training labels distribution: {y_train.value_counts()}")

In [None]:
# Hold out 20% of the training data for validation BEFORE anything is
# written, so that train.csv and validation.csv are disjoint. Writing the
# full training set as train.csv would put every validation row in the
# training data as well, making validation metrics optimistic.
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# Prepare data for XGBoost (target as first column)
train_data_xgb = pd.concat([y_train_split, X_train_split], axis=1)
val_data_xgb = pd.concat([y_val, X_val], axis=1)
test_data_xgb = pd.concat([y_test, X_test], axis=1)

# Save preprocessed data to S3 (no header row, no index column)
train_data_xgb.to_csv(f's3://{processed_bucket}/train/train.csv', index=False, header=False)
val_data_xgb.to_csv(f's3://{processed_bucket}/validation/validation.csv', index=False, header=False)
test_data_xgb.to_csv(f's3://{processed_bucket}/test/test.csv', index=False, header=False)

print("Data preprocessing completed and saved to S3!")
print(f"Training data: s3://{processed_bucket}/train/train.csv")
print(f"Validation data: s3://{processed_bucket}/validation/validation.csv")
print(f"Test data: s3://{processed_bucket}/test/test.csv")

In [None]:
import json

# Record the feature column order and count for later use
feature_names = X_train.columns.tolist()
feature_info = dict(
    feature_names=feature_names,
    num_features=len(feature_names),
)

# Serialize to a local JSON file first...
with open('/tmp/feature_info.json', 'w') as fp:
    fp.write(json.dumps(feature_info))

# ...then push that file to the processed-data bucket
s3.upload_file('/tmp/feature_info.json', processed_bucket, 'feature_info.json')
print("Feature information saved to S3")