New Code

In [None]:
# Cell 1 — Setup: imports, seed, GPU check, constants
import os, random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, matthews_corrcoef


In [None]:
# Reproducibility: seed the NumPy, Python and TensorFlow RNGs with one shared value
SEED = 42
for seed_fn in (np.random.seed, random.seed, tf.random.set_seed):
    seed_fn(SEED)

# Report the TensorFlow build and the GPUs it can see
print("TF version:", tf.__version__)
print("GPUs:", tf.config.list_physical_devices('GPU'))


In [None]:
# Constants
# The dataset is read from /kaggle/input, so prefer Kaggle's notebook output
# directory (/kaggle/working) when it exists. /content/results stays as the
# fallback so the notebook behaves exactly as before on runtimes without it.
_KAGGLE_WORKING_DIR = "/kaggle/working"
if os.path.isdir(_KAGGLE_WORKING_DIR):
    RESULTS_DIR = os.path.join(_KAGGLE_WORKING_DIR, "results")
else:
    RESULTS_DIR = "/content/results"
os.makedirs(RESULTS_DIR, exist_ok=True)

sns.set(style="whitegrid")
print("Setup complete.")

In [None]:
# Cell 2 — NSL-KDD loading and preprocessing
# Both splits are addressed by absolute path because the dataset is already
# attached under the Kaggle input directory.
train_fp, test_fp = (
    "/kaggle/input/nslkdd/KDDTrain+.txt",
    "/kaggle/input/nslkdd/KDDTest+.txt",
)

print("Using NSL-KDD files:", train_fp, test_fp)

In [None]:
# Load both splits (train first, then test). The NSL-KDD files have no header
# row, so header=None keeps the first record as data.
df_train, df_test = (
    pd.read_csv(csv_path, header=None) for csv_path in (train_fp, test_fp)
)


In [None]:
# Column names, in file order, based on the NSL-KDD documentation.
# The last two entries are the attack name and the difficulty score.
columns = [
    'duration', 'protocol_type', 'service', 'flag',
    'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent',
    'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
    'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
    'num_shells', 'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login',
    'count', 'srv_count',
    'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
    'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
    'attack_type', 'difficulty',
]

# Apply the same header to both splits (train first, then test)
for frame in (df_train, df_test):
    frame.columns = columns

print("Train shape:", df_train.shape)
print("Test shape:", df_test.shape)

In [None]:
# Convert to binary labels (normal = 0, attack = 1).
# A vectorised comparison replaces the per-row Python lambda; anything that is
# not exactly 'normal' is treated as an attack, as before.
df_train['label'] = df_train['attack_type'].ne('normal').astype(int)
df_test['label'] = df_test['attack_type'].ne('normal').astype(int)

print("Label distribution - Train:")
print(df_train['label'].value_counts())
print("\nLabel distribution - Test:")
print(df_test['label'].value_counts())

# Drop the columns that are no longer needed once the binary label exists
df_train = df_train.drop(columns=['attack_type', 'difficulty'])
df_test = df_test.drop(columns=['attack_type', 'difficulty'])

# Handle categorical columns (protocol_type, service, flag)
categorical_columns = ['protocol_type', 'service', 'flag']

# One-hot encode each split independently; the two column sets are reconciled below
df_train = pd.get_dummies(df_train, columns=categorical_columns, prefix=categorical_columns)
df_test = pd.get_dummies(df_test, columns=categorical_columns, prefix=categorical_columns)

# Align columns (some categories might be missing in one of the splits)
train_cols = df_train.columns
test_cols = df_test.columns

# Categories seen only in the test split have no training column and are
# dropped by the alignment; report them instead of discarding them silently.
test_only_cols = test_cols.difference(train_cols)
if len(test_only_cols) > 0:
    print(f"Dropping {len(test_only_cols)} test-only columns: {list(test_only_cols)}")

# One reindex adds the train-only columns (filled with 0), removes test-only
# columns and puts everything in train order. This replaces the column-by-column
# insert loop, which fragments the frame when many columns are added.
df_test = df_test.reindex(columns=train_cols, fill_value=0)

print(f"After preprocessing - Train shape: {df_train.shape}, Test shape: {df_test.shape}")


In [None]:
# Prepare features and labels: every column except the target is a feature
feature_cols = [name for name in df_train.columns if name != 'label']

# Scale features: fit the scaler on the training split only, then apply the
# same fitted transform to both splits.
train_features = df_train[feature_cols].astype(float)
test_features = df_test[feature_cols].astype(float)

scaler = StandardScaler()
scaler.fit(train_features)
X_train = scaler.transform(train_features)
X_test = scaler.transform(test_features)


In [None]:
# Targets as plain NumPy arrays
y_train = df_train['label'].to_numpy()
y_test = df_test['label'].to_numpy()

# Reshape for CNN/LSTM to (samples, timesteps, features): each scaled column
# becomes one timestep carrying a single feature value.
timesteps = X_train.shape[1]
X_train = X_train.reshape(-1, timesteps, 1)
X_test = X_test.reshape(-1, timesteps, 1)

print(f"Final shapes - X_train: {X_train.shape}, X_test: {X_test.shape}")
print(f"y_train: {y_train.shape}, y_test: {y_test.shape}")


In [None]:

# Cell 3 — Model definitions
def build_cnn(input_shape):
    """Build an uncompiled 1D-CNN classifier.

    Args:
        input_shape: Shape of one sample without the batch axis; this
            notebook passes ``(timesteps, 1)``.

    Returns:
        A ``keras.Sequential`` model ending in a 2-unit softmax layer.
    """
    model = keras.Sequential([
        # Declare the input explicitly: Keras 3 warns when `input_shape=` is
        # passed to a layer and recommends an Input object as the first entry.
        keras.Input(shape=input_shape),
        layers.Conv1D(64, 3, activation='relu'),
        layers.MaxPooling1D(2),
        layers.Conv1D(128, 3, activation='relu'),
        layers.GlobalAveragePooling1D(),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(2, activation='softmax')
    ])
    return model

def build_lstm(input_shape):
    """Build an uncompiled single-layer LSTM classifier.

    Args:
        input_shape: Shape of one sample without the batch axis; this
            notebook passes ``(timesteps, 1)``.

    Returns:
        A ``keras.Sequential`` model ending in a 2-unit softmax layer.
    """
    model = keras.Sequential([
        # Declare the input explicitly: Keras 3 warns when `input_shape=` is
        # passed to a layer and recommends an Input object as the first entry.
        keras.Input(shape=input_shape),
        layers.LSTM(64),
        layers.Dropout(0.3),
        layers.Dense(32, activation='relu'),
        layers.Dense(2, activation='softmax')
    ])
    return model

def build_cnn_lstm(input_shape):
    """Build an uncompiled hybrid classifier: one Conv1D stage feeding an LSTM.

    Args:
        input_shape: Shape of one sample without the batch axis; this
            notebook passes ``(timesteps, 1)``.

    Returns:
        A ``keras.Sequential`` model ending in a 2-unit softmax layer.
    """
    model = keras.Sequential([
        # Declare the input explicitly: Keras 3 warns when `input_shape=` is
        # passed to a layer and recommends an Input object as the first entry.
        keras.Input(shape=input_shape),
        layers.Conv1D(64, 3, activation='relu'),
        layers.MaxPooling1D(2),
        layers.LSTM(64),
        layers.Dropout(0.3),
        layers.Dense(32, activation='relu'),
        layers.Dense(2, activation='softmax')
    ])
    return model

In [None]:

# Cell 4 — Training and evaluation
def train_model(model, name, epochs=10, batch_size=128, validation_split=0.2):
    """Compile `model` and fit it on the notebook-level training arrays.

    Reads the globals ``X_train`` and ``y_train``. The model is compiled with
    Adam and sparse categorical cross-entropy, which pairs integer class
    labels with the softmax heads built by the ``build_*`` functions.

    Args:
        model: Uncompiled Keras model; it is compiled and trained in place.
        name: Label used only in the progress message.
        epochs: Number of training epochs (default 10, kept low for faster
            execution).
        batch_size: Mini-batch size (default 128).
        validation_split: Fraction of the training data held out for
            validation (default 0.2). Keras takes this slice from the end of
            the arrays, before shuffling, so it is not a random sample.

    Returns:
        Tuple ``(model, history)``: the same model object and the object
        returned by ``model.fit``.
    """
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    print(f"Training {name}...")
    history = model.fit(X_train, y_train,
                        validation_split=validation_split,
                        epochs=epochs,
                        batch_size=batch_size,
                        verbose=1)

    return model, history

def evaluate_model(model, name):
    """Score `model` on the global test split and save its confusion matrix.

    Reads the globals ``X_test``, ``y_test`` and ``RESULTS_DIR``. The predicted
    class is the argmax over the model's per-class outputs. The confusion
    matrix is written to ``{RESULTS_DIR}/{name}_cm.png`` and the figure is
    closed without being displayed.

    Args:
        model: Trained model exposing ``predict``.
        name: Label used in the plot title, the output filename and the
            printed summary.

    Returns:
        Dict with keys ``accuracy``, ``precision``, ``recall``, ``f1`` and
        ``mcc``, in that order.
    """
    class_scores = model.predict(X_test, verbose=0)
    y_pred = np.argmax(class_scores, axis=1)

    # Assemble the scores in a fixed key order; precision/recall/F1 report 0
    # rather than warning when a class is never predicted.
    metrics = {'accuracy': accuracy_score(y_test, y_pred)}
    for key, scorer in (('precision', precision_score),
                        ('recall', recall_score),
                        ('f1', f1_score)):
        metrics[key] = scorer(y_test, y_pred, zero_division=0)
    metrics['mcc'] = matthews_corrcoef(y_test, y_pred)

    # Confusion matrix heatmap, saved to disk only
    class_names = ['Normal', 'Attack']
    cm = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names,
                ax=ax)
    ax.set_title(f'Confusion Matrix - {name}')
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')
    fig.savefig(f'{RESULTS_DIR}/{name}_cm.png')
    plt.close(fig)

    print(f"\n{name} Results:")
    for metric, value in metrics.items():
        print(f"  {metric}: {value:.4f}")

    return metrics

In [None]:

# Cell 5 — Run experiments
# All three networks are built up front, in this order, before any training
# starts. Every one takes the same (timesteps, 1) input.
input_shape = (timesteps, 1)
models = {
    label: builder(input_shape)
    for label, builder in (
        ('CNN', build_cnn),
        ('LSTM', build_lstm),
        ('CNN_LSTM', build_cnn_lstm),
    )
}

print("Starting model training...")
results = []

# Train and score each model in turn, collecting one metrics row per model
for name, model in models.items():
    print(f"\n{'='*50}")
    print(f"Training {name}")
    print(f"{'='*50}")

    trained_model, history = train_model(model, name)
    metrics = evaluate_model(trained_model, name)
    metrics.update(model=name)
    results.append(metrics)


In [None]:

# Cell 6 — Display and save results
print(f"\n{'='*50}")
print("FINAL RESULTS SUMMARY")
print(f"{'='*50}")

# One row per model, one column per metric plus the model name
results_df = pd.DataFrame(results)
print("\n", results_df)

# Persist the table next to the confusion-matrix images
results_df.to_csv(f'{RESULTS_DIR}/nsl_kdd_results.csv', index=False)
print(f"\nResults saved to: {RESULTS_DIR}/nsl_kdd_results.csv")

# Grouped bar chart: one group per model, one bar per metric
metrics_to_plot = ['accuracy', 'precision', 'recall', 'f1']
x = range(len(results_df))

fig, ax = plt.subplots(figsize=(10, 6))
for i, metric in enumerate(metrics_to_plot):
    # Each metric's bars are shifted right by one bar width per position
    bar_positions = [p + i*0.2 for p in x]
    ax.bar(bar_positions, results_df[metric], width=0.2, label=metric)

ax.set_xlabel('Models')
ax.set_ylabel('Score')
ax.set_title('Model Performance Comparison')
# 0.3 is the midpoint of four bars centred at offsets 0, 0.2, 0.4 and 0.6
ax.set_xticks([p + 0.3 for p in x])
ax.set_xticklabels(results_df['model'])
ax.legend()
ax.set_ylim(0, 1)
fig.tight_layout()
fig.savefig(f'{RESULTS_DIR}/model_comparison.png')
plt.show()