In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import joblib

# Names assigned positionally to the columns of the raw file: 41 feature
# columns followed by the trailing `label` column (42 names in total).
columns = [
    'duration', 'protocol_type', 'service', 'flag',
    'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent',
    'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
    'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
    'num_shells', 'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login',
    'count', 'srv_count',
    'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
    'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
    'label',
]

# Because `names=` is supplied and no header is requested, pandas treats the
# first line of the file as data rather than as a header row.
data_path = 'dataset/kddcup.data/kddcup.data'
df = pd.read_csv(data_path, names=columns)

# Binary target: 1 for every row whose label is anything other than the
# literal 'normal.' (trailing dot included), 0 otherwise.
df['is_attack'] = (df['label'] != 'normal.').astype(int)

# Remove 'num_outbound_cmds' before splitting.
# NOTE(review): the column is dropped on the premise that it is constant;
# nothing in this cell verifies that -- confirm against the data.
df = df.drop(columns=['num_outbound_cmds'])

# 70/30 split, stratified on the binary target so both parts keep the same
# attack/normal ratio; the fixed seed makes the split repeatable.
# NOTE(review): if the file contains duplicate rows, a random split can place
# identical records in both parts -- check before trusting test metrics.
train_df, test_df = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
    stratify=df['is_attack'],
)

# Preprocessing objects: the three string-valued columns are one-hot encoded
# (dense output, unseen categories ignored at transform time); every other
# feature column is standardised.
categorical_cols = ['protocol_type', 'service', 'flag']
scaler = StandardScaler()
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Training features and target. Both target-derived columns are removed from
# the feature frame so they cannot leak into the model inputs.
X_train = train_df.drop(columns=['label', 'is_attack'])
y_train = train_df['is_attack'].to_numpy()

# Fit both transformers on the training split only, then assemble the final
# matrix as [scaled numeric columns | one-hot columns].
X_num = scaler.fit_transform(X_train.drop(columns=categorical_cols))
X_cat = encoder.fit_transform(X_train[categorical_cols])
X_train_processed = np.concatenate([X_num, X_cat], axis=1)

# Apply the already-fitted scaler and encoder to the held-out split; nothing
# is re-fitted here. Categories the encoder did not see during fitting come
# out as all-zero one-hot rows because it was built with
# handle_unknown='ignore'.
X_test = test_df.drop(columns=['label', 'is_attack'])
y_test = test_df['is_attack'].to_numpy()
X_num_test = scaler.transform(X_test.drop(columns=categorical_cols))
X_cat_test = encoder.transform(X_test[categorical_cols])
X_test_processed = np.concatenate([X_num_test, X_cat_test], axis=1)

# Sanity check: both processed matrices must have the same number of columns.
n_train_features = X_train_processed.shape[1]
n_test_features = X_test_processed.shape[1]
assert n_train_features == n_test_features, "Feature count mismatch!"

# Write the processed matrices and targets as .npy files in the working
# directory (dict order is the write order).
processed_arrays = {
    'X_train.npy': X_train_processed,
    'y_train.npy': y_train,
    'X_test.npy': X_test_processed,
    'y_test.npy': y_test,
}
for npy_name, npy_array in processed_arrays.items():
    np.save(npy_name, npy_array)

# Write the fitted transformers next to the arrays so the same preprocessing
# can be reapplied later without re-fitting.
fitted_transformers = {
    'encoder.joblib': encoder,
    'scaler.joblib': scaler,
}
for joblib_name, transformer in fitted_transformers.items():
    joblib.dump(transformer, joblib_name)

# Report the final shapes of both splits.
print(f"Training set: {X_train_processed.shape}, {y_train.shape}")
print(f"Test set: {X_test_processed.shape}, {y_test.shape}")

Training set: (3428901, 120), (3428901,)
Test set: (1469530, 120), (1469530,)


In [6]:
# Column counts of the raw feature frames (before scaling / one-hot
# encoding), i.e. not the width of the processed matrices.
n_train_cols = X_train.shape[1]
n_test_cols = X_test.shape[1]
print(f"Train features: {n_train_cols}, Test features: {n_test_cols}")


Train features: 40, Test features: 40
