Author: Amitabh Chakravorty

DATA PREPROCESSING - Clean and prepare datasets

The preprocessing steps below are based on the results of the earlier data exploration.

In [10]:
# ============================================================================
# SETUP
# ============================================================================

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import pickle
import os
from google.colab import drive

# Mount Google Drive and make the project folder the working directory.
# The cells below use relative paths ('data/raw/...', 'data/processed/...',
# 'results/...'), so they only resolve after this chdir has run.
drive.mount('/content/drive')
base_path = '/content/drive/MyDrive/cryptojacking_validation'
os.chdir(base_path)

# Echo the working directory so a wrong or missing mount is obvious immediately.
print("Working directory:", os.getcwd())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Working directory: /content/drive/MyDrive/cryptojacking_validation


In [11]:
# ============================================================================
# PREPROCESS DS2OS DATASET
# ============================================================================

def preprocess_ds2os():
    """
    Preprocess the DS2OS dataset for binary (normal vs. attack) classification.

    Pipeline: load the CSV -> label-encode object columns -> mean-fill any
    remaining NaNs -> binarise the 'normality' target -> stratified 70/30
    split -> SMOTE on the training split when the classes are severely
    imbalanced -> standard-scale (fitted on train only) -> save the arrays
    and the fitted scaler / encoders under data/processed/.

    Returns:
        tuple: (X_train_scaled, X_test_scaled, y_train, y_test). The X values
        are the outputs of StandardScaler; the y values hold the 0/1 labels
        (0 = 'normal', 1 = any other 'normality' value).

    Raises:
        FileNotFoundError: if data/raw/ds2os/DS2OS.csv does not exist.
        KeyError: if the CSV has no 'normality' column.
    """
    output_dir = 'data/processed'

    print("\n" + "="*70)
    print("PREPROCESSING DS2OS DATASET")
    print("="*70)

    # Load data
    print("\n[1/7] Loading data...")
    file_path = 'data/raw/ds2os/DS2OS.csv'
    df = pd.read_csv(file_path)
    print(f"Loaded: {df.shape[0]:,} rows × {df.shape[1]} columns")

    # Identify target
    print("\n[2/7] Separating features and target")
    target_col = 'normality'

    X = df.drop([target_col], axis=1)
    y = df[target_col]

    print(f"Features: {X.shape[1]} columns")
    print("Target distribution:")
    print(y.value_counts())

    # Handle categorical features
    print("\n[3/7] Encoding categorical features...")
    categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
    print(f"Found {len(categorical_cols)} categorical columns:")
    for col in categorical_cols:
        print(f"  - {col}")

    # astype(str) turns NaN into the literal string 'nan', so missing values
    # in object columns are encoded as their own category here and are no
    # longer visible to the missing-value step below.
    # NOTE(review): encoders are fitted on the full dataset before the split.
    label_encoders = {}
    for col in categorical_cols:
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col].astype(str))
        label_encoders[col] = le

    print("All categorical features encoded")

    # Handle missing values (only non-object columns can still contain NaN)
    print("\n[4/7] Handling missing values...")
    missing_count = X.isnull().sum().sum()
    if missing_count > 0:
        print(f"  Filling {missing_count} missing values with column means")
        X = X.fillna(X.mean())
    else:
        print("  No missing values")

    # Encode target (binary: normal=0, attack=1)
    print("\n[5/7] Encoding target variable...")
    # Create binary target: 'normal' = 0, everything else = 1
    y_binary = y.apply(lambda x: 0 if x == 'normal' else 1)

    print("Binary encoding:")
    print(f"  Normal (0): {(y_binary == 0).sum():,}")
    print(f"  Attack (1): {(y_binary == 1).sum():,}")

    # Calculate imbalance ratio (attack / normal).
    # minlength=2 guarantees both slots exist even if one class is absent;
    # without it class_counts[1] raises IndexError when there are no attacks.
    class_counts = np.bincount(y_binary, minlength=2)
    imbalance_ratio = class_counts[1] / class_counts[0] if class_counts[0] > 0 else 0
    print(f"  Imbalance ratio: {imbalance_ratio:.3f}")

    # Split data
    print("\n[6/7] Splitting data (70% train, 30% test)")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_binary,
        test_size=0.3,
        random_state=42,
        stratify=y_binary
    )

    print(f"Train: {X_train.shape[0]:,} samples")
    print(f"Test:  {X_test.shape[0]:,} samples")

    # Apply SMOTE if severe imbalance (training split only, so the test set
    # keeps the original class distribution)
    if imbalance_ratio < 0.3 or imbalance_ratio > 3.0:
        print(f"\n  Severe class imbalance detected (ratio: {imbalance_ratio:.3f})")

        # The minority can be either class (ratio > 3.0 means class 0 is the
        # rare one), so size k_neighbors from whichever class is smaller.
        minority_count = int(np.bincount(y_train, minlength=2).min())

        if minority_count >= 2:
            print("  Applying SMOTE to balance training data...")

            # NOTE(review): SMOTE interpolates between neighbours, which
            # treats the integer category codes as continuous values;
            # consider SMOTENC for the label-encoded columns.
            smote = SMOTE(random_state=42, k_neighbors=min(5, minority_count - 1))
            X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

            print("  After SMOTE:")
            print(f"    Normal (0): {(y_train_resampled == 0).sum():,}")
            print(f"    Attack (1): {(y_train_resampled == 1).sum():,}")

            X_train = X_train_resampled
            y_train = y_train_resampled
        else:
            # SMOTE needs at least one neighbour within the minority class.
            print("  Skipping SMOTE: fewer than 2 minority samples in the training split")

    # Scale features (statistics come from the training split only)
    print("\n[7/7] Scaling features")
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    print("Features scaled (mean=0, std=1)")

    # Save processed data
    print("\n Saving processed data")
    # Create the output folder on first run so np.save / open do not fail.
    os.makedirs(output_dir, exist_ok=True)
    np.save('data/processed/X_train_ds2os.npy', X_train_scaled)
    np.save('data/processed/X_test_ds2os.npy', X_test_scaled)
    np.save('data/processed/y_train_ds2os.npy', y_train)
    np.save('data/processed/y_test_ds2os.npy', y_test)

    # Save preprocessing objects so the same transforms can be reapplied later
    with open('data/processed/scaler_ds2os.pkl', 'wb') as f:
        pickle.dump(scaler, f)
    with open('data/processed/label_encoders_ds2os.pkl', 'wb') as f:
        pickle.dump(label_encoders, f)

    print("\n" + "="*70)
    print("DS2OS PREPROCESSING COMPLETE")
    print("="*70)
    print(f"Final train shape: {X_train_scaled.shape}")
    print(f"Final test shape:  {X_test_scaled.shape}")
    print(f"Features: {X_train_scaled.shape[1]}")
    print(f"Class distribution (train): Normal={np.sum(y_train==0):,}, Attack={np.sum(y_train==1):,}")
    print(f"Class distribution (test):  Normal={np.sum(y_test==0):,}, Attack={np.sum(y_test==1):,}")

    return X_train_scaled, X_test_scaled, y_train, y_test

In [12]:
# ============================================================================
# PREPROCESS NSL-KDD DATASET
# ============================================================================

def preprocess_nsl_kdd():
    """
    Preprocess the NSL-KDD dataset for binary (normal vs. attack) classification.

    Reads the headerless KDDTrain+.txt / KDDTest+.txt files, drops the
    'difficulty' column, maps 'label' to 0 ('normal') or 1 (anything else),
    label-encodes the three categorical columns, standard-scales the features
    (fitted on the training file only) and saves the arrays plus the fitted
    scaler / encoders under data/processed/.

    Returns:
        tuple: (X_train_scaled, X_test_scaled, y_train, y_test), all NumPy
        arrays; y holds 0 for 'normal' and 1 for every other label.

    Raises:
        FileNotFoundError: if either raw NSL-KDD file is missing.
    """
    output_dir = 'data/processed'

    print("\n" + "="*70)
    print("PREPROCESSING NSL-KDD DATASET")
    print("="*70)

    # NSL-KDD standard column names
    columns = [
        'duration', 'protocol_type', 'service', 'flag', 'src_bytes',
        'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
        'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
        'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
        'num_access_files', 'num_outbound_cmds', 'is_host_login',
        'is_guest_login', 'count', 'srv_count', 'serror_rate',
        'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
        'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
        'dst_host_srv_count', 'dst_host_same_srv_rate',
        'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
        'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
        'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
        'dst_host_srv_rerror_rate', 'label', 'difficulty'
    ]

    # Load files
    print("\n[1/6] Loading data files...")
    train_path = 'data/raw/nsl_kdd/KDDTrain+.txt'
    test_path = 'data/raw/nsl_kdd/KDDTest+.txt'

    # header=None states explicitly that the files carry no header row
    # (this is what pandas already does when `names` is supplied).
    df_train = pd.read_csv(train_path, names=columns, header=None)
    df_test = pd.read_csv(test_path, names=columns, header=None)

    print(f"Train: {df_train.shape[0]:,} rows")
    print(f"Test:  {df_test.shape[0]:,} rows")

    # Remove difficulty column (always present because `names` assigns it)
    print("\n[2/6] Preparing data")
    df_train = df_train.drop(['difficulty'], axis=1)
    df_test = df_test.drop(['difficulty'], axis=1)

    # Binary classification: normal vs attack
    print("\n[3/6] Creating binary labels")
    print("Original label distribution (train):")
    print(df_train['label'].value_counts().head(10))

    df_train['label_binary'] = df_train['label'].apply(lambda x: 0 if x == 'normal' else 1)
    df_test['label_binary'] = df_test['label'].apply(lambda x: 0 if x == 'normal' else 1)

    print("\n Binary labels created:")
    print(f"  Train - Normal: {(df_train['label_binary']==0).sum():,}, Attack: {(df_train['label_binary']==1).sum():,}")
    print(f"  Test  - Normal: {(df_test['label_binary']==0).sum():,}, Attack: {(df_test['label_binary']==1).sum():,}")

    # Separate features and target
    print("\n[4/6] Encoding categorical features...")
    X_train = df_train.drop(['label', 'label_binary'], axis=1)
    X_test = df_test.drop(['label', 'label_binary'], axis=1)
    y_train = df_train['label_binary'].values
    y_test = df_test['label_binary'].values

    # Encode categorical features
    categorical_cols = ['protocol_type', 'service', 'flag']
    label_encoders = {}

    for col in categorical_cols:
        le = LabelEncoder()
        # Fit on combined data to ensure consistent encoding: a category that
        # appears only in the test file would otherwise make transform() fail.
        combined = pd.concat([X_train[col], X_test[col]])
        le.fit(combined)
        X_train[col] = le.transform(X_train[col])
        X_test[col] = le.transform(X_test[col])
        label_encoders[col] = le

    print(f"Encoded {len(categorical_cols)} categorical columns")

    # Convert to numpy arrays
    X_train = X_train.values
    X_test = X_test.values

    # Scale features (statistics come from the training file only)
    print("\n[5/6] Scaling features")
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    print("Features scaled")

    # Save processed data
    print("\n[6/6] Saving processed data")
    # Create the output folder on first run so np.save / open do not fail.
    os.makedirs(output_dir, exist_ok=True)
    np.save('data/processed/X_train_nsl_kdd.npy', X_train_scaled)
    np.save('data/processed/X_test_nsl_kdd.npy', X_test_scaled)
    np.save('data/processed/y_train_nsl_kdd.npy', y_train)
    np.save('data/processed/y_test_nsl_kdd.npy', y_test)

    # Save preprocessing objects so the same transforms can be reapplied later
    with open('data/processed/scaler_nsl_kdd.pkl', 'wb') as f:
        pickle.dump(scaler, f)
    with open('data/processed/label_encoders_nsl_kdd.pkl', 'wb') as f:
        pickle.dump(label_encoders, f)

    print("\n" + "="*70)
    print("NSL-KDD PREPROCESSING COMPLETE!")
    print("="*70)
    print(f"Final train shape: {X_train_scaled.shape}")
    print(f"Final test shape:  {X_test_scaled.shape}")
    print(f"Features: {X_train_scaled.shape[1]}")
    print(f"Class distribution (train): Normal={np.sum(y_train==0):,}, Attack={np.sum(y_train==1):,}")
    print(f"Class distribution (test):  Normal={np.sum(y_test==0):,}, Attack={np.sum(y_test==1):,}")

    return X_train_scaled, X_test_scaled, y_train, y_test

In [13]:
# ============================================================================
# RUN PREPROCESSING
# ============================================================================


import traceback

print("STARTING DATA PREPROCESSING")

# Dataset 1: DS2OS. A failure is reported with its traceback but does not
# stop the cell, so the second dataset is still attempted.
try:
    print("\n\n DATASET 1: DS2OS")
    (X_train_ds2os,
     X_test_ds2os,
     y_train_ds2os,
     y_test_ds2os) = preprocess_ds2os()
    print("DS2OS complete!")
except Exception as err:
    print(f" Error preprocessing DS2OS: {err}")
    traceback.print_exc()

# Dataset 2: NSL-KDD, handled the same way as the first dataset.
try:
    print("\n\n DATASET 2: NSL-KDD")
    (X_train_nsl,
     X_test_nsl,
     y_train_nsl,
     y_test_nsl) = preprocess_nsl_kdd()
    print("NSL-KDD complete!")
except Exception as err:
    print(f"Error preprocessing NSL-KDD: {err}")
    traceback.print_exc()



STARTING DATA PREPROCESSING


 DATASET 1: DS2OS

PREPROCESSING DS2OS DATASET

[1/7] Loading data...
Loaded: 357,952 rows × 13 columns

[2/7] Separating features and target
Features: 12 columns
Target distribution:
normality
normal                           347935
anomalous(DoSattack)               5780
anomalous(scan)                    1547
anomalous(malitiousControl)         889
anomalous(malitiousOperation)       805
anomalous(spying)                   532
anomalous(dataProbing)              342
anomalous(wrongSetUp)               122
Name: count, dtype: int64

[3/7] Encoding categorical features...
Found 11 categorical columns:
  - sourceID
  - sourceAddress
  - sourceType
  - sourceLocation
  - destinationServiceAddress
  - destinationServiceType
  - destinationLocation
  - accessedNodeAddress
  - accessedNodeType
  - operation
  - value
All categorical features encoded

[4/7] Handling missing values...
  No missing values

[5/7] Encoding target variable...
Binary encoding:
  Norm

In [14]:
# ============================================================================
# SUMMARY
# ============================================================================

print("\n\n" + "="*70)
print("PREPROCESSING SUMMARY")
print("="*70)

# List all processed files (arrays and pickled preprocessing objects)
processed_files = [f for f in os.listdir('data/processed') if f.endswith(('.npy', '.pkl'))]
print(f"\n Created {len(processed_files)} files in data/processed/:")
for f in sorted(processed_files):
    size_mb = os.path.getsize(f'data/processed/{f}') / (1024**2)
    print(f"  {f:<45} {size_mb:>8.2f} MB")

# Create summary table: one row per dataset, rebuilt from the saved arrays
# rather than from in-memory variables.
summary_data = []

for dataset in ['ds2os', 'nsl_kdd']:
    try:
        X_train = np.load(f'data/processed/X_train_{dataset}.npy')
        X_test = np.load(f'data/processed/X_test_{dataset}.npy')
        y_train = np.load(f'data/processed/y_train_{dataset}.npy')
        y_test = np.load(f'data/processed/y_test_{dataset}.npy')
    except (OSError, ValueError) as err:
        # Still best-effort (a missing or unreadable dataset does not abort
        # the summary), but say so instead of silently dropping the row.
        print(f"\n Skipping {dataset.upper()} in summary: {err}")
        continue

    summary_data.append({
        'Dataset': dataset.upper(),
        'Train Samples': f"{len(y_train):,}",
        'Test Samples': f"{len(y_test):,}",
        'Features': X_train.shape[1],
        'Train Normal': f"{np.sum(y_train==0):,}",
        'Train Attack': f"{np.sum(y_train==1):,}",
        'Test Normal': f"{np.sum(y_test==0):,}",
        'Test Attack': f"{np.sum(y_test==1):,}"
    })

if summary_data:
    summary_df = pd.DataFrame(summary_data)
    print("\n Dataset Statistics:")
    print(summary_df.to_string(index=False))

    # Save summary (create results/ on first run so to_csv does not fail)
    os.makedirs('results', exist_ok=True)
    summary_df.to_csv('results/preprocessing_summary.csv', index=False)
    print("\n Summary saved to: results/preprocessing_summary.csv")

print("\n")
print("ALL PREPROCESSING COMPLETE!")
print("\nNEXT STEP: Run Model notebook (04_Model.ipynb)")



PREPROCESSING SUMMARY

 Created 12 files in data/processed/:
  X_test_ds2os.npy                                  9.83 MB
  X_test_nsl_kdd.npy                                7.05 MB
  X_train_ds2os.npy                                44.60 MB
  X_train_nsl_kdd.npy                              39.41 MB
  label_encoders_ds2os.pkl                          0.12 MB
  label_encoders_nsl_kdd.pkl                        0.00 MB
  scaler_ds2os.pkl                                  0.00 MB
  scaler_nsl_kdd.pkl                                0.00 MB
  y_test_ds2os.npy                                  0.82 MB
  y_test_nsl_kdd.npy                                0.17 MB
  y_train_ds2os.npy                                 3.72 MB
  y_train_nsl_kdd.npy                               0.96 MB

 Dataset Statistics:
Dataset Train Samples Test Samples  Features Train Normal Train Attack Test Normal Test Attack
  DS2OS       487,108      107,386        12      243,554      243,554     104,381       3,005
NSL_K