In [None]:
import pandas as pd
import numpy as np
import os
import gzip
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn.utils import shuffle
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Precision, Recall
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, precision_recall_curve, average_precision_score, precision_recall_fscore_support

# Command to convert .log.labeled files to .csv
# tr '\t' ',' < "(name).log.labeled" | cut -d ',' -f 1-22 > "(name).csv"

# Command to change directory
# cd /mnt/(path name)

# Replace first 8 lines of .csv file (until #types) with the header below
# timestamp,uid,src_ip,src_port,dest_ip,dest_port,protocol,service,duration,orig_bytes,resp_bytes,conn_state,local_orig,local_resp,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,label

# Load the CSV files.
# The paths are assembled with os.path.join instead of a hard-coded backslash:
# on Windows the result is identical ("CSV\Benign.csv"), while on Linux/macOS a
# literal backslash would be treated as part of the file name and the read
# would fail.
csv_dir = "CSV"
benign = pd.read_csv(os.path.join(csv_dir, "Benign.csv"))
torii = pd.read_csv(os.path.join(csv_dir, "Torii.csv"))
mirai = pd.read_csv(os.path.join(csv_dir, "Mirai.csv"))

# Assign (or overwrite) the label column of each dataset with its integer
# class id: 0 = Benign, 1 = Torii, 2 = Mirai
benign['label'] = 0
torii['label'] = 1
mirai['label'] = 2

# Concatenate the datasets row-wise
data = pd.concat([benign, torii, mirai], axis=0)

# Shuffle the dataset (fixed seed for reproducibility) and renumber the rows
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# Display the first few rows
data.head()

In [None]:
# Separate the features (X) and the target (y)
X = data.drop('label', axis=1)
y = data['label']

# Identify the column groups up front, before encoding changes any dtype
numeric_cols = X.select_dtypes(include=[np.number]).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Label encode categorical columns. Values are cast to str first, so missing
# values end up as their own 'nan' category instead of being dropped later.
# Note: the single encoder is refitted per column, so after the loop it only
# remembers the classes of the last column.
label_encoder = LabelEncoder()
for col in categorical_cols:
    X[col] = label_encoder.fit_transform(X[col].astype(str))

# One-hot encode with sparse columns. This is done on the full dataset so that
# the training and test frames are guaranteed to share identical columns.
X_encoded = pd.get_dummies(X, columns=categorical_cols, sparse=True, drop_first=True)

# Convert the labels to one-hot encoding
y_encoded = to_categorical(y, num_classes=3)  # 3 classes: Benign, Torii, Mirai

# Split the data into training and testing sets (80% training, 20% testing).
# stratify=y keeps the class proportions equal in both splits, which matters
# because the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y_encoded, test_size=0.2, random_state=42, stratify=y
)

# Normalize only the numeric columns. The scaler is fitted on the TRAINING
# split alone and then applied to the test split; fitting it on the full
# dataset would leak test-set statistics into training (data leakage).
scaler = StandardScaler()
X_train = X_train.copy()  # explicit copies so the assignments below are safe
X_test = X_test.copy()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

In [None]:
# Report how many training samples each class has before any resampling
print("Original training set class distribution:")
print(pd.Series(y_train.argmax(axis=1)).value_counts())

# Randomize the row order ahead of chunking (fixed seed)
X_train, y_train = shuffle(X_train, y_train, random_state=42)

# Number of rows handled per resampling chunk
chunk_size = 1000  # Adjust this based on memory constraints

# Directory that holds the processed chunks; created if it does not exist yet
output_dir = "resampled_chunks"
os.makedirs(output_dir, exist_ok=True)

# Locations of the final merged dataset files inside that directory
final_X_file, final_y_file = (
    os.path.join(output_dir, file_name) for file_name in ("final_X.npy", "final_y.npy")
)

In [None]:
def _save_compressed(path, array):
    """Write ``array`` in .npy format directly into the gzip file at ``path``.

    Saving straight into the gzip handle avoids creating an uncompressed
    temporary file and copying it over afterwards.
    """
    with gzip.open(path, 'wb') as f_out:
        np.save(f_out, array)


# Remove chunk files left behind by an earlier run. The merge step reads every
# chunk_X_*.npy.gz in the directory, so stale files (e.g. from a run with a
# different chunk_size) would otherwise be merged into the training set.
for stale_name in os.listdir(output_dir):
    if stale_name.startswith(("chunk_X_", "chunk_y_")) and stale_name.endswith(".npy.gz"):
        os.remove(os.path.join(output_dir, stale_name))

# Initialize RandomOverSampler
ros = RandomOverSampler(sampling_strategy='auto', random_state=42)

chunk_count = 0

# Process each chunk
for i in range(0, X_train.shape[0], chunk_size):
    print(f"Processing chunk: {i} to {i + chunk_size}...")
    chunk_X = X_train[i:i + chunk_size]
    # Back from one-hot rows to integer class ids for the sampler
    chunk_y = np.argmax(y_train[i:i + chunk_size], axis=1)

    # Skip chunks with only one class (there is nothing to balance against).
    # Note that the rows of such a chunk are not written to disk at all.
    if len(np.unique(chunk_y)) <= 1:
        print(f"Skipping chunk {i} to {i + chunk_size}: Only one class present.")
        continue

    # Skip empty chunks
    if chunk_X.size == 0 or chunk_y.size == 0:
        print(f"Skipping chunk {i} to {i + chunk_size}: No valid data.")
        continue

    # Ensure chunk_X has two dimensions
    if len(chunk_X.shape) != 2:
        print(f"Skipping chunk {i} to {i + chunk_size}: Unexpected shape {chunk_X.shape}.")
        continue

    # Apply RandomOverSampler to the chunk
    chunk_X_resampled, chunk_y_resampled = ros.fit_resample(chunk_X, chunk_y)

    # Store plain arrays with fixed dtypes: the cells that read the data back
    # interpret the raw bytes as float64 features and int64 labels.
    chunk_X_resampled = np.asarray(chunk_X_resampled, dtype=np.float64)
    chunk_y_resampled = np.asarray(chunk_y_resampled, dtype=np.int64)

    # Ensure resampled data is not empty
    if chunk_X_resampled.size == 0 or chunk_y_resampled.size == 0:
        print(f"Skipping resampled chunk {chunk_count}: No valid data after resampling.")
        continue

    # Save the resampled chunk as gzip-compressed .npy files
    _save_compressed(os.path.join(output_dir, f"chunk_X_{chunk_count}.npy.gz"), chunk_X_resampled)
    _save_compressed(os.path.join(output_dir, f"chunk_y_{chunk_count}.npy.gz"), chunk_y_resampled)

    print(f"Saved and compressed chunk {chunk_count} with {chunk_X_resampled.shape[0]} samples.")
    chunk_count += 1

    # Free memory
    del chunk_X, chunk_y, chunk_X_resampled, chunk_y_resampled

print(f"Total chunks saved and compressed: {chunk_count}")

In [None]:
# Running totals of rows written to the merged files, plus the feature count
X_shape_total = 0
y_shape_total = 0
input_dim = None  # number of feature columns, taken from the first valid chunk


def _chunk_number(file_name):
    """Return the index part of a chunk file name ('chunk_X_12.npy.gz' -> '12')."""
    return file_name.split("_")[-1].split(".")[0]


# Collect the X chunk files and order them numerically; a plain sorted() is
# lexicographic and would place chunk 10 before chunk 2.
chunk_X_names = sorted(
    (
        name for name in os.listdir(output_dir)
        if name.startswith("chunk_X_") and name.endswith(".npy.gz") and _chunk_number(name).isdigit()
    ),
    key=lambda name: int(_chunk_number(name)),
)

# Load and merge all chunks
for file in chunk_X_names:
    chunk_index = _chunk_number(file)
    # Decompress the chunk files while loading
    with gzip.open(os.path.join(output_dir, f"chunk_X_{chunk_index}.npy.gz"), 'rb') as f:
        chunk_X = np.load(f)
    with gzip.open(os.path.join(output_dir, f"chunk_y_{chunk_index}.npy.gz"), 'rb') as f:
        chunk_y = np.load(f)

    print(f"Chunk {chunk_index} X shape: {chunk_X.shape}, y shape: {chunk_y.shape}")

    # Skip corrupted or empty chunks
    if len(chunk_X.shape) != 2 or chunk_X.shape[0] == 0 or chunk_y.shape[0] == 0:
        print(f"Skipping chunk {chunk_index} due to invalid shape.")
        continue

    # Features and labels must stay aligned row by row
    if chunk_X.shape[0] != chunk_y.shape[0]:
        print(f"Skipping chunk {chunk_index} due to invalid shape.")
        continue

    # The merged files are raw buffers that are later read back as float64 /
    # int64, so every chunk is written with exactly those dtypes.
    chunk_X = np.asarray(chunk_X, dtype=np.float64)
    chunk_y = np.asarray(chunk_y, dtype=np.int64)

    if input_dim is None:
        input_dim = chunk_X.shape[1]
    elif chunk_X.shape[1] != input_dim:
        print(f"Skipping chunk {chunk_index} due to invalid shape.")
        continue

    # Row range this chunk occupies in the merged files. The counters are
    # advanced for EVERY chunk, including the first one, so that later chunks
    # are appended after it instead of overwriting it.
    X_shape_old = X_shape_total
    y_shape_old = y_shape_total
    X_shape_total += chunk_X.shape[0]
    y_shape_total += chunk_y.shape[0]

    # "w+" creates/truncates the file for the first chunk; "r+" reopens the
    # existing file with the larger shape for all following chunks.
    mode = "w+" if X_shape_old == 0 else "r+"
    X_memmap = np.memmap(final_X_file, dtype=np.float64, mode=mode, shape=(X_shape_total, input_dim))
    y_memmap = np.memmap(final_y_file, dtype=np.int64, mode=mode, shape=(y_shape_total,))

    # Append new data
    X_memmap[X_shape_old:] = chunk_X
    y_memmap[y_shape_old:] = chunk_y
    X_memmap.flush()
    y_memmap.flush()

    print(f"Final X shape: {X_memmap.shape}, Final y shape: {y_memmap.shape}")

    # Free memory and release the mappings before the files are reopened
    del chunk_X, chunk_y, X_memmap, y_memmap

if X_shape_total == 0:
    raise RuntimeError(
        f"No valid resampled chunks were found in {output_dir!r}; run the resampling cell first."
    )

print("All chunks processed and saved to disk!")

# Final dataset loading
print("Loading final dataset into memory (only when needed)...")
X_train_resampled = np.memmap(final_X_file, dtype=np.float64, mode="r", shape=(X_shape_total, input_dim))
y_train_resampled = np.memmap(final_y_file, dtype=np.int64, mode="r", shape=(y_shape_total,))

# Verify shapes before training
print(f"X_train_resampled shape: {X_train_resampled.shape}")
print(f"y_train_resampled shape: {y_train_resampled.shape}")

# Convert labels to categorical (one-hot encoding)
y_train_resampled_encoded = to_categorical(y_train_resampled, num_classes=3)

# Verify categorical encoding
print(f"y_train_resampled_encoded shape: {y_train_resampled_encoded.shape}")

# Compute class weights to handle any imbalance left after resampling.
# Keys/values are converted to plain Python int/float.
resampled_labels = np.asarray(y_train_resampled)
classes_present = np.unique(resampled_labels)
class_weights = compute_class_weight('balanced', classes=classes_present, y=resampled_labels)
class_weight_dict = {int(cls): float(weight) for cls, weight in zip(classes_present, class_weights)}
print("Class weights:", class_weight_dict)

In [None]:
# Number of input features. It is derived here from the training frame so the
# model definition does not rely on a variable that no cell assigns.
input_dim = X_train.shape[1]

# Define the model architecture: three L2-regularized hidden layers, each
# followed by dropout, and a softmax output over the three classes.
# The "l2" string selects Keras' built-in L2 regularizer with its default strength.
model = Sequential([
    Dense(512, input_dim=input_dim, activation='relu', kernel_regularizer="l2"),
    Dropout(0.3),
    Dense(256, activation='relu', kernel_regularizer="l2"),
    Dropout(0.3),
    Dense(128, activation='relu', kernel_regularizer="l2"),
    Dropout(0.3),
    Dense(3, activation='softmax')  # 3 classes: Benign, Torii, Mirai
])

# Compile the model with a lower learning rate
model.compile(optimizer=Adam(learning_rate=0.0001), loss='categorical_crossentropy', metrics=['accuracy', Precision(), Recall()])

# Display model summary
model.summary()

In [None]:
# Define early stopping: stop once val_loss has not improved for 10 epochs and
# roll back to the best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Batch size shared by the generator, the step count and validation
BATCH_SIZE = 64


# Train using memory-efficient batch generator
def data_generator(X_file, y_file, batch_size=64, class_weight=None):
    """Yield training batches forever from the memory-mapped merged dataset.

    Args:
        X_file: Path of the raw float64 feature file; it is mapped with shape
            (X_shape_total, input_dim), both read from the notebook globals.
        y_file: Path of the raw int64 label file with y_shape_total entries.
        batch_size: Maximum number of rows per batch; the last batch of each
            pass may be smaller.
        class_weight: Optional mapping {class id: weight}. When given, every
            batch additionally carries one weight per sample, looked up from
            the sample's class (classes missing from the mapping get 1.0).

    Yields:
        (X_batch, y_batch) or, when class_weight is given,
        (X_batch, y_batch, sample_weight_batch). Labels are one-hot encoded
        over 3 classes.
    """
    X_memmap = np.memmap(X_file, dtype=np.float64, mode="r", shape=(X_shape_total, input_dim))
    y_memmap = np.memmap(y_file, dtype=np.int64, mode="r", shape=(y_shape_total,))
    labels = np.asarray(y_memmap)
    y_memmap_encoded = to_categorical(labels, num_classes=3)

    sample_weights = None
    if class_weight is not None:
        # Vectorized lookup: position k of the table holds the weight of class k
        weight_table = np.array([class_weight.get(k, 1.0) for k in range(3)], dtype=np.float32)
        sample_weights = weight_table[labels]

    # NOTE(review): batches are read in file order, i.e. in the order the
    # resampled chunks were written; consider shuffling if that order turns
    # out to be class-sorted within chunks.
    while True:  # Infinite loop for generator
        for start in range(0, X_shape_total, batch_size):
            stop = start + batch_size
            X_batch = np.array(X_memmap[start:stop])  # copy the slice out of the mapping
            y_batch = y_memmap_encoded[start:stop]
            if sample_weights is None:
                yield X_batch, y_batch
            else:
                yield X_batch, y_batch, sample_weights[start:stop]


# Round up so one epoch is exactly one full pass, including the final partial batch
steps_per_epoch = (X_shape_total + BATCH_SIZE - 1) // BATCH_SIZE

# Train the model with a generator.
# - `epochs` is given once (it was passed twice, which is a syntax error); 100
#   is kept because early stopping with patience=10 needs more than 5 epochs.
# - `batch_size` is not passed to fit(): the generator already produces
#   batches, and Keras rejects the argument for generator inputs.
# - Class weights are applied through the per-sample weights yielded by the
#   generator, because `class_weight` is not accepted for Python generators
#   in every Keras version.
history = model.fit(
    data_generator(final_X_file, final_y_file, batch_size=BATCH_SIZE, class_weight=class_weight_dict),
    steps_per_epoch=steps_per_epoch,
    epochs=100,
    validation_data=(X_test, y_test),
    validation_batch_size=BATCH_SIZE,
    callbacks=[early_stopping]
)

In [None]:
# Evaluate the model: predicted probabilities, then predicted / true class ids
y_pred = model.predict(X_test)
y_true_labels = y_test.argmax(axis=1)
y_pred_labels = y_pred.argmax(axis=1)

# Print classification report (Precision, Recall, F1-score)
print("Final Classification Report:")
report_text = classification_report(y_true_labels, y_pred_labels)
print(report_text)

# Plot the confusion matrix
cm = confusion_matrix(y_true_labels, y_pred_labels)
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=['Benign', 'Torii', 'Mirai'],
)
disp.plot(cmap='viridis')
plt.show()

# Per-class precision-recall curves and average precision scores
# (one-vs-rest: column k of the one-hot labels against column k of the scores)
precision, recall, average_precision = {}, {}, {}

for class_id in range(3):  # 3 classes: 0 (Benign), 1 (Torii), 2 (Mirai)
    class_truth = y_test[:, class_id]
    class_scores = y_pred[:, class_id]
    precision[class_id], recall[class_id], _ = precision_recall_curve(class_truth, class_scores)
    average_precision[class_id] = average_precision_score(class_truth, class_scores)

# Draw one precision-recall curve per class on a shared figure
for class_id, ap_value in average_precision.items():
    plt.plot(
        recall[class_id],
        precision[class_id],
        marker='.',
        label=f'Class {class_id} (AP={ap_value:.2f})',
    )

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curves for Each Class')
plt.legend()
plt.show()

# Print average precision scores for each class
for class_id, ap_value in average_precision.items():
    print(f'Average Precision for Class {class_id}: {ap_value:.2f}')