In [None]:
import pandas as pd
import numpy as np
import os
import gzip
import pickle
from scipy import sparse
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from imblearn.over_sampling import RandomOverSampler
from sklearn.utils import shuffle
from sklearn.utils.class_weight import compute_class_weight
from sklearn.feature_selection import VarianceThreshold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Precision, Recall

# Command to change directory
# cd /mnt/(path name)

# Command to convert .log.labeled files to .csv
# tr '\t' ',' < "(name).log.labeled" | cut -d ',' -f 1-22 > "(name).csv"

# Replace first 8 lines of .csv file (until #types) with the header below
# timestamp,uid,src_ip,src_port,dest_ip,dest_port,protocol,service,duration,orig_bytes,resp_bytes,conn_state,local_orig,local_resp,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,label

# Load the CSV files.
# The paths are assembled with os.path.join instead of a hard-coded
# backslash: "CSV\Benign.csv" only resolves on Windows, while the joined
# path uses whatever separator the current platform expects.
csv_dir = "CSV"
benign = pd.read_csv(os.path.join(csv_dir, "Benign.csv"))
torii = pd.read_csv(os.path.join(csv_dir, "Torii.csv"))
mirai = pd.read_csv(os.path.join(csv_dir, "Mirai.csv"))

# Assign one integer class id per source file (0 = benign, 1 = Torii,
# 2 = Mirai). Any 'label' column already present in a frame is overwritten.
benign['label'] = 0
torii['label'] = 1
mirai['label'] = 2

# Stack the three frames row-wise into a single dataset
data = pd.concat([benign, torii, mirai], axis=0)

# Shuffle the rows reproducibly and rebuild a clean 0..n-1 index
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# Display the first few rows
data.head()

In [None]:
train_test_split_ratio = 0.2

# Separate the features (X) and the target (y)
X = data.drop('label', axis=1)
y = data['label']

# Numeric columns are standardised; object columns are one-hot encoded.
# NOTE(review): every object-typed column is encoded as-is. If identifier-like
# columns (e.g. the uid / src_ip / dest_ip named in the CSV header comment)
# are present they become features too - confirm that this is intended.
numeric_cols = X.select_dtypes(include=[np.number]).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Cast categoricals to str so missing values become an ordinary category
X[categorical_cols] = X[categorical_cols].astype(str)

# Convert the labels to one-hot encoding
y_encoded = to_categorical(y, num_classes=3)  # 3 classes: Benign, Torii, Mirai

# Split the RAW rows first (80% training, 20% testing). Fitting the scaler,
# encoder and variance filter before splitting would let statistics of the
# test rows leak into the training features and inflate the test scores.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_encoded, test_size=train_test_split_ratio, random_state=42
)

scaler = StandardScaler()
# handle_unknown='ignore': categories seen only in the test rows encode as all-zero
encoder = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
selector = VarianceThreshold(threshold=0.01)

# Fit every transformer on the training rows only ...
X_train_full = sparse.hstack([
    sparse.csr_matrix(scaler.fit_transform(X_train_raw[numeric_cols])),
    encoder.fit_transform(X_train_raw[categorical_cols]),
]).tocsr()

# ... and merely apply the fitted transformers to the test rows.
X_test_full = sparse.hstack([
    sparse.csr_matrix(scaler.transform(X_test_raw[numeric_cols])),
    encoder.transform(X_test_raw[categorical_cols]),
]).tocsr()

# Remove features with low variance (decided on the training rows), then cast
# to float32. The matrices remain scipy sparse matrices here; nothing is
# densified in this cell.
X_train = selector.fit_transform(X_train_full).astype(np.float32)
X_test = selector.transform(X_test_full).astype(np.float32)
print("Reduced feature count:", X_train.shape[1])

# Training labels go back to integer class ids; y_test stays one-hot
y_train = np.argmax(y_train, axis=1)

# Confirm new input dimension
input_dim = X_train.shape[1]
print(f"input_dim: {input_dim}")

In [None]:
# Number of training rows handled per resampling step (tune to available memory)
chunk_size = 1000
output_dir = "resampled_chunks"

# Report the label balance of the training split before any resampling
print("Class distribution before resampling:")
print(pd.Series(y_train).value_counts())

# Randomise the row order before the data is cut into chunks
X_train, y_train = shuffle(X_train, y_train, random_state=42)

# Working directory for the per-chunk files and the merged dataset
os.makedirs(output_dir, exist_ok=True)

# Locations of the merged training arrays written by a later cell
final_X_file, final_y_file = (
    os.path.join(output_dir, merged_name)
    for merged_name in ("final_X.npy", "final_y.npy")
)

# Persist the held-out split so it can be reloaded for validation later
for held_out_name, held_out in (("X_test.npy", X_test), ("y_test.npy", y_test)):
    np.save(os.path.join(output_dir, held_out_name), held_out)

In [None]:
# Initialize RandomOverSampler
ros = RandomOverSampler(sampling_strategy='auto', random_state=42)

# Remove chunk files left behind by an earlier run. The merge cell picks up
# every chunk_X_*.npy.gz it finds in output_dir, so leftovers from a run that
# produced more chunks would silently be merged into the new dataset.
for stale_name in os.listdir(output_dir):
    if stale_name.startswith(("chunk_X_", "chunk_y_")) and stale_name.endswith((".npy", ".npy.gz")):
        os.remove(os.path.join(output_dir, stale_name))

chunk_count = 0    # chunks written so far; also the index used in the file names
skipped_count = 0  # chunks dropped because they held a single class

# Process the training rows chunk_size rows at a time.
# range() never produces an empty row slice and slicing rows keeps the matrix
# two-dimensional, so no separate emptiness/shape guards are needed.
for i in range(0, X_train.shape[0], chunk_size):
    print(f"Processing chunk: {i} to {i + chunk_size}...")
    chunk_X = X_train[i:i + chunk_size]
    chunk_y = y_train[i:i + chunk_size]

    # Oversampling needs at least two classes; such chunks are dropped entirely
    if len(np.unique(chunk_y)) <= 1:
        print(f"Skipping chunk {i} to {i + chunk_size}: Only one class present.")
        skipped_count += 1
        continue

    # Balance the classes inside this chunk
    chunk_X_resampled, chunk_y_resampled = ros.fit_resample(chunk_X, chunk_y)

    # Write each array straight into a gzip stream. This yields the same
    # gzipped .npy payload as saving first and compressing afterwards, without
    # the temporary uncompressed file or a line-wise copy of binary data.
    for prefix, payload in (("chunk_X", chunk_X_resampled), ("chunk_y", chunk_y_resampled)):
        with gzip.open(os.path.join(output_dir, f"{prefix}_{chunk_count}.npy.gz"), 'wb') as f_out:
            np.save(f_out, payload)

    print(f"Saved and compressed chunk {chunk_count} with {chunk_X_resampled.shape[0]} samples.")
    chunk_count += 1

    # Free memory
    del chunk_X, chunk_y, chunk_X_resampled, chunk_y_resampled, payload

print(f"Total chunks saved and compressed: {chunk_count}")
print(f"Chunks skipped (single class): {skipped_count}")

In [None]:
X_shape_total = 0
y_shape_total = 0
input_dim = None  # Number of features, taken from the first valid chunk

# Element types of the merged files. They are fixed here (instead of following
# whatever dtype a chunk happens to have) because the files are raw, header-less
# dumps that are re-opened with exactly these dtypes below and in the training cell.
X_dtype = np.float32
y_dtype = np.int64

# Collect the chunk indices and order them numerically; a plain string sort
# would place chunk 10 before chunk 2.
chunk_prefix = "chunk_X_"
chunk_suffix = ".npy.gz"
chunk_indices = sorted(
    (
        name[len(chunk_prefix):-len(chunk_suffix)]
        for name in os.listdir(output_dir)
        if name.startswith(chunk_prefix)
        and name.endswith(chunk_suffix)
        and name[len(chunk_prefix):-len(chunk_suffix)].isdigit()
    ),
    key=int,
)

# Append every chunk's raw bytes to the merged files. Opening with "wb" starts
# from empty files, and plain appends avoid growing and re-mapping a memmap
# (while the previous mapping is still open) once per chunk.
with open(final_X_file, "wb") as X_out, open(final_y_file, "wb") as y_out:
    for chunk_index in chunk_indices:
        print(f"Merging chunk {chunk_index}...")

        # Decompress before loading.
        # NOTE: allow_pickle=True unpickles arbitrary objects - only use it on
        # chunk files written by this notebook, never on untrusted files.
        with gzip.open(os.path.join(output_dir, f"chunk_X_{chunk_index}.npy.gz"), 'rb') as f:
            chunk_X = np.load(f, allow_pickle=True).item()  # 0-d object array -> stored matrix
            chunk_X = chunk_X.toarray()                     # sparse -> dense ndarray
        with gzip.open(os.path.join(output_dir, f"chunk_y_{chunk_index}.npy.gz"), 'rb') as f:
            chunk_y = np.load(f, allow_pickle=True)

        print(f"Chunk {chunk_index} X shape: {chunk_X.shape}, y shape: {chunk_y.shape}")

        # Skip corrupted or empty chunks, and chunks whose rows/features do not
        # line up (they would misalign features and labels in the merged files)
        if (
            len(chunk_X.shape) != 2
            or chunk_X.shape[0] == 0
            or chunk_y.shape[0] == 0
            or chunk_X.shape[0] != chunk_y.shape[0]
            or (input_dim is not None and chunk_X.shape[1] != input_dim)
        ):
            print(f"Skipping chunk {chunk_index} due to invalid shape.")
            continue

        if input_dim is None:
            # Set input dimension from the first valid chunk
            input_dim = chunk_X.shape[1]

        # Row-major raw bytes: the layout np.memmap expects when reading back
        np.ascontiguousarray(chunk_X, dtype=X_dtype).tofile(X_out)
        np.ascontiguousarray(chunk_y, dtype=y_dtype).tofile(y_out)

        X_shape_total += chunk_X.shape[0]
        y_shape_total += chunk_y.shape[0]
        print(f"Chunk {chunk_index} successfully appended.")

        del chunk_X, chunk_y  # Free memory
        print(f"Final X shape: ({X_shape_total}, {input_dim}), Final y shape: ({y_shape_total},)")

# Fail with a clear message instead of an obscure memmap error further down
if X_shape_total == 0:
    raise RuntimeError(f"No valid resampled chunks found in '{output_dir}'; run the resampling cell first.")

print("All chunks processed and saved to disk!")

# Map the merged files read-only with the dtypes they were written with
X_mmap = np.memmap(final_X_file, dtype=X_dtype, mode='r', shape=(X_shape_total, input_dim))
y_mmap = np.memmap(final_y_file, dtype=y_dtype, mode='r', shape=(y_shape_total,))

print(f"X shape: {X_mmap.shape}")
print(f"y shape: {y_mmap.shape}")

# Class distribution
class_distribution = np.bincount(y_mmap)
print("Class distribution:", dict(enumerate(class_distribution)))

In [None]:
# Assemble the network: three regularised hidden stages followed by a softmax head.
# Every hidden stage is Dense(relu, L2=0.01) -> BatchNormalization -> Dropout(0.3).
hidden_units = (512, 256, 128)

network_layers = []
for position, units in enumerate(hidden_units):
    # Only the very first layer is told the width of the feature vector
    first_layer_kwargs = {"input_dim": input_dim} if position == 0 else {}
    network_layers.append(
        Dense(units, activation='relu', kernel_regularizer=l2(0.01), **first_layer_kwargs)
    )
    network_layers.append(BatchNormalization())
    network_layers.append(Dropout(0.3))

# Output layer, one unit per class: Benign, Torii, Mirai
network_layers.append(Dense(3, activation='softmax'))

model = Sequential(network_layers)

# Adam with a small learning rate; one-hot targets, hence categorical cross-entropy
optimizer = Adam(learning_rate=0.0001)
tracked_metrics = ['accuracy', Precision(), Recall()]
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=tracked_metrics)

# Print the layer-by-layer summary
model.summary()

In [None]:
batch_size = 64
num_classes = 3
model_save_path = "models/my_model.h5"
history_save_path = "metrics/history.pkl"

# Make sure both artifact folders exist before anything is written to them
for artifact_dir in ("models", "metrics"):
    os.makedirs(artifact_dir, exist_ok=True)

# Read-only memory maps over the merged training arrays on disk
X_memmap = np.memmap(final_X_file, dtype=np.float32, mode="r", shape=(X_shape_total, input_dim))
y_memmap = np.memmap(final_y_file, dtype=np.int64, mode="r", shape=(y_shape_total,))

# 'balanced' per-class weights computed from the merged training labels
class_labels = np.unique(y_memmap)
class_weights_array = compute_class_weight(class_weight='balanced', classes=class_labels, y=y_memmap)
class_weight_dict = {label: weight for label, weight in zip(class_labels, class_weights_array)}
print("Class weights:", class_weight_dict)

# Load the held-out features fully into memory (this is np.load, not a memmap).
# allow_pickle=True lets np.load return pickled objects - only use it on files
# written by this notebook.
X_test = np.load(os.path.join(output_dir, "X_test.npy"), allow_pickle=True)

# A 0-d array wraps a single Python object: unwrap it and densify it when it
# offers a toarray() method
if X_test.shape == ():
    X_test = X_test.item()
    if hasattr(X_test, 'toarray'):
        X_test = X_test.toarray()
X_test = X_test.astype(np.float32)

# Accept either one-hot rows (2-D) or plain class ids (1-D) and end up one-hot
y_test = np.load(os.path.join(output_dir, "y_test.npy"))
if y_test.ndim == 2:
    y_test_ids = np.argmax(y_test, axis=1)
else:
    y_test_ids = y_test
y_test = to_categorical(y_test_ids, num_classes=num_classes)

# Train using batch generator
def data_generator(X_memmap, y_memmap, batch_size=batch_size, class_weights=None):
    """Yield shuffled training batches forever.

    Args:
        X_memmap: Array-like of features, indexed by row.
        y_memmap: Array-like of integer class ids, aligned with ``X_memmap``.
        batch_size: Maximum number of rows per batch; the last batch of a
            pass may be smaller.
        class_weights: Optional mapping from class id to weight. When given
            (and non-empty) a per-sample weight array is yielded as well.

    Yields:
        ``(X_batch, y_batch)`` or ``(X_batch, y_batch, sample_weights)``,
        where ``y_batch`` is one-hot encoded with ``num_classes`` columns.
    """
    n_rows = X_memmap.shape[0]
    order = np.arange(n_rows)
    while True:
        # New random row order for every pass over the data
        np.random.shuffle(order)
        for start in range(0, n_rows, batch_size):
            picked = order[start:start + batch_size]
            features = X_memmap[picked]
            labels = y_memmap[picked]
            targets = to_categorical(labels, num_classes=num_classes)

            if not class_weights:
                yield features, targets
                continue

            # Look up the weight of each sample's class
            weights = np.array([class_weights[label] for label in labels])
            yield features, targets, weights

def val_data_generator(X_val_memmap, y_val_memmap, batch_size=batch_size):
    """Yield validation batches forever, in fixed row order.

    Args:
        X_val_memmap: Array-like of validation features, indexed by row.
        y_val_memmap: Array-like of validation targets, aligned with
            ``X_val_memmap``; passed through unchanged.
        batch_size: Maximum number of rows per batch; the last batch of a
            pass may be smaller.

    Yields:
        ``(X_batch, y_batch)`` tuples; after the final batch the sequence
        starts again from the first row.
    """
    n_rows = X_val_memmap.shape[0]
    row_ids = np.arange(n_rows)
    while True:
        start = 0
        while start < n_rows:
            window = row_ids[start:start + batch_size]
            yield X_val_memmap[window], y_val_memmap[window]
            start += batch_size

# Stop once val_loss has not improved for 10 epochs and roll back to the best weights
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Both generators walk their array in ceil(n / batch_size) batches per pass
# (the last batch may be partial). A plain Python generator cannot be rewound,
# so with floor division the leftover batch spilled into the next epoch: the
# validation window drifted by one batch every epoch (val_loss was measured on
# a different subset each time) and validation_steps became 0 for test sets
# smaller than one batch. Rounding up makes one epoch equal exactly one pass.
steps_per_epoch = int(np.ceil(X_memmap.shape[0] / batch_size))
validation_steps = int(np.ceil(X_test.shape[0] / batch_size))

# Model Training
history = model.fit(
    data_generator(X_memmap, y_memmap, batch_size, class_weights=class_weight_dict),
    steps_per_epoch=steps_per_epoch,
    epochs=100,
    validation_data=val_data_generator(X_test, y_test, batch_size),
    validation_steps=validation_steps,
    callbacks=[early_stopping],
)

# Save the trained model and the per-epoch metric history
model.save(model_save_path)
with open(history_save_path, "wb") as f:
    pickle.dump(history.history, f)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, precision_recall_curve, average_precision_score, accuracy_score

# Class ids and display names, in label order. Passing the ids explicitly to
# the sklearn helpers keeps every report/matrix at num_classes entries even if
# a class never occurs in the test labels or in the predictions.
class_ids = list(range(num_classes))
class_names = ['Benign', 'Torii', 'Mirai']

# Evaluate the model
y_pred = model.predict(X_test)
y_pred_labels = np.argmax(y_pred, axis=1)
y_true_labels = np.argmax(y_test, axis=1)

# Print classification report (Precision, Recall, F1-score)
print("Final Classification Report:")
print(classification_report(y_true_labels, y_pred_labels, labels=class_ids, zero_division=0))

accuracy = accuracy_score(y_true_labels, y_pred_labels)
print(f"\nFinal Accuracy on Test Set: {accuracy:.4f}")

# Plot the confusion matrix; labels= guarantees one row/column per display label
cm = confusion_matrix(y_true_labels, y_pred_labels, labels=class_ids)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
disp.plot(cmap='viridis')
plt.title("Confusion Matrix")
plt.show()

# Normalized Confusion Matrix (each row divided by its number of true samples).
# Rows without any true sample stay at 0 instead of turning into NaN.
row_totals = cm.sum(axis=1, keepdims=True)
cm_normalized = np.divide(
    cm, row_totals, out=np.zeros(cm.shape, dtype=float), where=row_totals != 0
)
plt.figure(figsize=(6, 5))
sns.heatmap(cm_normalized, annot=True, cmap='Blues', fmt='.2f',
            xticklabels=class_names,
            yticklabels=class_names)
plt.title("Normalized Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

# Precision-Recall Curve: one-vs-rest per class, using the predicted
# probability of that class as the score
precision = {}
recall = {}
average_precision = {}
for i in range(num_classes):
    precision[i], recall[i], _ = precision_recall_curve(y_test[:, i], y_pred[:, i])
    average_precision[i] = average_precision_score(y_test[:, i], y_pred[:, i])
    plt.plot(recall[i], precision[i], marker='.', label=f'Class {i} (AP={average_precision[i]:.2f})')

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curves for Each Class')
plt.legend()
plt.grid(True)
plt.show()

# Summary Table. With labels= the report always has a '0'..'num_classes-1'
# entry, so the lookups below cannot raise KeyError for an absent class.
print(f"\n{'Class':<10}{'Precision':<10}{'Recall':<10}{'F1-score':<10}{'AP':<10}")
report = classification_report(
    y_true_labels, y_pred_labels, labels=class_ids, output_dict=True, zero_division=0
)
for i in range(num_classes):
    p = report[str(i)]['precision']
    r = report[str(i)]['recall']
    f1 = report[str(i)]['f1-score']
    ap = average_precision[i]
    print(f"{i:<10}{p:<10.2f}{r:<10.2f}{f1:<10.2f}{ap:<10.2f}")