In [1]:
import pandas as pd

# Load the prepared feature table. The path is relative to the notebook's
# working directory.
df = pd.read_parquet("../Datasets/final_prepared_dataset.parquet")

print("Shape of dataset:", df.shape)
print("\nColumn names:\n", df.columns.tolist())

# The heading used to be printed with nothing underneath it because the
# preview call was commented out. Show the rows, with every column visible.
print("\nFirst few rows:")
with pd.option_context("display.max_columns", None, "display.width", 200):
    print(df.head())

# Number of rows per value of the binary `label` column.
print("\nClass distribution:")
print(df['label'].value_counts())

# Number of rows per value of the `family` column.
print("\nFamily distribution:")
print(df['family'].value_counts())


Shape of dataset: (194466, 26)

Column names:
 ['family', 'window_id', 'read_throughput', 'read_lba_var', 'write_throughput', 'write_lba_var', 'write_entropy', 'mem_read_count_4k', 'mem_read_count_2m', 'mem_read_count_mmio', 'mem_read_gpa_var', 'mem_write_entropy', 'mem_write_count_4k', 'mem_write_count_2m', 'mem_write_count_mmio', 'mem_write_gpa_var', 'mem_readwrite_entropy', 'mem_readwrite_count_4k', 'mem_readwrite_count_2m', 'mem_readwrite_count_mmio', 'mem_readwrite_gpa_var', 'mem_exec_count_4k', 'mem_exec_count_2m', 'mem_exec_count_mmio', 'mem_exec_gpa_var', 'label']

First few rows:

Class distribution:
label
0    101443
1     93023
Name: count, dtype: int64

Family distribution:
family
Firefox     20207
AESCrypt    19557
Zip         18295
LockBit     18093
Conti       17739
SDelete     17332
Darkside    16771
REvil       15296
WannaCry    14488
Office      13468
Idle        12584
Ryuk        10636
Name: count, dtype: int64


In [2]:
import numpy as np
from tqdm import tqdm

# Number of consecutive rows that make up one model input sequence.
SEQUENCE_LENGTH = 40

# Order rows by window_id inside each family so that every window below is a
# run of consecutive rows from a single family.
df_sorted = df.sort_values(by=["family", "window_id"]).reset_index(drop=True)

# Every column that is not an identifier or the target is used as a feature.
non_feature_cols = {"family", "window_id", "label"}
feature_cols = [name for name in df.columns if name not in non_feature_cols]

sequences, sequence_labels, sequence_families = [], [], []

for family_name, family_rows in tqdm(df_sorted.groupby("family")):
    feature_matrix = family_rows[feature_cols].values
    row_labels = family_rows["label"].values

    # Slide a SEQUENCE_LENGTH-row window over this family only. `end` is the
    # exclusive end of the window, and the window takes the label of its last
    # row. A family with fewer than SEQUENCE_LENGTH rows yields no windows.
    for end in range(SEQUENCE_LENGTH, len(family_rows) + 1):
        sequences.append(feature_matrix[end - SEQUENCE_LENGTH:end])
        sequence_labels.append(row_labels[end - 1])
        sequence_families.append(family_name)

# Stack the per-window pieces into arrays that share the same first axis.
sequences = np.array(sequences)
sequence_labels = np.array(sequence_labels)
sequence_families = np.array(sequence_families)

print("Total sequences generated:", sequences.shape[0])
print("Shape of each sequence:", sequences.shape[1:])
print("Sequence labels distribution:")
label_values, label_counts = np.unique(sequence_labels, return_counts=True)
print(dict(zip(label_values, label_counts)))


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 82.19it/s]


Total sequences generated: 193998
Shape of each sequence: (40, 23)
Sequence labels distribution:
{np.int64(0): np.int64(101209), np.int64(1): np.int64(92789)}


In [3]:
from collections import Counter

# Candidate train/validation family assignments, keyed by fold number.
# NOTE(review): folds 2 and 3 as written list 'Conti', 'Idle' and 'REvil',
# which are also the held-out test families below. The checks further down
# reject such a fold instead of silently producing an empty test split.
family_aware_folds = {
    1: {
        "train_families": ['LockBit', 'WannaCry', 'Office', 'Zip', 'SDelete', 'AESCrypt'],
        "val_families": ['Firefox', 'Ryuk', 'Darkside']
    },
    2: {
        "train_families": ['Conti', 'Firefox', 'Idle', 'LockBit', 'REvil', 'Ryuk'],
        "val_families": ['Darkside', 'Office', 'Zip']
    },
    3: {
        "train_families": ['Darkside', 'Firefox', 'Office', 'REvil', 'Ryuk', 'Zip'],
        "val_families": ['Conti', 'LockBit', 'Idle']
    }
}

FOLD_NUMBER = 1

current_fold = family_aware_folds[FOLD_NUMBER]
train_families = current_fold["train_families"]
val_families = current_fold["val_families"]

# Families reserved for the final test split, the same for every fold.
test_families = ['Conti', 'Idle', 'REvil']

print(f"📌 Using Fold {FOLD_NUMBER}")
print("Train Families:", train_families)
print("Validation Families:", val_families)
print("Test Families:", test_families)

# A family that appears in two splits leaks data between them, so fail loudly.
family_sets = {
    "train": set(train_families),
    "validation": set(val_families),
    "test": set(test_families),
}
split_names = list(family_sets)
for position, first_split in enumerate(split_names):
    for second_split in split_names[position + 1:]:
        shared_families = family_sets[first_split] & family_sets[second_split]
        if shared_families:
            raise ValueError(
                f"Fold {FOLD_NUMBER}: families {sorted(shared_families)} appear in both "
                f"the {first_split} and {second_split} splits"
            )

# A misspelled family, or one with no generated sequences, would otherwise
# just shrink a split without any warning.
known_families = set(np.unique(sequence_families).tolist())
unknown_families = set().union(*family_sets.values()) - known_families
if unknown_families:
    raise ValueError(
        f"Fold {FOLD_NUMBER}: no sequences found for families {sorted(unknown_families)}"
    )

# Positions of the sequences belonging to each split, in ascending order.
# The splits are disjoint (checked above), so no precedence rule is needed.
train_indices = np.flatnonzero(np.isin(sequence_families, train_families))
val_indices = np.flatnonzero(np.isin(sequence_families, val_families))
test_indices = np.flatnonzero(np.isin(sequence_families, test_families))

X_family_train = sequences[train_indices]
y_family_train = sequence_labels[train_indices]

X_family_val = sequences[val_indices]
y_family_val = sequence_labels[val_indices]

X_family_test = sequences[test_indices]
y_family_test = sequence_labels[test_indices]



def print_split(name, X, y, families):
    """Print the size, class balance and member families of one data split.

    Args:
        name: Label shown in the header line.
        X: Array whose first dimension is the number of sequences.
        y: Iterable of class labels; 0 is reported as benign, 1 as ransomware.
        families: Family names belonging to the split, printed as given.
    """
    tally = Counter(y)  # a class that is absent simply counts as 0
    summary_lines = [
        f"\n{name} → {X.shape[0]} sequences",
        f"  - Benign: {tally[0]} sequences",
        f"  - Ransomware: {tally[1]} sequences",
        f"  Families: {families}",
    ]
    print("\n".join(summary_lines))



📌 Using Fold 1
Train Families: ['LockBit', 'WannaCry', 'Office', 'Zip', 'SDelete', 'AESCrypt']
Validation Families: ['Firefox', 'Ryuk', 'Darkside']
Test Families: ['Conti', 'Idle', 'REvil']


In [4]:
# Report the size, class balance and member families of each split.
print("📌 Family-Aware Split Summary\n")

split_summaries = (
    ("Train (Seen Families):", "Train", X_family_train, y_family_train, train_families),
    ("Validation (Unseen Families):", "Validation", X_family_val, y_family_val, val_families),
    ("Test (Unseen Families):", "Test", X_family_test, y_family_test, test_families),
)
for heading, split_name, split_X, split_y, split_families in split_summaries:
    print(heading)
    print_split(split_name, split_X, split_y, split_families)


📌 Family-Aware Split Summary

Train (Seen Families):

Train → 100999 sequences
  - Benign: 68496 sequences
  - Ransomware: 32503 sequences
  Families: ['LockBit', 'WannaCry', 'Office', 'Zip', 'SDelete', 'AESCrypt']
Validation (Unseen Families):

Validation → 47497 sequences
  - Benign: 20168 sequences
  - Ransomware: 27329 sequences
  Families: ['Firefox', 'Ryuk', 'Darkside']
Test (Unseen Families):

Test → 45502 sequences
  - Benign: 12545 sequences
  - Ransomware: 32957 sequences
  Families: ['Conti', 'Idle', 'REvil']


In [6]:
from tensorflow.keras import Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import InputLayer, Conv1D, MaxPooling1D, LSTM, Dense

# Take the input shape from the training tensor instead of repeating the
# window length and feature count as literals, so the model cannot drift out
# of sync with the sequence-generation step.
SEQ_LEN, NUM_FEATURES = X_family_train.shape[1:]

# CNN front end followed by two stacked LSTMs and a sigmoid output unit.
# `Input(shape=...)` replaces `InputLayer(input_shape=...)`, whose
# `input_shape` argument is deprecated in Keras 3.
model = Sequential([
    Input(shape=(SEQ_LEN, NUM_FEATURES)),                      # (SEQ_LEN, NUM_FEATURES)
    Conv1D(filters=64, kernel_size=2, activation='relu'),      # Conv1D over time -> SEQ_LEN - 1 steps
    MaxPooling1D(pool_size=2),                                 # Halve the time axis (40 -> 39 -> 19 steps)
    LSTM(256, return_sequences=True),                          # First LSTM layer, emits every step
    LSTM(256, return_sequences=False),                         # Second LSTM, emits only the last step
    Dense(1, activation='sigmoid')                             # Binary output
])

model.summary()


In [7]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import Accuracy, Precision, Recall
from tensorflow.keras.metrics import BinaryAccuracy
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import matplotlib.pyplot as plt

# ─── Compiling the Model ───
# Binary cross-entropy objective with Adam. Accuracy, precision and recall
# are tracked under short names so the history keys are 'accuracy',
# 'precision' and 'recall' (plus their 'val_' counterparts).
tracked_metrics = [
    BinaryAccuracy(name='accuracy'),
    Precision(name='precision'),
    Recall(name='recall'),
]
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss=BinaryCrossentropy(),
    metrics=tracked_metrics,
)

# ─── Callbacks ───
checkpoint_path = "cnn_lstm_best_weights.h5"

# Stop once val_loss has gone 5 epochs without improving, then roll the model
# back to the best epoch seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
    verbose=1,
)

# Write a checkpoint whenever val_loss reaches a new minimum.
# NOTE(review): save_weights_only is not set, so despite the file name this
# presumably stores the whole model rather than weights only — confirm.
best_checkpoint = ModelCheckpoint(
    filepath=checkpoint_path,
    monitor='val_loss',
    save_best_only=True,
    verbose=1,
)

callbacks = [early_stopping, best_checkpoint]

In [8]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Classes present in the training labels, in sorted order (np.unique).
train_classes = np.unique(y_family_train)

# "Balanced" weights give a class more weight the rarer it is in the training
# split. They are returned in the same order as `train_classes`.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=train_classes,
    y=y_family_train
)

# Key each weight by its class label, not by its position. Positional keys
# (enumerate) are only right when the labels are exactly 0..k-1. Plain
# int/float values keep the mapping free of NumPy scalar types.
class_weight_dict = {
    int(class_id): float(weight)
    for class_id, weight in zip(train_classes, class_weights)
}


In [9]:
# ─── Train on Family-Aware Split ───
# NOTE(review): the sequences are passed to fit() exactly as built; no feature
# scaling step is visible in this notebook. The recorded run shows val_recall
# stuck at 0, so confirm that the features are normalised upstream.
fit_options = dict(
    validation_data=(X_family_val, y_family_val),
    epochs=50,
    batch_size=128,
    callbacks=callbacks,
    # `shuffle` is deliberately not passed, so fit()'s default applies.
    class_weight=class_weight_dict,  # reweight the loss by class frequency
    verbose=1,
)
history = model.fit(X_family_train, y_family_train, **fit_options)

Epoch 1/50
[1m789/790[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 91ms/step - accuracy: 0.9669 - loss: 0.1779 - precision: 0.7729 - recall: 0.7120           
Epoch 1: val_loss improved from inf to 2.68593, saving model to cnn_lstm_best_weights.h5




[1m790/790[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m88s[0m 108ms/step - accuracy: 0.9668 - loss: 0.1779 - precision: 0.7734 - recall: 0.7125 - val_accuracy: 0.4246 - val_loss: 2.6859 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 2/50
[1m789/790[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 89ms/step - accuracy: 0.8777 - loss: 0.3439 - precision: 0.6339 - recall: 0.5445          
Epoch 2: val_loss improved from 2.68593 to 1.73305, saving model to cnn_lstm_best_weights.h5




[1m790/790[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 103ms/step - accuracy: 0.8776 - loss: 0.3441 - precision: 0.6342 - recall: 0.5450 - val_accuracy: 0.4246 - val_loss: 1.7330 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 3/50
[1m789/790[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 90ms/step - accuracy: 0.8356 - loss: 0.4154 - precision: 0.5629 - recall: 0.4847          
Epoch 3: val_loss improved from 1.73305 to 1.18271, saving model to cnn_lstm_best_weights.h5




[1m790/790[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 104ms/step - accuracy: 0.8354 - loss: 0.4158 - precision: 0.5630 - recall: 0.4852 - val_accuracy: 0.4246 - val_loss: 1.1827 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 4/50
[1m789/790[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 92ms/step - accuracy: 0.7800 - loss: 0.4829 - precision: 0.4749 - recall: 0.3467          
Epoch 4: val_loss improved from 1.18271 to 1.07733, saving model to cnn_lstm_best_weights.h5




[1m790/790[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 106ms/step - accuracy: 0.7798 - loss: 0.4832 - precision: 0.4751 - recall: 0.3470 - val_accuracy: 0.4246 - val_loss: 1.0773 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 5/50
[1m789/790[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 89ms/step - accuracy: 0.7760 - loss: 0.4937 - precision: 0.4618 - recall: 0.3197          
Epoch 5: val_loss did not improve from 1.07733
[1m790/790[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 103ms/step - accuracy: 0.7759 - loss: 0.4939 - precision: 0.4622 - recall: 0.3203 - val_accuracy: 0.4246 - val_loss: 1.5040 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 6/50
[1m789/790[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 92ms/step - accuracy: 0.8138 - loss: 0.4555 - precision: 0.5312 - recall: 0.4360          
Epoch 6: val_loss did not improve from 1.07733
[1m790/790[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 106ms/step - ac

KeyboardInterrupt: 

In [None]:
# ─── Plot Training Curves ───
def plot_metrics(history, metrics=('loss', 'accuracy', 'precision', 'recall')):
    """Plot train and validation learning curves, one panel per metric.

    Args:
        history: Either the object returned by ``model.fit`` (its ``.history``
            mapping is used) or a plain ``{name: [value per epoch]}`` mapping.
        metrics: Names of the training series to draw. For each name the
            matching ``val_<name>`` series is overlaid when it is present.
            Defaults to the four metrics this notebook compiles the model with.

    Raises:
        ValueError: If ``metrics`` is empty.
        KeyError: If a requested training series is missing from the history.
    """
    # Accept a History-like object or an already-extracted dict.
    records = getattr(history, 'history', history)

    metric_names = list(metrics)
    if not metric_names:
        raise ValueError("metrics must contain at least one metric name")

    # Two panels per row. The default four metrics give the original 2x2 grid
    # at 14x10 inches.
    n_cols = 2
    n_rows = (len(metric_names) + n_cols - 1) // n_cols
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(14, 5 * n_rows), squeeze=False)
    panels = axes.ravel()

    for ax, metric in zip(panels, metric_names):
        ax.plot(records[metric], label='Train')
        # A run without validation data has no val_* series; skip the overlay
        # rather than fail with a KeyError.
        val_key = f'val_{metric}'
        if val_key in records:
            ax.plot(records[val_key], label='Val')
        ax.set_title(metric.capitalize())
        ax.set_xlabel('Epoch')
        ax.set_ylabel(metric)
        ax.grid(True)
        ax.legend()

    # Hide the unused trailing panel when the metric count is odd.
    for ax in panels[len(metric_names):]:
        ax.set_visible(False)

    fig.tight_layout()
    plt.show()

# Draw the learning curves for the training run. `history` is only assigned
# when model.fit returns normally, so an interrupted fit leaves it undefined.
plot_metrics(history)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, f1_score
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# ─── Predict on Test Set ───
# Probability at or above this value is classified as ransomware (1).
DECISION_THRESHOLD = 0.5
# Fix the class order so the reports and the confusion matrix always cover
# both classes, even if one is missing from the labels or the predictions.
class_ids = [0, 1]

y_pred_probs = model.predict(X_family_test)
y_pred = (y_pred_probs >= DECISION_THRESHOLD).astype(int).flatten()

# ─── Evaluation Report ───
# zero_division=0 reports an undefined precision/recall as 0 without a
# warning. This happens when the model never predicts one of the classes.
print("📊 Classification Report:")
print(classification_report(y_family_test, y_pred, labels=class_ids, digits=4, zero_division=0))

# ─── F1 Score (Optional Manual) ───
# F1 of the positive class (label 1), which is f1_score's binary default.
f1 = f1_score(y_family_test, y_pred, zero_division=0)
print(f"F1-score (manual): {f1:.4f}")

# ─── Confusion Matrix ───
# With `labels` set the matrix is always 2x2 and matches the two tick labels.
# Rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_family_test, y_pred, labels=class_ids)
labels = ['Benign (0)', 'Ransomware (1)']

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels, ax=ax)
ax.set_title("Confusion Matrix on Family-Aware Test Set")
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
fig.tight_layout()
plt.show()
