# **1. Data Loading**

In [1]:
import os
import numpy as np

def parse_cell_file(content):

    #Parses the content of a .cell file and converts it into a structured data format.
    features = []
    for line in content.strip().splitlines():
        if not line.strip():
            continue
        try:
            timestamp, direction, size = map(float, line.split())
            signed_size = size if direction > 0 else -size
            features.append([timestamp, signed_size])
        except ValueError:
            continue
    return np.array(features)

def load_mon_data(mon_folder):
    
    #Loads 'split' files from the 'mon' folder and labels them.
    instances = []
    labels = []

    for file in os.listdir(mon_folder):
        if "split" in file:
            file_path = os.path.join(mon_folder, file)
            try:
                class_label = int(file.split('-')[0])  # Extract class label from the filename
            except ValueError:
                continue

            with open(file_path, 'r') as f:
                instance = parse_cell_file(f.read())
                if instance.size > 0:
                    instances.append(instance)
                    labels.append(class_label)

    print(f"Total split files loaded: {len(instances)}")

    return np.array(instances, dtype=object), np.array(labels)

# data path
mon_folder_path = './mon/ts'

# Load data
X_raw, y = load_mon_data(mon_folder_path)

Total split files loaded: 87038


# **2. Feature Engineering**

In [2]:
def create_features(X_raw):
    feature_matrix = []

    for instance in X_raw:
        timestamps = instance[:, 0]
        signed_sizes = instance[:, 1]

        # Compute Features
        packet_size_direction = np.sum(signed_sizes)
        cumulative_packet_size = np.sum(np.abs(signed_sizes))
        burst_lengths = len(signed_sizes)
        num_incoming_packets = np.sum(signed_sizes > 0)
        num_outgoing_packets = np.sum(signed_sizes < 0)
        ratio_incoming_packets = (
            num_incoming_packets / burst_lengths if burst_lengths > 0 else 0
        )

        if len(timestamps) > 1:
            time_intervals = np.diff(timestamps)
            mean_time_intervals = np.mean(time_intervals)
            std_time_intervals = np.std(time_intervals)
        else:
            mean_time_intervals = 0
            std_time_intervals = 0

        # Feature vector
        feature_vector = [
            packet_size_direction,
            cumulative_packet_size,
            burst_lengths,
            num_incoming_packets,
            num_outgoing_packets,
            ratio_incoming_packets,
            mean_time_intervals,
            std_time_intervals,
        ]
        feature_matrix.append(feature_vector)

    return np.array(feature_matrix)

X = create_features(X_raw)


In [3]:
# Verify data loading and feature extraction
print(f"Total instances loaded: {len(X_raw)}")
print(f"Unique labels: {np.unique(y)}") 
print(f"Feature matrix shape: {X.shape}")

# Check data distribution by class
unique, counts = np.unique(y, return_counts=True)
print("Class distribution:")
for label, count in zip(unique, counts):
    print(f"Class {label}: {count} instances")


Total instances loaded: 87038
Unique labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94]
Feature matrix shape: (87038, 8)
Class distribution:
Class 0: 956 instances
Class 1: 915 instances
Class 2: 956 instances
Class 3: 913 instances
Class 4: 900 instances
Class 5: 968 instances
Class 6: 968 instances
Class 7: 844 instances
Class 8: 918 instances
Class 9: 958 instances
Class 10: 948 instances
Class 11: 982 instances
Class 12: 840 instances
Class 13: 949 instances
Class 14: 858 instances
Class 15: 895 instances
Class 16: 886 instances
Class 17: 952 instances
Class 18: 859 instances
Class 19: 977 instances
Class 20: 810 instances
Class 21: 969 instances
Class 22: 981 instances
Class 23: 792 instances
Class 24: 910 instances
Class 25: 718 instance

# **3. Data Splitting**

In [4]:
from sklearn.model_selection import train_test_split

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [5]:
import pandas as pd

# Data Distribution
print("Training Data Distribution:")
print(pd.Series(y_train).value_counts())
print("Test Data Distribution:")
print(pd.Series(y_test).value_counts())

Training Data Distribution:
66    694
69    692
48    692
34    692
73    691
     ... 
82    492
86    491
26    459
30    413
75    406
Name: count, Length: 95, dtype: int64
Test Data Distribution:
66    298
73    296
69    296
34    296
48    296
     ... 
82    211
86    210
26    196
30    177
75    174
Name: count, Length: 95, dtype: int64


# **4. Scaling the Data**

In [6]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# **5. Model training**

In [7]:
from tensorflow.keras.utils import to_categorical

# Check the number of classes
num_classes = len(np.unique(y))

# One-hot encoding (required for Keras models)
y_train_categorical = to_categorical(y_train, num_classes=num_classes)
y_test_categorical = to_categorical(y_test, num_classes=num_classes)

In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Define the Deep Fingerprinting (DF) model
def create_df_model(input_shape, num_classes):
    model = Sequential([
        Dense(256, activation='relu', input_shape=(input_shape,)),  # Input layer with 256 neurons
        Dropout(0.5),  # Dropout to prevent overfitting
        Dense(128, activation='relu'),  # Hidden layer with 128 neurons
        Dropout(0.3),  # Dropout to prevent overfitting
        Dense(64, activation='relu'),  # Hidden layer with 64 neurons
        Dropout(0.3),  # Dropout to prevent overfitting
        Dense(num_classes, activation='softmax')  # Output layer with softmax activation for multi-class classification
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])  # Compile the model
    return model

# Create the model
df_model = create_df_model(X_train.shape[1], num_classes)

# Train the model
df_model.fit(X_train, y_train_categorical, epochs=10, batch_size=64, verbose=1)

Epoch 1/10
[1m952/952[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 679us/step - accuracy: 0.0146 - loss: 4.5011
Epoch 2/10
[1m952/952[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 709us/step - accuracy: 0.0232 - loss: 4.3951
Epoch 3/10
[1m952/952[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 726us/step - accuracy: 0.0274 - loss: 4.3701
Epoch 4/10
[1m952/952[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 672us/step - accuracy: 0.0293 - loss: 4.3512
Epoch 5/10
[1m952/952[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 671us/step - accuracy: 0.0310 - loss: 4.3402
Epoch 6/10
[1m952/952[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 773us/step - accuracy: 0.0335 - loss: 4.3310
Epoch 7/10
[1m952/952[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.0353 - loss: 4.3253
Epoch 8/10
[1m952/952[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 773us/step - accuracy: 0.0339 - loss: 4.3170
Epoch 9/10
[1m952/952[0m

<keras.src.callbacks.history.History at 0x2b1820980>

# **6. Model Evaluation**

In [11]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Evaluate the model
print("Evaluating the model...")
y_pred_probs = df_model.predict(X_test)  # Predict probabilities for each class
y_pred = np.argmax(y_pred_probs, axis=1)  # Convert probabilities to class labels

# Display evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred) * 100)
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")
print(f"Classification Report:\n{classification_report(y_test, y_pred)}")


Evaluating the model...
[1m816/816[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 211us/step
Accuracy: 3.9292279411764706
Confusion Matrix:
[[0 0 2 ... 0 0 0]
 [0 0 3 ... 0 0 0]
 [0 0 3 ... 0 0 0]
 ...
 [0 0 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 3 ... 0 0 1]]
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       287
           1       0.00      0.00      0.00       275
           2       0.02      0.01      0.01       287
           3       0.00      0.00      0.00       274
           4       0.00      0.00      0.00       270
           5       0.08      0.00      0.01       290
           6       0.02      0.00      0.01       290
           7       0.04      0.02      0.03       253
           8       0.00      0.00      0.00       275
           9       0.00      0.00      0.00       287
          10       0.00      0.00      0.00       284
          11       0.06      0.27      0.10       295
      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
