In [11]:
%pip install tensorflow

Collecting sklearn
  Using cached sklearn-0.0.post12.tar.gz (2.6 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'error'
Note: you may need to restart the kernel to use updated packages.


  error: subprocess-exited-with-error
  
  × python setup.py egg_info did not run successfully.
  │ exit code: 1
  ╰─> [15 lines of output]
      The 'sklearn' PyPI package is deprecated, use 'scikit-learn'
      rather than 'sklearn' for pip commands.
      
      Here is how to fix this error in the main use cases:
      - use 'pip install scikit-learn' rather than 'pip install sklearn'
      - replace 'sklearn' by 'scikit-learn' in your pip requirements files
        (requirements.txt, setup.py, setup.cfg, Pipfile, etc ...)
      - if the 'sklearn' package is used by one of your dependencies,
        it would be great if you take some time to track which package uses
        'sklearn' instead of 'scikit-learn' and report it to their issue tracker
      - as a last resort, set the environment variable
        SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True to avoid this error
      
      More information is available at
      https://github.com/scikit-learn/sklearn-pypi-packag

Note: you may need to restart the kernel to use updated packages.
Collecting tensorflow
  Obtaining dependency information for tensorflow from https://files.pythonhosted.org/packages/e4/14/d795bb156f8cc10eb1dcfe1332b7dbb8405b634688980aa9be8f885cc888/tensorflow-2.16.1-cp311-cp311-win_amd64.whl.metadata
  Downloading tensorflow-2.16.1-cp311-cp311-win_amd64.whl.metadata (3.5 kB)
Collecting tensorflow-intel==2.16.1 (from tensorflow)
  Obtaining dependency information for tensorflow-intel==2.16.1 from https://files.pythonhosted.org/packages/e0/36/6278e4e7e69a90c00e0f82944d8f2713dd85a69d1add455d9e50446837ab/tensorflow_intel-2.16.1-cp311-cp311-win_amd64.whl.metadata
  Downloading tensorflow_intel-2.16.1-cp311-cp311-win_amd64.whl.metadata (5.0 kB)
Collecting astunparse>=1.6.0 (from tensorflow-intel==2.16.1->tensorflow)
  Obtaining dependency information for astunparse>=1.6.0 from https://files.pythonhosted.org/packages/2b/03/13dde6512ad7b4557eb792fbcf0c653af6076b81e5941d36ec61f7ce6028/astunpar

In [2]:
import pandas as pd # type: ignore
import numpy as np # type: ignore

def process_network_traffic(data,  resample_interval):
    # Convert TIME column to datetime format assuming it's in Unix timestamp format
    data['TIME'] = pd.to_datetime(data['TIME'], unit='s')

    # Set TIME as the index
    data.set_index('TIME', inplace=True)

    # Resample data in 15-second intervals
    resampled_data = data.resample(resample_interval)

    # Function to calculate flow attributes for each interval
    def calculate_flow_attributes(interval):
        if interval.empty:
            return pd.DataFrame()

        # Reset index to make 'TIME' column available
        interval = interval.reset_index()

        # Extract flows based on source IP, destination IP, and eth.src
        flows = interval.groupby(['IP.src', 'IP.dst', 'eth.src'])

        # Calculate additional features for each flow
        flow_features = flows.agg(
            total_packets=('Size', 'count'),
            total_bytes=('Size', 'sum'),
            start_time=('TIME', 'min'),
            end_time=('TIME', 'max'),
            mean_packet_size=('Size', 'mean'),
            std_packet_size=('Size', 'std'),
            min_packet_size=('Size', 'min'),
            max_packet_size=('Size', 'max')
        ).reset_index()

        # Calculate flow duration
        flow_features['flow_duration'] = (flow_features['end_time'] - flow_features['start_time']).dt.total_seconds()

        # Calculate packet rate and byte rate
        flow_features['packet_rate'] = flow_features['total_packets'] / flow_features['flow_duration']
        flow_features['byte_rate'] = flow_features['total_bytes'] / flow_features['flow_duration']

        # Handle cases where flow_duration is zero to avoid division by zero errors
        flow_features.replace([np.inf, -np.inf], 0, inplace=True)

        # Calculate inter-arrival times
        interval['inter_arrival_time'] = interval.groupby(['IP.src', 'IP.dst', 'eth.src'])['TIME'].diff().dt.total_seconds()

        # Calculate statistical features of inter-arrival times for each flow
        inter_arrival_stats = interval.groupby(['IP.src', 'IP.dst', 'eth.src'])['inter_arrival_time'].agg(
            mean_inter_arrival=('mean'),
            std_inter_arrival=('std'),
            min_inter_arrival=('min'),
            max_inter_arrival=('max')
        ).reset_index()

        # Merge the inter-arrival stats with the flow features
        flow_features = pd.merge(flow_features, inter_arrival_stats, on=['IP.src', 'IP.dst', 'eth.src'])

        # Fill NaN values with 0 (for flows with single packet where diff results in NaN)
        flow_features.fillna(0, inplace=True)

        return flow_features

    # Apply the function to each interval
    flow_attributes_list = []
    for _, interval in resampled_data:
        flow_attributes = calculate_flow_attributes(interval)
        flow_attributes_list.append(flow_attributes)

    # Combine all intervals into a single DataFrame
    combined_flow_attributes = pd.concat(flow_attributes_list, ignore_index=True)

    # Optional: Define the MAC address to device type mapping
    mac_to_device = {
        'd0:52:a8:00:67:5e': ('Smart Things', 'hubs/controllers'),
        '44:65:0d:56:cc:d3': ('Amazon Echo', 'hubs/controllers'),
        '70:ee:50:18:34:43': ('Netatmo Welcome', 'cameras'),
        'f4:f2:6d:93:51:f1': ('TP-Link Day Night Cloud camera', 'cameras'),
        '00:16:6c:ab:6b:88': ('Samsung SmartCam', 'cameras'),
        '30:8c:fb:2f:e4:b2': ('Dropcam', 'cameras'),
        '00:62:6e:51:27:2e': ('Insteon Camera', 'cameras'),
        'e8:ab:fa:19:de:4f': ('Insteon Camera', 'cameras'),
        '00:24:e4:11:18:a8': ('Withings Smart Baby Monitor', 'cameras'),
        'ec:1a:59:79:f4:89': ('Belkin Wemo switch', 'energy management'),
        '50:c7:bf:00:56:39': ('TP-Link Smart plug', 'energy management'),
        '74:c6:3b:29:d7:1d': ('iHome', 'appliances'),
        'ec:1a:59:83:28:11': ('Belkin wemo motion sensor', 'health-monitor'),
        '18:b4:30:25:be:e4': ('NEST Protect smoke alarm', 'health-monitor'),
        '70:ee:50:03:b8:ac': ('Netatmo weather station', 'health-monitor'),
        '00:24:e4:1b:6f:96': ('Withings Smart scale', 'health-monitor'),
        '74:6a:89:00:2e:25': ('Blipcare Blood Pressure meter', 'health-monitor'),
        '00:24:e4:20:28:c6': ('Withings Aura smart sleep sensor', 'health-monitor'),
        'd0:73:d5:01:83:08': ('Light Bulbs LiFX Smart Bulb', 'appliances'),
        '18:b7:9e:02:20:44': ('Triby Speaker', 'appliances'),
        'e0:76:d0:33:bb:85': ('PIX-STAR Photo-frame', 'appliances'),
        '70:5a:0f:e4:9b:c0': ('HP Printer', 'appliances'),
        '08:21:ef:3b:fc:e3': ('Samsung Galaxy Tab', 'non-IoT'),
        '30:8c:fb:b6:ea:45': ('Nest Dropcam', 'cameras'),
        '40:f3:08:ff:1e:da': ('Android Phone', 'non-IoT'),
        '74:2f:68:81:69:42': ('Laptop', 'non-IoT'),
        'ac:bc:32:d4:6f:2f': ('MacBook', 'non-IoT'),
        'b4:ce:f6:a7:a3:c2': ('Android Phone', 'non-IoT'),
        'd0:a6:37:df:a1:e1': ('IPhone', 'non-IoT'),
        'f4:5c:89:93:cc:85': ('MacBook/Iphone', 'non-IoT'),
        '14:cc:20:51:33:ea': ('TPLink Router Bridge LAN (Gateway)', 'Gateaway')
    }

    # Map the eth.src values to device and categories
    combined_flow_attributes['Device'] = combined_flow_attributes['eth.src'].map(lambda mac: mac_to_device.get(mac, ('Unknown', 'Unknown'))[0])
    combined_flow_attributes['Device Category'] = combined_flow_attributes['eth.src'].map(lambda mac: mac_to_device.get(mac, ('Unknown', 'Unknown'))[1])

    # Remove rows where 'Device Category' is 'Gateaway'
    combined_flow_attributes = combined_flow_attributes[combined_flow_attributes['Device Category'] != 'Gateaway']

    return combined_flow_attributes


In [3]:
import time
import joblib
import os
from memory_profiler import memory_usage
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV

def preprocesTestTrain(combined_df):
    # Step 1: Encode categorical features
    label_encoder = LabelEncoder()
    combined_df['eth.src'] = label_encoder.fit_transform(combined_df['eth.src'])
    combined_df['IP.src'] = label_encoder.fit_transform(combined_df['IP.src'])
    combined_df['IP.dst'] = label_encoder.fit_transform(combined_df['IP.dst'])
    combined_df['Device Category'] = label_encoder.fit_transform(combined_df['Device Category'])

				
    # Step 2: Prepare features and target variable; 'std_inter_arrival', 'std_packet_size', 'min_inter_arrival', 'min_packet_size', 'flow_duration' are removed
    features = [ 'eth.src', 'IP.src', 'IP.dst', 'total_packets', 'total_bytes', 'mean_packet_size', 
                 'max_packet_size', 'packet_rate', 'byte_rate', 'mean_inter_arrival', 
                'max_inter_arrival']
    X = combined_df[features]
    return X

def train_model(model, X_train, y_train):
    start_time = time.time()
    mem_usage = memory_usage((model.fit, (X_train, y_train)), interval=0.1)
    end_time = time.time()
    training_time = end_time - start_time
    peak_memory_usage = max(mem_usage)
    return model, training_time, peak_memory_usage

def predict_model(model, X_test, y_test):
    start_time = time.time()
    mem_usage = memory_usage((model.predict, (X_test,)), interval=0.1)
    end_time = time.time()
    prediction_time = end_time - start_time
    peak_memory_usage = max(mem_usage)
    y_pred = model.predict(X_test)
    accuracy = classification_report(y_test, y_pred)

    return prediction_time, peak_memory_usage, accuracy

def get_model_size(model):
    joblib.dump(model, 'temp_model.joblib')
    model_size = os.path.getsize('temp_model.joblib') / 1024**2  # size in MB
    os.remove('temp_model.joblib')
    return model_size


In [4]:
# Load the training data
df1 = pd.read_csv('16-09-23.csv', low_memory=False)
df2 = pd.read_csv('16-09-24.csv', low_memory=False)
df3 = pd.read_csv('16-09-25.csv', low_memory=False)
df4 = pd.read_csv('16-09-26.csv', low_memory=False)
df5 = pd.read_csv('16-09-27.csv', low_memory=False)
df6 = pd.read_csv('16-09-29.csv', low_memory=False)
df7 = pd.read_csv('16-10-04.csv', low_memory=False)
df8 = pd.read_csv('16-10-12.csv', low_memory=False)

data = pd.concat([df1, df2, df3, df4, df5, df6, df7, df8])
data_60sec_train = process_network_traffic(data, '60S')
# Save the final table as a CSV file
#data_60sec_train.to_csv('Dataset_60sec.csv', index=False)

X_60 = preprocesTestTrain(data_60sec_train)
y_60 = data_60sec_train['Device Category']
# Split the data into training and testing sets
X_train_60, X_test_60, y_train_60, y_test_60 = train_test_split(X_60, y_60, test_size=0.2, random_state=100)

In [5]:
# Load the training data
df1 = pd.read_csv('16-09-23.csv', low_memory=False)
df2 = pd.read_csv('16-09-24.csv', low_memory=False)
df3 = pd.read_csv('16-09-25.csv', low_memory=False)
df4 = pd.read_csv('16-09-26.csv', low_memory=False)
df5 = pd.read_csv('16-09-27.csv', low_memory=False)
df6 = pd.read_csv('16-09-29.csv', low_memory=False)
df7 = pd.read_csv('16-10-04.csv', low_memory=False)
df8 = pd.read_csv('16-10-12.csv', low_memory=False)

data = pd.concat([df1, df2, df3, df4, df5, df6, df7, df8])
data_15sec_train = process_network_traffic(data, '15S')
# Save the final table as a CSV file
#data_60sec_train.to_csv('Dataset_60sec.csv', index=False)

X_15 = preprocesTestTrain(data_15sec_train)
y_15 = data_15sec_train['Device Category']
# Split the data into training and testing sets
X_train_15, X_test_15, y_train_15, y_test_15 = train_test_split(X_15, y_15, test_size=0.2, random_state=100)

In [18]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Define parameter grid
param_grid = {
    'max_depth': [3, 5, 7, 10, 12, 15, 20, 25, 30],
    'min_samples_split': [2, 3, 5, 10],
    'min_samples_leaf': [1, 2, 4, 5, 6, 10]
}

# Initialize the classifier
dt_clf = DecisionTreeClassifier(random_state=100)

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=dt_clf, param_grid=param_grid, cv=5, n_jobs=-1, scoring='f1_macro')

# Fit GridSearchCV on first dataset
grid_search.fit(X_train_15, y_train_15)
best_params_15 = grid_search.best_params_

# Fit GridSearchCV on second dataset
grid_search.fit(X_train_60, y_train_60)
best_params_60 = grid_search.best_params_

print(f"Best parameters for dataset 1: {best_params_15}")
print(f"Best parameters for dataset 2: {best_params_60}")

# Train the model with best parameters
dt_clf_15 = DecisionTreeClassifier(**best_params_15, random_state=100)
dt_clf_15.fit(X_train_15, y_train_15)

dt_clf_60 = DecisionTreeClassifier(**best_params_60, random_state=100)
dt_clf_60.fit(X_train_60, y_train_60)


Best parameters for dataset 1: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best parameters for dataset 2: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [16]:
# Train the Random Forest Original:100, 10,20,5,100, second iteration: 50,15,20,t,100
rf_clf =  RandomForestClassifier(n_estimators=100, max_depth=15,min_samples_split=5, min_samples_leaf=2,random_state=100)

# Measure training time and memory usage
rf_model_15, training_time, training_memory = train_model(rf_clf, X_train_15, y_train_15)
print(f"Training time: {training_time:.2f} seconds")
print(f"Training peak memory usage: {training_memory:.2f} MB")

rf_model_60, training_time, training_memory = train_model(rf_clf, X_train_60, y_train_60)
print(f"Training time: {training_time:.2f} seconds")
print(f"Training peak memory usage: {training_memory:.2f} MB")


# Measure prediction time, memory usage, and accuracy
prediction_time, prediction_memory, accuracy = predict_model(rf_model_15, X_test_15, y_test_15)
print(f"Prediction time: {prediction_time:.4f} seconds")
print(f"Prediction peak memory usage: {prediction_memory:.2f} MB")
print(f"Accuracy: {accuracy}")

prediction_time, prediction_memory, accuracy = predict_model(rf_model_60, X_test_60, y_test_60)
print(f"Prediction time: {prediction_time:.4f} seconds")
print(f"Prediction peak memory usage: {prediction_memory:.2f} MB")
print(f"Accuracy: {accuracy}")

# Measure model size
model_size = get_model_size(rf_model_15)
print(f"Model size: {model_size:.2f} MB")

model_size = get_model_size(rf_model_60)
print(f"Model size: {model_size:.2f} MB")

Training time: 18.06 seconds
Training peak memory usage: 205.57 MB
Training time: 9.70 seconds
Training peak memory usage: 172.26 MB
Prediction time: 1.5049 seconds
Prediction peak memory usage: 173.18 MB
Accuracy:               precision    recall  f1-score   support

           0       1.00      1.00      1.00      7673
           1       1.00      1.00      1.00     49240
           2       1.00      1.00      1.00      2052
           3       1.00      1.00      1.00      8101
           4       1.00      1.00      1.00     24769
           5       1.00      1.00      1.00      6080

    accuracy                           1.00     97915
   macro avg       1.00      1.00      1.00     97915
weighted avg       1.00      1.00      1.00     97915

Prediction time: 1.0490 seconds
Prediction peak memory usage: 174.07 MB
Accuracy:               precision    recall  f1-score   support

           0       1.00      1.00      1.00      6991
           1       1.00      1.00      1.00     235

In [6]:
# Train the Random Forest Original:100, 10,20,5,100, second iteration: 50,15,20,t,100
# Define parameter grid
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
}
# Perform grid search
grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
# Best model
rf_clf = grid_search.best_estimator_

# Measure training time and memory usage
rf_model, training_time, training_memory = train_model(rf_clf, X_train, y_train)
print(f"Training time: {training_time:.2f} seconds")
print(f"Training peak memory usage: {training_memory:.2f} MB")

# Measure prediction time, memory usage, and accuracy
prediction_time, prediction_memory, accuracy = predict_model(rf_model, X_test, y_test)
print(f"Prediction time: {prediction_time:.4f} seconds")
print(f"Prediction peak memory usage: {prediction_memory:.2f} MB")
print(f"Accuracy: {accuracy}")

# Measure model size
model_size = get_model_size(rf_model)
print(f"Model size: {model_size:.2f} MB")

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Training time: 25.23 seconds
Training peak memory usage: 194.02 MB
Prediction time: 2.2357 seconds
Prediction peak memory usage: 180.43 MB
Accuracy:               precision    recall  f1-score   support

           0       1.00      1.00      1.00     10386
           1       1.00      1.00      1.00     67114
           2       1.00      1.00      1.00      2887
           3       1.00      1.00      1.00     10431
           4       1.00      1.00      1.00     34729
           5       1.00      1.00      1.00      6555

    accuracy                           1.00    132102
   macro avg       1.00      1.00      1.00    132102
weighted avg       1.00      1.00      1.00    132102

Model size: 7.49 MB


In [60]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping


X = preprocesTestTrain(data_15sec_train)
y = data_15sec_train['Device Category']

# Encode the target labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_categorical = to_categorical(y_encoded)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_categorical, test_size=0.2, random_state=100)

# Standardize the feature columns
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build the neural network model
model = Sequential([
    Dense(128, input_shape=(X_train_scaled.shape[1],), activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(y_categorical.shape[1], activation='softmax')
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Set up early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model
history = model.fit(X_train_scaled, y_train, validation_split=0.2, epochs=50, batch_size=32, callbacks=[early_stopping])

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test_scaled, y_test)
print(f"Test accuracy: {test_accuracy:.4f}")

# Make predictions
y_pred = model.predict(X_test_scaled)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true_classes = np.argmax(y_test, axis=1)

# Decode the predicted and true classes
y_pred_labels = label_encoder.inverse_transform(y_pred_classes)
y_true_labels = label_encoder.inverse_transform(y_true_classes)

# Display the first few predictions
for i in range(10):
    print(f"True label: {y_true_labels[i]}, Predicted label: {y_pred_labels[i]}")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m9792/9792[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 1ms/step - accuracy: 0.7827 - loss: 0.6251 - val_accuracy: 0.9850 - val_loss: 0.0710
Epoch 2/50
[1m9792/9792[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 1ms/step - accuracy: 0.9481 - loss: 0.1695 - val_accuracy: 0.9921 - val_loss: 0.0381
Epoch 3/50
[1m9792/9792[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 1ms/step - accuracy: 0.9627 - loss: 0.1251 - val_accuracy: 0.9934 - val_loss: 0.0305
Epoch 4/50
[1m9792/9792[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 1ms/step - accuracy: 0.9697 - loss: 0.1061 - val_accuracy: 0.9956 - val_loss: 0.0228
Epoch 5/50
[1m9792/9792[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 1ms/step - accuracy: 0.9730 - loss: 0.1015 - val_accuracy: 0.9969 - val_loss: 0.0224
Epoch 6/50
[1m9792/9792[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 2ms/step - accuracy: 0.9759 - loss: 0.0879 - val_accuracy: 0.9976 - val_loss: 0.0161
Epoch 7/50

In [61]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping


X = preprocesTestTrain(data_60sec_train)
y = data_60sec_train['Device Category']

# Encode the target labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_categorical = to_categorical(y_encoded)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_categorical, test_size=0.2, random_state=100)

# Standardize the feature columns
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build the neural network model
model = Sequential([
    Dense(128, input_shape=(X_train_scaled.shape[1],), activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(y_categorical.shape[1], activation='softmax')
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Set up early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model
history = model.fit(X_train_scaled, y_train, validation_split=0.2, epochs=50, batch_size=32, callbacks=[early_stopping])

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test_scaled, y_test)
print(f"Test accuracy: {test_accuracy:.4f}")

# Make predictions
y_pred = model.predict(X_test_scaled)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true_classes = np.argmax(y_test, axis=1)

# Decode the predicted and true classes
y_pred_labels = label_encoder.inverse_transform(y_pred_classes)
y_true_labels = label_encoder.inverse_transform(y_true_classes)

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m5269/5269[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1ms/step - accuracy: 0.6994 - loss: 0.8325 - val_accuracy: 0.9544 - val_loss: 0.1466
Epoch 2/50
[1m5269/5269[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.9159 - loss: 0.2693 - val_accuracy: 0.9837 - val_loss: 0.0672
Epoch 3/50
[1m5269/5269[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.9432 - loss: 0.1899 - val_accuracy: 0.9910 - val_loss: 0.0465
Epoch 4/50
[1m5269/5269[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1ms/step - accuracy: 0.9554 - loss: 0.1540 - val_accuracy: 0.9939 - val_loss: 0.0320
Epoch 5/50
[1m5269/5269[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1ms/step - accuracy: 0.9646 - loss: 0.1315 - val_accuracy: 0.9958 - val_loss: 0.0275
Epoch 6/50
[1m5269/5269[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1ms/step - accuracy: 0.9671 - loss: 0.1229 - val_accuracy: 0.9953 - val_loss: 0.0242
Epoch 7/50
[1m5269/5269[0

In [8]:
# Testing the model on new data
new_df1 = pd.read_csv('16-09-30.csv', low_memory=False)
new_df2 = pd.read_csv('16-10-01.csv', low_memory=False)
new_df3 = pd.read_csv('16-10-02.csv', low_memory=False)
new_df4 = pd.read_csv('16-10-03.csv', low_memory=False)
new_df5 = pd.read_csv('16-10-05.csv', low_memory=False)

# Skip the first row (header row with explanations)
new_df1 = new_df1.iloc[1:]
new_df2 = new_df2.iloc[1:]
new_df3 = new_df3.iloc[1:]
new_df4 = new_df4.iloc[1:]
new_df5 = new_df5.iloc[1:]


# Concatenate the dataframes
new_df = pd.concat([new_df1,new_df2, new_df3, new_df4, new_df5], ignore_index=True)
# 60sec interval
new_df_60 = process_network_traffic(new_df, '60S')
X_new_60 = preprocesTestTrain(new_df_60)

# Assuming you have ground truth labels in the new dataset
y_new_true_60 = new_df_60['Device Category']

In [9]:
# Testing the model on new data
new_df1 = pd.read_csv('16-09-30.csv', low_memory=False)
new_df2 = pd.read_csv('16-10-01.csv', low_memory=False)
new_df3 = pd.read_csv('16-10-02.csv', low_memory=False)
new_df4 = pd.read_csv('16-10-03.csv', low_memory=False)
new_df5 = pd.read_csv('16-10-05.csv', low_memory=False)

# Skip the first row (header row with explanations)
new_df1 = new_df1.iloc[1:]
new_df2 = new_df2.iloc[1:]
new_df3 = new_df3.iloc[1:]
new_df4 = new_df4.iloc[1:]
new_df5 = new_df5.iloc[1:]


# Concatenate the dataframes
new_df = pd.concat([new_df1,new_df2, new_df3, new_df4, new_df5], ignore_index=True)

# 15sec interval
new_df_15 = process_network_traffic(new_df, '15S')
X_new_15 = preprocesTestTrain(new_df_15)

# Assuming you have ground truth labels in the new dataset
y_new_true_15 = new_df_15['Device Category']

In [19]:
# Predict and measure prediction time, memory usage, and accuracy: Decision Tree
# 60sec interval
print("\n60 sec interval\n")
prediction_time, prediction_memory, accuracy = predict_model(dt_clf_60, X_new_60, y_new_true_60)
print(f"Prediction time: {prediction_time:.4f} seconds")
print(f"Prediction peak memory usage: {prediction_memory:.2f} MB")
print(f"Accuracy: {accuracy}")

# 15sec interval
print("\n15 sec interval\n")
prediction_time, prediction_memory, accuracy = predict_model(dt_clf_15, X_new_15, y_new_true_15)
print(f"Prediction time: {prediction_time:.4f} seconds")
print(f"Prediction peak memory usage: {prediction_memory:.2f} MB")
print(f"Accuracy: {accuracy}")


60 sec interval

Prediction time: 2.9732 seconds
Prediction peak memory usage: 169.34 MB
Accuracy:               precision    recall  f1-score   support

           0       0.56      0.83      0.67     35134
           1       1.00      0.89      0.94    123434
           2       0.02      0.04      0.03      7130
           3       0.94      0.98      0.96     23063
           4       0.00      0.00      0.00     35062
           5       0.28      0.42      0.33     21218

    accuracy                           0.70    245041
   macro avg       0.47      0.53      0.49    245041
weighted avg       0.70      0.70      0.69    245041


15 sec interval

Prediction time: 1.2109 seconds
Prediction peak memory usage: 215.88 MB
Accuracy:               precision    recall  f1-score   support

           0       0.35      0.78      0.48     38435
           1       1.00      0.85      0.92    233973
           2       0.03      0.02      0.02      7505
           3       0.73      0.99      0

In [68]:
# Predict and measure prediction time, memory usage, and accuracy: Random Forest
# 60sec interval
print("\n60 sec interval\n")
prediction_time, prediction_memory, accuracy = predict_model(rf_model_60, X_new_60, y_new_true_60)
print(f"Prediction time: {prediction_time:.4f} seconds")
print(f"Prediction peak memory usage: {prediction_memory:.2f} MB")
print(f"Accuracy: {accuracy}")

# 15sec interval
print("\n15 sec interval\n")
prediction_time, prediction_memory, accuracy = predict_model(rf_model_15, X_new_15, y_new_true_15)
print(f"Prediction time: {prediction_time:.4f} seconds")
print(f"Prediction peak memory usage: {prediction_memory:.2f} MB")
print(f"Accuracy: {accuracy}")


60 sec interval

Prediction time: 3.0330 seconds
Prediction peak memory usage: 144.91 MB
Accuracy:               precision    recall  f1-score   support

           0       0.90      0.95      0.93     35134
           1       1.00      0.96      0.98    123434
           2       0.98      0.84      0.90      7130
           3       0.94      0.97      0.96     23063
           4       0.78      0.93      0.85     35062
           5       0.84      0.70      0.77     21218

    accuracy                           0.93    245041
   macro avg       0.91      0.89      0.90    245041
weighted avg       0.93      0.93      0.93    245041


15 sec interval

Prediction time: 3.6233 seconds
Prediction peak memory usage: 195.73 MB
Accuracy:               precision    recall  f1-score   support

           0       0.58      0.95      0.72     38435
           1       0.97      0.89      0.93    233973
           2       0.96      0.79      0.87      7505
           3       0.96      0.97      0

In [67]:
# 15sec interval
# Encode the target labels
label_encoder = LabelEncoder()
y_true_encoded = label_encoder.fit_transform(y_new_true_15)
y_true_categorical = to_categorical(y_true_encoded)

# Standardize the feature columns using the same scaler used for training
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X)  # X from the training phase
X_new_scaled = scaler.transform(X_new_15)

# Make predictions on the new data
y_pred = model.predict(X_new_scaled)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true_classes = np.argmax(y_true_categorical, axis=1)

# Decode the predicted and true classes
y_pred_labels = label_encoder.inverse_transform(y_pred_classes)
y_true_labels = label_encoder.inverse_transform(y_true_classes)

# Evaluate the model
accuracy = accuracy_score(y_true_classes, y_pred_classes)
print(f"Test accuracy on new data: {accuracy:.4f}")

# 60sec interval
# Encode the target labels
label_encoder = LabelEncoder()
y_true_encoded = label_encoder.fit_transform(y_new_true_60)
y_true_categorical = to_categorical(y_true_encoded)

# Standardize the feature columns using the same scaler used for training
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X)  # X from the training phase
X_new_scaled = scaler.transform(X_new_60)

# Make predictions on the new data
y_pred = model.predict(X_new_scaled)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true_classes = np.argmax(y_true_categorical, axis=1)

# Decode the predicted and true classes
y_pred_labels = label_encoder.inverse_transform(y_pred_classes)
y_true_labels = label_encoder.inverse_transform(y_true_classes)

# Evaluate the model
accuracy = accuracy_score(y_true_classes, y_pred_classes)
print(f"Test accuracy on new data: {accuracy:.4f}")

[1m13723/13723[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 683us/step
Test accuracy on new data: 0.9102
[1m7658/7658[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 695us/step
Test accuracy on new data: 0.9021
