In [5]:
from autogluon.tabular import TabularPredictor
import os
from sklearn.model_selection import train_test_split
import pandas as pd

In [6]:
# Repository root: the parent of the current working directory.
PROJECT_ROOT = os.path.abspath("..")

# Both raw UNSW-NB15 CSV files live in the same folder below the root.
raw_dir = os.path.join(PROJECT_ROOT, "data", "unsw-nb15", "raw")
train_path = os.path.join(raw_dir, "UNSW_NB15_training-set.csv")
test_path = os.path.join(raw_dir, "UNSW_NB15_testing-set.csv")

In [7]:
# Columns removed from both CSVs before modelling (skipped silently if absent).
DROP_COLS = ["id", "label"]

# Read each raw split and strip the unwanted columns in one chained expression.
train_df = pd.read_csv(train_path).drop(columns=DROP_COLS, errors="ignore")
test_df = pd.read_csv(test_path).drop(columns=DROP_COLS, errors="ignore")

# Pool the two files into a single frame with a fresh 0..n-1 index.
full_df = pd.concat([train_df, test_df], ignore_index=True)

# Re-split the pooled rows 80/20, keeping the `attack_cat` proportions equal
# in both parts; the fixed seed makes the split repeatable.
train_df, test_df = train_test_split(
    full_df,
    stratify=full_df["attack_cat"],
    test_size=0.2,
    random_state=42,
)

In [15]:
from autogluon.tabular import TabularPredictor

# Locate the saved AutoGluon run relative to PROJECT_ROOT (set in the path
# configuration cell) instead of a user-specific absolute path, so the notebook
# also runs from a checkout in a different location.
predictor_path = os.path.join(
    PROJECT_ROOT, "notebooks", "AutogluonModels", "ag-20251224_121820"
)
predictor = TabularPredictor.load(predictor_path)

# Show which raw dtype / special-type group each input feature was assigned to.
print(predictor.feature_metadata)


('category', [])  :  3 | ['proto', 'service', 'state']
('float', [])     : 11 | ['dur', 'rate', 'sload', 'dload', 'sinpkt', ...]
('int', [])       : 26 | ['spkts', 'dpkts', 'sbytes', 'dbytes', 'sttl', ...]
('int', ['bool']) :  1 | ['is_sm_ips_ports']


In [18]:
# Fetch the model registered as "NeuralNetTorch_BAG_L1" through the predictor's
# trainer, then show its type followed by its default repr (one per line).
model = predictor._trainer.load_model("NeuralNetTorch_BAG_L1")
print(type(model), model, sep="\n")


<class 'autogluon.core.models.ensemble.stacker_ensemble_model.StackerEnsembleModel'>
<autogluon.core.models.ensemble.stacker_ensemble_model.StackerEnsembleModel object at 0x7b6ddb231d80>


In [19]:
# Load the bagged model again under its own name and list the names of the
# child models it holds, header first and the list on the next line.
bag = predictor._trainer.load_model("NeuralNetTorch_BAG_L1")
print("Child models:", bag.models, sep="\n")


Child models:
['S1F1', 'S1F2', 'S1F3', 'S1F4', 'S1F5']


In [25]:
# Shell out to list the files stored on disk for child model S1F1 of the bag
# (path is relative to the notebook's working directory).
!ls AutogluonModels/ag-20251224_121820/models/NeuralNetTorch_BAG_L1/S1F1


model.pkl


In [73]:
from autogluon.tabular.models.tabular_nn.torch.tabular_nn_torch import TabularNeuralNetTorchModel

# Directory of child model S1F1, built from PROJECT_ROOT (set in the path
# configuration cell) rather than a user-specific absolute path so the cell
# works from any checkout location.
fold_path = os.path.join(
    PROJECT_ROOT,
    "notebooks",
    "AutogluonModels",
    "ag-20251224_121820",
    "models",
    "NeuralNetTorch_BAG_L1",
    "S1F1",
)

# Load that single fold directly with the torch tabular-NN model class.
fold_model = TabularNeuralNetTorchModel.load(fold_path)

print(type(fold_model))
print(fold_model)


<class 'autogluon.tabular.models.tabular_nn.torch.tabular_nn_torch.TabularNeuralNetTorchModel'>
<autogluon.tabular.models.tabular_nn.torch.tabular_nn_torch.TabularNeuralNetTorchModel object at 0x7b6c5230ba30>


In [74]:
# Show only the instance attributes whose lower-cased name mentions one of
# these substrings.
print("MODEL KEYS:")
keywords = ("data", "processor", "feature", "network", "model")
print([
    attr_name
    for attr_name in vars(fold_model)
    if any(word in attr_name.lower() for word in keywords)
])


MODEL KEYS:
['model', 'features', 'feature_metadata', '_features_internal', '_features_internal_to_align', '_feature_metadata', '_is_features_in_same_as_ex', '_is_fit_metadata_registered', '_fit_metadata', '_types_of_features', 'feature_arraycol_map', 'feature_type_map', 'features_to_drop', 'processor', 'num_dataloading_workers']


In [75]:
# Every instance attribute name of the fold model, in definition order.
print(list(vars(fold_model)))


['name', 'path_root', 'path', 'num_classes', 'quantile_levels', 'model', 'problem_type', 'conformalize', 'label_cleaner', 'eval_metric', 'stopping_metric', 'normalize_pred_probas', 'features', 'feature_metadata', '_features_internal', '_features_internal_to_align', '_feature_metadata', '_is_features_in_same_as_ex', 'fit_time', 'predict_time', '_predict_n_size', 'predict_1_time', 'compile_time', 'val_score', '_memory_usage_estimate', '_user_params', '_user_params_aux', 'params', 'params_aux', 'params_trained', 'nondefault_params', '_is_initialized', '_is_fit_metadata_registered', '_fit_metadata', 'saved_learning_curves', '_compiler', 'random_seed', '_types_of_features', 'feature_arraycol_map', 'feature_type_map', 'features_to_drop', 'processor', 'num_dataloading_workers', '_architecture_desc', 'optimizer', 'device', 'max_batch_size', '_num_cpus_infer']


In [76]:
# The fold model keeps its fitted preprocessing object in `processor`;
# print its type and then its repr, one per line.
processor = fold_model.processor
print(type(processor), processor, sep="\n")


<class 'sklearn.compose._column_transformer.ColumnTransformer'>
ColumnTransformer(force_int_remainder_cols=False, remainder='passthrough',
                  transformers=[('continuous',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('scaler', StandardScaler())]),
                                 ['sttl', 'dttl', 'swin', 'dwin']),
                                ('skewed',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('quantile',
                                                  QuantileTransformer(output_distribution='nor...
                                  'sinpkt', 'dinpkt', 'sjit', 'djit', 'stcpb',
                                  'dtcpb', 'tcprtt', 'synack', 'ackdat',
                  

In [70]:
# Names of all instance attributes held by the processor object.
print("Processor attributes:")
print(list(vars(processor)))


Processor attributes:
['transformers', 'remainder', 'sparse_threshold', 'n_jobs', 'transformer_weights', 'verbose', 'verbose_feature_names_out', 'force_int_remainder_cols', 'feature_names_in_', 'n_features_in_', '_columns', '_transformer_to_input_indices', '_remainder', 'sparse_output_', 'transformers_', 'output_indices_']


In [71]:
# Print each feature-related attribute of the fold model next to its label.
# The attribute is looked up inside the loop, so lines appear in this order.
for label, attr_name in (
    ("External features:", "features"),
    ("Internal features:", "_features_internal"),
    ("Feature array col map:", "feature_arraycol_map"),
    ("Feature type map:", "feature_type_map"),
    ("Features dropped:", "features_to_drop"),
):
    print(label, getattr(fold_model, attr_name))


External features: ['dur', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth', 'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'is_ftp_login', 'ct_flw_http_mthd', 'ct_src_ltm', 'ct_srv_dst', 'is_sm_ips_ports', 'proto', 'service', 'state']
Internal features: ['dur', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth', 'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'is_ftp_login', 'ct_flw_http_mthd', 'ct_src_ltm', 'ct_srv_dst', 'is_sm_ips_ports', 'proto', 'service', 'state']
Feature array 

In [72]:
# Take the object stored in the fold model's `model` attribute and print it;
# the recorded output shows it is an EmbedNet with its layer layout.
net = fold_model.model
print(net)


EmbedNet(
  (embed_blocks): ModuleList(
    (0): Embedding(102, 21)
    (1): Embedding(14, 7)
    (2): Embedding(9, 5)
  )
  (main_block): Sequential(
    (0): Linear(in_features=71, out_features=256, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.1, inplace=False)
    (3): Linear(in_features=256, out_features=256, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.1, inplace=False)
    (6): Linear(in_features=256, out_features=256, bias=True)
    (7): ReLU()
    (8): Dropout(p=0.1, inplace=False)
    (9): Linear(in_features=256, out_features=256, bias=True)
    (10): ReLU()
    (11): Linear(in_features=256, out_features=10, bias=True)
  )
  (softmax): Softmax(dim=1)
)


In [35]:
# Get the network's state dict and list its keys, i.e. the names of the
# stored weight and bias tensors.
state_dict = net.state_dict()
print(state_dict.keys())


odict_keys(['embed_blocks.0.weight', 'embed_blocks.1.weight', 'embed_blocks.2.weight', 'main_block.0.weight', 'main_block.0.bias', 'main_block.3.weight', 'main_block.3.bias', 'main_block.6.weight', 'main_block.6.bias', 'main_block.9.weight', 'main_block.9.bias', 'main_block.11.weight', 'main_block.11.bias'])


## Replicating preprocessing and the NN model

In [36]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, QuantileTransformer, OrdinalEncoder
from sklearn.impute import SimpleImputer

class Preprocessor:
    """Hand-written feature preprocessing for the UNSW-NB15 columns.

    Three column groups are processed independently:

    * ``cont_features`` - median imputation, then ``StandardScaler``.
    * ``skew_features`` - median imputation, then ``QuantileTransformer`` with a
      normal output distribution.
    * ``cat_features``  - ``OrdinalEncoder``; a category that was not present
      during ``fit`` is encoded as ``-1``.

    ``transform`` returns a single 2-D array whose columns follow
    ``feature_order``.

    NOTE(review): ``-1`` is not a valid lookup index for ``nn.Embedding``, so
    unseen categories must be remapped before the categorical columns are fed to
    an embedding layer.
    NOTE(review): a scikit-learn ``ColumnTransformer`` emits its output grouped
    by transformer rather than in input-column order, so the layout produced
    here may differ from what the fitted AutoGluon processor feeds the network -
    confirm against ``feature_arraycol_map`` before reusing trained weights.
    """

    def __init__(self):
        # Continuous features: impute with the median, then standardise
        self.cont_features = ['sttl', 'dttl', 'swin', 'dwin']
        self.cont_imputer = SimpleImputer(strategy='median')
        self.cont_scaler = StandardScaler()

        # Skewed numeric features: impute with the median, then map to a normal
        # distribution by quantiles
        self.skew_features = [
            'dur', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sload', 'dload',
            'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'stcpb', 'dtcpb',
            'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth',
            'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm',
            'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'is_ftp_login',
            'ct_flw_http_mthd', 'ct_src_ltm', 'ct_srv_dst', 'is_sm_ips_ports'
        ]
        self.skew_imputer = SimpleImputer(strategy='median')
        self.skew_scaler = QuantileTransformer(output_distribution='normal', random_state=42)

        # Categorical features: integer codes, -1 for categories unseen in fit
        self.cat_features = ['proto', 'service', 'state']
        self.cat_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

        # Column order of the array returned by transform()
        self.feature_order = [
            'dur', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sttl', 'dttl', 'sload',
            'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb',
            'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth',
            'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm',
            'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'is_ftp_login', 'ct_flw_http_mthd', 'ct_src_ltm',
            'ct_srv_dst', 'is_sm_ips_ports', 'proto', 'service', 'state'
        ]

    def _require_columns(self, df):
        """Raise ``KeyError`` naming every required input column missing from ``df``."""
        required = self.cont_features + self.skew_features + self.cat_features
        missing = [name for name in required if name not in df.columns]
        if missing:
            raise KeyError(f"Input is missing required columns: {missing}")

    def fit(self, df):
        """Fit the imputers, scalers and encoder on ``df``.

        Each scaler is fitted on the output of its already-fitted imputer.

        Args:
            df: DataFrame containing every column of the three feature groups.

        Returns:
            ``self``, so calls can be chained.

        Raises:
            KeyError: if ``df`` lacks any required column.
        """
        self._require_columns(df)

        self.cont_imputer.fit(df[self.cont_features])
        self.cont_scaler.fit(self.cont_imputer.transform(df[self.cont_features]))

        self.skew_imputer.fit(df[self.skew_features])
        self.skew_scaler.fit(self.skew_imputer.transform(df[self.skew_features]))

        self.cat_encoder.fit(df[self.cat_features])
        return self

    def transform(self, df):
        """Apply the fitted transformers and assemble one array in ``feature_order``.

        Args:
            df: DataFrame containing every column of the three feature groups.

        Returns:
            2-D array with one row per input row and one column per entry of
            ``feature_order``.

        Raises:
            KeyError: if ``df`` lacks any required column.
            ValueError: if ``feature_order`` names a feature that belongs to
                none of the three groups.
        """
        self._require_columns(df)

        cont = self.cont_scaler.transform(self.cont_imputer.transform(df[self.cont_features]))
        skew = self.skew_scaler.transform(self.skew_imputer.transform(df[self.skew_features]))
        cat = self.cat_encoder.transform(df[self.cat_features])

        # Map each feature name to its transformed column once, instead of
        # scanning the group lists for every entry of feature_order. setdefault
        # keeps the first match, so a name listed in several groups resolves in
        # the order continuous -> skewed -> categorical.
        columns = {}
        for names, block in (
            (self.cont_features, cont),
            (self.skew_features, skew),
            (self.cat_features, cat),
        ):
            for position, name in enumerate(names):
                columns.setdefault(name, block[:, position])

        for name in self.feature_order:
            if name not in columns:
                raise ValueError(f"Unknown feature: {name}")

        return np.column_stack([columns[name] for name in self.feature_order])

    def fit_transform(self, df):
        """Fit on ``df`` and return its transformed array (``fit`` then ``transform``)."""
        return self.fit(df).transform(df)


In [37]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class EmbedNet(nn.Module):
    """Feed-forward classifier over numeric inputs plus embedded categoricals.

    The submodule names and ``main_block`` positions (Linear layers at indices
    0, 3, 6, 9 and 11) are laid out like the network printed from the loaded
    fold model, so that network's ``state_dict`` keys line up with this class.

    Args:
        embedding_sizes: iterable of ``(num_categories, embedding_dim)`` pairs,
            one per categorical column. May be empty.
        num_continuous: number of numeric input columns.
        hidden_size: width of every hidden layer.
        num_classes: number of output classes.
        dropout: dropout probability after the first three hidden layers.
    """

    def __init__(self, embedding_sizes, num_continuous, hidden_size=256, num_classes=10, dropout=0.1):
        super().__init__()
        # Materialise once: the sizes are read twice below, which would
        # silently yield a zero embedding width for a one-shot iterator.
        embedding_sizes = list(embedding_sizes)

        # One embedding table per categorical column
        self.embed_blocks = nn.ModuleList(
            [nn.Embedding(cat_size, emb_dim) for cat_size, emb_dim in embedding_sizes]
        )

        total_emb_dim = sum(emb_dim for _, emb_dim in embedding_sizes)
        self.num_inputs = num_continuous + total_emb_dim

        # Four hidden layers (dropout after the first three) and the output layer
        self.main_block = nn.Sequential(
            nn.Linear(self.num_inputs, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, num_classes)
        )
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x_cont, x_cat):
        """Return per-class probabilities for a batch.

        Args:
            x_cont: ``(batch, num_continuous)`` float tensor.
            x_cat: ``(batch, n_categorical)`` tensor of integer category codes;
                column ``i`` is looked up in ``embed_blocks[i]``. Ignored (may
                be ``None``) when the network has no embedding tables.

        Returns:
            ``(batch, num_classes)`` tensor; each row is a softmax output.
        """
        # Start from the numeric block and append one embedding per categorical
        # column. With no embedding tables this is just x_cont, which avoids
        # calling torch.cat on an empty list.
        pieces = [x_cont]
        pieces.extend(
            emb_layer(x_cat[:, i].long()) for i, emb_layer in enumerate(self.embed_blocks)
        )
        x = torch.cat(pieces, dim=1)
        return self.softmax(self.main_block(x))


In [38]:
# (num_categories, embedding_dim) for each categorical column
embedding_sizes = [
    (102, 21),  # proto
    (14, 7),    # service
    (9, 5),     # state
]
num_classes = 10
# 41 input features in total; the categorical ones are embedded separately
num_continuous = 41 - len(embedding_sizes)

In [40]:
# Build a freshly initialised network with the sizes configured above
model = EmbedNet(embedding_sizes, num_continuous=num_continuous, num_classes=num_classes)

# Dummy batch of 5 rows: Gaussian noise for the numeric block and, for each
# categorical column, random codes below that column's category count
batch_size = 5
x_cont = torch.randn(batch_size, num_continuous)
x_cat = torch.zeros(batch_size, 3, dtype=torch.long)
for col, n_levels in enumerate((102, 14, 9)):  # proto, service, state
    x_cat[:, col] = torch.randint(0, n_levels, (batch_size,))

# Forward pass; the result should have shape (5,10)
y_pred = model(x_cont, x_cat)
print(y_pred.shape)


torch.Size([5, 10])


In [43]:
from autogluon.tabular import TabularPredictor

# Load the entire predictor. The run directory is built from PROJECT_ROOT (set
# in the path configuration cell) instead of a user-specific absolute path, so
# the cell works from any checkout location.
predictor_path = os.path.join(
    PROJECT_ROOT, "notebooks", "AutogluonModels", "ag-20251224_121820"
)
predictor = TabularPredictor.load(predictor_path)

# Get the bagged neural network
bag_model = predictor._trainer.load_model('NeuralNetTorch_BAG_L1')

# List child models
print(bag_model.models)  # ['S1F1', 'S1F2', ...]

# Load one child
child_model = bag_model.load_child('S1F1')

# Access the PyTorch network held by the child and list its parameter names
torch_model = child_model.model
state_dict = torch_model.state_dict()
print(state_dict.keys())


['S1F1', 'S1F2', 'S1F3', 'S1F4', 'S1F5']
odict_keys(['embed_blocks.0.weight', 'embed_blocks.1.weight', 'embed_blocks.2.weight', 'main_block.0.weight', 'main_block.0.bias', 'main_block.3.weight', 'main_block.3.bias', 'main_block.6.weight', 'main_block.6.bias', 'main_block.9.weight', 'main_block.9.bias', 'main_block.11.weight', 'main_block.11.bias'])


In [45]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from autogluon.tabular import TabularPredictor

# =========================
# 1. Load AutoGluon Predictor and Extract Weights
# =========================
# The run directory is built from PROJECT_ROOT (set in the path configuration
# cell) instead of a user-specific absolute path, so the cell works from any
# checkout location.
predictor_path = os.path.join(
    PROJECT_ROOT, "notebooks", "AutogluonModels", "ag-20251224_121820"
)
predictor = TabularPredictor.load(predictor_path)

# Load bagged NN and its child model S1F1
bag_model = predictor._trainer.load_model('NeuralNetTorch_BAG_L1')
child_model = bag_model.load_child('S1F1')

# Extract PyTorch model and state_dict
torch_model = child_model.model
state_dict = torch_model.state_dict()
print("State dict keys:", state_dict.keys())

# =========================
# 2. Preprocessing
# =========================
# Numeric input columns. The first four are the 'continuous' group shown in the
# processor printout; the rest are treated as numeric here as well.
continuous_features = ['sttl', 'dttl', 'swin', 'dwin', 'dur', 'spkts', 'dpkts', 'sbytes', 'dbytes',
                       'rate', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit',
                       'stcpb', 'dtcpb', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth',
                       'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm',
                       'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'is_ftp_login', 'ct_flw_http_mthd',
                       'ct_src_ltm', 'ct_srv_dst', 'is_sm_ips_ports']

# Categorical features
categorical_features = ['proto', 'service', 'state']

# Load training data to fit scalers/encoders. The path is built from
# PROJECT_ROOT (set in the path configuration cell) rather than a user-specific
# absolute path, so the cell works from any checkout location.
train_path = os.path.join(PROJECT_ROOT, "data", "unsw-nb15", "raw", "UNSW_NB15_training-set.csv")
train_df = pd.read_csv(train_path).drop(columns=["id", "label"], errors="ignore")

# Continuous preprocessing: median imputation, then standardisation.
# NOTE(review): the processor printed from the fold model runs its 'skewed'
# group through a QuantileTransformer, while every numeric column here goes
# through StandardScaler - confirm this matches what the loaded weights expect.
# NOTE(review): the transformers are fitted on the raw training CSV only, not
# on the pooled 80/20 re-split built in the data-loading cell - confirm intended.
cont_imputer = SimpleImputer(strategy="median")
cont_scaler = StandardScaler()
train_cont = cont_scaler.fit_transform(cont_imputer.fit_transform(train_df[continuous_features]))

# Categorical preprocessing: integer codes, -1 for categories unseen during fit
cat_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
train_cat = cat_encoder.fit_transform(train_df[categorical_features])

# =========================
# 3. Define EmbedNet (replicating AutoGluon NN)
# =========================
class EmbedNet(nn.Module):
    """Feed-forward classifier over numeric inputs plus embedded categoricals.

    The submodule names and ``main_block`` positions (Linear layers at indices
    0, 3, 6, 9 and 11) are laid out like the network printed from the loaded
    fold model, so that network's ``state_dict`` can be loaded into this class.

    Args:
        embed_input_sizes: iterable of ``(num_categories, embedding_dim)``
            pairs, one per categorical column. May be empty.
        continuous_input_size: number of numeric input columns.
        hidden_size: width of every hidden layer.
        output_size: number of output classes.
        dropout: dropout probability after the first three hidden layers.
    """

    def __init__(self, embed_input_sizes, continuous_input_size, hidden_size=256, output_size=10, dropout=0.1):
        super().__init__()
        # Materialise once: the sizes are read twice below, which would
        # silently yield a zero embedding width for a one-shot iterator.
        embed_input_sizes = list(embed_input_sizes)

        # Embeddings, one table per categorical column
        self.embed_blocks = nn.ModuleList([
            nn.Embedding(num_categories, dim)
            for num_categories, dim in embed_input_sizes
        ])

        # Main fully connected block
        total_embed_size = sum(dim for _, dim in embed_input_sizes)
        self.main_block = nn.Sequential(
            nn.Linear(continuous_input_size + total_embed_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, output_size)
        )
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x_cont, x_cat):
        """Return per-class probabilities for a batch.

        Args:
            x_cont: ``(batch, continuous_input_size)`` float tensor.
            x_cat: ``(batch, n_categorical)`` tensor of integer category codes;
                column ``i`` is looked up in ``embed_blocks[i]``. Ignored (may
                be ``None``) when the network has no embedding tables.

        Returns:
            ``(batch, output_size)`` tensor; each row is a softmax output.
        """
        # Start from the numeric block and append one embedding per categorical
        # column. With no embedding tables this is just x_cont, which avoids
        # calling torch.cat on an empty list.
        pieces = [x_cont]
        pieces.extend(
            emb_layer(x_cat[:, i].long()) for i, emb_layer in enumerate(self.embed_blocks)
        )
        x = torch.cat(pieces, dim=1)
        x = self.main_block(x)
        return self.softmax(x)

# =========================
# 4. Instantiate and Load Weights
# =========================
# (num_categories, embedding_dim) per categorical column; the values match the
# Embedding layers printed for the loaded network
embed_input_sizes = [(102, 21), (14, 7), (9, 5)]  # proto, service, state
output_size = 10  # 10 classes
continuous_input_size = len(continuous_features)

# Build the replica network, copy in the weights taken from child model S1F1,
# and switch it to evaluation mode
model = EmbedNet(
    embed_input_sizes,
    continuous_input_size,
    hidden_size=256,
    output_size=output_size,
    dropout=0.1,
)
model.load_state_dict(state_dict)
model.eval()

# =========================
# 5. Predict on new data
# =========================
# Example batch (replace with real preprocessed data): the first rows of the
# scaled numeric training matrix
batch_size = 5
x_cont = torch.tensor(train_cont[:batch_size], dtype=torch.float32)

# Placeholder categorical codes: random indices drawn below each embedding
# table's category count, so every lookup stays in range
x_cat_test = torch.zeros((batch_size, len(embed_input_sizes)), dtype=torch.long)
for col, (num_categories, _) in enumerate(embed_input_sizes):  # proto, service, state
    x_cat_test[:, col] = torch.randint(0, num_categories, (batch_size,))

# Inference only: skip autograd bookkeeping for the forward pass
with torch.no_grad():
    y_pred = model(x_cont, x_cat_test)
print(y_pred.shape)



State dict keys: odict_keys(['embed_blocks.0.weight', 'embed_blocks.1.weight', 'embed_blocks.2.weight', 'main_block.0.weight', 'main_block.0.bias', 'main_block.3.weight', 'main_block.3.bias', 'main_block.6.weight', 'main_block.6.bias', 'main_block.9.weight', 'main_block.9.bias', 'main_block.11.weight', 'main_block.11.bias'])
torch.Size([5, 10])
