In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import fbeta_score

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl

class ExpertModel(nn.Module):
    """Feed-forward expert network.

    Every entry of ``hidden_units`` contributes a ``Linear -> ReLU -> Dropout``
    stage; a final ``Linear`` layer maps the last hidden width to
    ``output_dim``. The whole stack is stored in ``self.model``.

    Args:
        input_dim: Number of input features.
        output_dim: Width of the final linear layer.
        hidden_units: Iterable with the width of each hidden stage.
        dropout_rate: Dropout probability applied after every hidden stage.
    """

    def __init__(self, input_dim, output_dim, hidden_units, dropout_rate):
        super().__init__()
        # Consecutive entries of `widths` are the (in, out) sizes of each hidden stage.
        widths = [input_dim, *hidden_units]
        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack += [nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(dropout_rate)]
        stack.append(nn.Linear(widths[-1], output_dim))
        self.model = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the sequential stack and return its raw output."""
        return self.model(x)

class GateModel(nn.Module):
    """Gating network that produces one weight per expert.

    The hidden part mirrors the expert layout (``Linear -> ReLU -> Dropout``
    per entry of ``hidden_units``); the last ``Linear`` layer has
    ``num_experts`` outputs, which ``forward`` turns into a softmax over
    dimension 1.

    Args:
        input_dim: Number of input features.
        num_experts: Number of gate outputs.
        hidden_units: Iterable with the width of each hidden stage.
        dropout_rate: Dropout probability applied after every hidden stage.
    """

    def __init__(self, input_dim, num_experts, hidden_units, dropout_rate):
        super().__init__()
        stages = []
        previous_width = input_dim
        for width in hidden_units:
            stages.extend((
                nn.Linear(previous_width, width),
                nn.ReLU(),
                nn.Dropout(dropout_rate),
            ))
            previous_width = width
        stages.append(nn.Linear(previous_width, num_experts))
        self.model = nn.Sequential(*stages)

    def forward(self, x):
        """Return the softmax (over dim 1) of the gate logits for ``x``."""
        logits = self.model(x)
        return F.softmax(logits, dim=1)


class MixtureOfExperts(pl.LightningModule):
    """Sparse mixture-of-experts classifier trained with cross-entropy.

    A ``GateModel`` scores every expert for each sample. The scores are divided
    by a running per-expert usage counter, the ``num_active_experts`` highest
    resulting scores select the active experts, and the final output is the
    gate-weighted sum of the selected experts' outputs (weights renormalised
    over the selected experts).

    Args:
        input_dim: Number of input features.
        output_dim: Number of outputs (logits) produced by every expert.
        num_experts: Number of ``ExpertModel`` instances.
        expert_hidden_units: Hidden widths of every expert.
        gate_hidden_units: Hidden widths of the gate network.
        num_active_experts: How many experts contribute to each sample.
        dropout_rate: Dropout probability for experts and gate.
        learning_rate: Initial Adam learning rate.
    """

    def __init__(self, input_dim, output_dim, num_experts, expert_hidden_units, gate_hidden_units, num_active_experts, dropout_rate, learning_rate=1e-3):
        super(MixtureOfExperts, self).__init__()
        self.save_hyperparameters()

        self.experts = nn.ModuleList([ExpertModel(input_dim, output_dim, expert_hidden_units, dropout_rate) for _ in range(num_experts)])
        self.gate = GateModel(input_dim, num_experts, gate_hidden_units, dropout_rate)
        self.num_active_experts = num_active_experts
        # Registered as a buffer so it follows the module across devices.
        # persistent=False keeps it out of state_dict, so checkpoint keys are
        # the same as when this was a plain tensor attribute.
        self.register_buffer(
            'expert_usage_count',
            torch.zeros(num_experts, dtype=torch.float32),
            persistent=False,
        )

        self.learning_rate = learning_rate
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, x):
        """Return the gate-weighted combination of the selected experts.

        Side effect: adds this batch's expert selections to
        ``self.expert_usage_count``.
        NOTE(review): the counter is updated on every forward pass, including
        validation and inference — confirm that this is intended.
        """
        num_experts = len(self.experts)
        # Stacked along dim 1: one slice per expert.
        expert_outputs = torch.stack([expert(x) for expert in self.experts], dim=1)
        gate_output = self.gate(x)

        # Dividing by the usage counter favours experts that were rarely picked.
        expert_usage_count_adjusted = self.expert_usage_count + 1e-10
        importance_scores = gate_output / expert_usage_count_adjusted

        top_n_expert_indices = torch.argsort(importance_scores, dim=1, descending=True)[:, :self.num_active_experts]
        # The column slice above is non-contiguous whenever fewer experts are
        # kept than exist, so `.view(-1)` would raise; `.reshape(-1)` copies
        # when required.
        selected_expert_indices = top_n_expert_indices.reshape(-1)

        self.expert_usage_count += torch.bincount(selected_expert_indices, minlength=num_experts).float()

        # 1.0 for selected experts, 0.0 otherwise (indices are unique per row).
        mask = torch.zeros_like(gate_output).scatter_(1, top_n_expert_indices, 1.0)
        masked_gate_output = gate_output * mask
        normalized_gate_output = masked_gate_output / (torch.sum(masked_gate_output, dim=1, keepdim=True) + 1e-7)

        # Broadcast the per-expert weight over the expert output dimension and
        # sum over experts (replaces a per-expert Python loop).
        final_output = torch.sum(expert_outputs * normalized_gate_output.unsqueeze(-1), dim=1)

        return final_output

    def training_step(self, batch, batch_idx):
        """Compute and log the cross-entropy loss for one training batch."""
        x, y = batch
        y_hat = self(x)
        loss = self.criterion(y_hat, y)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log the macro F2 score of the arg-max predictions for one batch.

        NOTE(review): the score is computed per batch; whether the logged
        value equals the F2 over the full validation set depends on how the
        logger aggregates it — confirm.
        """
        x, y = batch
        y_hat = self(x)
        preds = torch.argmax(y_hat, dim=1)
        f2_score = fbeta_score(y.cpu().numpy(), preds.cpu().numpy(), beta=2, average='macro')
        self.log('val_f2', f2_score, prog_bar=True, sync_dist=True)
        return {'val_f2': f2_score}

    def configure_optimizers(self):
        """Adam plus a plateau scheduler that monitors ``val_f2`` (maximised)."""
        # Honour the constructor argument instead of a hard-coded 1e-3.
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='max', factor=0.1, patience=5, verbose=True
        )
        return {
            'optimizer': optimizer,
            'lr_scheduler': {
                'scheduler': scheduler,
                'monitor': 'val_f2',
                'interval': 'epoch',
                'frequency': 1
            }
        }

    def on_fit_start(self):
        """Make sure the usage counter lives on the module's device."""
        # Redundant now that the counter is a registered buffer, kept so the
        # hook still exists for callers that relied on it.
        self.expert_usage_count = self.expert_usage_count.to(self.device)

class ExpertUsageLogger(pl.Callback):
    """Lightning callback that snapshots the model's expert usage counter.

    After every training epoch the current ``expert_usage_count`` of
    ``moe_model`` is copied to ``expert_usage_history`` as a NumPy array.

    Args:
        moe_model: Model exposing an ``expert_usage_count`` tensor.
    """

    def __init__(self, moe_model):
        super(ExpertUsageLogger, self).__init__()
        self.moe_model = moe_model
        self.expert_usage_history = []

    def on_train_epoch_end(self, trainer, pl_module):
        """Append a CPU copy of the usage counter to the history."""
        usage_count = self.moe_model.expert_usage_count.detach().clone().cpu().numpy()
        self.expert_usage_history.append(usage_count)

    def plot_expert_usage(self):
        """Plot one line per expert showing its recorded usage count per epoch."""
        import matplotlib.pyplot as plt
        import numpy as np

        # Without any recorded epoch there is no second dimension to iterate
        # over; bail out instead of raising an IndexError.
        if not self.expert_usage_history:
            print("No expert usage has been recorded yet; nothing to plot.")
            return

        # Rows are epochs, columns are experts. Stacking the stored arrays
        # directly avoids the slow list-of-ndarrays -> tensor conversion.
        usage_history = np.stack(self.expert_usage_history)
        plt.figure(figsize=(10, 6))
        for i in range(usage_history.shape[1]):
            plt.plot(usage_history[:, i], label=f'Expert {i}')
        plt.xlabel('Epoch')
        plt.ylabel('Expert Usage Count')
        plt.title('Expert Usage Over Epochs')
        plt.legend(loc='upper left')
        plt.show()


In [3]:
# Parquet files holding the training and test partitions.
train_data_path = 'CIC_IoMT_2024_WiFi_MQTT_train.parquet'
test_data_path = 'CIC_IoMT_2024_WiFi_MQTT_test.parquet'
# Passed as `train_size` to the stratified sub-sampling of the combined data.
usage_ratio=0.2

In [None]:
# Load both partitions from disk.
df_train = pd.read_parquet(train_data_path)
df_test = pd.read_parquet(test_data_path)

# Combine train and test data.
# `keys=` tags every row with the partition it came from. Without the tag the
# two frames would share index labels (assuming both files carry a default
# 0..n-1 index — TODO confirm), and splitting the sample back by index
# membership would put rows into the wrong partition.
df_combined = pd.concat([df_train, df_test], keys=['train', 'test'])

display(df_train.nunique())
df_train.info()
# Perform stratified sampling
df_sampled, _ = train_test_split(df_combined, train_size=usage_ratio, stratify=df_combined['label'], random_state=42)

# Split back into train and test based on the partition tag, then drop the tag
# so each frame carries its original index again. `.copy()` detaches the
# results from `df_sampled` so the dtype conversion below does not operate on
# a slice.
origin = df_sampled.index.get_level_values(0)
df_train: pd.DataFrame = df_sampled[origin == 'train'].droplevel(0).copy()
df_test: pd.DataFrame = df_sampled[origin == 'test'].droplevel(0).copy()
numeric_columns = df_train.select_dtypes(include=[np.number]).columns
# Store every numeric column as float32.
df_train = df_train.astype({column: np.float32 for column in numeric_columns})
df_test = df_test.astype({column: np.float32 for column in numeric_columns})

In [None]:
# Column dtypes / non-null counts, then the number of distinct values per column.
df_train.info()
df_train.nunique()

In [None]:
# Distinct values present in the 'DHCP' column of the training sample.
df_train['DHCP'].unique()

In [7]:
# Every column except the target ('label') and 'Drate'.
# NOTE(review): no dtype filter is applied here, so the name assumes that all
# remaining columns are numeric — confirm.
numerical_columns = [col for col in df_train.columns if col not in ['label', 'Drate']]

# Separate the target from the features; 'Drate' is discarded together with it.
# This cell rebinds df_train / df_test, so it cannot be re-run on its own.
target_train = df_train['label']
df_train = df_train.drop(columns=['label', 'Drate'])
target_test = df_test['label']
df_test = df_test.drop(columns=['label', 'Drate'])

In [None]:
# Total count of NaN cells across all training feature columns.
print(f"Number of missing values: {df_train.isna().sum().sum()}")

In [None]:
# Summary statistics of the training features.
display(df_train.describe())

In [10]:
from sklearn.preprocessing import OrdinalEncoder


# Encode the labels as numbers. The encoder is fitted on the training labels
# only; a test label that was not seen during fitting is encoded as -1.
# Labels are passed in (and come back) as a single-column 2-D array.
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
target_train_encoded = encoder.fit_transform(target_train.values.reshape(-1, 1))
target_test_encoded = encoder.transform(target_test.values.reshape(-1, 1))

In [None]:
# Shapes of the encoded train / test targets.
target_train_encoded.shape, target_test_encoded.shape

In [None]:
# Absolute z-score of every training value, computed per column in float64.
z_scores = np.abs(stats.zscore(df_train[numerical_columns].astype(np.float64)))

# A row is an outlier if any of its features lies more than 4 standard
# deviations from that feature's mean.
outlier_mask = np.any(z_scores > 4, axis=1)

# Filter out rows with outliers
# Features and encoded targets are filtered with the same mask so they stay
# aligned. Only the training data is filtered.
df_train = df_train[~outlier_mask]
target_train_encoded = target_train_encoded[~outlier_mask]

print(f"{outlier_mask.sum()} out of {len(outlier_mask)} samples were filtered out as outliers.")
print(f"Number of missing values: {df_train.isna().sum().sum()}")

In [None]:
# Show 15 randomly chosen rows of the filtered training data.
df_train.sample(15)

In [None]:
# Per-column mean and standard deviation of the (outlier-filtered) training
# data; displayed here and used by the standardisation step.
mean = df_train.mean()
std = df_train.std()
display(mean)
display(std)

In [None]:
# Standardise both partitions with the training statistics.
# Only the divisor receives an epsilon (it guards against a zero standard
# deviation). Adding it to the mean as well would shift every centred value
# and serves no purpose.
std = std + 1e-5
df_train = ((df_train - mean) / std).dropna(axis=1)
# Keep exactly the columns that survived in the training frame. Dropping NaN
# columns independently could leave train and test with different feature
# sets; any NaN left in the test data is reported by the check below.
df_test = ((df_test - mean) / std)[df_train.columns]

print(f"Number of missing values: {df_train.isna().sum().sum()}")
print(f"Number of missing values: {df_test.isna().sum().sum()}")

In [16]:
# Pearson correlation between every pair of training features (columns).
corr_matrix = np.corrcoef(df_train, rowvar=False)
# Index pairs (i, j) with i < j, i.e. every unordered feature pair once.
upper_triangle_indices = np.triu_indices_from(corr_matrix, k=1)
correlated_pairs = [(i, j) for i, j in zip(*upper_triangle_indices) if np.abs(corr_matrix[i, j]) >= 0.8]
cols_train, cols_test = df_train.columns, df_test.columns
# Of every strongly correlated pair the later column (j) is removed.
correlated_features = set(j for _, j in correlated_pairs)

# Drop by column *name* and stay in pandas. The test frame uses the names
# derived from the training frame, so both keep the same features even if
# their column order differs. The index is reset to a plain RangeIndex.
redundant_columns = cols_train[sorted(correlated_features)]
df_train = df_train.drop(columns=redundant_columns).reset_index(drop=True)
df_test = df_test.drop(columns=redundant_columns).reset_index(drop=True)

In [None]:
# Training column names as they were before the correlation filter.
cols_train

In [None]:
# Training features after the correlation filter.
df_train

In [None]:
# Project the features onto (at most) 15 principal components. The cap keeps
# the cell working when fewer than 15 features survived the correlation filter.
n_components = min(15, df_train.shape[1])
name_cols = [f'PC{i}' for i in range(1, n_components + 1)]
# Fixed seed so that randomised SVD solvers give the same projection on re-run.
pca = PCA(n_components=n_components, random_state=42)
# The projection is fitted on the training data only and then applied to both.
pca.fit(df_train)
reduced_train = pca.transform(df_train)
reduced_test = pca.transform(df_test)
reduced_train = pd.DataFrame(reduced_train, columns=name_cols)
reduced_test = pd.DataFrame(reduced_test, columns=name_cols)
display(reduced_train)
display(reduced_test)

In [20]:
# Training inputs (PCA components) and their encoded labels.
X, y = reduced_train, target_train_encoded

In [21]:
# Test inputs (PCA components) and their encoded labels.
X_test, y_test = reduced_test, target_test_encoded

In [22]:
import optuna
import torch
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint
from torch.utils.data import DataLoader, TensorDataset

# Gate architectures explored by the search, keyed by the string Optuna stores
# as the categorical choice.
GATE_HIDDEN_UNITS_OPTIONS = {
    "16": [16],
    "32": [32],
    "64": [64],
    "32_16": [32, 16],
}


def _training_device():
    """Return 'cuda' when a GPU is available, otherwise 'cpu'."""
    return 'cuda' if torch.cuda.is_available() else 'cpu'


def _as_feature_tensor(X, device):
    """Convert a DataFrame / array-like / tensor of features to float32 on ``device``."""
    if isinstance(X, torch.Tensor):
        return X.to(device=device, dtype=torch.float32)
    return torch.as_tensor(np.asarray(X, dtype=np.float32), device=device)


def _as_label_tensor(y, device):
    """Convert labels of any shape / numeric dtype to a 1-D int64 tensor on ``device``."""
    if isinstance(y, torch.Tensor):
        return y.reshape(-1).to(device=device, dtype=torch.long)
    return torch.as_tensor(np.asarray(y).reshape(-1).astype(np.int64), device=device)


def _make_loader(X, y, batch_size, shuffle=False):
    """Build a DataLoader over ``(features, labels)`` tensors on the training device."""
    device = _training_device()
    dataset = TensorDataset(_as_feature_tensor(X, device), _as_label_tensor(y, device))
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)


def _resolve_output_dim(output_dim, y_train):
    """Return an output width large enough for every class id in ``y_train``.

    ``output_dim`` is treated as a lower bound: cross-entropy with class-index
    targets needs one logit per class id, so a smaller value is widened (and
    reported) instead of failing inside the loss.
    """
    classes_in_labels = int(_as_label_tensor(y_train, 'cpu').max().item()) + 1
    if classes_in_labels > output_dim:
        print(f"output_dim={output_dim} is smaller than the {classes_in_labels} classes "
              f"found in the training labels; using {classes_in_labels}.")
        return classes_in_labels
    return output_dim


def objective(trial, X_train, y_train, X_val, y_val, input_dim, output_dim):
    """Optuna objective: train one MixtureOfExperts and return its validation F2.

    Args:
        trial: Optuna trial used to sample the gate architecture and dropout.
        X_train, y_train: Training features (DataFrame / array / tensor) and labels.
        X_val, y_val: Validation features and labels.
        input_dim: Number of input features.
        output_dim: Number of classes (widened if the labels need more).

    Returns:
        The ``val_f2`` value logged last by the trainer.
    """
    chosen_gate_hidden_units_str = trial.suggest_categorical('gate_hidden_units', list(GATE_HIDDEN_UNITS_OPTIONS.keys()))
    chosen_gate_hidden_units = GATE_HIDDEN_UNITS_OPTIONS[chosen_gate_hidden_units_str]

    dropout_rate = trial.suggest_float('dropout_rate', 0.0, 0.5)

    output_dim = _resolve_output_dim(output_dim, y_train)

    # Instantiate the model
    model = MixtureOfExperts(
        input_dim=input_dim,
        output_dim=output_dim,
        num_experts=output_dim,  # Number of experts equals the number of classes
        expert_hidden_units=[32, 64, 32],
        gate_hidden_units=chosen_gate_hidden_units,
        num_active_experts=3,
        dropout_rate=dropout_rate
    )

    # Initialize the expert usage logger
    expert_usage_logger = ExpertUsageLogger(model)

    # Initialize the PyTorch Lightning trainer
    logger = TensorBoardLogger("logs", name="MoE_experimental")
    lr_monitor = LearningRateMonitor(logging_interval='epoch')
    checkpoint_callback = ModelCheckpoint(monitor='val_f2', mode='max')

    trainer = pl.Trainer(
        max_epochs=300,
        logger=logger,
        callbacks=[lr_monitor, checkpoint_callback, expert_usage_logger],
        # Same device choice as the data tensors; falls back to CPU without a GPU.
        accelerator='gpu' if torch.cuda.is_available() else 'cpu',
    )

    # Create PyTorch DataLoaders (float32 features, 1-D int64 labels)
    train_loader = _make_loader(X_train, y_train, batch_size=8192, shuffle=True)
    val_loader = _make_loader(X_val, y_val, batch_size=8192)

    # Train the model
    trainer.fit(model, train_loader, val_loader)

    # Evaluate the model on the validation set
    val_f2 = trainer.callback_metrics["val_f2"].item()

    return val_f2


# Run the Optuna optimization
def tune_model(X_train, y_train, X_val, y_val, input_dim, output_dim, n_trials=20):
    """Search gate architecture and dropout with Optuna, then retrain the best model.

    Args:
        X_train, y_train: Training features (DataFrame / array / tensor) and labels.
        X_val, y_val: Validation features and labels.
        input_dim: Number of input features.
        output_dim: Number of classes (widened if the labels need more).
        n_trials: Number of Optuna trials.

    Returns:
        ``(best_model, best_params)``: the model retrained with the best
        hyper-parameters and the dictionary of those hyper-parameters.
    """
    # Resolve once so every trial and the final model share the same width.
    output_dim = _resolve_output_dim(output_dim, y_train)

    study = optuna.create_study(direction="maximize")

    study.optimize(lambda trial: objective(trial, X_train, y_train, X_val, y_val, input_dim, output_dim),
                   n_trials=n_trials)

    print(f"Best Hyperparameters: {study.best_params}")

    # Optionally, retrain the model with the best hyperparameters and return it
    best_params = study.best_params
    best_gate_hidden_units = GATE_HIDDEN_UNITS_OPTIONS[best_params['gate_hidden_units']]

    best_model = MixtureOfExperts(
        input_dim=input_dim,
        output_dim=output_dim,
        num_experts=output_dim,
        expert_hidden_units=[32, 64, 32],
        gate_hidden_units=best_gate_hidden_units,
        num_active_experts=3,
        dropout_rate=best_params['dropout_rate']
    )

    # Initialize the expert usage logger
    expert_usage_logger = ExpertUsageLogger(best_model)

    # Train the model with the best hyperparameters
    trainer = pl.Trainer(
        max_epochs=50,
        callbacks=[expert_usage_logger],
        accelerator='gpu' if torch.cuda.is_available() else 'cpu'
    )

    # TensorDataset only accepts tensors, so the inputs are converted first.
    train_loader = _make_loader(X_train, y_train, batch_size=4096, shuffle=True)
    val_loader = _make_loader(X_val, y_val, batch_size=4096)

    trainer.fit(best_model, train_loader, val_loader)

    expert_usage_logger.plot_expert_usage()

    return best_model, study.best_params

In [None]:
# One output logit per class known to the label encoder. Passing 1 here would
# build a single-logit, single-expert model whose arg-max is always class 0.
n_classes = len(encoder.categories_[0])
# NOTE(review): the test partition doubles as the validation set for the
# hyper-parameter search, so the reported validation score is not an unbiased
# estimate of generalisation — consider a separate validation split.
best_model, best_params = tune_model(
    X,
    np.asarray(y).ravel().astype(np.int64),       # 1-D integer class ids
    X_test,
    np.asarray(y_test).ravel().astype(np.int64),
    X.shape[1],
    n_classes,
)
best_params