# Developing an Autoencoder (AE) for Behavioral Representation Learning:

### Imports

In [20]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import joblib
from tensorflow.keras import layers, models
from typing import Tuple
from sklearn.preprocessing import StandardScaler

### Defining Autoencoder Class:

In [None]:
class Autoencoder:
    """
    Autoencoder for learning latent representations of UEBA behavioral features.

    The network is a dense stack (input -> hidden -> latent -> hidden -> output)
    compiled with the Adam optimizer and an MSE reconstruction loss. The encoder
    half is also exposed as its own model so embeddings can be extracted.

    Attributes:
        input_dim: The number of input features
        latent_dim: The size of latent embeddings
        hidden_dim: The size of hidden layers
        learning_rate: Optimizer learning rate
        autoencoder: The compiled model trained to reconstruct its own input
        encoder: The model mapping inputs to the latent layer (shares layers with the autoencoder)
        history: The object returned by the most recent fit call, or None before training
    """

    def __init__(self, input_dim: int, latent_dim: int = 16, hidden_dim: int = 64, learning_rate: float = 1e-3) -> None:
        """
        Initializes the autoencoder architecture.

        Args:
            input_dim: The number of input features
            latent_dim: The size of latent embeddings
            hidden_dim: The size of hidden layers
            learning_rate: Optimizer learning rate

        Raises:
            ValueError: If any of the three dimensions is not positive
        """
        # Rejecting degenerate layer sizes up front gives a clear error instead of
        # a failure (or a useless model) deep inside the model construction.
        for name, value in (("input_dim", input_dim), ("latent_dim", latent_dim), ("hidden_dim", hidden_dim)):
            if value <= 0:
                raise ValueError(f"{name} must be a positive integer, got {value!r}")

        self.input_dim = input_dim
        self.latent_dim = latent_dim
        self.hidden_dim = hidden_dim
        self.learning_rate = learning_rate

        # Populated by train() so the loss curves stay available after fitting
        self.history = None

        self.autoencoder, self.encoder = self._build_model()

    def _build_model(self) -> Tuple["tf.keras.Model", "tf.keras.Model"]:
        """
        Builds the autoencoder and encoder models. The autoencoder serves as the model to train, whereas the encoder
        will be utilized for extracting behavioral embeddings.

        Returns:
            tuple: A two-element tuple containing the autoencoder and encoder models
        """
        # Encoder construction
        inputs = layers.Input(shape=(self.input_dim,), name="ueba_input")
        x = layers.Dense(self.hidden_dim, activation="relu")(inputs)
        latent = layers.Dense(self.latent_dim, activation="relu", name="latent_space")(x)

        # Decoder construction (linear output so reconstructions are unbounded)
        x = layers.Dense(self.hidden_dim, activation="relu")(latent)
        outputs = layers.Dense(self.input_dim, activation="linear")(x)

        # Both models are built on the same input/latent tensors, so the encoder
        # shares its layers (and therefore its trained weights) with the autoencoder
        autoencoder = models.Model(inputs, outputs, name="ueba_autoencoder")
        encoder = models.Model(inputs, latent, name="ueba_encoder")

        # Only the autoencoder is compiled because it is the only model that is trained
        autoencoder.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate),
            loss="mse"
        )

        return (autoencoder, encoder)

    def train(self, x_train: np.ndarray, epochs: int = 50, batch_size: int = 128, validation_split: float = 0.1) -> None:
        """
        Trains the autoencoder using the specified hyperparameters.

        The input matrix is used as both the features and the target. The object
        returned by the fit call is stored on ``self.history``.

        Args:
            x_train: The scaled UEBA-enhanced feature matrix
            epochs: The number of epochs to train the autoencoder for
            batch_size: Batch size
            validation_split: The validation data ratio

        Returns:
            None
        """
        self.history = self.autoencoder.fit(
            x_train,
            x_train,
            epochs=epochs,
            batch_size=batch_size,
            validation_split=validation_split,
            shuffle=True,
            verbose=1
        )

    def encode(self, feature_matrix: np.ndarray, batch_size: int = 32) -> np.ndarray:
        """
        Generates latent embeddings for UEBA data.

        Args:
            feature_matrix: The scaled UEBA feature matrix
            batch_size: Rows per prediction batch. The default of 32 is the value Keras
                uses when no batch size is given; a larger value reduces the number of
                prediction steps on big matrices

        Returns:
            np.ndarray: The generated latent embeddings
        """
        return self.encoder.predict(feature_matrix, batch_size=batch_size)

    def reconstruction_error(self, feature_matrix: np.ndarray, batch_size: int = 32) -> np.ndarray:
        """
        Computes the reconstruction error per sample.

        Args:
            feature_matrix: The scaled UEBA feature matrix
            batch_size: Rows per prediction batch (32 is the Keras default when unspecified)

        Returns:
            np.ndarray: Reconstruction MSE per sample
        """
        # Coercing to an array so array-likes such as DataFrames still yield an
        # ndarray result rather than a pandas object
        features = np.asarray(feature_matrix)

        # Reconstructing original feature matrix
        reconstruction = self.autoencoder.predict(features, batch_size=batch_size)

        # Computing the mean squared error across the feature axis
        error = np.mean(np.square(features - reconstruction), axis=1)

        return error

### Loading UEBA-Enhanced Feature Matrix:

In [6]:
# Building the path from its components so the separator matches the host OS
# (a hard-coded backslash only resolves on Windows)
ueba_matrix = pd.read_csv(os.path.join("processed_datasets", "ueba_dataset.csv"), index_col=0)

In [7]:
# Previewing the first five rows of the loaded feature matrix
ueba_matrix.head(n=5)

Unnamed: 0,user,pc,day,logon_count,logoff_count,off_hours_logon,file_open_count,file_write_count,file_copy_count,file_delete_count,...,usb_remove_count_rolling_delta,off_hours_usb_usage_rolling_delta,emails_sent_rolling_delta,unique_recipients_rolling_delta,external_emails_rolling_delta,attachements_sent_rolling_delta,off_hours_emails_rolling_delta,usb_file_activity_flag,off_hours_activity_flag,external_comm_activity_flag
0,aab0162,pc-6599,2010-01-04,1,1,2,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,1
1,aab0162,pc-6599,2010-01-05,1,1,2,0,0,0,0,...,0.0,0.0,0.0,1.5,0.0,0.5,-2.5,0,0,1
2,aab0162,pc-6599,2010-01-06,1,1,2,0,0,0,0,...,0.0,0.0,0.0,1.0,-0.666667,-0.333333,3.666667,0,1,0
3,aab0162,pc-6599,2010-01-07,1,1,2,0,0,0,0,...,0.0,0.0,0.0,0.0,1.0,-0.25,0.5,0,1,1
4,aab0162,pc-6599,2010-01-08,1,1,2,0,0,0,0,...,0.0,0.0,0.0,0.0,0.8,-1.0,-3.6,0,0,1


In [8]:
# Removing the identifier columns (user, pc, day) and keeping the rest as an array
x = ueba_matrix.drop(["user", "pc", "day"], axis="columns").values

In [9]:
# Fitting the scaler on the feature matrix, then standardizing that same matrix
scaler = StandardScaler().fit(x)
x_scaled = scaler.transform(x)

### Initializing and Training Autoencoder:

In [10]:
# The input width is taken from the number of columns in the scaled matrix
ae = Autoencoder(input_dim=x_scaled.shape[1], latent_dim=16, hidden_dim=64)

In [11]:
# Fitting the autoencoder on the scaled feature matrix
ae.train(
    x_train=x_scaled,
    epochs=40,
    batch_size=128,
)

Epoch 1/40
[1m11003/11003[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 1ms/step - loss: 0.1573 - val_loss: 0.0602
Epoch 2/40
[1m11003/11003[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 1ms/step - loss: 0.0523 - val_loss: 0.0528
Epoch 3/40
[1m11003/11003[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 1ms/step - loss: 0.0456 - val_loss: 0.0478
Epoch 4/40
[1m11003/11003[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 1ms/step - loss: 0.0412 - val_loss: 0.0441
Epoch 5/40
[1m11003/11003[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 1ms/step - loss: 0.0391 - val_loss: 0.0433
Epoch 6/40
[1m11003/11003[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 1ms/step - loss: 0.0378 - val_loss: 0.0446
Epoch 7/40
[1m11003/11003[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 1ms/step - loss: 0.0363 - val_loss: 0.0398
Epoch 8/40
[1m11003/11003[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 1ms/step - loss: 0.0355 - val_loss: 0.0389


In [12]:
# Passing the scaled matrix through the encoder to obtain the latent embeddings
latent_embeddings = ae.encode(feature_matrix=x_scaled)

[1m48900/48900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 621us/step


### Saving Encoder, Scaler, and Embeddings:

In [25]:
# Joining the components so the separator matches the host OS; a hard-coded
# backslash would create one oddly named directory on non-Windows systems
save_path = os.path.join("encoders", "encoder_model_1")
os.makedirs(save_path, exist_ok=True)

In [26]:
# Writing the encoder model to a .keras file inside the save directory
encoder_path = os.path.join(save_path, "encoder_model.keras")
ae.encoder.save(encoder_path)

In [27]:
# Persisting the fitted scaler so that later data can be standardized with the
# same statistics that were learned here
scaler_path = os.path.join(save_path, "feature_scaler.pkl")
joblib.dump(scaler, scaler_path)

['encoders\\encoder_model_1\\feature_scaler.pkl']

In [28]:
# Storing the encoder's latent embeddings as a .npy file in the save directory
embeddings_path = os.path.join(save_path, "latent_embeddings.npy")
np.save(embeddings_path, latent_embeddings)