# Developing an Autoencoder (AE) for Behavioral Representation Learning:

### Imports

In [10]:
import numpy as np
import pandas as pd
import tensorflow as tf
import joblib
from tensorflow.keras import layers, models
from typing import Tuple
from sklearn.preprocessing import StandardScaler

### Defining Autoencoder Class:

In [3]:
class Autoencoder:
    """
    Autoencoder for learning latent representation of UEBA behavioral features.

    The network is a symmetric dense stack: input -> hidden (ReLU) -> latent (ReLU)
    -> hidden (ReLU) -> output (linear). Two Keras models are built over the same
    layers, so training the autoencoder also trains the encoder.

    Attributes:
        input_dim: The number of input features
        latent_dim: The size of latent embeddings
        hidden_dim: The size of hidden layers
        learning_rate: Optimizer learning rate
        autoencoder: The compiled model mapping inputs to their reconstruction
        encoder: The model mapping inputs to the latent layer output
        history: The object returned by the most recent fit call, or None before training
    """

    def __init__(self, input_dim: int, latent_dim: int = 16, hidden_dim: int = 64, learning_rate: float = 1e-3) -> None:
        """
        Initializes the autoencoder architecture.

        Args:
            input_dim: The number of input features
            latent_dim: The size of latent embeddings
            hidden_dim: The size of hidden layers
            learning_rate: Optimizer learning rate

        Returns:
            None:

        Raises:
            ValueError: If any of the layer sizes is smaller than one
        """
        # Rejecting unusable layer sizes up front gives a clearer error than a failure
        # raised from inside layer construction
        for name, value in (("input_dim", input_dim), ("latent_dim", latent_dim), ("hidden_dim", hidden_dim)):
            if value < 1:
                raise ValueError(f"{name} must be a positive integer, got {value!r}")

        self.input_dim = input_dim
        self.latent_dim = latent_dim
        self.hidden_dim = hidden_dim
        self.learning_rate = learning_rate

        # Populated by train() so that loss curves can be inspected afterwards
        self.history = None

        self.autoencoder, self.encoder = self._build_model()

    def _build_model(self) -> Tuple["models.Model", "models.Model"]:
        """
        Builds the autoencoder and encoder models. The autoencoder serves as the model to train, whereas the encoder
        will be utilized for extracting behavioral embeddings.

        Returns:
            tuple: A two-element tuple containing the autoencoder and encoder models
        """
        # Encoder construction
        inputs = layers.Input(shape=(self.input_dim,), name="ueba_input")
        x = layers.Dense(self.hidden_dim, activation="relu")(inputs)
        latent = layers.Dense(self.latent_dim, activation="relu", name="latent_space")(x)

        # Decoder construction (linear output so standardized values of either sign can be reproduced)
        x = layers.Dense(self.hidden_dim, activation="relu")(latent)
        outputs = layers.Dense(self.input_dim, activation="linear")(x)

        # Both models are defined over the same input and layer objects, so they share weights
        autoencoder = models.Model(inputs, outputs, name="ueba_autoencoder")
        encoder = models.Model(inputs, latent, name="ueba_encoder")

        # Only the autoencoder is compiled because it is the only model that is fitted
        autoencoder.compile(
            optimizer=tf.keras.optimizers.Adam(self.learning_rate),
            loss="mse"
        )

        return (autoencoder, encoder)

    def train(self, x_train: np.ndarray, epochs: int = 50, batch_size: int = 128, validation_split: float = 0.1) -> None:
        """
        Trains the autoencoder using the specified hyperparameters. The input is used as its own target, and the
        object returned by the fit call is kept on ``self.history``.

        Note: per the Keras documentation, the validation samples are taken from the end of ``x_train`` before
        shuffling, so the row order of ``x_train`` determines which samples are held out.

        Args:
            x_train: The scaled UEBA-enhanced feature matrix
            epochs: The number of epochs to train the autoencoder for
            batch_size: Batch size
            validation_split: The validation data ratio

        Returns:
            None:
        """
        self.history = self.autoencoder.fit(
            x_train,
            x_train,
            epochs=epochs,
            batch_size=batch_size,
            validation_split=validation_split,
            shuffle=True,
            verbose=1
        )

    def encode(self, feature_matrix: np.ndarray) -> np.ndarray:
        """
        Generates latent embeddings for UEBA data.

        Args:
            feature_matrix: The scaled UEBA feature matrix

        Returns:
            np.ndarray: The generated latent embeddings
        """
        return self.encoder.predict(feature_matrix)

    def reconstruction_error(self, feature_matrix: np.ndarray) -> np.ndarray:
        """
        Computes the reconstruction error per sample.

        Args:
            feature_matrix: The scaled UEBA feature matrix

        Returns:
            np.ndarray: Reconstruction MSE per sample
        """
        # Coercing to an array first so that DataFrame inputs still yield a plain ndarray
        # (subtracting from a DataFrame would otherwise propagate pandas types to the result)
        features = np.asarray(feature_matrix)

        # Reconstructing original feature matrix
        reconstruction = self.autoencoder.predict(features)

        # Computing the mean squared error across the feature axis
        return np.mean(np.square(features - reconstruction), axis=1)

### Loading UEBA-Enhanced Feature Matrix:

In [20]:
# Loading the dataset with the first CSV column as the index; forward slashes keep the
# relative path valid on both Windows and POSIX systems
ueba_matrix = pd.read_csv("processed_datasets/ueba_dataset.csv", index_col=0)

In [21]:
# Previewing the first five rows of the loaded dataset
ueba_matrix.head(n=5)

Unnamed: 0,user,pc,day,logon_count,logoff_count,off_hours_logon,file_open_count,file_write_count,file_copy_count,file_delete_count,unique_files_accessed,off_hours_files_accessed,usb_insert_count,usb_remove_count,off_hours_usb_usage,emails_sent,unique_recipients,external_emails,attachements_sent,off_hours_emails,logon_count_zscore,logoff_count_zscore,off_hours_logon_zscore,file_open_count_zscore,file_write_count_zscore,file_copy_count_zscore,file_delete_count_zscore,unique_files_accessed_zscore,off_hours_files_accessed_zscore,usb_insert_count_zscore,usb_remove_count_zscore,off_hours_usb_usage_zscore,emails_sent_zscore,unique_recipients_zscore,external_emails_zscore,attachements_sent_zscore,off_hours_emails_zscore,logon_count_rolling_delta,logoff_count_rolling_delta,off_hours_logon_rolling_delta,file_open_count_rolling_delta,file_write_count_rolling_delta,file_copy_count_rolling_delta,file_delete_count_rolling_delta,unique_files_accessed_rolling_delta,off_hours_files_accessed_rolling_delta,usb_insert_count_rolling_delta,usb_remove_count_rolling_delta,off_hours_usb_usage_rolling_delta,emails_sent_rolling_delta,unique_recipients_rolling_delta,external_emails_rolling_delta,attachements_sent_rolling_delta,off_hours_emails_rolling_delta,usb_file_activity_flag,off_hours_activity_flag,external_comm_activity_flag
0,aab0162,pc-6599,2010-01-04,1,1,2,0,0,0,0,0,0,0,0,0,9,5,1,1,5,0.053,0.0,0.053,-0.259378,0.0,0.0,0.0,-0.260788,-0.151407,0.0,0.0,0.0,0.166342,-1.924866,-0.867676,-0.52419,1.128145,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,1
1,aab0162,pc-6599,2010-01-05,1,1,2,0,0,0,0,0,0,0,0,0,9,8,1,2,0,0.053,0.0,0.053,-0.259378,0.0,0.0,0.0,-0.260788,-0.151407,0.0,0.0,0.0,0.166342,0.22326,-0.867676,0.273296,-0.778881,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.5,0.0,0.5,-2.5,0,0,1
2,aab0162,pc-6599,2010-01-06,1,1,2,0,0,0,0,0,0,0,0,0,9,8,0,1,8,0.053,0.0,0.053,-0.259378,0.0,0.0,0.0,-0.260788,-0.151407,0.0,0.0,0.0,0.166342,0.22326,-1.418286,-0.52419,2.27236,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.666667,-0.333333,3.666667,0,1,0
3,aab0162,pc-6599,2010-01-07,1,1,2,0,0,0,0,0,0,0,0,0,9,7,2,1,5,0.053,0.0,0.053,-0.259378,0.0,0.0,0.0,-0.260788,-0.151407,0.0,0.0,0.0,0.166342,-0.492782,-0.317065,-0.52419,1.128145,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.25,0.5,0,1,1
4,aab0162,pc-6599,2010-01-08,1,1,2,0,0,0,0,0,0,0,0,0,9,7,2,0,0,0.053,0.0,0.053,-0.259378,0.0,0.0,0.0,-0.260788,-0.151407,0.0,0.0,0.0,0.166342,-0.492782,-0.317065,-1.321675,-0.778881,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.8,-1.0,-3.6,0,0,1


In [22]:
# Keeping only the feature columns by removing the identifier columns
x = ueba_matrix.drop(["user", "pc", "day"], axis="columns").values

In [None]:
# Centering each feature and scaling it to unit variance (the scaler's default behavior, spelled out)
scaler = StandardScaler(with_mean=True, with_std=True)
x_scaled = scaler.fit_transform(X=x)

### Initializing and Training Autoencoder:

In [25]:
# Sizing the input layer from the number of feature columns; the learning rate is the class default
ae = Autoencoder(x_scaled.shape[1], latent_dim=16, hidden_dim=64, learning_rate=1e-3)

In [26]:
# Fitting the autoencoder on the standardized feature matrix (validation ratio is the method default)
ae.train(x_scaled, epochs=40, batch_size=128, validation_split=0.1)

Epoch 1/40
[1m11003/11003[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2ms/step - loss: 0.1508 - val_loss: 0.0601
Epoch 2/40
[1m11003/11003[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 1ms/step - loss: 0.0496 - val_loss: 0.0524
Epoch 3/40
[1m11003/11003[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 2ms/step - loss: 0.0434 - val_loss: 0.0455
Epoch 4/40
[1m11003/11003[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 2ms/step - loss: 0.0402 - val_loss: 0.0441
Epoch 5/40
[1m11003/11003[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 2ms/step - loss: 0.0394 - val_loss: 0.0437
Epoch 6/40
[1m11003/11003[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 1ms/step - loss: 0.0380 - val_loss: 0.0431
Epoch 7/40
[1m11003/11003[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 2ms/step - loss: 0.0372 - val_loss: 0.0468
Epoch 8/40
[1m11003/11003[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 2ms/step - loss: 0.0367 - val_loss: 0.0414


In [27]:
# Projecting every standardized sample into the latent space
latent_embeddings = ae.encode(feature_matrix=x_scaled)

[1m48900/48900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 400us/step


In [30]:
# Persisting the encoder, the fitted scaler, and the embeddings to disk
ae.encoder.export("encoder_model")
joblib.dump(value=scaler, filename="feature_scaler.pkl")
np.save(file="latent_embeddings.npy", arr=latent_embeddings)

INFO:tensorflow:Assets written to: encoder_model\assets


INFO:tensorflow:Assets written to: encoder_model\assets


Saved artifact at 'encoder_model'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 54), dtype=tf.float32, name='ueba_input')
Output Type:
  TensorSpec(shape=(None, 16), dtype=tf.float32, name=None)
Captures:
  2874012673104: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2874012673680: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2874012673488: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2874012674640: TensorSpec(shape=(), dtype=tf.resource, name=None)
