# This notebook contains the model training steps and an analysis of the results using the Normalized Mean Absolute Error (NMAE).

In [1]:
import datetime
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import xgboost as xgb
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import Dataset, DataLoader

In [3]:
# Normalized Mean Absolute Error (MAE divided by the mean absolute true value)
def nmae(y_pred, y_test):
    """Return the MAE between predictions and targets, normalized by mean(|y_test|).

    Note the argument order: predictions first, ground truth second.

    Args:
        y_pred (array-like): Estimated target values.
        y_test (array-like): Ground truth (correct) target values.

    Returns:
        float: ``mean(|y_test - y_pred|) / mean(|y_test|)``. When the
        denominator is zero the result is 0.0 if the MAE is zero as well,
        otherwise ``np.inf``.

    Raises:
        ValueError: If the inputs are empty or do not hold the same number
            of values.
    """
    # Flatten both inputs so that a column vector of shape (n, 1) and a flat
    # vector of shape (n,) are compared element by element instead of being
    # broadcast against each other.
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    y_test = np.asarray(y_test, dtype=float).ravel()

    if y_pred.size != y_test.size:
        raise ValueError("y_pred and y_test must have the same number of values.")
    if y_test.size == 0:
        raise ValueError("y_pred and y_test must not be empty.")

    mae = float(np.mean(np.abs(y_test - y_pred)))
    mean_true = float(np.mean(np.abs(y_test)))

    # Avoid dividing by zero when every true value is zero.
    if mean_true == 0:
        return 0.0 if mae == 0 else np.inf
    return mae / mean_true

def normalized_mean_absolute_error(y_true, y_pred):
    """
    Range-normalized Mean Absolute Error.

    The mean absolute difference between ``y_true`` and ``y_pred`` is divided
    by the spread (max - min) of ``y_true``.

    Args:
        y_true (array-like): Ground truth (correct) target values.
        y_pred (array-like): Estimated target values.

    Returns:
        float: MAE divided by the range of ``y_true``. If that range is zero,
        0.0 is returned when the MAE is zero and ``np.inf`` otherwise.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` differ in length.
    """
    actual = np.asarray(y_true)
    estimate = np.asarray(y_pred)

    if len(actual) != len(estimate):
        raise ValueError("y_true and y_pred must have the same length.")

    abs_error = np.abs(actual - estimate).mean()

    # Spread of the observed values, used as the normalization constant.
    spread = actual.max() - actual.min()

    if spread != 0:
        return abs_error / spread

    # Constant ground truth: the ratio is undefined, so report a perfect
    # score only when there is no error at all.
    return np.inf if abs_error != 0 else 0.0

In [4]:
# Load the Prometheus metrics (wide format); this frame is used as the
# feature matrix by the training cells further down.
# Every column is coerced to numeric: values that cannot be parsed become NaN
# and are then replaced with 0.
# NOTE(review): the displayed frame contains an 'Unnamed: 0' column, presumably
# an index saved into the CSV; it is kept as a feature downstream - confirm
# this is intended.
x_ds_c500 = (
    pd.read_csv(
        'datasets/exp60c_2h/t500/prometheus_metrics_wide.csv',
        low_memory=True,
    )
    .apply(pd.to_numeric, errors='coerce')
    .fillna(0)
)
print(x_ds_c500.shape)
x_ds_c500.head(5)

(7803, 1864)


Unnamed: 0,timestamp,container_blkio_device_usage_total_0,container_blkio_device_usage_total_1,container_blkio_device_usage_total_2,container_blkio_device_usage_total_3,container_blkio_device_usage_total_4,container_blkio_device_usage_total_5,container_blkio_device_usage_total_6,container_blkio_device_usage_total_7,container_blkio_device_usage_total_8,...,network_transmit_bytes_per_container_35,network_transmit_bytes_per_container_36,network_transmit_bytes_per_container_37,network_transmit_bytes_per_container_38,network_transmit_bytes_per_container_39,network_transmit_bytes_per_container_40,network_transmit_bytes_per_container_41,network_transmit_bytes_per_container_42,network_transmit_bytes_per_container_43,network_transmit_bytes_per_container_44
0,1762718666,124854272,523153408,124854272,504750080,22294528,18027241472,22294528,17279209472,125186048,...,12989.561587,0,91995.257453,0,13904.017857,0,94496.974755,0,0.0,0.0
1,1762718667,124854272,523153408,124854272,504750080,22294528,18027241472,22294528,17279209472,125186048,...,12841.486068,0,93890.4337,0,14582.299227,0,104701.936976,0,0.0,0.0
2,1762718668,124854272,523153408,124854272,504750080,22294528,18027241472,22294528,17279209472,125186048,...,13765.763274,0,74584.501237,0,12281.804734,0,101769.951293,0,12604.872585,430005.600672
3,1762718669,124854272,523153408,124854272,504750080,22294528,18027241472,22294528,17279209472,125186048,...,11904.655612,0,97924.033523,0,12256.643701,0,94527.552886,0,10402.588398,354876.357754
4,1762718670,124854272,523153408,124854272,504750080,22294528,18027241472,22294528,17279209472,125186048,...,7763.962677,0,97924.033523,0,10511.954993,0,80978.202448,0,0.0,0.0


In [5]:
# Load the latency statistics; the *_99th_percentile columns of this frame
# are used as regression targets by the training cells further down.
# Every column is coerced to numeric: values that cannot be parsed become NaN
# and are then replaced with 0.
y_ds_c500 = (
    pd.read_csv(
        'datasets/exp60c_2h/t500/20251109_200426169_w.csv',
        low_memory=True,
    )
    .apply(pd.to_numeric, errors='coerce')
    .fillna(0)
)
print(y_ds_c500.shape)
y_ds_c500.head(5)

(7803, 45)


Unnamed: 0,timestamp,queries_num,queries_requested,errors_occurred,iter_errors_occurred,average_latency,99_9_latency_percentile,mean_rate,one_minute_rate,five_minute_rate,...,w_min,w_max,w_mean,w_std_dev,w_median,w_75th_percentile,w_95th_percentile,w_98th_percentile,w_99th_percentile,w_99_9th_percentile
0,1762718666,0,0,0,0,0,0,0.0,0.0,0.0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,1762718667,9,0,0,0,4,1,0.0595,0.0,0.0,...,3,7,4.444444,1.236033,4.0,4.666667,7.0,7.0,7.0,7
2,1762718668,16,0,0,0,4,1,0.105083,0.0,0.0,...,3,7,4.25,0.930949,4.0,4.0,6.85,7.0,7.0,7
3,1762718669,24,0,0,0,4,1,0.156597,0.0,0.0,...,3,7,4.208333,0.779028,4.0,4.0,6.45,7.0,7.0,7
4,1762718670,32,0,0,0,4,1,0.207441,0.0,0.0,...,3,7,4.16129,0.687836,4.0,4.0,6.1,7.0,7.0,7


# Random Forest training using all features

## train 70%, test 30%

## T500

In [None]:
# Select the regression target. It is kept as a single-column DataFrame so
# that the scaler receives 2-D input.
y_ds_c500_w_99th_percentile = y_ds_c500[['w_99th_percentile']].copy()

# Split the data into training and testing BEFORE scaling. Fitting the
# scalers on the full dataset would let the min/max of the test rows leak
# into the training data.
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
  x_ds_c500, y_ds_c500_w_99th_percentile, test_size=0.30, random_state=42)

# Normalize the features: the scaler is fitted on the training rows only and
# then applied unchanged to the test rows. Both results are numpy arrays.
X_scaler = MinMaxScaler()
X_train = X_scaler.fit_transform(X_train_raw)
X_test = X_scaler.transform(X_test_raw)

# Normalize the target in the same way.
y_scaler = MinMaxScaler()
y_train = y_scaler.fit_transform(y_train_raw)
y_test = y_scaler.transform(y_test_raw)

# Scaled copies of the complete dataset, kept under their original names for
# any cell that still refers to them. They use the training-fitted scalers,
# so values outside [0, 1] are possible for rows that were not in the
# training split.
x_ds_scaled = X_scaler.transform(x_ds_c500)
y_ds_scaled = y_scaler.transform(y_ds_c500_w_99th_percentile)

class Data(Dataset):
  '''In-memory dataset of (sample, label) pairs held as float32 tensors.

  Indexing returns one sample together with its label, which lets a
  DataLoader wrap an instance and iterate over it in batches.
  '''

  def __init__(self, X: np.ndarray, y: np.ndarray) -> None:
    # Cast to float32 before building the tensors; feeding float64 data
    # to the model triggers
    # "RuntimeError: expected scalar type Double but found Float".
    features = X.astype(np.float32)
    labels = y.astype(np.float32)

    self.X = torch.from_numpy(features)
    self.y = torch.from_numpy(labels)
    # Number of samples = size of the first dimension of X.
    self.len = self.X.size(0)

  def __getitem__(self, index: int) -> tuple:
    sample = self.X[index]
    label = self.y[index]
    return sample, label

  def __len__(self) -> int:
    return self.len
  
# Mini-batch size used by the data loader.
batch_size = 64
# Number of sub-processes the data loader uses to load data.
# With zero (the default) the data is loaded in the main process, so the GPU
# has to wait for the CPU to load data. Theoretically, the greater
# num_workers is, the more efficiently the CPU loads data and the less the
# GPU has to wait.
num_workers = 2

# Wrap the training split in the Dataset class.
traindata = Data(X_train, y_train)

# Shuffling data loader over the training data, configured with the
# batch_size and num_workers chosen above.
trainloader = DataLoader(
    traindata,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
)

class LinearRegression(nn.Module):
  '''Regression model built from four stacked fully connected layers.

  No activation function is applied between the layers, so the network as a
  whole remains a linear (affine) function of its input.
  '''

  def __init__(self, input_dim: int, hidden_dim: int, output_dim: int) -> None:
    '''Create the four linear layers:
         - input_dim  -> hidden_dim
         - hidden_dim -> hidden_dim
         - hidden_dim -> hidden_dim
         - hidden_dim -> output_dim
    '''
    super().__init__()
    self.input_to_hidden = nn.Linear(input_dim, hidden_dim)
    self.hidden_layer_1 = nn.Linear(hidden_dim, hidden_dim)
    self.hidden_layer_2 = nn.Linear(hidden_dim, hidden_dim)
    self.hidden_to_output = nn.Linear(hidden_dim, output_dim)

  def forward(self, x: torch.Tensor) -> torch.Tensor:
    # Feed the input through the layers in order; there is no activation
    # in between and no softmax on the output.
    stack = (
      self.input_to_hidden,
      self.hidden_layer_1,
      self.hidden_layer_2,
      self.hidden_to_output,
    )
    for layer in stack:
      x = layer(x)
    return x

# Input width: one unit per feature column of X_train.
input_dim = X_train.shape[1]
# Width of every hidden layer. Despite its name, this value is passed as
# hidden_dim (units per layer); it is not the number of layers.
hidden_layers = 50
# A single regression output.
output_dim = 1
# Build the model and show its layer structure.
model = LinearRegression(
    input_dim=input_dim,
    hidden_dim=hidden_layers,
    output_dim=output_dim,
)
print(model)


In [46]:
# Full dataset - without normalization: raw features and raw targets.

random_forest_model = RandomForestRegressor(n_estimators=240, random_state=42, n_jobs=-1)

# One fit/evaluate round per latency target. The same estimator object is
# re-fitted each round, and every round uses the same 70/30 split settings.
for target_column in ('99th_percentile', 'd_99th_percentile', 'w_99th_percentile'):
    x_train, x_test, y_train, y_test = train_test_split(
        x_ds_c500, y_ds_c500[target_column].to_numpy(), test_size=0.30, random_state=42)

    # Wall-clock duration of the fit only (prediction is not timed).
    started_at = datetime.datetime.now(tz=datetime.timezone.utc)
    random_forest_model.fit(x_train, y_train)
    print(f'Rand. Forest Training time: {(datetime.datetime.now(tz=datetime.timezone.utc) - started_at).total_seconds()}s')

    predicted = random_forest_model.predict(x_test)
    print(f'Full dataset | {target_column} -> Rand. Forest NMAE: {nmae(predicted, y_test):.2%}')

Rand. Forest Training time: 44.28541s
Full dataset | 99th_percentile -> Rand. Forest NMAE: 0.27%
Rand. Forest Training time: 56.095947s
Full dataset | d_99th_percentile -> Rand. Forest NMAE: 3.56%
Rand. Forest Training time: 93.024478s
Full dataset | w_99th_percentile -> Rand. Forest NMAE: 17.66%


In [47]:
# Full dataset, using a pipeline to min-max normalize features and target.

# Feature preprocessing: normalize the input features.
feature_pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
])

rf_regressor = RandomForestRegressor(n_estimators=240, random_state=42, n_jobs=-1)

# TransformedTargetRegressor adds target scaling around the feature pipeline
# and the forest: the transformer is applied to y for fitting and predictions
# are mapped back, so the NMAE below is computed on the original target scale.
model = TransformedTargetRegressor(
    regressor=Pipeline([
        ('preprocess', feature_pipeline),
        ('model', rf_regressor),
    ]),
    transformer=MinMaxScaler(),  # Normalizes the target y
)

# One fit/evaluate round per latency target. The same model object is
# re-fitted each round, and every round uses the same 70/30 split settings.
for target_column in ('99th_percentile', 'd_99th_percentile', 'w_99th_percentile'):
    x_train, x_test, y_train, y_test = train_test_split(
        x_ds_c500, y_ds_c500[target_column].to_numpy(), test_size=0.30, random_state=42)

    # Wall-clock duration of the fit only (prediction is not timed).
    started_at = datetime.datetime.now(tz=datetime.timezone.utc)
    model.fit(x_train, y_train)
    print(f'Rand. Forest Training time: {(datetime.datetime.now(tz=datetime.timezone.utc) - started_at).total_seconds()}s')

    predicted = model.predict(x_test)
    print(f'Full dataset | {target_column} -> Rand. Forest NMAE: {nmae(predicted, y_test):.2%}')

Rand. Forest Training time: 48.580744s
Full dataset | 99th_percentile -> Rand. Forest NMAE: 0.27%
Rand. Forest Training time: 59.981796s
Full dataset | d_99th_percentile -> Rand. Forest NMAE: 3.61%
Rand. Forest Training time: 97.792044s
Full dataset | w_99th_percentile -> Rand. Forest NMAE: 17.69%


In [48]:
# Full dataset, using XGBoost

# Initialize the XGBoost regressor.
# objective='reg:squarederror' is the default for regression, but explicitly setting it is good practice.
# n_estimators controls the number of boosting rounds (trees).
# random_state for reproducibility.
model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=1000, max_depth=7, eta=0.1, subsample=0.7, colsample_bytree=0.8, random_state=42)

# One fit/evaluate round per latency target. The same model object is
# re-fitted each round, and every round uses the same 70/30 split settings.
for target_column in ('99th_percentile', 'd_99th_percentile', 'w_99th_percentile'):
    x_train, x_test, y_train, y_test = train_test_split(
        x_ds_c500, y_ds_c500[target_column].to_numpy(), test_size=0.30, random_state=42)

    # Wall-clock duration of the fit only (prediction is not timed).
    started_at = datetime.datetime.now(tz=datetime.timezone.utc)
    model.fit(x_train, y_train)
    # The labels name the model that is actually trained in this cell; they
    # previously read "Rand. Forest", copied over from the Random Forest cells.
    print(f'XGBoost Training time: {(datetime.datetime.now(tz=datetime.timezone.utc) - started_at).total_seconds()}s')

    predicted = model.predict(x_test)
    print(f'Full dataset | {target_column} -> XGBoost NMAE: {nmae(predicted, y_test):.2%}')

Rand. Forest Training time: 87.175224s
Full dataset | 99th_percentile -> Rand. Forest NMAE: 0.44%
Rand. Forest Training time: 117.375869s
Full dataset | d_99th_percentile -> Rand. Forest NMAE: 3.74%
Rand. Forest Training time: 118.866501s
Full dataset | w_99th_percentile -> Rand. Forest NMAE: 16.17%


In [33]:
# Dataset restricted to rows whose three latency targets are all below
# LATENCY_THRESHOLD - without normalization.
# NOTE(review): the earlier comments in this cell spoke of 800 ms, but the
# filter has always compared against 100.00; the code's value is kept here.
# Confirm which threshold (and unit) is intended.
LATENCY_THRESHOLD = 100.00

# The three regression targets.
columns_to_drop = ['99th_percentile', 'd_99th_percentile', 'w_99th_percentile']

# Keep only the join key and the targets from the latency frame.
selected_columns = ['timestamp'] + columns_to_drop
y_ds_c500_filtered = y_ds_c500[selected_columns].copy()

print(x_ds_c500.shape)
# Attach the target columns to the feature rows via their timestamp.
x_ds_c500_merged = pd.merge(x_ds_c500, y_ds_c500_filtered, on='timestamp', how='left')
print(x_ds_c500_merged.shape)

# Keep a row only if every target is below the threshold.
below_threshold = (x_ds_c500_merged[columns_to_drop] < LATENCY_THRESHOLD).all(axis=1)
x_ds_c500_filtered = x_ds_c500_merged[below_threshold]
print(x_ds_c500_filtered.shape)

# Split the filtered frame back into targets ...
y_ds_c500_filtered = x_ds_c500_filtered[columns_to_drop].copy()
print(y_ds_c500_filtered.shape)

# ... and features (everything except the target columns).
x_ds_c500_filtered = x_ds_c500_filtered.drop(columns=columns_to_drop)
print(x_ds_c500_filtered.shape)

# NOTE(review): 120 trees here versus 240 in the full-dataset cells, so the
# scores differ in more than the filtering - confirm this is intended.
random_forest_model = RandomForestRegressor(n_estimators=120, random_state=42, n_jobs=-1)

# One fit/evaluate round per latency target. The same estimator object is
# re-fitted each round, and every round uses the same 70/30 split settings.
for target_column in columns_to_drop:
    x_train, x_test, y_train, y_test = train_test_split(
        x_ds_c500_filtered, y_ds_c500_filtered[target_column].to_numpy(), test_size=0.30, random_state=42)

    # Wall-clock duration of the fit only (prediction is not timed).
    started_at = datetime.datetime.now(tz=datetime.timezone.utc)
    random_forest_model.fit(x_train, y_train)
    print(f'Rand. Forest Training time: {(datetime.datetime.now(tz=datetime.timezone.utc) - started_at).total_seconds()}s')

    predicted = random_forest_model.predict(x_test)
    # Labelled "Filtered dataset": these scores come from the filtered rows,
    # not from the full dataset as the label previously claimed.
    print(f'Filtered dataset | {target_column} -> Rand. Forest NMAE: {nmae(predicted, y_test):.2%}')

(7803, 1864)
(7803, 1867)
(6981, 1867)
(6981, 3)
(6981, 1864)
Rand. Forest Training time: 14.45166s
Full dataset | 99th_percentile -> Rand. Forest NMAE: 0.21%
Rand. Forest Training time: 18.403955s
Full dataset | d_99th_percentile -> Rand. Forest NMAE: 2.58%
Rand. Forest Training time: 27.313033s
Full dataset | w_99th_percentile -> Rand. Forest NMAE: 8.80%
