# This notebook contains training step and analysis using nmae result...

In [1]:
import datetime
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression, GenericUnivariateSelect
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.compose import TransformedTargetRegressor

import xgboost as xgb

In [2]:
# Normalized Mean Absolute Error
def nmae(y_pred, y_test):
    mae = mean_absolute_error(y_test, y_pred)
    mean_true = np.mean(np.abs(y_test))
    return (mae / mean_true)

def normalized_mean_absolute_error(y_true, y_pred):
    """
    Calculates the Normalized Mean Absolute Error (NMAE).

    Args:
        y_true (array-like): Ground truth (correct) target values.
        y_pred (array-like): Estimated target values.

    Returns:
        float: The Normalized Mean Absolute Error.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    if len(y_true) != len(y_pred):
        raise ValueError("y_true and y_pred must have the same length.")

    mae = np.mean(np.abs(y_true - y_pred))

    # Calculate the range of actual values
    y_range = np.max(y_true) - np.min(y_true)

    # Avoid division by zero if the range is zero
    if y_range == 0:
        return 0.0 if mae == 0 else np.inf
    else:
        nmae = mae / y_range
        return nmae

In [3]:
# Load write dataset
x_ds_c100 = pd.read_csv('datasets/exp60c_2h/t100/prometheus_metrics_wide.csv', low_memory=True).apply(pd.to_numeric, errors='coerce').fillna(0)
print(x_ds_c100.shape)
x_ds_c100.head(5)

(7803, 1864)


Unnamed: 0,timestamp,container_blkio_device_usage_total_0,container_blkio_device_usage_total_1,container_blkio_device_usage_total_2,container_blkio_device_usage_total_3,container_blkio_device_usage_total_4,container_blkio_device_usage_total_5,container_blkio_device_usage_total_6,container_blkio_device_usage_total_7,container_blkio_device_usage_total_8,...,network_transmit_bytes_per_container_35,network_transmit_bytes_per_container_36,network_transmit_bytes_per_container_37,network_transmit_bytes_per_container_38,network_transmit_bytes_per_container_39,network_transmit_bytes_per_container_40,network_transmit_bytes_per_container_41,network_transmit_bytes_per_container_42,network_transmit_bytes_per_container_43,network_transmit_bytes_per_container_44
0,1762727858,124854272,543268864,124854272,524222464,23887872,28512849920,23887872,27526397952,126197760,...,12189.93135,0,71774.335259,0,11775.149795,0,95732.086407,0,0.0,0.0
1,1762727859,124854272,543268864,124854272,524222464,23887872,28512858112,23887872,27526397952,126197760,...,12233.595801,0,89717.154189,0,15092.75538,0,95143.60587,0,0.0,0.0
2,1762727860,124854272,543268864,124854272,524222464,23887872,28512858112,23887872,27526397952,126197760,...,12233.595801,0,69570.165321,0,9959.6,0,98389.159892,0,0.0,0.0
3,1762727861,124854272,543268864,124854272,524222464,23887872,28512858112,23887872,27526397952,126197760,...,13788.9043,0,78847.409665,0,14200.45623,0,81172.475424,0,0.0,0.0
4,1762727862,124854272,543268864,124854272,524222464,23887872,28512858112,23887872,27526397952,126197760,...,12768.875193,0,89157.100689,0,10807.99305,0,81172.475424,0,0.0,0.0


In [4]:
# Load write dataset
x_ds_c500 = pd.read_csv('datasets/exp60c_2h/t500/prometheus_metrics_wide.csv', low_memory=True).apply(pd.to_numeric, errors='coerce').fillna(0)
print(x_ds_c500.shape)
x_ds_c500.head(5)

(7803, 1864)


Unnamed: 0,timestamp,container_blkio_device_usage_total_0,container_blkio_device_usage_total_1,container_blkio_device_usage_total_2,container_blkio_device_usage_total_3,container_blkio_device_usage_total_4,container_blkio_device_usage_total_5,container_blkio_device_usage_total_6,container_blkio_device_usage_total_7,container_blkio_device_usage_total_8,...,network_transmit_bytes_per_container_35,network_transmit_bytes_per_container_36,network_transmit_bytes_per_container_37,network_transmit_bytes_per_container_38,network_transmit_bytes_per_container_39,network_transmit_bytes_per_container_40,network_transmit_bytes_per_container_41,network_transmit_bytes_per_container_42,network_transmit_bytes_per_container_43,network_transmit_bytes_per_container_44
0,1762718666,124854272,523153408,124854272,504750080,22294528,18027241472,22294528,17279209472,125186048,...,12989.561587,0,91995.257453,0,13904.017857,0,94496.974755,0,0.0,0.0
1,1762718667,124854272,523153408,124854272,504750080,22294528,18027241472,22294528,17279209472,125186048,...,12841.486068,0,93890.4337,0,14582.299227,0,104701.936976,0,0.0,0.0
2,1762718668,124854272,523153408,124854272,504750080,22294528,18027241472,22294528,17279209472,125186048,...,13765.763274,0,74584.501237,0,12281.804734,0,101769.951293,0,12604.872585,430005.600672
3,1762718669,124854272,523153408,124854272,504750080,22294528,18027241472,22294528,17279209472,125186048,...,11904.655612,0,97924.033523,0,12256.643701,0,94527.552886,0,10402.588398,354876.357754
4,1762718670,124854272,523153408,124854272,504750080,22294528,18027241472,22294528,17279209472,125186048,...,7763.962677,0,97924.033523,0,10511.954993,0,80978.202448,0,0.0,0.0


In [5]:
y_ds_c100 = pd.read_csv('datasets/exp60c_2h/t100/20251109_223738084_w.csv', low_memory=True).apply(pd.to_numeric, errors='coerce').fillna(0)
print(y_ds_c100.shape)
y_ds_c100.head(5)

y_ds_c100_w_99th_percentile = y_ds_c100[['w_99th_percentile']].copy()
print(y_ds_c100_w_99th_percentile.shape)
y_ds_c100_w_99th_percentile.head(5)

(7803, 45)
(7803, 1)


Unnamed: 0,w_99th_percentile
0,0.0
1,5.0
2,5.0
3,6.0
4,6.0


In [6]:
y_ds_c500 = pd.read_csv('datasets/exp60c_2h/t500/20251109_200426169_w.csv', low_memory=True).apply(pd.to_numeric, errors='coerce').fillna(0)
print(y_ds_c500.shape)
y_ds_c500.head(5)

# y_ds_c500_w_99th_percentile = y_ds_c500[['w_99th_percentile']].copy()
# print(y_ds_c500_w_99th_percentile.shape)
# y_ds_c500_w_99th_percentile.head(5)

(7803, 45)


Unnamed: 0,timestamp,queries_num,queries_requested,errors_occurred,iter_errors_occurred,average_latency,99_9_latency_percentile,mean_rate,one_minute_rate,five_minute_rate,...,w_min,w_max,w_mean,w_std_dev,w_median,w_75th_percentile,w_95th_percentile,w_98th_percentile,w_99th_percentile,w_99_9th_percentile
0,1762718666,0,0,0,0,0,0,0.0,0.0,0.0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,1762718667,9,0,0,0,4,1,0.0595,0.0,0.0,...,3,7,4.444444,1.236033,4.0,4.666667,7.0,7.0,7.0,7
2,1762718668,16,0,0,0,4,1,0.105083,0.0,0.0,...,3,7,4.25,0.930949,4.0,4.0,6.85,7.0,7.0,7
3,1762718669,24,0,0,0,4,1,0.156597,0.0,0.0,...,3,7,4.208333,0.779028,4.0,4.0,6.45,7.0,7.0,7
4,1762718670,32,0,0,0,4,1,0.207441,0.0,0.0,...,3,7,4.16129,0.687836,4.0,4.0,6.1,7.0,7.0,7


# SELECTKBEST FEATURES - 100COLUNAS

In [10]:
# # Identify columns to normalize (all except the first)
# cols_to_normalize = x_ds_c100.columns[1:]

# # Extract the columns to be normalized
# df_to_normalize = x_ds_c100[cols_to_normalize]

# # Initialize the MinMaxScaler
# scaler = MinMaxScaler()

# # Fit and transform the selected columns
# normalized_data = scaler.fit_transform(df_to_normalize)

# # Create a new DataFrame with the normalized values
# x_ds_c100_norm = pd.DataFrame(normalized_data, columns=cols_to_normalize, index=x_ds_c100.index)

# # Concatenate the first column with the normalized DataFrame
# x_ds_c100_norm = pd.concat([x_ds_c100.iloc[:, :1], x_ds_c100_norm], axis=1)

# # Print the final DataFrame
# print(x_ds_c100_norm.shape)

selector = SelectKBest(f_regression, k=10)
x_ds_c100_10f = selector.fit_transform(x_ds_c100, y_ds_c100_w_99th_percentile)

# Get a boolean mask of selected features
selected_features_mask = selector.get_support()

# Get the names of the selected features
selected_feature_names = x_ds_c100.columns[selected_features_mask]
print(selected_feature_names)

x_ds_c100_10f = pd.DataFrame(x_ds_c100_10f, columns=selected_feature_names, index=x_ds_c100.index)
print(x_ds_c100_10f.shape)
x_ds_c100_10f.head(5)

Index(['network_receive_bytes_per_container_3',
       'network_receive_bytes_per_container_5',
       'network_receive_bytes_per_container_8',
       'network_receive_bytes_per_container_11',
       'network_receive_bytes_per_container_12',
       'network_transmit_bytes_per_container_3',
       'network_transmit_bytes_per_container_5',
       'network_transmit_bytes_per_container_8',
       'network_transmit_bytes_per_container_11',
       'network_transmit_bytes_per_container_12'],
      dtype='object')
(7803, 10)


  y = column_or_1d(y, warn=True)
  X_norms = np.sqrt(row_norms(X.T, squared=True) - n_samples * X_means**2)


Unnamed: 0,network_receive_bytes_per_container_3,network_receive_bytes_per_container_5,network_receive_bytes_per_container_8,network_receive_bytes_per_container_11,network_receive_bytes_per_container_12,network_transmit_bytes_per_container_3,network_transmit_bytes_per_container_5,network_transmit_bytes_per_container_8,network_transmit_bytes_per_container_11,network_transmit_bytes_per_container_12
0,0.0,7721.381673,6353.459119,4263.591936,4293.746052,0.0,8830.145589,6316.981132,4104.764814,4428.932407
1,0.0,7721.381673,8298.475413,7187.890754,4855.717137,0.0,8830.145589,7587.932145,4811.780191,4568.577131
2,8274.781561,28078.035553,9507.306226,27163.73728,28649.805447,10219.343176,97589.936728,7690.279543,9837.650324,11675.745785
3,12356.867196,33536.435868,43527.118147,38872.992701,28649.805447,17018.16118,112449.262202,17233.948989,12519.343066,11675.745785
4,19343.283582,48180.170576,44647.224631,43682.130584,49720.636068,26640.014215,170982.942431,16565.047702,12706.921944,17022.045537


In [23]:
def get_select_k_best_ds(x, y, k):
    selector = SelectKBest(f_regression, k=k)
    k_best_ds = selector.fit_transform(x, y)

    # Get a boolean mask of selected features
    selected_features_mask = selector.get_support()

    # Get the names of the selected features
    selected_feature_names = x.columns[selected_features_mask]
    print(selected_feature_names)

    k_best_ds = pd.DataFrame(k_best_ds, columns=selected_feature_names, index=x.index)
    print(k_best_ds.shape)
    k_best_ds.head(5)
    return k_best_ds

# Random Forest training using all features and k best features...

## train 70%, test 30%

## T100

In [53]:
# Full dataset - wihout normalization...

random_forest_model = RandomForestRegressor(n_estimators=240, random_state=42, n_jobs=-1)

# using 99th_percentile
x_train, x_test, y_train, y_test = train_test_split(x_ds_c100, y_ds_c100['99th_percentile'].to_numpy(), test_size=0.30, random_state=42)

time = datetime.datetime.now(tz=datetime.timezone.utc)
random_forest_model.fit(x_train, y_train)
print(f'Rand. Forest Training time: {(datetime.datetime.now(tz=datetime.timezone.utc) - time).total_seconds()}s')

predicted = random_forest_model.predict(x_test)
print(f'Full dataset | 99th_percentile -> Rand. Forest NMAE: {nmae(predicted, y_test):.2%}')

# using d_99th_percentile
x_train, x_test, y_train, y_test = train_test_split(x_ds_c100, y_ds_c100['d_99th_percentile'].to_numpy(), test_size=0.30, random_state=42)

time = datetime.datetime.now(tz=datetime.timezone.utc)
random_forest_model.fit(x_train, y_train)
print(f'Rand. Forest Training time: {(datetime.datetime.now(tz=datetime.timezone.utc) - time).total_seconds()}s')

predicted = random_forest_model.predict(x_test)
print(f'Full dataset | d_99th_percentile -> Rand. Forest NMAE: {nmae(predicted, y_test):.2%}')

# using w_99th_percentile
x_train, x_test, y_train, y_test = train_test_split(x_ds_c100, y_ds_c100['w_99th_percentile'].to_numpy(), test_size=0.30, random_state=42)

time = datetime.datetime.now(tz=datetime.timezone.utc)
random_forest_model.fit(x_train, y_train)
print(f'Rand. Forest Training time: {(datetime.datetime.now(tz=datetime.timezone.utc) - time).total_seconds()}s')

predicted = random_forest_model.predict(x_test)
print(f'Full dataset | w_99th_percentile -> Rand. Forest NMAE: {nmae(predicted, y_test):.2%}')

Rand. Forest Training time: 24.167081s
Full dataset | 99th_percentile -> Rand. Forest NMAE: 0.09%
Rand. Forest Training time: 48.347937s
Full dataset | d_99th_percentile -> Rand. Forest NMAE: 2.33%
Rand. Forest Training time: 72.590769s
Full dataset | w_99th_percentile -> Rand. Forest NMAE: 7.91%


In [54]:
# Full dataset, using pipeline to normalize features and target.

# Define the feature pipeline
feature_pipeline = Pipeline([
    ('scaler', MinMaxScaler()), # Normalize input features
])

rf_regressor = RandomForestRegressor(n_estimators=240, random_state=42, n_jobs=-1)

# Combine with target scaling using TransformedTargetRegressor
model = TransformedTargetRegressor(
    regressor=Pipeline([
        ('preprocess', feature_pipeline),
        ('model', rf_regressor)
    ]),
    transformer=MinMaxScaler()  # Normalizes the target y
)

# using 99th_percentile
x_train, x_test, y_train, y_test = train_test_split(x_ds_c100, y_ds_c100['99th_percentile'].to_numpy(), test_size=0.30, random_state=42)

time = datetime.datetime.now(tz=datetime.timezone.utc)
model.fit(x_train, y_train)
print(f'Rand. Forest Training time: {(datetime.datetime.now(tz=datetime.timezone.utc) - time).total_seconds()}s')

predicted = model.predict(x_test)
print(f'Full dataset | 99th_percentile -> Rand. Forest NMAE: {nmae(predicted, y_test):.2%}')

# using d_99th_percentile
x_train, x_test, y_train, y_test = train_test_split(x_ds_c100, y_ds_c100['d_99th_percentile'].to_numpy(), test_size=0.30, random_state=42)

time = datetime.datetime.now(tz=datetime.timezone.utc)
model.fit(x_train, y_train)
print(f'Rand. Forest Training time: {(datetime.datetime.now(tz=datetime.timezone.utc) - time).total_seconds()}s')

predicted = model.predict(x_test)
print(f'Full dataset | d_99th_percentile -> Rand. Forest NMAE: {nmae(predicted, y_test):.2%}')

# using w_99th_percentile
x_train, x_test, y_train, y_test = train_test_split(x_ds_c100, y_ds_c100['w_99th_percentile'].to_numpy(), test_size=0.30, random_state=42)

time = datetime.datetime.now(tz=datetime.timezone.utc)
model.fit(x_train, y_train)
print(f'Rand. Forest Training time: {(datetime.datetime.now(tz=datetime.timezone.utc) - time).total_seconds()}s')

predicted = model.predict(x_test)
print(f'Full dataset | w_99th_percentile -> Rand. Forest NMAE: {nmae(predicted, y_test):.2%}')

Rand. Forest Training time: 30.79498s
Full dataset | 99th_percentile -> Rand. Forest NMAE: 0.10%
Rand. Forest Training time: 52.514156s
Full dataset | d_99th_percentile -> Rand. Forest NMAE: 2.43%
Rand. Forest Training time: 73.947967s
Full dataset | w_99th_percentile -> Rand. Forest NMAE: 6.83%


In [55]:
# Full dataset, using xgboost

# 3. Initialize the XGBoost Regressor model
# objective='reg:squarederror' is the default for regression, but explicitly setting it is good practice.
# n_estimators controls the number of boosting rounds (trees).
# random_state for reproducibility.
model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=1000, random_state=42)

# using 99th_percentile
x_train, x_test, y_train, y_test = train_test_split(x_ds_c100, y_ds_c100['99th_percentile'].to_numpy(), test_size=0.30, random_state=42)

time = datetime.datetime.now(tz=datetime.timezone.utc)
model.fit(x_train, y_train)
print(f'Rand. Forest Training time: {(datetime.datetime.now(tz=datetime.timezone.utc) - time).total_seconds()}s')

predicted = model.predict(x_test)
print(f'Full dataset | 99th_percentile -> Rand. Forest NMAE: {nmae(predicted, y_test):.2%}')

# using d_99th_percentile
x_train, x_test, y_train, y_test = train_test_split(x_ds_c100, y_ds_c100['d_99th_percentile'].to_numpy(), test_size=0.30, random_state=42)

time = datetime.datetime.now(tz=datetime.timezone.utc)
model.fit(x_train, y_train)
print(f'Rand. Forest Training time: {(datetime.datetime.now(tz=datetime.timezone.utc) - time).total_seconds()}s')

predicted = model.predict(x_test)
print(f'Full dataset | d_99th_percentile -> Rand. Forest NMAE: {nmae(predicted, y_test):.2%}')

# using w_99th_percentile
x_train, x_test, y_train, y_test = train_test_split(x_ds_c100, y_ds_c100['w_99th_percentile'].to_numpy(), test_size=0.30, random_state=42)

time = datetime.datetime.now(tz=datetime.timezone.utc)
model.fit(x_train, y_train)
print(f'Rand. Forest Training time: {(datetime.datetime.now(tz=datetime.timezone.utc) - time).total_seconds()}s')

predicted = model.predict(x_test)
print(f'Full dataset | w_99th_percentile -> Rand. Forest NMAE: {nmae(predicted, y_test):.2%}')

Rand. Forest Training time: 25.202146s
Full dataset | 99th_percentile -> Rand. Forest NMAE: 0.09%
Rand. Forest Training time: 59.407267s
Full dataset | d_99th_percentile -> Rand. Forest NMAE: 3.09%
Rand. Forest Training time: 69.290943s
Full dataset | w_99th_percentile -> Rand. Forest NMAE: 5.87%


## T500

In [46]:
# Full dataset - wihout normalization...

random_forest_model = RandomForestRegressor(n_estimators=240, random_state=42, n_jobs=-1)

# using 99th_percentile
x_train, x_test, y_train, y_test = train_test_split(x_ds_c500, y_ds_c500['99th_percentile'].to_numpy(), test_size=0.30, random_state=42)

time = datetime.datetime.now(tz=datetime.timezone.utc)
random_forest_model.fit(x_train, y_train)
print(f'Rand. Forest Training time: {(datetime.datetime.now(tz=datetime.timezone.utc) - time).total_seconds()}s')

predicted = random_forest_model.predict(x_test)
print(f'Full dataset | 99th_percentile -> Rand. Forest NMAE: {nmae(predicted, y_test):.2%}')

# using d_99th_percentile
x_train, x_test, y_train, y_test = train_test_split(x_ds_c500, y_ds_c500['d_99th_percentile'].to_numpy(), test_size=0.30, random_state=42)

time = datetime.datetime.now(tz=datetime.timezone.utc)
random_forest_model.fit(x_train, y_train)
print(f'Rand. Forest Training time: {(datetime.datetime.now(tz=datetime.timezone.utc) - time).total_seconds()}s')

predicted = random_forest_model.predict(x_test)
print(f'Full dataset | d_99th_percentile -> Rand. Forest NMAE: {nmae(predicted, y_test):.2%}')

# using w_99th_percentile
x_train, x_test, y_train, y_test = train_test_split(x_ds_c500, y_ds_c500['w_99th_percentile'].to_numpy(), test_size=0.30, random_state=42)

time = datetime.datetime.now(tz=datetime.timezone.utc)
random_forest_model.fit(x_train, y_train)
print(f'Rand. Forest Training time: {(datetime.datetime.now(tz=datetime.timezone.utc) - time).total_seconds()}s')

predicted = random_forest_model.predict(x_test)
print(f'Full dataset | w_99th_percentile -> Rand. Forest NMAE: {nmae(predicted, y_test):.2%}')

Rand. Forest Training time: 44.28541s
Full dataset | 99th_percentile -> Rand. Forest NMAE: 0.27%
Rand. Forest Training time: 56.095947s
Full dataset | d_99th_percentile -> Rand. Forest NMAE: 3.56%
Rand. Forest Training time: 93.024478s
Full dataset | w_99th_percentile -> Rand. Forest NMAE: 17.66%


In [47]:
# Full dataset, using pipeline to normalize features and target.

# Define the feature pipeline
feature_pipeline = Pipeline([
    ('scaler', MinMaxScaler()), # Normalize input features
])

rf_regressor = RandomForestRegressor(n_estimators=240, random_state=42, n_jobs=-1)

# Combine with target scaling using TransformedTargetRegressor
model = TransformedTargetRegressor(
    regressor=Pipeline([
        ('preprocess', feature_pipeline),
        ('model', rf_regressor)
    ]),
    transformer=MinMaxScaler()  # Normalizes the target y
)

# using 99th_percentile
x_train, x_test, y_train, y_test = train_test_split(x_ds_c500, y_ds_c500['99th_percentile'].to_numpy(), test_size=0.30, random_state=42)

time = datetime.datetime.now(tz=datetime.timezone.utc)
model.fit(x_train, y_train)
print(f'Rand. Forest Training time: {(datetime.datetime.now(tz=datetime.timezone.utc) - time).total_seconds()}s')

predicted = model.predict(x_test)
print(f'Full dataset | 99th_percentile -> Rand. Forest NMAE: {nmae(predicted, y_test):.2%}')

# using d_99th_percentile
x_train, x_test, y_train, y_test = train_test_split(x_ds_c500, y_ds_c500['d_99th_percentile'].to_numpy(), test_size=0.30, random_state=42)

time = datetime.datetime.now(tz=datetime.timezone.utc)
model.fit(x_train, y_train)
print(f'Rand. Forest Training time: {(datetime.datetime.now(tz=datetime.timezone.utc) - time).total_seconds()}s')

predicted = model.predict(x_test)
print(f'Full dataset | d_99th_percentile -> Rand. Forest NMAE: {nmae(predicted, y_test):.2%}')

# using w_99th_percentile
x_train, x_test, y_train, y_test = train_test_split(x_ds_c500, y_ds_c500['w_99th_percentile'].to_numpy(), test_size=0.30, random_state=42)

time = datetime.datetime.now(tz=datetime.timezone.utc)
model.fit(x_train, y_train)
print(f'Rand. Forest Training time: {(datetime.datetime.now(tz=datetime.timezone.utc) - time).total_seconds()}s')

predicted = model.predict(x_test)
print(f'Full dataset | w_99th_percentile -> Rand. Forest NMAE: {nmae(predicted, y_test):.2%}')

Rand. Forest Training time: 48.580744s
Full dataset | 99th_percentile -> Rand. Forest NMAE: 0.27%
Rand. Forest Training time: 59.981796s
Full dataset | d_99th_percentile -> Rand. Forest NMAE: 3.61%
Rand. Forest Training time: 97.792044s
Full dataset | w_99th_percentile -> Rand. Forest NMAE: 17.69%


In [48]:
# Full dataset, using xgboost

# 3. Initialize the XGBoost Regressor model
# objective='reg:squarederror' is the default for regression, but explicitly setting it is good practice.
# n_estimators controls the number of boosting rounds (trees).
# random_state for reproducibility.
model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=1000, max_depth=7, eta=0.1, subsample=0.7, colsample_bytree=0.8, random_state=42)

# using 99th_percentile
x_train, x_test, y_train, y_test = train_test_split(x_ds_c500, y_ds_c500['99th_percentile'].to_numpy(), test_size=0.30, random_state=42)

time = datetime.datetime.now(tz=datetime.timezone.utc)
model.fit(x_train, y_train)
print(f'Rand. Forest Training time: {(datetime.datetime.now(tz=datetime.timezone.utc) - time).total_seconds()}s')

predicted = model.predict(x_test)
print(f'Full dataset | 99th_percentile -> Rand. Forest NMAE: {nmae(predicted, y_test):.2%}')

# using d_99th_percentile
x_train, x_test, y_train, y_test = train_test_split(x_ds_c500, y_ds_c500['d_99th_percentile'].to_numpy(), test_size=0.30, random_state=42)

time = datetime.datetime.now(tz=datetime.timezone.utc)
model.fit(x_train, y_train)
print(f'Rand. Forest Training time: {(datetime.datetime.now(tz=datetime.timezone.utc) - time).total_seconds()}s')

predicted = model.predict(x_test)
print(f'Full dataset | d_99th_percentile -> Rand. Forest NMAE: {nmae(predicted, y_test):.2%}')

# using w_99th_percentile
x_train, x_test, y_train, y_test = train_test_split(x_ds_c500, y_ds_c500['w_99th_percentile'].to_numpy(), test_size=0.30, random_state=42)

time = datetime.datetime.now(tz=datetime.timezone.utc)
model.fit(x_train, y_train)
print(f'Rand. Forest Training time: {(datetime.datetime.now(tz=datetime.timezone.utc) - time).total_seconds()}s')

predicted = model.predict(x_test)
print(f'Full dataset | w_99th_percentile -> Rand. Forest NMAE: {nmae(predicted, y_test):.2%}')

Rand. Forest Training time: 87.175224s
Full dataset | 99th_percentile -> Rand. Forest NMAE: 0.44%
Rand. Forest Training time: 117.375869s
Full dataset | d_99th_percentile -> Rand. Forest NMAE: 3.74%
Rand. Forest Training time: 118.866501s
Full dataset | w_99th_percentile -> Rand. Forest NMAE: 16.17%


In [33]:
# Dataset removing latencies > 800ms - wihout normalization...
# Define the list of columns to be included in the new DataFrame
selected_columns = ['timestamp', '99th_percentile', 'd_99th_percentile', 'w_99th_percentile']

# Create the new DataFrame by selecting the specified columns
y_ds_c500_filtered = y_ds_c500[selected_columns].copy()
# print(y_ds_c500_filtered.shape)
# y_ds_c500_filtered = y_ds_c500_filtered[(y_ds_c500_filtered['w_99th_percentile'] < 800.00)]
# print(y_ds_c500_filtered.shape)

# print("Min:", y_ds_c500_filtered['w_99th_percentile'].min())
# print("Max:", y_ds_c500_filtered['w_99th_percentile'].max())

print(x_ds_c500.shape)
# Merge x_ds_c500 with y_ds_c500_filtered to add the 'Target' column
x_ds_c500_merged = pd.merge(x_ds_c500, y_ds_c500_filtered, on='timestamp', how='left')
print(x_ds_c500_merged.shape)

# Filter target columns to less than 800ms
x_ds_c500_filtered = x_ds_c500_merged[
    (x_ds_c500_merged['99th_percentile'] < 100.00) & 
    (x_ds_c500_merged['d_99th_percentile'] < 100.00) & 
    (x_ds_c500_merged['w_99th_percentile'] < 100.00)
]
print(x_ds_c500_filtered.shape)

# Create a new DataFrame with the columns to be dropped
columns_to_drop = ['99th_percentile', 'd_99th_percentile', 'w_99th_percentile']
y_ds_c500_filtered = x_ds_c500_filtered[columns_to_drop].copy()
print(y_ds_c500_filtered.shape)

# Drop the columns from the original DataFrame to create a new DataFrame without them
x_ds_c500_filtered = x_ds_c500_filtered.drop(columns=columns_to_drop, axis=1)
print(x_ds_c500_filtered.shape)

random_forest_model = RandomForestRegressor(n_estimators=120, random_state=42, n_jobs=-1)

# using 99th_percentile
x_train, x_test, y_train, y_test = train_test_split(x_ds_c500_filtered, y_ds_c500_filtered['99th_percentile'].to_numpy(), test_size=0.30, random_state=42)

time = datetime.datetime.now(tz=datetime.timezone.utc)
random_forest_model.fit(x_train, y_train)
print(f'Rand. Forest Training time: {(datetime.datetime.now(tz=datetime.timezone.utc) - time).total_seconds()}s')

predicted = random_forest_model.predict(x_test)
print(f'Full dataset | 99th_percentile -> Rand. Forest NMAE: {nmae(predicted, y_test):.2%}')

# using d_99th_percentile
x_train, x_test, y_train, y_test = train_test_split(x_ds_c500_filtered, y_ds_c500_filtered['d_99th_percentile'].to_numpy(), test_size=0.30, random_state=42)

time = datetime.datetime.now(tz=datetime.timezone.utc)
random_forest_model.fit(x_train, y_train)
print(f'Rand. Forest Training time: {(datetime.datetime.now(tz=datetime.timezone.utc) - time).total_seconds()}s')

predicted = random_forest_model.predict(x_test)
print(f'Full dataset | d_99th_percentile -> Rand. Forest NMAE: {nmae(predicted, y_test):.2%}')

# using w_99th_percentile
x_train, x_test, y_train, y_test = train_test_split(x_ds_c500_filtered, y_ds_c500_filtered['w_99th_percentile'].to_numpy(), test_size=0.30, random_state=42)

time = datetime.datetime.now(tz=datetime.timezone.utc)
random_forest_model.fit(x_train, y_train)
print(f'Rand. Forest Training time: {(datetime.datetime.now(tz=datetime.timezone.utc) - time).total_seconds()}s')

predicted = random_forest_model.predict(x_test)
print(f'Full dataset | w_99th_percentile -> Rand. Forest NMAE: {nmae(predicted, y_test):.2%}')

(7803, 1864)
(7803, 1867)
(6981, 1867)
(6981, 3)
(6981, 1864)
Rand. Forest Training time: 14.45166s
Full dataset | 99th_percentile -> Rand. Forest NMAE: 0.21%
Rand. Forest Training time: 18.403955s
Full dataset | d_99th_percentile -> Rand. Forest NMAE: 2.58%
Rand. Forest Training time: 27.313033s
Full dataset | w_99th_percentile -> Rand. Forest NMAE: 8.80%


## KBest Features

In [31]:
# kbest (100) dataset - without normalization...
x_ds_c500_100f = get_select_k_best_ds(x_ds_c500, y_ds_c500['w_99th_percentile'], 500)
print(x_ds_c500_100f.shape)

random_forest_model = RandomForestRegressor(n_estimators=120, random_state=42, n_jobs=-1)

# using 99th_percentile
x_train, x_test, y_train, y_test = train_test_split(x_ds_c500_100f, y_ds_c500['w_99th_percentile'].to_numpy(), test_size=0.30, random_state=42)

time = datetime.datetime.now(tz=datetime.timezone.utc)
random_forest_model.fit(x_train, y_train)
print(f'Rand. Forest Training time: {(datetime.datetime.now(tz=datetime.timezone.utc) - time).total_seconds()}s')

predicted = random_forest_model.predict(x_test)
print(f'Full dataset | 99th_percentile -> Rand. Forest NMAE: {nmae(predicted, y_test):.2%}')

# # using d_99th_percentile
# x_train, x_test, y_train, y_test = train_test_split(x_ds_c100_10f, y_ds_c100['w_99th_percentile'].to_numpy(), test_size=0.30, random_state=42)

# time = datetime.datetime.now(tz=datetime.timezone.utc)
# random_forest_model.fit(x_train, y_train)
# print(f'Rand. Forest Training time: {(datetime.datetime.now(tz=datetime.timezone.utc) - time).total_seconds()}s')

# predicted = random_forest_model.predict(x_test)
# print(f'Full dataset | d_99th_percentile -> Rand. Forest NMAE: {nmae(predicted, y_test)}')

# # using w_99th_percentile
# x_train, x_test, y_train, y_test = train_test_split(x_ds_c100_10f, y_ds_c100['w_99th_percentile'].to_numpy(), test_size=0.30, random_state=42)

# time = datetime.datetime.now(tz=datetime.timezone.utc)
# random_forest_model.fit(x_train, y_train)
# print(f'Rand. Forest Training time: {(datetime.datetime.now(tz=datetime.timezone.utc) - time).total_seconds()}s')

# predicted = random_forest_model.predict(x_test)
# print(f'Full dataset | w_99th_percentile -> Rand. Forest NMAE: {nmae(predicted, y_test)}')

  X_norms = np.sqrt(row_norms(X.T, squared=True) - n_samples * X_means**2)


Index(['timestamp', 'container_blkio_device_usage_total_1',
       'container_blkio_device_usage_total_3',
       'container_blkio_device_usage_total_4',
       'container_blkio_device_usage_total_5',
       'container_blkio_device_usage_total_6',
       'container_blkio_device_usage_total_7',
       'container_blkio_device_usage_total_9',
       'container_blkio_device_usage_total_11',
       'container_blkio_device_usage_total_13',
       ...
       'network_receive_bytes_per_container_5',
       'network_receive_bytes_per_container_6',
       'network_receive_bytes_per_container_11',
       'network_receive_bytes_per_container_14',
       'network_receive_bytes_per_container_38',
       'network_transmit_bytes_per_container_3',
       'network_transmit_bytes_per_container_5',
       'network_transmit_bytes_per_container_6',
       'network_transmit_bytes_per_container_11',
       'network_transmit_bytes_per_container_14'],
      dtype='object', length=500)
(7803, 500)
(7803, 500)
Ra

In [24]:
# kbest(10) dataset, using pipeline to normalize features and target.

# Define the feature pipeline
feature_pipeline = Pipeline([
    ('scaler', MinMaxScaler()), # Normalize input features
])

rf_regressor = RandomForestRegressor(n_estimators=120, random_state=42, n_jobs=-1)

# Combine with target scaling using TransformedTargetRegressor
model = TransformedTargetRegressor(
    regressor=Pipeline([
        ('preprocess', feature_pipeline),
        ('model', rf_regressor)
    ]),
    transformer=MinMaxScaler()  # Normalizes the target y
)

# using 99th_percentile
x_train, x_test, y_train, y_test = train_test_split(x_ds_c100_10f, y_ds_c100['99th_percentile'].to_numpy(), test_size=0.30, random_state=42)

time = datetime.datetime.now(tz=datetime.timezone.utc)
random_forest_model.fit(x_train, y_train)
print(f'Rand. Forest Training time: {(datetime.datetime.now(tz=datetime.timezone.utc) - time).total_seconds()}s')

predicted = random_forest_model.predict(x_test)
print(f'Full dataset | 99th_percentile -> Rand. Forest NMAE: {nmae(predicted, y_test)}')

# using d_99th_percentile
x_train, x_test, y_train, y_test = train_test_split(x_ds_c100_10f, y_ds_c100['d_99th_percentile'].to_numpy(), test_size=0.30, random_state=42)

time = datetime.datetime.now(tz=datetime.timezone.utc)
random_forest_model.fit(x_train, y_train)
print(f'Rand. Forest Training time: {(datetime.datetime.now(tz=datetime.timezone.utc) - time).total_seconds()}s')

predicted = random_forest_model.predict(x_test)
print(f'Full dataset | d_99th_percentile -> Rand. Forest NMAE: {nmae(predicted, y_test)}')

# using w_99th_percentile
x_train, x_test, y_train, y_test = train_test_split(x_ds_c100_10f, y_ds_c100['w_99th_percentile'].to_numpy(), test_size=0.30, random_state=42)

time = datetime.datetime.now(tz=datetime.timezone.utc)
random_forest_model.fit(x_train, y_train)
print(f'Rand. Forest Training time: {(datetime.datetime.now(tz=datetime.timezone.utc) - time).total_seconds()}s')

predicted = random_forest_model.predict(x_test)
print(f'Full dataset | w_99th_percentile -> Rand. Forest NMAE: {nmae(predicted, y_test)}')

Rand. Forest Training time: 0.796619s
Full dataset | 99th_percentile -> Rand. Forest NMAE: 0.05543340395091246
Rand. Forest Training time: 0.562692s
Full dataset | d_99th_percentile -> Rand. Forest NMAE: 0.1671126164743545
Rand. Forest Training time: 0.671782s
Full dataset | w_99th_percentile -> Rand. Forest NMAE: 0.6105355775550029


In [None]:
# kbest (20) dataset - without normalization...
random_forest_model = RandomForestRegressor(n_estimators=120, random_state=42, n_jobs=-1)

# using 99th_percentile
x_ds_c100_20f = get_select_k_best_ds(x_ds_c100, y_ds_c100[['99th_percentile']].copy(), 20)

x_train, x_test, y_train, y_test = train_test_split(x_ds_c100_20f, y_ds_c100['99th_percentile'].to_numpy(), test_size=0.30, random_state=42)

time = datetime.datetime.now(tz=datetime.timezone.utc)
random_forest_model.fit(x_train, y_train)
print(f'Rand. Forest Training time: {(datetime.datetime.now(tz=datetime.timezone.utc) - time).total_seconds()}s')

predicted = random_forest_model.predict(x_test)
print(f'Full dataset | 99th_percentile -> Rand. Forest NMAE: {nmae(predicted, y_test)}')

# using d_99th_percentile
x_ds_c100_20f = get_select_k_best_ds(x_ds_c100, y_ds_c100[['d_99th_percentile']].copy(), 20)

x_train, x_test, y_train, y_test = train_test_split(x_ds_c100_20f, y_ds_c100['d_99th_percentile'].to_numpy(), test_size=0.30, random_state=42)

time = datetime.datetime.now(tz=datetime.timezone.utc)
random_forest_model.fit(x_train, y_train)
print(f'Rand. Forest Training time: {(datetime.datetime.now(tz=datetime.timezone.utc) - time).total_seconds()}s')

predicted = random_forest_model.predict(x_test)
print(f'Full dataset | d_99th_percentile -> Rand. Forest NMAE: {nmae(predicted, y_test)}')

# using w_99th_percentile
x_train, x_test, y_train, y_test = train_test_split(x_ds_c100_10f, y_ds_c100['w_99th_percentile'].to_numpy(), test_size=0.30, random_state=42)

time = datetime.datetime.now(tz=datetime.timezone.utc)
random_forest_model.fit(x_train, y_train)
print(f'Rand. Forest Training time: {(datetime.datetime.now(tz=datetime.timezone.utc) - time).total_seconds()}s')

predicted = random_forest_model.predict(x_test)
print(f'Full dataset | w_99th_percentile -> Rand. Forest NMAE: {nmae(predicted, y_test)}')