In [205]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Dropout, BatchNormalization

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [206]:
# Load dataset from CSV file
df = pd.read_csv("k8s_pod_metrics.csv")

# Display basic info
#print(df.head())  # Preview first few rows
#print(df.info())  # Check data types and missing values
print(df.columns.to_list())
print('pod_errors' in df.columns.to_list())

print("Dataset loaded successfully.")
print("deployment" in df.columns.to_list()) 
print(len(df.columns.to_list()))
print(df['pod'].value_counts()) 

df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce', dayfirst=True).view('int64') // 10**9



['timestamp', 'namespace', 'pod', 'node', 'cpu_usage', 'cpu_limit', 'cpu_request', 'cpu_throttling', 'memory_usage', 'memory_limit', 'memory_request', 'memory_rss', 'network_receive_bytes', 'network_transmit_bytes', 'network_errors', 'restarts', 'oom_killed', 'pod_ready', 'pod_phase', 'disk_read_bytes', 'disk_write_bytes', 'disk_io_errors', 'pod_scheduled', 'pod_pending', 'pod_unschedulable', 'container_running', 'container_terminated', 'container_waiting', 'pod_uptime_seconds', 'cpu_utilization_ratio', 'memory_utilization_ratio', 'CPU Throttling', 'High CPU Usage', 'OOMKilled (Out of Memory)', 'CrashLoopBackOff', 'ContainerNotReady', 'PodUnschedulable', 'NodePressure', 'ImagePullFailure', 'node_cpu_usage', 'node_cpu_capacity', 'node_cpu_allocatable', 'node_cpu_utilization_ratio', 'node_memory_usage', 'node_memory_capacity', 'node_memory_allocatable', 'node_memory_utilization_ratio', 'node_memory_pressure', 'node_disk_read_bytes', 'node_disk_usage', 'node_disk_write_bytes', 'node_disk_

In [207]:
# Fill missing values with column mean (for numeric columns)
df.fillna(df.mean(numeric_only=True), inplace=True)

# Ensure no NaNs remain in any column
df.fillna('None', inplace=True)

# Replace empty brackets '[]' with zero
df.replace('[]', 'None', inplace=True)
print(df['deployment'])

0                               None
1       default/cpu-throttle-extreme
2       default/cpu-throttle-extreme
3       default/cpu-throttle-extreme
4       default/cpu-throttle-extreme
                    ...             
2595                            None
2596                            None
2597                            None
2598                            None
2599                            None
Name: deployment, Length: 2600, dtype: object


In [208]:
# Define columns to encode
categorical_cols = ['namespace', 'pod', 'node', 'deployment']  # Categorical values
label_encoders = {}

# Apply Label Encoding
for col in categorical_cols:
    #if col == 'deployment':
        #print(df['deployment'])
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col])

print("Encoding completed.")
print("Remaining object columns:", df.select_dtypes(include=['object']).columns.tolist())  # Should be empty if all are encoded properly


#print(df.head())  # Check if encoding is correct
#print("Encoding completed. Deployment is now label-encoded like pod and node.")
#print(df['pod'].value_counts()) 

# debug
for i in categorical_cols:
    print(df[i].unique())

Encoding completed.
Remaining object columns: ['disk_read_bytes', 'disk_write_bytes', 'node_cpu_utilization_ratio', 'node_memory_utilization_ratio']
[0 1 2 3]
[ 10  11  12  13  14  15  16  17  18  19  20  21  22  23  24  25  26  27
  28  29  31  32  39  40  43  44  45  46  47  48  49  86  88  89  90  92
  98  99 100 101 102   8   9  50  70  71  72  73  74  75  76  77  78  79
  80  81  82  83  84  85   0 105 106 107 108 109 110 111 112 113 114  41
  42  51  52  53  54  55  93  94  95  56  57  58  59  60  65  66  67  68
  96  87 103   2   3  30  61  62  63  64   1   4   5   6  33  34  35  36
  37  38  91  97   7  69 104]
[3 2 5 6 4 1 0]
[ 0  2  3  4  5  8 10 11 16 18 20 21 22 23 24  9 12 17 13 15  1 14  6  7
 19]


In [209]:
df.replace('None', 0, inplace=True)
print(len(df.columns.to_list()))


107


In [210]:
non_numeric_cols = [col for col in df.columns if not np.issubdtype(df[col].dtype, np.number)]
print("Non-numeric columns:", non_numeric_cols)


Non-numeric columns: []


In [211]:
# Define target columns
target_cols = [
    'NodePressure', 'QuotaExceeded', 'Network Unavailable', 
    'CPU Throttling', 'PodUnschedulable', 'Replica Mismatch', 
    'Node Unschedulable', 'PID Pressure', 'Memory Pressure', 
    'Disk Pressure', 'ContainerNotReady', 'ImagePullFailure', 
    'High CPU Usage', 'Node Not Ready', 'OOMKilled (Out of Memory)', 
    'CPU Pressure', 'ProgressDeadlineExceeded', 'Unavailable Pods', 
    'FailedScheduling', 'CrashLoopBackOff'
]
 # this is what we are predicting

print(target_cols)


['NodePressure', 'QuotaExceeded', 'Network Unavailable', 'CPU Throttling', 'PodUnschedulable', 'Replica Mismatch', 'Node Unschedulable', 'PID Pressure', 'Memory Pressure', 'Disk Pressure', 'ContainerNotReady', 'ImagePullFailure', 'High CPU Usage', 'Node Not Ready', 'OOMKilled (Out of Memory)', 'CPU Pressure', 'ProgressDeadlineExceeded', 'Unavailable Pods', 'FailedScheduling', 'CrashLoopBackOff']


In [212]:

# Debugging: Check unique pod values after changes
#print(df['deployment'])  

In [213]:
from sklearn.preprocessing import MinMaxScaler

# Define target columns (binary labels) that should NOT be normalized
exclude_time = [    'NodePressure', 'QuotaExceeded', 'Network Unavailable', 
    'CPU Throttling', 'PodUnschedulable', 'Replica Mismatch', 
    'Node Unschedulable', 'PID Pressure', 'Memory Pressure', 
    'Disk Pressure', 'ContainerNotReady', 'ImagePullFailure', 
    'High CPU Usage', 'Node Not Ready', 'OOMKilled (Out of Memory)', 
    'CPU Pressure', 'ProgressDeadlineExceeded', 'Unavailable Pods', 
    'FailedScheduling', 'CrashLoopBackOff', 'timestamp']

# Identify feature columns (all columns except target columns)
feature_cols = [col for col in df.columns if col not in target_cols + ['timestamp']]

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Apply MinMax Scaling ONLY to feature columns (excluding target columns)
df[feature_cols] = scaler.fit_transform(df[feature_cols])

print("Feature normalization completed. Target labels remain unchanged.")


Feature normalization completed. Target labels remain unchanged.


In [214]:
# Define time window (how many past time steps to consider)
TIME_WINDOW = 2  # Use past 5 time steps

# Define prediction horizon (how far ahead we want to predict)
PREDICTION_HORIZON = 2  # Predict 2 time steps into the future

print(f"Time Window: {TIME_WINDOW} time steps")
print(f"Prediction Horizon: {PREDICTION_HORIZON} time steps")


Time Window: 2 time steps
Prediction Horizon: 2 time steps


In [215]:
target_indices = [i for i in range(len(target_cols))]

In [216]:

def create_sequences_per_pod(df, pod_column, target_columns, time_window, prediction_horizon):
    """
    Converts raw time-series data into sequences for training.

    - Each sequence contains `time_window` steps of historical data for a specific pod.
    - The corresponding label is taken from `prediction_horizon` steps into the future.

    Args:
    - df: Pandas DataFrame containing the dataset.
    - pod_column: The column name representing pod identifiers.
    - target_columns: List of column indices corresponding to target labels.
    - time_window: Number of past time steps to include in each sequence.
    - prediction_horizon: How far ahead to predict.

    Returns:
    - X (features): NumPy array of shape (samples, time_window, features).
    - y (labels): NumPy array of shape (samples, number of target labels).
    """

    X, y = [], []
    unique_pods = df[pod_column].unique()  # Get all unique pod names

    for pod in unique_pods:
        # Extract all rows for the given pod
        pod_data = df[df[pod_column] == pod].drop(columns=[pod_column]).values  
        num_rows = len(pod_data)
        if num_rows < time_window + prediction_horizon:
            print(f"Warning: Pod {pod} has insufficient data and will be skipped.")
            continue

        # Ensure we do not go out of bounds
        for i in range(num_rows - time_window - prediction_horizon):
            # Extract past `time_window` steps as input sequence
            input_sequence = pod_data[i:i + time_window]

            # Get target values from `prediction_horizon` steps in the future
            future_index = i + time_window + prediction_horizon  
            target_values = pod_data[future_index, target_columns]  

            X.append(input_sequence)
            y.append(target_values)

    return np.array(X), np.array(y)


# Generate sequences
X, y = create_sequences_per_pod(df, 'pod', target_indices, TIME_WINDOW, PREDICTION_HORIZON)

# Print shapes of output arrays
print(f"X shape: {X.shape} (samples, time steps, features)")
print(f"y shape: {y.shape} (samples, target labels)")
print("Time-series sequences created successfully.")


X shape: (2140, 2, 106) (samples, time steps, features)
y shape: (2140, 20) (samples, target labels)
Time-series sequences created successfully.


In [217]:
# Split into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)
print(X_train)
# Print dataset shapes
print(f"Training Set: X_train={X_train.shape}, y_train={y_train.shape}")
print(f"Testing Set: X_test={X_test.shape}, y_test={y_test.shape}")
print("Sequences applied and dataset split successfully.")
#X_train = np.array(X_train, dtype=np.float32)
#y_train = np.array(y_train, dtype=np.float32)

[[[1.74266784e+09 3.33333333e-01 3.33333333e-01 ... 0.00000000e+00
   0.00000000e+00 0.00000000e+00]
  [1.74266788e+09 3.33333333e-01 3.33333333e-01 ... 0.00000000e+00
   0.00000000e+00 0.00000000e+00]]

 [[1.74266826e+09 0.00000000e+00 6.66666667e-01 ... 0.00000000e+00
   0.00000000e+00 4.16666667e-01]
  [1.74266835e+09 0.00000000e+00 6.66666667e-01 ... 0.00000000e+00
   0.00000000e+00 4.16666667e-01]]

 [[1.74266904e+09 0.00000000e+00 8.33333333e-01 ... 0.00000000e+00
   0.00000000e+00 4.16666667e-02]
  [1.74266910e+09 0.00000000e+00 8.33333333e-01 ... 0.00000000e+00
   0.00000000e+00 4.16666667e-02]]

 ...

 [[1.74266885e+09 3.33333333e-01 8.33333333e-01 ... 0.00000000e+00
   0.00000000e+00 0.00000000e+00]
  [1.74266892e+09 3.33333333e-01 8.33333333e-01 ... 0.00000000e+00
   0.00000000e+00 0.00000000e+00]]

 [[1.74266860e+09 1.00000000e+00 6.66666667e-01 ... 0.00000000e+00
   0.00000000e+00 9.16666667e-01]
  [1.74266868e+09 1.00000000e+00 6.66666667e-01 ... 0.00000000e+00
   0.00000

In [218]:

# Get input shape
n_time_steps = X_train.shape[1]  # TIME_WINDOW (sequence length)
n_features = X_train.shape[2]  # Number of input features (metrics)
n_outputs = y_train.shape[1]  # Number of target labels 

# Define the GRU model
model = Sequential([
    GRU(128, return_sequences=True, input_shape=(n_time_steps, n_features)),  # First GRU layer
    Dropout(0.6),  # Prevent overfitting
    BatchNormalization(),

    GRU(64, return_sequences=False),  # Second GRU layer (returns final output)
    Dropout(0.2),
    BatchNormalization(),

    Dense(32, activation='relu'),  # Fully connected layer
    Dense(n_outputs, activation='sigmoid')  # Output layer (sigmoid for multi-label classification)
])

# Compile the model


# Print the model summary
model.summary()


  super().__init__(**kwargs)


In [219]:
# Compile the GRU model
model.compile(
    optimizer='adam',  # Adaptive optimizer for efficient learning
    loss='categorical_crossentropy',  # Suitable for multi-label classification
    metrics=['accuracy']  # Monitor accuracy during training
)

print("Model compiled successfully.")


Model compiled successfully.


In [220]:
model.save("gru_model.h5")
loaded_model = tf.keras.models.load_model("gru_model.h5")

# Check model summary
loaded_model.summary()




In [221]:

# Define batch size and epochs
BATCH_SIZE = 32  # Number of samples per batch
EPOCHS = 50  # Number of training iterations

# Ensure all data is numeric before conversion
# Example: If X_train contains non-numeric data, preprocess it here
# X_train = preprocess_features(X_train)

# Convert training and test sets to float32
X_train = np.array(X_train).astype(np.float32)
X_test = np.array(X_test).astype(np.float32)
y_train = np.array(y_train).astype(np.float32)
y_test = np.array(y_test).astype(np.float32)

# Train the model
history = model.fit(
    X_train, y_train,  # Training data
    validation_data=(X_test, y_test),  # Validation during training
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    verbose=1  # Print training progress
)

print("Model training complete.")

Epoch 1/50
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 13ms/step - accuracy: 0.1408 - loss: 4925079040.0000 - val_accuracy: 1.0000 - val_loss: 2997897472.0000
Epoch 2/50
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.8197 - loss: 1738791168.0000 - val_accuracy: 1.0000 - val_loss: 1014638528.0000
Epoch 3/50
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9702 - loss: 424575552.0000 - val_accuracy: 1.0000 - val_loss: 120766528.0000
Epoch 4/50
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 1.0000 - loss: 49184212.0000 - val_accuracy: 1.0000 - val_loss: 78137.2500
Epoch 5/50
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 1.0000 - loss: 9677951.0000 - val_accuracy: 1.0000 - val_loss: 2327.2957
Epoch 6/50
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 1.0000 - loss: 1063664.2500 - v

In [222]:
# Evaluate the model on the test dataset
test_loss, test_accuracy = model.evaluate(X_test, y_test)

# Print results
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")


[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 1.0000 - loss: 21724.1484 
Test Loss: 23708.0625
Test Accuracy: 1.0000
