In [7]:
import os
import time
import numpy as np
import pandas as pd
import torch
from torch import nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from torch_geometric.data import Data
from torch_geometric.utils import from_networkx
from torch_geometric.nn import GCNConv
import networkx as nx
import matplotlib.pyplot as plt

# Disable GPU to avoid NCCL issues
os.environ["CUDA_VISIBLE_DEVICES"] = ""
device = torch.device("cpu")

# Create folder for outputs
os.makedirs("visualization", exist_ok=True)

# Load the dataset
df = pd.read_csv("Data/borg_traces_data.csv")
print("Dataset loaded:", df.shape)


Dataset loaded: (405894, 34)


In [None]:
from ast import literal_eval

# Parse histograms into consistent 10-bin arrays
def parse_hist(x):
    try:
        arr = np.array(literal_eval(x), dtype=np.float32)
        return arr if arr.shape[0] == 10 else None  # only allow exact shape
    except:
        return None

# Clean the dataset
df = df[['time', 'machine_id', 'alloc_collection_id', 'cpu_usage_distribution', 'failed']]
df['cpu_hist'] = df['cpu_usage_distribution'].apply(parse_hist)
df.dropna(subset=['cpu_hist', 'machine_id', 'alloc_collection_id', 'failed'], inplace=True)

In [None]:
df['hist_len'] = df['cpu_usage_distribution'].apply(
    lambda x: len(literal_eval(x)) if isinstance(x, str) else 0
)

print("Histogram length distribution:")
print(df['hist_len'].value_counts())


print("Jobs per machine_id (top 10):")
print(df['machine_id'].value_counts().head(10))

print(df['cpu_usage_distribution'].dropna().head(10).to_list())



.



Histogram length distribution:
Series([], Name: count, dtype: int64)
Jobs per machine_id (top 10):
Series([], Name: count, dtype: int64)
[]


In [17]:
import pandas as pd
import numpy as np

# Load dataset
df = pd.read_csv("Data/borg_traces_data.csv")
print("="*40)
print("✅ Dataset Loaded")
print("="*40)
print("Shape:", df.shape)
print("\nFirst 5 rows:")
print(df.head())

# Basic info
print("\n" + "="*40)
print("📋 Dataset Info")
print("="*40)
df.info()

# Summary stats
print("\n" + "="*40)
print("📊 Numerical Summary")
print("="*40)
print(df.describe(include=[np.number]))

# Categorical breakdowns
print("\n" + "="*40)
print("🧮 Unique Values by Column")
print("="*40)
for col in df.columns:
    print(f"{col}: {df[col].nunique()} unique values")

# NaN values
print("\n" + "="*40)
print("🚨 Missing Value Report")
print("="*40)
missing = df.isnull().sum()
print(missing[missing > 0].sort_values(ascending=False))

# Distribution of target label
if 'failed' in df.columns:
    print("\n" + "="*40)
    print("🟩 Failure Label Distribution")
    print("="*40)
    print(df['failed'].value_counts())


✅ Dataset Loaded
Shape: (405894, 34)

First 5 rows:
   Unnamed: 0           time  instance_events_type  collection_id  \
0           0              0                     2    94591244395   
1           1  2517305308183                     2   260697606809   
2           2   195684022913                     6   276227177776   
3           3              0                     2    10507389885   
4           4  1810627494172                     3    25911621841   

   scheduling_class  collection_type  priority  alloc_collection_id  \
0                 3                1       200                    0   
1                 2                0       360         221495397286   
2                 2                0       103                    0   
3                 3                0       200                    0   
4                 2                0         0                    0   

   instance_index    machine_id  ... assigned_memory page_cache_memory  \
0             144  168846390496 

In [None]:

from ast import literal_eval

# Parse histograms into consistent 10-bin arrays
def parse_hist(x):
    try:
        arr = np.array(literal_eval(x), dtype=np.float32)
        return arr if arr.shape[0] == 10 else None  # only allow exact shape
    except:
        return None

# Clean the dataset
df = df[['time', 'machine_id', 'alloc_collection_id', 'cpu_usage_distribution', 'failed']]
df['cpu_hist'] = df['cpu_usage_distribution'].apply(parse_hist)
df.dropna(subset=['cpu_hist', 'machine_id', 'alloc_collection_id', 'failed'], inplace=True)

# Build sequences of at least 6 time steps
sequences = []
labels = []
job_to_index = {}
index = 0
group_key = 'machine_id'  # instead of 'alloc_collection_id'
min_seq_len = 3  # you can return to 6 later

for entity_id, group in df.groupby(group_key):
    group = group.sort_values(by='time')
    hists = group['cpu_hist'].tolist()
    
    if len(hists) >= min_seq_len:
        valid_seq = np.stack(hists[:min_seq_len])
        sequences.append(valid_seq)
        labels.append(int(group['failed'].iloc[-1]))
        job_to_index[entity_id] = index
        index += 1

print("Total valid alloc_collection_id groups with >=6 time steps:", len(sequences))

X_seq = np.stack(sequences)
y_seq = np.array(labels)

print("Shape of X_seq:", X_seq.shape)
print("Total labeled jobs:", y_seq.shape[0])





Total valid alloc_collection_id groups with >=6 time steps: 0


ValueError: need at least one array to stack