<a href="https://colab.research.google.com/github/AriSu2904/gist/blob/main/CPU_AD_Global.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Isolation Forest

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score, roc_auc_score
import os
import glob
import time
import itertools

# ==========================================
# --- CONFIGURATION ---
# ==========================================
INPUT_FOLDER = '/content/drive/MyDrive/NAB_RESOURCES/nab_resources/nab_final_split'

# GRID PARAMETER (Range lebih besar karena data lebih besar)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_samples': [128, 256, 512]   # Kita naikkan opsinya karena data global lebih variatif
}

print("="*60)
print("GLOBAL ISOLATION FOREST: MERGE -> TUNE -> EXECUTE")
print("="*60)

# ---------------------------------------------------------
# STEP 1: MERGE DATA (MEMBUAT DATASET RAKSASA)
# ---------------------------------------------------------
print("\n[STEP 1] Merging Data...")
train_files = sorted(glob.glob(os.path.join(INPUT_FOLDER, "*_train.csv")))
test_files = sorted(glob.glob(os.path.join(INPUT_FOLDER, "*_test.csv")))

if not train_files:
    print("[ERROR] File training tidak ditemukan.")
    exit()

# Gabung Training
train_list = []
for f in train_files:
    df = pd.read_csv(f)
    df['source'] = os.path.basename(f).replace('_train.csv', '')
    train_list.append(df)
global_train = pd.concat(train_list, ignore_index=True)

# Gabung Testing
test_list = []
for f in test_files:
    df = pd.read_csv(f)
    df['source'] = os.path.basename(f).replace('_test.csv', '')
    test_list.append(df)
global_test = pd.concat(test_list, ignore_index=True)

print(f" > Global Train Size : {len(global_train)} rows")
print(f" > Global Test Size  : {len(global_test)} rows")

# ---------------------------------------------------------
# STEP 2: PREPARATION & SCALING
# ---------------------------------------------------------
print("\n[STEP 2] Preprocessing...")
scaler = MinMaxScaler()
# Fit di Global Train
global_train['value_scaled'] = scaler.fit_transform(global_train['value'].values.reshape(-1,1))
# Transform di Global Test
global_test['value_scaled'] = scaler.transform(global_test['value'].values.reshape(-1,1))

# Feature Engineering
for df in [global_train, global_test]:
    df['roll_mean'] = df['value_scaled'].rolling(5, min_periods=1).mean()
    df['diff'] = df['value_scaled'].diff().fillna(0)
    df['accel'] = df['diff'].diff().fillna(0)

feats = ['value_scaled', 'roll_mean', 'diff', 'accel']

# ---------------------------------------------------------
# STEP 3: HYPERPARAMETER TUNING (KHUSUS GLOBAL)
# ---------------------------------------------------------
print("\n[STEP 3] Tuning Global Parameters...")
print(f"{'n_est':<10} | {'max_samp':<10} | {'AUC (Avg)':<10} | {'Time (s)':<10}")
print("-" * 50)

best_auc = -1
best_params = {}

keys = param_grid.keys()
combinations = list(itertools.product(*param_grid.values()))

# Kita tuning berdasarkan performa rata-rata di Global Test
for combo in combinations:
    params = dict(zip(keys, combo))
    t0 = time.time()

    # Train Model Global
    model = IsolationForest(
        n_estimators=params['n_estimators'],
        max_samples=params['max_samples'],
        contamination='auto',
        random_state=42,
        n_jobs=-1
    )
    model.fit(global_train[feats])

    # Test ke semua data test
    scores = -model.score_samples(global_test[feats])

    # Hitung AUC Global (Micro Average) - Cara cepat buat tuning
    # (Di step final nanti kita hitung per server biar detail)
    try:
        auc = roc_auc_score(global_test['label'], scores)
    except:
        auc = 0

    dt = time.time() - t0
    print(f"{params['n_estimators']:<10} | {params['max_samples']:<10} | {auc:.4f}     | {dt:.4f}")

    if auc > best_auc:
        best_auc = auc
        best_params = params

print("-" * 50)
print(f"JUARA GLOBAL: n_estimators={best_params['n_estimators']}, max_samples={best_params['max_samples']}")
print(f"Skor AUC Global Sementara: {best_auc:.4f}")

# ---------------------------------------------------------
# STEP 4: FINAL EXECUTION & EVALUATION PER SOURCE
# ---------------------------------------------------------
print("\n[STEP 4] Final Evaluation Per Server (Using Best Params)...")
print("-" * 65)
print(f"{'Source':<30} | {'AUC':<10} | {'F1':<10}")
print("-" * 65)

# Train Final Model
final_model = IsolationForest(
    n_estimators=best_params['n_estimators'],
    max_samples=best_params['max_samples'],
    contamination='auto',
    random_state=42,
    n_jobs=-1
)
final_model.fit(global_train[feats])

# Evaluasi per server (biar bisa dibandingin sama Local Model)
unique_sources = global_test['source'].unique()
results = []

for source in unique_sources:
    # Ambil data khusus server ini
    subset = global_test[global_test['source'] == source].copy()

    if len(subset['label'].unique()) > 1:
        scores = -final_model.score_samples(subset[feats])
        auc = roc_auc_score(subset['label'], scores)

        # Cari F1 Terbaik
        best_f1 = 0
        threshs = np.linspace(scores.min(), scores.max(), 100)
        for t in threshs:
            p = (scores > t).astype(int)
            f = f1_score(subset['label'], p, zero_division=0)
            if f > best_f1: best_f1 = f

        print(f"{source:<30} | {auc:.4f}     | {best_f1:.4f}")
        results.append({'source': source, 'auc': auc, 'f1': best_f1})
    else:
        pass

# ---------------------------------------------------------
# SUMMARY
# ---------------------------------------------------------
print("="*65)
print("FINAL RESULT: GLOBAL ISOLATION FOREST")
print("="*65)
df_res = pd.DataFrame(results)
if not df_res.empty:
    print(df_res)
    print("-" * 65)
    print(f"AVERAGE AUC (Macro): {df_res['auc'].mean():.4f}")
else:
    print("No valid results.")
print("="*65)

GLOBAL ISOLATION FOREST: MERGE -> TUNE -> EXECUTE

[STEP 1] Merging Data...
 > Global Train Size : 16122 rows
 > Global Test Size  : 16128 rows

[STEP 2] Preprocessing...

[STEP 3] Tuning Global Parameters...
n_est      | max_samp   | AUC (Avg)  | Time (s)  
--------------------------------------------------
100        | 128        | 0.7073     | 0.6375
100        | 256        | 0.7109     | 0.6118
100        | 512        | 0.7134     | 0.8181
200        | 128        | 0.7055     | 1.9802
200        | 256        | 0.7125     | 1.7021
200        | 512        | 0.7073     | 1.4439
300        | 128        | 0.7047     | 1.7676
300        | 256        | 0.7107     | 1.5556
300        | 512        | 0.7096     | 1.1003
--------------------------------------------------
JUARA GLOBAL: n_estimators=100, max_samples=512
Skor AUC Global Sementara: 0.7134

[STEP 4] Final Evaluation Per Server (Using Best Params)...
-----------------------------------------------------------------
Source          

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score, roc_auc_score
import os
import glob
import time
import itertools

# ==========================================
# --- CONFIGURATION ---
# ==========================================
INPUT_FOLDER = '/content/drive/MyDrive/NAB_RESOURCES/nab_resources/nab_final_split_30'

# GRID PARAMETER (Range lebih besar karena data lebih besar)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_samples': [128, 256, 512]   # Kita naikkan opsinya karena data global lebih variatif
}

print("="*60)
print("GLOBAL ISOLATION FOREST: MERGE -> TUNE -> EXECUTE")
print("="*60)

# ---------------------------------------------------------
# STEP 1: MERGE DATA (MEMBUAT DATASET RAKSASA)
# ---------------------------------------------------------
print("\n[STEP 1] Merging Data...")
train_files = sorted(glob.glob(os.path.join(INPUT_FOLDER, "*_train.csv")))
test_files = sorted(glob.glob(os.path.join(INPUT_FOLDER, "*_test.csv")))

if not train_files:
    print("[ERROR] File training tidak ditemukan.")
    exit()

# Gabung Training
train_list = []
for f in train_files:
    df = pd.read_csv(f)
    df['source'] = os.path.basename(f).replace('_train.csv', '')
    train_list.append(df)
global_train = pd.concat(train_list, ignore_index=True)

# Gabung Testing
test_list = []
for f in test_files:
    df = pd.read_csv(f)
    df['source'] = os.path.basename(f).replace('_test.csv', '')
    test_list.append(df)
global_test = pd.concat(test_list, ignore_index=True)

print(f" > Global Train Size : {len(global_train)} rows")
print(f" > Global Test Size  : {len(global_test)} rows")

# ---------------------------------------------------------
# STEP 2: PREPARATION & SCALING
# ---------------------------------------------------------
print("\n[STEP 2] Preprocessing...")
scaler = MinMaxScaler()
# Fit di Global Train
global_train['value_scaled'] = scaler.fit_transform(global_train['value'].values.reshape(-1,1))
# Transform di Global Test
global_test['value_scaled'] = scaler.transform(global_test['value'].values.reshape(-1,1))

# Feature Engineering
for df in [global_train, global_test]:
    df['roll_mean'] = df['value_scaled'].rolling(5, min_periods=1).mean()
    df['diff'] = df['value_scaled'].diff().fillna(0)
    df['accel'] = df['diff'].diff().fillna(0)

feats = ['value_scaled', 'roll_mean', 'diff', 'accel']

# ---------------------------------------------------------
# STEP 3: HYPERPARAMETER TUNING (KHUSUS GLOBAL)
# ---------------------------------------------------------
print("\n[STEP 3] Tuning Global Parameters...")
print(f"{'n_est':<10} | {'max_samp':<10} | {'AUC (Avg)':<10} | {'Time (s)':<10}")
print("-" * 50)

best_auc = -1
best_params = {}

keys = param_grid.keys()
combinations = list(itertools.product(*param_grid.values()))

# Kita tuning berdasarkan performa rata-rata di Global Test
for combo in combinations:
    params = dict(zip(keys, combo))
    t0 = time.time()

    # Train Model Global
    model = IsolationForest(
        n_estimators=params['n_estimators'],
        max_samples=params['max_samples'],
        contamination='auto',
        random_state=42,
        n_jobs=-1
    )
    model.fit(global_train[feats])

    # Test ke semua data test
    scores = -model.score_samples(global_test[feats])

    # Hitung AUC Global (Micro Average) - Cara cepat buat tuning
    # (Di step final nanti kita hitung per server biar detail)
    try:
        auc = roc_auc_score(global_test['label'], scores)
    except:
        auc = 0

    dt = time.time() - t0
    print(f"{params['n_estimators']:<10} | {params['max_samples']:<10} | {auc:.4f}     | {dt:.4f}")

    if auc > best_auc:
        best_auc = auc
        best_params = params

print("-" * 50)
print(f"JUARA GLOBAL: n_estimators={best_params['n_estimators']}, max_samples={best_params['max_samples']}")
print(f"Skor AUC Global Sementara: {best_auc:.4f}")

# ---------------------------------------------------------
# STEP 4: FINAL EXECUTION & EVALUATION PER SOURCE
# ---------------------------------------------------------
print("\n[STEP 4] Final Evaluation Per Server (Using Best Params)...")
print("-" * 65)
print(f"{'Source':<30} | {'AUC':<10} | {'F1':<10}")
print("-" * 65)

# Train Final Model
final_model = IsolationForest(
    n_estimators=best_params['n_estimators'],
    max_samples=best_params['max_samples'],
    contamination='auto',
    random_state=42,
    n_jobs=-1
)
final_model.fit(global_train[feats])

# Evaluasi per server (biar bisa dibandingin sama Local Model)
unique_sources = global_test['source'].unique()
results = []

for source in unique_sources:
    # Ambil data khusus server ini
    subset = global_test[global_test['source'] == source].copy()

    if len(subset['label'].unique()) > 1:
        scores = -final_model.score_samples(subset[feats])
        auc = roc_auc_score(subset['label'], scores)

        # Cari F1 Terbaik
        best_f1 = 0
        threshs = np.linspace(scores.min(), scores.max(), 100)
        for t in threshs:
            p = (scores > t).astype(int)
            f = f1_score(subset['label'], p, zero_division=0)
            if f > best_f1: best_f1 = f

        print(f"{source:<30} | {auc:.4f}     | {best_f1:.4f}")
        results.append({'source': source, 'auc': auc, 'f1': best_f1})
    else:
        pass

# ---------------------------------------------------------
# SUMMARY
# ---------------------------------------------------------
print("="*65)
print("FINAL RESULT: GLOBAL ISOLATION FOREST")
print("="*65)
df_res = pd.DataFrame(results)
if not df_res.empty:
    print(df_res)
    print("-" * 65)
    print(f"AVERAGE AUC (Macro): {df_res['auc'].mean():.4f}")
else:
    print("No valid results.")
print("="*65)

GLOBAL ISOLATION FOREST: MERGE -> TUNE -> EXECUTE

[STEP 1] Merging Data...
 > Global Train Size : 9671 rows
 > Global Test Size  : 22584 rows

[STEP 2] Preprocessing...

[STEP 3] Tuning Global Parameters...
n_est      | max_samp   | AUC (Avg)  | Time (s)  
--------------------------------------------------
100        | 128        | 0.7655     | 0.3778
100        | 256        | 0.7750     | 0.4020
100        | 512        | 0.7721     | 0.3990
200        | 128        | 0.7627     | 0.7361
200        | 256        | 0.7703     | 0.7816
200        | 512        | 0.7711     | 1.0181
300        | 128        | 0.7572     | 1.3837
300        | 256        | 0.7683     | 1.4287
300        | 512        | 0.7730     | 1.1434
--------------------------------------------------
JUARA GLOBAL: n_estimators=100, max_samples=256
Skor AUC Global Sementara: 0.7750

[STEP 4] Final Evaluation Per Server (Using Best Params)...
-----------------------------------------------------------------
Source           

#VAE-LSTM

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, roc_auc_score
import os
import glob
import time
import itertools

# ==========================================
# --- CONFIGURATION ---
# ==========================================
INPUT_FOLDER = '/content/drive/MyDrive/NAB_RESOURCES/nab_resources/nab_final_split'
TIME_STEPS = 10

# GRID PARAMETER UNTUK TUNING
param_grid = {
    'latent_dim': [4, 8],       # Coba naikin dikit siapa tau butuh info lebih
    'lstm_units': [32, 64]      # Coba naikin kapasitas otak
}

print("="*60)
print("GLOBAL VAE-LSTM: MERGE -> TUNE -> EXECUTE (ULTIMATE)")
print("="*60)

# ---------------------------------------------------------
# STEP 1: MERGE DATA
# ---------------------------------------------------------
print("\n[STEP 1] Merging Data...")
train_files = sorted(glob.glob(os.path.join(INPUT_FOLDER, "*_train.csv")))
test_files = sorted(glob.glob(os.path.join(INPUT_FOLDER, "*_test.csv")))

train_list = []
for f in train_files:
    df = pd.read_csv(f)
    df['source'] = os.path.basename(f).replace('_train.csv', '')
    train_list.append(df)
global_train = pd.concat(train_list, ignore_index=True)

test_list = []
for f in test_files:
    df = pd.read_csv(f)
    df['source'] = os.path.basename(f).replace('_test.csv', '')
    test_list.append(df)
global_test = pd.concat(test_list, ignore_index=True)

print(f" > Global Train Size : {len(global_train)} rows")
print(f" > Global Test Size  : {len(global_test)} rows")

# ---------------------------------------------------------
# STEP 2: PREPROCESSING
# ---------------------------------------------------------
print("\n[STEP 2] Preprocessing...")
scaler = StandardScaler()
global_train_vals = scaler.fit_transform(global_train['value'].values.reshape(-1,1))
global_test['value_scaled'] = scaler.transform(global_test['value'].values.reshape(-1,1))

def create_sequences(values):
    xs = []
    if len(values) > TIME_STEPS:
        for i in range(len(values) - TIME_STEPS):
            xs.append(values[i:(i + TIME_STEPS)])
    return np.array(xs)

X_train_global = create_sequences(global_train_vals)
# Ambil label global (potong 10 di awal karena sequence)
# Note: Label global ini cuma buat tuning cepat, nanti final eval tetep per source
y_test_global = global_test['label'].values[TIME_STEPS:]
X_test_global = create_sequences(global_test['value_scaled'].values.reshape(-1,1))

# ---------------------------------------------------------
# STEP 3: MODEL BUILDER (Robust Version)
# ---------------------------------------------------------
def build_vae_dynamic(lstm_units, latent_dim):
    enc_in = keras.Input(shape=(TIME_STEPS, 1))
    x = layers.LSTM(lstm_units, return_sequences=False)(enc_in)
    z_mean = layers.Dense(latent_dim)(x)
    z_log_var = layers.Dense(latent_dim)(x)

    def sampling(args):
        zm, zv = args
        batch = tf.shape(zm)[0]
        dim = tf.shape(zm)[1]
        eps = tf.keras.backend.random_normal(shape=(batch, dim))
        return zm + tf.keras.backend.exp(0.5 * zv) * eps

    z = layers.Lambda(sampling)([z_mean, z_log_var])
    encoder = keras.Model(enc_in, [z_mean, z_log_var, z])

    dec_in = keras.Input(shape=(latent_dim,))
    x = layers.RepeatVector(TIME_STEPS)(dec_in)
    x = layers.LSTM(lstm_units, return_sequences=True)(x)
    dec_out = layers.TimeDistributed(layers.Dense(1))(x)
    decoder = keras.Model(dec_in, dec_out)

    class VAE(keras.Model):
        def __init__(self, enc, dec):
            super().__init__()
            self.enc = enc
            self.dec = dec
        def train_step(self, data):
            with tf.GradientTape() as tape:
                zm, zv, z = self.enc(data)
                recon = self.dec(z)
                recon_loss = tf.reduce_mean(tf.reduce_sum(keras.losses.mse(data, recon), axis=1))
                kl_loss = -0.5 * (1 + zv - tf.square(zm) - tf.exp(zv))
                kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss, axis=1))
                total_loss = recon_loss + kl_loss
            grads = tape.gradient(total_loss, self.trainable_weights)
            self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
            return {"loss": total_loss}
        def call(self, inputs):
            zm, _, z = self.enc(inputs)
            return self.dec(z)

    vae = VAE(encoder, decoder)
    vae.compile(optimizer='adam')
    return vae

# ---------------------------------------------------------
# STEP 4: TUNING LOOP
# ---------------------------------------------------------
print("\n[STEP 4] Tuning Global Parameters...")
print(f"{'latent':<10} | {'lstm':<10} | {'AUC (Global)':<12} | {'Time (s)':<10}")
print("-" * 55)

keys = param_grid.keys()
combinations = list(itertools.product(*param_grid.values()))

best_auc = -1
best_params = {}

for combo in combinations:
    params = dict(zip(keys, combo))
    tf.keras.backend.clear_session() # Bersihin memori

    t0 = time.time()

    vae = build_vae_dynamic(params['lstm_units'], params['latent_dim'])
    # Epoch dikit aja (15) buat screening
    vae.fit(X_train_global, epochs=15, batch_size=128, verbose=0)

    # Predict Global (Quick Check)
    # Kita sample sebagian test set aja biar cepet kalau kegedean
    idx = np.random.choice(len(X_test_global), size=min(5000, len(X_test_global)), replace=False)
    pred = vae.predict(X_test_global[idx], verbose=0)
    scores = np.mean(np.mean(np.square(X_test_global[idx] - pred), axis=1), axis=1)

    try:
        auc = roc_auc_score(y_test_global[idx], scores)
    except:
        auc = 0.5

    dt = time.time() - t0
    print(f"{params['latent_dim']:<10} | {params['lstm_units']:<10} | {auc:.4f}       | {dt:.2f}")

    if auc > best_auc:
        best_auc = auc
        best_params = params

print("-" * 55)
print(f"JUARA GLOBAL: latent={best_params['latent_dim']}, lstm={best_params['lstm_units']}")

# ---------------------------------------------------------
# STEP 5: FINAL EXECUTION (WITH BEST PARAMS)
# ---------------------------------------------------------
print("\n[STEP 5] Final Evaluation Per Server...")
tf.keras.backend.clear_session()

# Re-Train Full Power (30 Epochs)
final_vae = build_vae_dynamic(best_params['lstm_units'], best_params['latent_dim'])
final_vae.fit(X_train_global, epochs=30, batch_size=64, verbose=0)

print("-" * 65)
print(f"{'Source':<30} | {'AUC':<10} | {'F1':<10}")
print("-" * 65)

unique_sources = global_test['source'].unique()
results = []

for source in unique_sources:
    subset = global_test[global_test['source'] == source].copy()

    if len(subset) > TIME_STEPS + 5:
        sub_vals = subset['value_scaled'].values.reshape(-1,1)
        X_sub = create_sequences(sub_vals)
        y_sub = subset['label'].values[TIME_STEPS:]

        if len(np.unique(y_sub)) > 1:
            pred = final_vae.predict(X_sub, verbose=0)
            scores = np.mean(np.mean(np.square(X_sub - pred), axis=1), axis=1)

            auc = roc_auc_score(y_sub, scores)

            best_f1 = 0
            threshs = np.linspace(scores.min(), scores.max(), 100)
            for t in threshs:
                p = (scores > t).astype(int)
                f = f1_score(y_sub, p, zero_division=0)
                if f > best_f1: best_f1 = f

            print(f"{source:<30} | {auc:.4f}     | {best_f1:.4f}")
            results.append({'source': source, 'auc': auc, 'f1': best_f1})

print("="*65)
print("FINAL RESULT: GLOBAL VAE-LSTM (TUNED)")
print("="*65)
df_res = pd.DataFrame(results)
if not df_res.empty:
    print(df_res)
    print("-" * 65)
    print(f"AVERAGE AUC (Macro): {df_res['auc'].mean():.4f}")
else:
    print("No valid results.")
print("="*65)

GLOBAL VAE-LSTM: MERGE -> TUNE -> EXECUTE (ULTIMATE)

[STEP 1] Merging Data...
 > Global Train Size : 16122 rows
 > Global Test Size  : 16128 rows

[STEP 2] Preprocessing...

[STEP 4] Tuning Global Parameters...
latent     | lstm       | AUC (Global) | Time (s)  
-------------------------------------------------------
4          | 32         | 0.1431       | 49.57




4          | 64         | nan       | 73.03
8          | 32         | 0.4303       | 38.79
8          | 64         | 0.2460       | 70.99
-------------------------------------------------------
JUARA GLOBAL: latent=8, lstm=32

[STEP 5] Final Evaluation Per Server...
-----------------------------------------------------------------
Source                         | AUC        | F1        
-----------------------------------------------------------------
ec2_cpu_utilization_24ae8d     | 0.4838     | 0.0020
ec2_cpu_utilization_53ea38     | 0.3825     | 0.0010
ec2_cpu_utilization_5f5533     | 0.7716     | 0.0031
ec2_cpu_utilization_ac20cd     | 0.9736     | 0.0312
ec2_cpu_utilization_fe7f93     | 0.6163     | 0.4000
FINAL RESULT: GLOBAL VAE-LSTM (TUNED)
                       source       auc        f1
0  ec2_cpu_utilization_24ae8d  0.483782  0.001993
1  ec2_cpu_utilization_53ea38  0.382544  0.000997
2  ec2_cpu_utilization_5f5533  0.771571  0.003077
3  ec2_cpu_utilization_ac20cd  0.973566  

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, roc_auc_score
import os
import glob
import time
import itertools

# ==========================================
# --- CONFIGURATION ---
# ==========================================
INPUT_FOLDER = '/content/drive/MyDrive/NAB_RESOURCES/nab_resources/nab_final_split_30'
TIME_STEPS = 10

# GRID PARAMETER UNTUK TUNING
param_grid = {
    'latent_dim': [4, 8],       # Coba naikin dikit siapa tau butuh info lebih
    'lstm_units': [32, 64]      # Coba naikin kapasitas otak
}

print("="*60)
print("GLOBAL VAE-LSTM: MERGE -> TUNE -> EXECUTE (ULTIMATE)")
print("="*60)

# ---------------------------------------------------------
# STEP 1: MERGE DATA
# ---------------------------------------------------------
print("\n[STEP 1] Merging Data...")
train_files = sorted(glob.glob(os.path.join(INPUT_FOLDER, "*_train.csv")))
test_files = sorted(glob.glob(os.path.join(INPUT_FOLDER, "*_test.csv")))

train_list = []
for f in train_files:
    df = pd.read_csv(f)
    df['source'] = os.path.basename(f).replace('_train.csv', '')
    train_list.append(df)
global_train = pd.concat(train_list, ignore_index=True)

test_list = []
for f in test_files:
    df = pd.read_csv(f)
    df['source'] = os.path.basename(f).replace('_test.csv', '')
    test_list.append(df)
global_test = pd.concat(test_list, ignore_index=True)

print(f" > Global Train Size : {len(global_train)} rows")
print(f" > Global Test Size  : {len(global_test)} rows")

# ---------------------------------------------------------
# STEP 2: PREPROCESSING
# ---------------------------------------------------------
print("\n[STEP 2] Preprocessing...")
scaler = StandardScaler()
global_train_vals = scaler.fit_transform(global_train['value'].values.reshape(-1,1))
global_test['value_scaled'] = scaler.transform(global_test['value'].values.reshape(-1,1))

def create_sequences(values):
    xs = []
    if len(values) > TIME_STEPS:
        for i in range(len(values) - TIME_STEPS):
            xs.append(values[i:(i + TIME_STEPS)])
    return np.array(xs)

X_train_global = create_sequences(global_train_vals)
# Ambil label global (potong 10 di awal karena sequence)
# Note: Label global ini cuma buat tuning cepat, nanti final eval tetep per source
y_test_global = global_test['label'].values[TIME_STEPS:]
X_test_global = create_sequences(global_test['value_scaled'].values.reshape(-1,1))

# ---------------------------------------------------------
# STEP 3: MODEL BUILDER (Robust Version)
# ---------------------------------------------------------
def build_vae_dynamic(lstm_units, latent_dim):
    enc_in = keras.Input(shape=(TIME_STEPS, 1))
    x = layers.LSTM(lstm_units, return_sequences=False)(enc_in)
    z_mean = layers.Dense(latent_dim)(x)
    z_log_var = layers.Dense(latent_dim)(x)

    def sampling(args):
        zm, zv = args
        batch = tf.shape(zm)[0]
        dim = tf.shape(zm)[1]
        eps = tf.keras.backend.random_normal(shape=(batch, dim))
        return zm + tf.keras.backend.exp(0.5 * zv) * eps

    z = layers.Lambda(sampling)([z_mean, z_log_var])
    encoder = keras.Model(enc_in, [z_mean, z_log_var, z])

    dec_in = keras.Input(shape=(latent_dim,))
    x = layers.RepeatVector(TIME_STEPS)(dec_in)
    x = layers.LSTM(lstm_units, return_sequences=True)(x)
    dec_out = layers.TimeDistributed(layers.Dense(1))(x)
    decoder = keras.Model(dec_in, dec_out)

    class VAE(keras.Model):
        def __init__(self, enc, dec):
            super().__init__()
            self.enc = enc
            self.dec = dec
        def train_step(self, data):
            with tf.GradientTape() as tape:
                zm, zv, z = self.enc(data)
                recon = self.dec(z)
                recon_loss = tf.reduce_mean(tf.reduce_sum(keras.losses.mse(data, recon), axis=1))
                kl_loss = -0.5 * (1 + zv - tf.square(zm) - tf.exp(zv))
                kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss, axis=1))
                total_loss = recon_loss + kl_loss
            grads = tape.gradient(total_loss, self.trainable_weights)
            self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
            return {"loss": total_loss}
        def call(self, inputs):
            zm, _, z = self.enc(inputs)
            return self.dec(z)

    vae = VAE(encoder, decoder)
    vae.compile(optimizer='adam')
    return vae

# ---------------------------------------------------------
# STEP 4: TUNING LOOP
# ---------------------------------------------------------
print("\n[STEP 4] Tuning Global Parameters...")
print(f"{'latent':<10} | {'lstm':<10} | {'AUC (Global)':<12} | {'Time (s)':<10}")
print("-" * 55)

keys = param_grid.keys()
combinations = list(itertools.product(*param_grid.values()))

best_auc = -1
best_params = {}

for combo in combinations:
    params = dict(zip(keys, combo))
    tf.keras.backend.clear_session() # Bersihin memori

    t0 = time.time()

    vae = build_vae_dynamic(params['lstm_units'], params['latent_dim'])
    # Epoch dikit aja (15) buat screening
    vae.fit(X_train_global, epochs=15, batch_size=128, verbose=0)

    # Predict Global (Quick Check)
    # Kita sample sebagian test set aja biar cepet kalau kegedean
    idx = np.random.choice(len(X_test_global), size=min(5000, len(X_test_global)), replace=False)
    pred = vae.predict(X_test_global[idx], verbose=0)
    scores = np.mean(np.mean(np.square(X_test_global[idx] - pred), axis=1), axis=1)

    try:
        auc = roc_auc_score(y_test_global[idx], scores)
    except:
        auc = 0.5

    dt = time.time() - t0
    print(f"{params['latent_dim']:<10} | {params['lstm_units']:<10} | {auc:.4f}       | {dt:.2f}")

    if auc > best_auc:
        best_auc = auc
        best_params = params

print("-" * 55)
print(f"JUARA GLOBAL: latent={best_params['latent_dim']}, lstm={best_params['lstm_units']}")

# ---------------------------------------------------------
# STEP 5: FINAL EXECUTION (WITH BEST PARAMS)
# ---------------------------------------------------------
print("\n[STEP 5] Final Evaluation Per Server...")
tf.keras.backend.clear_session()

# Re-Train Full Power (30 Epochs)
final_vae = build_vae_dynamic(best_params['lstm_units'], best_params['latent_dim'])
final_vae.fit(X_train_global, epochs=30, batch_size=64, verbose=0)

print("-" * 65)
print(f"{'Source':<30} | {'AUC':<10} | {'F1':<10}")
print("-" * 65)

unique_sources = global_test['source'].unique()
results = []

for source in unique_sources:
    subset = global_test[global_test['source'] == source].copy()

    if len(subset) > TIME_STEPS + 5:
        sub_vals = subset['value_scaled'].values.reshape(-1,1)
        X_sub = create_sequences(sub_vals)
        y_sub = subset['label'].values[TIME_STEPS:]

        if len(np.unique(y_sub)) > 1:
            pred = final_vae.predict(X_sub, verbose=0)
            scores = np.mean(np.mean(np.square(X_sub - pred), axis=1), axis=1)

            auc = roc_auc_score(y_sub, scores)

            best_f1 = 0
            threshs = np.linspace(scores.min(), scores.max(), 100)
            for t in threshs:
                p = (scores > t).astype(int)
                f = f1_score(y_sub, p, zero_division=0)
                if f > best_f1: best_f1 = f

            print(f"{source:<30} | {auc:.4f}     | {best_f1:.4f}")
            results.append({'source': source, 'auc': auc, 'f1': best_f1})

print("="*65)
print("FINAL RESULT: GLOBAL VAE-LSTM (TUNED)")
print("="*65)
df_res = pd.DataFrame(results)
if not df_res.empty:
    print(df_res)
    print("-" * 65)
    print(f"AVERAGE AUC (Macro): {df_res['auc'].mean():.4f}")
else:
    print("No valid results.")
print("="*65)

GLOBAL VAE-LSTM: MERGE -> TUNE -> EXECUTE (ULTIMATE)

[STEP 1] Merging Data...
 > Global Train Size : 9671 rows
 > Global Test Size  : 22584 rows

[STEP 2] Preprocessing...

[STEP 4] Tuning Global Parameters...
latent     | lstm       | AUC (Global) | Time (s)  
-------------------------------------------------------
4          | 32         | 0.1191       | 25.05
4          | 64         | 0.4397       | 43.67
8          | 32         | 0.3162       | 25.10
8          | 64         | 0.6450       | 42.06
-------------------------------------------------------
JUARA GLOBAL: latent=8, lstm=64

[STEP 5] Final Evaluation Per Server...
-----------------------------------------------------------------
Source                         | AUC        | F1        
-----------------------------------------------------------------
ec2_cpu_utilization_24ae8d     | 0.5229     | 0.0150
ec2_cpu_utilization_53ea38     | 0.6757     | 0.0014
ec2_cpu_utilization_5f5533     | 0.2921     | 0.0014
ec2_cpu_utilizat