# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

import math
import time

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

import optuna
from copy import deepcopy
from pyDeepInsight import ImageTransformer

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest
from sklearn.base import clone
from sklearn.cluster import KMeans

from scipy.spatial.distance import cdist
from scipy.stats import ks_2samp
from scipy.optimize import minimize

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
X_train = np.load('data/x_train.npy')
y_train = np.load('data/y_train.npy')

X_test = np.load('data/x_test.npy')
y_test = np.load('data/y_test.npy')

## MAE

In [3]:
class Encoder(nn.Module):
    def __init__(self, img_channels=1, feature_dim=32, latent_dim=2):
        super(Encoder, self).__init__()
        self.conv1 = nn.Conv2d(img_channels, 16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(32 * 2 * 2, feature_dim)

    def forward(self, x):
        x = self.relu(self.conv1(x))  # Output: (batch_size, 16, 8, 8)
        x = self.pool(x)              # Output: (batch_size, 16, 4, 4)
        x = self.relu(self.conv2(x))  # Output: (batch_size, 32, 4, 4)
        x = self.pool(x)              # Output: (batch_size, 32, 2, 2)
        x = x.view(x.size(0), -1)     # Flatten to (batch_size, 128)
        x = self.fc1(x)               # Output: (batch_size, feature_dim)
        return x

class Decoder(nn.Module):
    def __init__(self, img_channels=1, feature_dim=32):
        super(Decoder, self).__init__()
        self.fc2 = nn.Linear(feature_dim, 32 * 2 * 2)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

        self.deconv1 = nn.ConvTranspose2d(32, 16, kernel_size=3, stride=1, padding=1)
        self.deconv2 = nn.ConvTranspose2d(16, img_channels, kernel_size=3, stride=1, padding=1)
        self.upsample = nn.Upsample(scale_factor=2, mode='nearest')

    def forward(self, z):
        x = self.relu(self.fc2(z))           # Output: (batch_size, 128)
        x = x.view(x.size(0), 32, 2, 2)      # Reshape to (batch_size, 32, 2, 2)
        x = self.upsample(x)                 # Upsample to (batch_size, 32, 4, 4)
        x = self.relu(self.deconv1(x))       # Output: (batch_size, 16, 4, 4)
        x = self.upsample(x)                 # Upsample to (batch_size, 16, 8, 8)
        x = self.sigmoid(self.deconv2(x))    # Output: (batch_size, img_channels, 8, 8)
        return x

class MAE(nn.Module):
    def __init__(self, img_channels=1, feature_dim=32, latent_dim=2):
        super(MAE, self).__init__()
        self.encoder = Encoder(img_channels, feature_dim, latent_dim)
        self.decoder = Decoder(img_channels, feature_dim)

    def mask_input(self, x, mask_ratio=0.25):
        # Generate a mask with 0s and 1s, keeping only (1-mask_ratio) of the original input
        mask = torch.rand(x.shape, device=x.device) > mask_ratio
        x_masked = x * mask
        return x_masked, mask

    def forward(self, x, mask_ratio=0.25):
        x_masked, mask = self.mask_input(x, mask_ratio)  # Apply masking to input
        z = self.encoder(x_masked)
        reconstructed = self.decoder(z)
        return reconstructed, mask

def mae_loss_function(reconstructed, original, mask):
    # Only calculate reconstruction loss on the masked parts
    masked_original = original * mask
    reconstruction_loss = F.mse_loss(reconstructed, masked_original, reduction='sum')
    return reconstruction_loss

def extract_latent_features(model, data_loader, device='cuda'):
    model.eval() 
    latent_features = []  

    with torch.no_grad():
        for batch in data_loader: #tqdm(data_loader, total=len(data_loader), desc="Extracting features"):
            if len(batch) == 2:
                data, _ = batch  
            else:
                (data,) = batch  
            
            data = data.to(device)

            latent_feature = model.encoder(data)
            latent_features.append(latent_feature.cpu().numpy())

    latent_features = np.concatenate(latent_features, axis=0)
    
    return latent_features

## Creation functions

In [4]:
def create_phi(normal_data, c):
    kmeans = KMeans(n_clusters=c, random_state=42)
    labels = kmeans.fit_predict(normal_data)
    
    normal_concepts = [normal_data[labels == i] for i in range(c)]
    print("Finished creating normal concepts")
    
    return normal_concepts


def create_gamma(anomaly_data, c):
    kmeans = KMeans(n_clusters=c, random_state=42)
    labels = kmeans.fit_predict(anomaly_data)
    
    anomaly_concepts = [anomaly_data[labels == i] for i in range(c)]
    print("Finished creating anomaly concepts")
    
    return anomaly_concepts
    
def match_lambda(anomaly_concepts, normal_concepts):
    pairs = []
    remaining_anomalies = anomaly_concepts.copy()

    for normal_concept in normal_concepts:
        normal_centroid = np.mean(normal_concept, axis=0)
        anomaly_centroids = [np.mean(ac, axis=0) for ac in remaining_anomalies]

        distances = cdist([normal_centroid], anomaly_centroids, metric='euclidean')[0]
        closest_idx = np.argmin(distances)

        pairs.append((normal_concept, remaining_anomalies[closest_idx]))
        remaining_anomalies.pop(closest_idx)

    print("Finished matching concept pairs")
    
    return pairs

## Evaluation metrics

In [5]:
def lifelong_roc_auc(R):
    N = R.shape[0]
    lower_triangular_sum = np.sum(np.tril(R))
    normalization_factor = (N * (N + 1)) / 2

    return lower_triangular_sum / normalization_factor

def BWT(R):
    N = R.shape[0]
    backward_transfer = 0
    count = 0

    for i in range(1, N):
        for j in range(i):
            backward_transfer += (R[i, j] - R[j, j])
            count += 1

    return backward_transfer / count if count > 0 else 0

def FWT(R):
    N = R.shape[0]
    forward_transfer = 0
    count = 0

    for i in range(N):
        for j in range(i + 1, N): 
            forward_transfer += R[i, j]
            count += 1

    return forward_transfer / count if count > 0 else 0 

## Helper Functions

In [6]:
def pca_feature_selection(X, k, explained_variance_threshold=0.95):

    pca = PCA()
    pca.fit(X)
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
    n_components = np.argmax(cumulative_variance >= explained_variance_threshold) + 1

    pca = PCA(n_components=n_components)
    pca.fit(X)

    feature_importance = np.abs(pca.components_).sum(axis=0)
    top_k_indices = np.argsort(feature_importance)[-k:]
    
    return top_k_indices

def kolmogorov_smirnov_test(X_old, X_new, alpha=0.05):
    p_values = [ks_2samp(X_old[:, i], X_new[:, i]).pvalue for i in range(X_old.shape[1])]
    return np.any(np.array(p_values) < alpha)

def histogram_binning(X, bins=25):
    return np.array([np.histogram(X[:, i], bins=bins, density=True)[0] for i in range(X.shape[1])]).T

def weighted_histogram_binning(X, weights, bins=25):
    histograms = []
    for i in range(X.shape[1]):
        hist, _ = np.histogram(X[:, i], bins=bins, density=True, weights=weights)
        histograms.append(hist)
    return np.array(histograms).T  # shape: (bins, n_features)

def kl_divergence(P, Q):
    P, Q = np.clip(P, 1e-10, None), np.clip(Q, 1e-10, None)  # Avoid log(0)
    return np.sum(P * np.log(P / Q))

def strategic_sample_selection(X_old, X_new, top_k=100, learning_rate=0.01, num_iterations=100):
    H_old, H_new = histogram_binning(X_old), histogram_binning(X_new)
    m_n = np.random.rand(H_new.shape[0])  

    def loss_function(m_n):
        weighted_H_new = H_new * m_n[:, np.newaxis]  
        combined_H = (H_old + weighted_H_new) / 2 
        return kl_divergence(H_new, combined_H) 

    progress_bar = tqdm(total=num_iterations, desc="Optimizing Sample Selection", position=0, leave=True)

    def callback(xk):
        progress_bar.update(1)  

    result = minimize(loss_function, m_n, method="L-BFGS-B", bounds=[(0, 1)] * len(m_n), 
                      options={"maxiter": num_iterations, "ftol": 1e-10}, callback=callback)

    progress_bar.close()

    selected_indices = np.argsort(result.x)[-top_k:]

    return X_new[selected_indices] 

def strategic_forgetting_update_opt(memory_buffer, X_new, drop_k=100, bins=25, num_iterations=100):
    m0 = np.random.uniform(0.5, 1.0, size=memory_buffer.shape[0])
    
    H_new = histogram_binning(X_new, bins=bins)
    H_new = H_new / (np.sum(H_new, axis=0, keepdims=True) + 1e-10)
    
    def loss_function(m):
        H_weighted = weighted_histogram_binning(memory_buffer, m, bins=bins)
        H_weighted = H_weighted / (np.sum(H_weighted, axis=0, keepdims=True) + 1e-10)
        loss = 0
        for j in range(H_new.shape[1]):
            loss += kl_divergence(H_new[:, j], H_weighted[:, j])
        return loss

    progress_bar = tqdm(total=num_iterations, desc="Optimizing Forgetting", position=0, leave=True)

    def callback(xk):
        progress_bar.update(1)

    result = minimize(loss_function, m0, method="L-BFGS-B", bounds=[(0, 1)] * len(m0), 
                      options={"maxiter": num_iterations, "ftol": 1e-10}, callback=callback)
    
    progress_bar.close()
    
    optimized_m = result.x
    print(f"Optimized loss: {result.fun:.5f}")
    
    non_rep_indices = np.where(optimized_m < 0.5)[0]
    
    if len(non_rep_indices) < drop_k:
        additional_needed = drop_k - len(non_rep_indices)
        remaining_indices = np.setdiff1d(np.arange(memory_buffer.shape[0]), non_rep_indices)
        additional_indices = remaining_indices[np.argsort(optimized_m[remaining_indices])[:additional_needed]]
        drop_indices = np.concatenate([non_rep_indices, additional_indices])
    else:
        drop_indices = non_rep_indices[np.argsort(optimized_m[non_rep_indices])[:drop_k]]
    
    updated_buffer = np.delete(memory_buffer, drop_indices, axis=0)
    print(f"Strategic Forgetting: Dropped {len(drop_indices)} samples based on the optimized mask.")
    return updated_buffer


def update_memory_buffer(X_old, X_new_selected, memory_size=3000):
    updated_buffer = np.vstack((X_old, X_new_selected))  

    if updated_buffer.shape[0] > memory_size:
        updated_buffer = updated_buffer[-memory_size:]

    return updated_buffer

class HierarchicalMemory:
    def __init__(self, memory_limit=5000, pyramid_factor=2, centroids_per_concept=10):
        self.memory_limit = memory_limit
        self.pyramid_factor = pyramid_factor
        self.centroids_per_concept = centroids_per_concept
        self.memory = {}  # level: [concept1, concept2, ...]

    def add_concept(self, data, level=1):
        if level not in self.memory:
            self.memory[level] = []
        self.memory[level].append(np.array(data))
        self._summarize_memory()

    def _pyramidal_allocation(self):
        levels = sorted(self.memory.keys())
        weights = np.array([1 / (self.pyramid_factor ** (lvl - 1)) for lvl in levels])
        total_weight = weights.sum()
        allocations = (weights / total_weight) * self.memory_limit
        return {lvl: int(alloc) for lvl, alloc in zip(levels, allocations)}

    def _summarize_concept(self, concept, n_samples):
        if len(concept) <= n_samples:
            return concept
        kmeans = KMeans(n_clusters=min(self.centroids_per_concept, len(concept)), random_state=42).fit(concept)
        centroids = kmeans.cluster_centers_
        distances = np.linalg.norm(concept[:, None] - centroids, axis=2)
        closest_indices = np.argmin(distances, axis=0)
        summarized = concept[closest_indices]
        return summarized

    def _summarize_memory(self):
        allocations = self._pyramidal_allocation()
        for level, concepts in self.memory.items():
            summarized_level = []
            alloc_per_concept = max(1, allocations[level] // len(concepts))
            for concept in concepts:
                summarized = self._summarize_concept(concept, alloc_per_concept)
                summarized_level.append(summarized)
            self.memory[level] = summarized_level

    def get_all_memory(self):
        all_data = []
        for level_concepts in self.memory.values():
            for concept in level_concepts:
                all_data.append(concept)
        return np.vstack(all_data) if all_data else np.empty((0,))

## Scenario Design + Evaluation Protocol

In [None]:
def scenario_design(normal_data, anomaly_data, c):
    normal_concepts = create_phi(normal_data, c)
    anomaly_concepts = create_gamma(anomaly_data, c)
    
    scenario = match_lambda(anomaly_concepts, normal_concepts)
    
    return scenario

def evaluation_protocol(model, strategy="SSF", replay_buffer_size=5000, memory_size=5000, alpha=0.05, forgetting_quota=100):

    if strategy == "SSF":
        hier_memory = None

    # PREPROCESS
    X = np.vstack((X_train, X_test))
    y = np.hstack((y_train, y_test))
    y = np.where(y == 11, 0, 1)

    num_concepts = 5
    X_normal = X[y == 0]  
    X_anomaly = X[y == 1]

    normal_concepts = create_phi(X_normal, num_concepts)
    anomaly_concepts = create_gamma(X_anomaly, num_concepts)
    concept_pairs = match_lambda(anomaly_concepts, normal_concepts)

    T, E, Y_t, Y = [], [], [], []
    for normal, anomaly in concept_pairs:
        normal_train, normal_test = train_test_split(normal, test_size=0.3, random_state=42)
        anomaly_train, anomaly_test = train_test_split(anomaly, test_size=0.3, random_state=42)  

        T.append((normal_train, anomaly_train))
        E.append((normal_test, anomaly_test))

        y_normal_train = np.zeros(len(normal_train))
        y_anomaly_train = np.ones(len(anomaly_train))
        y_normal_test = np.zeros(len(normal_test))
        y_anomaly_test = np.ones(len(anomaly_test))

        Y_t.append((y_normal_train, y_anomaly_train))
        Y.append((y_normal_test, y_anomaly_test))

    print("Finished creating concepts")
    N = len(T)
    R = np.zeros((N, N))

    for i, Ti in tqdm(enumerate(T), desc=f"Evaluating using {strategy} strategy"):
        Ti_full = np.vstack((Ti[0], Ti[1]))
        Ti_n = Ti[1]
        Y_ti = np.hstack((Y_t[i][0], Y_t[i][1]))

        k = 31
        top_features_indices = pca_feature_selection(Ti_full, k)
        Ti_n_selected = Ti_n[:, top_features_indices]

        if strategy == "SSF":
            if hier_memory is None:
                hier_memory = HierarchicalMemory(memory_limit=memory_size, pyramid_factor=2, centroids_per_concept=10)
                selected = Ti_n_selected[:memory_size]
                hier_memory.add_concept(selected, level=1)
            else:
                current_memory = hier_memory.get_all_memory()
                drift_detected = kolmogorov_smirnov_test(current_memory, Ti_n_selected, alpha)
                if drift_detected:
                    print("Drift detected — applying strategic forgetting and selection (level 2)")
                    current_memory = strategic_forgetting_update_opt(current_memory, Ti_n_selected, drop_k=forgetting_quota)
                    selected = strategic_sample_selection(current_memory, Ti_n_selected, top_k=forgetting_quota)
                    hier_memory.add_concept(selected, level=2)
                else:
                    print("No drift — applying strategic selection (level 1)")
                    selected = strategic_sample_selection(current_memory, Ti_n_selected, top_k=forgetting_quota)
                    hier_memory.add_concept(selected, level=1)

            Ti_n_selected = hier_memory.get_all_memory()

        it = ImageTransformer(pixels=8, feature_extractor='tsne', discretization='lsa')
        it.fit(Ti_n_selected)
        Ti_n_images = it.transform(Ti_n_selected, 'pytorch')

        X_train_tensor = torch.tensor(Ti_n_images, dtype=torch.float32)
        train_dataset = TensorDataset(X_train_tensor)
        batch_size = 32
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)

        if i > 0:
            mae_model = deepcopy(prev_mae_model)
        else:
            mae_model = MAE(img_channels=Ti_n_images.shape[1], feature_dim=32, latent_dim=16).to(device)

        optimizer = torch.optim.Adam(mae_model.parameters(), lr=1e-3)
        num_epochs = 20
        mask_ratio = 0.75

        for epoch in range(1, num_epochs + 1):
            mae_model.train()
            for batch_idx, (data,) in enumerate(train_loader):
                data = data.to(device)
                optimizer.zero_grad()
                reconstructed, mask = mae_model(data, mask_ratio=mask_ratio)
                loss = mae_loss_function(reconstructed, data, mask)
                loss.backward()
                optimizer.step()

        prev_mae_model = deepcopy(mae_model)
        mae_model_curr = mae_model

        Ti_n_latent = extract_latent_features(mae_model_curr, train_loader, device)
        current_model = clone(model)
        current_model.fit(Ti_n_latent)

        for j, ((Ej_normal, Ej_anomaly), (y_normal, y_anomaly)) in enumerate(zip(E, Y)):
            test_data = np.vstack((Ej_normal, Ej_anomaly))
            test_labels = np.hstack((y_normal, y_anomaly))
            test_data_selected = test_data[:, top_features_indices]
            test_data_images = it.transform(test_data_selected, 'pytorch')
            X_test_tensor = torch.tensor(test_data_images, dtype=torch.float32)
            test_dataset = TensorDataset(X_test_tensor, torch.tensor(test_labels, dtype=torch.long))
            test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

            test_latent_features = extract_latent_features(mae_model_curr, test_loader, device)
            scores = -current_model.decision_function(test_latent_features)
            entry = roc_auc_score(test_labels, scores)
            R[i, j] = entry

    return R

# Experiments

## LOF

In [31]:
R_ssf = evaluation_protocol(LocalOutlierFactor(n_neighbors=20, novelty=True, algorithm="ball_tree"), memory_size=5000, alpha=0.05, forgetting_quota=1000)
print(f"Lifelong ROC-AUC: {lifelong_roc_auc(R_ssf)}, BWT: {BWT(R_ssf)}, FWT: {FWT(R_ssf)}")

Finished creating normal concepts
Finished creating anomaly concepts
Finished matching concept pairs
Finished creating concepts


Evaluating using SSF strategy: 0it [00:00, ?it/s]

  X_train_tensor = torch.tensor(Ti_n_images, dtype=torch.float32)
  X_test_tensor = torch.tensor(test_data_images, dtype=torch.float32)
  X_test_tensor = torch.tensor(test_data_images, dtype=torch.float32)
  X_test_tensor = torch.tensor(test_data_images, dtype=torch.float32)
  X_test_tensor = torch.tensor(test_data_images, dtype=torch.float32)
  X_test_tensor = torch.tensor(test_data_images, dtype=torch.float32)


Drift detected — applying strategic forgetting and selection (level 2)


Optimizing Forgetting:   0%|          | 0/100 [00:00<?, ?it/s]

Optimized loss: 214.95916
Strategic Forgetting: Dropped 1000 samples based on the optimized mask.


Optimizing Sample Selection:   0%|          | 0/100 [00:00<?, ?it/s]

  X_train_tensor = torch.tensor(Ti_n_images, dtype=torch.float32)
  X_test_tensor = torch.tensor(test_data_images, dtype=torch.float32)
  X_test_tensor = torch.tensor(test_data_images, dtype=torch.float32)
  X_test_tensor = torch.tensor(test_data_images, dtype=torch.float32)
  X_test_tensor = torch.tensor(test_data_images, dtype=torch.float32)
  X_test_tensor = torch.tensor(test_data_images, dtype=torch.float32)


Drift detected — applying strategic forgetting and selection (level 2)


Optimizing Forgetting:   0%|          | 0/100 [00:00<?, ?it/s]

Optimized loss: 211.31894
Strategic Forgetting: Dropped 35 samples based on the optimized mask.


  return n/db/n.sum(), bin_edges


Optimizing Sample Selection:   0%|          | 0/100 [00:00<?, ?it/s]

  X_train_tensor = torch.tensor(Ti_n_images, dtype=torch.float32)
  X_test_tensor = torch.tensor(test_data_images, dtype=torch.float32)
  X_test_tensor = torch.tensor(test_data_images, dtype=torch.float32)
  X_test_tensor = torch.tensor(test_data_images, dtype=torch.float32)
  X_test_tensor = torch.tensor(test_data_images, dtype=torch.float32)
  X_test_tensor = torch.tensor(test_data_images, dtype=torch.float32)


Drift detected — applying strategic forgetting and selection (level 2)


Optimizing Forgetting:   0%|          | 0/100 [00:00<?, ?it/s]

Optimized loss: 273.99698
Strategic Forgetting: Dropped 60 samples based on the optimized mask.


  return n/db/n.sum(), bin_edges


Optimizing Sample Selection:   0%|          | 0/100 [00:00<?, ?it/s]

  X_train_tensor = torch.tensor(Ti_n_images, dtype=torch.float32)
  X_test_tensor = torch.tensor(test_data_images, dtype=torch.float32)
  X_test_tensor = torch.tensor(test_data_images, dtype=torch.float32)
  X_test_tensor = torch.tensor(test_data_images, dtype=torch.float32)
  X_test_tensor = torch.tensor(test_data_images, dtype=torch.float32)
  X_test_tensor = torch.tensor(test_data_images, dtype=torch.float32)


Drift detected — applying strategic forgetting and selection (level 2)


Optimizing Forgetting:   0%|          | 0/100 [00:00<?, ?it/s]

Optimized loss: 208.93037
Strategic Forgetting: Dropped 85 samples based on the optimized mask.


  return n/db/n.sum(), bin_edges


Optimizing Sample Selection:   0%|          | 0/100 [00:00<?, ?it/s]

  X_train_tensor = torch.tensor(Ti_n_images, dtype=torch.float32)
  X_test_tensor = torch.tensor(test_data_images, dtype=torch.float32)
  X_test_tensor = torch.tensor(test_data_images, dtype=torch.float32)
  X_test_tensor = torch.tensor(test_data_images, dtype=torch.float32)
  X_test_tensor = torch.tensor(test_data_images, dtype=torch.float32)
  X_test_tensor = torch.tensor(test_data_images, dtype=torch.float32)


Lifelong ROC-AUC: 0.6776868664589615, BWT: 0.31546886894322534, FWT: 0.7980023104643891
