In [None]:
import tensorflow as tf

# List the GPUs TensorFlow can see and print them so the hardware is visible in the cell output.
physical_devices = tf.config.list_physical_devices('GPU')
print("GPUs available:", physical_devices)
# Restrict TensorFlow's (v1-compat) logger to ERROR-level messages.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

In [None]:
from functools import partial
import math
import os

import numpy as np

from ocml.datasets import build_ds_from_numpy, tfds_from_sampler
from ocml.evaluate import check_LLC, log_metrics, evaluate_tabular
from ocml.models import conventional_dense, spectral_dense
from ocml.plot import plot_preds_ood
from ocml.priors import uniform_tabular
from ocml.train import train, SH_KR, BCE

In [None]:
from types import SimpleNamespace

def get_config(debug=False):
  """Build the experiment configuration.

  Two settings are read from the environment so the notebook can be driven
  from outside:
    * `DATASET_NAME` (default "thyroid"): which tabular dataset to load.
    * `ADOC` (default "ad"): protocol selector, either "ad" (train on every
      point, anomalies included) or "oc" (train on a split of the normal
      points only).

  Args:
    debug: currently unused; kept so existing callers keep working.

  Returns:
    A `SimpleNamespace` holding every hyper-parameter of the run.

  Raises:
    ValueError: if `ADOC` is neither "ad" nor "oc". Failing here avoids a
      late `NameError` in the cells that branch on `config.adoc`.
  """
  dataset_name = os.environ.get("DATASET_NAME", "thyroid")
  adoc = os.environ.get("ADOC", "ad")  # or 'oc'
  if adoc not in ("ad", "oc"):
    raise ValueError(f"ADOC must be 'ad' or 'oc', got {adoc!r}")
  config = SimpleNamespace(
    dataset_name = dataset_name,
    batch_size = 128,  # should be not too small to ensure diversity.
    domain = [-5, 5.],  # domain on which to sample points.
    use_sampler_for_train_test = True,
    scaling = False,
    maxiter = 4,  # very important on high dimensional dataset.
    margin = 0.05,  # very important !
    lbda = 100.,  # important but not as much as `margin`. Must be high for best results.
    k_coef_lip = 1.,  # no reason to change this.
    spectral_dense = True,  # Mandatory for orthogonal networks.
    deterministic = False,  # Better with random learning rates.
    overshoot_boundary = False,
    conventional = False,  # Conventional training (i.e without hKR and Lipschitz constraint) for sanity check.
    widths = [512, 512, 512, 512],
    warmup_epochs=5,
    epochs_per_plot=20,
    adoc = adoc
  )
  return config

In [None]:
# Debug mode is enabled by the mere presence of the SANDBOX environment variable.
debug = "SANDBOX" in os.environ
config = get_config(debug)
# Keyword arguments forwarded to every `train(...)` call further down.
train_kwargs = {
  'domain': config.domain,
  'deterministic': config.deterministic,
  # NOTE(review): hard-coded to True although `config.overshoot_boundary` is False,
  # so the value stored in `config` differs from the one used for training — confirm which is intended.
  'overshoot_boundary': True
}

In [None]:
import plotly.io as pio
print("PLOTLY_RENDERER:", pio.renderers.default)

# Wandb is optional: if the package is not installed the notebook keeps running
# without remote logging.
try:
  # Notebook name exposed to Wandb through its environment variable.
  os.environ['WANDB_NOTEBOOK_NAME'] = 'run_tabular.ipynb'
  import wandb
  wandb.login()
  wandb_available = True
except ModuleNotFoundError as e:
  print(e)
  print("Wandb logs will be removed.")
  wandb_available = False

plot_wandb = wandb_available and not debug  # Set to False to de-activate Wandb.
if plot_wandb:
  # The run group can be overridden from the environment.
  group = os.environ.get("WANDB_GROUP", "sandbox_tabular")
  wandb.init(project="ocml_tabular", group=group, config=config.__dict__)
elif wandb_available:
  # Wandb is installed but disabled for this run: finish any run that is still
  # active. Only attempted when the import succeeded, so a missing package no
  # longer surfaces as a printed NameError.
  try:
    wandb.finish()
  except Exception as e:
    print(e)

# Every training call logs through this function; it receives the Wandb switch.
train_kwargs['log_metrics_fn'] = partial(log_metrics, plot_wandb=plot_wandb)

In [None]:
import pandas as pd
from tensorflow.keras.utils import get_file

# Registry of supported datasets: where the file is stored locally and where to fetch it from.
datasets = {
  'thyroid': {
    'save_path': '/data/datasets/tabular/thyroid.mat',
    'origin': 'https://www.dropbox.com/s/bih0e15a0fukftb/thyroid.mat?dl=1'
  },
  'mammography': {
    'save_path': '/data/datasets/tabular/mammography.mat',
    'origin': 'https://www.dropbox.com/s/tq2v4hhwyv17hlk/mammography.mat?dl=1'
  },
  'arrhythmia': {
    'save_path': '/data/datasets/tabular/arrhythmia/arrhythmia.mat',
    'origin': 'https://www.dropbox.com/s/lmlwuspn1sey48r/arrhythmia.mat?dl=1'
  },
  'arrhythmia_uci': {
    'save_path': '/data/datasets/tabular/arrhythmia/arrhythmia.data',
    'origin': 'https://archive.ics.uci.edu/ml/machine-learning-databases/arrhythmia/arrhythmia.data'
  }
}

# Resolve the dataset before downloading so that an unknown name is reported
# as such instead of being mislabelled as a download failure.
if config.dataset_name not in datasets:
  raise KeyError(
    f"Unknown dataset {config.dataset_name!r}; expected one of {sorted(datasets)}")
save_path = datasets[config.dataset_name]['save_path']
origin = datasets[config.dataset_name]['origin']

try:
  dataset_path = get_file(save_path, origin=origin)
except Exception:
  # `Exception` rather than a bare `except`, so interrupting the kernel is not
  # reported as a download error. The original error is still propagated.
  print('Error downloading')
  raise
print("Dataset used:", dataset_path)

In [None]:
import numpy as np
from scipy.io import loadmat  # this is the SciPy module that loads mat-files
import matplotlib.pyplot as plt
from datetime import datetime, date, time
import pandas as pd

def load_matfile(file_path):
    """Load a MATLAB file into a single DataFrame.

    The file must expose the variables 'X' and 'y'; they are joined
    column-wise, so the columns of 'y' come last.

    Args:
        file_path: path of the .mat file to read.

    Returns:
        A pandas DataFrame with default integer column labels.
    """
    contents = loadmat(file_path)
    # Drop loadmat's metadata entries, whose keys start with an underscore.
    variables = {
        key: value for key, value in contents.items() if not key.startswith('_')
    }
    features = variables['X']
    targets = variables['y']
    joined = np.concatenate([features, targets], axis=1)
    return pd.DataFrame(joined)
  
# Load the selected dataset into `df`.
if config.dataset_name == 'arrhythmia_uci':
  "Custom processing for UCI original source. Shall not be used."
  # NOTE(review): this branch never creates the 'label' column that the
  # following cells index — confirm before enabling it.
  from sklearn.experimental import enable_iterative_imputer
  from sklearn.impute import SimpleImputer, IterativeImputer
  # Read without a header row, treating '?' as a missing value.
  df = pd.read_csv(dataset_path, header=None, na_values='?')
  # Replace missing entries by the per-column median.
  imputer = SimpleImputer(missing_values=np.nan, strategy='median')
  df = pd.DataFrame(imputer.fit_transform(df))
  # Column dtype layout: 16 float columns, then 12 groups of (6 int + 6 float), then 12 groups of 10 float.
  dtypes = ['float64']*16 + (['int64']*6 + ['float64']*6)*12 + (['float64']*10)*12
  # zip() stops at the shorter sequence, so only columns with a listed dtype are cast.
  dtypes = {idx: dtype for idx, dtype in zip(df.columns, dtypes)}
  df = df.astype(dtypes)
  # Columns cast to int64 are the ones reported as categorical.
  discrete_cols = [idx for idx in dtypes if dtypes[idx] == 'int64']
  print('Categorical :', discrete_cols)
else:
  # .mat sources: `load_matfile` puts the target last; rename that column to 'label'.
  df = load_matfile(dataset_path)
  last_idx = int(df.columns[-1])
  df.rename({last_idx:'label'},axis=1,inplace=True)
# Display the loaded frame as the cell output.
df

In [None]:
# Split rows on the label: 0 goes to `normality` (kept as a DataFrame),
# 1 goes to `anomalies` (converted to a NumPy array). The label column is dropped from both.
normality = df[df['label'] == 0.].drop('label', axis=1)
anomalies = df[df['label'] == 1.].drop('label', axis=1).to_numpy()
print(f"Normality={len(normality)} Anomaly={len(anomalies)}")

In [None]:
# Summary statistics of the loaded dataset.
df.describe()

In [None]:
from sklearn.neighbors import KDTree
# Index the normal points, then measure how far each anomaly lies from them.
print('Building tree... ')
kdt = KDTree(normality, leaf_size=30, metric='euclidean')
print('Built ! Queries on going... ')
# For every anomaly, Euclidean distances to its 20 nearest normal points.
dists, indexes = kdt.query(anomalies, k=20, return_distance=True)
print('Distances of each anomaly to 20 nearest normal points')
# One column per neighbour rank; the statistics are taken over the anomalies.
pd.DataFrame(dists).describe()

In [None]:
from ocml.priors import TabularSampler
# Fit the tabular sampler and obtain the transformed dataset `dt`.
# NOTE(review): the second argument of `fit_transform` is presumably the data
# the sampler statistics are fitted on — confirm against `TabularSampler`.
sampler = TabularSampler(bounds=None)
continuous_policy = 'zscore'
if config.adoc == 'ad':
  # 'ad': the whole dataset, anomalies included, is passed as second argument.
  dt = sampler.fit_transform(df, df, continuous_policy=continuous_policy)
elif config.adoc == 'oc':
  # 'oc': only the normal points are passed as second argument.
  dt = sampler.fit_transform(df, normality, continuous_policy=continuous_policy)
else:
  # Without this branch `dt` would stay undefined and the next cell would
  # fail with an unrelated NameError.
  raise ValueError(f"Unsupported config.adoc={config.adoc!r}; expected 'ad' or 'oc'.")

In [None]:
# Summary statistics of the transformed rows labelled 0 (normal).
dt[dt['label'] == 0.].describe()

In [None]:
# Summary statistics of the transformed rows labelled 1 (anomalies).
dt[dt['label'] == 1.].describe()

In [None]:
from sklearn.model_selection import train_test_split

# Optionally rebuild the normal/anomaly split from the sampler-transformed frame `dt`
# instead of the raw frame `df`.
if config.use_sampler_for_train_test:
  print("Use dataset rescaled properly with Sampler object.")
  normality = dt[dt['label'] == 0.].drop('label', axis=1)
  anomalies = dt[dt['label'] == 1.].drop('label', axis=1).to_numpy()
  print(f"Normality={len(normality)} Anomaly={len(anomalies)}")

# Build `x_train` / `x_test` according to the selected protocol.
# NOTE(review): any `config.adoc` other than 'ad' or 'oc' leaves both undefined.
if config.adoc == 'ad':
  print("Running as Anomaly Detection.")
  x_train = np.concatenate([normality.to_numpy(), anomalies], axis=0)  # train on everything including anomalies.
  x_test = normality.to_numpy()  # anomalies treated separately to avoid biases.
elif config.adoc == 'oc':
  train_sizes = {
    # Default sizes from DAGMM/HRN protocols.
    'thyroid': 1839,
    'arrhythmia': 193
  }
  # Datasets without a preset size use half of the normal points for training.
  train_size = train_sizes.get(config.dataset_name, round(len(normality) * 0.5))
  # No `random_state` is given, so the split differs from run to run.
  x_train, x_test = train_test_split(normality.to_numpy(), train_size=train_size, shuffle=True)
  # The reported test size counts the held-out normal points plus all anomalies.
  print(f"Train Size={len(x_train)} Test Size={len(x_test)+len(anomalies)}")

In [None]:
# Summary statistics of the training set.
pd.DataFrame(x_train).describe()

In [None]:
# Summary statistics of the anomalies, for comparison with the training set.
pd.DataFrame(anomalies).describe()

In [None]:
# Steps per epoch: enough batches to cover the training set once,
# but never fewer than 15.
epoch_length = max(math.ceil(len(x_train) / config.batch_size), 15)
print(f"Epoch Length={epoch_length}")

In [None]:
import sklearn.preprocessing as preprocessing

# Optional standardisation, fitted on the training set only and then applied to every split.
if config.scaling:
  # Should be unnecessary when the Sampler object is used, because it already rescales the data.
  scaler = preprocessing.StandardScaler()
  x_train = scaler.fit_transform(x_train)
  # Show the first ten fitted means/scales and the smallest scale over all features.
  print(f"Scaler: mean={scaler.mean_[:10]} std={scaler.scale_[:10]} min_scale={np.min(np.array(scaler.scale_))}")
  x_test = scaler.transform(x_test)
  normality = scaler.transform(normality)
  anomalies = scaler.transform(anomalies)
  # NOTE(review): `describe().iloc[1]` is the 'mean' row, so these lines print the
  # largest per-feature mean although the label says "max norm" — confirm the intent.
  print('x_train max norm:', pd.DataFrame(x_train).describe().iloc[1].max())
  print('x_test max norm:', pd.DataFrame(x_test).describe().iloc[1].max())

In [None]:
# Print centre, scale and the 3-scale interval of the first ten per-feature samplers.
for i, g_sampler in enumerate(sampler.samplers[:10]):
  # Data should be centered. True STD depends on the scale of the whole fit test (that may include anomalies).
  scale = g_sampler.std*g_sampler.threshold
  mean = g_sampler.mean
  # The interval is centred on `mean`; the upper bound used to be computed from
  # `scale` instead of `mean`, which only looked right when mean == scale.
  lower = mean - 3*scale
  upper = mean + 3*scale
  print(f'[{i}] {mean:.5f}±{scale:.5f}, sample in [{lower:.5f},{upper:.5f}] at 99.5% confidence interval.')

# Locate the feature with the narrowest sampling scale over all samplers.
scales = np.array([g_sampler.std*g_sampler.threshold for g_sampler in sampler.samplers])
min_scale_idx = np.argmin(scales)
print(f'MinScale={scales[min_scale_idx]} at center {sampler.samplers[min_scale_idx].mean} with idx={min_scale_idx}')

In [None]:
# Draw five points from the sampler and label their columns like the features of `dt`
# (every column of `dt` except the last one).
adv = pd.DataFrame(sampler.sample(5))
adv.columns = dt.columns[:-1]
adv

In [None]:
from sklearn.neighbors import KDTree
# Rebuild the tree on the current `normality` (possibly re-derived or rescaled by the cells above)
# and measure how far the sampled points lie from the normal data.
print('Building tree... ')
kdt = KDTree(normality, leaf_size=30, metric='euclidean')
print('Built ! Queries on going... ')
# For every sampled point, Euclidean distances to its 20 nearest normal points.
dists, indexes = kdt.query(adv.to_numpy(), k=20, return_distance=True)
pd.DataFrame(dists).describe()

In [None]:
# Dataset of positive examples: the training set, built with the configured batch size.
p_dataset = build_ds_from_numpy(x_train, config.batch_size)

In [None]:
# The number of input features is taken from the sampled points.
input_size = adv.shape[1]

# Pick the network and its loss together: the conventional baseline pairs a
# plain dense network with BCE, the default pairs the spectral network with SH_KR.
if config.conventional:
  model = conventional_dense(widths=config.widths, input_shape=(input_size,))
  loss_fn = BCE()
else:
  model = spectral_dense(widths=config.widths, input_shape=(input_size,),
                         k_coef_lip=config.k_coef_lip)
  loss_fn = SH_KR(config.margin, config.lbda)

In [None]:
# Optimizer used by every training phase below.
opt = tf.keras.optimizers.RMSprop(learning_rate=0.0005)

# Initialize the network.
gen = tf.random.Generator.from_seed(4321)  # seeded generator, passed to the dataset builder and to `train`.
# Take one batch of positive examples; it is also reused later for its shape.
p_batch = next(iter(p_dataset))
_ = model(p_batch, training=True)  # dummy forward to trigger initialization.
model.summary()

In [None]:
# Adversarial distribution.
def sampler_fn(gen, batch_size, input_shape):
  """Draw `batch_size` points from the fitted tabular sampler.

  `gen` and `input_shape` are accepted to match the signature expected by
  `tfds_from_sampler` but are ignored: the draw is delegated entirely to
  the module-level `sampler`.
  """
  del gen  # unused.
  del input_shape  # unused.
  return sampler.sample(batch_size)

# Dataset of sampled points, built with the per-example shape of the positive batch.
q_dataset = tfds_from_sampler(sampler_fn, gen, config.batch_size, p_batch.shape[1:])
# Keep a first batch of sampled points.
Q0 = next(iter(q_dataset))

In [None]:
# Warm-up phase: same training call as the main loops, but with maxiter=0.
num_epochs = config.warmup_epochs
for epoch in range(num_epochs):
  train(model, opt, loss_fn, gen, p_dataset, q_dataset, epoch_length, maxiter=0, **train_kwargs)
  T = evaluate_tabular(epoch, model, x_test, anomalies, plot_wandb=plot_wandb)
if num_epochs > 0:
  # `epoch` and `T` only exist once the loop has run; with warmup_epochs=0 the
  # plot is skipped instead of failing with a NameError.
  # The warm-up plot is never sent to Wandb (plot_wandb=False).
  plot_preds_ood(epoch, model, tf.constant(x_train), tf.constant(x_test), tf.constant(anomalies), plot_histogram=True, plot_wandb=False, T=T)

In [None]:
# First main training phase: `config.epochs_per_plot` epochs, evaluated after each one,
# followed by a single plot.
# NOTE(review): `epoch` restarts at 0 here, so the index passed to `evaluate_tabular`
# repeats values already used by the warm-up phase — confirm this is acceptable for logging.
for epoch in range(config.epochs_per_plot):
  train(model, opt, loss_fn, gen, p_dataset, q_dataset, epoch_length, maxiter=config.maxiter, **train_kwargs)
  T = evaluate_tabular(epoch, model, x_test, anomalies, plot_wandb=plot_wandb)
# Plot with the last epoch index and the last value returned by `evaluate_tabular`.
plot_preds_ood(epoch, model, tf.constant(x_train), tf.constant(x_test), tf.constant(anomalies), plot_histogram=True, plot_wandb=plot_wandb, T=T)

In [None]:
# Second main training phase: continues training the same model and optimizer for
# another `config.epochs_per_plot` epochs, then plots again.
# NOTE(review): `epoch` restarts at 0 again, repeating the indices of the previous phase.
for epoch in range(config.epochs_per_plot):
  train(model, opt, loss_fn, gen, p_dataset, q_dataset, epoch_length, maxiter=config.maxiter, **train_kwargs)
  T = evaluate_tabular(epoch, model, x_test, anomalies, plot_wandb=plot_wandb)
# Plot with the last epoch index and the last value returned by `evaluate_tabular`.
plot_preds_ood(epoch, model, tf.constant(x_train), tf.constant(x_test), tf.constant(anomalies), plot_histogram=True, plot_wandb=plot_wandb, T=T)

In [None]:
# Close the Wandb run, but only if one was started for this execution.
if plot_wandb:
  wandb.finish()