In [1]:
import tensorflow as tf

# List the physical GPU devices visible to TensorFlow and print them.
physical_devices = tf.config.list_physical_devices('GPU')
print("GPUs available:", physical_devices)

GPUs available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [2]:
from functools import partial
import math
import numpy as np

from ocml.datasets import build_ds_from_numpy, tfds_from_sampler
from ocml.evaluate import check_LLC, log_metrics
from ocml.models import conventional_dense, spectral_dense
from ocml.plot import plot_2D_contour
from ocml.priors import uniform_tabular
from ocml.train import train, SH_KR, BCE

Perlin noise not available. Please install perlin_numpy package with `pip3 install git+https://github.com/pvigier/perlin-numpy`.


In [4]:
from types import SimpleNamespace

def get_config(debug=False):
  """Return the experiment hyper-parameters as a `SimpleNamespace`.

  Args:
    debug: accepted by the signature but not read by this function.

  Returns:
    A `SimpleNamespace` with one attribute per setting listed below.
  """
  settings = {
    'num_pts': 4000,
    'dataset_name': "two-circles",
    'optimizer': 'rmsprop',  # optimizer; good default value.
    'batch_size': 256,  # should be not too small to ensure diversity.
    'domain': [-5, 5.],  # domain on which to sample points.
    'scaling': True,
    'maxiter': 4,  # very important on high dimensional dataset.
    'margin': 0.05,  # very important !
    'lbda': 100.,  # important but not as much as `margin`. Must be high for best results.
    'k_coef_lip': 1.,  # no reason to change this.
    'noise': 0.05,  # to introduce noise in dataset (for better plots)
    'spectral_dense': True,  # Mandatory for orthogonal networks.
    'deterministic': False,  # Better with random learning rates.
    'conventional': False,  # Conventional training (i.e without hKR and Lipschitz constraint) for sanity check.
    'widths': [512, 512, 512, 512],
    'warmup_epochs': 5,
    'epochs_per_plot': 5,
    'epoch_length': 1000,
  }
  return SimpleNamespace(**settings)

In [5]:
# Build the run configuration and the keyword arguments later unpacked into `train`.
debug = True
config = get_config(debug)
train_kwargs = dict(
  domain=config.domain,
  deterministic=config.deterministic,
  overshoot_boundary=True,
)

In [6]:
# Optional Weights & Biases logging. `wandb_available` records whether the
# package could be imported; `plot_wandb` is the switch read by the logging code.
try:
  import os
  os.environ['WANDB_NOTEBOOK_NAME'] = 'run_tabular.ipynb'
  import wandb
  wandb.login()
  wandb_available = True
except ImportError as e:  # also covers ModuleNotFoundError (its subclass).
  print(e)
  print("Wandb logs will be removed.")
  wandb_available = False
plot_wandb = wandb_available and not debug  # Set to False to de-activate Wandb.
if plot_wandb:
  wandb.init(project="oneclass", config=config.__dict__)
elif wandb_available:
  # Best-effort call to `wandb.finish()` when logging is disabled. Only
  # attempted when the import succeeded, so a missing package no longer
  # produces a swallowed NameError.
  try:
    wandb.finish()
  except Exception as e:
    print(e)

# Bind the logging switch into the metrics callback handed to `train`.
train_kwargs['log_metrics_fn'] = partial(log_metrics, plot_wandb=plot_wandb)

[34m[1mwandb[0m: Currently logged in as: [33malgue[0m (use `wandb login --relogin` to force relogin)


In [7]:
# Build the network together with its matching loss.
# Conventional branch: plain dense network trained with BCE.
# Default branch: spectral dense network with Lipschitz coefficient
# `k_coef_lip`, trained with SH_KR(margin, lbda).
# NOTE(review): input_shape is fixed to (2,) while the thyroid table shown in
# this notebook has 6 feature columns -- confirm before training.
if config.conventional:
  model = conventional_dense(widths=config.widths, input_shape=(2,))
  loss_fn = BCE()
else:
  model = spectral_dense(widths=config.widths, input_shape=(2,),
                         k_coef_lip=config.k_coef_lip)
  loss_fn = SH_KR(config.margin, config.lbda)

In [None]:
import pandas as pd
from tensorflow.keras.utils import get_file

# Fetch the thyroid .mat file with Keras `get_file` and print the returned path.
try:
  path = '/data/datasets/tabular/thyroid.mat'
  thyroid_path = get_file(path, origin='https://www.dropbox.com/s/bih0e15a0fukftb/thyroid.mat?dl=1')
  print(thyroid_path)
except Exception:
  # Catch only regular errors (not KeyboardInterrupt/SystemExit), report them
  # and re-raise so the failure stays visible.
  print('Error downloading')
  raise

In [14]:
import numpy as np
from scipy.io import loadmat  # this is the SciPy module that loads mat-files
import matplotlib.pyplot as plt
from datetime import datetime, date, time
import pandas as pd

def load_matfile(file_path):
    """Load a MATLAB ``.mat`` file holding ``X`` and ``y`` into one DataFrame.

    Args:
        file_path: path of a ``.mat`` file containing a 2-D feature array
            ``X`` and a 2-D label array ``y`` with the same number of rows.

    Returns:
        A ``pandas.DataFrame`` whose first columns are the columns of ``X``
        (named ``0 .. n_features-1``) followed by the columns of ``y``; the
        first ``y`` column is named ``'label'``.

    Raises:
        KeyError: if the file has no ``X`` or ``y`` variable.
    """
    mat = loadmat(file_path)
    features, labels = mat['X'], mat['y']
    df = pd.DataFrame(np.concatenate([features, labels], axis=1))
    # The label column sits right after the features, so its positional name
    # is the feature count (6 for the thyroid file) rather than a constant.
    return df.rename(columns={features.shape[1]: 'label'})
  
# Load the thyroid file found at `thyroid_path` and display the resulting frame.
df = load_matfile(thyroid_path)
df

Unnamed: 0,0,1,2,3,4,5,label
0,0.774194,0.001132,0.137571,0.275701,0.295775,0.236066,0.0
1,0.247312,0.000472,0.279886,0.329439,0.535211,0.173770,0.0
2,0.494624,0.003585,0.222960,0.233645,0.525822,0.124590,0.0
3,0.677419,0.001698,0.156546,0.175234,0.333333,0.136066,0.0
4,0.236559,0.000472,0.241935,0.320093,0.333333,0.247541,0.0
...,...,...,...,...,...,...,...
3767,0.817204,0.000113,0.190702,0.287383,0.413146,0.188525,0.0
3768,0.430108,0.002453,0.232448,0.287383,0.446009,0.175410,0.0
3769,0.935484,0.024528,0.160342,0.282710,0.375587,0.200000,0.0
3770,0.677419,0.001472,0.190702,0.242991,0.323944,0.195082,0.0


In [15]:
# Summary statistics of every column (the six features and the label).
df.describe()

Unnamed: 0,0,1,2,3,4,5,label
count,3772.0,3772.0,3772.0,3772.0,3772.0,3772.0,3772.0
mean,0.543121,0.008983,0.186826,0.248332,0.376941,0.177301,0.024655
std,0.20379,0.043978,0.070405,0.080579,0.087382,0.054907,0.155093
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.376344,0.001132,0.156546,0.203271,0.328638,0.14918,0.0
50%,0.569892,0.003019,0.190702,0.241822,0.375587,0.17377,0.0
75%,0.709677,0.004528,0.213472,0.28271,0.413146,0.196721,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [26]:
# Split the frame by label (0 -> `normality`, 1 -> `anomalies`) and drop the
# label column from both parts.
normality = df[df['label'] == 0.].drop('label', axis=1)
anomalies = df[df['label'] == 1.].drop('label', axis=1)
# Report the size of the frame built above (the previous `xtrain` name is not
# defined anywhere in this notebook).
print(f"Normality={len(normality)} Anomaly={len(anomalies)}")

TrainSize=3679 TestSize=93


In [None]:
from sklearn.model_selection import train_test_split
# Split the normal samples into a training part and a held-out part.
# 1839 is about half of the 3679 normal rows reported above. A fixed
# `random_state` makes the split identical on every run.
x_train, x_test = train_test_split(normality, train_size=1839, random_state=4321)

In [None]:
import sklearn.preprocessing as preprocessing

# Standardise every split with statistics estimated on the training split only.
# Note that the four names are rebound to the transformed arrays.
if config.scaling:
  scaler = preprocessing.StandardScaler().fit(x_train)
  x_train, x_test, normality, anomalies = [
    scaler.transform(part) for part in (x_train, x_test, normality, anomalies)
  ]

In [18]:
from sklearn.neighbors import KDTree
print('Building tree... ')
# KD-tree over the normal samples, used for Euclidean nearest-neighbour queries.
kdt = KDTree(normality, leaf_size=30, metric='euclidean')
print('Built ! Queries on going... ')
# For every anomaly: distances to, and indexes of, its 20 nearest normal samples.
dists, indexes = kdt.query(anomalies, k=20, return_distance=True)
print('Distances of each anomaly to 20 nearest normal points')
# One column per neighbour rank (0..19), summarised over the anomalies.
pd.DataFrame(dists).describe()

Building tree... 
Built ! Queries on going... 
Distances of each anomaly to 20 nearest normal points


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
count,93.0,93.0,93.0,93.0,93.0,93.0,93.0,93.0,93.0,93.0,93.0,93.0,93.0,93.0,93.0,93.0,93.0,93.0,93.0,93.0
mean,0.156885,0.189169,0.203404,0.211868,0.218009,0.223639,0.228113,0.231686,0.234922,0.237147,0.239168,0.240922,0.242677,0.24418,0.245831,0.247557,0.248793,0.250091,0.251359,0.252469
std,0.144061,0.164098,0.171964,0.174139,0.1755,0.178484,0.180888,0.181552,0.184333,0.184672,0.185184,0.185203,0.185719,0.185678,0.186275,0.18673,0.186862,0.186966,0.18713,0.187132
min,0.020452,0.033404,0.039515,0.042978,0.046284,0.04836,0.052053,0.052758,0.053971,0.054472,0.055042,0.055693,0.056554,0.059147,0.060015,0.062372,0.064062,0.064807,0.06497,0.065075
25%,0.076705,0.096793,0.102849,0.111781,0.115383,0.118006,0.122796,0.126074,0.129049,0.129741,0.131076,0.133208,0.136354,0.137435,0.138851,0.139744,0.140877,0.142986,0.143977,0.145486
50%,0.115396,0.164102,0.172665,0.182197,0.185613,0.186819,0.191554,0.195158,0.195909,0.198712,0.199733,0.201961,0.203891,0.205265,0.205807,0.206654,0.20722,0.20889,0.2105,0.210642
75%,0.174966,0.207383,0.224703,0.23097,0.241523,0.247109,0.250768,0.253127,0.256799,0.258219,0.261309,0.261761,0.263683,0.265746,0.267568,0.274906,0.274977,0.276522,0.277761,0.278141
max,0.793604,0.822521,0.87434,0.875389,0.879521,0.914046,0.923671,0.928649,0.949599,0.955388,0.964525,0.964665,0.965337,0.967732,0.968952,0.970357,0.971795,0.97543,0.977821,0.978856


In [20]:
class Categorical:
    """Uniform sampler/encoder for one categorical feature.

    With two classes the feature occupies a single 0/1 column; with more than
    two classes it is one-hot encoded over ``num_classes`` columns.
    """

    def __init__(self, num_classes):
        # Clamp to at least two classes so a constant feature is still
        # handled as a single binary column.
        self.num_classes = max(num_classes, 2)

    def scale_bounds(self, bounds):
        """Do nothing: this sampler keeps no numeric bound to rescale."""
        pass

    def is_outlier(self, x):
        """Return an all-False boolean mask with one entry per row of ``x``."""
        return np.full(x.shape[0], False, dtype=bool)

    def encode(self, x):
        """Encode integer class ids.

        Binary features are returned as a copy of ``x``; otherwise ``x`` is
        used to index the identity matrix, giving one one-hot row per id.
        """
        if self.num_classes == 2:
            return x.copy()
        one_hot = np.eye(self.num_classes)[x]
        return one_hot

    def sample(self, batch_size):
        """Draw ``batch_size`` classes uniformly at random.

        Returns:
            Array of shape ``(batch_size, 1)`` holding 0/1 values for binary
            features, else a one-hot array of shape
            ``(batch_size, num_classes)``.
        """
        if self.num_classes == 2:
            return np.random.randint(2, size=batch_size)[:, np.newaxis]
        classes = np.random.randint(self.num_classes, size=batch_size)
        one_hot = np.eye(self.num_classes)[classes]
        return one_hot

class Gaussian:
    """Gaussian sampler/encoder for one continuous feature.

    Attributes set by the constructor: ``mean``, ``std`` and ``threshold``
    (initialised from ``bounds``). ``threshold`` multiplies ``std`` both in the
    outlier test and in the standard deviation used for sampling.
    """

    def __init__(self, mean, std, bounds):
        self.mean = mean
        self.std = std
        self.threshold = bounds

    def scale_bounds(self, bounds):
        """Multiply the threshold by ``bounds``."""
        self.threshold *= bounds

    def is_outlier(self, x):
        """Return a boolean mask of the entries with ``|x| > threshold * std``.

        NOTE(review): ``mean`` is not subtracted, so this matches a centred
        test only when ``mean == 0`` (the only way this file builds it).
        """
        return np.abs(x) > self.threshold * self.std

    def encode(self, x):
        """Return the z-score ``(x - mean) / std``."""
        return (x - self.mean) / self.std

    def sample(self, batch_size):
        """Draw a ``(batch_size, 1)`` array from N(mean, (threshold * std)^2)."""
        emp_std = self.threshold * self.std
        return np.random.normal(self.mean, emp_std, (batch_size, 1))

class LogUniform:
    """Uniform sampler over ``[min_v, max_v]`` paired with a shifted-log encoder.

    ``bounds`` starts at 1 and multiplies both ends of the interval at
    sampling time.
    """

    def __init__(self, min_v, max_v, shift):
        self.min_v = min_v
        self.max_v = max_v
        self.shift = shift
        self.bounds = 1.

    def scale_bounds(self, bounds):
        """Multiply the interval scale factor by ``bounds``."""
        self.bounds *= bounds

    def is_outlier(self, x):
        """Return a boolean mask of the entries outside ``[min_v, max_v]``.

        NOTE(review): presumably ``x`` is already log-encoded, as ``min_v`` and
        ``max_v`` are when built by ``Sampler.encode_logscale`` -- confirm.
        """
        return np.logical_or(x < self.min_v, x > self.max_v)

    def encode(self, x):
        """Return ``log(x + shift)``."""
        return np.log(x + self.shift)

    def sample(self, batch_size):
        """Draw a ``(batch_size, 1)`` array uniformly from the scaled interval."""
        min_v, max_v = self.min_v * self.bounds, self.max_v * self.bounds
        return np.random.uniform(min_v, max_v, (batch_size, 1))  # uniform in log space

class Sampler:
    """Encode the columns of a DataFrame and sample synthetic rows to match.

    Each ``encode_*`` method writes the encoded column(s) into an output frame
    and registers one per-feature sampler. ``sample`` concatenates the draws
    of all registered samplers column-wise, in registration order, so the
    sampled columns line up with the encoded feature columns.
    """

    def __init__(self, bounds, samplers=None):
        """Store ``bounds`` and the optional initial list of feature samplers."""
        self.samplers = [] if samplers is None else samplers
        self.bounds = bounds
        self.shift = 0.1  # added to raw values before the log in `encode_logscale`.

    def scale_bounds(self, bounds):
        """Multiply ``self.bounds`` and every registered sampler's bounds by ``bounds``."""
        self.bounds *= bounds
        for sampler in self.samplers:
            sampler.scale_bounds(bounds)

    def add(self, sampler):
        """Append one per-feature sampler."""
        self.samplers.append(sampler)

    def check_integrity(self, batch_size, batch_size_ref):
        """Assert that ``sample(batch_size)`` has shape ``(batch_size,) + batch_size_ref``.

        Raises:
            AssertionError: if the sampled shape differs from the expected one.
        """
        actual = self.sample(batch_size).shape
        expected = (batch_size,) + tuple(batch_size_ref)
        assert actual == expected, f"sampled shape {actual} != expected {expected}"

    def _add_gaussian(self, encoded):
        """Register a N(0, 1) sampler whose threshold is half the largest absolute encoded value."""
        bounds = max(encoded.max(), -encoded.min()) / 2
        self.add(Gaussian(0., 1., bounds))

    def encode_numeric_zscore(self, df, df_source, df_train, name, mean=None, sd=None):
        """Write the z-score of ``df_source[name]`` into ``df[name]`` and register a Gaussian.

        ``mean`` and ``sd`` default to the statistics of ``df_train[name]``;
        a zero ``sd`` is replaced by 1 to avoid dividing by zero.
        """
        if mean is None:
            mean = df_train[name].mean()
        if sd is None:
            sd = df_train[name].std()
        if sd == 0:
            sd = 1
        df[name] = (df_source[name] - mean) / sd
        self._add_gaussian(df[name])

    def encode_logscale(self, df, df_source, name):
        """Write ``log(df_source[name] + shift)`` into ``df[name]`` and register a LogUniform.

        The sampler interval is the min/max of the encoded column.
        """
        df[name] = np.log(df_source[name] + self.shift)
        min_v = df[name].min()
        max_v = df[name].max()
        self.add(LogUniform(min_v, max_v, self.shift))

    def encode_robust_zscore(self, df, df_source, df_train, name, median=None, mad=None):
        """Write the robust z-score of ``df_source[name]`` into ``df[name]`` and register a Gaussian.

        The score is ``(x - median) / mad * 0.6745``. ``median`` and ``mad``
        (median absolute deviation) default to the statistics of
        ``df_train[name]``; a zero ``mad`` is replaced by 1.
        """
        if median is None:
            median = df_train[name].median()
        if mad is None:
            absolute_deviation = (df_train[name] - median).abs()
            mad = absolute_deviation.median()
        if mad == 0:
            mad = 1
        df[name] = (df_source[name] - median) / mad * 0.6745
        self._add_gaussian(df[name])

    def encode_text_dummy(self, df, df_source, name):
        """Dummy-encode a discrete column into ``df`` and register a Categorical.

        One distinct value gives a constant column of ones, two values give a
        single indicator column, and more values give one column per value.
        """
        uniques = df_source[name].nunique()
        if uniques == 1:
            dummy_name = f"{name}-{df_source[name].iloc[0]}"
            df[dummy_name] = 1.
        elif uniques <= 2:
            dummy_name = f"is-{name}"
            dummies = pd.get_dummies(df_source[name], drop_first=True)
            df[dummy_name] = dummies[list(dummies.columns)[0]]
        else:  # No sparse when more than 1 class to ensure same distance between everyone
            dummies = pd.get_dummies(df_source[name])
            for x in dummies.columns:
                dummy_name = f"{name}-{x}"
                df[dummy_name] = dummies[x]
        self.add(Categorical(uniques))

    def fit_transform(self, df_source, df_train, continuous_policy, discrete_cols=()):
        """Encode every non-label column of ``df_source`` and register its sampler.

        Args:
            df_source: frame to encode; a ``'label'`` column, when present, is
                copied unchanged to the output.
            df_train: frame supplying the statistics for the 'robust' and
                'zscore' policies.
            continuous_policy: 'robust', 'logscale' or 'zscore'; applied to
                every column not listed in ``discrete_cols``.
            discrete_cols: names of the columns to dummy-encode.

        Returns:
            The encoded DataFrame, indexed like ``df_source``.

        Raises:
            AssertionError: on an unknown policy, or when the registered
                samplers do not produce one column per encoded feature.
        """
        assert continuous_policy in ['robust', 'logscale', 'zscore']
        df = pd.DataFrame(index=df_source.index)
        for col in list(df_source.columns):
            if col == 'label':
                continue
            if col in discrete_cols:
                self.encode_text_dummy(df, df_source, col)
            elif continuous_policy == 'robust':
                self.encode_robust_zscore(df, df_source, df_train, col)
            elif continuous_policy == 'logscale':
                self.encode_logscale(df, df_source, col)
            elif continuous_policy == 'zscore':
                self.encode_numeric_zscore(df, df_source, df_train, col)
        # Count the features before the label is appended so unlabeled frames
        # are handled too.
        num_features = df.shape[1]
        if 'label' in df_source.columns:
            df['label'] = df_source['label'].copy()
        self.check_integrity(16, (num_features,))
        return df

    def sample(self, batch_size):
        """Return ``batch_size`` synthetic rows, one column block per registered sampler.

        With no registered sampler an empty ``(batch_size, 0)`` array is
        returned instead of letting ``np.concatenate`` fail on an empty list.
        """
        if not self.samplers:
            return np.empty((batch_size, 0))
        samples = [sampler.sample(batch_size) for sampler in self.samplers]
        return np.concatenate(samples, axis=1)

In [21]:
# Fit the per-column encoders/samplers: every feature column of `df` is
# z-scored with the mean/std taken from `normality`, and `dt` is the encoded
# frame (features plus the copied label column).
sampler = Sampler(bounds = 5)
continuous_policy = 'zscore'
dt = sampler.fit_transform(df, normality, continuous_policy=continuous_policy)

In [22]:
# Statistics of the encoded features for the rows labelled 0.
dt[dt['label'] == 0.].describe()

Unnamed: 0,0,1,2,3,4,5,label
count,3679.0,3679.0,3679.0,3679.0,3679.0,3679.0,3679.0
mean,9.608454000000001e-17,3.090156e-17,-4.635234e-17,-4.82354e-16,-2.935648e-16,-4.326218e-16,0.0
std,1.0,1.0,1.0,1.0,1.0,1.0,0.0
min,-2.666389,-0.4237666,-2.752445,-2.795071,-4.308302,-3.028947,0.0
25%,-0.8202846,-0.3279218,-0.4768718,-0.6183669,-0.5467939,-0.5765608,0.0
50%,0.1291406,-0.1715434,0.01961678,-0.09718431,-0.00943554,-0.09882317,0.0
75%,0.8148365,-0.03702428,0.3506092,0.3933405,0.2592436,0.3470653,0.0
max,2.238974,23.95782,11.78364,9.805285,7.137431,15.92131,0.0


In [23]:
# Statistics of the encoded features for the rows labelled 1.
dt[dt['label'] == 1.].describe()

Unnamed: 0,0,1,2,3,4,5,label
count,93.0,93.0,93.0,93.0,93.0,93.0,93.0
mean,-0.088649,15.283288,-1.489424,-2.342357,0.245954,-2.518104,1.0
std,0.987771,18.956315,0.875238,0.659303,0.981823,0.641776,0.0
min,-2.666389,0.618756,-2.545574,-3.316253,-2.051397,-3.506685,1.0
25%,-0.820285,3.948103,-2.269747,-2.94836,-0.493058,-3.156344,1.0
50%,-0.029097,8.151824,-1.58018,-2.365862,0.259244,-2.519361,1.0
75%,0.603853,17.736308,-0.890612,-1.844679,0.904074,-1.946075,1.0
max,1.711516,88.69512,0.76435,-0.649025,2.731092,-1.532036,1.0


In [24]:
# Draw ten synthetic rows from the fitted sampler and label their columns with
# the feature names of `dt` (every column except the trailing 'label').
adv = pd.DataFrame(sampler.sample(10), columns=dt.columns[:-1])
adv

Unnamed: 0,0,1,2,3,4,5
0,-1.36214,34.73154,3.980301,4.401601,-1.987086,-12.317679
1,-0.274703,-21.321713,-18.345233,-3.416464,-3.951582,-7.373935
2,0.373705,46.230968,1.486343,5.256781,-0.308087,3.887207
3,0.438159,62.617813,7.937922,13.32831,-5.876583,-1.545792
4,-0.410022,8.874788,4.445212,5.099514,-4.067533,9.094335
5,1.613596,142.259266,7.172293,-0.935961,-7.24021,-2.123072
6,0.653245,46.386077,-4.825112,4.998111,0.782694,15.070128
7,-0.936051,35.890904,-1.495519,-8.600099,-3.191691,9.978245
8,0.562922,34.114915,-5.755149,-5.006577,2.667609,-13.21301
9,1.16884,20.693173,-7.800977,7.233669,5.532975,2.95341


In [25]:
# Distances from each sampled row to its 20 nearest points in the KD-tree.
# NOTE(review): `kdt` holds `normality` as it was when the tree was built;
# confirm it uses the same encoding as `adv` before comparing distances.
dists, indexes = kdt.query(adv.to_numpy(), k=20, return_distance=True)
pd.DataFrame(dists).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,48.242947,48.267594,48.30115,48.327893,48.351834,48.369897,48.37957,48.38842,48.393402,48.3994,48.404759,48.412258,48.414244,48.418999,48.424816,48.428757,48.430738,48.432657,48.434497,48.436162
std,35.885039,35.876161,35.875351,35.88827,35.883074,35.883223,35.880783,35.877184,35.878247,35.878069,35.878669,35.876563,35.877742,35.874534,35.873061,35.870495,35.870883,35.870424,35.870393,35.869775
min,14.130056,14.210941,14.286722,14.328545,14.350341,14.368131,14.396007,14.431716,14.43217,14.436585,14.458231,14.488755,14.491939,14.517909,14.528946,14.547994,14.550073,14.555308,14.559821,14.564168
25%,31.512991,31.527169,31.546565,31.560836,31.565144,31.569859,31.572752,31.577764,31.584133,31.584459,31.584875,31.587642,31.588068,31.589564,31.592808,31.59287,31.59541,31.596321,31.598588,31.59883
50%,37.722421,37.748494,37.787522,37.797832,37.851677,37.873974,37.886298,37.888432,37.889916,37.905003,37.908341,37.916524,37.918513,37.924538,37.926559,37.932466,37.934693,37.93532,37.936273,37.937515
75%,48.330741,48.335029,48.368691,48.386768,48.397989,48.428572,48.433642,48.451327,48.456924,48.460313,48.462857,48.467097,48.469054,48.470954,48.48357,48.485639,48.486183,48.488498,48.489768,48.490948
max,142.395688,142.405399,142.447186,142.517142,142.52529,142.545307,142.553211,142.55519,142.562876,142.568459,142.578236,142.584571,142.590562,142.59095,142.592514,142.592879,142.596572,142.596685,142.59858,142.59887


In [None]:
from sklearn.metrics import roc_auc_score, f1_score

def evaluate(epoch, model, xtest, anomalies, level_set=0):
    """Score held-out normal samples and anomalies, then print detection metrics.

    A sample is predicted normal when its model output is ``>= level_set``.
    Normal samples are the positive class for precision/recall/F1/ROC-AUC.

    Args:
        epoch: zero-based epoch index; the score quantile table is printed
            only when ``(epoch + 1) % 5 == 0``.
        model: object exposing ``predict(x, verbose=..., batch_size=...)``.
        xtest: held-out normal samples, one row per sample.
        anomalies: anomalous samples with the same columns as ``xtest``.
        level_set: decision threshold applied to the model output.

    Returns:
        None. Metrics are printed and, when the notebook-level ``plot_wandb``
        flag is true, also sent to ``wandb.log``.
    """
    test_size, anomalies_size = len(xtest), len(anomalies)
    # Score both sets in one predict call, then split the scores back.
    xx = np.concatenate([xtest, anomalies], axis=0)
    yy = model.predict(xx, verbose=1, batch_size=2048).flatten()
    ytest, yanomalies = np.split(yy, indices_or_sections=[test_size])
    mean_in, std_in = ytest.mean(), ytest.std()
    mean_out, std_out = yanomalies.mean(), yanomalies.std()
    tp = (ytest >= level_set).sum()
    tn = (yanomalies < level_set).sum()
    fp = len(yanomalies) - tn
    fn = len(ytest) - tp
    true_labels = np.concatenate([np.ones(test_size), np.zeros(anomalies_size)], axis=0)
    # ROC-AUC ranks the raw scores; F1 needs hard labels, so threshold first.
    predicted_labels = (yy >= level_set).astype(true_labels.dtype)
    roc_auc = roc_auc_score(true_labels, yy)
    f1 = f1_score(true_labels, predicted_labels)
    recall_in = tp / (tp + fn) * 100
    recall_out = tn / (tn + fp) * 100
    precision_in = tp / (tp + fp) * 100
    precision_out = tn / (tn + fn) * 100
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', 1000)
    percentiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    # Quantiles of the scores this function actually computes: held-out
    # normal samples versus anomalies.
    teststats = pd.DataFrame(ytest).describe(percentiles).transpose()
    anomalystats = pd.DataFrame(yanomalies).describe(percentiles).transpose()
    stats = pd.concat([teststats, anomalystats], ignore_index=True).round(4)
    stats.index = ['test', 'anomalies']
    if (epoch+1)%5== 0:
        print(stats)
    print(f"Mean-In={mean_in:.2f}±{std_in:.2f} Mean-Out={mean_out:.2f}±{std_out:.2f}")
    print(f"False-Alarm={100-recall_in:.2f}% Sensivity-Anomalies={recall_out:.2f} Precision-Anomaly={precision_out:.2f}%")
    print(f"ROC_AUC={roc_auc:.2f} Precision={precision_in:.2f}% Recall={recall_in:.2f}% F1={f1:.2f}", flush=True)
    if plot_wandb:
        wandb.log({'roc_auc': roc_auc, 'recall':recall_in, 'precision':precision_in, 'f1':f1})

In [None]:
# Create positive examples dataset from the training split of the normal
# samples. `np.asarray` gives an array whether or not scaling was applied.
# NOTE(review): `x_train` is the only training split defined in this notebook;
# confirm it is the intended source of positive examples.
p_dataset = build_ds_from_numpy(np.asarray(x_train), config.batch_size)

In [None]:
# Create optimizer (looked up by the name stored in the config).
opt =  tf.keras.optimizers.get(config.optimizer)

# Initialize the network.
gen = tf.random.Generator.from_seed(4321)  # reproducible sampling.
p_batch = next(iter(p_dataset))  # first batch of positive examples.
_ = model(p_batch, training=True)  # dummy forward to trigger initialization.
model.summary()

In [None]:
# Adversarial distribution.
def sampler_fn(gen, batch_size, input_shape):
  """Return `batch_size` rows drawn from the notebook-level `sampler`.

  `gen` and `input_shape` are part of the signature but are not used.
  """
  del gen, input_shape  # unused.
  return sampler.sample(batch_size)

# Presumably a dataset of adversarial batches produced through `sampler_fn`;
# the per-example shape is taken from a positive batch.
q_dataset = tfds_from_sampler(sampler_fn, gen, config.batch_size, p_batch.shape[1:])
Q0 = next(iter(q_dataset))  # first batch of the adversarial dataset.

In [None]:
# Warm-up phase: train with `maxiter=0` and evaluate after every epoch on the
# held-out normal split `x_test` (the name the train/test split cell defines).
num_epochs = config.warmup_epochs
for epoch in range(num_epochs):
  train(model, opt, loss_fn, gen, p_dataset, q_dataset, config.epoch_length, maxiter=0, **train_kwargs)
  evaluate(epoch, model, x_test, anomalies, level_set=0)

In [None]:
# Main training phase: `config.epochs_per_plot` epochs with `maxiter=config.maxiter`.
for epoch in range(config.epochs_per_plot):
  train(model, opt, loss_fn, gen, p_dataset, q_dataset, config.epoch_length, maxiter=config.maxiter, **train_kwargs)
