In [1]:
import torch
from torch import nn 
from torch import optim
from river import compose, metrics, preprocessing, stream, anomaly, linear_model
from OnlineTorch.anomaly import TorchAE, SklearnAnomalyDetector
from tqdm import tqdm
import river  
import torchvision
from pprint import pprint


from sklearn.metrics import roc_auc_score
from sklearn.linear_model import SGDOneClassSVM
from util import build_anomaly_dataset, Tensor2Dict

In [2]:
# Run on the GPU when PyTorch reports a usable CUDA device, otherwise on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
# Display whether PyTorch can see a CUDA device in this session.
torch.cuda.is_available()

False

In [4]:
def train_test_incremental_base(model, data, update_interv=100):
    """Evaluate a supervised classifier on a stream, predicting before learning.

    For every ``(x, y)`` pair the model first predicts with ``predict_one(x)``
    and is then trained with ``learn_one(x, y)``. Predictions and labels are
    collected, a running accuracy is printed at the end, and the ROC AUC of
    the collected predictions is returned.

    Parameters
    ----------
    model
        Object exposing ``predict_one(x)`` and ``learn_one(x, y)``.
    data
        Iterable of ``(x, y)`` pairs. A ``torch.Tensor`` label is converted
        to a Python scalar with ``.item()`` before it is recorded.
    update_interv : int
        Number of samples between refreshes of the progress-bar postfix.

    Returns
    -------
    The value of ``roc_auc_score(truths, scores)``.
    """
    scores = []
    truths = []
    iterator = tqdm(data, unit='samples')
    iterator.set_description('Learning from stream')
    loss_sum = 0
    idx = 0
    metric = metrics.Accuracy()
    for x, y in iterator:
        # NOTE(review): the score is whatever predict_one returns; if that is a
        # hard label, the AUC below is computed from labels, not probabilities.
        score = model.predict_one(x)

        # Only rebind when learn_one actually returns something: older river
        # versions return the estimator, newer ones return None, and blindly
        # rebinding would replace the model with None after the first sample.
        updated_model = model.learn_one(x, y)
        if updated_model is not None:
            model = updated_model

        scores.append(score)
        if isinstance(y, torch.Tensor):
            y = y.item()
        truths.append(y)

        # Running mean of the scores, shown every `update_interv` samples.
        loss_sum += score
        idx += 1
        if idx == update_interv:
            iterator.set_postfix({f'loss_{update_interv}': loss_sum/update_interv})
            loss_sum = 0
            idx = 0

        # river metrics take (y_true, y_pred); same None-safe rebinding as above.
        updated_metric = metric.update(y, score)
        if updated_metric is not None:
            metric = updated_metric
    print(metric)
    return roc_auc_score(truths, scores)

In [5]:
def train_test_incremental(model, data, update_interv=100):
    """Run an unsupervised anomaly detector over a stream and return its ROC AUC.

    For every ``(x, y)`` pair the model is trained with ``learn_one(x)`` and
    then asked for an anomaly score with ``score_one(x)``. The label ``y`` is
    only used for the final evaluation.

    Parameters
    ----------
    model
        Object exposing ``learn_one(x)`` and ``score_one(x)``.
    data
        Iterable of ``(x, y)`` pairs. A ``torch.Tensor`` label is converted
        to a Python scalar with ``.item()`` before it is recorded.
    update_interv : int
        Number of samples between refreshes of the progress-bar postfix.

    Returns
    -------
    The value of ``roc_auc_score(truths, scores)``.
    """
    scores = []
    truths = []
    iterator = tqdm(data, unit='samples')
    iterator.set_description('Learning from stream')
    loss_sum = 0
    idx = 0
    for x, y in iterator:
        # NOTE(review): the sample is learned before it is scored, unlike
        # train_test_incremental_base, which predicts first. Order kept as is.
        # Only rebind when learn_one returns something: older river versions
        # return the estimator, newer ones return None, and blindly rebinding
        # would replace the model with None after the first sample.
        updated_model = model.learn_one(x)
        if updated_model is not None:
            model = updated_model
        score = model.score_one(x)

        scores.append(score)
        if isinstance(y, torch.Tensor):
            y = y.item()
        truths.append(y)

        # Running mean of the scores, shown every `update_interv` samples.
        loss_sum += score
        idx += 1
        if idx == update_interv:
            iterator.set_postfix({f'loss_{update_interv}': loss_sum/update_interv})
            loss_sum = 0
            idx = 0
    return roc_auc_score(truths, scores)

In [6]:
def build_cae(n_features=1):
    """Build a small convolutional autoencoder as an ``nn.Sequential``.

    Three strided convolutions reduce the input to 8 channels, and three
    transposed convolutions expand it back to ``n_features`` channels. SELU
    follows every layer except the last.

    Parameters
    ----------
    n_features : int
        Number of input (and output) channels.
    """
    # Layers are created in forward order so parameter initialisation consumes
    # the random generator in the same sequence as the stacked model.
    encoder = [
        nn.Conv2d(n_features, 32, kernel_size=3, stride=2),
        nn.SELU(),
        nn.Conv2d(32, 16, kernel_size=3, stride=2),
        nn.SELU(),
        nn.Conv2d(16, 8, kernel_size=3, stride=3),
        nn.SELU(),
    ]
    decoder = [
        nn.ConvTranspose2d(8, 16, kernel_size=3, stride=3),
        nn.SELU(),
        nn.ConvTranspose2d(16, 32, kernel_size=3, stride=2),
        nn.SELU(),
        # kernel_size=4 on the final layer restores an even spatial size.
        nn.ConvTranspose2d(32, n_features, kernel_size=4, stride=2),
    ]
    return nn.Sequential(*encoder, *decoder)

# Loss and optimizer are handed over as classes, not instances.
loss = nn.L1Loss
optimizer = optim.AdamW

# Detector 1: the convolutional autoencoder built by build_cae.
model = TorchAE(
    build_fn=build_cae,
    loss_fn=loss,
    device=device,
    optimizer_fn=optimizer,
    learning_rate=0.01,
    seed=42,
)

# Detector 2: half-space trees fed through the Tensor2Dict adapter.
model2 = Tensor2Dict() | anomaly.HalfSpaceTrees(seed=20)

# Detector 3: scikit-learn's SGD one-class SVM behind the OnlineTorch wrapper.
model3 = SklearnAnomalyDetector(SGDOneClassSVM)


In [7]:
# Load MNIST (downloaded to ./data/ on first use).
mnist = torchvision.datasets.MNIST('./data/', download=True)
# `.data` is the current name of the image tensor; `.train_data` is a
# deprecated alias that emits a warning on access.
# unsqueeze(1) inserts a dimension at position 1 (presumably the channel axis
# the convolutional model expects); / 255. rescales the values.
mnist_x, mnist_y = mnist.data.unsqueeze(1) / 255., mnist.targets
# From here on `mnist` is whatever build_anomaly_dataset returns (see util).
mnist = build_anomaly_dataset(mnist_x, mnist_y)



In [8]:
# Evaluate `model` on the MNIST anomaly stream; the cell output is the ROC AUC.
train_test_incremental(model=model, data=mnist)

Learning from stream: 100%|██████████| 9631/9631 [01:07<00:00, 142.06samples/s, loss_100=0.0611]


0.9880754535021136

In [9]:
# Evaluate `model2` on the same MNIST anomaly stream; the cell output is the ROC AUC.
train_test_incremental(model=model2, data=mnist)

Learning from stream: 100%|██████████| 9631/9631 [00:58<00:00, 164.68samples/s, loss_100=0.557]


0.8960713049498096

In [10]:
# Evaluate `model3` on the same MNIST anomaly stream; the cell output is the ROC AUC.
train_test_incremental(model=model3, data=mnist)

Learning from stream: 100%|██████████| 9631/9631 [00:06<00:00, 1516.81samples/s, loss_100=[14.7615074]] 


0.5486082552720202

In [11]:
def build_cae_cifar(n_features=3):
    """Build a convolutional autoencoder sized for 32x32 images.

    Four strided convolutions reduce the input to 256 channels, and four
    transposed convolutions expand it back to ``n_features`` channels. SELU
    follows every layer except the last.

    With 32x32 input the spatial size goes 32 -> 15 -> 7 -> 3 -> 1 in the
    encoder and 1 -> 3 -> 7 -> 15 -> 32 in the decoder, so the reconstruction
    has the same shape as the input.

    Parameters
    ----------
    n_features : int
        Number of input (and output) channels.
    """
    model = nn.Sequential(
        nn.Conv2d(in_channels=n_features, out_channels=64,
                  kernel_size=3, stride=2),
        nn.SELU(),
        nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=2),
        nn.SELU(),
        nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=2),
        nn.SELU(),
        nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2),
        nn.SELU(),
        nn.ConvTranspose2d(in_channels=256, out_channels=128,
                           kernel_size=3, stride=2),
        nn.SELU(),
        nn.ConvTranspose2d(in_channels=128, out_channels=128,
                           kernel_size=3, stride=2),
        nn.SELU(),
        nn.ConvTranspose2d(in_channels=128, out_channels=64,
                           kernel_size=3, stride=2),
        nn.SELU(),
        # kernel_size=4 (not 3): (15 - 1) * 2 + 4 = 32. With kernel_size=3 the
        # output would be 31x31 and could not be compared with a 32x32 input.
        nn.ConvTranspose2d(in_channels=64, out_channels=n_features,
                           kernel_size=4, stride=2),
    )

    return model

In [12]:
def build_ae(n_features, latent_dim=1):
    """Build a fully connected autoencoder as an ``nn.Sequential``.

    The input passes through dropout, is mapped to 20 hidden units, then to
    ``latent_dim`` units, and is decoded through 20 hidden units back to
    ``n_features`` outputs. LeakyReLU follows every linear layer except the
    last.

    Parameters
    ----------
    n_features : int
        Width of the input and of the reconstruction.
    latent_dim : int
        Width of the bottleneck layer.
    """
    hidden_units = 20
    # Layers are created in forward order so parameter initialisation consumes
    # the random generator in the same sequence as the stacked model.
    layers = [
        nn.Dropout(),
        nn.Linear(n_features, hidden_units),
        nn.LeakyReLU(),
        nn.Linear(hidden_units, latent_dim),
        nn.LeakyReLU(),
        nn.Linear(latent_dim, hidden_units),
        nn.LeakyReLU(),
        nn.Linear(hidden_units, n_features),
    ]
    return nn.Sequential(*layers)

# Loss and optimizer are handed over as classes, not instances.
loss = nn.L1Loss
optimizer = optim.AdamW

# Dense autoencoder on min-max scaled features.
model = compose.Pipeline(
    preprocessing.MinMaxScaler(),
    TorchAE(
        build_fn=build_ae,
        loss_fn=loss,
        optimizer_fn=optimizer,
        learning_rate=0.01,
        seed=42,
    ),
)

# Half-space trees on the same scaling.
model2 = compose.Pipeline(
    preprocessing.MinMaxScaler(),
    anomaly.HalfSpaceTrees(seed=20),
)

# Supervised logistic regression on the same scaling.
model3 = compose.Pipeline(
    preprocessing.MinMaxScaler(),
    linear_model.LogisticRegression(),
)

# NOTE: despite its name, `phishing` is built from the CreditCard dataset
# (first 8000 samples, shuffled with a buffer of 1000).
phishing = stream.shuffle(
    river.datasets.CreditCard().take(8000),
    1000,
    seed=20,
)
train_test_incremental(model, phishing)

Learning from stream: : 8000samples [00:22, 362.10samples/s, loss_100=0.048] 


0.9816225705329154

In [13]:
# Evaluate `model2` on a fresh CreditCard stream. This run takes 40000 samples,
# whereas the other CreditCard runs in this notebook take 8000.
phishing = stream.shuffle(river.datasets.CreditCard().take(40000), 1000, seed=20)
train_test_incremental(model2, phishing)

Learning from stream: : 40000samples [00:27, 1480.71samples/s, loss_100=0.378]


0.9336265877820796

In [14]:
# Evaluate `model3` with the predict-then-learn loop on the first 8000
# CreditCard samples.
# NOTE(review): train_test_incremental_base feeds predict_one outputs to
# roc_auc_score; if those are hard labels, the returned AUC reflects label
# predictions rather than ranked probabilities. Confirm before comparing it
# with the anomaly-score AUCs of the other runs.
phishing = stream.shuffle(river.datasets.CreditCard().take(8000), 1000, seed=20)
train_test_incremental_base(model3, phishing)

Learning from stream: : 8000samples [00:02, 2738.79samples/s, loss_100=0]

Accuracy: 99.69%





0.5

In [15]:
# Disabled experiment: the whole cell is a string literal, so nothing below is
# executed. The text sketches scoring `model3` with predict_proba_one.
'''phishing = stream.shuffle(river.datasets.CreditCard().take(8000), 1000, seed=20)
update_interv=100
scores = []
truths = []
iterator = tqdm(phishing, unit='samples')
iterator.set_description('Learning from stream')
loss_sum = 0
idx = 0
for x, y in iterator:
        #print(x)
        #model = model.learn_one(x,y)
        score = model3.predict_proba_one(x)
        print(score)
        scores.append(score)        
        truths.append(y)
        loss_sum += score
        idx += 1
        if idx == update_interv:
            iterator.set_postfix({f'loss_{update_interv}': loss_sum/update_interv})
            loss_sum = 0
            idx = 0
    #print(truths, scores)
print(roc_auc_score(truths, scores))
'''

"phishing = stream.shuffle(river.datasets.CreditCard().take(8000), 1000, seed=20)\nupdate_interv=100\nscores = []\ntruths = []\niterator = tqdm(phishing, unit='samples')\niterator.set_description('Learning from stream')\nloss_sum = 0\nidx = 0\nfor x, y in iterator:\n        #print(x)\n        #model = model.learn_one(x,y)\n        score = model3.predict_proba_one(x)\n        print(score)\n        scores.append(score)        \n        truths.append(y)\n        loss_sum += score\n        idx += 1\n        if idx == update_interv:\n            iterator.set_postfix({f'loss_{update_interv}': loss_sum/update_interv})\n            loss_sum = 0\n            idx = 0\n    #print(truths, scores)\nprint(roc_auc_score(truths, scores))\n"

In [16]:
# Rebind `model3` to a newly constructed scaler + logistic-regression pipeline,
# replacing the instance used in the earlier cells.
model3= compose.Pipeline(
    preprocessing.MinMaxScaler(),
    linear_model.LogisticRegression())

In [17]:
# Print the first CreditCard sample (feature dict and label) to inspect its layout.
dataset = river.datasets.CreditCard()
for x, y in dataset:
    pprint(x)
    print(y)
    # Only the first sample is needed.
    break

{'Amount': 149.62,
 'Time': 0.0,
 'V1': -1.3598071336738,
 'V10': 0.0907941719789316,
 'V11': -0.551599533260813,
 'V12': -0.617800855762348,
 'V13': -0.991389847235408,
 'V14': -0.311169353699879,
 'V15': 1.46817697209427,
 'V16': -0.470400525259478,
 'V17': 0.207971241929242,
 'V18': 0.0257905801985591,
 'V19': 0.403992960255733,
 'V2': -0.0727811733098497,
 'V20': 0.251412098239705,
 'V21': -0.018306777944153,
 'V22': 0.277837575558899,
 'V23': -0.110473910188767,
 'V24': 0.0669280749146731,
 'V25': 0.128539358273528,
 'V26': -0.189114843888824,
 'V27': 0.133558376740387,
 'V28': -0.0210530534538215,
 'V3': 2.53634673796914,
 'V4': 1.37815522427443,
 'V5': -0.338320769942518,
 'V6': 0.462387777762292,
 'V7': 0.239598554061257,
 'V8': 0.0986979012610507,
 'V9': 0.363786969611213}
0


In [18]:
# Build a shuffled stream over the Phishing dataset (up to 8000 samples).
phishing2 = stream.shuffle(river.datasets.Phishing().take(8000), 1000, seed=20)
# Bare name as the last expression: the cell displays the object's repr.
phishing2

<generator object shuffle at 0x000002602ED305F0>

In [19]:
# Count the samples in the shuffled Phishing stream and how many carry a
# True / non-True label.
phishing2 = stream.shuffle(
    river.datasets.Phishing().take(10000),
    1000,
    seed=20,
)
counter = num_true = num_false = 0
for x, y in phishing2:
    counter += 1
    if not (y == True):
        num_false += 1
    else:
        num_true += 1
# One value per line: total, positives, negatives.
print(counter, num_true, num_false, sep='\n')
    

1250
548
702


In [20]:
# `dataset` is reused by the following cell.
dataset = river.datasets.Phishing()
counter = 0
# NOTE(review): this loop only rebinds its own loop variable `y`; nothing is
# stored and `dataset` is not modified, so the int conversion has no effect on
# later cells. `counter` is also never incremented here.
for x, y in dataset:
    y = int(y == True)


In [21]:
# Dense autoencoder again, this time behind a StandardScaler, evaluated on a
# shuffled stream drawn from `dataset`.
model1 = compose.Pipeline(
    preprocessing.StandardScaler(),
    TorchAE(
        build_fn=build_ae,
        loss_fn=loss,
        optimizer_fn=optimizer,
        learning_rate=0.01,
        seed=42,
    ),
)
phishing2 = stream.shuffle(
    dataset.take(8000),
    1000,
    seed=20,
)
train_test_incremental(model1, phishing2)

Learning from stream: : 1250samples [00:03, 402.12samples/s, loss_100=0.604]


0.6334170357892985