In [1]:
import torch
from torch import nn 
from torch import optim
from river import compose, metrics, preprocessing, stream, anomaly, linear_model, datasets, compose
from river import feature_extraction as fx
from river.tree import HoeffdingAdaptiveTreeClassifier
from river import optim as op

from IncrementalTorch.anomaly.anomaly import Autoencoder, BasicAutoencoder
from river import compat

from tqdm import tqdm
import river  
import torchvision
import numpy as np
import pandas as pd
from pprint import pprint


from OnlineTorch.classifier import PyTorch2RiverClassifier
from OnlineTorch.anomaly import TorchAE

from sklearn.metrics import roc_auc_score
from sklearn.linear_model import SGDOneClassSVM
from sklearn.cluster import k_means
from sklearn.decomposition import IncrementalPCA
from sklearn.metrics import confusion_matrix as cm
from sklearn.datasets import fetch_covtype
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from river import evaluate


N_SAMPLES = 5000 #number of elements stored in memory
SEED = 42 #Random Seed for shuffles, constant
#track_name = "RBF"
#LOSS = nn.BCELoss
LOSS = nn.L1Loss #Lossfunction, constant
#LOSS = nn.CrossEntropyLoss
OPTIMIZER = optim.AdamW #Optimizer, constant
BATCH_SIZE=10 #Tracked for PCA, not necessary anymore
LEARNING_RATE=1e-3 #Constant

rocauc = river.metrics.ROCAUC() #metric for evaluation

threshhold=0 #Tracked for Confusion Matrix, not necessary anymore

########################################################################################
#Latent Dim 1 or 2; both evaluated; 
LATENT_DIM = 1
#######################################################################################

##max 5% with 10000 samples
anom_percentage_credit = [50,40,30,20,10,5,2.5,1.25,0.625,0.313,0.172] #everything evaluated
num_samples_credit = 9840 #50% * 9840 --> fits with num anomalies

anom_percentage_covtype = [50,40,30,20,10,5,2.5,1.25,0.625,0.313,0.172] #everything evaluated
num_samples_covtype = 25000
num_anom_classes_covtype=[1,2,3,4,5] #if you want 1 anom class type 1, up to 5 anomalie classes

#Structure Result csv
evaluation=pd.DataFrame(columns=['model','num_neurons','dataset','anom_percentage','num_samples','num_anom_classes','learn_supervised','Loss','Optimizer','Batch_size','Learning_rate','Latent_dim','ROC'])

# Dataset

## Covtype Dataset

class 5 contains 9493 instances
class 2 contains 283301 instances
class 1 contains 211840 instances
class 7 contains 20510 instances
class 3 contains 35754 instances
class 6 contains 17367 instances
class 4 contains 2747 instances

In [2]:
# function to create dataset out of covertype dataset, working with random shuffles of all anomalie classes, take according number of anomalies, fitting dataset into stream format
def make_covtype_dataset(anom_percentage_covtype,num_samples_covtype,num_anom_classes_covtype):  
    covtype_x, covtype_y=fetch_covtype(as_frame=True,return_X_y=True)
    covtype_df=covtype_x
    covtype_df['target']=covtype_y
    covtype_df['target']=covtype_df['target']-1
    covtype_df_classes=covtype_df.target.unique()
    for i in covtype_df_classes:
        print('class {0} contains {1} instances'.format(i,covtype_df[covtype_df.target==i].Elevation.count()))

    for i in range (1,num_anom_classes_covtype+1):
        covtype_df.target[covtype_df.target==i]=1
        covtype_df.loc[covtype_df['target']==i, 'target']=1

    covtype_df_filtered=covtype_df[covtype_df.target<=num_anom_classes_covtype]
    covtype_df_filtered.dropna(inplace=True)
    covtype_df_filtered = covtype_df_filtered.sample(frac=1,random_state=10).reset_index(drop=True)


    num_anom = int(num_samples_covtype*(anom_percentage_covtype/100))
    num_clean = int(num_samples_covtype-num_anom)

    anoms=covtype_df_filtered[covtype_df_filtered['target']==1].iloc[0:num_anom]
    clean = covtype_df_filtered[covtype_df_filtered['target']==0].iloc[0:num_clean]
    frames = [anoms,clean]
    print('Stream contains {} anomalies and {} no-anomalies'.format(anoms['target'].count(),clean['target'].count()))

    final_set = pd.concat(frames,ignore_index=True)


    x_covtype=final_set.iloc[:,:-1].transpose().to_dict()
    y_covtype=final_set.iloc[:,-1].transpose().to_dict()
    final_set = final_set.sample(frac=1,random_state=10).reset_index(drop=True)

    frames_test_test=pd.DataFrame(data=[x_covtype,y_covtype]).transpose()
    final_set=frames_test_test.copy()

    #final_set_x=final_set.iloc[:,:-1]
    #final_set_y=final_set.iloc[:,-1:]

    return final_set,'covertpye', anom_percentage_covtype, num_samples_covtype, num_anom_classes_covtype


## Credit Card Dataset

In [3]:
# function to create dataset out of CreditCard dataset, works with multiplaying anomalies, random shuffles of all anomalies, take according number of anomalies, fitting dataset into stream format
data_stream = stream.shuffle(river.datasets.CreditCard(), N_SAMPLES, seed=SEED)
data1 = pd.DataFrame(data=data_stream)
df_test= pd.DataFrame.from_dict(data1)
anoms=df_test[df_test[1]==1]
for i in range(0,10):
    df_test=df_test.append(anoms)
    i=i+1
df_test = df_test.sample(frac=1,random_state=10).reset_index(drop=True)

def make_credit_dataset(anom_percentage_credit,num_samples_credit,df_test):
    num_anom = int(num_samples_credit*(anom_percentage_credit/100))
    num_clean = int(num_samples_credit-num_anom)
    anoms=df_test[df_test[1]==1].iloc[0:num_anom]
    clean = df_test[df_test[1]==0].iloc[0:num_clean]
    frames = [anoms,clean]
    print('Stream contains {} anomalies and {} no-anomalies'.format(anoms[0].count(),clean[0].count()))
    final_set = pd.concat(frames,ignore_index=True)
    final_set = final_set.sample(frac=1,random_state=10).reset_index(drop=True)    
    return final_set,'creditcard', anom_percentage_credit, num_samples_credit, 1

# Autoencoder

## Undercomplete Autoencoder standard

In [4]:
def undercomplete_ae(n_features, latent_dim=LATENT_DIM):
    net = nn.Sequential(
        nn.Dropout(),
        nn.Linear(n_features, 10),
        nn.LeakyReLU(),
        nn.Linear(10, latent_dim),
        nn.LeakyReLU(),
        nn.Linear(latent_dim, 10),
        nn.LeakyReLU(), 
        nn.Linear(10, n_features),
    )
    return net

## Stacked Autoencoder

### Half Neurons

In [5]:
'''def stacked_ae(n_features, latent_dim=LATENT_DIM):
    net = nn.Sequential(
        nn.Dropout(),
        nn.Linear(n_features, 10),
        nn.LeakyReLU(),
        nn.Linear(10, 5),
        nn.LeakyReLU(),        
        nn.Linear(5, 3),
        nn.LeakyReLU(),        
        nn.Linear(3, latent_dim),
        nn.LeakyReLU(),
        nn.Linear(latent_dim, 3),
        nn.LeakyReLU(),
        nn.Linear(3, 5),
        nn.LeakyReLU(),
        nn.Linear(5, 10),
        nn.LeakyReLU(),               
        nn.Linear(10, n_features) 
    )
    return net
    '''

'def stacked_ae(n_features, latent_dim=LATENT_DIM):\n    net = nn.Sequential(\n        nn.Dropout(),\n        nn.Linear(n_features, 10),\n        nn.LeakyReLU(),\n        nn.Linear(10, 5),\n        nn.LeakyReLU(),        \n        nn.Linear(5, 3),\n        nn.LeakyReLU(),        \n        nn.Linear(3, latent_dim),\n        nn.LeakyReLU(),\n        nn.Linear(latent_dim, 3),\n        nn.LeakyReLU(),\n        nn.Linear(3, 5),\n        nn.LeakyReLU(),\n        nn.Linear(5, 10),\n        nn.LeakyReLU(),               \n        nn.Linear(10, n_features) \n    )\n    return net\n    '

### Full Neurons

In [6]:
def stacked_ae(n_features, latent_dim=LATENT_DIM):
    net = nn.Sequential(
        nn.Dropout(),
        nn.Linear(n_features, 20),
        nn.LeakyReLU(),
        nn.Linear(20, 10),
        nn.LeakyReLU(),        
        nn.Linear(10, 5),
        nn.LeakyReLU(),        
        nn.Linear(5, latent_dim),
        nn.LeakyReLU(),
        nn.Linear(latent_dim, 5),
        nn.LeakyReLU(),
        nn.Linear(5, 10),
        nn.LeakyReLU(),
        nn.Linear(10, 20),
        nn.LeakyReLU(),               
        nn.Linear(20, n_features) 
    )
    return net

# Non-AE-(Incremental Baselines)
## Tests before evaluation, results based on "Evaluation" Section

## Unsupervised

### OneClassSVM

In [7]:
# getestet mit folgenden Scalern und q-Werten, allerdings keine vernünftigen Ergebnisse bekommen
'''#funktioniert nur mit river=0.9, allerdings muss dann git repo geupdatet werden, da anomalie.anomaliedetector klasse nicht mehr in base sondern in anomly ist --> Wheels können nicht mehr so gebaut werden wie bisher
nu_given=[True,False]
with_standardscaler=[True,False]
anom_percentage_credit=1

qq=[0.95,0.99,0.995]
for q_value in qq:
    for with_standardscaler_var in with_standardscaler:
        for nu_given_var in nu_given:
            if nu_given_var and with_standardscaler_var:
                    model4 = compose.Pipeline(
                    preprocessing.StandardScaler(),
                    fx.RBFSampler(),
                    anomaly.QuantileThresholder(
                        anomaly.OneClassSVM(nu=anom_percentage_credit/100),
                        q=q_value #q Anpassung viele Auswirkungen
                    )
                    )
            if nu_given_var and not with_standardscaler_var:
                    model4 = compose.Pipeline(
        #            preprocessing.StandardScaler(),
                    fx.RBFSampler(),
                    anomaly.QuantileThresholder(
                        anomaly.OneClassSVM(nu=anom_percentage_credit/100),
                        q=q_value #q Anpassung viele Auswirkungen
                    )
                )

            if not nu_given_var and with_standardscaler_var:
                model4 = compose.Pipeline(
                    preprocessing.StandardScaler(),
                    fx.RBFSampler(),    
                    anomaly.QuantileThresholder(
                        anomaly.OneClassSVM(),
                        q=q_value #q Anpassung viele Auswirkungen
                    )
                )
            
            if not nu_given_var and not with_standardscaler_var:
                model4 = compose.Pipeline(
    #                preprocessing.StandardScaler(),
                    fx.RBFSampler(),    
                    anomaly.QuantileThresholder(
                        anomaly.OneClassSVM(),
                        q=q_value #q Anpassung viele Auswirkungen
                    )
                )

            rocauc= river.metrics.ROCAUC()
            j=0
            data_stream = stream.shuffle(make_credit_dataset(1,5000,df_test)[0].itertuples(index=False),N_SAMPLES, seed=SEED)
            for x, y in data_stream:
                model4.learn_one(x)
                y_pred= model4.score_one(x)
                rocauc.update(y,y_pred)
                if j<5:
                    print(y_pred)
                    print(y)
                    j=j+1
            print(rocauc)
'''

'#funktioniert nur mit river=0.9, allerdings muss dann git repo geupdatet werden, da anomalie.anomaliedetector klasse nicht mehr in base sondern in anomly ist --> Wheels können nicht mehr so gebaut werden wie bisher\nnu_given=[True,False]\nwith_standardscaler=[True,False]\nanom_percentage_credit=1\n\nqq=[0.95,0.99,0.995]\nfor q_value in qq:\n    for with_standardscaler_var in with_standardscaler:\n        for nu_given_var in nu_given:\n            if nu_given_var and with_standardscaler_var:\n                    model4 = compose.Pipeline(\n                    preprocessing.StandardScaler(),\n                    fx.RBFSampler(),\n                    anomaly.QuantileThresholder(\n                        anomaly.OneClassSVM(nu=anom_percentage_credit/100),\n                        q=q_value #q Anpassung viele Auswirkungen\n                    )\n                    )\n            if nu_given_var and not with_standardscaler_var:\n                    model4 = compose.Pipeline(\n        #    

### HalfSpaceTrees

In [8]:
'''
model3 = compose.Pipeline(
    preprocessing.MinMaxScaler(),
    anomaly.HalfSpaceTrees(seed=SEED)
)
## gibt Anomalie Score aus

rocauc= river.metrics.ROCAUC()
#data_stream = stream.shuffle(river.datasets.CreditCard().take(8000), N_SAMPLES, seed=42)
j=0
#data_stream = stream.shuffle(make_credit_dataset(10,10000,df_test)[0].itertuples(index=False),N_SAMPLES, seed=SEED)
data_stream = stream.shuffle(dataset[0].itertuples(index=False),N_SAMPLES, seed=SEED)
counter_supervised=0
ls=False
model3 = compose.Pipeline(
    preprocessing.MinMaxScaler(),
    anomaly.HalfSpaceTrees(seed=SEED)
)
for x, y in data_stream:
    #model3.learn_one(x)
    #y_pred= model3.score_one(x)
#    if ls and y==0:
#        model3.learn_one(x)
    if not ls or counter_supervised==0:
        model3.learn_one(x)
        counter_supervised=counter_supervised+1
    y_pred= model3.score_one(x) #high score means outlier
    if j<5:
        print(y_pred)
        print(y)
        j=j+1
    rocauc.update(y,y_pred)
#evaluation.append(rocauc)
print(rocauc)
evaluation=evaluation.append({
    'model':'HalfSpaceTrees_min_max_scaler',
    'anom_percentage_credit':anom_percentage_credit,
    'num_samples_credit':num_samples_credit,
    'Loss':LOSS,
    'Optimizer':OPTIMIZER,
    'Batch_size':BATCH_SIZE,
    'Learning_rate':LEARNING_RATE,
    'Latent_dim': LATENT_DIM,
    'ROC':rocauc,
    #'Precision_Recall_Curve':45,
    #'FP':5,
    #'TP': 4,
    #'FN':8,
    #'TN':10

},ignore_index=True)
'''

"\nmodel3 = compose.Pipeline(\n    preprocessing.MinMaxScaler(),\n    anomaly.HalfSpaceTrees(seed=SEED)\n)\n## gibt Anomalie Score aus\n\nrocauc= river.metrics.ROCAUC()\n#data_stream = stream.shuffle(river.datasets.CreditCard().take(8000), N_SAMPLES, seed=42)\nj=0\n#data_stream = stream.shuffle(make_credit_dataset(10,10000,df_test)[0].itertuples(index=False),N_SAMPLES, seed=SEED)\ndata_stream = stream.shuffle(dataset[0].itertuples(index=False),N_SAMPLES, seed=SEED)\ncounter_supervised=0\nls=False\nmodel3 = compose.Pipeline(\n    preprocessing.MinMaxScaler(),\n    anomaly.HalfSpaceTrees(seed=SEED)\n)\nfor x, y in data_stream:\n    #model3.learn_one(x)\n    #y_pred= model3.score_one(x)\n#    if ls and y==0:\n#        model3.learn_one(x)\n    if not ls or counter_supervised==0:\n        model3.learn_one(x)\n        counter_supervised=counter_supervised+1\n    y_pred= model3.score_one(x) #high score means outlier\n    if j<5:\n        print(y_pred)\n        print(y)\n        j=j+1\n    roc

## Supervised

### SGDRegressor (SVM-like with Kernel)

In [9]:
'''
data_stream = stream.shuffle(make_credit_dataset(10,5000,df_test)[0].itertuples(index=False),N_SAMPLES, seed=SEED)


#SGDRegressor (SVM-like with Kernel)
model6=compose.Pipeline(
    preprocessing.StandardScaler(),
    fx.RBFSampler(),
    compat.convert_sklearn_to_river(SGDClassifier(),classes=[False, True]),
    )

#HoeffdingAdaptiveTreeClassifier
model8 = compose.Pipeline(
    preprocessing.StandardScaler(),
    HoeffdingAdaptiveTreeClassifier()
)

#Logistic Regression
model9 = compose.Pipeline(
    preprocessing.StandardScaler(),
    linear_model.LogisticRegression(optimizer=op.SGD(0.1)))

rocauc= river.metrics.ROCAUC(n_thresholds=100)

for x, y in data_stream:
    #y_pred= model3.predict_one(x)
    model3.learn_one(x,y)       
    rocauc.update(y,y_pred)

rocauc
'''

'\ndata_stream = stream.shuffle(make_credit_dataset(10,5000,df_test)[0].itertuples(index=False),N_SAMPLES, seed=SEED)\n\n\n#SGDRegressor (SVM-like with Kernel)\nmodel6=compose.Pipeline(\n    preprocessing.StandardScaler(),\n    fx.RBFSampler(),\n    compat.convert_sklearn_to_river(SGDClassifier(),classes=[False, True]),\n    )\n\n#HoeffdingAdaptiveTreeClassifier\nmodel8 = compose.Pipeline(\n    preprocessing.StandardScaler(),\n    HoeffdingAdaptiveTreeClassifier()\n)\n\n#Logistic Regression\nmodel9 = compose.Pipeline(\n    preprocessing.StandardScaler(),\n    linear_model.LogisticRegression(optimizer=op.SGD(0.1)))\n\nrocauc= river.metrics.ROCAUC(n_thresholds=100)\n\nfor x, y in data_stream:\n    #y_pred= model3.predict_one(x)\n    model3.learn_one(x,y)       \n    rocauc.update(y,y_pred)\n\nrocauc\n'

### HoeffdingAdaptiveTreeClassifier

In [10]:
'''model8 = compose.Pipeline(
    preprocessing.StandardScaler(),
    HoeffdingAdaptiveTreeClassifier()
)

rocauc= river.metrics.ROCAUC()
#data_stream = stream.shuffle(river.datasets.CreditCard().take(8000), N_SAMPLES, seed=42)
i=0
j=0
data_stream = stream.shuffle(final_set.itertuples(index=False),N_SAMPLES, seed=SEED)
for x, y in data_stream:
    #print(y)
    if i==0:
        model8.learn_one(x,y)
        i=i+1
    y_pred=model8.predict_one(x)
    #print(y_pred)
    model8.learn_one(x,y)
    rocauc.update(y,y_pred)
    if j<5:
        print(y_pred)
        print(y)
        j=j+1
rocauc
evaluation=evaluation.append({
    'model':'HalfSpaceTrees_min_max_scaler',
    'anom_percentage_credit':anom_percentage_credit,
    'num_samples_credit':num_samples_credit,
    'Loss':LOSS,
    'Optimizer':OPTIMIZER,
    'Batch_size':BATCH_SIZE,
    'Learning_rate':LEARNING_RATE,
    'Latent_dim': LATENT_DIM,
    'ROC':rocauc,
    #'Precision_Recall_Curve':45,
    #'FP':5,
    #'TP': 4,
    #'FN':8,
    #'TN':10

},ignore_index=True)
'''

"model8 = compose.Pipeline(\n    preprocessing.StandardScaler(),\n    HoeffdingAdaptiveTreeClassifier()\n)\n\nrocauc= river.metrics.ROCAUC()\n#data_stream = stream.shuffle(river.datasets.CreditCard().take(8000), N_SAMPLES, seed=42)\ni=0\nj=0\ndata_stream = stream.shuffle(final_set.itertuples(index=False),N_SAMPLES, seed=SEED)\nfor x, y in data_stream:\n    #print(y)\n    if i==0:\n        model8.learn_one(x,y)\n        i=i+1\n    y_pred=model8.predict_one(x)\n    #print(y_pred)\n    model8.learn_one(x,y)\n    rocauc.update(y,y_pred)\n    if j<5:\n        print(y_pred)\n        print(y)\n        j=j+1\nrocauc\nevaluation=evaluation.append({\n    'model':'HalfSpaceTrees_min_max_scaler',\n    'anom_percentage_credit':anom_percentage_credit,\n    'num_samples_credit':num_samples_credit,\n    'Loss':LOSS,\n    'Optimizer':OPTIMIZER,\n    'Batch_size':BATCH_SIZE,\n    'Learning_rate':LEARNING_RATE,\n    'Latent_dim': LATENT_DIM,\n    'ROC':rocauc,\n    #'Precision_Recall_Curve':45,\n    #'FP

### Logistic Regression

In [11]:
'''
model9 = compose.Pipeline(
    preprocessing.StandardScaler(),
    linear_model.LogisticRegression(optimizer=op.SGD(0.1))
)

rocauc= river.metrics.ROCAUC()
i=0
#data_stream = stream.shuffle(river.datasets.CreditCard().take(8000), N_SAMPLES, seed=42)
data_stream = stream.shuffle(final_set.itertuples(index=False),N_SAMPLES, seed=SEED)
for x, y in data_stream:
    y_pred=model9.predict_one(x)
    model9.learn_one(x,y)
    rocauc.update(y,y_pred)
    if i<5:
        print(y_pred)
        print(y)
        i=i+1
#evaluation.append(rocauc)
rocauc
evaluation=evaluation.append({
    'model':'HalfSpaceTrees_min_max_scaler',
    'anom_percentage_credit':anom_percentage_credit,
    'num_samples_credit':num_samples_credit,
    'Loss':LOSS,
    'Optimizer':OPTIMIZER,
    'Batch_size':BATCH_SIZE,
    'Learning_rate':LEARNING_RATE,
    'Latent_dim': LATENT_DIM,
    'ROC':rocauc,
    #'Precision_Recall_Curve':45,
    #'FP':5,
    #'TP': 4,
    #'FN':8,
    #'TN':10

},ignore_index=True)
'''

"\nmodel9 = compose.Pipeline(\n    preprocessing.StandardScaler(),\n    linear_model.LogisticRegression(optimizer=op.SGD(0.1))\n)\n\nrocauc= river.metrics.ROCAUC()\ni=0\n#data_stream = stream.shuffle(river.datasets.CreditCard().take(8000), N_SAMPLES, seed=42)\ndata_stream = stream.shuffle(final_set.itertuples(index=False),N_SAMPLES, seed=SEED)\nfor x, y in data_stream:\n    y_pred=model9.predict_one(x)\n    model9.learn_one(x,y)\n    rocauc.update(y,y_pred)\n    if i<5:\n        print(y_pred)\n        print(y)\n        i=i+1\n#evaluation.append(rocauc)\nrocauc\nevaluation=evaluation.append({\n    'model':'HalfSpaceTrees_min_max_scaler',\n    'anom_percentage_credit':anom_percentage_credit,\n    'num_samples_credit':num_samples_credit,\n    'Loss':LOSS,\n    'Optimizer':OPTIMIZER,\n    'Batch_size':BATCH_SIZE,\n    'Learning_rate':LEARNING_RATE,\n    'Latent_dim': LATENT_DIM,\n    'ROC':rocauc,\n    #'Precision_Recall_Curve':45,\n    #'FP':5,\n    #'TP': 4,\n    #'FN':8,\n    #'TN':10\n

# Evaluation

## Covtype Evaluation

In [12]:
counter=0
counter_supervised=0
#for i in range(0,anom_percentage_credit.__len__()):
#    datasets_credit.append(make_credit_dataset(anom_percentage_credit[i],num_samples_credit,df_test))
learn_supervised=[True,False]

for i in range(0,anom_percentage_covtype.__len__()):
    for j in range(0,num_anom_classes_covtype.__len__()):
        dataset=make_covtype_dataset(anom_percentage_covtype[i],num_samples_covtype,num_anom_classes_covtype[j])
        ls=False
        for ls in learn_supervised:
            stacked_ae_model = compose.Pipeline(
            preprocessing.StandardScaler(),
            TorchAE(
                        build_fn = stacked_ae,
                        loss_fn = LOSS,
                        optimizer_fn = OPTIMIZER,
                        learning_rate=LEARNING_RATE,
                        seed=SEED
                )
            )
            undercomplete_ae_model = compose.Pipeline(
            preprocessing.StandardScaler(),
            TorchAE(
                        build_fn = undercomplete_ae,
                        loss_fn = LOSS,
                        optimizer_fn = OPTIMIZER,
                        learning_rate=LEARNING_RATE,
                        seed=SEED
                    )
                )
            #HalfSpaceTrees

            halfSpaceTrees = compose.Pipeline(
                preprocessing.MinMaxScaler(),
                anomaly.HalfSpaceTrees(seed=SEED)
            )
            #SGDRegressor (SVM-like with Kernel)
            sgdregressor=compose.Pipeline(
                preprocessing.StandardScaler(),
                fx.RBFSampler(),
                compat.convert_sklearn_to_river(SGDClassifier(),classes=[False, True]),
                )
            #HoeffdingAdaptiveTreeClassifier
            hoeffdingAdaptiveTreeClassifier = compose.Pipeline(
                preprocessing.StandardScaler(),
                HoeffdingAdaptiveTreeClassifier()
            )
            #Logistic Regression
            logistic_Regression = compose.Pipeline(
                preprocessing.StandardScaler(),
                linear_model.LogisticRegression(optimizer=op.SGD(0.1)))
            #models = [undercomplete_ae_model,stacked_ae_model,halfSpaceTrees,sgdregressor,hoeffdingAdaptiveTreeClassifier,logistic_Regression]
            #models = [undercomplete_ae_model,stacked_ae_model]
            models = [halfSpaceTrees]
            model_counter=0    
            for model in models:          
                counter_supervised=0
                print(model_counter)
                if model_counter==0:
                    #model_name='undercomplete_ae_model'
                    model_name='halfSpaceTrees_no2'
                if model_counter==1:
                    model_name='stacked_ae_model'
                if model_counter==2:
                    model_name='halfSpaceTrees'
                if model_counter==3:
                    model_name='sgdregressor'
                if model_counter==4:
                    model_name='hoeffdingAdaptiveTreeClassifier'
                if model_counter==5:
                    model_name='logistic_Regression'

                rocauc= river.metrics.ROCAUC(n_thresholds=100)
                data_stream = stream.shuffle(dataset[0].itertuples(index=False),N_SAMPLES, seed=SEED)
                learning_counter=0
                if model_counter<3:
                    for x, y in data_stream:
                        if learning_counter==0:
                            model.learn_one(x)
                            learning_counter=learning_counter+1
                        y_pred= model.score_one(x) #high score means outlier
                        if ls and y==0:
                            model.learn_one(x)
                        if not ls or counter_supervised==0:
                            model.learn_one(x)
                            counter_supervised=counter_supervised+1
                        y_pred= model.score_one(x) #high score means outlier
                        #if j<3:
                        #    print(y_pred)
                        #    print(y)
                        #    j=j+1
                        rocauc.update(y,y_pred)
                if model_counter>2 and ls:
                    for x, y in data_stream:
                        if learning_counter==0:
                            model.learn_one(x,y)
                            learning_counter=learning_counter+1
                        y_pred=model.predict_one(x)
                        model.learn_one(x,y)
                        rocauc.update(y,y_pred)

                model_counter=model_counter+1                 
                print(model_name)
                print(rocauc)
                evaluation=evaluation.append({
                    'model': model_name,
                    'num_neurons': [10,5,3],
                    'dataset': dataset[1],
                    'anom_percentage':dataset[2],
                    'num_samples':dataset[3],
                    'num_anom_classes':dataset[4],
                    'learn_supervised':ls,
                    'Loss':LOSS,
                    'Optimizer':OPTIMIZER,
                    'Batch_size':BATCH_SIZE,
                    'Learning_rate':LEARNING_RATE,
                    'Latent_dim': LATENT_DIM,
                    'ROC':rocauc,
                },ignore_index=True)

#evaluation.to_csv('evaluation_8_halfSpaceTrees.vers')

class 4 contains 9493 instances
class 1 contains 283301 instances
class 0 contains 211840 instances
class 6 contains 20510 instances
class 2 contains 35754 instances
class 5 contains 17367 instances
class 3 contains 2747 instances


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  covtype_df.target[covtype_df.target==i]=1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Stream contains 12500 anomalies and 12500 no-anomalies
0
halfSpaceTrees_no2
ROCAUC: 0.146988
0
halfSpaceTrees_no2
ROCAUC: 0.458393
class 4 contains 9493 instances
class 1 contains 283301 instances
class 0 contains 211840 instances
class 6 contains 20510 instances
class 2 contains 35754 instances
class 5 contains 17367 instances
class 3 contains 2747 instances
Stream contains 12500 anomalies and 12500 no-anomalies
0
halfSpaceTrees_no2
ROCAUC: 0.154865
0
halfSpaceTrees_no2
ROCAUC: 0.526625
class 4 contains 9493 instances
class 1 contains 283301 instances
class 0 contains 211840 instances
class 6 contains 20510 instances
class 2 contains 35754 instances
class 5 contains 17367 instances
class 3 contains 2747 instances
Stream contains 12500 anomalies and 12500 no-anomalies
0


KeyboardInterrupt: 

## Creditcard Evaluation

In [None]:
counter=0
counter_supervised=0
#for i in range(0,anom_percentage_credit.__len__()):
#    datasets_credit.append(make_credit_dataset(anom_percentage_credit[i],num_samples_credit,df_test))
learn_supervised=[True,False]

for i in range(0,anom_percentage_credit.__len__()):
    for j in range(0,num_anom_classes_covtype.__len__()):
        dataset=make_credit_dataset(anom_percentage_credit[i],num_samples_credit,df_test)
        for ls in learn_supervised:
            stacked_ae_model = compose.Pipeline(
            preprocessing.StandardScaler(),
            TorchAE(
                        build_fn = stacked_ae,
                        loss_fn = LOSS,
                        optimizer_fn = OPTIMIZER,
                        learning_rate=LEARNING_RATE,
                        seed=SEED
                )
            )
            undercomplete_ae_model = compose.Pipeline(
            preprocessing.StandardScaler(),
            TorchAE(
                        build_fn = undercomplete_ae,
                        loss_fn = LOSS,
                        optimizer_fn = OPTIMIZER,
                        learning_rate=LEARNING_RATE,
                        seed=SEED
                    )
                )
            halfSpaceTrees = compose.Pipeline(
                preprocessing.MinMaxScaler(),
                anomaly.HalfSpaceTrees(seed=SEED)
            )
            #SGDRegressor (SVM-like with Kernel)
            sgdregressor=compose.Pipeline(
                preprocessing.StandardScaler(),
                fx.RBFSampler(),
                compat.convert_sklearn_to_river(SGDClassifier(),classes=[False, True]),
                )
            #HoeffdingAdaptiveTreeClassifier
            hoeffdingAdaptiveTreeClassifier = compose.Pipeline(
                preprocessing.StandardScaler(),
                HoeffdingAdaptiveTreeClassifier()
            )
            #Logistic Regression
            logistic_Regression = compose.Pipeline(
                preprocessing.StandardScaler(),
                linear_model.LogisticRegression(optimizer=op.SGD(0.1)))
            #models = [undercomplete_ae_model,stacked_ae_model,halfSpaceTrees,sgdregressor,hoeffdingAdaptiveTreeClassifier,logistic_Regression]
            #models = [undercomplete_ae_model,stacked_ae_model]
            models= [halfSpaceTrees]
            model_counter=0    
            for model in models:          
                counter_supervised=0
                print(model_counter)
                if model_counter==0:
                    model_name='undercomplete_ae_model'
                if model_counter==1:
                    model_name='stacked_ae_model'
                if model_counter==2:
                    model_name='halfSpaceTrees'
                if model_counter==3:
                    model_name='sgdregressor'
                if model_counter==4:
                    model_name='hoeffdingAdaptiveTreeClassifier'
                if model_counter==5:
                    model_name='logistic_Regression'

                rocauc= river.metrics.ROCAUC(n_thresholds=100)
                data_stream = stream.shuffle(dataset[0].itertuples(index=False),N_SAMPLES, seed=SEED)
                learning_counter=0
                counter1=0
                counter2 =0                
                if model_counter<3:
                    for x, y in data_stream:
                        if learning_counter==0:
                            model.learn_one(x)
                            learning_counter=learning_counter+1
                        y_pred= model.score_one(x) #high score means outlier
                        if ls and y==0:
                            model.learn_one(x)
                        if not ls or counter_supervised==0:
                            model.learn_one(x)
                            counter_supervised=counter_supervised+1
                        y_pred= model.score_one(x) #high score means outlier

                        if y==1:
                            counter1=counter1+1
                        if y==0:
                            counter2=counter2+1
                        #if j<3:
                        #    print(y_pred)
                        #    print(y)
                        #    j=j+1
                        rocauc.update(y,y_pred)
                    print(counter1)
                    print(counter2)

                if model_counter>2 and ls:
                    for x, y in data_stream:
                        if learning_counter==0:
                            model.learn_one(x,y)
                            learning_counter=learning_counter+1
                        y_pred=model.predict_one(x)
                        model.learn_one(x,y)
                        rocauc.update(y,y_pred)

                model_counter=model_counter+1                 
                print(model_name)
                print(rocauc)
                evaluation=evaluation.append({
                    'model': model_name,
                    'num_neurons': [10,5,3],
                    'dataset': dataset[1],
                    'anom_percentage':dataset[2],
                    'num_samples':dataset[3],
                    'num_anom_classes':dataset[4],
                    'learn_supervised':ls,
                    'Loss':LOSS,
                    'Optimizer':OPTIMIZER,
                    'Batch_size':BATCH_SIZE,
                    'Learning_rate':LEARNING_RATE,
                    'Latent_dim': LATENT_DIM,
                    'ROC':rocauc,
                },ignore_index=True)
#evaluation.to_csv('evaluation_9_halfSpaceTrees.vers')

# Random/Backup

### Runtergezogenes

In [None]:
'''undercomplete_ae_model = compose.Pipeline(
    #preprocessing.StandardScaler(),
    compat.PyTorch2RiverClassifier(
                build_fn = undercomplete_ae,
                loss_fn = nn.BCELoss,
                optimizer_fn = OPTIMIZER,
                #batch_size=BATCH_SIZE,
                learning_rate=LEARNING_RATE,
                seed=SEED
    )
)
'''
'''undercomplete_ae_model = Autoencoder(
                build_fn = undercomplete_ae,
                loss_fn = LOSS,
                optimizer_fn = OPTIMIZER,
                #batch_size=BATCH_SIZE,
                learning_rate=LEARNING_RATE,
                seed=SEED
    )
'''
'''undercomplete_ae_model = compose.Pipeline(
    preprocessing.StandardScaler(),
    PyTorch2RiverClassifier(
                build_fn = undercomplete_ae,
                loss_fn = nn.MSELoss,
                optimizer_fn = OPTIMIZER,
                #batch_size=BATCH_SIZE,
                learning_rate=LEARNING_RATE,
                seed=SEED
    )
)
'''

## OLD Undercomplete Autoencoder with Softmax

In [None]:
'''def undercomplete_ae_sm(n_features, latent_dim=LATENT_DIM):
    net = nn.Sequential(
        nn.Dropout(),
        nn.Linear(n_features, 20), 
        nn.LeakyReLU(),
        nn.Linear(20, latent_dim),
        nn.LeakyReLU(),
        nn.Linear(latent_dim, 20),
        nn.LeakyReLU(), 
        nn.Linear(20, n_features),
        nn.Linear(n_features,1),
        nn.Softmax()
    )
    return net
    '''

'''undercomplete_ae_sm_model = compose.Pipeline(
    preprocessing.StandardScaler(),
    PyTorch2RiverClassifier(
                build_fn = undercomplete_ae_sm,
                loss_fn = LOSS,
                optimizer_fn = OPTIMIZER,
                #batch_size=BATCH_SIZE,
                learning_rate=LEARNING_RATE,
                seed=SEED
    )
)
'''

'''#supervised learning approach with Softmax function --> Proba gets predicted, ROC way worse
rocauc = river.metrics.ROCAUC()
#data_stream = stream.shuffle(river.datasets.CreditCard().take(8000), N_SAMPLES, seed=42)
data_stream = stream.shuffle(final_set.itertuples(index=False),N_SAMPLES, seed=SEED)
for x, y in data_stream:
    y_pred = undercomplete_ae_sm_model.predict_proba_one(x) #ruft learn_unsupervised auf, müssen wir learn_one dann überhaupt auch aufrufen?
    undercomplete_ae_sm_model.learn_one(x, y) #undercomplete_ae_sm_model.learn_one ist supervised Ansatz, wir wollen Unsupervised
    rocauc.update(y, y_pred)
rocauc
'''

## Backup

In [None]:
'''## backup if anything fails
rocauc= river.metrics.ROCAUC(n_thresholds=100)
j=0
learn_supervised=[True,False]
models = [undercomplete_ae_model,stacked_ae_model]
#learn_supervised=True
y_pred_arr_undercomplete_ae = []
y_act_arr_undercomplete_ae = []
y_pred_cf=[]
#f1 = river.metrics.F1()
#cm=river.metrics.ConfusionMatrix()
#data_stream = stream.shuffle(river.datasets.CreditCard().take(8000), N_SAMPLES, seed=SEED)
for model in models:    
    for ls in learn_supervised:
        data_stream = stream.shuffle(final_set.itertuples(index=False),N_SAMPLES, seed=SEED)
        for x, y in data_stream:
            if ls and y==0:
                undercomplete_ae_model.learn_one(x,y)
            else:
                undercomplete_ae_model.learn_one(x)
            y_pred= undercomplete_ae_model.score_one(x) #high score means outlier
            if j<5:
                print(y_pred)
                print(y)
                j=j+1
            rocauc.update(y,y_pred)
        #    if y_pred>threshhold:
        #        y_pred=1
        #    else:
        #        y_pred=0
            y_pred_arr_undercomplete_ae.append([y_pred,y])
            y_pred_cf.append(y_pred)
            y_act_arr_undercomplete_ae.append(y)

            #f1.update(y,y_pred)
            #cm.update(y,y_pred)
            #undercomplete_ae_model.learn_one(x,y_pred)
            #undercomplete_ae_model.learn_one(x,learn_unsupervised=True)
            #y_pred = undercomplete_ae_sm_model.predict_proba_one(x) #ruft learn_unsupervised auf, müssen wir learn_one dann überhaupt auch aufrufen?
            #undercomplete_ae_model.learn_one(x, y)
            #rocauc.update(y, y_pred)
        print(rocauc)
        evaluation=evaluation.append({
            'model':'undercomplete_ae_model',
            'anom_percentage_credit':anom_percentage_credit,
            'num_samples_credit':num_samples_credit,
            'Loss':LOSS,
            'Optimizer':OPTIMIZER,
            'Batch_size':BATCH_SIZE,
            'Learning_rate':LEARNING_RATE,
            'Latent_dim': LATENT_DIM,
            'ROC':rocauc,
            #'Precision_Recall_Curve':45,
            #'FP':5,
            #'TP': 4,
            #'FN':8,
            #'TN':10

        },ignore_index=True)
        '''