In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Read the packet CSV and drop the 'ProtocolName' column in a single expression.
# Replace the path with whatever dataset you would like.
df = pd.read_csv('../py_input_files/AMAZON_packets.csv').drop(columns=['ProtocolName'])

In [None]:
#MAKE PREPROCESSING PIPELINE
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import FunctionTransformer

def Numpy_to_df(x):
    """Wrap ``x`` in a pandas DataFrame and return it.

    Passed to ``FunctionTransformer`` as the last preprocessing step so the
    pipeline's output is a DataFrame.
    """
    return pd.DataFrame(x)

# Preprocessing chain: standardize the features, project them onto 7 principal
# components, then convert the resulting array into a DataFrame.
preprocessing_steps = [
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=7)),
    ('to_dataframe', FunctionTransformer(Numpy_to_df)),
]
preprocessing_pipeline = Pipeline(steps=preprocessing_steps)

In [None]:
# Scale and PCA-project the raw features. The pipeline's last step converts the
# result to a DataFrame, so pca_df holds one column per principal component.
# (The pipeline object is named `preprocessing_pipeline`; there is no `pipeline`.)
pca_df = preprocessing_pipeline.fit_transform(df)

# Work on an explicit, independent copy: the anomaly-flag columns added to AD_df
# later must never appear in pca_df, which is what the detectors are fitted on.
AD_df = pca_df.copy()

import random

# Draw the anomaly fraction uniformly between 1% and 3%; it is reused below as the
# IsolationForest contamination and as the percentile cut-off for the other detectors.
# NOTE(review): no seed is set, so this value differs on every run.
lower_bound, upper_bound = 0.01, 0.03
threshold = random.uniform(lower_bound, upper_bound)
print("Random threshold:", threshold)

In [None]:
#MAKE ANOMALY DETECTION PIPELINE
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import NearestNeighbors
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Small dense autoencoder: 7 inputs -> 4-unit ReLU bottleneck -> 7 outputs.
# The input/output width of 7 must match the number of PCA components it is fed.
autoencoder = Sequential([
    Dense(4, activation='relu', input_shape=(7,)),
    # Linear output layer: the reconstruction target is standardized, PCA-projected
    # data, which is zero-centred and unbounded. A sigmoid can only emit values in
    # (0, 1), so it could never reconstruct negative or large scores and the
    # per-row MSE would mostly reflect magnitude instead of reconstruction difficulty.
    Dense(7, activation='linear')
])

# Mean squared error is the reconstruction loss that is later thresholded per row.
autoencoder.compile(optimizer='adam', loss='mse')


# Bundle the three detectors under short names; the following cell fetches each
# one individually through `ml_pipeline.named_steps[...]`.
# NOTE(review): the Pipeline is used purely as a named container here. The first
# two steps are presumably not transformers, so calling fit/predict on
# `ml_pipeline` itself is not expected to work - confirm before doing so.
detector_steps = [
    ('IF', IsolationForest(contamination=threshold, random_state=42)),
    ('KNN', NearestNeighbors(n_neighbors=5)),
    ('AE', autoencoder),
]
ml_pipeline = Pipeline(steps=detector_steps)

In [None]:
#Obtain IF Anomalies
# fit_predict labels inliers as 1 and outliers as -1; store them as 0/1 flags
# (1 = anomaly) so they line up with the other detectors' columns.
if_model = ml_pipeline.named_steps['IF']
IF_anomalies = if_model.fit_predict(pca_df)
AD_df['IF_anomalies'] = pd.Series(IF_anomalies, index=AD_df.index).map({1: 0, -1: 1})

#Obtain KNN Anomalies
# Fit the neighbour index on the PCA scores, then query it with the same points.
knn_model = ml_pipeline.named_steps['KNN'].fit(pca_df)
distances, _ = knn_model.kneighbors(pca_df)
# Distance to the farthest of the returned neighbours is the per-row anomaly score.
# NOTE(review): because the query set is the training set, each point is presumably
# returned as its own nearest neighbour (distance 0), so this is the distance to
# the 4th *other* point - confirm that is intended.
kth_distance = distances[:, -1]
# Rows whose score lies above the (100 - threshold%) percentile are flagged.
knn_threshold = 100 - threshold * 100
distance_cutoff = np.percentile(kth_distance, knn_threshold)
knn_anomalies = pca_df[kth_distance > distance_cutoff]
AD_df['KNN_anomalies'] = AD_df.index.isin(knn_anomalies.index).astype('int64')

#Obtain AE Anomalies
# Train the autoencoder to reproduce its own input, then score every row by its
# mean squared reconstruction error.
ae_model = ml_pipeline.named_steps['AE']
ae_model.fit(pca_df, pca_df, epochs=100, batch_size=32, shuffle=True)
preds = autoencoder.predict(pca_df)
reconstruction_error = pca_df - preds
mse = (reconstruction_error ** 2).mean(axis=1)
# 'mse' is attached only temporarily so that the AE_anomalies slice carries it.
AD_df['mse'] = mse
# Rows whose error lies above the (100 - threshold%) percentile are flagged.
AE_threshold = np.percentile(mse, 100 - threshold * 100)
AE_anomalies = AD_df[mse > AE_threshold]
AD_df['AE_anomalies'] = AD_df.index.isin(AE_anomalies.index).astype('int64')
del AD_df['mse']

In [None]:
#OBTAIN FINAL ANOMALIES
# A row is a final anomaly only when all three detectors flagged it.
detector_flags = AD_df[['IF_anomalies', 'KNN_anomalies', 'AE_anomalies']]
AD_df['Final_anomalies'] = detector_flags.eq(1).all(axis=1).astype('int64')

In [None]:
# Display the PCA scores together with the per-detector and final anomaly flags.
AD_df

In [None]:
# Count how many rows ended up flagged (1) versus not flagged (0).
AD_df['Final_anomalies'].value_counts()