In [None]:
import re
from math import isclose

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist, pdist, squareform

from src.DataLoader import DataLoader
from src.FeatureExtraction import FeatureExtraction
from src.LogCluster import LogCluster

Fit the feature extraction on normal (non-anomalous) instances only, then check whether anomalies are separable from normal logs under different weighting types.

In [None]:
# Load the structured log lines and their labels.
dl = DataLoader(False)
x_raw, y_raw = dl.load_csv(
    'data/HDFS100k/log_structured.csv',
    'data/HDFS100k/log_labels.csv',
)
y_raw = np.array(y_raw)  # array form so the labels can serve as a boolean mask

# Fit the session windowing on the rows labelled 0 only.
fe = FeatureExtraction("EventId", False)
rows_labelled_zero = x_raw[y_raw == 0]
# NOTE(review): the *unfiltered* y_raw is passed next to the filtered rows —
# confirm that session_windowing expects the full label array here.
x, y = fe.session_windowing(rows_labelled_zero, r'(blk_-?\d+)', "Content", y_raw)

# Transform the complete data set (all labels) with the fitted extractor.
x_validate, y_validate = fe.transform(x_raw, y_raw)

Print basic information about the dataset. We can see that normal logs are missing some events (E7, E8 ... E27), which usually contain keywords such as "exception" and "interrupt".

In [None]:
# Summary of the session-level data set produced by fe.transform.
normal_session_count = sum(1 for label in y_validate if label == 0)
anomalous_session_count = sum(1 for label in y_validate if label == 1)

print("Number of sessions: ", len(x_validate))
print("Normal sessions: ", normal_session_count)
print("Anomalous sessions: ", anomalous_session_count)

print("Number of unique event types: ", len(fe.events))

print()

# Column totals per class; an event "occurs" in a class when its total is > 0.
# `normal` and `anomaly` are reused by later cells.
normal = x_validate[y_validate == 0].sum(axis=0)
normal_events = normal[normal > 0].index.values
print(f"Unique events in normal sessions: {len(normal_events)} -> {normal_events}")

anomaly = x_validate[y_validate == 1].sum(axis=0)
anomaly_events = anomaly[anomaly > 0].index.values
print(f"Unique events in anomalous sessions: {len(anomaly_events)} -> {anomaly_events}")

# From here on `anomaly_events` is the *set* of events seen only in anomalous
# sessions; plot_features() reads this notebook-level name to colour tick labels.
anomaly_events = set(anomaly_events) - set(normal_events)
print(f"Unique events in anomalous sessions not in normal sessions: {len(anomaly_events)} -> {anomaly_events}")

print()

# Hand-copied event templates, printed for reference next to the sets above.
first_template_group = (
    "E2: Verification succeeded for <*>",
    "E3: Served block <*> to /<*>",
    "E5: Receiving block <*> src: /<*> dest: /<*>",
    "E6: Received block <*> src: /<*> dest: /<*> of size <*>",
    "E9: Received block <*> of size <*> from /<*>",
    "E11: PacketResponder <*> for block <*> terminating",
    "E16: <*>:Transmitted block <*> to /<*>",
    "E18: <*> Starting thread to transfer block <*> to <*>",
    "E21: Deleting block <*> file <*>",
    "E22: BLOCK* NameSystem.allocateBlock:<*>",
    "E25: BLOCK* ask <*> to replicate <*> to datanode(s) <*>",
    "E26: BLOCK* NameSystem.addStoredBlock: blockMap updated: <*> is added to <*> size <*>",
)
# Presumably the templates of the anomaly-only events — compare with the set printed above.
second_template_group = (
    "E7: writeBlock <*> received exception <*>",
    "E8: PacketResponder <*> for block <*> Interrupted.",
    "E10: PacketResponder <*> <*> Exception <*>",
    "E13: Receiving empty packet for block <*>",
    "E14: Exception in receiveBlock for block <*> <*>",
    "E15: Changing block file offset of block <*> from <*> to <*> meta file offset to <*>",
    "E27: BLOCK* NameSystem.addStoredBlock: Redundant addStoredBlock request received for <*> on <*> size <*>",
)

for template in first_template_group:
    print(template)

print()

for template in second_template_group:
    print(template)

In [None]:
# Per-session averages: class totals divided by the number of sessions in that class.
normal_total_sessions = len(x_validate[y_validate == 0])
anomalous_total_sessions = len(x_validate[y_validate == 1])
normal_average = normal / normal_total_sessions
anomaly_average = anomaly / anomalous_total_sessions


def as_sorted_row(series):
    """Render a Series as one wide text row, largest value first."""
    return series.sort_values(ascending=False).to_frame().T.to_string()


print("Average counts of events in normal sessions: \n", as_sorted_row(normal_average))
print()
print("Average counts of events in anomalous sessions: \n", as_sorted_row(anomaly_average))
print()
print("Differences in average counts of events in normal and anomalous sessions: \n", as_sorted_row((normal_average - anomaly_average).abs()))

We can visualise how often each event occurs in normal versus anomalous data. For better clarity, duplicate sequences are removed first.

In [None]:
# Keep one copy of every distinct feature vector per class so identical
# sessions are not drawn on top of each other.
sessions_labelled_normal = x_validate[y_validate == 0]
sessions_labelled_anomalous = x_validate[y_validate == 1]
normal_deduplicated = sessions_labelled_normal.drop_duplicates()
anomaly_deduplicated = sessions_labelled_anomalous.drop_duplicates()

def plot_features(norm, anom, filter_col="E3", max_count=200, highlight=None):
    """Draw every session's feature vector as one line, normal vs. anomalous.

    Side effect (kept on purpose): a ``"Label"`` column is written into both
    ``norm`` and ``anom`` in place. Later cells of this notebook strip that
    column again (``drop(columns=["Label"])`` / ``[:-1]``), so they rely on it.

    Parameters
    ----------
    norm, anom : pandas.DataFrame
        Feature rows of the normal / anomalous sessions (one column per event).
    filter_col : str, default "E3"
        Column used to hide extreme sessions from the plot.
    max_count : number, default 200
        Only sessions with ``filter_col`` strictly below this value are drawn.
    highlight : collection of str, optional
        Event names whose tick labels are drawn bold red. Defaults to the
        notebook-level ``anomaly_events``.

    Returns
    -------
    matplotlib.figure.Figure
        The figure containing the plot.
    """
    norm["Label"] = "Normal"
    anom["Label"] = "Anomaly"
    combined = pd.concat([norm, anom])  # not called `all`, which would shadow the builtin
    print(combined.to_string())

    # Hide the sessions (rows) with a very large `filter_col` value; they would
    # stretch the y-axis and make the remaining lines unreadable.
    readable = combined[combined[filter_col] < max_count]

    if highlight is None:
        highlight = anomaly_events

    fig, ax = plt.subplots(figsize=(9, 5))

    def plot_group(group, label):
        """Plot one line per row of `group`; only the first line gets a legend entry."""
        if group.empty:
            # Nothing left after filtering: ax.plot cannot handle a zero-column y.
            return
        is_normal = label == "Normal"
        lines = ax.plot(
            group.columns.values,
            group.T.values,
            color="blue" if is_normal else "orange",
            linewidth=3 if is_normal else 1,
            linestyle="-" if is_normal else "--",
        )
        # The other lines keep matplotlib's underscore-prefixed default labels,
        # which the legend ignores.
        lines[0].set_label(label)

    for label in ("Normal", "Anomaly"):
        plot_group(readable[readable["Label"] == label].drop(columns=["Label"]), label)

    # Tick labels: green by default, bold red for the highlighted events.
    for tick in ax.get_xticklabels():
        if tick.get_text() in highlight:
            tick.set_color('red')
            tick.set_weight('bold')
        else:
            tick.set_color('green')

    ax.set_xlabel("Event")
    ax.set_ylabel("Feature value per session")
    ax.legend()

    return fig
        
# Plot the raw (unweighted) feature vectors of the distinct sessions.
# Note: plot_features also writes a "Label" column into both frames in place.
fig = plot_features(normal_deduplicated, anomaly_deduplicated)
# Uncomment to export the figure:
# fig.savefig("count_matrix.pdf")

How does inverse document frequency (IDF) weighting influence the difference between normal and anomalous logs?

In [None]:
# Weighted copy of the validation features.
# NOTE(review): the positional True presumably switches on IDF weighting (per the
# notebook text) — confirm against FeatureExtraction.apply_weighting.
x_w = fe.apply_weighting(x_validate, True)

# Events whose total count over the normal sessions exceeds 10.
frequent_in_normal = normal[normal > 10]
print(frequent_in_normal)

# Distinct weighted rows for label 0 (normal) and label 1 (anomalous).
normal_w, anomaly_w = (
    x_w[y_validate == label_value].drop_duplicates() for label_value in (0, 1)
)

fig = plot_features(normal_w, anomaly_w)
# Uncomment to export the figure:
# fig.savefig("idf_weighting.pdf")

And how does contrast weighting influence the features?

In [None]:
# Weighted copy of the validation features with both positional flags enabled.
# NOTE(review): the second True presumably adds the contrast weighting mentioned
# in the notebook text — confirm against FeatureExtraction.apply_weighting.
x_wc = fe.apply_weighting(x_validate, True, True)

# Distinct weighted rows for label 0 (normal) and label 1 (anomalous).
# plot_features() below adds a "Label" column to both frames; later cells rely on it.
normal_wc, anomaly_wc = (
    x_wc[y_validate == label_value].drop_duplicates() for label_value in (0, 1)
)

fig = plot_features(normal_wc, anomaly_wc)
# Uncomment to export the figure:
# fig.savefig("idf_weighting_context.pdf")

Apply clustering with contrast weights and see how similar the anomalous events are to the normal ones.

In [None]:
# Weight the training sessions (built from rows labelled 0) with the same flags as x_wc.
x2 = fe.apply_weighting(x, True, True)

# Positional LogCluster settings, passed through unchanged.
# NOTE(review): their meaning is not visible in this notebook — check LogCluster's signature.
logcluster_settings = (1e-6, 0, True, True)
model = LogCluster(*logcluster_settings)
model.fit(x2)

def distance(model, x):
    """Return the cosine distance from ``x`` to the closest centroid of ``model``.

    Parameters
    ----------
    model : object
        Anything exposing ``centroids``, a non-empty sequence of equally long
        numeric vectors. It is only read, never modified.
    x : sequence of numbers (list, numpy array or pandas Series)
        The feature vector to compare against the centroids.

    Returns
    -------
    numpy.float64
        The smallest cosine distance between ``x`` and any centroid.

    Raises
    ------
    ValueError
        If the model has no centroids.
    """
    centroids = np.asarray(model.centroids, dtype=float)
    if centroids.size == 0:
        raise ValueError("model has no centroids to compare against")

    query = np.asarray(x, dtype=float).reshape(1, -1)
    # cdist computes only the 1 x k distances that are needed, instead of the
    # full (k + 1) x (k + 1) pairwise matrix among all centroids.
    return np.min(cdist(query, centroids, metric='cosine'))

In [None]:
# Call the model's private _synchronize_events with the anomalous feature rows
# (without the "Label" column that plot_features added to anomaly_wc).
# NOTE(review): presumably this aligns the model's event columns with this frame;
# the return value is discarded, so it is called for a side effect — confirm in LogCluster.
model._synchronize_events(anomaly_wc.drop(columns=["Label"]))

In [None]:
# plot_features() added a "Label" column to both frames. Dropping it by name
# (instead of slicing off the last column) fails loudly if the plotting cell
# was skipped, rather than silently discarding a real feature.
normal_features = normal_wc.drop(columns=["Label"]).to_numpy(dtype=float)
anomaly_features = anomaly_wc.drop(columns=["Label"]).to_numpy(dtype=float)

print("Normal sessions:")
for features in normal_features:
    print(distance(model, features.tolist()))

print()

print("Anomalous sessions:")
# Every weighted session whose label is not 0, selected once up front instead
# of re-scanning all sessions for each distinct anomalous row.
# Assumes y_validate is aligned row-for-row with x_wc, as in the cells above.
sessions_not_normal = x_wc[y_validate != 0].to_numpy(dtype=float)

result = []
for features in anomaly_features:
    # Row-wise comparison with the tolerance math.isclose uses by default
    # (relative 1e-9, no absolute tolerance); a session matches when all
    # features match.
    matches = np.isclose(sessions_not_normal, features, rtol=1e-9, atol=0.0).all(axis=1)
    result.append((int(matches.sum()), distance(model, features.tolist())))

for cnt, dist in result:
    print(f"Counts: {cnt}, Distance: {dist}")

# Sanity check: the per-pattern counts should add up to the number of anomalous sessions.
allsum = sum(cnt for cnt, _ in result)
print("Overall anomalous sessions (checking correctness): ", allsum)

Even differently weighted normal events can be used to detect anomalies.

In [None]:
# Take the first weighted session and show how its distance to the nearest
# centroid changes when only the "E2" value is overwritten.
# NOTE(review): the "Normal sequence" title assumes the first session is normal — confirm.
data = x_wc.iloc[0].copy()

# (title, new value for "E2"); None leaves the sequence untouched.
variants = (
    ("Normal sequence:", None),
    ("Modified sequence:", 0.5),
    ("Modified sequence 2:", 1),
)

for position, (title, e2_value) in enumerate(variants):
    if position > 0:
        print()
    print(title)
    if e2_value is not None:
        data["E2"] = e2_value
    print(data.to_frame().T.to_string())
    print("Distance", distance(model, data))

We can see that for some anomalous events the distance is quite large (I suspect these are the events that are nicely separated in the graphs). On the other hand, there are a handful of events that match the input data almost exactly; that is probably why LogCluster is ineffective (low recall). To test this, get the block IDs of the "nicely separable" anomalies and create synthetic data (data/synthetic/log_structured).

In [None]:
# Collect the unique block ids in order of first appearance in the raw log.
# (`re` is imported in the notebook's import cell.)
block_id_pattern = re.compile(r'(blk_-?\d+)')
first_matches = (block_id_pattern.search(content) for content in x_raw["Content"])

# dict.fromkeys de-duplicates with constant-time lookups while keeping insertion
# order, unlike a growing list that is scanned for every log line. Lines without
# a block id are skipped instead of raising an IndexError.
res = list(dict.fromkeys(match.group(1) for match in first_matches if match is not None))

print(res)

In [None]:
# Print the block id of every distinct anomalous session that lies further than
# the threshold from its nearest centroid.
# NOTE(review): res[session_idx] assumes the session index equals the position of
# the block id in first-appearance order — confirm against fe.transform.
separable_distance_threshold = 0.1
for session_idx, session in anomaly_wc.iterrows():
    features = session.to_list()[:-1]  # strip the trailing "Label" column
    if distance(model, features) > separable_distance_threshold:
        print(res[session_idx])