In [1]:
import csv, json, sys
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn import preprocessing

In [2]:
# Read the event log from disk into `data`.
# Later cells expect `data["traces"]` to be a list of traces, each with an
# "events" list and an "attributes" mapping.

# The context manager closes the file even if json.load raises, and the handle
# is no longer bound to a name that shadows the builtin `input`.
with open("huge-0.1-1.json") as log_file:
    data = json.load(log_file)


In [3]:
##Extracting the feature for capturing Incorrect user anomaly##

# One entry per trace. Each entry is the NUMBER of events in that trace whose
# "user" attribute is not contained in the event's "_possible_users" attribute
# (0 means no such event was found; values above 1 are possible).
Anomaly_value_incorrect_user=np.zeros(len(data["traces"]))

for i, trace in enumerate(data["traces"]):
    for event in trace["events"]:
        try:
            attributes=event["attributes"]
            possible_users=attributes["_possible_users"]
            user=attributes["user"]
            is_incorrect=user not in possible_users
        except (KeyError, TypeError):
            # Best effort: events that lack the user attributes, or whose values
            # cannot be used in a membership test, contribute nothing. Only these
            # data-shape errors are skipped, so Ctrl-C and real bugs still surface.
            continue
        Anomaly_value_incorrect_user[i]+=int(is_incorrect)

In [4]:
##Extracting the feature for capturing Duplicate Sequence anomaly##

# One entry per trace: the number of positions at which an event has the same
# "name" as the event immediately following it.
Anomaly_value_duplicate=np.zeros(len(data["traces"]))

for trace_idx, trace in enumerate(data["traces"]):
    events=trace["events"]
    # Pair every event with its successor; an empty or single-event trace yields no pairs.
    for current_event, next_event in zip(events, events[1:]):
        if current_event["name"]==next_event["name"]:
            Anomaly_value_duplicate[trace_idx]+=1

In [5]:
##Collect the distinct event-name sequences of all traces labelled "normal".
##They serve as the reference set when looking for switching and skipping anomalies.

# Every normal trace is rendered as one string: each event name prefixed by " / ".
Act_list=[
    "".join(" / "+str(event["name"]) for event in trace["events"])
    for trace in data["traces"]
    if trace["attributes"]["label"]=="normal"
]

# Activity is kept as an empty string for any later cell that builds on it.
Activity=str()

# De-duplicate the rendered sequences (the resulting order is arbitrary).
Activity_list=list(set(Act_list))

# Split the strings back into lists of names. Because every string starts with
# " / ", each list begins with an empty-string element.
Activity_unique=[sequence.split(" / ") for sequence in Activity_list]    ##Activity_unique contains the set of unique normal traces in this data

In [6]:
##Extracting the features for capturing anomalies with switched and skipped events##

# Binary per-trace flags. Both stay 0 for a trace whose event sequence is exactly
# one of the normal sequences in Activity_unique.
Switching=np.zeros(len(data["traces"]))
Skipping=np.zeros(len(data["traces"]))

# Loop-invariant lookups, built once instead of being rebuilt for every trace.
normal_sequences={tuple(sequence) for sequence in Activity_unique}
normal_event_sets=[set(sequence) for sequence in Activity_unique]

for i, trace in enumerate(data["traces"]):
    # Render and split the trace with the same " / " scheme that produced
    # Activity_unique, so both sides are tokenised identically (including the
    # leading empty-string element). The string is built from scratch here, so
    # the cell no longer depends on a variable left behind by an earlier cell.
    Activity_temp="".join(" / "+str(event["name"]) for event in trace["events"]).split(" / ")

    if tuple(Activity_temp) in normal_sequences:
        continue    # exact match with a normal sequence: neither flag is set

    # Number of normal sequences made of exactly the same set of event names.
    trace_event_set=set(Activity_temp)
    same_set_count=sum(event_set==trace_event_set for event_set in normal_event_sets)

    if same_set_count==1:
        # Same events as exactly one normal sequence but in a different arrangement.
        # A trace with an immediately repeated event is left unflagged here.
        has_adjacent_repeat=any(first==second for first, second in zip(Activity_temp, Activity_temp[1:]))
        if not has_adjacent_repeat:
            Switching[i]=1
    else:
        # NOTE(review): this branch also catches traces whose event set matches
        # MORE than one normal sequence, not only traces with missing events -
        # confirm that flagging those as Skipping is intended.
        Skipping[i]=1

In [7]:
##Stacking the four per-trace features into a feature matrix (one row per trace,
##one column per feature) and clustering the rows with KMeans.
X=np.column_stack((Anomaly_value_incorrect_user,Anomaly_value_duplicate,Switching,Skipping))

# n_init is pinned to 10 because scikit-learn changed its default from 10 to "auto"
# (a single initialisation for k-means++). The purity cells below address clusters
# by fixed ids, so the fit must not vary with the installed library version.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=0).fit(X)

In [8]:
##Ground-truth label of every trace, used to evaluate the purity of each cluster##

# A trace's "label" attribute is either the string "normal" or a container whose
# "anomaly" entry is taken as the label.
Lables=[
    "normal" if trace["attributes"]["label"]=="normal" else trace["attributes"]["label"]["anomaly"]
    for trace in data["traces"]
]

# Encode the label values as integers so they can be compared with integer ids.
le = preprocessing.LabelEncoder()
L=le.fit_transform(Lables)

In [9]:
##Computing the purity of cluster with normal traces##

# Cluster 0 is treated as the cluster of normal traces (cluster id is hard-coded).
# The encoded id of "normal" is looked up from the fitted encoder instead of being
# hard-coded as 5, so it stays correct if the set of label values changes.
normal_code=le.transform(["normal"])[0]

in_cluster=kmeans.labels_==0
normal_true=L[in_cluster]
normal_pred=np.full(len(normal_true), normal_code)

# Percentage of the cluster's members whose true label is "normal".
Purity_score=np.count_nonzero(normal_true==normal_pred)/len(normal_true)*100
print("The purity of cluster with normal traces is %f %% " %Purity_score)

The purity of cluster with normal traces is 97.478261 % 


In [10]:
##Computing the purity of cluster with traces with Duplicate entries anomaly##

# Members of cluster 1 are compared against encoded label 0
# (presumably the duplicate-entries anomaly - verify against le.classes_).
duplicate_members=kmeans.labels_==1
Dupicate_true=L[duplicate_members]
Duplicate_pred=np.full(Dupicate_true.shape[0], 0)

# Percentage of the cluster's members carrying the expected label.
matching=sum(Dupicate_true==Duplicate_pred)
Purity_score=matching/len(Duplicate_pred)*100
print("The purity of cluster with Duplicate entries anomaly is %f %% " %Purity_score)

The purity of cluster with Duplicate entries anomaly is 100.000000 % 


In [11]:
##Computing the purity of cluster with traces with switching anomaly##

# Members of cluster 2 are compared against encoded label 4
# (presumably the switching anomaly - verify against le.classes_).
switching_members=kmeans.labels_==2
Switching_true=L[switching_members]
Switching_pred=np.full(Switching_true.shape[0], 4)

# Percentage of the cluster's members carrying the expected label.
matching=sum(Switching_true==Switching_pred)
Purity_score=matching/len(Switching_pred)*100
print("The purity of cluster with switching anomaly is %f %% " %Purity_score)

The purity of cluster with switching anomaly is 100.000000 % 


In [12]:
##Computing the purity of cluster with traces with skipping anomaly##

# Members of cluster 3 are compared against encoded label 3
# (presumably the skipping anomaly - verify against le.classes_).
skipping_members=kmeans.labels_==3
Skipping_true=L[skipping_members]
Skipping_pred=np.full(Skipping_true.shape[0], 3)

# Percentage of the cluster's members carrying the expected label.
matching=sum(Skipping_true==Skipping_pred)
Purity_score=matching/len(Skipping_pred)*100
print("The purity of cluster with skipping anomaly is %f %% " %Purity_score)

The purity of cluster with skipping anomaly is 100.000000 % 


In [13]:
##Computing the purity of cluster with traces with Incorrect user anomaly##

# Members of cluster 4 are compared against encoded label 1
# (presumably the incorrect-user anomaly - verify against le.classes_).
incorrect_user_members=kmeans.labels_==4
Incorrect_user_true=L[incorrect_user_members]
Incorrect_user_pred=np.full(Incorrect_user_true.shape[0], 1)

# Percentage of the cluster's members carrying the expected label.
matching=sum(Incorrect_user_pred==Incorrect_user_true)
Purity_score=matching/len(Incorrect_user_true)*100
print("The purity of cluster with incorrect user anomaly is %f %% " %Purity_score)

The purity of cluster with incorrect user anomaly is 100.000000 % 
