In [1]:
import csv, json, sys
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, AffinityPropagation, MeanShift, SpectralClustering, AgglomerativeClustering, DBSCAN
from sklearn import preprocessing
from statistics import mode
import collections
import pickle
from sklearn import tree, svm
from sklearn.decomposition import PCA
import sklearn
from sklearn import mixture
from sklearn import metrics

In [2]:
# Load the pickled anomaly scores (reshaped into the feature matrix further down).
# pickle.load can execute arbitrary code, so only open files from a trusted source.
with open('anomaly_scores_23_1', 'rb') as scores_file:
    Anomaly_scores_DEA = pickle.load(scores_file)

In [3]:
# Load the pickled ground-truth targets used to build Labels_true.
# pickle.load can execute arbitrary code, so only open files from a trusted source.
with open('Labels_23_1.pickle', 'rb') as labels_file:
    targets = pickle.load(labels_file)

In [4]:
# Lay the anomaly scores out as a 12500 x 44 matrix, the input to every clustering below.
# presumably one row per entry in `targets` -- TODO confirm.
Feature_matrix = Anomaly_scores_DEA.reshape((12500, 44))

In [5]:
# Encode the ground-truth targets as numeric class ids (kept as floats, as np.zeros
# produces): 0 = normal, 1-5 = the anomaly types in the table below.
anomaly_class_ids = {
    'DuplicateSequence': 1,
    'IncorrectLongTermDependency': 2,
    'SkipSequence': 3,
    'SwitchEvents': 4,
    'IncorrectAttribute': 5,
}

# Size the vector from the data rather than a hard-coded 12500: a longer `targets`
# used to raise IndexError, and a shorter one silently left trailing zeros
# (i.e. phantom "normal" cases).
Labels_true = np.zeros(len(targets))
for idx, target in enumerate(targets):
    if target == 'normal':
        continue  # already initialised to 0
    # NOTE(review): an anomaly name missing from the table keeps label 0 and is
    # therefore scored as "normal", exactly as before -- confirm that is intended.
    Labels_true[idx] = anomaly_class_ids.get(target['anomaly'], 0)

In [6]:
# K-means with six clusters (one per ground-truth class id); seeded for repeatability.
kmeans = KMeans(random_state=0, n_clusters=6)
kmeans.fit(Feature_matrix)
# Cluster ids and how many rows landed in each.
np.unique(kmeans.labels_, return_counts=True)

(array([0, 1, 2, 3, 4, 5]),
 array([11896,   138,   154,    89,   134,    89], dtype=int64))

In [None]:
# Score the K-means partition. The first five metrics compare the clusters with
# Labels_true; silhouette and Calinski-Harabasz are computed from Feature_matrix alone.
# NOTE: the "Adj" prefix is only part of the printed label -- of these metrics,
# only the Rand and mutual-information scores are chance-adjusted.
# scikit-learn renamed calinski_harabaz_score to calinski_harabasz_score and later
# removed the old spelling; use whichever name this installation provides.
calinski_harabasz = getattr(metrics, 'calinski_harabasz_score', None) or metrics.calinski_harabaz_score
print("K-MEANS")
print("Adj rand score:",metrics.adjusted_rand_score(Labels_true, kmeans.labels_))
print("Adj mutual score:", metrics.adjusted_mutual_info_score(Labels_true, kmeans.labels_))
print("Adj homogenity score:", metrics.homogeneity_score(Labels_true, kmeans.labels_))
print("Adj complteness score:", metrics.completeness_score(Labels_true, kmeans.labels_))
print("Adj v_measure score:", metrics.v_measure_score(Labels_true, kmeans.labels_))
print("Adj silhout score:", metrics.silhouette_score(Feature_matrix, kmeans.labels_))
print("Adj calinski score:", calinski_harabasz(Feature_matrix, kmeans.labels_))

K-MEANS
Adj rand score: 0.594935240845
Adj mutual score: 0.288075915477
Adj homogenity score: 0.289395533248
Adj complteness score: 0.506042391258
Adj v_measure score: 0.368215804533
Adj silhout score: 0.901841917369
Adj calinski score: 1804.48304617


In [None]:
# Spectral clustering with six clusters. The estimator is stochastic, so it is seeded
# with random_state=0 (the same seed as the K-means cell) to make a
# Restart & Run All reproduce the same partition.
Spectral = SpectralClustering(n_clusters=6, random_state=0).fit(Feature_matrix)
# Cluster ids and how many rows landed in each.
np.unique(Spectral.labels_, return_counts=True)

(array([0, 1, 2, 3, 4, 5]),
 array([11836,   107,    98,   192,   112,   155], dtype=int64))

In [None]:
# Score the spectral partition. The first five metrics compare the clusters with
# Labels_true; silhouette and Calinski-Harabasz are computed from Feature_matrix alone.
# NOTE: the "Adj" prefix is only part of the printed label -- of these metrics,
# only the Rand and mutual-information scores are chance-adjusted.
# scikit-learn renamed calinski_harabaz_score to calinski_harabasz_score and later
# removed the old spelling; use whichever name this installation provides.
calinski_harabasz = getattr(metrics, 'calinski_harabasz_score', None) or metrics.calinski_harabaz_score
print("Spectral")
print("Adj rand score:", metrics.adjusted_rand_score(Labels_true, Spectral.labels_))
print("Adj mutual score:", metrics.adjusted_mutual_info_score(Labels_true, Spectral.labels_))
print("Adj homogenity score:", metrics.homogeneity_score(Labels_true, Spectral.labels_))
print("Adj complteness score:", metrics.completeness_score(Labels_true, Spectral.labels_))
print("Adj v_measure score:", metrics.v_measure_score(Labels_true, Spectral.labels_))
print("Adj silhout score:", metrics.silhouette_score(Feature_matrix, Spectral.labels_))
print("Adj calinski score:", calinski_harabasz(Feature_matrix, Spectral.labels_))

Spectral
Adj rand score: 0.638349292351
Adj mutual score: 0.32178585616
Adj homogenity score: 0.323033422975
Adj complteness score: 0.523770450394
Adj v_measure score: 0.399609323398
Adj silhout score: 0.899660529151
Adj calinski score: 1773.47097878


In [None]:
# Agglomerative clustering, Ward linkage, six clusters.
# `Hierarichial` is reassigned by the later linkage cells, so run cells in order.
Hierarichial = AgglomerativeClustering(linkage='ward', n_clusters=6)
Hierarichial.fit(Feature_matrix)
# Cluster ids and how many rows landed in each.
np.unique(Hierarichial.labels_, return_counts=True)

(array([0, 1, 2, 3, 4, 5], dtype=int64),
 array([  270,   128,   137, 11760,   119,    86], dtype=int64))

In [None]:
# Score the Ward-linkage partition currently held in `Hierarichial`. The first five
# metrics compare the clusters with Labels_true; silhouette and Calinski-Harabasz
# are computed from Feature_matrix alone.
# NOTE: the "Adj" prefix is only part of the printed label -- of these metrics,
# only the Rand and mutual-information scores are chance-adjusted.
# scikit-learn renamed calinski_harabaz_score to calinski_harabasz_score and later
# removed the old spelling; use whichever name this installation provides.
calinski_harabasz = getattr(metrics, 'calinski_harabasz_score', None) or metrics.calinski_harabaz_score
print("Hierarichial-Ward")
print("Adj rand score:",metrics.adjusted_rand_score(Labels_true, Hierarichial.labels_))
print("Adj mutual score:", metrics.adjusted_mutual_info_score(Labels_true, Hierarichial.labels_))
print("Adj homogenity score:", metrics.homogeneity_score(Labels_true, Hierarichial.labels_))
print("Adj complteness score:", metrics.completeness_score(Labels_true, Hierarichial.labels_))
print("Adj v_measure score:", metrics.v_measure_score(Labels_true, Hierarichial.labels_))
print("Adj silhout score:", metrics.silhouette_score(Feature_matrix, Hierarichial.labels_))
print("Adj calinski score:", calinski_harabasz(Feature_matrix, Hierarichial.labels_))

Hierarichial-Ward
Adj rand score: 0.690002679623
Adj mutual score: 0.354483736399
Adj homogenity score: 0.355667149369
Adj complteness score: 0.532911611017
Adj v_measure score: 0.426611938088
Adj silhout score: 0.898744434245
Adj calinski score: 1577.63964774


In [None]:
# Agglomerative clustering, average linkage, six clusters.
# Overwrites the previous `Hierarichial` model, so run cells in order.
Hierarichial = AgglomerativeClustering(linkage='average', n_clusters=6)
Hierarichial.fit(Feature_matrix)
# Cluster ids and how many rows landed in each.
np.unique(Hierarichial.labels_, return_counts=True)

(array([0, 1, 2, 3, 4, 5], dtype=int64),
 array([12451,    15,    14,     1,     2,    17], dtype=int64))

In [None]:
# Score the average-linkage partition currently held in `Hierarichial`. The first five
# metrics compare the clusters with Labels_true; silhouette and Calinski-Harabasz
# are computed from Feature_matrix alone.
# NOTE: the "Adj" prefix is only part of the printed label -- of these metrics,
# only the Rand and mutual-information scores are chance-adjusted.
# scikit-learn renamed calinski_harabaz_score to calinski_harabasz_score and later
# removed the old spelling; use whichever name this installation provides.
calinski_harabasz = getattr(metrics, 'calinski_harabasz_score', None) or metrics.calinski_harabaz_score
print("Hierarichial-Average")
print("Adj rand score:", metrics.adjusted_rand_score(Labels_true, Hierarichial.labels_))
print("Adj mutual score:", metrics.adjusted_mutual_info_score(Labels_true, Hierarichial.labels_))
print("Adj homogenity score:", metrics.homogeneity_score(Labels_true, Hierarichial.labels_))
print("Adj complteness score:", metrics.completeness_score(Labels_true, Hierarichial.labels_))
print("Adj v_measure score:", metrics.v_measure_score(Labels_true, Hierarichial.labels_))
print("Adj silhout score:", metrics.silhouette_score(Feature_matrix, Hierarichial.labels_))
print("Adj calinski score:", calinski_harabasz(Feature_matrix, Hierarichial.labels_))

Hierarichial-Average
Adj rand score: 0.0624993160665
Adj mutual score: 0.029379323776
Adj homogenity score: 0.0305018321652
Adj complteness score: 0.468888223802
Adj v_measure score: 0.0572776719751
Adj silhout score: 0.906486222776
Adj calinski score: 351.195394432


In [None]:
# Agglomerative clustering, complete linkage, six clusters.
# Overwrites the previous `Hierarichial` model, so run cells in order.
Hierarichial = AgglomerativeClustering(linkage='complete', n_clusters=6)
Hierarichial.fit(Feature_matrix)
# Cluster ids and how many rows landed in each.
np.unique(Hierarichial.labels_, return_counts=True)

(array([0, 1, 2, 3, 4, 5], dtype=int64),
 array([12253,    75,    69,    82,    12,     9], dtype=int64))

In [None]:
# Score the complete-linkage partition currently held in `Hierarichial`. The first five
# metrics compare the clusters with Labels_true; silhouette and Calinski-Harabasz
# are computed from Feature_matrix alone.
# NOTE: the "Adj" prefix is only part of the printed label -- of these metrics,
# only the Rand and mutual-information scores are chance-adjusted.
# scikit-learn renamed calinski_harabaz_score to calinski_harabasz_score and later
# removed the old spelling; use whichever name this installation provides.
calinski_harabasz = getattr(metrics, 'calinski_harabasz_score', None) or metrics.calinski_harabaz_score
print("Hierarichial-complete")
print("Adj rand score:", metrics.adjusted_rand_score(Labels_true, Hierarichial.labels_))
print("Adj mutual score:", metrics.adjusted_mutual_info_score(Labels_true, Hierarichial.labels_))
print("Adj homogenity score:", metrics.homogeneity_score(Labels_true, Hierarichial.labels_))
print("Adj complteness score:", metrics.completeness_score(Labels_true, Hierarichial.labels_))
print("Adj v_measure score:", metrics.v_measure_score(Labels_true, Hierarichial.labels_))
print("Adj silhout score:", metrics.silhouette_score(Feature_matrix, Hierarichial.labels_))
print("Adj calinski score:", calinski_harabasz(Feature_matrix, Hierarichial.labels_))

Hierarichial-complete
Adj rand score: 0.285788429432
Adj mutual score: 0.133761285719
Adj homogenity score: 0.135246906528
Adj complteness score: 0.515873850107
Adj v_measure score: 0.214308457148
Adj silhout score: 0.901559317218
Adj calinski score: 1087.38953446


In [None]:
# Mean shift with library defaults; the number of clusters is not specified here.
Mean_shift = MeanShift()
Mean_shift.fit(Feature_matrix)
# Cluster ids and how many rows landed in each.
np.unique(Mean_shift.labels_, return_counts=True)

(array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
         13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
         26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
         39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
         52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
         65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
         78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
         91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
        104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
        117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
        130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
        143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
        156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
        169, 170, 171, 172, 173, 174, 175, 176, 177

In [None]:
# Score the mean-shift partition. The first five metrics compare the clusters with
# Labels_true; silhouette and Calinski-Harabasz are computed from Feature_matrix alone.
# NOTE: the "Adj" prefix is only part of the printed label -- of these metrics,
# only the Rand and mutual-information scores are chance-adjusted.
# scikit-learn renamed calinski_harabaz_score to calinski_harabasz_score and later
# removed the old spelling; use whichever name this installation provides.
calinski_harabasz = getattr(metrics, 'calinski_harabasz_score', None) or metrics.calinski_harabaz_score
print("Mean_shift")
print("Adj rand score:", metrics.adjusted_rand_score(Labels_true, Mean_shift.labels_))
print("Adj mutual score:", metrics.adjusted_mutual_info_score(Labels_true, Mean_shift.labels_))
print("Adj homogenity score:", metrics.homogeneity_score(Labels_true, Mean_shift.labels_))
print("Adj complteness score:", metrics.completeness_score(Labels_true, Mean_shift.labels_))
print("Adj v_measure score:", metrics.v_measure_score(Labels_true, Mean_shift.labels_))
print("Adj silhout score:", metrics.silhouette_score(Feature_matrix, Mean_shift.labels_))
print("Adj calinski score:", calinski_harabasz(Feature_matrix, Mean_shift.labels_))

Mean_shift
Adj rand score: 0.920548511072
Adj mutual score: 0.4302084745
Adj homogenity score: 0.936654699467
Adj complteness score: 0.449810258122
Adj v_measure score: 0.607757000755
Adj silhout score: 0.951116853412
Adj calinski score: 92723.2907135


In [None]:
# DBSCAN with a neighbourhood radius of 0.01; other parameters are library defaults.
dbscan = sklearn.cluster.DBSCAN(eps=0.01)
dbscan.fit(Feature_matrix)
# Label ids and how many rows carry each (DBSCAN uses -1 for noise points).
np.unique(dbscan.labels_, return_counts=True)

(array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
        16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28], dtype=int64),
 array([  912, 11149,   114,    19,    12,    13,    12,    20,    10,
            5,    15,    12,    13,     9,    27,     9,     9,     8,
           12,    11,    12,    15,    11,    18,    10,     9,    14,
            9,     5,     6], dtype=int64))

In [None]:
# Score the DBSCAN partition. The first five metrics compare the clusters with
# Labels_true; silhouette and Calinski-Harabasz are computed from Feature_matrix alone.
# DBSCAN's -1 (noise) label is passed through unchanged, so every score treats the
# noise points as one more cluster.
# NOTE: the "Adj" prefix is only part of the printed label -- of these metrics,
# only the Rand and mutual-information scores are chance-adjusted.
# scikit-learn renamed calinski_harabaz_score to calinski_harabasz_score and later
# removed the old spelling; use whichever name this installation provides.
calinski_harabasz = getattr(metrics, 'calinski_harabasz_score', None) or metrics.calinski_harabaz_score
print("dbscan")
print("Adj rand score:", metrics.adjusted_rand_score(Labels_true, dbscan.labels_))
print("Adj mutual score:", metrics.adjusted_mutual_info_score(Labels_true, dbscan.labels_))
print("Adj homogenity score:", metrics.homogeneity_score(Labels_true, dbscan.labels_))
print("Adj complteness score:", metrics.completeness_score(Labels_true, dbscan.labels_))
print("Adj v_measure score:", metrics.v_measure_score(Labels_true, dbscan.labels_))
print("Adj silhout score:", metrics.silhouette_score(Feature_matrix, dbscan.labels_))
print("Adj calinski score:", calinski_harabasz(Feature_matrix, dbscan.labels_))

dbscan
Adj rand score: 0.916717159907
Adj mutual score: 0.705003090745
Adj homogenity score: 0.77060815728
Adj complteness score: 0.707274926824
Adj v_measure score: 0.737584500306
Adj silhout score: 0.896997153371
Adj calinski score: 221.813296337


In [None]:
# BIRCH with n_clusters=None, so no target cluster count is imposed.
BIRCH = sklearn.cluster.Birch(
    threshold=0.01,
    branching_factor=50,
    n_clusters=None,
    compute_labels=True,
)
BIRCH.fit(Feature_matrix)
# Cluster ids and how many rows landed in each.
np.unique(BIRCH.labels_, return_counts=True)

(array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
         13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
         26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
         39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
         52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
         65,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
         80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,
         93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104, 105,
        106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118,
        119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131,
        132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144,
        145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157,
        158, 159, 160, 161, 162, 163, 164, 165, 167, 168, 169, 170, 171,
        172, 173, 174, 175, 176, 177, 178, 179, 180

In [None]:
# Score the BIRCH partition. The first five metrics compare the clusters with
# Labels_true; silhouette and Calinski-Harabasz are computed from Feature_matrix alone.
# NOTE: the "Adj" prefix is only part of the printed label -- of these metrics,
# only the Rand and mutual-information scores are chance-adjusted.
# scikit-learn renamed calinski_harabaz_score to calinski_harabasz_score and later
# removed the old spelling; use whichever name this installation provides.
calinski_harabasz = getattr(metrics, 'calinski_harabasz_score', None) or metrics.calinski_harabaz_score
print("BIRCH")
print("Adj rand score:", metrics.adjusted_rand_score(Labels_true, BIRCH.labels_))
print("Adj mutual score:", metrics.adjusted_mutual_info_score(Labels_true, BIRCH.labels_))
print("Adj homogenity score:", metrics.homogeneity_score(Labels_true, BIRCH.labels_))
print("Adj complteness score:", metrics.completeness_score(Labels_true, BIRCH.labels_))
print("Adj v_measure score:", metrics.v_measure_score(Labels_true, BIRCH.labels_))
print("Adj silhout score:", metrics.silhouette_score(Feature_matrix, BIRCH.labels_))
print("Adj calinski score:", calinski_harabasz(Feature_matrix, BIRCH.labels_))

BIRCH
Adj rand score: 0.892249680758
Adj mutual score: 0.418798100954
Adj homogenity score: 0.870385441264
Adj complteness score: 0.438520266917
Adj v_measure score: 0.583207260291
Adj silhout score: 0.940693563081
Adj calinski score: 4713.92249351


In [None]:
# Gaussian mixture with six components. Its initialisation is random, so it is seeded
# with random_state=0 (the same seed as the K-means cell) to make a
# Restart & Run All reproduce the same fit.
GM = sklearn.mixture.GaussianMixture(n_components=6, random_state=0).fit(Feature_matrix)
# Component ids and how many rows are assigned to each.
np.unique(GM.predict(Feature_matrix), return_counts=True)

(array([0, 1, 2, 3, 4, 5], dtype=int64),
 array([11262,   239,   240,   437,   115,   207], dtype=int64))

In [None]:
# Score the Gaussian-mixture assignment. The first five metrics compare the clusters
# with Labels_true; silhouette and Calinski-Harabasz are computed from Feature_matrix alone.
# NOTE: the "Adj" prefix is only part of the printed label -- of these metrics,
# only the Rand and mutual-information scores are chance-adjusted.
# scikit-learn renamed calinski_harabaz_score to calinski_harabasz_score and later
# removed the old spelling; use whichever name this installation provides.
calinski_harabasz = getattr(metrics, 'calinski_harabasz_score', None) or metrics.calinski_harabaz_score
# Predict once and reuse: the fitted model gives the same assignment on every call,
# so repeating predict() for each metric only repeated the work.
gm_labels = GM.predict(Feature_matrix)
print("GM")
print("Adj rand score:", metrics.adjusted_rand_score(Labels_true, gm_labels))
print("Adj mutual score:", metrics.adjusted_mutual_info_score(Labels_true, gm_labels))
print("Adj homogenity score:", metrics.homogeneity_score(Labels_true, gm_labels))
print("Adj complteness score:", metrics.completeness_score(Labels_true, gm_labels))
print("Adj v_measure score:", metrics.v_measure_score(Labels_true, gm_labels))
print("Adj silhout score:", metrics.silhouette_score(Feature_matrix, gm_labels))
print("Adj calinski score:", calinski_harabasz(Feature_matrix, gm_labels))

GM
Adj rand score: 0.96850058194
Adj mutual score: 0.675351451274
Adj homogenity score: 0.678110127629
Adj complteness score: 0.675924135491
Adj v_measure score: 0.677015366995
