- elab camp 用

In [1]:
import numpy as np
import pandas as pd
import json
import gensim
import collections
import matplotlib.pyplot as plt
import pprint
import itertools
import seaborn as sns
import networkx as nx
from IPython.display import display_svg
import tqdm
%matplotlib inline

In [2]:
# Load the raw log templates; each entry is later read through its 'cat' and 'lt' keys.
with open("./lt.json", "r") as lt_file:
    raw_lt = json.load(lt_file)

In [3]:
# Group the log-template strings by category: {category: [template, ...]}.
lt_per_cat = {}
for ltid, value in raw_lt.items():
    lt_per_cat.setdefault(value['cat'], []).append(value['lt'])

In [4]:
# Substrings deleted from every token, applied in this order. The order matters:
# deleting a bracket can join two '*' into a '**', which is then deleted as well.
_STRIPPED_SYMBOLS = ("'", '>', '<', '=', '(', ')', '[', ']', '**', ':', ',')


def remove_symbols(value):
    """Strip punctuation and placeholder symbols from ``value``.

    Every substring listed in ``_STRIPPED_SYMBOLS`` is deleted (in order) and
    each underscore is then replaced by a space, so ``"UI_AUTH_EVENT:"``
    becomes ``"UI AUTH EVENT"``.
    """
    for symbol in _STRIPPED_SYMBOLS:
        value = value.replace(symbol, '')
    return value.replace("_", " ")


def string_to_corpus(lt, max_len=None):
    """Tokenise one log-template string into lower-cased words.

    The template is split on whitespace, each piece is cleaned with
    ``remove_symbols``, and pieces that now hold several words (underscores
    became spaces) are split again. Empty strings are dropped.

    Args:
        lt: log-template string.
        max_len: if not ``None``, keep only the first ``max_len`` tokens.

    Returns:
        List of token strings.

    Note:
        A piece that yields fewer than two words after cleaning is kept as it
        is, so a leading or trailing underscore leaves a space on the token
        (``"_foo"`` -> ``" foo"``). The vocabulary built later in the notebook
        relies on exactly this tokenisation, so it is left unchanged.
    """
    tokens = []
    for piece in lt.split():
        cleaned = remove_symbols(piece)
        parts = cleaned.split()
        if len(parts) > 1:  # the piece can be split further
            tokens.extend(parts)
        elif cleaned != '':
            tokens.append(cleaned)
    tokens = [t.lower() for t in tokens]
    if max_len is not None:
        tokens = tokens[:max_len]  # keep only the leading tokens

    return tokens

In [5]:
# Per-category bag of tokens: every template of a category is tokenised with
# string_to_corpus (no length limit) and the tokens are concatenated.
# The helper replaces an inline copy of the same tokenisation steps.
all_corpus = dict()
for cat, lts in lt_per_cat.items():
    corpus = []
    for lt in lts:
        corpus.extend(string_to_corpus(lt))
    all_corpus[cat] = corpus

In [6]:
# Vocabulary: every distinct token gets an integer id. The tokens are sorted so
# that the ids are the same on every run (the iteration order of a set of
# strings is not stable between interpreter sessions).
all_words = sorted(set(word for words in all_corpus.values() for word in words))
word_dict = {w: i for i, w in enumerate(all_words)}
inv_word_dict = {v: k for k, v in word_dict.items()}

# One bag-of-words per template, as a list of (word id, count) pairs.
# Only the first 7 tokens of each template are used.
corpus = []
for cat, lts in lt_per_cat.items():
    for lt in lts:
        split_words = string_to_corpus(lt, max_len=7)  # keep only the leading tokens

        lt_word_id_cnt = collections.Counter([word_dict[w] for w in split_words])

        corpus.append([(tid, cnt) for tid, cnt in lt_word_id_cnt.items()])

In [7]:
# Word counts: for every word, the number of corpus entries it occurs in
# (each entry lists a word id at most once), printed from most to least frequent.
word_doc_freq = collections.Counter(c[0] for s in corpus for c in s)
word_cnt = {}
for wid, cnt in reversed(sorted(word_doc_freq.items(), key=lambda x: x[1])):
    word = inv_word_dict[wid]
    print(word, cnt)
    word_cnt[word] = cnt

ui 543
user 537
mgd 505
cfg 470
audit 466
rpd 374
set 324
other 257
event 198
index 174
broadcast 162
delete 136
up 110
multicast 104
/kernel 92
mpls 85
lsp 83
to 80
evt 62
from 61
init 60
updown 55
login 55
bgp 53
for 51
mtu 46
chassisd 42
alarm 41
task 41
file 41
pic 41
change 41
trap 40
error 40
address 40
read 39
received 39
mcsn 39
sent 38
port 38
is 38
add 36
snmp 36
notification 36
on 36
failed 34
xntpd 34
pointtopoint 32
cmd 31
class 30
pid 29
down 28
deactivate 28
status 28
bandwidth 27
- 27
message 27
neighbor 26
signal 26
state 26
link 25
cleared 25
junoscript 24
of 24
connection 24
info 24
alarmd 23
craftd 23
generated 23
luchip 23
peer 23
switch 22
reinitializing 22
used 22
slot 22
activate 21
pci 21
not 21
color 20
pfe 20
recv 19
master 19
terminate 19
bulkget 18
chas 18
pfeman 18
system 18
config 18
sshd 18
libjsnmp 17
as 17
no 17
time 17
re 16
secret 16
active 16
rsp 16
interface 16
realm 15
ospf 15
snmpd 15
lu 15
fpc 15
with 15
major 15
by 15
failure 15
rsvp 14
mac 14


In [8]:
# Inspect the raw template dictionary (the whole dict is echoed as the cell output).
raw_lt

{'0': {'cat': 'system(cron)',
  'lt': '/usr/sbin/cron[**]: (root) CMD (newsyslog)'},
 '1': {'cat': 'network(lacp)',
  'lt': 'mib2d[**]: lacp info not found for ifl:**'},
 '2': {'cat': 'network(lacp)',
  'lt': 'mib2d[**]: cleared lacp info not found for ifl:**'},
 '3': {'cat': 'system(cron)',
  'lt': '/usr/sbin/cron[**]: (**) CMD (adjkerntz -a)'},
 '4': {'cat': 'service(ntp)', 'lt': 'xntpd[**]: NTP Server Unreachable'},
 '5': {'cat': 'monitor(syslog)', 'lt': 'last message repeated ** times'},
 '6': {'cat': 'network(mtu)', 'lt': '/kernel: MTU for ** reduced to **'},
 '7': {'cat': 'mgmt(login)',
  'lt': 'login: LOGIN_INFORMATION: User ** logged in from host ** on device **'},
 '8': {'cat': 'system(ui)',
  'lt': "mgd[**]: UI_AUTH_EVENT: Authenticated user '**' at permission level '**'"},
 '9': {'cat': 'system(ui)',
  'lt': "mgd[**]: UI_LOGIN_EVENT: User '**' login, class '**' [**], ssh-connection '', client-mode 'cli'"},
 '10': {'cat': 'system(ui)',
  'lt': "mgd[**]: UI_CHILD_START: Starti

In [None]:
# Template-level co-occurrence: two words co-occur when they appear in the same
# corpus entry. Counts are accumulated in a plain numpy matrix; chained
# DataFrame indexing (`df[a][b] += 1`) is slow and is not guaranteed to write
# back into the frame.
n_words = len(word_dict)
cor_mat = np.zeros((n_words, n_words), dtype=int)

for s in corpus:
    for a, b in itertools.combinations(s, 2):
        ida = int(a[0])
        idb = int(b[0])
        cor_mat[ida, idb] += 1
        cor_mat[idb, ida] += 1

# Same symmetric table under the original name.
cor_df = pd.DataFrame(cor_mat)

# Non-zero pairs with a < b as [(a, b), count], in ascending (a, b) order.
cor_list = [[(int(a), int(b)), int(cor_mat[a, b])]
            for a, b in np.argwhere(np.triu(cor_mat, k=1) > 0)]

# Print every pair, most frequent first, with its Simpson and Jaccard coefficients
# (both computed from the per-word counts in word_cnt).
for i in tqdm.tqdm(sorted(cor_list, key=lambda x:x[1], reverse=True)):
    a,b = i[0]
    cnt_a = word_cnt[inv_word_dict[a]]
    cnt_b = word_cnt[inv_word_dict[b]]
    print(inv_word_dict[a], inv_word_dict[b], i[1])
    print("\t simpson:", i[1]/min(cnt_a, cnt_b))
    print("\t jaccard:", i[1]/(cnt_a + cnt_b - i[1]))
    print()

In [None]:
# Directed graph of strongly associated words. An edge is added for every word
# pair that co-occurs at least 10 times and whose Simpson coefficient exceeds
# 0.7; it points from the word with the smaller count to the other one and is
# weighted by the Jaccard coefficient.
G = nx.DiGraph()

for i in sorted(cor_list, key=lambda x:x[1], reverse=True):
    if i[1] > 9: # co-occurs 10 times or more
        a,b = i[0]
        word_a, word_b = inv_word_dict[a], inv_word_dict[b]
        cnt_a, cnt_b = word_cnt[word_a], word_cnt[word_b]
        simpson = i[1]/min(cnt_a, cnt_b)
        jaccard = i[1]/(cnt_a + cnt_b - i[1])

        print(simpson)
        if simpson > 0.7:
            # Nodes are named by word, so no membership test on the integer
            # ids is needed: add_node leaves an existing node untouched.
            G.add_node(word_a)
            G.add_node(word_b)

            # On equal counts the first word of the pair is the source.
            src, dst = (word_a, word_b) if cnt_a <= cnt_b else (word_b, word_a)
            G.add_edge(src, dst, weight=jaccard)

In [None]:
# Export the graph in node-link form as JSON. The context manager closes the
# file even if serialisation raises.
json_data = nx.node_link_data(G)
with open("nx.json", "w") as f:
    json.dump(json_data, f, ensure_ascii=False, indent=4, sort_keys=True, separators=(',', ': '))

In [None]:
# Draw the word graph with a shell layout.
plt.figure(figsize=(13, 13))
nx.draw_networkx(G, pos=nx.shell_layout(G))
plt.show()

## LDA

- `__getitem__` での結果は，`inference(collect_stats=True)` の結果をノーマライズしたものっぽい

In [None]:
# List the category names.
for cat, lts in lt_per_cat.items():
    print(cat)

In [None]:
# Evaluation corpus: every corpus entry that does not contain the word 'ui'.
ui_id = word_dict['ui']
tmp = []                # kept entries as sorted tuples (hashable, for de-duplication)
eval_corpus = []        # kept entries, unchanged
eval_corpus_ltids = []  # their positions in `corpus`
for ltid, i in enumerate(corpus):
    if all(j[0] != ui_id for j in i): # skip entries that contain 'ui'
        tmp.append(tuple(sorted(i)))
        eval_corpus.append(i)
        eval_corpus_ltids.append(ltid)

print(len(tmp))
set_corpus = list(set(tmp))  # distinct entries only

In [None]:
# Train the LDA topic model on the de-duplicated corpus built without 'ui'.
# Options that were commented out in this call and therefore stay at the
# library defaults: corpus=corpus, gamma_threshold=1e-5, eval_every=1,
# alpha='auto', eta='auto', update_every=0, random_state=10.
# No random_state is passed, so the trained model can differ between runs.
lda_params = dict(
    corpus=set_corpus,
    chunksize=2,
    num_topics=40,
    id2word=inv_word_dict,
    minimum_probability=1e-8,
    iterations=400,
    decay=1.0,
    passes=1,
    per_word_topics=True,
)
lda = gensim.models.ldamodel.LdaModel(**lda_params)

In [None]:
# Top terms of every topic, printed and stored by topic id.
inf_topic_words = {}
for topic in range(lda.num_topics):
    top_words = [inv_word_dict[i[0]] for i in lda.get_topic_terms(topic)]
    print(topic, top_words)
    inf_topic_words[topic] = top_words

In [None]:
# Histogram of the most probable topic over the templates of two categories.
# The first category is printed without a header, the second under "alarm".
for header, target_cat in ((None, "service(ntp)"), ("alarm", "system(alarm)")):
    if header is not None:
        print(header)
    t = []
    for lt in lt_per_cat[target_cat]:
        c = list(collections.Counter([word_dict[w] for w in string_to_corpus(lt, max_len=7)]).items())
        topic_dist, word_topic, word_topic_dist = lda[c]
        # topic_dist holds (topic id, probability) pairs. Read the id from the
        # pair: the list position equals the id only when no topic is missing.
        inf_topic = max(topic_dist, key=lambda tp: tp[1])[0]
        t.append(inf_topic)

    for k,v in collections.Counter(t).items():
        print(k,"\t",v)

In [None]:
# Grid search: for every chunksize 1..19 train 10 models and record, for each
# service(ntp) template, its most probable topic.
# NOTE(review): this rebinds `lda`; cells executed afterwards use the last
# 50-topic model trained here, not the 40-topic model — confirm this is intended.
results = []
for cs in range(1, 20):
    for i in range(10):
        lda = gensim.models.ldamodel.LdaModel(
                                              corpus=set_corpus,
                                              chunksize=cs,
                                              num_topics=50,
                                              id2word=inv_word_dict,
                                              minimum_probability=1e-8,
                                              iterations=800,
                                              decay=1.0,
                                              passes=1,
                                              per_word_topics=True)

        res = []
        for lt in lt_per_cat["service(ntp)"]:
            c = list(collections.Counter([word_dict[w] for w in string_to_corpus(lt, max_len=7)]).items())
            topic_dist, word_topic, word_topic_dist = lda[c]
            # Read the topic id from the (topic id, probability) pair; the list
            # position equals the id only when no topic is missing.
            inf_topic = max(topic_dist, key=lambda tp: tp[1])[0]
            res.append(inf_topic)
        results.append((cs, i, res))

In [None]:
# `entropy` is used here, so import it in this cell; no earlier cell imports it.
from scipy.stats import entropy

# Report the grid-search runs whose topic histogram has an entropy below 0.8.
for cs,i,r in results:
    topic_hist = collections.Counter(r)
    e = entropy(list(topic_hist.values()))
    if e < 0.8:
        print("ChunkSize=", cs, "\t", i)
        print("\t", topic_hist)
        print("\t entropy:", e)
        print()

In [None]:
# Index of the largest entry of each row returned by lda.inference for the
# evaluation corpus. Note that this rebinds `results`.
results = [np.argmax(l) for l in lda.inference(eval_corpus)[0]]

# Words of the evaluation entries, grouped by that index.
inference_results = {}
for a,b in zip(eval_corpus, results):
    inference_results.setdefault(b, []).append([inv_word_dict[c[0]] for c in a])

### eval

- 外れたやつについて，調査

In [None]:
# Number of templates per category.
for a,b in lt_per_cat.items():
    print(a, len(b))

In [None]:
# For every word id, print the word and the result of lda.get_term_topics.
for i in range(len(word_dict)):
    print(inv_word_dict[i], lda.get_term_topics(i))

In [None]:
# Look up the id of the word 'unreachable'.
word_dict['unreachable']

In [None]:
# Print the most probable topic of every template of four selected categories.
# NOTE(review): the inf_per_cat bookkeeping below is commented out, yet later
# cells read inf_per_cat — confirm where it is meant to be built.
# inf_per_cat = {}
for cat,lts in lt_per_cat.items():

    if cat in ("service(ntp)", "system(alarm)", "egp(bgp)", "network(mtu)"):
        print(cat)
#     inf_per_cat[cat] = []
        for lt in lts:
            c = list(collections.Counter([word_dict[w] for w in string_to_corpus(lt, max_len=7)]).items())
            topic_dist, word_topic, word_topic_dist = lda[c]
            # Read the topic id from the (topic id, probability) pair; the list
            # position equals the id only when no topic is missing.
            inf_topic = max(topic_dist, key=lambda tp: tp[1])[0]
#             inf_per_cat[cat].append(inf_topic)
            print("\t", inf_topic, ",", lt)

In [None]:
# Topic-probability vector of every template, keyed by the template string
# (a template string that occurs more than once keeps its last vector).
lt_vecs = {}
for cat,lts in lt_per_cat.items():
    for lt in lts:
        c = list(collections.Counter([word_dict[w] for w in string_to_corpus(lt, max_len=7)]).items())
        topic_dist, word_topic, word_topic_dist = lda[c]
        # Place every probability at its topic id so that all vectors have
        # length num_topics even if a topic is missing from topic_dist.
        topic_vec = np.zeros(lda.num_topics)
        for topic_id, prob in topic_dist:
            topic_vec[topic_id] = prob
        lt_vecs[lt] = topic_vec

In [None]:
from sklearn.manifold import TSNE

# Embed the per-template topic vectors into 3 dimensions with t-SNE.
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` — confirm against the installed version.
x = np.array(list(lt_vecs.values()))
tsne = TSNE(
    n_components=3, # number of dimensions after the reduction
    init='random',
    random_state=101,
    method='barnes_hut',
    n_iter=800,
    verbose=2
).fit_transform(x)

In [None]:
# Coarse category (text before the first '(') of every template, and the list
# of those categories in the key order of lt_vecs.
lt_cat = {
    template: category.split('(')[0]
    for category, templates in lt_per_cat.items()
    for template in templates
}
cats = [lt_cat[template] for template in lt_vecs]

In [None]:
import plotly.offline as offline
import plotly.graph_objs as go
offline.init_notebook_mode()

from sklearn import preprocessing

# Encode the coarse category names as integers; they are used as marker colours.
le = preprocessing.LabelEncoder()
colors = le.fit_transform(cats)

# Scatter3d builds a 3D scatter plot.
trace1 = go.Scatter3d(
    x=tsne[:,0], # one embedding dimension per axis (x, y, z)
    y=tsne[:,1],
    z=tsne[:,2],
    mode='markers',
    text=list(lt_vecs.keys()),
    marker=dict(
        sizemode='diameter',
        color = colors,
        colorscale = 'Portland',
        colorbar=dict(
            title='Colorbar',
            tickvals = np.arange(len(le.classes_)),
            ticktext = le.classes_
        ),
        line=dict(color='rgb(255, 255, 255)'),
        opacity=0.9,
        size=2 # small markers keep the plot from getting cluttered
    )
)

data=[trace1]
layout=dict(height=1000, width=900, title='Log Template Vectors')
fig=dict(data=data, layout=layout)
offline.iplot(fig, filename='tsne_example')

In [None]:
from sklearn.manifold import TSNE

# Embed the per-template topic vectors into 2 dimensions with t-SNE.
x = np.array(list(lt_vecs.values()))
tsne2d = TSNE(
    n_components=2, # number of dimensions after the reduction
    init='random',
    random_state=101,
    method='barnes_hut',
    n_iter=1000,
    verbose=2
).fit_transform(x)

In [None]:
# Coarse category (text before the first '(') of every template, and the list
# of those categories in the key order of lt_vecs.
lt_cat = {
    template: category.split('(')[0]
    for category, templates in lt_per_cat.items()
    for template in templates
}
cats = [lt_cat[template] for template in lt_vecs]

In [None]:
import plotly.offline as offline
import plotly.graph_objs as go
offline.init_notebook_mode()

# Encode the coarse category names as integers; they are used as marker colours.
# `preprocessing` is not imported in this cell; it must already be in the kernel namespace.
le = preprocessing.LabelEncoder()
colors = le.fit_transform(cats)

# 2D scatter plot of the t-SNE embedding; hovering shows the template string.
trace2 = go.Scatter(
    x=tsne2d[:,0],
    y=tsne2d[:,1],
    mode='markers',
    text=list(lt_vecs.keys()),
    marker=dict(
        sizemode='diameter',
        color = colors,
        colorscale = 'Portland',
        colorbar=dict(
            title='Colorbar',
            tickvals = np.arange(len(le.classes_)),
            ticktext = le.classes_
        ),
        line=dict(color='rgb(255, 255, 255)'),
        opacity=0.9,
        size=5
    )
)

data=[trace2]
layout=dict(height=800, width=800, title='Log Template Vectors')
fig=dict(data=data, layout=layout)
offline.iplot(fig, filename='tsne_example')

In [None]:
# Empty frame with one row per entry of lt_vecs; columns: template string ("lt") and topic vector ("v").
vecs_df = pd.DataFrame(columns=["lt","v"], index=np.arange(len(lt_vecs)))

In [None]:
# Fill the frame from lt_vecs: template strings in 'lt', topic vectors (as
# plain lists) in 'v'. Whole columns are assigned at once; writing cell by cell
# through chained indexing (`df['lt'][i] = ...`) is not guaranteed to update
# the frame.
vecs_df['lt'] = list(lt_vecs.keys())
vecs_df['v'] = pd.Series([list(v) for v in lt_vecs.values()], index=vecs_df.index, dtype=object)

In [None]:
# Store, for every template, its ground-truth category ('gt') and its most
# probable topic ('tm') in vecs_df.
for cat,lts in lt_per_cat.items():

    for lt in lts:
        c = list(collections.Counter([word_dict[w] for w in string_to_corpus(lt, max_len=7)]).items())
        topic_dist, word_topic, word_topic_dist = lda[c]
        # Read the topic id from the (topic id, probability) pair; the list
        # position equals the id only when no topic is missing.
        inf_topic = max(topic_dist, key=lambda tp: tp[1])[0]

        is_lt = vecs_df['lt']==lt
        vecs_df.loc[is_lt, 'gt'] = cat
        vecs_df.loc[is_lt, 'tm'] = inf_topic


In [None]:
# Distinct topic labels that were assigned.
set(vecs_df["tm"])

In [None]:
# Plot the topic vector of every system(alarm) template.
tmp = vecs_df[vecs_df["gt"]=="system(alarm)"]

for row_label, row in tmp.iterrows():
    print(row['lt'])
    plt.figure(figsize=(10, 5))
    plt.plot(row['v'])
    plt.show()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Pairwise cosine similarity between the topic vectors of all templates.
topic_vecs = np.array(list(vecs_df["v"].values))
cos_mat = cosine_similarity(topic_vecs, topic_vecs)

In [None]:
# For every template, list the 10 templates with the highest cosine similarity.
for i in range(len(cos_mat)):
    print(vecs_df['lt'][i])
    print("top 10:")
    nearest = np.argsort(cos_mat[i])[::-1][:10]
    for j in zip(vecs_df['lt'][nearest], cos_mat[i][nearest]):
        print("\t", j)
    print()

In [None]:
# Plot the topic vectors of two specific xntpd templates.
for k,v in lt_vecs.items():
    if k in ("** xntpd[**]: synchronized to **, stratum=**", "xntpd[**]: kernel time sync disabled **"):
        plt.figure(figsize=(10, 5))
        plt.title(k)
        plt.plot(v)
        plt.show()

### 単純な手法と比較してみる
- BoW をk-means

In [None]:
# Number of corpus entries.
len(corpus)

In [None]:
# Dense bag-of-words matrix: one row per corpus entry, one column per word id,
# holding the word counts.
vec_len = len(word_dict)

bow_vecs = np.zeros((len(corpus), vec_len))
for row_no, v in enumerate(corpus):
    for vv in v:
        bow_vecs[row_no, vv[0]] = vv[1]

In [None]:
# Shape of the bag-of-words matrix.
bow_vecs.shape

In [None]:
from sklearn.cluster import KMeans

In [None]:
# k-means baseline on the bag-of-words vectors, with 40 clusters.
km = KMeans(n_clusters=40,random_state=9)
km.fit(bow_vecs)

In [None]:
# Attach the k-means cluster label of every bag-of-words row to vecs_df.
# The whole column is assigned at once; writing through chained indexing
# (`df['km'][mask] = ...`) is not guaranteed to update the frame.
# As before, this needs vecs_df to have one row per row of bow_vecs, in the
# same order.
vecs_df['km'] = km.labels_.astype(int)

In [None]:
from scipy.stats import entropy

- entropyは小さいほうがいい．

In [None]:
# Show the frame (the whole frame is echoed as the cell output).
vecs_df

In [None]:
# Templates of the interface(agg) category, grouped by their topic label.
tmp = dict()
for row in vecs_df[vecs_df["gt"]=="interface(agg)"][["lt", "tm"]].iterrows():
    tmp.setdefault(row[1]["tm"], []).append(row[1]["lt"])

In [None]:
# Print the grouping held in `tmp`.
for k,v in tmp.items():
    print(k, v)

In [None]:
# import pickle

# with open("vecs_df_20180805", "wb") as f:
#     pickle.dump(vecs_df, f)

In [None]:
# For every ground-truth category: how its templates spread over the topic
# labels ('tm') and the k-means labels ('km'), with the entropy of each spread
# and an overlaid bar chart (50 bins).
for cat in lt_per_cat.keys():
    print(cat)

    rows = vecs_df[vecs_df['gt']==cat]
    tm_counts = collections.Counter(rows['tm'])
    km_counts = collections.Counter(rows['km'])

    for header, col, counts in (("#class(tm)\t", 'tm', tm_counts), ("#class(km)\t", 'km', km_counts)):
        print(header, len(set(rows[col])))
        print("\t", counts)
        print("\tEntropy:\t", entropy(list(counts.values())))

    tm_y = np.zeros(50)
    km_y = np.zeros(50)

    for k,v in tm_counts.items():
        tm_y[int(k)] = v
    for k,v in km_counts.items():
        km_y[k] = v

    bins = np.arange(50)
    plt.figure(figsize=(10, 5))
    plt.bar(bins, tm_y, label="tm")
    plt.bar(bins, km_y, label="km", alpha=0.6)
    plt.legend()
    plt.grid()
    plt.show()

    print()

In [None]:
import matplotlib as mpl

# Global matplotlib settings for the figures drawn after this cell.
mpl.rcParams.update({
    'lines.linewidth': 2,
    'lines.color': 'r',
    'axes.titlesize': 20,
    'font.size': 15,
    'axes.labelsize': 20,
    'lines.markersize': 12,
    'xtick.major.size': 15,
    'xtick.minor.size': 15,
    'ytick.major.size': 15,
    'text.usetex': False,
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
})


In [None]:
def _class_count_and_entropy(labels):
    """Return (number of distinct labels, entropy of the label histogram)."""
    hist = np.array(list(collections.Counter(labels).values()))
    return len(set(labels)), entropy(hist/hist.sum())


# Per ground-truth category: ((#topics, entropy), (#k-means clusters, entropy)).
gt_ent = {}
for cat in lt_per_cat.keys():
    rows = vecs_df[vecs_df['gt']==cat]
    gt_ent[cat] = (_class_count_and_entropy(rows['tm']),
                   _class_count_and_entropy(rows['km']))

In [None]:
# Entry for whatever `cat` currently holds (left over from an earlier loop).
gt_ent[cat]

In [None]:
# NOTE(review): in this file's order `y_tm` is first assigned in the next cell,
# so this raises NameError on a fresh top-to-bottom run.
y_tm

In [None]:
# Entropy of the topic-model and k-means labels inside every ground-truth
# category, drawn as grouped bars on two stacked panels.
gt_ent_item = [(k,v) for k,v in gt_ent.items()]
x = [k for k,v in gt_ent_item]
y_tm = [gt_ent[name][0][1] for name in x]
y_km = [gt_ent[name][1][1] for name in x]

# Split point derived from the number of categories; the fixed 32/32 split
# only worked for exactly 64 categories.
half = (len(x) + 1) // 2

plt.figure(figsize=(20, 14))
plt.subplots_adjust(top=0.95, bottom=0.1, left=0.18, right=0.95, hspace=0.5)
for panel, lo, hi in ((211, 0, half), (212, half, len(x))):
    plt.subplot(panel)
    pos = np.arange(hi - lo)
    plt.bar(pos-0.2, y_tm[lo:hi], label="topic model", width=0.4)
    plt.bar(pos+0.2, y_km[lo:hi], label="k-means", width=0.4)
    plt.xticks(pos, x[lo:hi], rotation=90)
    plt.legend()
    plt.ylabel("Entropy")
    plt.grid()


plt.show()

In [None]:
def _gt_entropy(label_col, label):
    """Entropy of the ground-truth categories among the rows whose ``label_col`` equals ``label``."""
    rows = vecs_df[vecs_df[label_col]==label]
    return entropy(list(collections.Counter(rows['gt']).values()))


# Per label 0..39: (entropy inside the topic, entropy inside the k-means cluster).
inf_ent = {idx: (_gt_entropy('tm', idx), _gt_entropy('km', idx)) for idx in range(40)}

In [None]:
# gt_ent_item = [(k,v) for k,v in gt_ent.items()]
x = np.arange(40)
y_tm = []
y_km = []

for idx in range(40):
    y_tm.append(inf_ent[idx][0])
    y_km.append(inf_ent[idx][1])

plt.figure(figsize=(14, 5))

plt.bar(np.arange(40)-0.2, y_tm, label="topic model", width=0.4)
plt.bar(np.arange(40)+0.2, y_km, label="k-means", width=0.4)
plt.xticks(np.arange(40), rotation=90)
plt.legend()
plt.ylabel("Entropy")
plt.grid()

plt.show()

In [None]:
# k-means label of every system(alarm) template.
for row_label, row in vecs_df[vecs_df["gt"]=="system(alarm)"].iterrows():
    print(row['lt'])
    print("\t", row['km'])

In [None]:
# Mean and standard deviation of the entropies currently held in y_tm and y_km.
y_tm_arr = np.array(y_tm)
y_km_arr = np.array(y_km)
print(y_tm_arr.mean())
print(y_tm_arr.std())
print()
print(y_km_arr.mean())
print(y_km_arr.std())

In [None]:
# Mean and standard deviation of y_tm and y_km (same statements as the previous cell).
print(np.array(y_tm).mean())
print(np.array(y_tm).std())
print()
print(np.array(y_km).mean())
print(np.array(y_km).std())

In [None]:
## Reverse direction: ground-truth categories inside every inferred label (0..39)
cats = lt_per_cat.keys()
cats_idx = {i:e for e,i in enumerate(cats)}

for idx in range(40):
    print(idx)

    tm_y = np.zeros(len(cats))
    km_y = np.zeros(len(cats))

    for header, col, hist in (("#class(tm)\t", 'tm', tm_y), ("#class(km)\t", 'km', km_y)):
        rows = vecs_df[vecs_df[col]==idx]
        gt_counts = collections.Counter(rows['gt'])
        print(header, len(set(rows['gt'])))
        print("\t", gt_counts)
        print("\tEntropy:\t", entropy(list(gt_counts.values())))
        for k,v in gt_counts.items():
            hist[cats_idx[k]] = v

    cat_pos = np.arange(len(cats))
    plt.figure(figsize=(10, 5))
    plt.bar(cat_pos, tm_y, label="tm")
    plt.bar(cat_pos, km_y, label="km", alpha=0.6)
    plt.legend()
    plt.grid()
    plt.show()

    print()

In [None]:
# Write the templates of every topic label 0..39 to "lda_result": the label on
# its own line, followed by one tab-indented template per line.
with open("lda_result", "w") as f:
    for i in range(40):
        f.write(str(i) + "\n")
        f.writelines("\t" + j + "\n" for j in vecs_df[vecs_df["tm"]==i]["lt"].values)

In [None]:
# Write the templates of every k-means label 0..39 to "km_result": the label on
# its own line, followed by one template per line.
# NOTE(review): unlike "lda_result", the template lines are not tab-indented —
# confirm whether that difference is intended.
with open("km_result", "w") as f:
    for i in range(40):
        f.write(str(i) + "\n")
        f.writelines(j + "\n" for j in vecs_df[vecs_df["km"]==i]["lt"].values)

In [None]:
# Text-only summary per ground-truth category: number of distinct labels,
# label histogram and its entropy, for the topic model and for k-means.
for cat in lt_per_cat.keys():
    print(cat)

    rows = vecs_df[vecs_df['gt']==cat]
    for header, col in (("#class(tm)\t", 'tm'), ("#class(km)\t", 'km')):
        label_counts = collections.Counter(rows[col])
        print(header, len(set(rows[col])))
        print("\t", label_counts)
        print("\tEntropy:\t", entropy(list(label_counts.values())))

    print()

### LTの距離について

In [None]:
# (category, number of templates) pairs, printed in ascending order of size.
t = [(a, len(b)) for a, b in lt_per_cat.items()]

print(sorted(t, key=lambda x: x[1]))

In [None]:
# Cluster centres of the fitted k-means model.
km.cluster_centers_

In [None]:
# 2D t-SNE embedding of the k-means cluster centres (rebinds `tsne2d`).
tsne2d = TSNE(
    n_components=2, # number of dimensions after the reduction
    init='random',
    random_state=101,
    method='barnes_hut',
    n_iter=500,
    verbose=2
).fit_transform(km.cluster_centers_)

In [None]:
# Categories ordered by size; keep the names ranked 3rd to 26th largest.
t = [(a, len(b)) for a, b in lt_per_cat.items()]
by_size = sorted(t, key=lambda x: x[1])

print(by_size)

t = [name for name, size in by_size[::-1][2:26]]

In [None]:
# Show the selected category names.
t

In [None]:
# Replace the selection with two hand-picked categories.
t = ["network(mtu)","monitor(snmp)"]

In [None]:
# Boolean row mask selecting the templates whose category is listed in `t`,
# built in one vectorised step instead of OR-ing one comparison per category.
bool_idx = vecs_df["gt"].isin(t)

# Category of every selected row.
cat_names = vecs_df[bool_idx]["gt"]

In [None]:
# Topic vectors (as a 2D array) and template strings of the selected rows.
selected_rows = vecs_df[bool_idx]
z = np.array([np.array(i) for i in selected_rows["v"].values])
z_keys = selected_rows["lt"].values

In [None]:
# A list of category names; it is only echoed, not assigned to anything.
["mgmt(login)", "system(ui)","mgmt(command)","mgmt(ssh)","monitor(snmp)","system(alarm)"]

In [None]:
# Topic vectors of two specific templates (one from rpd, one from snmpd).
a = vecs_df[vecs_df["lt"]=='rpd[**]: EVENT <SNMP Index> ** index ** <Up Broadcast Multicast>']["v"].values[0]
b = vecs_df[vecs_df["lt"]=='snmpd[**]: SNMPD_BIND_INFO: Source address for trap socket was set to **']["v"].values[0]

In [None]:
# 2x2 cosine-similarity matrix of the two vectors.
cosine_similarity(np.array((a,b)).reshape(2, -1))

In [None]:
# Show the frame (the whole frame is echoed as the cell output).
vecs_df

In [None]:
# Rows of the service(ntp) category.
vecs_df[vecs_df["gt"]=="service(ntp)"]

In [None]:
# Compare three templates pairwise in bag-of-words space and in topic space.
a = "/kernel: setting server address to **"
b = "xntpd[**]: kernel time sync enabled **"
c = "xntpd[**]: NTP Server Unreachable"

# Bag-of-words vectors (first row matching each template string).
bow_a = bow_vecs[vecs_df["lt"]==a][0]
bow_b = bow_vecs[vecs_df["lt"]==b][0]
bow_c = bow_vecs[vecs_df["lt"]==c][0]

# Euclidean distances between the bag-of-words vectors.
d1 = np.linalg.norm(bow_a - bow_b)
d2 = np.linalg.norm(bow_a - bow_c)
d3 = np.linalg.norm(bow_b - bow_c)

# Topic vectors of the same templates.
v_a = np.array(vecs_df[vecs_df["lt"]==a]["v"].values[0])
v_b = np.array(vecs_df[vecs_df["lt"]==b]["v"].values[0])
v_c = np.array(vecs_df[vecs_df["lt"]==c]["v"].values[0])

# Euclidean distances between the topic vectors.
d4 = np.linalg.norm(v_a - v_b)
d5 = np.linalg.norm(v_a - v_c)
d6 = np.linalg.norm(v_b - v_c)

plt.figure()
plt.plot(v_a, label="a")
plt.plot(v_b, label="b")
plt.plot(v_c, label="c")
plt.legend()
plt.show()

In [None]:
# Distances in bag-of-words space (first line) and in topic space (second line).
print(d1, d2, d3)
print(d4, d5, d6)

In [None]:
# For every network(mtu) template: print it with its topic ids ordered by
# ascending probability, then plot its topic vector.
for row_label, row in vecs_df[vecs_df['gt']=="network(mtu)"].iterrows():
    print(row['lt'], np.argsort(row['v']))

    plt.figure()
    plt.plot(row['v'])
    plt.show()

In [None]:
# Template strings of the monitor(snmp) category.
vecs_df[vecs_df['gt']=="monitor(snmp)"]['lt'].values

In [None]:
# NOTE(review): in this file's order `tmp_df` is first assigned in the next cell,
# so this raises NameError on a fresh top-to-bottom run.
tmp_df

In [None]:
import seaborn as sns
# Heatmap of the cosine similarity between the topic vectors of the selected
# rows; rows and columns are labelled with the category of each row.
tmp_df = pd.DataFrame(cosine_similarity(z), columns=cat_names, index=cat_names)

plt.figure(figsize=(20, 20))
sns.heatmap(tmp_df)
plt.show()

In [None]:
# Same heatmap computed from the bag-of-words vectors of the selected rows.
tmp_df = pd.DataFrame(cosine_similarity(bow_vecs[bool_idx]), columns=cat_names, index=cat_names)

plt.figure(figsize=(20, 20))
sns.heatmap(tmp_df)
plt.show()

In [None]:
# 2D t-SNE embedding of the topic vectors of the selected rows (rebinds `tsne2d`).
tsne2d = TSNE(
    n_components=2, # number of dimensions after the reduction
    init='random',
    random_state=101,
    method='barnes_hut',
    n_iter=500,
    verbose=2
).fit_transform(z)

In [None]:
import plotly.offline as offline
import plotly.graph_objs as go
offline.init_notebook_mode()

# Encode the category of every selected row as an integer; used as marker colour.
# `preprocessing` is not imported in this cell; it must already be in the kernel namespace.
le = preprocessing.LabelEncoder()
colors = le.fit_transform(cat_names)

# 2D scatter plot of the embedding; hovering shows the first 50 characters of the template.
trace2 = go.Scatter(
    x=tsne2d[:,0],
    y=tsne2d[:,1],
    mode='markers',
    text=[i[:50] for i in z_keys],
    marker=dict(
        sizemode='diameter',
        color = colors,
        colorscale = 'Portland',
        colorbar=dict(
            title='Colorbar',
            tickvals = np.arange(len(le.classes_)),
            ticktext = le.classes_
        ),
        line=dict(color='rgb(255, 255, 255)'),
        opacity=0.9,
        size=5
    )
)

data=[trace2]
layout=dict(height=800, width=800, title='Log Template Vectors')
fig=dict(data=data, layout=layout)
offline.iplot(fig, filename='tsne_example')

In [None]:
# Topic vectors (as a 2D array) and template strings of the selected rows.
selected_rows = vecs_df[bool_idx]
z = np.array([np.array(i) for i in selected_rows["v"].values])
z_keys = selected_rows["lt"].values

In [None]:
# 2D t-SNE embedding of the topic vectors of the selected rows (rebinds `tsne2d`).
tsne2d = TSNE(
    n_components=2, # number of dimensions after the reduction
    init='random',
    random_state=101,
    method='barnes_hut',
    n_iter=500,
    verbose=2
).fit_transform(z)

In [None]:
import plotly.offline as offline
import plotly.graph_objs as go
offline.init_notebook_mode()

# Encode the category of every selected row as an integer; used as marker colour.
# The variable holding those categories is `cat_names`; `cat_name` is not
# assigned anywhere and raised NameError.
# `preprocessing` is not imported in this cell; it must already be in the kernel namespace.
le = preprocessing.LabelEncoder()
colors = le.fit_transform(cat_names)

# 2D scatter plot of the embedding; hovering shows the first 50 characters of the template.
trace2 = go.Scatter(
    x=tsne2d[:,0],
    y=tsne2d[:,1],
    mode='markers',
    text=[i[:50] for i in z_keys],
    marker=dict(
        sizemode='diameter',
        color = colors,
        colorscale = 'Portland',
        colorbar=dict(
            title='Colorbar',
            tickvals = np.arange(len(le.classes_)),
            ticktext = le.classes_
        ),
        line=dict(color='rgb(255, 255, 255)'),
        opacity=0.9,
        size=5
    )
)

data=[trace2]
layout=dict(height=800, width=800, title='Log Template Vectors')
fig=dict(data=data, layout=layout)
offline.iplot(fig, filename='tsne_example')

In [None]:
# Rows of the vpn(mpls) category.
vecs_df[vecs_df["gt"]=="vpn(mpls)"]

### 単純な手法と比較してみる-2
- 編集距離

In [None]:
# Top words stored for topic 23.
inf_topic_words[23]

In [None]:
# Quick check of the edit-distance function on two strings that differ only in their last character.
import Levenshtein

string1 = "井上泰治"
string2 = "井上泰次"

print(Levenshtein.distance(string1, string2))

In [None]:
# Builds a KMeans object with default arguments; the instance is not assigned, so the cell only
# displays its repr. NOTE(review): looks like a leftover scratch cell — confirm whether it is needed.
KMeans()

In [None]:
# Print the first template in vecs_df together with its tokenisation (nothing is printed when empty).
for i in vecs_df["lt"].values[:1]:
    print(i)
    print(string_to_corpus(i))

In [None]:
# Take the 60 indices with the largest values in cos_mat[72] (highest first), then print
# and plot the template vector stored at each of them.
ranked = np.argsort(cos_mat[72])[::-1]
idxs = ranked[:60]

for idx in idxs:
    print(vecs_df["lt"][idx])

    v = np.array(list(vecs_df["v"][idx]))
    # positions of the components that exceed 0.08
    print(np.where(v > 0.08))

    plt.figure(figsize=(10, 5))
    plt.plot(v)
    plt.grid()
    plt.show()


In [None]:
# For every category, plot how many of its entries fall on each of the 40 topic ids and
# record the coefficient of variation (std / mean) of that histogram in cvs.
plt.style.use('seaborn-colorblind')
plt.style.use('seaborn-whitegrid')
cvs = []

for cat, res in inf_per_cat.items():
    topic_counts = collections.Counter(res)
    # A Counter yields 0 for ids that never occur, so every topic id gets a bar.
    y = [topic_counts[i] for i in range(40)]
    topic_ids = np.arange(40)

    plt.figure(figsize=(15, 5))
    plt.title(cat)
    plt.bar(topic_ids, y, color="#85C1E9")
    plt.xticks(topic_ids)
    # keep the y axis at least 5 high; otherwise leave one unit of headroom above the tallest bar
    minY = max(5, max(y) + 1)
    plt.ylim(0, minY)
    plt.show()

    counts = np.array(y)
    spread = counts.std()
    print(spread)
    cvs.append((cat, spread / counts.mean()))

- ppe, snmp, l2, ui, pfe, fpc, agg, None

In [None]:
# Bar chart of the per-category coefficient of variation, largest first.
sorted_t = sorted(cvs, key=lambda pair: pair[1], reverse=True)

names = [name for name, _ in sorted_t]
cv_values = [cv for _, cv in sorted_t]
slots = np.arange(len(sorted_t))

plt.figure(figsize=(15, 5))
plt.title('cv')
plt.bar(slots, cv_values, color="#85C1E9")
plt.xticks(slots, names, rotation=90)
# dotted reference line at cv = 3.5
plt.hlines(3.5, xmin=0, xmax=len(sorted_t), color='red', linestyles='dotted', lw=3)
plt.show()


In [None]:
# Display cat_per_inf.
# NOTE(review): in file order cat_per_inf is only assigned in a later cell, so this cell relies on
# out-of-order execution — confirm and move it below the cell that builds it.
cat_per_inf

In [None]:
# Count the values in `res`, look the counts up by the keys of inf_per_cat, and divide by total_per_cat.
# NOTE(review): `res` is whatever an earlier loop left behind, and `total_per_cat` is only assigned in
# the following cell, so this cell fails on a fresh top-to-bottom run — it looks like a scratch copy
# of the loop body in the next cell; confirm and remove.
y = collections.Counter(res)
y = [y[i] if i in y else 0 for i in inf_per_cat.keys()]
y = np.array(y) / total_per_cat

In [None]:
## Inverse mapping: for each inferred topic, which categories were assigned to it

# number of templates in each category, in the key order of inf_per_cat
total_per_cat = np.array([len(lt_per_cat[i]) for i in inf_per_cat.keys()])

cat_per_inf = {i: [] for i in range(40)}

for cat, res in inf_per_cat.items():
    for r in res:
        cat_per_inf[r].append(cat)

# In this loop `cat` is the topic id and `res` is the list of category names collected for it.
for cat, res in cat_per_inf.items():
    cat_counts = collections.Counter(res)
    # share of each category's templates that landed on this topic
    y = np.array([cat_counts[i] for i in inf_per_cat.keys()]) / total_per_cat
    slots = np.arange(len(inf_per_cat))

    plt.figure(figsize=(15, 5))
    plt.title(cat)
    plt.bar(slots, y, color="#85C1E9")
    plt.xticks(slots, inf_per_cat.keys(), rotation=90)
    plt.ylabel("#LT/#LT in the cat")
    plt.ylim(0, 1)
    plt.show()

In [None]:
# Group every raw log template under the topic the LDA model finds most probable for it.
# One bucket per topic of the model (previously a hard-coded 40).
lt_per_inf_topic = {topic_id: [] for topic_id in range(lda.num_topics)}

for ltid, data in raw_lt.items():
    lt = data['lt']

    # Tokenise once; only the first 7 tokens of the template are used.
    tokens = string_to_corpus(lt, max_len=7)
    # bag-of-words: list of (word_id, count) pairs
    c = list(collections.Counter(word_dict[w] for w in tokens).items())

    # lda[c] is unpacked as a 3-tuple, which is what gensim returns when the model was built
    # with per_word_topics=True; only the document-topic distribution is needed here.
    topic_dist, word_topic, word_topic_dist = lda[c]

    # topic_dist is a list of (topic_id, probability) pairs and may omit topics whose
    # probability is below the model's minimum_probability, so read the id out of the best
    # pair instead of using its position in the list.
    inf_topic = max(topic_dist, key=lambda t: t[1])[0]

    lt_per_inf_topic[inf_topic].append((lt, tokens))

In [None]:
# For each topic print its id and word list, then every template assigned to it with its tokens.
for inf_topic, lts in lt_per_inf_topic.items():
    print(inf_topic, inf_topic_words[inf_topic])

    for template, tokens in lts:
        print("\t", template, "\t", tokens)
    print()

### グラフ生成

- エッジの重みは，GTノードから出ているエッジの，それぞれの発生回数の割合

In [None]:
# Display inf_per_cat (the values are iterated as topic ids per category in the cells below).
inf_per_cat

In [None]:
# Directed graph from categories to inferred topics. Each edge weight is the fraction of the
# category's entries that were assigned to that topic.
G = nx.DiGraph()

# Layout: categories in a column at x=0, categories whose name contains 'None' at x=2,
# and the 40 topic ids in between at x=1.
pos = {}
for i, k in enumerate(inf_per_cat.keys()):
    pos[k] = (2, i) if 'None' in k else (0, i)
for i in range(40):
    # spread the topic nodes over the same vertical extent as the category column
    pos[i] = (1, len(inf_per_cat) / 40 * i)

for cat, res in inf_per_cat.items():
    G.add_node(cat)  # no-op when the node already exists

    col = collections.Counter(res)  # occurrences of each destination topic
    total = sum(col.values())
    for dst, weight in col.items():
        # add_edge creates the destination node when it is missing
        G.add_edge(cat, dst, weight=weight / total)

# Store the layout on the nodes. `G.node` was removed in networkx 2.4; `G.nodes` is the
# supported accessor. Topics that received no edge are not in the graph, so skip them
# instead of raising KeyError.
for n, p in pos.items():
    if n in G:
        G.nodes[n]['pos'] = p

In [None]:
# Print every edge of G as a (source, destination, attribute-dict) tuple.
for a in G.edges(data=True):
    print(a)

In [None]:
# Total incoming edge weight per destination node. Edges whose source name contains 'None'
# contribute 0: they still register the destination node but add no weight. The previous
# version fell through after the 'None' branch and added the real weight anyway.
inf_node_edge_weight = dict()
for (src, dst), attrs in G.edges.items():
    contribution = 0 if 'None' in src else attrs['weight']
    inf_node_edge_weight[dst] = inf_node_edge_weight.get(dst, 0) + contribution

# Alternative edge widths that were tried:
# edge_size = [i[2]['weight']*2 for i in G.edges(data=True)]
# edge_size = [1 if i[2]['weight']==1.0  else 0 for i in G.edges(data=True)]
# edge_size = [i[2]['weight'] if i[2]['weight']>0.55 else 0  for i in G.edges(data=True)]

# Edge width = edge weight; edges leaving a 'None' source get the sentinel value -1.
edge_size = [i[2]['weight'] if "None" not in i[0] else -1 for i in G.edges(data=True)]

In [None]:
# Histogram of edge sizes expressed in percent, truncated to one decimal place.
x = collections.Counter(int(e * 1000) / 10 for e in edge_size)

plt.figure(figsize=(10, 5))
plt.bar(x.keys(), x.values())
plt.title("Edge weights distribution")
plt.xlabel("%")
plt.ylabel("#edges")
plt.xticks(list(range(0, 100, 10)))
plt.show()

In [None]:
# Histogram of edge sizes in percent (one decimal place), without the 'None' edges.
x = collections.Counter(int(e * 1000) / 10 for e in edge_size)

# 'None' edges carry the sentinel size -1 and therefore land in the -100% bin. Drop every
# negative bin explicitly instead of overwriting the first sorted entry, which removed a
# real bin when no sentinel was present and failed on empty input.
x = sorted((pct, cnt) for pct, cnt in x.items() if pct >= 0)

plt.figure(figsize=(10, 5))
plt.bar([pct for pct, _ in x], [cnt for _, cnt in x])
plt.title("Edge weights distribution (without 'None')")
plt.xlabel("%")
plt.ylabel("#edges")
plt.xticks(list(range(0, 100, 10)))
plt.ylim(0, 30)
plt.show()

In [None]:
# Number of edges overall, and number of edges whose source name does not contain 'None'.
n_all_edges = len(G.edges())
print('All edges:', n_all_edges)

n_without_none = sum(1 for src, _dst in G.edges() if 'None' not in src)
print('Without None:', n_without_none)

In [None]:
# Edges whose size is exactly 1, i.e. the only outgoing edge of their source
edge_list = list(G.edges())
sizes = np.array(edge_size)

for i in np.where(sizes == 1)[0]:
    print(edge_list[i], "\t", edge_size[i])

print()
# Edges with size above 0.55, with the number of templates in the source category
for i in np.where(sizes > 0.55)[0]:
    print(edge_list[i], "\n\t", edge_size[i], len(lt_per_cat[edge_list[i][0]]))

In [None]:
# Draw the category -> topic graph using the precomputed layout in pos.
plt.figure(figsize=(20, 30))
# String-named nodes get a fixed size of 20; the other nodes are scaled by inf_node_edge_weight.
node_size = [20 if type(g) == str else inf_node_edge_weight[g]*500 for g in G.nodes]
# NOTE(review): edge_size holds -1 for edges leaving a 'None' source; confirm how the drawing
# backend renders a negative line width.
nx.draw_networkx(G, pos, node_color='b', alpha=0.6, node_size=node_size, width=edge_size, edge_color='g')

plt.show()

# Earlier drawing variants, kept for reference:
# nx.draw_networkx_labels(G, pos, fontsize=14, font_family="Yu Gothic", font_weight="bold")

# edge_width = [ d['weight']*0.2 for (u,v,d) in G.edges(data=True)]
# nx.draw_networkx_edges(G, pos, alpha=0.4, edge_color='C', width=edge_width)

# plt.axis('off')
# plt.savefig("g2.png")
# plt.show()

### 集計結果

- gtのカテゴリごとに，どのトピックが振られたかを確認
    - カテゴリとトピックのそれらしい対応をつけて見た
    - 複数マージされるものと，対応づけられなかったものが存在
- 訓練時はUIの単語を含むものは除外したが，評価時は全て含めた
- gt通りにうまくいったものは存在

## VyOS

In [None]:
# Load the VyOS template file, one list entry per line (line endings are kept).
with open("./vyos_logs/tpl_all_uniq", "r") as vyos_file:
    raw_vyos = list(vyos_file)

In [None]:
# Tokenise every VyOS template. string_to_corpus (defined near the top of the notebook)
# performs the same strip-symbols / split / lowercase / drop-empty steps that were
# previously copy-pasted inline here.
corpus = [string_to_corpus(lt) for lt in raw_vyos]

In [None]:
# Build the VyOS vocabulary and the bag-of-words corpus.
# The vocabulary is derived from raw_vyos directly rather than from the previous value of
# `corpus`. This cell overwrites `corpus` with (word_id, count) pairs, so reading it back
# made the cell fail to reproduce its result when re-run.
tokenized = [string_to_corpus(lt) for lt in raw_vyos]

all_words = list({word for tokens in tokenized for word in tokens})
word_dict = {w: i for i, w in enumerate(all_words)}
inv_word_dict = {v: k for k, v in word_dict.items()}

# One entry per template: (word_id, count) pairs over its first 7 tokens only.
corpus = [
    list(collections.Counter(word_dict[w] for w in tokens[:7]).items())
    for tokens in tokenized
]

In [None]:
# Word counts: for each word id, the number of corpus entries it appears in, printed from
# the most to the least frequent and stored by word in word_cnt.
word_cnt = {}
id_counts = collections.Counter(c[0] for s in corpus for c in s)
for wid, cnt in sorted(id_counts.items(), key=lambda x: x[1])[::-1]:
    word = inv_word_dict[wid]
    print(word, cnt)
    word_cnt[word] = cnt

In [None]:
# Template-level co-occurrence: cooc[i, j] counts the corpus entries that contain both
# word id i and word id j.
# Counts are accumulated in a NumPy array. The earlier chained-indexing writes
# (`cor_df[a][b] += 1`) are not guaranteed to update the DataFrame and are very slow.
n_words = len(word_dict)
cooc = np.zeros((n_words, n_words), dtype=int)

for s in corpus:
    for a, b in itertools.combinations(s, 2):
        ida = int(a[0])
        idb = int(b[0])
        # keep the matrix symmetric
        cooc[ida, idb] += 1
        cooc[idb, ida] += 1

# Same labels as before, so lookups such as cor_df[word_id] keep working.
cor_df = pd.DataFrame(cooc, index=np.arange(n_words), columns=np.arange(n_words))

# Every pair a < b with a non-zero count, in ascending (a, b) order, as [(a, b), count].
pair_rows, pair_cols = np.nonzero(np.triu(cooc, k=1))
cor_list = [[(a, b), cooc[a, b]] for a, b in zip(pair_rows, pair_cols)]

# for i in tqdm.tqdm(sorted(cor_list, key=lambda x:x[1], reverse=True)):
#     a,b = i[0]
#     print(inv_word_dict[a], inv_word_dict[b], i[1])
#     print("\t simpson:", i[1]/min(word_cnt[inv_word_dict[a]], word_cnt[inv_word_dict[b]]))
#     print("\t jaccard:", i[1]/(word_cnt[inv_word_dict[a]] + word_cnt[inv_word_dict[b]] - i[1]))
#     print()

In [None]:
# Look up the integer id assigned to the word 'failed'.
word_dict['failed']

In [None]:
# Total co-occurrence count in column 315 of cor_df.
# NOTE(review): 315 is presumably the id that word_dict['failed'] returned in one session; ids come
# from enumerating a set of words, so they are not stable across runs — confirm and consider
# indexing with word_dict['failed'] instead.
cor_df[315].sum()

In [None]:
# Bar chart of the co-occurrence counts ranked 1000 to 1999 (descending order).
ranked_counts = [entry[1] for entry in sorted(cor_list, key=lambda x: x[1], reverse=True)]
plot_data = ranked_counts[1000:2000]

plt.figure(figsize=(15, 15))
plt.bar(np.arange(len(plot_data)), plot_data)
plt.grid()
plt.show()

In [None]:
# Directed graph of strongly co-occurring words: an edge runs from the rarer word to the
# more frequent one and is weighted by the Jaccard coefficient of the pair.
G = nx.DiGraph()

MIN_COOCCURRENCE = 50            # a pair must co-occur more than this many times
MIN_SIMPSON = 0.95               # and its Simpson (overlap) coefficient must exceed this
EXCLUDED_WORDS = {'failed', 'cant'}

for (a, b), count in sorted(cor_list, key=lambda x: x[1], reverse=True):
    if count <= MIN_COOCCURRENCE:
        break  # sorted in descending order, so no later pair can qualify

    word_a, word_b = inv_word_dict[a], inv_word_dict[b]
    if word_a in EXCLUDED_WORDS or word_b in EXCLUDED_WORDS:
        continue

    cnt_a, cnt_b = word_cnt[word_a], word_cnt[word_b]
    simpson = count / min(cnt_a, cnt_b)
    jaccard = count / (cnt_a + cnt_b - count)

    if simpson > MIN_SIMPSON:
        # Nodes are keyed by word. The old `a not in G.nodes` test compared integer ids
        # against these word keys and was therefore always true; add_node is already a
        # no-op for existing nodes, so no test is needed.
        G.add_node(word_a)
        G.add_node(word_b)

        # the rarer word is the source; on a tie the first word of the pair wins
        src, dst = (word_a, word_b) if cnt_a <= cnt_b else (word_b, word_a)
        G.add_edge(src, dst, weight=jaccard)

In [None]:
# Export the graph in node-link format as JSON.
json_data = nx.node_link_data(G)
# The context manager closes the file even if serialisation fails; UTF-8 is set explicitly
# because ensure_ascii=False may write non-ASCII characters.
with open("./d3_test/vyos_nx.json", "w", encoding="utf-8") as f:
    json.dump(json_data, f, ensure_ascii=False, indent=4, sort_keys=True, separators=(',', ': '))

In [None]:
# Draw the word graph with networkx's shell layout.
plt.figure(figsize=(13, 13))
nx.draw_networkx(G, pos=nx.shell_layout(G))
plt.show()

## LDA

### full length

In [None]:
# List the category names (the keys of lt_per_cat).
for cat, lts in lt_per_cat.items():
    print(cat)

In [None]:
# Keep only the corpus entries that do not contain the word 'ui' (the original note gives
# its id as 550), remembering each kept entry's position, then de-duplicate them.
tmp = []
eval_corpus = []
eval_corpus_ltids = []
for ltid, bow in enumerate(corpus):
    if word_dict['ui'] in [entry[0] for entry in bow]:
        continue
    tmp.append(tuple(sorted(bow)))  # hashable form, used for de-duplication below
    eval_corpus.append(bow)
    eval_corpus_ltids.append(ltid)

print(len(tmp))
set_corpus = list(set(tmp))

In [None]:
# Display the corpus. NOTE(review): this renders every entry; consider showing a slice such as corpus[:5].
corpus

In [None]:
# Train a 40-topic LDA model on the de-duplicated corpus.
# random_state is fixed (same seed as the t-SNE cell) so that topic assignments are
# reproducible across kernel restarts; without it every run gives different topics.
# Options that were tried but are disabled: corpus=corpus, gamma_threshold=1e-5,
# eval_every=10, alpha='auto', eta='auto'.
lda = gensim.models.ldamodel.LdaModel(
    corpus=set_corpus,
    chunksize=10,
    num_topics=40,
    id2word=inv_word_dict,
    minimum_probability=1e-8,
    iterations=100,
    decay=1.0,
    passes=1,
    random_state=101,
    per_word_topics=True,  # makes lda[bow] return per-word topic information as well
)

In [None]:
# Index of the largest value in each row of the first array returned by lda.inference,
# one per entry of eval_corpus.
gamma = lda.inference(eval_corpus)[0]
results = [np.argmax(row) for row in gamma]

# Group the entries (as word lists) by their inferred topic.
inference_results = {}
for bow, topic in zip(eval_corpus, results):
    words = [inv_word_dict[entry[0]] for entry in bow]
    inference_results.setdefault(topic, []).append(words)

In [None]:
# Print the words behind the terms that lda.get_topic_terms returns for each topic.
for topic in range(lda.num_topics):
    top_words = [inv_word_dict[term[0]] for term in lda.get_topic_terms(topic)]
    print(topic, top_words)

In [None]:
# Group template ids by inferred topic.
# `results` as built in the inference cell is a list aligned with eval_corpus, so it has no
# .items(); pair it with eval_corpus_ltids, which records the id of each evaluated entry.
# A dict of {ltid: topic} is still accepted for backward compatibility.
if isinstance(results, dict):
    ltid_topic_pairs = results.items()
else:
    ltid_topic_pairs = zip(eval_corpus_ltids, results)

inf_ltids = {}
for ltid, inf_cat in ltid_topic_pairs:
    inf_ltids.setdefault(inf_cat, []).append(ltid)

In [None]:
# For every inferred topic, print the raw template text of each id assigned to it.
for inf_cat, ltids in inf_ltids.items():
    print("topic:", inf_cat)
    for i in ltids:
        template = raw_lt[str(i)]['lt']
        print(template)
    print()

### eval

- 外れたやつについて，調査

In [None]:
# Regroup the assignments by ground-truth category: category -> [(ltid, inferred topic), ...].
compared_results = {}
for inf_cat, ltids in inf_ltids.items():
    for ltid in ltids:
        true_cat = raw_lt[str(ltid)]['cat']
        compared_results.setdefault(true_cat, []).append((ltid, inf_cat))

In [None]:
# For each ground-truth category, plot how many of its templates were assigned to each topic id 0..39.
for cat, res in compared_results.items():
    plt.figure(figsize=(15, 5))
    plt.title(cat)
    # r[1] is the inferred topic of each (ltid, inferred topic) pair
    y = collections.Counter([r[1] for r in res])
    # dense list of counts over the 40 topic ids (0 where a topic never occurs)
    y = [y[i] if i in y else 0 for i in range(40)]
#     print(collections.Counter([r[1] for r in res]), y)
    plt.bar(np.arange(40), y)
    plt.show()