- LTに重みをつけたい
  - BGPはBGPで近いところにおきたい
  - 似たような意味のLTは同じような値にしたい

### Unigram Mixture やってみる

In [1]:
import numpy as np
import pandas as pd
import json
import gensim
import collections
import matplotlib.pyplot as plt
import pprint
import itertools
import seaborn as sns
import networkx as nx
from IPython.display import display_svg
import tqdm

SYMBOLS = ["'", '>', '<','=', '(', ')', '[', ']', '**', ':', ',','"','-','/']

def remove_symbols(value):
    for s in SYMBOLS:
        value = value.replace(s, "")
    return value.replace("_", " ")


%matplotlib inline

In [2]:
with open("./lt.json", "r") as f:
    raw_lt = json.load(f)

In [3]:
# カテゴリ別分類

lt_per_cat = {}
for ltid, value in raw_lt.items():
    if value['cat'] not in lt_per_cat:
        lt_per_cat[value['cat']] = [value['lt']]
    else:
        lt_per_cat[value['cat']].append(value['lt'])

In [4]:
# カテゴリ別コーパス作成

all_corpus = dict()
for cat, lts in lt_per_cat.items():
    corpus = []
    for lt in lts:
        words = [remove_symbols(t) for t in lt.split()]
        split_words = []
        for w in words:
            if len(w.split()) > 1: # さらに分割できる場合
                split_words.extend(w.split())
            else:
                split_words.append(w)
        split_words = [t.lower() for t in split_words if t != '']
        corpus.extend(split_words)
    all_corpus[cat] = corpus

In [None]:
## TF-IDF

df = dict()
for cat, words in all_corpus.items():
    for word in set(words):
        if word not in df:
            df[word] = 1
        else:
            df[word] += 1
            
tfidfs = {}
for cat, words in all_corpus.items():
    all_word_freq = len(words)
    tfidf_per_word = {}
    
    for word, cnt in collections.Counter(words).items():
        tf = cnt/all_word_freq
        tfidf = tf/np.log(len(all_corpus.keys())/df[word])
        tfidf_per_word[word] = tfidf
    
    tfidfs[cat] = tfidf_per_word
    
for cat, tfidfs_per_word in tfidfs.items():
    print(cat)
    pprint.pprint(sorted(tfidfs_per_word.items(), key=lambda x:x[1])[-10::][::-1])
    print()

In [5]:
'''LDA用のコーパス作成

[ [(word_id, cnt), ...], ... ]

'''

all_words = list(set([word for cat, words in all_corpus.items() for word in words]))
word_dict = {w:i for i, w in enumerate(all_words)}
inv_word_dict = {v:k for k,v in word_dict.items()}

corpus = []
for cat, lts in lt_per_cat.items():
    if True: #cat == 'system(alarm)' or cat == 'service(ntp)':
        for lt in lts:
            words = [remove_symbols(t) for t in lt.split()]
            split_words = []
            for w in words:
                if len(w.split()) > 1: # さらに分割できる場合
                    split_words.extend(w.split())
                else:
                    split_words.append(w)
            split_words = [t.lower() for t in split_words if t != '']

            split_words = split_words[:7] # 前方のみ残す

            lt_word_id_cnt = collections.Counter([word_dict[w] for w in split_words])

            corpus.append([(tid, cnt) for tid, cnt in lt_word_id_cnt.items()])

In [6]:
# 単語のカウント
word_cnt = {}
for wid, cnt in sorted(collections.Counter([c[0] for s in corpus for c in s]).items(), key= lambda x: x[1])[::-1]:
    print(inv_word_dict[wid], cnt)
    word_cnt[inv_word_dict[wid]] = cnt

ui 543
user 537
mgd 505
cfg 470
audit 466
rpd 374
set 324
other 257
event 198
index 174
broadcast 162
delete 136
multicast 115
up 110
kernel 105
mpls 85
lsp 83
to 80
evt 62
from 61
init 60
updown 55
login 55
bgp 53
for 51
mtu 46
chassisd 42
alarm 41
task 41
file 41
pic 41
change 41
trap 40
error 40
address 40
read 39
received 39
mcsn 39
sent 38
port 38
is 38
add 36
snmp 36
notification 36
on 36
pointtopoint 35
failed 34
xntpd 34
cmd 31
luchip 30
class 30
pid 29
down 28
deactivate 28
status 28
bandwidth 27
message 27
neighbor 26
signal 26
state 26
link 25
cleared 25
junoscript 24
of 24
connection 24
info 24
alarmd 23
craftd 23
generated 23
peer 23
switch 22
reinitializing 22
used 22
slot 22
no 22
activate 21
pci 21
not 21
color 20
pfe 20
recv 19
master 19
terminate 19
bulkget 18
chas 18
pfeman 18
system 18
config 18
sshd 18
libjsnmp 17
as 17
time 17
re 16
secret 16
active 16
rsp 16
interface 16
realm 15
ospf 15
snmpd 15
mqchip 15
lu 15
fpc 15
with 15
major 15
by 15
failure 15
rsvp 14
ma

In [7]:
corpus

[[(1116, 1), (686, 1), (467, 1), (1538, 1)],
 [(1116, 1), (467, 1), (630, 1), (1100, 1)],
 [(1116, 1), (467, 1), (862, 1), (352, 1)],
 [(1116, 1), (467, 1), (1538, 1)],
 [(1116, 1), (467, 1), (630, 1), (1100, 1)],
 [(1116, 1), (467, 1), (862, 1), (352, 1)],
 [(1116, 1), (686, 1), (467, 1), (429, 1)],
 [(1116, 1), (587, 1), (1128, 1), (430, 1)],
 [(909, 1), (439, 1), (1021, 1), (95, 1), (1070, 1), (968, 1), (41, 1)],
 [(909, 1), (190, 1), (439, 1), (1021, 1), (95, 1), (1070, 1), (968, 1)],
 [(435, 1), (439, 1), (1021, 1), (95, 1), (1070, 1), (968, 1), (41, 1)],
 [(435, 1), (190, 1), (439, 1), (1021, 1), (95, 1), (1070, 1), (968, 1)],
 [(1318, 1), (1165, 1), (439, 2), (338, 1), (1473, 1), (1387, 1)],
 [(1086, 2), (941, 1), (439, 1), (273, 1), (505, 1), (1057, 1)],
 [(877, 1), (1667, 1), (623, 1), (109, 1)],
 [(877, 1), (1318, 1), (129, 1), (267, 1), (1019, 1)],
 [(877, 1), (613, 1), (793, 1), (950, 1)],
 [(877, 1), (1052, 1), (1440, 1), (30, 1), (351, 1)],
 [(877, 1), (1052, 1)],
 [(877,

## LDA

### full length

In [None]:
len(lt_per_cat)

In [None]:
for k,v in lt_per_cat.items():
    if k == 'system(alarm)':
        print(len(v), v[0])
    if k == 'service(ntp)':
        print(len(v), v[0])

- eval_everyとchunksizeがかなり効いた...
- minimum_probabilityも，下げる必要あり．各単語の値が打ち切られないくらいの値．
- eta, alpha はあまり大きな影響なし．
- 1回しか出てきてない単語のうち，表記揺れのもの(品詞が違う．success or successfull など)

In [None]:
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, 
                                      chunksize=5,
                                      num_topics=30, 
                                      id2word=inv_word_dict, 
                                      minimum_probability=1e-10, 
                                      #gamma_threshold=1e-5,
                                      #eval_every=10,
                                      #iterations=100,
#                                       alpha='auto', 
#                                       eta='auto', 
                                      decay=1.0,
                                      passes=2,
                                      per_word_topics=True)

In [None]:
lda.log_perplexity(corpus)

In [None]:
results =  [np.argmax(l) for l in lda.inference(corpus)[0]]

In [None]:
inference_results = {}
for a,b in zip(corpus, results):
    if b in inference_results:
        inference_results[b].append([inv_word_dict[c[0]] for c in a])
    else:
        inference_results[b] = [[inv_word_dict[c[0]] for c in a]]

In [None]:
for topic in range(30):
    print(topic, [inv_word_dict[i[0]] for i in lda.get_topic_terms(topic)])
# print('1', [inv_word_dict[i[0]] for i in lda.get_topic_terms(1)])

In [None]:
for i,j in lda.show_topics(num_topics=50):
    print(i)
    print("\t", j)

In [None]:
for k,v in inference_results.items():
    print(k)
    for vv in v:
        print('\t', " ".join(vv))
    print()

In [None]:
for i,j in word_dict.items():
    
    if i not in word_cnt:
        cnt = "not shown"
    else:
        cnt = word_cnt[i] 
        
    print(i, cnt, lda.get_term_topics(j))

In [None]:
a = lda.get_document_topics(corpus, per_word_topics=True)
for i,j in zip(a, corpus_lt):
    print(j)
    print(i[0])
    for k in i[2]:
        print(inv_word_dict[k[0]], "\t", k[1])
    print()

In [None]:
inf_ltids = {}
for i, inf_cat in enumerate(results):
    if inf_cat in inf_ltids:
        inf_ltids[inf_cat].append(i)
    else:
        inf_ltids[inf_cat] = [i]

In [None]:
for inf_cat, ltids in inf_ltids.items():
    print("topic:", inf_cat)
    for i in ltids:
        print([inv_word_dict[c[0]] for c in corpus[i]])
    print()

In [None]:
lda.show_topics(num_topics=64, num_words=20)

In [None]:
compared_results = {}
for inf_cat, ltids in inf_ltids.items():
    for ltid in ltids:
        true_cat = raw_lt[str(ltid)]['cat']
        if true_cat in compared_results:
            compared_results[true_cat].append((ltid, inf_cat))
        else:
            compared_results[true_cat] = [(ltid, inf_cat)]

In [None]:
for cat, res in compared_results.items():
    plt.figure(figsize=(12,5))
    plt.title(cat)
    y = collections.Counter([r[1] for r in res])
    y = [y[i] if i in y else 0 for i in range(30)]
#     print(collections.Counter([r[1] for r in res]), y)
    plt.bar(np.arange(30), y)
    plt.show()

### all split

In [None]:
all_corpus

In [None]:
all_words = list(set([word for cat, words in all_corpus.items() for word in words]))
word_dict = {w:i for i, w in enumerate(all_words)}
inv_word_dict = {v:k for k,v in word_dict.items()}

corpus = []
for cat, lts in lt_per_cat.items():
    for lt in lts:
        words = [remove_symbols(t) for t in lt.split()]
        split_words = []
        for w in words:
            if len(w.split()) > 1: # さらに分割できる場合
                split_words.extend(w.split())
            else:
                split_words.append(w)
        split_words = [t.lower() for t in split_words if t != '']
        
        split_words = split_words[:7]
        split_words = [t for t in split_words if t != 'ui']
        
        lt_word_id_cnt = collections.Counter([word_dict[w] for w in split_words])
        
        corpus.append([(tid, cnt) for tid, cnt in lt_word_id_cnt.items()])

In [None]:
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, num_topics=30, id2word=inv_word_dict)

In [None]:
results =  [np.argmax(l) for l in lda.inference(corpus)[0]]
results = {ltid:r for ltid, r in enumerate(results)}

In [None]:
inf_ltids = {}
for ltid, inf_cat in results.items():
    if inf_cat in inf_ltids:
        inf_ltids[inf_cat].append(ltid)
    else:
        inf_ltids[inf_cat] = [ltid]

In [None]:
for inf_cat, ltids in inf_ltids.items():
    print("topic:", inf_cat)
    for i in ltids:
        print(len(ltids))
        break
#         print(raw_lt[str(i)]['lt'])
    print()

In [None]:
for inf_cat, ltids in inf_ltids.items():
    print("topic:", inf_cat)
    for i in ltids:
        print(raw_lt[str(i)]['lt'])
    print()

In [None]:
lda.show_topics(num_topics=30)

In [None]:
inf_ltids = {}
for ltid, inf_cat in results.items():
    if inf_cat in inf_ltids:
        inf_ltids[inf_cat].append(ltid)
    else:
        inf_ltids[inf_cat] = [ltid]

In [None]:
compared_results_3 = {}
for inf_cat, ltids in inf_ltids.items():
    for ltid in ltids:
        true_cat = raw_lt[str(ltid)]['cat']
        if true_cat in compared_results_3:
            compared_results_3[true_cat].append((ltid, inf_cat))
        else:
            compared_results_3[true_cat] = [(ltid, inf_cat)]

In [None]:
vec_per_topic = dict()
for tpid, vecs in lda.show_topics(formatted=False):
    tp_vec = np.zeros(len(word_dict), dtype=float)
    for vec in vecs:
        v = np.zeros(len(word_dict), dtype=float)
        v[word_dict[vec[0]]] = 1
        tp_vec += v*vec[1]
    vec_per_topic[tpid] = tp_vec
    
for a, b in itertools.combinations(vec_per_topic.keys(), 2):
    print(a, b)
    print(np.dot(vec_per_topic[a], vec_per_topic[b]))

In [None]:
for (cat, res), (cat2, res2), (cat3, res3)  in zip(sorted(compared_results.items()), sorted(compared_results_2.items()), sorted(compared_results_3.items())):
    plt.figure(figsize=(15, 4))
    plt.subplot(131)
    plt.title(cat)
    y = collections.Counter([r[1] for r in res])
    y = [y[i] if i in y else 0 for i in range(10)]
    plt.bar(np.arange(10), y)
    plt.subplot(132)
    plt.title(cat2)
    y = collections.Counter([r[1] for r in res2])
    y = [y[i] if i in y else 0 for i in range(10)]
    plt.bar(np.arange(10), y)
    plt.subplot(133)
    plt.title(cat3)
    y = collections.Counter([r[1] for r in res3])
    y = [y[i] if i in y else 0 for i in range(10)]
    plt.bar(np.arange(10), y)
    
    plt.show()

### split

In [None]:
all_words = list(set([word for cat, words in all_corpus.items() for word in words]))
word_dict = {w:i for i, w in enumerate(all_words)}
inv_word_dict = {v:k for k,v in word_dict.items()}

corpus = []
for cat, lts in lt_per_cat.items():
    for lt in lts:
        words = [remove_symbols(t) for t in lt.split()]
        split_words = []
        for w in words:
            if len(w.split()) > 1: # さらに分割できる場合
                split_words.extend(w.split())
            else:
                split_words.append(w)
        split_words = [t.lower() for t in split_words if t != '']
        
        split_words = split_words[:7]
        
        lt_word_id_cnt = collections.Counter([word_dict[w] for w in split_words])
        
        corpus.append([(tid, cnt) for tid, cnt in lt_word_id_cnt.items()])

In [None]:
# lda = gensim.models.ldamodel.LdaModel(corpus=corpus, num_topics=10, id2word=inv_word_dict)

In [None]:
results =  [np.argmax(l) for l in lda.inference(corpus)[0]]
results = {ltid:r for ltid, r in enumerate(results)}

In [None]:
inf_ltids = {}
for ltid, inf_cat in results.items():
    if inf_cat in inf_ltids:
        inf_ltids[inf_cat].append(ltid)
    else:
        inf_ltids[inf_cat] = [ltid]

In [None]:
for inf_cat, ltids in inf_ltids.items():
    print("topic:", inf_cat)
    for i in ltids:
        print(raw_lt[str(i)]['lt'])
    print()

In [None]:
inf_ltids = {}
for ltid, inf_cat in results.items():
    if inf_cat in inf_ltids:
        inf_ltids[inf_cat].append(ltid)
    else:
        inf_ltids[inf_cat] = [ltid]

In [None]:
compared_results_2 = {}
for inf_cat, ltids in inf_ltids.items():
    for ltid in ltids:
        true_cat = raw_lt[str(ltid)]['cat']
        if true_cat in compared_results_2:
            compared_results_2[true_cat].append((ltid, inf_cat))
        else:
            compared_results_2[true_cat] = [(ltid, inf_cat)]

In [None]:
for (cat, res), (cat2, res2) in zip(compared_results.items(), compared_results_2.items()):
    plt.figure(figsize=(10, 4))
    plt.subplot(121)
    plt.title(cat)
    y = collections.Counter([r[1] for r in res])
    y = [y[i] if i in y else 0 for i in range(10)]
    plt.bar(np.arange(10), y)
    plt.subplot(122)
    plt.title(cat2)
    y = collections.Counter([r[1] for r in res2])
    y = [y[i] if i in y else 0 for i in range(10)]
    plt.bar(np.arange(10), y)
    
    plt.show()