In [1]:
# import libraries
from __future__ import print_function

import pandas as pd
import numpy as np

import time

from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics

from sklearn.cluster import KMeans, MiniBatchKMeans

import logging
from optparse import OptionParser
import sys
from time import time

# Display all cell outputs
# (with 'all', IPython echoes the value of every top-level expression in a cell,
# not only the last one).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Handle to the running IPython shell. In this part of the notebook it is only
# referenced by the commented-out autoreload setup that follows.
from IPython import get_ipython
ipython = get_ipython()

# autoreload extension
# if 'autoreload' not in ipython.extension_manager.loaded:
#     get_ipython().run_line_magic('load_ext', 'autoreload')

# get_ipython().run_line_magic('autoreload', '2')

# Visualizations
import seaborn as sns

In [2]:
# Two-character suffix interpolated into the input file name right after "202003"
# (presumably the day of March 2020 to analyse -- confirm against the data layout).
v = '07'

In [3]:
# Load one log file and split every message into lower-cased tokens.
# `v` is interpolated into the file name right after "202003".
file_name = f"../File/FrontendFileGroup/storm-frontend-202003{v}-mask-group.txt"

print('Reading', file_name)
# `nrows` is documented as an integer, so pass one rather than the float 1e4.
logs = pd.read_csv(file_name, index_col=0, nrows=10_000)

# pandas parses empty CSV fields as NaN (a float). Calling `.lower()` on it raises
# AttributeError, and TfidfVectorizer also rejects NaN documents. Replace missing
# messages with empty strings so that they tokenize to an empty list instead.
logs["message"] = logs["message"].fillna("")

# One list of whitespace-separated, lower-cased tokens per message.
tokens_per_message = [x.lower().split() for x in logs.message]

Reading ../File/FrontendFileGroup/storm-frontend-20200307-mask-group.txt


In [4]:
# Build the vocabulary: the set of distinct tokens seen across all messages.
word_set = set()

for mess in tokens_per_message:
    # Grow the set in place. Rebinding to `word_set.union(...)` would copy the
    # whole accumulated vocabulary once per message.
    word_set.update(mess)

In [5]:
# Report the corpus size: number of messages and number of distinct tokens.
summary_template = "We have {} logs messages, for a total of {} unique tokens adopted."
print(summary_template.format(len(tokens_per_message), len(word_set)))

We have 10000 logs messages, for a total of 9863 unique tokens adopted.


In [6]:
# One counter dict per message, pre-seeded with every vocabulary token at 0.
n_messages = len(tokens_per_message)
word_dict = [dict.fromkeys(word_set, 0) for _ in range(n_messages)]

# Tally the raw frequency of each token within its own message.
for row in range(len(logs.message)):
    counts = word_dict[row]
    for token in tokens_per_message[row]:
        counts[token] = counts[token] + 1

In [7]:
# Count and display the messages that produced no tokens at all.
c = 0
for i, tokens in enumerate(tokens_per_message):
    if not tokens:
        # `i` is a position produced by enumerate, so the matching row of `logs`
        # is fetched positionally. The earlier lookup used `errors`, a name that is
        # never defined in this notebook and raised NameError on the first hit.
        print(i, logs.iloc[i])
        c += 1

# NOTE(review): no cell visible here actually drops these messages -- confirm the
# exclusion announced below happens downstream.
print("Warning: there are {} blanck messages which will be excluded from the analysis.".format(c))



In [8]:
# Turn the raw messages into a sparse TF-IDF matrix.
print("Extracting features from the training dataset using a sparse vectorizer")
t0 = time()

# max_df=0.8 ignores terms present in more than 80% of the messages and
# min_df=0.02 ignores terms present in fewer than 2% of them; leaving both
# out would keep the whole vocabulary (minus English stop words).
tfidf_options = dict(max_df=0.8, min_df=0.02, stop_words='english', use_idf=True)
vectorizer = TfidfVectorizer(**tfidf_options)
X = vectorizer.fit_transform(logs.message)

elapsed = time() - t0
print("done in %fs" % elapsed)
n_samples, n_features = X.shape
print("n_samples: %d, n_features: %d" % (n_samples, n_features))
print()


Extracting features from the training dataset using a sparse vectorizer
done in 0.753497s
n_samples: 10000, n_features: 93



In [9]:
# Apply LSA for dimensionality reduction to get a lower-dimensional embedding space
print("Performing dimensionality reduction using LSA")
t0 = time()

# Vectorizer results are normalized, which makes KMeans behave as
# spherical k-means for better results. Since LSA/SVD results are
# not normalized, we have to redo the normalization.
#
# TruncatedSVD uses a randomized solver by default, so the seed is pinned to
# make the embedding (and everything computed from it) reproducible across runs.
svd = TruncatedSVD(n_components=25, random_state=42)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
# NOTE(review): `X` is overwritten with its own reduced version, so this cell is
# not idempotent -- re-run the TF-IDF cell before executing this one again.
X = lsa.fit_transform(X)

print("done in %fs" % (time() - t0))

Performing dimensionality reduction using LSA
done in 0.081740s


In [10]:
# Sum the per-component explained-variance ratios of the fitted SVD and report
# the total as a whole-number percentage.
explained_variance = svd.explained_variance_ratio_.sum()
variance_pct = int(explained_variance * 100)  # int() truncates rather than rounds
print("Explained variance of the SVD step: {}%".format(variance_pct))

Explained variance of the SVD step: 96%
