In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline

from subprocess import check_output
from wordcloud import WordCloud, STOPWORDS

import nltk.corpus # for stopwords
from nltk.tokenize import wordpunct_tokenize

In [None]:
# Materialise both stopword collections as sets so they can be compared directly.
wordcloud_stopwords = set(STOPWORDS)
nltk_stopwords = set(nltk.corpus.stopwords.words('english'))

# Show the entries unique to each list: NLTK-only first, then wordcloud-only.
print(nltk_stopwords.difference(wordcloud_stopwords))
print(wordcloud_stopwords.difference(nltk_stopwords))

In [None]:
# Read the whole log file into a single string in one call.
with open('Log.txt') as log_file:
    data = log_file.read()

print(data)

In [None]:
# Split the raw log text into a list of tokens using NLTK's
# wordpunct_tokenize; the result feeds the stopword filtering below.
tokenized = wordpunct_tokenize(data)
# Dump the full token list for inspection.
print(tokenized)

In [None]:
# Filter the token stream against each stopword list independently so the
# two results can be compared.
wordcloud_filtered = []
nltk_filtered = []

for word in tokenized:
    # Both stopword lists contain lowercase entries, so compare on the
    # lowercased token; the original casing is kept in the output lists.
    lowered = word.lower()

    if lowered not in wordcloud_stopwords:
        wordcloud_filtered.append(word)

    # Check against the NLTK *stopword set*. Testing membership in
    # nltk_filtered (the output list) would only de-duplicate tokens and
    # never remove a stopword.
    if lowered not in nltk_stopwords:
        nltk_filtered.append(word)

print('Words in nltk set that are not in wordcloud set:', set(nltk_filtered) - set(wordcloud_filtered))
print('Words in wordcloud set that are not in nltk set:', set(wordcloud_filtered) - set(nltk_filtered))

It seems that the wordcloud stopword list removes more uninteresting words than the NLTK stopword list does.

In [None]:
# Start from wordcloud's built-in stopwords and extend them with terms that
# should also be kept out of the generated clouds.
document_words = ('page', 'paper', 'readings', 'research', 'summary')

filler_words = ('look', 'possible', 'related', 'seemed', 'seems',
                'understand', 'use', 'used', 'using')

source_words = ('wikipedia',)

separator_tokens = ('________________',)

# time related words
time_words = ('date', 'november', 'week',
              'monday', 'tuesday', 'wednesday', 'thursday', 'friday')

stopwords = set(STOPWORDS)
stopwords.update(document_words, filler_words, source_words,
                 separator_tokens, time_words)

In [None]:
from nltk.stem import PorterStemmer

# Reduce every wordcloud-filtered token to its Porter stem.
stemmer = PorterStemmer()
stemmed = [stemmer.stem(token) for token in wordcloud_filtered]

print(stemmed)

In [None]:
from nltk.stem import WordNetLemmatizer

def penn2morphy(penntag):
    """Convert a Penn Treebank POS tag to a WordNet POS tag.

    Only the first two characters of the tag are considered, so 'NNS',
    'VBD', 'JJR' and so on map to their base category.

    Args:
        penntag: Penn Treebank tag string such as 'NN' or 'VBD'.

    Returns:
        'n', 'a', 'v' or 'r'. Any tag that is not recognised, and any
        non-string input such as None, falls back to 'n' (noun).
    """
    morphy_tag = {'NN': 'n', 'JJ': 'a',
                  'VB': 'v', 'RB': 'r'}
    # Explicit fallback instead of a bare `except:`, which would also
    # swallow KeyboardInterrupt/SystemExit and hide unrelated bugs.
    if not isinstance(penntag, str):
        return 'n'
    return morphy_tag.get(penntag[:2], 'n')

# Lemmatize each wordcloud-filtered token using a WordNet POS derived from
# its Penn Treebank tag.
lemmatizer = WordNetLemmatizer()

# Tag the whole token list in a single call rather than one word at a time:
# the tagger then sees neighbouring tokens, and the per-call tagging overhead
# is paid once instead of once per token.
tagged_words = nltk.pos_tag(wordcloud_filtered)

lemmatized = []
for word, penn_tag in tagged_words:
    pos = penn2morphy(penn_tag)
    lemmatized.append(lemmatizer.lemmatize(word, pos=pos))
    # Trace output: original token -> WordNet POS and resulting lemma.
    print(word, '->', pos, lemmatized[-1])

In [None]:
# Shared WordCloud generator configuration used for every cloud below.
cloud_options = {
    'stopwords': stopwords,
    'width': 1920,
    'height': 1080,
    'background_color': 'white',
    'random_state': 42,
}
wordcloud = WordCloud(**cloud_options)

In [None]:
# WordCloud.generate() returns the generator object itself, so keeping that
# return value would leave `regular_wordcloud` aliased to `wordcloud` and it
# would silently change on every later generate() call. Snapshot the rendered
# pixels instead so this name keeps referring to the cloud of the raw text.
regular_wordcloud = wordcloud.generate(data).to_array()

plt.figure(figsize=(20, 10))
plt.imshow(regular_wordcloud)
plt.axis('off')
plt.show()

In [None]:
# Build and display the cloud for the stemmed tokens.
stemmed_text = ' '.join(stemmed)
stemming_wordcloud = wordcloud.generate(stemmed_text)

fig, ax = plt.subplots(figsize=(20, 10))
ax.imshow(stemming_wordcloud)
ax.axis('off')
plt.show()

In [None]:
# Build and display the cloud for the lemmatized tokens.
lemmatized_text = ' '.join(lemmatized)
lemmatizing_wordcloud = wordcloud.generate(lemmatized_text)

fig, ax = plt.subplots(figsize=(20, 10))
ax.imshow(lemmatizing_wordcloud)
ax.axis('off')
plt.show()

In [None]:
# Write the image held in `regular_wordcloud` to disk as a PNG.
# NOTE(review): if `regular_wordcloud` is the shared WordCloud object rather
# than a pixel snapshot, this saves whatever that object generated last.
plt.imsave(fname='log_wordcloud.png', arr=regular_wordcloud, format="png")