In [None]:
!pip install pyLDAvis
#Import the necessary packages
import pickle
import pandas as pd
import numpy as np
from datetime import datetime
import nltk
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.corpus import stopwords
import gensim
from gensim.utils import simple_preprocess
from gensim.corpora.dictionary import Dictionary
import gensim.corpora as corpora
from gensim.models import CoherenceModel
from gensim.models.ldamodel import LdaModel
#import spacy
import string
import re
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
import matplotlib.font_manager
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
#Load the krebs dataset (saved as a pickle file) into a dataframe df
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load this dataset if it comes from a trusted source.
# The context manager closes the file handle as soon as the frame is loaded.
with open('krebs_dataset.pickle', 'rb') as f:
    df = pickle.load(f)

In [None]:
#Inspect the dataframe
df.head()

In [None]:
#Set the index to the date column
df = df.set_index('Date')

In [None]:
df.head()

In [None]:
**Text Preprocessing**

In [None]:
#Convert each article from a list into strings
df['Body'] = df['Body'].apply(', '.join)

In [None]:
#Define a function that removes urls from text using regex
def remove_urls(text):
    """Return `text` with every scheme-prefixed URL deleted.

    A URL is matched as ``<scheme>://<host>`` followed by optional dotted
    host parts and optional ``/segment`` path parts. Each match is replaced
    by the empty string, so whitespace around the URL is left untouched.
    """
    url_pattern = r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*'
    return re.sub(url_pattern, '', text)

In [None]:
#Define a function that removes punctuation from the text which may impact the model's ability to extract topics
def remove_punctuation(text):
    """Return `text` keeping only word characters (letters, digits, underscore) and whitespace."""
    non_word_or_space = re.compile(r'[^\w\s]')
    return non_word_or_space.sub('', text)

In [None]:
#I treat the Body and Title series separately because their text produces different results when the LDA model is applied later.

In [None]:
#Apply the remove url function to the body series
df['Body'] = df['Body'].apply(remove_urls)

In [None]:
#Apply the remove punctuation function to the body series
df['Body'] = df['Body'].apply(remove_punctuation)

In [None]:
#Apply the remove url function to the Title series
df['Title'] = df['Title'].apply(remove_urls)

In [None]:
#Apply the remove punctuation function to the Title series
df['Title'] = df['Title'].apply(remove_punctuation)

In [None]:
df.head()

In [None]:
#Define a function that takes input string, converts it all to lower case then tokenizes it using nltk.tokenize
def word_tokenizer(text):
    """Lower-case `text`, then split it into a list of tokens with nltk.word_tokenize."""
    lowered = text.lower()
    return nltk.word_tokenize(lowered)

In [None]:
#Apply the tokenizer function to the entire Body series
df['Body'] = df['Body'].apply(word_tokenizer)

In [None]:
#Apply the tokenizer function to the Title series
df['Title'] = df['Title'].apply(word_tokenizer)

In [None]:
df.head()

In [None]:
#Create a function which removes stopwords using nltk's stop word feature
# Stop-word list: NLTK's English list extended with site-specific and filler terms.
all_stopwords = stopwords.words('english')
new_stopwords = ["krebsonsecurity", "krebs","security", "cyber", "\'", """ " """, "-", "one", "get", "like", "may", "would", "said", "`", "\"", "...", 'krebsonsecuritycom']
all_stopwords.extend(new_stopwords)
# Hash-based snapshot of the list so each token lookup is constant-time instead of
# a linear scan. It is built once here: if all_stopwords is changed afterwards,
# re-run this cell so the snapshot picks up the change.
_stopword_set = frozenset(all_stopwords)

def remove_stopwords(text):
    """Return the tokens of `text` that are not stop words, in their original order.

    Args:
        text: an iterable of string tokens (one tokenized document).

    Returns:
        A new list holding only the tokens absent from the stop-word set.
    """
    return [w for w in text if w not in _stopword_set]

In [None]:
#Apply the stopwords function to the entire Body series
df['Body'] = df['Body'].apply(remove_stopwords)

In [None]:
#Apply the stopwords function to the Title series
df['Title'] = df['Title'].apply(remove_stopwords)

In [None]:
#Create a function which lemmatizes each token using nltk's WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
def word_lemmatizer(text):
    """Return a new list with every token of `text` passed through the shared lemmatizer.

    Each token is handed to lemmatizer.lemmatize with no extra arguments, so the
    lemmatizer's default settings apply.
    """
    return list(map(lemmatizer.lemmatize, text))

In [None]:
#Apply the lemmas function to the entire Body series
df['Body'] = df['Body'].apply(word_lemmatizer)

In [None]:
#Apply the lemmas function to the Title  series
df['Title'] = df['Title'].apply(word_lemmatizer)

In [None]:
#Save the preprocessed data to a pickle file: 'df_preprocessed.pickle'
with open('df_preprocessed.pickle', 'wb') as f:
    pickle.dump(df, f)

In [None]:
df.head()

In [None]:
# Create a dictionary of all unique tokens by passing df.Body to the Dictionary method
dictionary = corpora.Dictionary(df.Body)


In [None]:
#Each unique token is mapped to an id number e.g. 'chip': 140
#dictionary.token2id

In [None]:
#Turn the dictionary into a corpus (a Bag of Words) that contains the word id and its frequency in each document (article in our case)
texts = df.Body
corpus = [dictionary.doc2bow(text) for text in texts]

In [None]:
# Create a dictionary of all unique tokens by passing df.Title to the Dictionary method
dictionary_from_titles = corpora.Dictionary(df.Title)

In [None]:
#Turn the dictionary into a corpus (a Bag of Words) that contains the word id and its frequency in each document (article in our case)
texts_from_titles = df.Title
corpus_from_titles = [dictionary_from_titles.doc2bow(text) for text in texts_from_titles]

In [None]:
#We can now see the first 10 word ids with their frequency counts from the fifth document
#corpus[4][:10]

In [None]:
#You can produce the corpus in human readable format for the first article
#[[(dictionary[id], freq) for id, freq in cp] for cp in corpus[:1]]

In [None]:
#Create an LDA model from the Body corpus which extracts 10 topics
# Train the Body-corpus LDA model; random_state pins the seed used for training.
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=10,
    random_state=42,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

In [None]:
lda_model.print_topics()

In [None]:
#Create an LDA model from the Title corpus which extracts 30 topics
lda_model_from_titles = LdaModel(
    corpus=corpus_from_titles,
    id2word=dictionary_from_titles,
    num_topics=30,
    random_state=42,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

In [None]:
#print each topic
# `i` is only the position in the printed list; each `topic` entry is printed as returned.
shown_topics = lda_model_from_titles.print_topics(14)
for i, topic in enumerate(shown_topics):
    print(i, '---', topic)

In [None]:
# log_perplexity() returns the per-word likelihood bound, not the perplexity itself;
# gensim defines the perplexity as 2 ** (-bound). Report both so the labels are accurate.
bound_body = lda_model.log_perplexity(corpus)
bound_titles = lda_model_from_titles.log_perplexity(corpus_from_titles)
print('\nPer-word likelihood bound for Corpus from Body: ', bound_body)
print('Perplexity for Corpus from Body: ', 2 ** (-bound_body))
print('\nPer-word likelihood bound for Corpus from Titles: ', bound_titles)
print('Perplexity for Corpus from Titles: ', 2 ** (-bound_titles))

In [None]:
coherence_model_lda_body = CoherenceModel(model=lda_model, texts=df.Body, dictionary=dictionary, coherence='c_v')
coherence_lda_body = coherence_model_lda_body.get_coherence()
print('\nCoherence score from Body: ', coherence_lda_body)

coherence_model_lda_titles = CoherenceModel(model=lda_model_from_titles, texts=df.Title, dictionary=dictionary_from_titles, coherence='c_v')
coherence_lda_titles = coherence_model_lda_titles.get_coherence()
print('\nCoherence score from Titles: ', coherence_lda_titles)

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """Train one LDA model per candidate topic count and score each with 'c_v' coherence.

    Candidate topic counts are ``range(start, limit, step)``, so `limit` is exclusive.

    Args:
        dictionary: gensim dictionary passed as `id2word` and to the coherence model.
        corpus: bag-of-words corpus the models are trained on.
        texts: tokenized documents handed to the coherence model.
        limit: exclusive upper bound for the number of topics.
        start: first number of topics to try.
        step: increment between successive topic counts.

    Returns:
        (model_list, coherence_values): two parallel lists with one entry per
        topic count, in the order the counts were tried.
    """
    model_list, coherence_values = [], []
    topic_counts = range(start, limit, step)
    for n_topics in topic_counts:
        lda = gensim.models.ldamodel.LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=n_topics,
            random_state=42,
            update_every=1,
            chunksize=100,
            passes=10,
            alpha='auto',
            per_word_topics=True,
        )
        scorer = CoherenceModel(model=lda, texts=texts, dictionary=dictionary, coherence='c_v')
        model_list.append(lda)
        coherence_values.append(scorer.get_coherence())

    return model_list, coherence_values

In [None]:
model_list_titles, coherence_values_titles = compute_coherence_values(dictionary=dictionary_from_titles, corpus=corpus_from_titles, texts=df.Title, limit=100, start=2, step=6)

In [None]:
# These must mirror the limit/start/step passed to compute_coherence_values for the
# titles, otherwise x and coherence_values_titles will not have the same length.
limit=100; start=2; step=6;
x = range(start, limit, step)
plt.plot(x, coherence_values_titles, 'bo--', linewidth=2, markersize=6, alpha=0.5)
plt.xlabel("Num Topics")
plt.ylabel("Coherence Value")
# The trailing comma makes this a one-element tuple; without it the bare string is
# treated as a sequence of characters and the legend label becomes just "c".
plt.legend(("coherence_values",), loc='best')
plt.title('Coherence Value as a function of topic numbers\n from Krebsonsecurity Title Corpus')
# Save before show(): once the figure has been shown, savefig would write an empty canvas.
plt.savefig('Coherence_values_titles')
plt.show()

In [None]:
# Pair each candidate topic count with its coherence score and print it to 4 decimals.
for m, cv in zip(x, coherence_values_titles):
    print('Num Topics =', m, 'has Coherence Value of', round(cv,4))

In [None]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model_from_titles, corpus_from_titles, dictionary_from_titles)
vis

In [None]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
vis

In [None]:
#Save the lda model from body to a pickle file
with open('lda_body.pickle', 'wb') as f:
    pickle.dump(lda_model, f)

In [None]:
#Save the lda model from titles to a pickle file
with open('lda_titles.pickle', 'wb') as f:
    pickle.dump(lda_model_from_titles, f)