In [440]:
import nltk
import re
import os
import nltk.corpus
from nltk.tokenize import word_tokenize
from nltk.util import bigrams, trigrams, ngrams
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
import string 
import json 
import glob 
nltk.download('maxent_ne_chunker') 
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt') 
nltk.download('stopwords')

# Get Article Files 
f_list = os.listdir(path='/content/')
f_list

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['.config',
 'nontech-small_business .txt',
 'tech_equifax-article4.txt',
 'nontech-pipeline6.txt',
 'nontech-article2',
 'tech-article1',
 'tech-target5.txt',
 '.ipynb_checkpoints',
 'nontech-article4',
 'tech-article2',
 'nontech-target5.txt',
 'nontech-article3',
 'tech-pipeline6.txt',
 'nontech-equifax-article1.txt',
 'sample_data']

In [441]:
pwd

'/content'

## Read Tech and Nontech articles into a list and convert to Tech and Nontech strings for NLP

In [442]:
nontech_articles = []
tech_articles = []

for filename in f_list:
  if filename.startswith('nontech'):
    with open(filename) as f:
      contents = f.read().replace("\n\n", " ").replace("\n", " ")
      nontech_articles.append(contents)
  elif filename.startswith('tech'):
    with open(filename) as f:
      contents = f.read()
      tech_articles.append(contents)

nontech = ' '.join(nontech_articles)
tech = ' '.join(tech_articles)
nontech

'Why should a small business be interested in, or concerned with information security? The customers of small businesses have an expectation that their sensitive information will be respected and given adequate and appropriate protection. The employees of a small business also have an expectation that their sensitive personal information will be appropriately protected. And, in addition to these two groups, current and/or potential business partners also have their expectations of the status of information security in a small business. These business partners want assurance that their information, systems, and networks are not put “at risk” when they connect to and do business with this small business. They expect an appropriate level of security in this actual or potential business partner – similar to the level of security that they have implemented in their own systems and networks. Some of the information used in your business requires special protection for confidentiality (to ens

#Tokenization

In [443]:
# Tokenize articles 
nontech_tokens = word_tokenize(nontech)
tech_tokens = nltk.word_tokenize(tech)

punctuation = re.compile(r'[-.?!,:;()|0-9]')

# Remove punctuation
non_post_punctuation = []
for words in nontech_tokens:
  word = punctuation.sub("", words)
  if len(word)>0:
    non_post_punctuation.append(word)

tech_post_punctuation = []
for words in tech_tokens:
  word = punctuation.sub("", words)
  if len(word)>0:
    tech_post_punctuation.append(word)

#Two lists created from above:     non_post_punctuation      tech_post_punctuation

#Playing around with data, finding most common words tech

In [444]:
from nltk import FreqDist
stop_words = set(stopwords.words('english'))

word_tokens = tech_post_punctuation
filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]

punctuation = re.compile(r'[-.?!,:;()|0-9]')

tech_post_punctuation1 = []

for words in filtered_sentence :
  word = punctuation.sub("", words)
  if len(word)>2:
    tech_post_punctuation1.append(word)
FreqDist(tech_post_punctuation1).most_common(10)

[('network', 66),
 ('vulnerability', 48),
 ('Target', 47),
 ('Equifax', 44),
 ('Struts', 40),
 ('Apache', 39),
 ('systems', 38),
 ('data', 36),
 ('security', 34),
 ('CISA', 34)]

#Playing around with data, finding most common words in non-tech

In [445]:
non_word_tokens = non_post_punctuation
non_filtered_sentence = [w for w in non_word_tokens if not w.lower() in stop_words]

punctuation = re.compile(r'[-.?!,:;()|0-9]')

non_tech_post_punctuation1 = []

for words in non_filtered_sentence :
  word = punctuation.sub("", words)
  if len(word)>2:
    non_tech_post_punctuation1.append(word)
FreqDist(non_tech_post_punctuation1).most_common(10)

[('said', 99),
 ('information', 81),
 ('company', 67),
 ('pipeline', 64),
 ('Colonial', 54),
 ('attack', 52),
 ('data', 51),
 ('security', 46),
 ('business', 44),
 ('breach', 42)]

#Creating vector params for TF-IDF Analysis 

In [446]:
vectorizer = TfidfVectorizer(
                                lowercase=True,
                                max_features=100,
                                max_df=0.9, #words that occur in less than x% of the documents 
                                min_df=10, #words that occur in atleast 10 documents 
                                ngram_range = (1,3),
                                stop_words = "english"

                            )

#Function to perform TF-IDF and run it on the tech and non-tech date 

In [447]:
import numpy as np

def TF_IDF(Cleaned_words, true_k):

  vectors = vectorizer.fit_transform(Cleaned_words)
  feature_names = vectorizer.get_feature_names_out()
  dense = vectors.todense()
  denselist = dense.tolist()

  all_keywords = []
  for description in denselist:
      x=0
      keywords = []
      for word in description:
          if word > 0:
              keywords.append(feature_names[x])
          x=x+1
      all_keywords.append(keywords)

  model = KMeans(n_clusters=true_k, init="k-means++", max_iter=100, n_init=1)

  model.fit(vectors)

  order_centroids = model.cluster_centers_.argsort()[:, ::-1]
  terms = vectorizer.get_feature_names_out()
  
  Clusters = []
  for i in range(true_k):
    print(f"\nCluster {i}")
    for ind in order_centroids[i, :10]:
      #print(terms[ind])
      print(terms[ind])
      
  

print('Non-Tech Key Words by cluster:' )
TF_IDF(non_tech_post_punctuation1,3)
print('\nTech Key Words by cluster ')
TF_IDF(tech_post_punctuation1,3)

Non-Tech Key Words by cluster:

Cluster 0
said
pipeline
information
security
colonial
attack
data
business
breach
ransomware

Cluster 1
cost
years
decision
critical
customers
cyber
cyberattack
cybersecurity
darkside
data

Cluster 2
company
years
decision
critical
customers
cyber
cyberattack
cybersecurity
darkside
data

Tech Key Words by cluster 

Cluster 0
ransomware
security
vulnerability
data
target
equifax
systems
struts
apache
cisa

Cluster 1
network
weeks
exploit
cybersecurity
darkside
data
department
detect
emergency
employees

Cluster 2
darkside
weeks
exploit
cybersecurity
data
department
detect
emergency
employees
energy


In [448]:
pip install pyldavis



# Starting LDA analysis 

In [449]:
import numpy as np
import json
import glob
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import spacy
from nltk.corpus import stopwords
import pyLDAvis
import pyLDAvis.gensim_models

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# LDA for Tech-article Analysis 

In [450]:
def lemmatization(text):
  allowed_postags = ["NOUN","ADJ", "VERB", "ADV"]
  nlp = spacy.load("en_core_web_sm")
  doc = nlp(text)
  texts_out =[]
  for word in doc:
    if word.pos_ in allowed_postags:
      texts_out.append(word.lemma_)
  # texts_out.append(final)
  return texts_out
lemmaTech = np.array(lemmatization(tech))
lemmaTech

array(['week', 'follow', 'public', ..., 'colonial', 'cyberattack',
       'follow'], dtype='<U19')

In [451]:
def gen_words(texts):
  final = []
  for text in texts:
    new = gensim.utils.simple_preprocess(text)
    final.append(new)
  return final

data_words = gen_words(lemmaTech)

In [452]:
id2word = corpora.Dictionary(data_words)

corpus = []
for text in data_words:
  new = id2word.doc2bow(text)
  corpus.append(new)

In [453]:
data_words

[['week'],
 ['follow'],
 ['public'],
 ['notification'],
 ['hacker'],
 ['successfully'],
 ['breach'],
 ['web'],
 ['application'],
 ['run'],
 ['vulnerable'],
 ['version'],
 ['locate'],
 ['network'],
 ['when'],
 ['able'],
 ['access'],
 ['multiple'],
 ['datum'],
 ['repository'],
 ['decision'],
 ['implement'],
 ['certain'],
 ['cybersecurity'],
 ['protocol'],
 ['recommend'],
 ['cybersecurity'],
 ['specifically'],
 ['combination'],
 ['expire'],
 ['certificate'],
 ['unencrypted'],
 ['username'],
 ['password'],
 ['lack'],
 ['network'],
 ['segmentation'],
 ['discuss'],
 ['more'],
 ['detail'],
 ['below'],
 ['compound'],
 ['effect'],
 ['breach'],
 ['learn'],
 ['breach'],
 ['access'],
 ['hacker'],
 ['gain'],
 ['multiple'],
 ['company'],
 ['system'],
 ['wait'],
 ['week'],
 ['notify'],
 ['therefore'],
 ['hacker'],
 ['access'],
 ['datum'],
 ['as'],
 ['early'],
 ['public'],
 ['unaware'],
 ['datum'],
 ['compromise'],
 ['week'],
 ['several'],
 ['current'],
 ['former'],
 ['employee'],
 ['still'],
 ['belie

In [None]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=4,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha="auto")

In [None]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds="mmds", R=10)
vis

# LDA for Non Tech-article Analysis 

In [None]:
lemmaNonTech = np.array(lemmatization(nontech))
NonTech_data_words = gen_words(lemmaNonTech)

Nonid2word = corpora.Dictionary(NonTech_data_words)

Noncorpus = []
for text in NonTech_data_words:
  new = Nonid2word.doc2bow(text)
  Noncorpus.append(new)

Nonlda_model = gensim.models.ldamodel.LdaModel(corpus=Noncorpus,
                                           id2word=Nonid2word,
                                           num_topics=4,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha="auto")

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(Nonlda_model, Noncorpus, Nonid2word, mds="mmds", R=10)
vis

In [None]:
print(lemmatization(nontech))

In [None]:
print(lemmatization(tech))