In [4]:
# Packages

import os
import pandas as pd
import json
import seaborn as sns
from matplotlib import pyplot as plt
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from datetime import datetime
import re
import itertools
from itertools import compress
import ast
import numpy as np
from collections import Counter
import pickle

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.test.utils import get_tmpfile, common_texts
from gensim.corpora import MalletCorpus

%matplotlib inline
sns.set(rc={'figure.figsize':(20,12)})

data_path = os.path.join('..', 'data')
out_path = os.path.join('..', 'output')
datafile = 'drr_scrape2021-07-02.json'
datafile_tokenized = 'drr_scrape2021-07-02_tokenized.json'

In [5]:
# Creating directories

# makedirs with exist_ok=True is idempotent, creates missing parent folders and
# avoids the check-then-create race of isdir() followed by mkdir().
os.makedirs(out_path, exist_ok=True)

In [6]:
# Loading data
# Prefer the tokenized file; fall back to the raw scrape when it does not exist.
path = os.path.join(data_path, datafile_tokenized)
is_tokenized = os.path.isfile(path)

if not is_tokenized:
    path = os.path.join(data_path, datafile)

# Read explicitly as UTF-8 (the tokenized file is written with that encoding)
# instead of relying on the platform's default encoding.
with open(path, 'r', encoding='utf-8') as file:
    data = json.load(file)

print(f"The data consists of {len(data)} texts")

The data consists of 13575 texts


In [7]:
# Functions

import spacy
# spaCy pipeline used by the tokenizer below; the "ner" component is disabled.
# NOTE(review): "da_core_news_sm" is a Danish model, while the keyword output
# further down looks English - confirm this is the intended pipeline.
nlp = spacy.load("da_core_news_sm", disable=["ner"])

# Stop words taken from the pipeline's language defaults, materialised as a list.
stop_words = [*nlp.Defaults.stop_words]
                                            
def tokenizer_custom(text, stop_words=stop_words, tags=['NOUN', 'ADJ', 'VERB', 'PROPN']):
    """Lemmatise ``text`` with the module-level ``nlp`` pipeline and return filtered tokens.

    A word is kept when its part-of-speech tag is in ``tags``, its lemma is longer
    than four characters, its lower-cased lemma is not in ``stop_words`` and the
    lemma contains no digit. Kept lemmas are lower-cased and stripped of
    punctuation; lemmas that become empty after stripping are dropped.

    Parameters
    ----------
    text : str or None
        Raw document text. ``None`` or an empty string yields an empty list.
    stop_words : iterable of str
        Words to exclude (compared against the lower-cased lemma).
    tags : iterable of str
        Part-of-speech tags to keep. The default list is only read, never mutated.

    Returns
    -------
    list of str
        Cleaned, lower-cased lemmas in document order.
    """
    # Guard against missing page text instead of failing on None.replace(...).
    if not text:
        return []

    text = text.replace('\n', ' ')
    numbers_re = re.compile(r".*\d.*")
    punct_re = re.compile(r"[^\w\s]")

    # Set membership is O(1); the module-level stop word list would be scanned per word.
    stop_set = set(stop_words)

    pos_tags = tags  # By default keeps nouns, adjectives, verbs and proper nouns

    tokens = []

    for word in nlp(text):
        if word.pos_ not in pos_tags or len(word.lemma_) <= 4:
            continue

        lemma = word.lemma_.lower()
        if lemma in stop_set or numbers_re.match(lemma):
            continue

        token = punct_re.sub("", lemma)
        # A lemma made only of punctuation collapses to "" - do not emit it.
        if token:
            tokens.append(token)

    return tokens


def return_tokens(tokens):
    """Return ``tokens`` unchanged.

    Passed to ``TfidfVectorizer`` as both ``tokenizer`` and ``preprocessor`` so
    that the already-tokenized documents are handed through as they are.
    """
    return tokens

In [8]:
# Tokenize data
if not is_tokenized:
    for entry in data:
        # Entries without 'page_text' (or with None) are tokenized as an empty
        # document rather than crashing the whole run.
        entry['tokens'] = tokenizer_custom(entry.get('page_text') or '')

    # Save tokenized data under the exact file name the loading cell looks for,
    # so the cached file is found on the next run.
    outname = datafile_tokenized
    with open(os.path.join(data_path, outname), 'w', encoding='utf-8') as f:
        json.dump(data, f)

In [9]:
# Keywords based on counts

drr_tokens = [entry['tokens'] for entry in data]
# from_iterable flattens lazily instead of unpacking every document as an argument.
drr_tokens_flat = list(itertools.chain.from_iterable(drr_tokens))

# Show only the 50 most frequent tokens; printing the whole Counter dumps the
# entire vocabulary into the cell output.
token_counts = Counter(drr_tokens_flat)
token_counts.most_common(50)



In [10]:
# Keywords based on TF-IDF

# The documents are already token lists, so tokenizer and preprocessor just pass
# them through. norm=None is the documented way to switch off row normalisation;
# norm=False is not an accepted value and is rejected by recent scikit-learn.
vectorizer = TfidfVectorizer(
    tokenizer=return_tokens,
    preprocessor=return_tokens,
    token_pattern=None,
    norm=None)

# Fitting vectorizer
transformed_documents = vectorizer.fit_transform(drr_tokens)
# NOTE(review): this densifies the full document-term matrix, which can be very
# large; consider summing the sparse matrix directly if memory becomes a problem.
transformed_documents_as_array = transformed_documents.toarray()

# get_feature_names() was removed in scikit-learn 1.2; fall back to it only on
# versions that predate get_feature_names_out().
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

df = pd.DataFrame(transformed_documents_as_array, columns=feature_names)

# Summed TF-IDF weight per word over all documents, highest first
word_tfidfsum = df.sum().sort_values(ascending=False)
word_tfidfsum.head(50)

function                                      211767.852039
newcategorysectionsettingscategorysection     184525.204441
return                                        119502.850664
found                                         106026.123381
inform                                         77028.129831
koutilsarrayforeach                            63629.726973
index                                          58183.568533
disaster                                       57478.570083
delete                                         49295.723169
termid                                         47184.435493
european                                       43251.185188
itemscount                                     42958.258630
management                                     42208.284026
document                                       42073.504561
download                                       41898.111416
island                                         41796.568622
selected                                

In [11]:
# LDA

## Dictionary built from every document's tokens (no extremes filtering is applied)
id2token = corpora.Dictionary(entry.get('tokens') for entry in data)

## Gensim doc2bow corpus: stored on each entry and collected into one list
tokens_bow = []
for entry in data:
    bow = id2token.doc2bow(entry.get('tokens'))
    entry['doc2bow'] = bow
    tokens_bow.append(bow)

## LDA model (fixed random_state)
lda_model = gensim.models.LdaMulticore(
    corpus=tokens_bow,
    id2word=id2token,
    num_topics=5,
    chunksize=1000,
    passes=20,
    iterations=2000,
    workers=4,
    random_state=1332,
)

## Save model
lda_model.save(os.path.join(out_path, 'lda_model'))

NameError: name 'tokens_tfidf' is not defined

In [12]:

## Compute Coherence Score - https://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf

# The 'u_mass' coherence is computed from the bag-of-words corpus passed in here.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    corpus=tokens_bow,
    coherence='u_mass',
)

coherence_ldamodel = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_ldamodel)


Coherence Score:  -0.9961586469086677


In [13]:
from pprint import pprint

# Show Topics
# formatted=False yields (topic id, [(word, weight), ...]) pairs rather than strings.
topic_terms = lda_model.show_topics(num_topics=15, formatted=False)
pprint(topic_terms)

[(0,
  [('function', 0.0654587),
   ('newcategorysectionsettingscategorysection', 0.0560907),
   ('return', 0.040476732),
   ('found', 0.032441825),
   ('koutilsarrayforeach', 0.019348621),
   ('delete', 0.014996609),
   ('termid', 0.014413365),
   ('document', 0.0131664155),
   ('itemscount', 0.012601631),
   ('selected', 0.012577026)]),
 (1,
  [('disaster', 0.02669608),
   ('recording', 0.023992002),
   ('workshop', 0.02145153),
   ('meeting', 0.02126042),
   ('damage', 0.020715354),
   ('group', 0.017391969),
   ('working', 0.017387925),
   ('database', 0.017108247),
   ('download', 0.01669837),
   ('collection', 0.013906808)]),
 (2,
  [('european', 0.030643007),
   ('commission', 0.023119602),
   ('management', 0.022774452),
   ('disaster', 0.020152181),
   ('knowledge', 0.017898582),
   ('drmkc', 0.016303295),
   ('information', 0.013969805),
   ('explorer', 0.013960111),
   ('research', 0.011785915),
   ('projects', 0.009564656)]),
 (3,
  [('inform', 0.030696405),
   ('function',