In [1]:
# Packages

import ast
import itertools
import json
import os
import re
from collections import Counter
from datetime import datetime
from itertools import compress

import gensim
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
from gensim import corpora
from gensim.models import CoherenceModel
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

%matplotlib inline
# Default figure size applied to every seaborn/matplotlib plot in this notebook.
sns.set(rc={'figure.figsize':(20,12)})

# Input and output locations.
# NOTE(review): data_path is an absolute, machine-specific Windows path — it has
# to be adjusted before the notebook can run on another machine.
data_path = os.path.join('C:/', 'data', 'drr')
out_path = os.path.join('..', 'output')
# File name of the scrape export, resolved relative to data_path.
datafile = 'drr_scrape2021-07-02.json'

In [2]:
# Loading data
path = os.path.join(data_path, datafile)

# JSON is UTF-8 by specification. Passing the encoding explicitly keeps the read
# from depending on the platform's locale default (e.g. cp1252 on Windows), which
# would garble or reject non-ASCII characters.
with open(path, 'r', encoding='utf-8') as file:
    data = json.load(file)

print(f"The data consists of {len(data)} texts")

The data consists of 13575 texts


In [7]:
# Functions

import spacy
# spaCy pipeline used by tokenizer_custom; the "ner" component is disabled.
# NOTE(review): "da_core_news_sm" is a Danish model, while the TF-IDF output
# further down lists mostly English terms — confirm that the model matches the
# language of the scraped texts.
nlp = spacy.load("da_core_news_sm", disable=["ner"])

# The pipeline's default stop words, materialised as a list; used as the default
# `stop_words` argument of tokenizer_custom.
stop_words = list(nlp.Defaults.stop_words)
                                            
def tokenizer_custom(text, stop_words=stop_words, tags=['NOUN', 'ADJ', 'VERB', 'PROPN'], min_length=4):
    """Tokenize ``text`` into cleaned, lower-cased lemmas.

    The text is parsed with the module-level spaCy pipeline ``nlp``. A word is
    kept when its part-of-speech tag is in ``tags``, its lemma is longer than
    ``min_length`` characters, its lower-cased lemma is not a stop word and the
    lemma contains no digit. Punctuation is then stripped from the kept lemma.

    Args:
        text: Raw document text. ``None`` or an empty string yields ``[]``.
        stop_words: Iterable of lower-case words to exclude.
        tags: Part-of-speech tags (``word.pos_`` values) to keep. The default
            keeps nouns, adjectives, verbs and proper nouns.
        min_length: A lemma must be strictly longer than this many characters.

    Returns:
        list: The kept tokens, in document order.
    """
    # Entries without page text arrive here as None; treat them as empty
    # documents instead of failing on ``None.replace``.
    if not text:
        return []

    text = text.replace('\n', ' ')
    numbers_re = r".*\d.*"
    punct_regex = r"[^\w\s]"

    # Set membership is O(1); the default stop-word list would otherwise be
    # scanned linearly for every single word of the document.
    stop_set = set(stop_words)

    tokens = []
    for word in nlp(text):
        if word.pos_ not in tags:
            continue
        if len(word.lemma_) <= min_length:
            continue
        lemma = word.lemma_.lower()
        if lemma in stop_set or re.match(numbers_re, lemma):
            continue
        token = re.sub(punct_regex, "", lemma)
        # A lemma consisting solely of punctuation is empty after stripping; an
        # empty string must not end up as a vocabulary term.
        if token:
            tokens.append(token)

    return tokens


def return_tokens(tokens):
    """Return ``tokens`` unchanged.

    Identity function handed to TfidfVectorizer as both ``tokenizer`` and
    ``preprocessor``, so that documents supplied as token lists are used as-is.
    """
    return tokens

In [8]:
# Tokenize data

for entry in data:
    # .get() returns None when an entry has no 'page_text'. Fall back to an empty
    # string so such entries get an empty token list instead of raising inside
    # tokenizer_custom, which calls .replace() on its argument.
    entry['tokens'] = tokenizer_custom(entry.get('page_text') or '')

In [18]:
# Keywords based on counts

drr_tokens = [entry['tokens'] for entry in data]
# chain.from_iterable walks the documents lazily instead of unpacking every
# document as a separate positional argument.
drr_tokens_flat = list(itertools.chain.from_iterable(drr_tokens))

# Show only the 50 most frequent tokens (same cut-off as the TF-IDF ranking);
# printing the whole Counter dumps the entire vocabulary into the cell output.
print(Counter(drr_tokens_flat).most_common(50))



In [19]:
# Keywords based on TF-IDF

# The documents are already token lists, so tokenizer and preprocessor are the
# identity function and the regex token pattern is switched off.
vectorizer = TfidfVectorizer(
    tokenizer=return_tokens,
    preprocessor=return_tokens,
    token_pattern=None,
    # No row normalisation. The documented "off" value is None; recent
    # scikit-learn releases validate parameters and reject False.
    norm=None)

# Fitting vectorizer
transformed_documents = vectorizer.fit_transform(drr_tokens)

# get_feature_names() was removed in scikit-learn 1.2; fall back to it only on
# releases that predate get_feature_names_out().
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Summed TF-IDF weight per term, computed on the sparse matrix directly so the
# documents x vocabulary matrix never has to be materialised as a dense array.
tfidf_sums = np.asarray(transformed_documents.sum(axis=0)).ravel()
word_tfidfsum = pd.Series(tfidf_sums, index=feature_names).sort_values(ascending=False)

# Top 50 terms by summed TF-IDF weight
word_tfidfsum.head(50)

function                                     211767.852039
newcategorysectionsettingscategorysection    190889.180920
return                                       119502.850664
found                                        106026.123381
inform                                        77028.129831
koutilsarrayforeach                           63629.726973
index                                         60020.705637
disaster                                      57478.570083
delete                                        49295.723169
termid                                        47184.435493
european                                      43251.185188
management                                    42208.284026
document                                      42073.504561
island                                        41796.568622
download                                      41161.615265
selected                                      40330.556304
parent                                        38193.8966

In [None]:
# LDA

## Dictionary (token <-> integer id mapping); no extremes are filtered at this point
id2token = corpora.Dictionary([entry.get('tokens') for entry in data])

## Gensim doc2bow corpus
for entry in data:
    entry['doc2bow'] = id2token.doc2bow(entry.get('tokens'))

tokens_bow = [entry.get('doc2bow') for entry in data]

## LDA model

lda_model = gensim.models.LdaMulticore(corpus = tokens_bow, 
                                       num_topics = 5, 
                                       id2word = id2token, 
                                       chunksize = 1000, 
                                       passes = 20, 
                                       workers = 4, 
                                       iterations = 2000, 
                                       random_state = 1332)


## Compute Coherence Score - https://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf

# u_mass coherence is estimated from a bag-of-words corpus; use the same corpus
# the model was trained on (`tokens_tfidf` is not defined in this notebook).
coherence_model_lda = CoherenceModel(model=lda_model, corpus=tokens_bow, coherence='u_mass')

coherence_ldamodel = coherence_model_lda.get_coherence() 
print('\nCoherence Score: ', coherence_ldamodel)

In [None]:
from pprint import pprint 

# Show Topics
# Ask for exactly as many topics as the model has, so this cell stays in sync
# with the `num_topics` used when training instead of a separate magic number.
pprint(lda_model.show_topics(formatted=False, num_topics=lda_model.num_topics))