In [185]:
# Packages

import os
import pandas as pd
import json
import seaborn as sns
from matplotlib import pyplot as plt
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from datetime import datetime
import re
import itertools
from itertools import compress
import ast
import numpy as np
from collections import Counter
from urllib.parse import urljoin, urlparse
import pickle
import requests
import time
import random
import pdfplumber

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.test.utils import get_tmpfile, common_texts
from gensim.corpora import MalletCorpus

%matplotlib inline
sns.set(rc={'figure.figsize':(20,12)})

data_path = os.path.join('..', 'data')
pdf_path = os.path.join(data_path, 'pdf')
out_path = os.path.join('..', 'output')
datafile = 'drr_scrape2021-07-08.json'
datafile_tokenized = 'drr_scrape2021-07-08_tokenized.json'
datafile_es = 'drr_scrape2021-07-08_es.json'

In [142]:
# Creating directories

if not os.path.isdir(out_path):
    os.mkdir(out_path)
    
if not os.path.isdir(pdf_path):
    os.mkdir(pdf_path)

In [143]:
# Loading data
path = os.path.join(data_path, datafile_tokenized)
is_tokenized = True

if not os.path.isfile(path):
    path = os.path.join(data_path, datafile)
    is_tokenized = False

with open(path, 'r') as file:
    data = json.load(file)

# Filter data

data_unique = []
urls_added = []

for entry in data:
    if entry['url'] in urls_added:
        continue
    else:
        data_unique.append(entry)
        urls_added.append(entry['url'])
    
data = data_unique

for entry in data:
    if entry['domain_url'] == 'https://www.unddr.org':
        entry['org'] = 'unddr'
    elif entry['domain_url'] == 'https://drmkc.jrc.ec.europa.eu':
        entry['org'] = 'drmkc'
    else:
        entry['org'] = ''
    
    entry['type'] = 'webpage'
    entry['page_links'] = [urljoin(entry['domain_url'], page_url) for page_url in entry['page_links']]

The data consists of 284 texts in total. 
133 texts are from UN DDR (https://www.unddr.org). 
151 texts are from DRMKC EU (https://drmkc.jrc.ec.europa.eu)


In [144]:
pdfs_un = [entry['page_links'] for entry in data_un]
pdfs_un = list(itertools.chain(*pdfs_un))
pdfs_un = [link for link in pdfs_un if link.endswith('pdf')]

pdfs_drmkc = [entry['page_links'] for entry in data_drmkc]
pdfs_drmkc = list(itertools.chain(*pdfs_drmkc))
pdfs_drmkc = [link for link in pdfs_drmkc if link.endswith('pdf')]

In [104]:
orgs = ['unddr', 'drmkc']

def url_to_filename(url):
    url = re.sub(r'(https\:\/\/(www\.)?)|(http\:\/\/(www\.)?)', '', url)
    url = re.sub(r'\:\d{2,4}(?=\/)', '', url)
    urlpart = re.search(r'(\w+?)\.\w{2,11}(\.\w{2,5})?(?=\/)', url).group(1)
    namepart = re.search(r'\.\w{2,11}(\/.+\.pdf)', url).group(1).replace("/", "-").replace("\\", "-")
    namepart = namepart.replace("?", "")
    filename = urlpart + namepart
    return(filename)

for org in orgs:
    
    missed_pdfs = []
    
    save_path = os.path.join(pdf_path, org)
    
    if not os.path.isdir(save_path):
        os.mkdir(save_path)
    
    domain_set = [entry for entry in data if entry.get('org') == org]
    
    pdfs = list(set([url for url in list(itertools.chain(*[list(compress(entry['page_links'], [(".pdf" in link) for link in entry['page_links']])) for entry in domain_set]))]))
    
    print("downloading pdfs for {}\n".format(org))
    for c, pdf_url in enumerate(pdfs, start = 1):
    
        filename = url_to_filename(pdf_url)
        
        if os.path.isfile(os.path.join(save_path, filename)):
            continue
        
        try:
            r = requests.get(pdf_url, stream=True)
        except:
            missed_pdfs.append(pdf_url)
            continue

        if r.status_code == 200:
            with open(os.path.join(save_path, filename), 'wb') as f:
                f.write(r.content)
            time.sleep(random.uniform(0.5, 1))
        else:
            missed_pdfs.append(pdf_url)
            continue

        progress = "|{0}| {1:.2f} %".format(("="*int(c/len(pdfs) * 50)).ljust(50), c/len(pdfs) * 100)
    
        print(progress, end = "\r")
        
        with open(os.path.join(save_path, 'missed_pdf.txt'), 'w', encoding = 'utf-8') as f:
            for url in missed_pdfs:
                f.write(url + "\n")
            f.close()
        
    print("\n")

downloading pdfs for unddr


downloading pdfs for drmkc




In [130]:
pdf_data = []
problem_files = []

for org in orgs:
    save_path = os.path.join(pdf_path, org)

    filenames = [os.path.join(save_path,f) for f in os.listdir(save_path) if os.path.isfile(os.path.join(save_path, f))]
    filenames = [filename for filename in filenames if filename.endswith('.pdf')]

    for c, filename in enumerate(filenames, start = 1):

        if filename in [entry.get('filename') for entry in pdf_data]:
            continue

        progress = "|{0}| {1:.2f} %".format(("="*int(c/len(filenames) * 50)).ljust(50), c/len(filenames) * 100)

        entry = {}
        entry['filename'] = filename
        entry['org'] = org

        try:
            with pdfplumber.open(filename) as pdf:
                try:
                    pdf_text = '\n'.join([page.extract_text() for page in pdf.pages if page.extract_text() is not None])
                    entry['text'] = pdf_text
                except Exception as e:
                    print(filename)
                    raise e
        except:
            problem_files.append(filename)

        pdf_data.append(entry)

        print(progress, end = "\r")



In [145]:
all_pdf_links = pdfs_un + pdfs_drmkc
pdf_link_name = {}

for pdf_link in all_pdf_links:
    filename = url_to_filename(pdf_link)
    pdf_link_name[filename] = pdf_link

for entry in pdf_data:
    if entry.get('org') == 'unddr':
        folder_path = '../data/pdf/unddr/'
    elif entry.get('org') == 'drmkc':
        folder_path = '../data/pdf/drmkc/'
        
    filename_lookup = entry.get('filename').replace(folder_path, '')
    entry['url'] = pdf_link_name.get(filename_lookup)
    entry['type'] = 'pdf'
    
    if entry.get('org') == 'unddr':
        entry['domain_url'] = 'https://www.unddr.org'
    elif entry.get('org') == 'drmkc':
        entry['domain_url'] = 'https://drmkc.jrc.ec.europa.eu'

In [146]:
data[0].keys()

dict_keys(['domain_url', 'url', 'links', 'date-of-access', 'page_text', 'page_links', 'org', 'type'])

In [147]:
pdf_data[0].keys()

dict_keys(['filename', 'org', 'text', 'url', 'type', 'domain_url'])

In [163]:
# Combine and reorder data

data_all_un = [entry for entry in data if entry['org'] == 'unddr'] + [entry for entry in pdf_data if entry['org'] == 'unddr' and 'text' in entry]
data_all_drmkc = [entry for entry in data if entry['org'] == 'drmkc'] + [entry for entry in pdf_data if entry['org'] == 'drmkc' and 'text' in entry]

data_all = []

i = 1
for entry in data_all_un:
    if ('text' not in entry and 'page_text' in entry):
        entry['text'] = entry.pop('page_text')
        
    if 'page_links' not in entry:
        entry['page_links'] = []
        
    entry['id'] = 'unddr' + str.rjust(str(i), 5, str(0))
    i = i +1
    
    new_entry = {}
    new_entry['id'] = entry['id']
    new_entry['url'] = entry['url']
    new_entry['domain_url'] = entry['domain_url']
    new_entry['text'] = entry['text']
    new_entry['org'] = entry['org']
    new_entry['page_links'] = entry['page_links']
    new_entry['type'] = entry['type']
    
    data_all.append(new_entry)

i = 1
for entry in data_all_drmkc:
    if ('text' not in entry and 'page_text' in entry):
        entry['text'] = entry.pop('page_text')
    
    if 'page_links' not in entry:
        entry['page_links'] = []
        
    entry['id'] = 'drmkc' + str.rjust(str(i), 5, str(0))
    i = i +1
    
    new_entry = {}
    new_entry['id'] = entry['id']
    new_entry['url'] = entry['url']
    new_entry['domain_url'] = entry['domain_url']
    try: 
        new_entry['text'] = entry['text']
    except:
        print(entry)
    new_entry['org'] = entry['org']
    new_entry['page_links'] = entry['page_links']
    new_entry['type'] = entry['type']
    
    data_all.append(new_entry)

In [186]:
# Export data for ES

with open(os.path.join(data_path, datafile_es), 'w', encoding = 'utf-8') as f:
    json.dump(data_all, f)

In [187]:
# Functions

import spacy
nlp = spacy.load("da_core_news_sm", disable=["ner"])
nlp.max_length = 2500000

stop_words = list(nlp.Defaults.stop_words)

# Tokenizer
def tokenizer_custom(text, stop_words=stop_words, tags=['NOUN', 'ADJ', 'VERB', 'PROPN']):
       
    text = text.replace('\n', ' ')
    numbers_re = r".*\d.*"
    punct_regex = r"[^\w\s]"
    
    doc = nlp(text)
        
    pos_tags = tags # Keeps proper nouns, adjectives and nouns
    
    tokens = []
      
    for word in doc:
        if (word.pos_ in pos_tags) and (len(word.lemma_) > 4) and (word.lemma_.lower() not in stop_words) and not (re.match(numbers_re, word.lemma_.lower())):
            token = word.lemma_.lower() # Returning the word in lower-case.
            token = re.sub(punct_regex, "", token)
            tokens.append(token)

    return(tokens)


# Dummy functions for using existing tokens in sklearn vectorizer
def return_tokens(tokens):
    return tokens

# Function for summarizing keywords with tf-idf
def tfidf_summarize(token_list, n_words = 50):
    vectorizer = TfidfVectorizer(
        tokenizer=return_tokens,
        preprocessor=return_tokens,
        token_pattern=None,
        norm = False)

    # Fitting vectorizer
    transformed_documents = vectorizer.fit_transform(token_list)
    transformed_documents_as_array = transformed_documents.toarray()
    df = pd.DataFrame(transformed_documents_as_array, columns = vectorizer.get_feature_names())

    # Word count
    word_tfidfsum = df.sum().sort_values(ascending = False)
    word_tfidfsum_select = word_tfidfsum[0:n_words]
    
    return(word_tfidfsum_select)

In [188]:
# Tokenize data
if not is_tokenized:
    for entry in data_all:
        entry['tokens'] = tokenizer_custom(entry.get('text'))
        
    # Save tokenized data
    with open(os.path.join(data_path, datafile_tokenized), 'w', encoding = 'utf-8') as f:
        json.dump(data_all, f)

In [189]:
data_un = [entry for entry in data_all if entry['org'] == 'unddr']
data_drmkc = [entry for entry in data_all if entry['org'] == 'drmkc']

print(f"""The data consists of {len(data_all)} texts in total. \n
{len(data_un)} texts are from UN DDR (https://www.unddr.org). {dict(Counter([entry.get('type') for entry in data_un]))['webpage']} from webpages and {dict(Counter([entry.get('type') for entry in data_un]))['pdf']} from pdfs \n
{len(data_drmkc)} texts are from DRMKC EU (https://drmkc.jrc.ec.europa.eu). {dict(Counter([entry.get('type') for entry in data_drmkc]))['webpage']} from webpages and {dict(Counter([entry.get('type') for entry in data_drmkc]))['pdf']} from pdfs""")

The data consists of 433 texts in total. 

196 texts are from UN DDR (https://www.unddr.org). 133 from webpages and 63 from pdfs 

237 texts are from DRMKC EU (https://drmkc.jrc.ec.europa.eu). 151 from webpages and 86 from pdfs


In [190]:
# Keywords based on counts

drr_tokens = [entry['tokens'] for entry in data_all]
drr_tokens_flat = list(itertools.chain(*drr_tokens))

un_tokens = [entry['tokens'] for entry in data_un]
un_tokens_flat = list(itertools.chain(*un_tokens))

drmkc_tokens = [entry['tokens'] for entry in data_drmkc]
drmkc_tokens_flat = list(itertools.chain(*drmkc_tokens))

tokens_counted = Counter(drr_tokens_flat)
tokens_counted_un = Counter(un_tokens_flat)
tokens_counted_drmkc = Counter(drmkc_tokens_flat)
tokens_counted.most_common()[0:50]

[('should', 6140),
 ('their', 4689),
 ('national', 4093),
 ('support', 3832),
 ('reintegration', 3675),
 ('international', 3664),
 ('management', 3532),
 ('disaster', 3504),
 ('security', 3323),
 ('information', 3290),
 ('groups', 3288),
 ('programmes', 3161),
 ('programme', 3071),
 ('european', 2836),
 ('processes', 2479),
 ('system', 2450),
 ('these', 2422),
 ('force', 2364),
 ('service', 2200),
 ('including', 2156),
 ('development', 2063),
 ('commission', 2016),
 ('crisis', 2007),
 ('community', 2002),
 ('demobilization', 1976),
 ('planning', 1976),
 ('activities', 1960),
 ('peace', 1905),
 ('measures', 1884),
 ('people', 1883),
 ('women', 1882),
 ('process', 1875),
 ('weapons', 1825),
 ('united', 1798),
 ('between', 1784),
 ('natural', 1756),
 ('assessment', 1716),
 ('excombatants', 1693),
 ('during', 1692),
 ('transitional', 1630),
 ('provide', 1612),
 ('health', 1594),
 ('inform', 1592),
 ('research', 1579),
 ('communities', 1573),
 ('different', 1560),
 ('disarmament', 1522),
 (

In [191]:
tokens_counted_un.most_common()[0:50]

[('should', 5109),
 ('reintegration', 3675),
 ('their', 3373),
 ('support', 3177),
 ('programmes', 3039),
 ('national', 3038),
 ('programme', 2892),
 ('security', 2864),
 ('groups', 2819),
 ('international', 2321),
 ('force', 2298),
 ('processes', 2126),
 ('demobilization', 1976),
 ('peace', 1895),
 ('information', 1872),
 ('women', 1872),
 ('weapons', 1822),
 ('excombatants', 1693),
 ('including', 1673),
 ('transitional', 1626),
 ('process', 1556),
 ('activities', 1543),
 ('disarmament', 1522),
 ('community', 1519),
 ('these', 1519),
 ('planning', 1468),
 ('mission', 1429),
 ('rights', 1337),
 ('ensure', 1264),
 ('ammunition', 1237),
 ('development', 1234),
 ('integrated', 1196),
 ('provide', 1192),
 ('communities', 1186),
 ('measures', 1144),
 ('combatants', 1135),
 ('nations', 1122),
 ('include', 1105),
 ('during', 1080),
 ('management', 1067),
 ('where', 1053),
 ('training', 1048),
 ('united', 1036),
 ('political', 1031),
 ('service', 996),
 ('between', 939),
 ('gender', 939),
 ('v

In [192]:
tokens_counted_drmkc.most_common()[0:50]

[('disaster', 3495),
 ('european', 2799),
 ('management', 2465),
 ('crisis', 1936),
 ('system', 1769),
 ('inform', 1495),
 ('commission', 1473),
 ('impacts', 1444),
 ('research', 1438),
 ('information', 1418),
 ('international', 1343),
 ('their', 1316),
 ('people', 1302),
 ('service', 1204),
 ('assessment', 1182),
 ('disasters', 1181),
 ('change', 1172),
 ('global', 1143),
 ('natural', 1127),
 ('damage', 1118),
 ('hazards', 1066),
 ('national', 1055),
 ('medium', 1035),
 ('should', 1031),
 ('index', 1010),
 ('different', 1003),
 ('science', 997),
 ('knowledge', 962),
 ('impact', 962),
 ('communication', 945),
 ('events', 942),
 ('centre', 920),
 ('model', 915),
 ('these', 903),
 ('reduction', 881),
 ('vulnerability', 881),
 ('complex', 860),
 ('severity', 853),
 ('between', 845),
 ('infrastructure', 844),
 ('africa', 842),
 ('development', 829),
 ('resilience', 826),
 ('population', 808),
 ('health', 804),
 ('report', 795),
 ('university', 790),
 ('framework', 781),
 ('protection', 775

In [193]:
# Keywords based on TF-IDF

tfidf_summarize(drr_tokens)

should            14188.745480
their              9765.367421
reintegration      9666.695295
disaster           8864.243130
national           8441.441019
support            7802.304282
groups             7771.235126
programmes         7471.068806
international      7320.198314
programme          7258.352516
management         7237.647295
security           7154.630370
european           6807.011724
information        6593.621692
force              6274.538082
processes          5544.934553
these              5436.737990
system             5257.187580
demobilization     5244.706958
women              5136.428009
weapons            5103.390608
service            4973.892393
peace              4944.823206
excombatants       4933.675336
crisis             4817.232909
including          4771.728472
natural            4703.613486
commission         4641.557578
community          4626.362940
transitional       4558.096817
activities         4546.130512
planning           4532.787977
measures

In [194]:
tfidf_summarize(un_tokens)

should            10395.325658
reintegration      6764.030336
their              6585.682620
support            5923.081635
national           5813.990352
programmes         5702.636339
groups             5617.546147
programme          5571.423257
security           5480.996830
force              4810.975571
international      4657.173792
processes          4015.492224
women              3756.238405
weapons            3733.455142
demobilization     3683.981527
information        3679.819632
peace              3602.730593
excombatants       3596.474853
transitional       3331.832086
including          3288.642224
process            3188.395281
activities         3161.757017
these              3090.722191
community          3090.722191
darfur             3034.839366
mission            2887.328584
planning           2885.670523
ammunition         2884.634533
disarmament        2837.560670
rights             2819.487605
ensure             2665.544004
measures           2504.054886
communit

In [195]:
tfidf_summarize(drmkc_tokens)

disaster          7052.642204
european          5552.192417
management        5152.451556
crisis            4171.659036
system            3933.858739
impacts           3482.679146
inform            3390.029191
commission        3193.766820
research          3060.733032
change            3000.610267
people            2991.844192
their             2889.415594
information       2878.000334
natural           2819.730063
service           2766.651618
should            2681.702822
international     2664.020870
assessment        2662.765829
damage            2658.521393
global            2644.202016
disasters         2643.271884
hazards           2552.785623
communication     2540.236583
science           2532.823706
medium            2532.561147
vulnerability     2329.037393
national          2316.362805
journal           2315.031774
model             2306.739664
complex           2292.423038
index             2275.290598
different         2274.380788
severity          2273.763781
events    

In [196]:
# LDA

## Dictionary and filter extremes
id2token = corpora.Dictionary([entry.get('tokens') for entry in data_all])
id2token.filter_extremes(no_below=0.05, no_above=0.95)

## Gensim doc2bow corpus
for entry in data_all:
    entry['doc2bow'] = id2token.doc2bow(entry.get('tokens'))    
    
tokens_bow = [entry.get('doc2bow') for entry in data_all]

## LDA model

lda_model = gensim.models.LdaMulticore(corpus = tokens_bow, 
                                       num_topics = 10, 
                                       id2word = id2token, 
                                       chunksize = 1000, 
                                       passes = 20, 
                                       workers = 4, 
                                       iterations = 2000, 
                                       random_state = 1332)

## Save model
lda_model.save(os.path.join(out_path, 'lda_model'))

In [197]:
## Compute Coherence Score - https://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf

coherence_model_lda = CoherenceModel(model=lda_model, corpus=tokens_bow, coherence='u_mass')

coherence_ldamodel = coherence_model_lda.get_coherence() 
print('\nCoherence Score: ', coherence_ldamodel)


Coherence Score:  -0.7038474234374916


In [198]:
from pprint import pprint 

# Show Topics
pprint(lda_model.show_topics(formatted=False, num_topics=15))

[(0,
  [('disaster', 0.01122476),
   ('management', 0.007977575),
   ('european', 0.0079092765),
   ('system', 0.005714025),
   ('impacts', 0.0048752055),
   ('their', 0.004089153),
   ('change', 0.003980041),
   ('disasters', 0.0038803057),
   ('information', 0.003842459),
   ('research', 0.0038103017)]),
 (1,
  [('function', 0.018646052),
   ('newcategorysectionsettingscategorysection', 0.015501083),
   ('european', 0.011650168),
   ('return', 0.011440888),
   ('commission', 0.010161978),
   ('found', 0.009384863),
   ('university', 0.009039401),
   ('research', 0.008704226),
   ('centre', 0.008032969),
   ('joint', 0.0071320636)]),
 (2,
  [('inform', 0.028230362),
   ('index', 0.017968168),
   ('severity', 0.015301224),
   ('crisis', 0.011634377),
   ('people', 0.008789542),
   ('dimension', 0.0065143546),
   ('population', 0.0063117063),
   ('indicators', 0.006056953),
   ('model', 0.0054247826),
   ('vulnerability', 0.0050228564)]),
 (3,
  [('reintegration', 0.008804386),
   ('nat