In [1]:
import json
import os
import pandas as pd
from itertools import compress

In [2]:
# Location of the scrape output, relative to the current working directory.
datapath = os.path.join('..', 'data')
filepath = os.path.join(datapath, 'drmkc-scrape_20210218.json')

# load data

# The encoding is pinned to UTF-8: without it open() falls back to the platform
# locale encoding, which can raise or mis-decode non-ASCII characters in the JSON.
with open(filepath, 'r', encoding='utf-8') as f:
    data = json.load(f)

In [3]:
# Functions

def split_urls(entry):
    """Classify the links of one scraped page and store the groups on the entry.

    A link counts as internal when it contains "drmkc.jrc.ec" or does not
    contain "http". A link counts as a PDF when it contains ".pdf". Every
    link that is not internal is external.

    Parameters
    ----------
    entry : dict
        Page record with a 'links' key holding a list of link strings.
        A missing or None 'links' value is treated as an empty list.

    Returns
    -------
    dict
        The same ``entry`` object, mutated in place, with three added keys:
        'internals' and 'pdfs' (page order, duplicates kept) and 'externals'
        (order of first occurrence on the page, duplicates removed).
    """
    links = entry.get('links') or []

    internals = [link for link in links if "drmkc.jrc.ec" in link or "http" not in link]
    pdfs = [link for link in links if ".pdf" in link]

    # dict.fromkeys de-duplicates while keeping first-seen order. A plain set
    # difference would return the externals in arbitrary order, so the same
    # page could yield differently ordered lists from one run to the next.
    internal_lookup = set(internals)
    externals = list(dict.fromkeys(link for link in links if link not in internal_lookup))

    entry['internals'] = internals
    entry['pdfs'] = pdfs
    entry['externals'] = externals

    return entry


In [4]:
# split_urls adds the 'internals', 'pdfs' and 'externals' keys to each entry in
# place and returns that same entry, so its return value need not be kept.
for entry in data:
    split_urls(entry)

In [5]:
# Build one DataFrame row per scraped page; the keys of each entry become columns.
df = pd.DataFrame.from_records(data)

In [6]:
# Preview the first rows of the page-level table.
df.head()

Unnamed: 0,url,links,date-of-access,keywords_matched,internals,pdfs,externals
0,https://drmkc.jrc.ec.europa.eu/,[http://drmkc.jrc.ec.europa.eu/Overview/Events...,2021-02-18,"[publicly available, social media]",[http://drmkc.jrc.ec.europa.eu/Overview/Events...,[/PRIVACY_STATEMENT/Privacy_Statement_DRMKC_We...,"[https://ec.europa.eu/jrc/en, https://europa.e..."
1,https://drmkc.jrc.ec.europa.eu/knowledge/Gaps-...,[http://drmkc.jrc.ec.europa.eu/Overview/Events...,2021-02-18,[social media],[http://drmkc.jrc.ec.europa.eu/Overview/Events...,[/PRIVACY_STATEMENT/Privacy_Statement_DRMKC_We...,"[https://ec.europa.eu/jrc/en, https://europa.e..."
2,https://drmkc.jrc.ec.europa.eu/knowledge/PROJE...,[http://drmkc.jrc.ec.europa.eu/Overview/Events...,2021-02-18,[social media],[http://drmkc.jrc.ec.europa.eu/Overview/Events...,[/PRIVACY_STATEMENT/Privacy_Statement_DRMKC_We...,[https://www.securityresearch-cou.eu/thethemes...
3,https://drmkc.jrc.ec.europa.eu/knowledge/Chall...,[http://drmkc.jrc.ec.europa.eu/Overview/Events...,2021-02-18,[social media],[http://drmkc.jrc.ec.europa.eu/Overview/Events...,[/PRIVACY_STATEMENT/Privacy_Statement_DRMKC_We...,"[https://ec.europa.eu/jrc/en, https://europa.e..."
4,https://drmkc.jrc.ec.europa.eu/overview/Documents,[http://drmkc.jrc.ec.europa.eu/Overview/Events...,2021-02-18,[social media],[http://drmkc.jrc.ec.europa.eu/Overview/Events...,[/PRIVACY_STATEMENT/Privacy_Statement_DRMKC_We...,"[https://ec.europa.eu/jrc/en, https://europa.e..."


In [10]:
# Keywords found

# One row per (page, matched keyword) pair; the page's row label is repeated
# when a page matched several keywords.
df_kw = df.explode('keywords_matched')

# Number of rows (page/keyword pairs) for each keyword, most frequent first.
df_kw.loc[:, 'keywords_matched'].value_counts()

social media               42
government                 10
lidar                       4
publicly available          3
applications                2
agriculture                 2
participatory               2
damage assessment           1
artificial intelligence     1
situational awareness       1
drought risk assessment     1
Name: keywords_matched, dtype: int64

In [16]:
# Preview the exploded table: one row per (page, keyword) pair.
df_kw.head()

Unnamed: 0,url,links,date-of-access,keywords_matched,internals,pdfs,externals
0,https://drmkc.jrc.ec.europa.eu/,[http://drmkc.jrc.ec.europa.eu/Overview/Events...,2021-02-18,publicly available,[http://drmkc.jrc.ec.europa.eu/Overview/Events...,[/PRIVACY_STATEMENT/Privacy_Statement_DRMKC_We...,"[https://ec.europa.eu/knowledge4policy, https:..."
0,https://drmkc.jrc.ec.europa.eu/,[http://drmkc.jrc.ec.europa.eu/Overview/Events...,2021-02-18,social media,[http://drmkc.jrc.ec.europa.eu/Overview/Events...,[/PRIVACY_STATEMENT/Privacy_Statement_DRMKC_We...,"[https://ec.europa.eu/knowledge4policy, https:..."
1,https://drmkc.jrc.ec.europa.eu/knowledge/Gaps-...,[http://drmkc.jrc.ec.europa.eu/Overview/Events...,2021-02-18,social media,[http://drmkc.jrc.ec.europa.eu/Overview/Events...,[/PRIVACY_STATEMENT/Privacy_Statement_DRMKC_We...,"[https://ec.europa.eu/knowledge4policy, https:..."
2,https://drmkc.jrc.ec.europa.eu/knowledge/PROJE...,[http://drmkc.jrc.ec.europa.eu/Overview/Events...,2021-02-18,social media,[http://drmkc.jrc.ec.europa.eu/Overview/Events...,[/PRIVACY_STATEMENT/Privacy_Statement_DRMKC_We...,[https://www.securityresearch-cou.eu/thethemes...
3,https://drmkc.jrc.ec.europa.eu/knowledge/Chall...,[http://drmkc.jrc.ec.europa.eu/Overview/Events...,2021-02-18,social media,[http://drmkc.jrc.ec.europa.eu/Overview/Events...,[/PRIVACY_STATEMENT/Privacy_Statement_DRMKC_We...,"[https://ec.europa.eu/knowledge4policy, https:..."


In [21]:
# Keep only the page URL and the matched keyword, then write that table to an
# Excel workbook in the data folder without the index column.
df_kw_filter = df_kw.loc[:, ['url', 'keywords_matched']]
keywords_xlsx = os.path.join(datapath, "DRMKC-EU_urls-keywords.xlsx")
df_kw_filter.to_excel(keywords_xlsx, index = False)

In [44]:
# Pdfs

# One row per (page, pdf link) pair.
df_pdf = df.explode('pdfs')

# Unique PDF links across all pages. explode() emits NaN for a page whose
# 'pdfs' list is empty, so those placeholders are dropped before building the
# set; otherwise NaN would be reported as if it were a link.
pdfs = set(df_pdf['pdfs'].dropna())

pdfs

{'/PRIVACY_STATEMENT/Privacy_Statement_DRMKC_Website.pdf',
 '/Portals/0/Partnerships/Loss/0-Survey/Loss Database Survey Form -France.pdf',
 '/Portals/0/Partnerships/Loss/0-Survey/Loss Database Survey Form-Bulgaria.pdf',
 '/Portals/0/Partnerships/Loss/1-Nov-2013/1_MINUTES Disaster Loss Data Workshop 15 November 2013.pdf',
 '/Portals/0/Partnerships/Loss/1-Nov-2013/3_Agenda (updated).pdf',
 '/Portals/0/Partnerships/Loss/2-Feb-014/20140213 DraftMinutesWorksho onDisasterLossDataSysteminEU_revised.pdf',
 '/Portals/0/Partnerships/Loss/2-Feb-014/Invitation 2nd Workshop on Loss Data in EU v1.pdf',
 '/Portals/0/Partnerships/Loss/3-May-014/20140520_IRDREULossDataWorkshop_Minutes.pdf',
 '/Portals/0/Partnerships/Loss/3-May-014/Assessment of data quality and uncertainty in disaster loss analysis - DRAFT.pdf',
 '/Portals/0/Partnerships/Loss/3-May-014/Joint agenda  Loss Data in EU  with IRDR meeting v2 (1)_de_1.pdf',
 '/Portals/0/Partnerships/Loss/4-Oct-2014/Minutes_STATE OF THE ART OF DISASTER LOSS D

In [43]:
# Externals

# One row per (page, external link) pair.
df_ext = df.explode('externals')

# Unique external links across all pages. explode() emits NaN for a page whose
# 'externals' list is empty, so those placeholders are dropped before building
# the set; otherwise NaN would be reported as if it were a link.
exts = set(df_ext['externals'].dropna())

exts

{' http://eur-lex.europa.eu/legal-content/EN/TXT/;ELX_SESSIONID=ZQnbJ1JTj0Bd5MvQyGf2gPCGx5byd16Y4Hg53FNVpvGFY0y0y1Rh!1404494154?uri=celex:32014R0661',
 ' http://eur-lex.europa.eu/legal-content/EN/TXT/?qid=1401179579415&uri=CELEX:32013D1313',
 ' http://eur-lex.europa.eu/legal-content/EN/TXT/?qid=1523606241167&uri=CELEX:32007L0060',
 ' http://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A32013R1300',
 ' http://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:01996L0082-20120813',
 ' http://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:31982L0501',
 ' http://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:32012L0018',
 ' http://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:52013DC0216',
 ' https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A12012A%2FTXT',
 ' https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=celex:32008L0114',
 'http://aristotle.ingv.it/tiki-index.php',
 'http://dx.doi.org/10.2760/647488',
 'http://dx.doi.org/10.2760/840421',
 'http://dx.doi.org/

In [84]:
# alternate structure

# Long format: one row per (page, link) pair, renumbered from 0, keeping only
# the page-level columns.
df_links = df.explode('links')
df_links = df_links.reset_index(drop = True)
df_links = df_links.loc[:, ['url', 'links', 'date-of-access', 'keywords_matched']]

# Missing links are replaced by the two-character placeholder '""' and the
# column is then passed through convert_dtypes().
df_links['links'] = df_links['links'].fillna('""').convert_dtypes()


def is_internal_link(link):
    """Return True when the link contains "drmkc.jrc.ec" or does not contain "http"."""
    return "drmkc.jrc.ec" in link or "http" not in link


def is_pdf_link(link):
    """Return True when the link contains ".pdf"."""
    return ".pdf" in link


# Note: the '""' placeholder contains no "http", so placeholder rows are
# flagged internal=True.
df_links['internal'] = df_links['links'].apply(is_internal_link)
df_links['pdf'] = df_links['links'].apply(is_pdf_link)

In [85]:
# Preview the link-level table with its 'internal' and 'pdf' flags.
df_links.head()

Unnamed: 0,url,links,date-of-access,keywords_matched,internal,pdf
0,https://drmkc.jrc.ec.europa.eu/,http://drmkc.jrc.ec.europa.eu/Overview/Events,2021-02-18,"[publicly available, social media]",True,False
1,https://drmkc.jrc.ec.europa.eu/,/overview/Events,2021-02-18,"[publicly available, social media]",True,False
2,https://drmkc.jrc.ec.europa.eu/,/knowledge/Gaps-Explorer,2021-02-18,"[publicly available, social media]",True,False
3,https://drmkc.jrc.ec.europa.eu/,/knowledge/Challenges-Sharing,2021-02-18,"[publicly available, social media]",True,False
4,https://drmkc.jrc.ec.europa.eu/,http://drmkc.jrc.ec.europa.eu,2021-02-18,"[publicly available, social media]",True,False


In [99]:
# One row per (page, link, keyword) combination. reset_index() is called without
# drop=True, so the previous row labels of df_links are kept in a new 'index' column.
df_linkskw = df_links.explode('keywords_matched').reset_index()

In [105]:
# Unique non-internal links found on pages whose matched keyword is "lidar".
lidar_external_rows = (df_linkskw['keywords_matched'] == "lidar") & df_linkskw['internal'].eq(False)
set(df_linkskw.loc[lidar_external_rows, 'links'])

{' http://eur-lex.europa.eu/legal-content/EN/TXT/;ELX_SESSIONID=ZQnbJ1JTj0Bd5MvQyGf2gPCGx5byd16Y4Hg53FNVpvGFY0y0y1Rh!1404494154?uri=celex:32014R0661',
 ' http://eur-lex.europa.eu/legal-content/EN/TXT/?qid=1401179579415&uri=CELEX:32013D1313',
 ' http://eur-lex.europa.eu/legal-content/EN/TXT/?qid=1523606241167&uri=CELEX:32007L0060',
 ' http://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A32013R1300',
 ' http://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:01996L0082-20120813',
 ' http://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:31982L0501',
 ' http://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:32012L0018',
 ' http://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:52013DC0216',
 ' https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A12012A%2FTXT',
 ' https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=celex:32008L0114',
 'http://aristotle.ingv.it/tiki-index.php',
 'http://dx.doi.org/10.2760/647488',
 'http://dx.doi.org/10.2760/840421',
 'http://dx.doi.org/