In [19]:
import os
import json
from itertools import compress
from itertools import chain
from urllib.parse import urljoin
from urllib.parse import urlparse
import requests
import re
import time
import random

data_path = os.path.join("D:/", "data", "drmkc")
pdf_path = os.path.join(data_path, "pdfs")

filename = "drr-scrape_total_20210505.json"

with open(os.path.join(data_path, filename), 'r') as f:
    data = json.load(f)
    
domain_url = "https://" + urlparse(data[0].get('url')).netloc
pdfs = list(set([urljoin(domain_url, url) for url in list(chain(*[list(compress(entry['links'], [(".pdf" in link) for link in entry['links']])) for entry in data]))]))

if not os.path.isdir(pdf_path):
    os.mkdir(pdf_path)

In [21]:
for entry in data:
    entry['domain_url'] = urlparse(entry.get('url')).netloc

In [22]:
exclude_kw = ['social media', 'government', 'participatory', 'youtube', 'nongovernmental organizations', 'notifications']

data_exclude = [entry for entry in data if not all([kw in exclude_kw for kw in entry.get('keywords_matched')])]

#data_exclude[0]

In [23]:
drop_links_manual = ['twitter', 'facebook', 'youtube', 'google', 'nature.com', 'zoom.us', 'goo.gl', 'bit.ly', 'flcikr', 't.co', 'medium.com', 'github', 'tandfonline', 'linkedin', 'bbc.co.uk', 'news24.com', 'vimeo', 'dx.doi.org']

data_exclude = [entry for entry in data if not any([url in entry.get('url') for url in drop_links_manual])]

In [24]:
domain_urls = list(set([entry.get('domain_url') for entry in data_exclude]))

In [25]:
domain_urls

['drmkc.jrc.ec.europa.eu',
 'covid-statistics.jrc.ec.europa.eu',
 'www.who.int',
 'eur-lex.europa.eu',
 'startnetwork.org',
 'www.securityresearch-cou.eu',
 'www.undrr.org',
 'dppa.un.org']

In [26]:
domain_urls = [url for url in domain_urls if url != 'www.who.int']

In [28]:
def url_to_filename(url):
    url = re.sub(r'(https\:\/\/(www\.)?)|(http\:\/\/(www\.)?)', '', url)
    url = re.sub(r'\:\d{2,4}(?=\/)', '', url)
    urlpart = re.search(r'(\w+?)\.\w{2,11}(\.\w{2,5})?(?=\/)', url).group(1)
    namepart = re.search(r'\.\w{2,11}(\/.+\.pdf)', url).group(1).replace("/", "-").replace("\\", "-")
    namepart = namepart.replace("?", "")
    filename = urlpart + namepart
    return(filename)

for domain_url in domain_urls:
    
    missed_pdfs = []
    
    save_path = os.path.join(pdf_path, domain_url)
    
    if not os.path.isdir(save_path):
        os.mkdir(save_path)
    
    domain_set = [entry for entry in data_exclude if entry.get('domain_url') == domain_url]
    
    url_prefix = "https://" + domain_url
    
    pdfs = list(set([urljoin(url_prefix, url) for url in list(chain(*[list(compress(entry['links'], [(".pdf" in link) for link in entry['links']])) for entry in domain_set]))]))
    
    print("downloading pdfs for {}\n".format(domain_url))
    for c, pdf_url in enumerate(pdfs, start = 1):
    
        filename = url_to_filename(pdf_url)
        
        if os.path.isfile(os.path.join(save_path, filename)):
            continue
        
        try:
            r = requests.get(pdf_url, stream=True)
        except:
            missed_pdfs.append(pdf_url)
            continue

        if r.status_code == 200:
            with open(os.path.join(save_path, filename), 'wb') as f:
                f.write(r.content)
            time.sleep(random.uniform(0.5, 1))
        else:
            missed_pdfs.append(pdf_url)
            continue

        progress = "|{0}| {1:.2f} %".format(("="*int(c/len(pdfs) * 50)).ljust(50), c/len(pdfs) * 100)
    
        print(progress, end = "\r")
        
        with open(os.path.join(save_path, 'missed_pdf.txt'), 'w', encoding = 'utf-8') as f:
            for url in missed_pdfs:
                f.write(url + "\n")
            f.close()
        
    print("\n")

downloading pdfs for drmkc.jrc.ec.europa.eu


downloading pdfs for covid-statistics.jrc.ec.europa.eu



downloading pdfs for eur-lex.europa.eu



downloading pdfs for startnetwork.org


downloading pdfs for www.securityresearch-cou.eu



downloading pdfs for www.undrr.org



downloading pdfs for dppa.un.org




In [7]:
pdf_url

'https://who.foundation/cms/wp-content/uploads/2020/11/TFIU_HEALING_ARTS_PR_6-10-2020_WHO-1.pdf'

In [90]:
url = re.sub(r'(https\:\/\/(www\.)?)|(http\:\/\/(www\.)?)', '', pdf_url)
url = re.sub(r'\:\d{2,4}(?=\/)', '', url)
urlpart = re.search(r'(\w+?)\.\w{2,5}(\.\w{2,5})?(?=\/|\:)', url).group(1)
namepart = re.search(r'\.\w{2,5}(\/.+\.pdf)', url).group(1).replace("/", "-").replace("\\", "-")
filename = urlpart + namepart

In [91]:
filename

'ggtc-dmdocuments-5.3%20toolkit%202015.pdf'

In [85]:
filename

'imperial-bitstream-10044-1-77482-14-2020-03-16-COVID19-Report-9.pdf'