In [70]:
import argparse
import concurrent.futures
import logging
import datetime
import requests
import re
import urllib.error
import itertools
import pprint
import ast
import bs4
import textract
import os
from pathlib import Path
import better_exceptions
import subprocess

# Text encodings: ENCODE_IN is applied to fetched pages in get_soup,
# ENCODE_OUT is used when pubpager writes extracted text to disk.
ENCODE_IN = "utf-8"
ENCODE_OUT = "utf-8"

# Upper bound of the fetch loop in get_soup.
MAXTRIES = 5

# Tag page that seeds the crawl (see main).
STARTURL = "https://www.mercatus.org/tags/federalism"

# Crawl registries, filled by gather_links. Keys are URLs; the value is 0
# until the matching controller hands the URL to its pager, then 1.
pubpages = dict()
tagpages = dict()

#### UTILITY FUNCTIONS #########################################################
## The following functions return a soup object from a URL and perform the 
## gathering of 'tag' links and 'document' links for pages. Both are called by
## functions focusing on both tag and document pages.

def get_soup(url):
    """Fetch *url* and return its HTML parsed as a BeautifulSoup object.

    The request is attempted up to ``MAXTRIES`` times. A failed attempt
    (any ``requests.RequestException``: connection error, timeout, ...) is
    retried after a short, growing pause; the first successful response is
    parsed and returned immediately.

    The HTTP status code is deliberately not checked: error pages are
    parsed like any other page and the callers cope with missing elements.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        The parsed page, or ``None`` in the degenerate case ``MAXTRIES == 0``.

    Raises:
        requests.RequestException: the last error seen, once every attempt
            has failed.
    """
    import time  # local import: only needed for the retry back-off

    last_error = None
    for attempt in range(MAXTRIES):
        try:
            # A timeout keeps one stalled connection from hanging the crawl.
            response = requests.get(url, timeout=30)
        except requests.RequestException as err:
            last_error = err
            if attempt + 1 < MAXTRIES:
                time.sleep(attempt + 1)
            continue
        # The raw bytes are parsed, so the encoding has to be handed to the
        # parser; setting it on the response object would have no effect.
        return bs4.BeautifulSoup(response.content, "lxml",
                                 from_encoding=ENCODE_IN)

    if last_error is not None:
        raise last_error

def gather_links(soup):
    """Register every tag and publication link found on a page.

    Each ``<a href=...>`` in *soup* is split on ``/`` and classified:

    * more than three parts - treated as an absolute URL
      (``scheme://host/section/...``). A ``tags`` section is stored in
      ``tagpages`` as-is (the host of tag links is not checked). A
      ``publication``/``publications``/``commentary`` section is stored in
      ``pubpages`` only when the link starts with ``http`` and the second
      dot-separated label of the host is ``mercatus``.
    * exactly three parts - treated as a site-relative link
      (``/section/slug``) and stored with ``https://www.mercatus.org``
      prepended.

    Tag links containing ``?`` (paginated views) are ignored. New entries
    get the value 0, meaning "not yet processed"; URLs already present keep
    their current value.

    Args:
        soup: Parsed page, as returned by ``get_soup``.
    """
    site_root = 'https://www.mercatus.org'
    pub_sections = ("publication", "publications", "commentary")

    for anchor in soup.find_all('a', {'href': True}):
        href = anchor['href']
        parts = href.split('/')

        if len(parts) > 3:
            section = parts[3]
            if section == "tags" and href not in tagpages and '?' not in href:
                tagpages[href] = 0

            # Hosts without a dot (e.g. "localhost") have no second label;
            # check the length instead of indexing blindly.
            host_labels = parts[2].split('.')
            on_mercatus = len(host_labels) > 1 and host_labels[1] == 'mercatus'
            if (section in pub_sections and href not in pubpages
                    and href[:4] == "http" and on_mercatus):
                pubpages[href] = 0

        elif len(parts) == 3:
            section = parts[1]
            absolute = site_root + href
            if section == "tags" and absolute not in tagpages and '?' not in href:
                tagpages[absolute] = 0
            if section in pub_sections and absolute not in pubpages:
                pubpages[absolute] = 0
                    
#### PAGER FUNCTIONS ###########################################################
## The following functions handle the tag and publication pages, respectively.
## Handling tag pages is a simple matter of collecting all of the links on the
## tag page and sorting them into the `pubpages` and `tagpages` dictionaries.
## Handling publication pages is more complicated, as metadata must be collected
## along with text, and these must be saved to a text file. An additional
## function called `find_and_read_pdfs` extracts text from PDFs for the pubpager
## function.
    
def tagpager(url, max_pages=3):
    """Collect the links from the first *max_pages* result pages of a tag.

    The pages are requested as ``url?page=0``, ``url?page=1``, ... and each
    one is passed to ``gather_links``, which files new tag and publication
    URLs in the ``tagpages`` / ``pubpages`` registries.

    Args:
        url: URL of the tag page, without a query string.
        max_pages: Number of paginated views to fetch, starting at page 0.
            Defaults to 3.
    """
    print('running tagpager on ' + url)
    for page_number in range(max_pages):
        soup = get_soup(url + "?page=" + str(page_number))
        gather_links(soup)

def _div_text(soup, *class_names):
    """Return the stripped text of the first ``<div>`` carrying one of
    *class_names* (tried in order), or ``None`` when none is present."""
    for class_name in class_names:
        node = soup.find("div", class_=class_name)
        if node is not None:
            return node.text.strip()
    return None


def _page_body_text(soup):
    """Join the text of every body-field ``<div>`` on the page."""
    body_divs = soup.find_all("div", class_="field-type-text-with-summary")
    return " ".join([div.text for div in body_divs])


def _build_outpath(tag, authors, pub_style, title, date):
    """Build the output file path for one tag's copy of a document."""
    return Path("new_data/" + tag + "/" + authors + "__" +
                pub_style + "__" + title + " - " + date + '.txt')


def _save_publication(soup, url):
    """Extract metadata and text from a publication page and save one copy
    of the text per tag under ``new_data/<tag>/``."""
    section = url.split("/")[3]
    if section not in ("commentary", "publications", "publication"):
        print('Skipping text extraction; unrecognised section in ' + url)
        return

    # Tags are the whole point of the output, so check them before doing
    # any expensive work (PDF downloads) for a document we would discard.
    tag_box = soup.find('div', class_='field-name-field-tags')
    if tag_box is None:
        print('Could not find tags for ' + url)
        return
    tagpattern = re.compile(r'^/tags/')
    tags = [link.text.replace('/', '-')
            for link in tag_box.find_all('a', href=tagpattern)]

    # Publication style: each section has a preferred element, but fall
    # back to the other one instead of crashing when it is absent.
    if section == "commentary":
        pub_style = _div_text(soup, "pane-node-content-type",
                              "field-name-field-publication-type")
    else:
        pub_style = _div_text(soup, "field-name-field-publication-type",
                              "pane-node-content-type")
    pub_style = (pub_style or "").replace('/', '-')

    # Title, falling back to the last URL segment when the page has none.
    title = _div_text(soup, "pane-node-title")
    if title is None:
        title = url.rstrip('/').split('/')[-1]
    title = title.replace('/', '-')

    # Date and (at most two) author names. '/' is replaced everywhere
    # because these strings become part of a file name.
    date_node = soup.find("span", class_="date-display-single")
    if date_node is None:
        date = "undated"
    else:
        date = date_node.text.strip().replace('/', '-')

    people_box = soup.find('div', {'class': 'field-name-field-people'})
    names = []
    if people_box is not None:
        names = [node.text.replace('/', '-')
                 for node in people_box.find_all('h4', {'class': 'node-title'})]
    authors = '--'.join(names[:2])

    # Commentary text lives on the page itself; publications are read from
    # their PDFs, with the page body as a fallback when the PDFs yield
    # (almost) nothing.
    if section == "commentary":
        text = _page_body_text(soup)
    else:
        text = find_and_read_pdfs(soup, url)
        if len(text) < 40:
            text = _page_body_text(soup)

    # The source URL is appended to the end of the saved text.
    text += url

    for tag in tags:
        outpath = _build_outpath(tag, authors, pub_style, title, date)

        # If the path is too long to save, drop 20 characters from the
        # title and try again. Stop once the title is used up, otherwise
        # an over-long tag/author prefix would loop forever.
        while len(str(outpath)) > 220 and title:
            title = title[:-20]
            outpath = _build_outpath(tag, authors, pub_style, title, date)

        try:
            outpath.parent.mkdir(parents=True, exist_ok=True)
            outpath.write_text(text, encoding=ENCODE_OUT)
        except OSError as err:
            # One unwritable file should not stop the whole crawl.
            print('Could not save ' + str(outpath) + ': ' + str(err))


def pubpager(url):
    """Process one publication or commentary page.

    The page is fetched and its publication type determined. Unless the
    type mentions ``visualization`` or ``book``, the document's text is
    saved once per tag as
    ``new_data/<tag>/<authors>__<style>__<title> - <date>.txt``.
    Finally every link on the page is passed to ``gather_links`` so new
    tag and publication pages join the crawl. This also happens when the
    document is not saved (excluded type, no tags).

    If the publication type cannot be determined even after re-fetching
    the page, a message is printed and the page is abandoned without
    gathering links.

    Args:
        url: Absolute URL of the publication page.
    """
    print('running pubpager on ' + url)
    soup = get_soup(url)

    # The publication type decides whether and how text is extracted.
    pubtype = _div_text(soup, "field-name-field-publication-type")
    if pubtype is None:
        # Fetch the page once more and fall back to the content-type pane.
        try:
            soup = get_soup(url)
            pubtype = _div_text(soup, "pane-node-content-type")
        except Exception:
            pubtype = None
        if pubtype is None:
            print('Bailing out; error fetching basics on ' + url)
            return
    pubtype = pubtype.lower()

    if 'visualization' not in pubtype and 'book' not in pubtype:
        _save_publication(soup, url)

    # Always look for additional tag or pub links that are not yet in
    # tagpages or pubpages.
    gather_links(soup)

def find_and_read_pdfs(soup, url):
    """Download every PDF linked from a page and return the extracted text.

    Every ``<a>`` that has a ``title`` attribute and an ``href`` ending in
    ``.pdf`` is downloaded into ``raw/``, converted with the external
    ``pdftotext`` tool, and deleted again. The text of all PDFs is
    concatenated in page order and runs of whitespace are collapsed to
    single spaces.

    A PDF that cannot be downloaded or converted is reported on stdout and
    skipped; it never aborts the crawl.

    Args:
        soup: Parsed publication page.
        url: URL of the page. Currently unused; kept for the callers.

    Returns:
        The whitespace-normalised text of all readable PDFs, or an empty
        string when none could be read.
    """
    pdfpattern = re.compile(r"\.pdf$")
    raw_dir = Path('raw')
    raw_dir.mkdir(parents=True, exist_ok=True)

    extracted = []
    for pdf in soup.find_all('a', href=pdfpattern, title=True):
        label = pdf['title']
        href = pdf['href']

        # The title comes from the page; neutralise path separators so the
        # temporary file always lands directly inside raw/.
        local_path = raw_dir / label.replace('/', '-')

        # Site-relative links need the host prepended. Test the scheme
        # prefix, not a substring: "http" may appear inside a file name.
        if href.startswith(('http://', 'https://')):
            pdf_url = href
        else:
            pdf_url = 'http://mercatus.org' + href

        try:
            response = requests.get(pdf_url, timeout=60)
            local_path.write_bytes(response.content)
            output = subprocess.check_output(
                ['pdftotext', '-enc', 'UTF-8', str(local_path), '-'],
                stderr=subprocess.DEVNULL)
        except requests.RequestException:
            # Must precede OSError: RequestException is a subclass of it.
            print("DOWNLOAD ERROR OCCURRED ON " + label)
        except (subprocess.CalledProcessError, OSError):
            # pdftotext failed or is missing, or the file was unwritable.
            print("MYSTERY??? ERROR OCCURRED ON " + label)
        else:
            # errors="ignore" drops undecodable bytes instead of raising.
            extracted.append(output.decode('utf-8', errors="ignore"))
            print("Successfully extracted " + label)
        finally:
            # Clean up the temporary download, whether or not it worked.
            try:
                local_path.unlink()
            except OSError:
                pass

    return " ".join(" ".join(extracted).split())

#### CONTROLLER FUNCTIONS ######################################################
## These functions manage the `pubpages` and `tagpages` dictionaries, looking
## for urls with value 0, setting them to 1, and running the necessary pager
## function on that url.
    
def pubcontroller():
    """Run ``pubpager`` on every publication URL still marked 0.

    The pending URLs are snapshotted first, because ``pubpager`` may add
    entries to ``pubpages`` while it runs. Each URL is marked 1 before it
    is processed.
    """
    pending = [address for address in pubpages if pubpages[address] == 0]
    for address in pending:
        pubpages[address] = 1
        pubpager(address)

def tagcontroller():
    """Run ``tagpager`` on every tag URL still marked 0.

    The pending URLs are snapshotted first, because ``tagpager`` may add
    entries to ``tagpages`` while it runs. Each URL is marked 1 before it
    is processed.
    """
    pending = [address for address in tagpages if tagpages[address] == 0]
    for address in pending:
        tagpages[address] = 1
        tagpager(address)
        
#### MAIN FUNCTION ############################################################
## The main function runs an initial tagpager call on the STARTURL and then
## begins a `while True` loop that breaks only when all pubpages and tagpages
## values are 1. 

def main():
    """Crawl outward from ``STARTURL`` until no registered page is pending.

    The start tag page seeds the registries; after that the publication
    and tag controllers alternate for as long as either ``pubpages`` or
    ``tagpages`` still holds a URL marked 0.
    """
    def has_pending(registry):
        return 0 in registry.values()

    tagpager(STARTURL)
    while has_pending(pubpages) or has_pending(tagpages):
        if has_pending(pubpages):
            print('running pubcontroller')
            pubcontroller()
        if has_pending(tagpages):
            print('running tagcontroller')
            tagcontroller()
            
# Start the crawl only when this code is run directly, so that importing
# the module (e.g. to reuse its functions) does not trigger network traffic.
if __name__ == "__main__":
    main()

running tagpager on https://www.mercatus.org/tags/federalism
running pubcontroller
running pubpager on https://www.mercatus.org/publication/should-puerto-rico-be-allowed-restructure-its-debt-mercatus-debate
Successfully extracted Verret-Joffe-Puerto-Rico-Debate-v2.pdf
running pubpager on https://www.mercatus.org/publication/be-rather-seem-fiscal-responsibility-and-political-economy-north-carolina
Successfully extracted Miller-Fiscal-Responsibility-NC.pdf
Successfully extracted Miller-Fiscal-Responsibility-NC-summary.pdf
running pubpager on https://www.mercatus.org/publication/political-economy-medicaid-expansion-federalism-interest-groups-and-aca
Successfully extracted Sutter-Medicaid-Expansion-v1.pdf
Successfully extracted Sutter-Medicaid-Expansion-sum-v1.pdf
running pubpager on https://www.mercatus.org/publications/capitol-hill-state-house-or-city-hall-debating-location-political-power-and-decision
running pubpager on https://www.mercatus.org/publications/federal-highway-funding-need