# In [3]:
#!/usr/bin/env python
# coding: utf-8

import pathlib
import pprint
import json
import re
import requests
import time
import config
import argparse
import logging
import urllib.parse
import csv
from datetime import datetime


# Wayback Machine save endpoint; archive_url POSTs capture requests here.
ENDPT = 'https://web.archive.org/save/'
# The user agent string and the API key pair are read from the local config module.
UA_STRING = config.UA_STRING
ACCESS_KEY = config.ACCESS_KEY
SECRET_KEY = config.SECRET_KEY
# Headers sent with every request: ask for a JSON response, identify ourselves,
# and authenticate with the access/secret key pair.
HEADERS = {'Accept':'application/json',
           'User-Agent': UA_STRING,
           'Authorization': f'LOW {ACCESS_KEY}:{SECRET_KEY}'}
IF_NOT_ARCHIVED_WITHIN = '20h' # Sent as the 'if_not_archived_within' field: if an archive was already made within this window, don't make another one

# In [14]:
def main():
    '''Parse the command-line arguments and archive every metadata file found.

    Recursively collects the ``*.json`` files below the ``-i`` directory and passes
    them to ``archive_files`` together with the ``-o`` output path and the
    ``--ignore_self_links`` flag.
    '''
    parser = argparse.ArgumentParser(description='Creates job ')
    # Both paths are mandatory. Without them the script used to die later with an
    # opaque TypeError from pathlib.Path(None); argparse now reports the missing option.
    parser.add_argument('-i', required=True, help='Input directory with metadata files')
    parser.add_argument('-o', required=True, help='Location to save job id file')
    ## TODO: Maybe switch this so default is to ignore?
    parser.add_argument('--ignore_self_links', help='Whether to ignore links from the same domain as the query',
            action='store_true')

    args = parser.parse_args()

    # Lazily collect every JSON metadata file below the input directory
    files = pathlib.Path(args.i).glob('**/*.json')
    ## FOR TESTING ONLY!!!
    #files = list(files)[10:11]
    archive_files(files, args.o, args.ignore_self_links)
    
def archive_files(files, output_file, ignore_self_links):
    '''Submit the URLs found in each metadata file for archiving and record the job ids.

    For every file the query URL is submitted first (with outlink capture), followed by
    each link URL that survives ``filter_link_urls``. Every submission is appended to
    ``output_file`` as a ``[timestamp, url, job_id]`` CSV row and remembered in
    ``completed_urls``, which is seeded from ``dict_from_csv(output_file)``.

    Files that fail with a connection error are retried in later passes, with a pause
    between passes, for at most three passes in total.

    files: iterable of paths to JSON metadata files
    output_file: path of the CSV file that job ids are appended to
    ignore_self_links: if true, links on the same domain as the query URL are skipped
        the first time they are seen
    '''
    max_attempts = 3
    retry_delay = 30  # seconds to wait between passes over the failed files
    cache_regex = re.compile(r'https://webcache\.googleusercontent\.com|https://cc\.bingj\.com')

    completed_urls = dict_from_csv(output_file)
    # Self links seen exactly once so far; a second sighting promotes them to "archive it"
    skipped_urls = set()

    def archive_urls(fn):
        '''Takes a file, gets the urls to archive, and passes them to the archive_url function'''
        with open(fn, 'r', encoding='utf-8') as f:
            j_obj = json.load(f)
        # Get the URLs from the file
        query_url, link_urls = get_urls_from_json(j_obj)
        # Filter out the self links and search engine cache urls
        link_urls = filter_link_urls(query_url, link_urls)

        # Append rather than truncate: the file accumulates the jobs of every input
        # file and is what a later run reads to find out what is already done.
        with open(output_file, 'a', newline='') as out_file:
            writer = csv.writer(out_file)
            # Get outlinks for the query URL. This gets these jobs started early, so some will
            # hopefully be done by the time we make the calls
            query_job = archive_url(query_url, capture_outlinks=1)
            store_job_id(writer, query_url, query_job)
            for url in link_urls:
                job_id = archive_url(url)
                store_job_id(writer, url, job_id)

    def store_job_id(f, url, job_id):
        '''Writes the result of an archive operation to a csv file (f) and the complete_urls dict'''
        timestamp = datetime.now()
        f.writerow([timestamp, url, job_id])
        completed_urls[url] = job_id

    def filter_link_urls(query_url,
                         urls,
                         remove_cache=True):
        '''
        Takes link urls and filters them in three ways:
        1. (Optionally) Ignores urls from the two caches:
        webcache.googleusercontent.com
        cc.bingj.com
        2. Filters out those which are in the completed_urls dictionary
        3. (Optionally) Identifies URLs which have the same domain as the query URL.
        Checks the skipped_urls collection to see if the URL already appears there. If so,
        we assume that we want it archived and move it from skipped to the to_archive list

        Returns the list of URLs that should be archived, in their original order.
        '''
        to_archive = []
        if ignore_self_links:
            domain = get_domain(query_url)
            # Escape the domain so that its dots are matched literally
            self_link_regex = re.compile(rf'https?://\w*\.?{re.escape(domain)}')
        else:
            self_link_regex = None

        for url in urls:
            if url in completed_urls:
                continue

            if remove_cache and cache_regex.match(url):
                continue

            if self_link_regex is not None and self_link_regex.match(url):
                # Seen before: take it out of the skipped set and archive it after all
                if url in skipped_urls:
                    to_archive.append(url)
                    skipped_urls.remove(url)
                # First sighting: remember it and skip it
                else:
                    skipped_urls.add(url)
            else:
                to_archive.append(url)
        return to_archive

    incomplete_files = list(files)
    for attempt in range(1, max_attempts + 1):
        # Collect failures in a fresh list instead of mutating the list being iterated
        failed_files = []
        for fn in incomplete_files:
            try:
                archive_urls(fn)
            # requests' ConnectionError does not derive from the builtin one, so name both
            except (requests.exceptions.ConnectionError, ConnectionError):
                failed_files.append(fn)
        incomplete_files = failed_files
        if not incomplete_files:
            break
        logging.warning('Files that failed: {}'.format(incomplete_files))
        if attempt < max_attempts:
            time.sleep(retry_delay) # If something goes wrong, wait to see if it gets better :)


def dict_from_csv(csv_file, key_col=1, value_col=2):
    '''Load a job id CSV file into a dictionary.

    The defaults match the ``[timestamp, url, job_id]`` rows written by
    ``archive_files``, so the result maps each url to its job id.

    csv_file: path of the CSV file; a missing file gives an empty dictionary
    key_col: index of the column used as dictionary key
    value_col: index of the column used as dictionary value

    Rows that are too short to hold both columns (blank lines, for example) are
    skipped. When a key occurs more than once the last row wins.
    '''
    result = {}
    path = pathlib.Path(csv_file)
    if not path.exists():
        return result

    last_needed = max(key_col, value_col)
    # newline='' lets the csv module do its own newline handling
    with open(path, 'r', newline='') as fn:
        for row in csv.reader(fn):
            if len(row) <= last_needed:
                continue
            result[row[key_col]] = row[value_col]
    return result

    
def get_domain(url):
    '''Return the first two dot-separated labels of the host in url, without a leading "www.".

    For example 'https://www.example.com/page' gives 'example.com'. Both http and
    https URLs are accepted, and the "www." prefix is optional.

    Raises ValueError if no domain can be found in url.
    '''
    match = re.search(r'^https?://(?:www\.)?([\w-]+\.[\w-]+)', url)
    # re.search returns None when nothing matches, so test before reading the group
    if match is None:
        raise ValueError(f"Can't find URL in {url}")
    return match.group(1)



def get_urls_from_json(j_obj):
    '''Pull the query URL and the usable link URLs out of a parsed metadata object.

    Returns a (query_url, link_urls) tuple. Every URL is passed through urlencode_url.
    Links whose href is empty or starts with "javascript" are left out.
    '''
    query_url = urlencode_url(j_obj['link'])
    # Lazy, so each href is read, checked and encoded one element at a time
    hrefs = (element['href'] for element in j_obj['linkElements'])
    link_urls = [
        urlencode_url(href)
        for href in hrefs
        if not (re.match('javascript', href) or href == '')
    ]
    return (query_url, link_urls)
    
def urlencode_url(url):
    '''Normalise the percent-encoding of url: decode it fully, then re-quote it.'''
    decoded = urllib.parse.unquote_plus(url)
    return requests.utils.requote_uri(decoded)

def archive_url(url, wait=2, capture_outlinks=0):
    '''Ask the archive endpoint to capture url and return the id of the resulting job.

    url: the URL to archive
    wait: seconds to sleep after the first rate-limit (429) response; the delay grows
        by 20% after every further 429
    capture_outlinks: whether to capture outlinks (default is no)

    Returns the job id from the JSON response, or None when the server answers with
    104, 401 or 443, or when the response carries no job id.
    A 429 response is retried after the rate-limit delay, and a 502, 503 or 504
    response is retried after 30 seconds, for as long as the server keeps sending them.

    Raises requests.HTTPError (through raise_for_status) for any other error status.
    '''
    payload = {'url': url,
               'if_not_archived_within': IF_NOT_ARCHIVED_WITHIN,
               #'capture_screenshot': capture_screenshot,
               'capture_outlinks': capture_outlinks
               }

    # Retry in a loop rather than by recursion so that a long outage cannot
    # run into the interpreter's recursion limit.
    while True:
        r = requests.post(ENDPT, headers=HEADERS, data=payload)
        logging.debug(r.content)

        if r.status_code == 429:
            logging.info(f'Hit rate limit, now waiting for {wait:.2f} seconds')
            time.sleep(wait)
            wait = wait * 1.2
            continue

        if r.status_code in (104, 401, 443):
            logging.warning(url)
            logging.warning(r.text)
            logging.warning(f'104, 401, or 443 received when archiving {url}. Giving up.')
            return None

        if r.status_code in (502, 503, 504):
            logging.warning(url)
            logging.warning(r.text)
            logging.warning('502 or 503 or 504 status received; waiting 30 seconds')
            time.sleep(30)
            continue

        break

    r.raise_for_status()
    try:
        return r.json()['job_id']
    # ValueError covers a body that is not valid JSON, KeyError a body without a job id
    except (KeyError, ValueError):
        logging.warning(f'Should have a valid job id for {url}. Instead, this was returned:\n {r.content}')
        return None

