# Harvest data from Papers Past

In [None]:
import logging
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import pandas as pd
from tqdm import tqdm_notebook
import time
import re
from slugify import slugify
from time import strftime

# Keep notebook output quiet: only errors are logged.
logging.basicConfig(level=logging.ERROR)

# Shared HTTP session used for every API request in this notebook.
s = requests.Session()

# Retry up to 5 times, with exponential backoff, when the server answers with
# a transient gateway/availability error (502, 503 or 504).
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])

# Mount the retry policy for both schemes. An adapter mounted only on
# 'https://' is never used for 'http://' URLs, so plain-HTTP requests would
# silently bypass the retry policy.
s.mount('http://', HTTPAdapter(max_retries=retries))
s.mount('https://', HTTPAdapter(max_retries=retries))

## Add your API key

In [None]:
# Paste your API key between the quotes. It is sent as the 'api_key'
# parameter of every request made by the harvester.
api_key = ''
# Echo the key so you can confirm it was entered correctly.
print('Your API key is: {}'.format(api_key))

## Add your search query

In [38]:
# Search terms for the harvest; passed to the API as the 'text' parameter and
# also used to build the default CSV filename.
query = 'possum'

## Define the harvester

In [None]:
class Harvester():
    """Harvest search results from the DigitalNZ records API into memory.

    The harvester pages through the results for a set of search parameters,
    keeps a simplified dictionary for every article found, and can save the
    collected articles as a CSV file.

    Attributes:
        params: Private copy of the query parameters; ``params['page']`` is
            advanced as the harvest proceeds.
        total: Number of matching records reported by the API (set by
            ``harvest``; 0 before the first harvest).
        more: False once a page with no results has been seen.
        articles: List of article dictionaries collected so far.
    """

    # Endpoint queried for every page of results. HTTPS is used so the API
    # key in the query string is not sent in clear text.
    API_URL = 'https://api.digitalnz.org/v3/records.json'

    # Seconds to wait for the server before giving up on a request, so a
    # stalled connection cannot hang the harvest forever.
    TIMEOUT = 60

    # Matches the last parenthesised group in a title plus anything after it.
    # NOTE(review): presumably this is the "(Newspaper, date)" suffix that
    # Papers Past appends to article titles -- confirm against live data.
    TITLE_SUFFIX = re.compile(r'(\([^)]*\))[^(]*$')

    def __init__(self, params):
        """Create a harvester.

        Args:
            params: Mapping of query-string parameters for the API. It is
                copied, so the caller's mapping is never modified. A missing
                'page' defaults to 1 and the value is coerced to ``int`` so
                it can be incremented while paging.
        """
        self.params = dict(params)
        self.params['page'] = int(self.params.get('page', 1))
        self.total = 0
        self.more = True
        self.articles = []

    def process_results(self, data):
        """Add the articles from one page of API output to ``self.articles``.

        Args:
            data: Decoded JSON response; results are read from
                ``data['search']['results']``.

        Returns:
            True if the page contained results, False if it was empty.
        """
        results = data['search']['results']
        if not results:
            return False
        self.articles += self.process_articles(results)
        return True

    def process_articles(self, results):
        """Convert raw API records into simplified article dictionaries.

        Args:
            results: Iterable of record dictionaries from the API.

        Returns:
            A list of dictionaries with the keys 'id', 'title', 'newspaper',
            'date', 'text', 'paperspast_url' and 'source_url'.
        """
        articles = []
        for result in results:
            title = self.TITLE_SUFFIX.sub('', result['title']).strip()
            # Only the first publisher is kept; a record with a missing or
            # empty publisher list gets None rather than aborting the harvest.
            publishers = result.get('publisher') or [None]
            articles.append({
                'id': result['id'],
                'title': title,
                'newspaper': publishers[0],
                'date': result['date'],
                'text': result['fulltext'],
                'paperspast_url': result['landing_url'],
                'source_url': result['source_url'],
            })
        return articles

    def get_data(self):
        """Request the current page of results and return the decoded JSON.

        Uses the module-level session ``s``.

        Raises:
            requests.HTTPError: If the server responds with an error status
                (for example when the API key is rejected).
        """
        response = s.get(self.API_URL, params=self.params, timeout=self.TIMEOUT)
        # Fail with a clear HTTP error instead of a confusing KeyError or
        # JSON decoding error further down the line.
        response.raise_for_status()
        return response.json()

    def harvest(self):
        """Page through all results, collecting articles in ``self.articles``.

        Starts at ``self.params['page']`` and keeps requesting the next page
        until a page comes back empty. A progress bar shows how many of the
        reported matches have been collected.
        """
        data = self.get_data()
        self.total = data['search']['result_count']
        with tqdm_notebook(total=self.total) as pbar:
            while True:
                harvested_before = len(self.articles)
                self.more = self.process_results(data)
                # Advance by the number of articles actually received so the
                # bar stays accurate on short or empty pages.
                pbar.update(len(self.articles) - harvested_before)
                if not self.more:
                    break
                self.params['page'] += 1
                # Brief pause between requests to go easy on the server.
                time.sleep(0.2)
                data = self.get_data()

    def save_as_csv(self, filename=None):
        """Write the harvested articles to a CSV file.

        Args:
            filename: Path of the file to write. When omitted, a name is built
                from the slugified search text and today's date (YYYYMMDD).

        Returns:
            The name of the file that was written.
        """
        if not filename:
            filename = '{}-{}.csv'.format(slugify(self.params['text']), strftime("%Y%m%d"))
        df = pd.DataFrame(self.articles)
        df.to_csv(filename, index=False)
        return filename

## Start your harvest

In [None]:
# Query parameters sent with every request. The facet filter goes in first
# because its key is not a valid keyword-argument name.
params = {'and[display_collection][]': 'Papers Past'}
# Remaining settings: the search text, 100 results per page, the page to
# start from, and the API key.
params.update(text=query, per_page='100', page=1, api_key=api_key)

# Build the harvester and run it to completion; the collected articles end
# up in harvester.articles.
harvester = Harvester(params)
harvester.harvest()

## Save your harvest

In [None]:
# Write the harvested articles to a CSV file. No filename is given, so one is
# built from the search text and today's date; the call returns that filename.
harvester.save_as_csv()