In [55]:
import time
import csv
import os
import math
import requests
from PIL import Image, ImageOps
from requests import ConnectionError
from recordsearch_tools.utilities import retry
from recordsearch_tools.client import RSSearchClient, RSSeriesClient
from tinydb import TinyDB, Query
try:
    from io import BytesIO
except ImportError:
    from StringIO import StringIO
from IPython.display import Image as DImage
from IPython.core.display import HTML

In [42]:
# Root directory for downloaded page images; SeriesHarvester.harvest_images()
# creates one sub-directory per series and item beneath it.
IMAGES_DIR = 'data/images'

In [43]:
class SeriesHarvester():
    """Harvest item metadata and digitised page images for one series.

    Search results are fetched page by page through ``RSSearchClient`` and
    stored in a TinyDB file under ``data/``; page images are downloaded into
    ``IMAGES_DIR``.

    Attributes:
        series: Series identifier passed to the search client.
        control: Optional control symbol filter passed to the search client.
        total_pages: Number of result pages, set by ``prepare_harvest()``.
        pages_complete: Number of the last result page stored so far
            (0 before anything has been harvested).
        client: The ``RSSearchClient`` used for all searches.
        db: TinyDB database backing this series.
        items: TinyDB table of item records, keyed on ``identifier``.
        images: TinyDB table of image metadata, keyed on ``image_id``.
    """

    def __init__(self, series, control=None):
        """Run an initial search to size the harvest, then open the database.

        Args:
            series: Series identifier (a string).
            control: Optional control symbol to restrict the search.
        """
        self.series = series
        self.control = control
        self.total_pages = None
        self.pages_complete = 0
        self.client = RSSearchClient()
        self.prepare_harvest()
        # Use the same '/' -> '-' substitution that harvest_images() applies to
        # the series id, so an id containing '/' is not read as a path
        # separator. Ids without '/' map to the same filename as before.
        self.db = TinyDB('data/db-{}.json'.format(self.series.replace('/', '-')))
        self.items = self.db.table('items')
        self.images = self.db.table('images')

    def get_total(self):
        """Return the total number of results reported by the search client."""
        return self.client.total_results

    def prepare_harvest(self):
        """Run the search once and work out how many result pages there are.

        Prints the total number of items and the page count, and stores the
        latter in ``self.total_pages``.
        """
        if self.control:
            self.client.search(series=self.series, control=self.control)
        else:
            self.client.search(series=self.series)
        total_results = self.client.total_results
        print('{} items'.format(total_results))
        # Ceiling division: an exact multiple of the page size must not add an
        # extra (empty) page, and zero results means zero pages. float() keeps
        # this true division on Python 2 as well.
        per_page = float(self.client.results_per_page)
        self.total_pages = int(math.ceil(int(total_results) / per_page))
        print(self.total_pages)

    @retry(ConnectionError, tries=20, delay=10, backoff=1)
    def start_harvest(self, page=None):
        """Fetch result pages and upsert every result into the items table.

        Args:
            page: 1-based page number to start from. When omitted, harvesting
                continues from the page after ``self.pages_complete``.

        The method is wrapped in ``retry`` for ``ConnectionError``.
        NOTE(review): this assumes ``retry`` re-invokes the method with the same
        arguments -- confirm against recordsearch_tools. Under that assumption
        a retry with ``page`` omitted resumes after the last stored page, and a
        retry with an explicit ``page`` restarts from that page, which is safe
        because results are upserted by identifier.
        """
        Record = Query()
        if not page:
            page = self.pages_complete + 1
        # Bound the loop by the page number itself so that an explicit start
        # page can never run past the last page of results.
        while page <= self.total_pages:
            # sort='9' is sent with every page request -- presumably to keep
            # the ordering stable between pages; confirm against the client.
            if self.control:
                response = self.client.search(series=self.series, page=page, control=self.control, sort='9')
            else:
                response = self.client.search(series=self.series, page=page, sort='9')
            for result in response['results']:
                self.items.upsert(result, Record.identifier == result['identifier'])
            # Record progress as the page just stored, so it stays in step with
            # `page` whether or not a start page was given.
            self.pages_complete = page
            page += 1
            print('{} pages complete'.format(self.pages_complete))
            # Pause between search requests.
            time.sleep(1)

    @retry(ConnectionError, tries=20, delay=10, backoff=1)
    def harvest_images(self):
        """Download the page images of every digitised item in the items table.

        For each item whose ``digitised_status`` is True, pages
        1..``digitised_pages`` are saved as
        ``IMAGES_DIR/<series>/<control symbol>-[<identifier>]/<identifier>-p<page>.jpg``
        (with '/' replaced by '-' in the series and control symbol). Files that
        already exist are skipped, so an interrupted run can simply be started
        again. The dimensions of each saved image are upserted into the images
        table under ``image_id`` = ``<identifier>-<page>``.

        Raises:
            requests.HTTPError: via ``raise_for_status()`` on an error response.
        """
        Record = Query()
        items = self.items.search(Record.digitised_status == True)
        headers = {'User-Agent': 'Mozilla/5.0'}
        for item in items:
            directory = os.path.join(IMAGES_DIR, '{}/{}-[{}]'.format(self.series.replace('/', '-'), item['control_symbol'].replace('/', '-'), item['identifier']))
            if not os.path.exists(directory):
                os.makedirs(directory)
            for page in range(1, item['digitised_pages'] + 1):
                filename = '{}/{}-p{}.jpg'.format(directory, item['identifier'], page)
                print('{}, p. {}'.format(item['identifier'], page))
                # Only download pages that are not already on disk.
                if not os.path.exists(filename):
                    img_url = 'http://recordsearch.naa.gov.au/NaaMedia/ShowImage.asp?B={}&S={}&T=P'.format(item['identifier'], page)
                    # NOTE(review): verify=False switches off TLS certificate
                    # checks should this request ever be served over HTTPS
                    # (e.g. after a redirect); left unchanged -- confirm it is
                    # still required.
                    response = requests.get(img_url, headers=headers, stream=True, verify=False)
                    response.raise_for_status()
                    try:
                        image = Image.open(BytesIO(response.content))
                    except IOError:
                        # The body could not be opened as an image; nothing is
                        # written, so the page is attempted again on the next run.
                        print('Not an image')
                    else:
                        width, height = image.size
                        image.save(filename)
                        del response
                        image_meta = {
                            'image_id': '{}-{}'.format(item['identifier'], page),
                            'identifier': item['identifier'],
                            'page': page,
                            'width': width,
                            'height': height
                        }
                        self.images.upsert(image_meta, Record.image_id == image_meta['image_id'])
                        print('Image saved')
            # Pause once per item (not per page) before moving on.
            time.sleep(1)

In [44]:
# Create a harvester for series B6527; the constructor runs an initial search
# and prints the number of items and result pages.
h = SeriesHarvester('B6527')

44 items
3


In [37]:
# Fetch every result page and store the item records in the series database.
h.start_harvest()

1 pages complete
2 pages complete
3 pages complete


In [38]:
# Open the series database file directly to inspect what was harvested.
db = TinyDB('data/db-B6527.json')

In [39]:
# Count the stored item records as a check against the total reported by the search.
items = db.table('items')
len(items)

44

In [45]:
# Download the page images for every digitised item; pages already on disk are
# skipped, so this cell can be re-run after an interruption.
h.harvest_images()

5993848, p. 1
Image saved
5993848, p. 2
Image saved
5993849, p. 1
Image saved
5993849, p. 2
Image saved
5993849, p. 3
Image saved
5993849, p. 4
Image saved
5993849, p. 5
Image saved
5993849, p. 6
Image saved
5993849, p. 7
Image saved
5993849, p. 8
Image saved
5993849, p. 9
Image saved
5993849, p. 10
Image saved
5993849, p. 11
Image saved
5993849, p. 12
Image saved
5993849, p. 13
Image saved
5993849, p. 14
Image saved
5993849, p. 15
Image saved
5993849, p. 16
Image saved
5993849, p. 17
Image saved
5993849, p. 18
Image saved
5993849, p. 19
Image saved
5993849, p. 20
Image saved
5993849, p. 21
Image saved
5993849, p. 22
Image saved
5993849, p. 23
Image saved
5993849, p. 24
Image saved
5993849, p. 25
Image saved
5993849, p. 26
Image saved
5993849, p. 27
Image saved
5993849, p. 28
Image saved
5993849, p. 29
Image saved
5993849, p. 30
Image saved
5993849, p. 31
Image saved
5993849, p. 32
Image saved
5993849, p. 33
Image saved
5993849, p. 34
Image saved
5993849, p. 35
Image saved
5993849, p. 

Image saved
5993851, p. 89
Image saved
5993851, p. 90
Image saved
5993851, p. 91
Image saved
5993851, p. 92
Image saved
5993851, p. 93
Image saved
5993851, p. 94
Image saved
5993851, p. 95
Image saved
5993851, p. 96
Image saved
5993851, p. 97
Image saved
5993851, p. 98
Image saved
5993851, p. 99
Image saved
5993851, p. 100
Image saved
5993851, p. 101
Image saved
5993851, p. 102
Image saved
5993851, p. 103
Image saved
5993851, p. 104
Image saved
5993851, p. 105
Image saved
5993851, p. 106
Image saved
5993851, p. 107
Image saved
5993851, p. 108
Image saved
5993851, p. 109
Image saved
5993851, p. 110
Image saved
5993851, p. 111
Image saved
5993851, p. 112
Image saved
5993851, p. 113
Image saved
5993851, p. 114
Image saved
5993851, p. 115
Image saved
5993851, p. 116
Image saved
5993851, p. 117
Image saved
5993851, p. 118
Image saved
5993851, p. 119
Image saved
5993851, p. 120
Image saved
5993851, p. 121
Image saved
5993851, p. 122
Image saved
5993851, p. 123
Image saved
5993851, p. 124
Ima

5993850, p. 1
5993850, p. 2
5993850, p. 3
5993850, p. 4
5993850, p. 5
5993850, p. 6
5993850, p. 7
5993850, p. 8
5993850, p. 9
5993850, p. 10
5993850, p. 11
5993850, p. 12
5993850, p. 13
5993850, p. 14
5993850, p. 15
5993850, p. 16
5993850, p. 17
5993850, p. 18
5993850, p. 19
5993850, p. 20
5993850, p. 21
5993850, p. 22
5993850, p. 23
5993850, p. 24
5993850, p. 25
5993850, p. 26
5993850, p. 27
5993850, p. 28
5993850, p. 29
5993850, p. 30
5993850, p. 31
5993850, p. 32
5993850, p. 33
5993850, p. 34
5993850, p. 35
5993850, p. 36
5993850, p. 37
5993850, p. 38
5993850, p. 39
5993850, p. 40
5993850, p. 41
5993850, p. 42
5993850, p. 43
5993850, p. 44
5993850, p. 45
5993850, p. 46
5993850, p. 47
5993850, p. 48
5993850, p. 49
5993850, p. 50
5993850, p. 51
5993850, p. 52
5993850, p. 53
5993850, p. 54
5993850, p. 55
5993850, p. 56
5993850, p. 57
5993850, p. 58
5993850, p. 59
5993850, p. 60
5993850, p. 61
5993850, p. 62
5993850, p. 63
5993850, p. 64
5993850, p. 65
5993850, p. 66
5993850, p. 67
5993

Image saved
5993852, p. 152
Image saved
5993852, p. 153
Image saved
5993852, p. 154
Image saved
5993852, p. 155
Image saved
5993852, p. 156
Image saved
5993852, p. 157
Image saved
5993852, p. 158
Image saved
5993852, p. 159
Image saved
5993852, p. 160
Image saved
5993852, p. 161
Image saved
5993852, p. 162
Image saved
5993852, p. 163
Image saved
5993852, p. 164
Image saved
5993852, p. 165
Image saved
5993852, p. 166
Image saved
5993852, p. 167
Image saved
5993852, p. 168
Image saved
5993852, p. 169
Image saved
5993852, p. 170
Image saved
5993852, p. 171
Image saved
5993852, p. 172
Image saved
5993852, p. 173
Image saved
5993852, p. 174
Image saved
5993852, p. 175
Image saved
5993852, p. 176
Image saved
5993852, p. 177
Image saved
5993852, p. 178
Image saved
5993852, p. 179
Image saved
5993852, p. 180
Image saved
5993852, p. 181
Image saved
5993852, p. 182
Image saved
5993852, p. 183
Image saved
5993852, p. 184
Image saved
5993852, p. 185
Image saved
5993852, p. 186
Image saved
5993852,

KeyboardInterrupt: 

In [67]:
# Preview page 1 of item 5993849 from the downloaded images, scaled to 200px wide.
display(HTML('<img src="data/images/B6527/23-SEP-1916-1-ANGLESEY-[5993849]/5993849-p1.jpg" width="200">'))