In [167]:
import os
from tqdm import tqdm_notebook
import glob
import re
from fpdf import FPDF
import sys
from PIL import Image
import arrow

In [168]:

# HTML for the title page of each PDF. The two `{}` placeholders are filled
# with `str.format` using the first and last year covered by the volume.
title = '''
    <h1>The Bulletin</h1>
    <h3>Editorial cartoons {} to {}</h3>
'''

# Front-matter HTML written on the page that follows the title page: a content
# warning, notes on artists and copyright, acknowledgements and further links.
# It contains no placeholders and is passed to `write_html` unchanged.
# NOTE(review): the `</p>` that follows the artists' `</ul>` has no matching
# `<p>`, and the last paragraph is never closed with `</p>`. The markup is left
# exactly as it is because changing it could change how the page is rendered.
html = '''
    <h2>Warning</h2>
    <p>The language, images, and ideas presented in <i>The Bulletin</i> were often racist, 
anti-Semitic, and sexist. You won't have to look far within this collection to find 
something offensive. This was, after all, the journal whose slogan for many years was 
'Australia for the white man'.</p>
    <p>&nbsp;</p>
    
    <h2>Artists and copyright</h2>
    <p>The artists represented in this collection include (amongst others):</p>
    <ul>
    <li>Livingston Hopkins (Hop) (1846-1927)</li>
    <li>Percy Leason (1889-1959)</li>
    <li>Norman Lindsay (1879-1969)</li>
    <li>Phil May (1864-1903)</li>
    <li>CH Percival (1891-?)</li>
    <li>Edward Scorfield (1884-1968)</li>
    </ul>
    </p>
    <p>&nbsp;</p>
    <p>The creators of many of these works can be identified by their signatures. 
If the artist died before 1955, then their work will be out of copyright. Otherwise 
the work will remain in copyright until seventy years after their death. For example, 
many of the works are by Norman Lindsay who died in 1969. His works will remain in 
copyright until 2039. However, depending on the conditions under which the artists 
were employed, the copyright in the images might be owned by <i>The Bulletin</i> and its 
corporate successors, rather than the artists and their families. 
Here's a <a href="http://www.cartoonists.org.au/members/history">useful biographical source</a> 
on Australian cartoonists.</p>
    <p>&nbsp;</p>
    <p>If the work is unsigned, and the artist can't be identified in some other way, 
then it's out of copyright as all the works in this collection were published before 1955.</p>
    <p>&nbsp;</p>
    <h2>Acknowledgements</h2>
    <p>These images were downloaded from <a href="https://trove.nla.gov.au">Trove</a>. 
Digitisation of <i>The Bulletin</i> was supported by:</p>
    <ul>
    <li>AustLit (University of Queensland)</li>
    <li>State Library of New South Wales</li>
    <li>State Library of Victoria</li>
    </ul>
    <p>&nbsp;</p>
    <h2>More info</h2>
    <p>This collection was compiled by <a href="https://timsherratt.org">Tim Sherratt</a> to help researchers and promote use of Australia's digital cultural collections. The methods used to harvest 
the metadata and images are described in the <a href="https://glam-workbench.github.io/trove-journals/">Trove Journals</a> 
section of his <a href="https://glam-workbench.github.io/">GLAM Workbench</a>.</p>
    <p>&nbsp;</p> 
    <p>Click on the links below each cartoon to view and download individual images from Trove. The complete collection of high-resolution images (about 60gb in total) can be 
<a href="https://cloudstor.aarnet.edu.au/plus/s/bI7hJREvO0oJLGL">downloaded from CloudStor</a>.
'''

# Caption written underneath each cartoon. Placeholders, in the order they are
# supplied to `str.format`:
#   {0} issue number, {1} formatted publication date, {2} page number,
#   {3} the `nla.obj-...` identifier, used for both the link target and its text.
issue_info = '''
    <p><i>The Bulletin</i>, no. {0}, {1}, page {2}<br>
<a href="https://nla.gov.au/{3}">https://nla.gov.au/{3}</a></p>
'''

from fpdf import FPDF, HTMLMixin

class MyFPDF(FPDF, HTMLMixin):
    """PDF document class combining ``FPDF`` with ``HTMLMixin``.

    It adds nothing of its own; the mix-in is included so that instances
    can render HTML fragments with ``write_html``.
    """

def get_year(path):
    """Return the first four characters of the file name in *path*.

    The image file names start with the publication date, so for those
    files the result is the four-digit year as a string.
    """
    _, file_name = os.path.split(path)
    return file_name[:4]

# File names look like `<YYYYMMDD>-<issue number>-<nla.obj-id>-<page>.jpg`.
# Compiled once, and the dot before the extension is escaped so that only a
# literal `.jpg` is accepted.
FILENAME_PATTERN = re.compile(r'(\d+)-(\w+)-(nla\.obj-\d+)-(\d+)\.jpg')

# Down-scaled copies of the images are written here before being embedded.
# Create the directory up front so the first `save` cannot fail on a missing path.
TEMP_DIR = 'temp'
os.makedirs(TEMP_DIR, exist_ok=True)

# Build one PDF per decade: the prefixes 188..195 select files whose names
# start with years in the 1880s through to the 1950s.
for decade in range(188, 196):
    pages = sorted(glob.glob('/Volumes/bigdata/mydata/Trove-text/Bulletin/covers/images/{}*.jpg'.format(decade)))
    if not pages:
        # Without this guard `pages[0]` below would raise IndexError and stop
        # the whole run; a decade without images simply gets no PDF.
        print('No images found for prefix {}, skipping'.format(decade), file=sys.stderr)
        continue

    # The first and last file (sorted by name, hence by date) give the year range.
    start = get_year(pages[0])
    end = get_year(pages[-1])

    pdf = MyFPDF()
    # NOTE(review): the original comment here said compression is not yet
    # supported in the Python 3 version of the library -- confirm that this
    # flag has an effect with the installed version.
    pdf.compress = True
    pdf.set_left_margin(25)
    pdf.set_right_margin(25)
    pdf.set_top_margin(20)

    # Page 1: title. Page 2: warning, copyright notes and acknowledgements.
    pdf.add_page()
    pdf.write_html(title.format(start, end))
    pdf.add_page()
    pdf.set_font('Times', '', 11)
    pdf.write_html(html)

    # One page per cartoon: the image followed by its caption.
    for page in tqdm_notebook(pages):
        filename = os.path.basename(page)
        match = FILENAME_PATTERN.search(filename)
        if match is None:
            # Fail with the offending name instead of an AttributeError on None.
            raise ValueError('Unexpected image file name: {}'.format(filename))
        date, number, issue_id, page_number = match.groups()
        fdate = arrow.get(date, 'YYYYMMDD').format('D MMMM YYYY')
        pdf.add_page()

        # Scale every image to a height of 1500 pixels, keeping its aspect
        # ratio. The context manager closes the source file once the resized
        # copy has been produced.
        with Image.open(page) as img:
            w, h = img.size
            scale = 1500 / h
            resized = img.resize((round(w * scale), 1500), Image.LANCZOS)
        temp_path = '{}/{}'.format(TEMP_DIR, filename)
        resized.save(temp_path, quality=75)

        # Place the image at the left/top margins with a fixed height of 230,
        # then move the cursor down by the same amount so the caption is
        # written below it.
        pdf.image(temp_path, 25, 20, h=230)
        pdf.ln(230)
        pdf.write_html(issue_info.format(number, fdate, page_number, issue_id))

    pdf.output('bulletin-{}-{}.pdf'.format(start, end), 'F')

HBox(children=(IntProgress(value=0, max=178), HTML(value='')))

HBox(children=(IntProgress(value=0, max=521), HTML(value='')))

HBox(children=(IntProgress(value=0, max=522), HTML(value='')))

HBox(children=(IntProgress(value=0, max=524), HTML(value='')))

HBox(children=(IntProgress(value=0, max=523), HTML(value='')))

HBox(children=(IntProgress(value=0, max=523), HTML(value='')))

HBox(children=(IntProgress(value=0, max=537), HTML(value='')))

HBox(children=(IntProgress(value=0, max=143), HTML(value='')))