Skip to content

Commit 67f2519

Browse files
authored
Merge pull request #14 from betatim/add-bookmarks
[WIP] Add table of contents to PDFs based on h1 tags
2 parents 2c53969 + 3e3503e commit 67f2519

File tree

3 files changed

+71
-8
lines changed

3 files changed

+71
-8
lines changed

example.ipynb

Lines changed: 2 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -138,11 +138,9 @@
138138
]
139139
},
140140
{
141-
"cell_type": "code",
142-
"execution_count": null,
141+
"cell_type": "markdown",
143142
"metadata": {},
144-
"outputs": [],
145-
"source": []
143+
"source": ["# A h1 title\n"]
146144
}
147145
],
148146
"metadata": {

example.pdf

292 Bytes
Binary file not shown.

notebook_as_pdf/__init__.py

Lines changed: 69 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -53,6 +53,22 @@ async def html_to_pdf(html_file, pdf_file, pyppeteer_args=None):
5353
width = dimensions["width"]
5454
height = dimensions["height"]
5555

56+
await page.evaluate(
57+
"""
58+
function getOffset( el ) {
59+
var _x = 0;
60+
var _y = 0;
61+
while( el && !isNaN( el.offsetLeft ) && !isNaN( el.offsetTop ) ) {
62+
_x += el.offsetLeft - el.scrollLeft;
63+
_y += el.offsetTop - el.scrollTop;
64+
el = el.offsetParent;
65+
}
66+
return { top: _y, left: _x };
67+
}
68+
""",
69+
force_expr=True,
70+
)
71+
5672
await page.addStyleTag(
5773
{
5874
"content": """
@@ -73,6 +89,17 @@ async def html_to_pdf(html_file, pdf_file, pyppeteer_args=None):
7389
}
7490
)
7591

92+
h1s = await page.evaluate(
93+
"""() => {
94+
var vals = []
95+
for (const elem of document.getElementsByTagName("h1")) {
96+
//console.log(elem, getOffset(elem).top, elem.innerText)
97+
vals.push({ top: getOffset(elem).top, text: elem.innerText })
98+
}
99+
return vals
100+
}"""
101+
)
102+
76103
await page.pdf(
77104
{
78105
"path": pdf_file,
@@ -87,12 +114,47 @@ async def html_to_pdf(html_file, pdf_file, pyppeteer_args=None):
87114

88115
await browser.close()
89116

117+
return h1s
118+
119+
120+
def finish_pdf(pdf_in, pdf_out, notebook, headings):
121+
"""Add finishing touches to the PDF file.
122+
123+
To make the PDF nicer we:
90124
91-
def attach_notebook(pdf_in, pdf_out, notebook):
125+
* attach the original notebook to the PDF for reference
126+
* add bookmarks pointing to the headers in a notebook
127+
"""
92128
pdf = PyPDF2.PdfFileWriter()
93129
pdf.appendPagesFromReader(PyPDF2.PdfFileReader(pdf_in, "rb"))
94130
pdf.addAttachment(notebook["file_name"], notebook["contents"])
95131

132+
for heading in headings:
133+
page_num = heading["top"] // (200 * 72)
134+
135+
page_height = pdf.getPage(page_num).artBox[-1]
136+
137+
# position on the page as measured from the bottom of the page
138+
# with a bit of leeway so that clicking the bookmark doesn't put
139+
# the heading right at the border
140+
on_page_pos = page_height - (heading["top"] % (200 * 72)) + 20
141+
142+
# there is no nice way of passing the "zoom arguments" at the very
143+
# end of the function call without explicitly listing all the parameters
144+
# of the function. We can't use keyword arguments :(
145+
pdf.addBookmark(
146+
heading["text"],
147+
page_num,
148+
None,
149+
None,
150+
False,
151+
False,
152+
"/XYZ",
153+
0,
154+
on_page_pos,
155+
None,
156+
)
157+
96158
with open(pdf_out, "wb") as fp:
97159
pdf.write(fp)
98160

@@ -116,7 +178,9 @@ async def notebook_to_pdf(
116178
with tempfile.NamedTemporaryFile(suffix=".html") as f:
117179
f.write(exported_html.encode())
118180
f.flush()
119-
await html_to_pdf(f.name, pdf_path, pyppeteer_args)
181+
heading_positions = await html_to_pdf(f.name, pdf_path, pyppeteer_args)
182+
183+
return heading_positions
120184

121185

122186
class PDFExporter(Exporter):
@@ -161,7 +225,7 @@ def from_notebook_node(self, notebook, resources=None, **kwargs):
161225
pdf_fname2 = os.path.join(name, "output-with-attachment.pdf")
162226
pyppeteer_args = ["--no-sandbox"] if self.no_sandbox else None
163227

164-
self.pool.submit(
228+
heading_positions = self.pool.submit(
165229
asyncio.run,
166230
notebook_to_pdf(
167231
notebook,
@@ -174,13 +238,14 @@ def from_notebook_node(self, notebook, resources=None, **kwargs):
174238
).result()
175239
resources["output_extension"] = ".pdf"
176240

177-
attach_notebook(
241+
finish_pdf(
178242
pdf_fname,
179243
pdf_fname2,
180244
{
181245
"file_name": f"{resources['metadata']['name']}.ipynb",
182246
"contents": nbformat.writes(notebook).encode("utf-8"),
183247
},
248+
heading_positions,
184249
)
185250

186251
with open(pdf_fname2, "rb") as f:

0 commit comments

Comments
 (0)