Skip to content

Commit

Permalink
omit unnecessary spaces around <b>, <em>, and <strike>
Browse files Browse the repository at this point in the history
fixes #324
  • Loading branch information
snarfed committed Dec 6, 2020
1 parent 4235c1a commit 6241853
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 9 deletions.
1 change: 1 addition & 0 deletions ChangeLog.rst
Expand Up @@ -5,6 +5,7 @@ UNRELEASED
* Feature #318: Make padded tables more similar to pandoc's pipe_tables.
* Add support for Python 3.9.
* Fix extra line breaks inside html link text (between '[' and ']')
* Fix #324: unnecessary spaces around ``<b>``, ``<em>``, and ``<strike>`` tags.

2020.1.16
=========
Expand Down
23 changes: 14 additions & 9 deletions html2text/__init__.py
Expand Up @@ -3,6 +3,7 @@
import html.entities
import html.parser
import re
import string
import urllib.parse as urlparse
from textwrap import wrap
from typing import Dict, List, Optional, Tuple, Union
Expand Down Expand Up @@ -404,13 +405,13 @@ def handle_tag(
self.blockquote -= 1
self.p()

def no_preceding_space(self: HTML2Text) -> bool:
    """Tell whether ``self.preceding_data`` ends in a non-whitespace character.

    Returns ``False`` when ``self.preceding_data`` is empty (or otherwise
    falsy); otherwise returns ``True`` only if its last character is not
    whitespace.
    """
    preceding = self.preceding_data
    if not preceding:
        return False
    # Only the final character matters; the pattern matches any single
    # non-whitespace character.
    return re.match(r"[^\s]", preceding[-1]) is not None

if tag in ["em", "i", "u"] and not self.ignore_emphasis:
if start and no_preceding_space(self):
if (
start
and self.preceding_data
and self.preceding_data[-1] not in string.whitespace
and self.preceding_data[-1] not in string.punctuation
):
emphasis = " " + self.emphasis_mark
else:
emphasis = self.emphasis_mark
Expand All @@ -420,7 +421,11 @@ def no_preceding_space(self: HTML2Text) -> bool:
self.stressed = True

if tag in ["strong", "b"] and not self.ignore_emphasis:
if start and no_preceding_space(self):
if (
start
and self.preceding_data
and self.preceding_data[-1] == self.strong_mark[0]
):
strong = " " + self.strong_mark
else:
strong = self.strong_mark
Expand All @@ -430,7 +435,7 @@ def no_preceding_space(self: HTML2Text) -> bool:
self.stressed = True

if tag in ["del", "strike", "s"]:
if start and no_preceding_space(self):
if start and self.preceding_data and self.preceding_data[-1] == "~":
strike = " ~~"
else:
strike = "~~"
Expand Down Expand Up @@ -826,7 +831,7 @@ def handle_data(self, data: str, entity_char: bool = False) -> None:
self.preceding_stressed = True
elif self.preceding_stressed:
if (
re.match(r"[^\s.!?]", data[0])
re.match(r"[^][(){}\s.!?]", data[0])
and not hn(self.current_tag)
and self.current_tag not in ["a", "code", "pre"]
):
Expand Down
7 changes: 7 additions & 0 deletions test/emphasis_preserved_whitespace.html
Expand Up @@ -16,5 +16,12 @@
<p><em>emphasis</em>.</p>
<p><em>emphasis</em>?</p>
<p><em>emphasis</em>!</p>
<p>(<em>emphasis</em>)</p>
<p>[<b>bold</b>}</p>
<p>(<strike>strike</strike>]</p>

<!-- A literal strong (*) or strike (~) character right before the tag keeps a separating space -->
<p>*<b>bold</b></p>
<p>~<strike>strike</strike></p>

<p><em>em1</em><em>em2</em></p>
10 changes: 10 additions & 0 deletions test/emphasis_preserved_whitespace.md
Expand Up @@ -24,5 +24,15 @@ _emphasis_?

_emphasis_!

(_emphasis_)

[**bold**}

(~~strike~~]

* **bold**

~ ~~strike~~

_em1_ _em2_

0 comments on commit 6241853

Please sign in to comment.