Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added an images_to_alt option to discard images and keep only their alt text #49

Merged
merged 1 commit into from
Dec 23, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 20 additions & 13 deletions html2text/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH):
self.google_list_indent = config.GOOGLE_LIST_INDENT
self.ignore_links = config.IGNORE_ANCHORS
self.ignore_images = config.IGNORE_IMAGES
self.images_to_alt = config.IMAGES_TO_ALT
self.ignore_emphasis = config.IGNORE_EMPHASIS
self.bypass_tables = config.BYPASS_TABLES
self.google_doc = False
Expand Down Expand Up @@ -390,23 +391,29 @@ def handle_tag(self, tag, attrs, start):

if tag == "img" and start and not self.ignore_images:
if 'src' in attrs:
attrs['href'] = attrs['src']
if not self.images_to_alt:
attrs['href'] = attrs['src']
alt = attrs.get('alt') or ''
self.o("![" + escape_md(alt) + "]")

if self.inline_links:
href = attrs.get('href') or ''
self.o("(" + escape_md(href) + ")")
# If we have images_to_alt, we discard the image itself,
# considering only the alt text.
if self.images_to_alt:
self.o(escape_md(alt))
else:
i = self.previousIndex(attrs)
if i is not None:
attrs = self.a[i]
self.o("![" + escape_md(alt) + "]")
if self.inline_links:
href = attrs.get('href') or ''
self.o("(" + escape_md(href) + ")")
else:
self.acount += 1
attrs['count'] = self.acount
attrs['outcount'] = self.outcount
self.a.append(attrs)
self.o("[" + str(attrs['count']) + "]")
i = self.previousIndex(attrs)
if i is not None:
attrs = self.a[i]
else:
self.acount += 1
attrs['count'] = self.acount
attrs['outcount'] = self.outcount
self.a.append(attrs)
self.o("[" + str(attrs['count']) + "]")

if tag == 'dl' and start:
self.p()
Expand Down
10 changes: 9 additions & 1 deletion html2text/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,13 @@ def main():
default=config.IGNORE_IMAGES,
help="don't include any formatting for images"
)
p.add_option(
"--images-to-alt",
dest="images_to_alt",
action="store_true",
default=config.IMAGES_TO_ALT,
help="Discard image data, only keep alt text"
)
p.add_option(
"-g", "--google-doc",
action="store_true",
Expand Down Expand Up @@ -140,9 +147,10 @@ def main():
h.ignore_emphasis = options.ignore_emphasis
h.ignore_links = options.ignore_links
h.ignore_images = options.ignore_images
h.images_to_alt = options.images_to_alt
h.google_doc = options.google_doc
h.hide_strikethrough = options.hide_strikethrough
h.escape_snob = options.escape_snob
h.bypass_tables = options.bypass_tables

wrapwrite(h.handle(data))
wrapwrite(h.handle(data))
3 changes: 2 additions & 1 deletion html2text/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@

IGNORE_ANCHORS = False
IGNORE_IMAGES = False
IMAGES_TO_ALT = False
IGNORE_EMPHASIS = False

# For checking space-only lines on line 771
Expand Down Expand Up @@ -102,4 +103,4 @@
'rlm': ''
}

BYPASS_TABLES = False
BYPASS_TABLES = False
3 changes: 3 additions & 0 deletions test/images_to_alt.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
<a href="http://example.com">
<img src="http://example.com/img.png" alt="ALT TEXT" />
</a>
2 changes: 2 additions & 0 deletions test/images_to_alt.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[ ALT TEXT ](http://example.com)
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there any specific reason for having spaces around the alt text ([ ALT TEXT ])? Why can't they be stripped?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Alir3z4 Well, it's more related to how the o() method handles its output. Honestly, changing that method's behavior for my option seems like overkill to me. What would you suggest?

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JocelynDelalande You're right. For now, I don't think it would be a good idea to touch o(), since it might break all the unit tests.

This change is fine then ;)


4 changes: 4 additions & 0 deletions test/test_html2text.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,10 @@ def test_cmd(self):
module_args['body_width'] = 0
cmdline_args.append('--body-width=0')

if base_fn.startswith('images_to_alt'):
module_args['images_to_alt'] = True
cmdline_args.append('--images-to-alt')

return test_mod, test_cmd

# Originally from http://stackoverflow.com/questions/32899/\
Expand Down