Skip to content
This repository has been archived by the owner on Apr 17, 2024. It is now read-only.

Commit

Permalink
Identify where multiple <p>&nbsp;</p> tags have been added and give w…
Browse files Browse the repository at this point in the history
…arning. Also remove warning for old way of detecting mixed top/non-top content. Fixes #548.
  • Loading branch information
robintw committed Nov 23, 2023
1 parent a09ae5d commit ac2287e
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 3 deletions.
34 changes: 34 additions & 0 deletions parser/lman_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import os
import subprocess
from urllib.parse import urlparse
import bs4
from bs4 import BeautifulSoup
from pprint import pprint, pformat
from html_to_dita import htmlToDITA
Expand Down Expand Up @@ -565,6 +566,39 @@ def process_generic_file_pagelayer(self, dita_soup, page, topic_id, filename="")
# insert rest of converted content
dita_section.extend(converted_bits)

def is_empty_p_element(el):
    """Report whether ``el`` is a ``<p>`` with no text and no child tags.

    ``None`` is accepted (and yields ``False``) so the result of a sibling
    lookup can be passed in without a separate check.
    """
    if el is None or el.name != "p":
        return False
    # str.strip() treats U+00A0 as whitespace, so a paragraph holding only
    # non-breaking spaces has no remaining text here.
    has_no_text = el.text.strip() == ""
    return has_no_text and len(el.find_all()) == 0

def next_sibling_tag(el):
    """Return the first following sibling of ``el`` that is not a string node.

    Returns ``None`` when only string nodes (or nothing at all) follow ``el``.
    """
    next_sib = el.next_sibling
    # isinstance rather than an exact type comparison: Comment, CData and the
    # other special strings are NavigableString subclasses and must be
    # skipped as well, otherwise they would be returned as if they were tags.
    while isinstance(next_sib, bs4.element.NavigableString):
        next_sib = next_sib.next_sibling

    return next_sib

# Check for repeated <p>&nbsp;</p> elements: warn once per page when an
# empty <p> is directly followed by a run of further empty <p> siblings.
p_elements = page.find_all("p")
empty_p_elements = [p for p in p_elements if is_empty_p_element(p)]

# Number of empty <p> siblings that must directly follow an empty <p>
# before the run is reported.
min_following_empty = 4

for el in empty_p_elements:
    count = 0
    # Walk forward one sibling tag at a time. The cursor has to advance:
    # re-testing the same neighbour would count it repeatedly, so any two
    # adjacent empty paragraphs would reach the threshold.
    sibling = next_sibling_tag(el)
    while count < min_following_empty and is_empty_p_element(sibling):
        count += 1
        sibling = next_sibling_tag(sibling)
    if count >= min_following_empty:
        logging.warning(
            f"Found string of repeated <p>&nbsp;</p> elements in div with ID {page.get('id')} in file {filename}"
        )
        # A single warning per page is enough, so stop scanning.
        break

return dita_section

def find_first_page_layer(self, top_to_div_mapping, html_soup):
Expand Down
6 changes: 3 additions & 3 deletions parser/parser_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,9 +191,9 @@ def generate_top_to_div_mapping(
# exited in an earlier if statement), so we check if there are some elements without top values
# and raise a warning if so
if len(elements_without_top_value) > 0 and len(html_soup.find_all(recursive=False)) > 1:
logging.warning(
f"Elements with no top value found inside element with ID {html_soup.get('id')} in file {filename}"
)
# logging.warning(
# f"Elements with no top value found inside element with ID {html_soup.get('id')} in file {filename}"
# )
return [(0, html_soup)]

return top_to_div_mapping
Expand Down

0 comments on commit ac2287e

Please sign in to comment.