Identify where multiple <p>&nbsp;</p> tags have been added and give warning. Also remove warning for old way of detecting mixed top/non-top content. Fixes #548.
DeepBlueCLtd · Nov 23, 2023 · ac2287e · ac2287e
1 parent a09ae5d
commit ac2287e
Show file tree

Hide file tree

Showing 2 changed files with 37 additions and 3 deletions.
diff --git a/parser/lman_parser.py b/parser/lman_parser.py
@@ -8,6 +8,7 @@
 import os
 import subprocess
 from urllib.parse import urlparse
+import bs4
 from bs4 import BeautifulSoup
 from pprint import pprint, pformat
 from html_to_dita import htmlToDITA
@@ -565,6 +566,39 @@ def process_generic_file_pagelayer(self, dita_soup, page, topic_id, filename="")
         # insert rest of converted content
         dita_section.extend(converted_bits)
 
+        def is_empty_p_element(el):
+            """Tell whether ``el`` is a <p> element with no text and no child tags.
+
+            Returns False for None and for anything whose name is not "p".
+            The text is stripped before the emptiness test, so a paragraph
+            holding only whitespace counts as empty.
+            """
+            # Guard clauses keep the original evaluation order: name first,
+            # then text, and only then the search for nested tags.
+            if el is None or el.name != "p":
+                return False
+            if el.text.strip():
+                return False
+            return not el.find_all()
+
+        def next_sibling_tag(el):
+            """Return the next sibling of ``el`` that is not a string node.
+
+            Walks ``next_sibling`` links, skipping string nodes, and returns
+            the first other node found, or None when the siblings run out.
+
+            ``isinstance`` is used instead of an exact type comparison so that
+            NavigableString subclasses (for example HTML comments) are skipped
+            too, rather than being handed back as though they were tags.
+            """
+            next_sib = el.next_sibling
+            while isinstance(next_sib, bs4.element.NavigableString):
+                next_sib = next_sib.next_sibling
+
+            return next_sib
+
+        # Check for repeated <p>&nbsp;</p> elements: warn once if any empty <p>
+        # is followed by at least four more empty <p> siblings in a row
+        # (next_sibling_tag ignores the text nodes between them).
+        p_elements = page.find_all("p")
+        empty_p_elements = list(filter(is_empty_p_element, p_elements))
+
+        for el in empty_p_elements:
+            count = 0
+            following = next_sibling_tag(el)
+            # Move the cursor along the run on every step. Re-testing the
+            # sibling of the same starting element would let a single empty
+            # follower be counted four times and trigger a false warning.
+            while count < 4 and is_empty_p_element(following):
+                count += 1
+                following = next_sibling_tag(following)
+            if count >= 4:
+                logging.warning(
+                    f"Found string of repeated <p>&nbsp;</p> elements in div with ID {page.get('id')} in file {filename}"
+                )
+                break
+
         return dita_section
 
     def find_first_page_layer(self, top_to_div_mapping, html_soup):

diff --git a/parser/parser_utils.py b/parser/parser_utils.py
@@ -191,9 +191,9 @@ def generate_top_to_div_mapping(
     # exited in an earlier if statement), so we check if there are some elements without top values
     # and raise a warning if so
     if len(elements_without_top_value) > 0 and len(html_soup.find_all(recursive=False)) > 1:
-        logging.warning(
-            f"Elements with no top value found inside element with ID {html_soup.get('id')} in file {filename}"
-        )
+        # logging.warning(
+        #     f"Elements with no top value found inside element with ID {html_soup.get('id')} in file {filename}"
+        # )
         return [(0, html_soup)]
 
     return top_to_div_mapping