In [9]:
import json
import random
import re

# Wildcard import kept first among third-party imports so the explicit imports
# below always win if a name collides.
from unstructured.cleaners.core import *
import numpy as np
import requests
from bs4 import BeautifulSoup
from bs4 import NavigableString
from tqdm.notebook import tqdm

In [5]:
# Headers needed to access SEC APIs.
# `requests` expects a mapping of header name -> value, so this has to be a
# dict; a JSON-looking string is rejected when the request is built.
headers = {"User-Agent": "test test@test.net"}

def get_soup(url, headers=None):
    """Download `url` and parse the response body with BeautifulSoup.

    Args:
        url: Address of the HTML document to fetch.
        headers: Mapping of HTTP headers to send. When omitted, the
            module-level `headers` is looked up at call time, so a value
            assigned after this function was defined is still honoured.

    Returns:
        A BeautifulSoup tree built with the 'html.parser' backend, with every
        literal '&nbsp;' entity replaced by a plain space before parsing.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    if headers is None:
        # A default written as `headers=headers` would be frozen when the
        # function is defined; resolve the global lazily instead.
        headers = globals().get('headers')

    # Use requests to fetch the content of the webpage
    response = requests.get(url, headers=headers)
    # Do not parse an error page as if it were the requested document.
    response.raise_for_status()

    text = response.text.replace('&nbsp;', ' ')
    # Use BeautifulSoup to parse the HTML content
    return BeautifulSoup(text, 'html.parser')

def extract_pages(soup):
    """Split a parsed document into one soup per page, using <hr> as page break.

    The soup is first run through `preprocess_soup`. The tag siblings found
    between each pair of consecutive <hr> elements are then moved into a
    temporary <page> tag, and each page is re-parsed into its own
    BeautifulSoup object.

    Content before the first <hr> and after the last <hr> is not collected
    (the original author notes the last page holds signatures).

    Returns:
        A list of BeautifulSoup objects, one per page.
    """
    tables, images = [], []
    soup = preprocess_soup(soup, tables, images)

    breaks = soup.find_all('hr')

    raw_pages = []
    # Pair every break with its successor; the final break has no successor,
    # so whatever follows it is skipped.
    for start_break, end_break in zip(breaks, breaks[1:]):
        page = soup.new_tag('page')
        node = start_break.find_next_sibling()
        while node and node != end_break:
            # Remember the follower first: extract() detaches `node` from the tree.
            following = node.find_next_sibling()
            page.append(node.extract())
            node = following
        raw_pages.append(page)

    return [
        BeautifulSoup(''.join(str(child) for child in page), 'html.parser')
        for page in raw_pages
    ]

def preprocess_soup(soup, tables, images):
    """Run the full preprocessing pipeline over `soup` and return the result.

    Order: bold markers, italic markers, inline newline flattening, table
    replacement (collected into `tables`), image replacement (collected into
    `images`), then paragraph newline insertion.
    """
    # Add markers to bold and italic text.
    marked = add_markers_around_italic(add_markers_around_bold(soup))
    # Structural preprocessing.
    flattened = preprocess_inline_new_lines(marked)
    without_tables = preprocess_tables(flattened, tables)
    without_images = preprocess_images(without_tables, images)
    return preprocess_paragraph_new_lines(without_images)

def is_bold(tag):
    """Return True when `tag` is a <b>/<strong> element or carries a bold inline style."""
    explicitly_bold = tag.name in ("b", "strong")
    return bool(
        explicitly_bold
        or (tag.has_attr('style') and is_style_bold(tag['style']))
    )

def is_style_bold(style_str):
    """Return True if an inline CSS string declares font-weight bold or 700.

    Matching is case-insensitive and ignores spaces.
    """
    compact = style_str.lower().replace(' ', '')
    return 'font-weight:bold' in compact or 'font-weight:700' in compact

def add_markers_around_bold(soup):
    """Wrap the contents of every bold tag in '[BOLD]' ... '[BOLD_END]' text markers.

    The soup is modified in place and returned.
    """
    bold_tags = soup.find_all(is_bold)
    for bold_tag in bold_tags:
        bold_tag.insert(0, NavigableString('[BOLD]'))
        bold_tag.append(NavigableString('[BOLD_END]'))
    return soup

def is_italic(tag):
    """Return True when `tag` is an <i>/<em> element or carries an italic inline style."""
    explicitly_italic = tag.name in ("i", "em")
    return bool(
        explicitly_italic
        or (tag.has_attr('style') and is_style_italic(tag['style']))
    )
                   
def is_style_italic(style_str):
    """Return True if an inline CSS string declares font-style italic.

    Matching is case-insensitive and ignores spaces.
    """
    compact = style_str.lower().replace(' ', '')
    return 'font-style:italic' in compact


def add_markers_around_italic(soup):
    """Wrap the contents of every italic tag in '[ITALIC]' ... '[ITALIC_END]' text markers.

    The soup is modified in place and returned.
    """
    italic_tags = soup.find_all(is_italic)
    for italic_tag in italic_tags:
        italic_tag.insert(0, NavigableString('[ITALIC]'))
        italic_tag.append(NavigableString('[ITALIC_END]'))
    return soup

def preprocess_tables(soup, tables):
    """Replace every <table> in `soup` with a <p> placeholder.

    A table with exactly one non-empty row is treated as layout: it is
    replaced by a paragraph holding its text with newlines removed. Any other
    table is replaced by '[TABLE_REPLACED_<k>]' (k = its index in `tables`)
    and the original table tag is appended to `tables`.

    Args:
        soup: Parsed document; modified in place.
        tables: List that receives the removed table tags.

    Returns:
        The same `soup` object.
    """
    def count_non_empty_row(tag):
        # Rows whose stripped text is empty do not count.
        return sum(1 for row in tag.find_all('tr') if row.get_text(strip=True))

    for tag in tqdm(soup.find_all('table'), desc="Preprocess table elements"):
        # A nested table travels with its outer table (either flattened into
        # its text or stored inside it), so it must not be replaced separately.
        if tag.find_parent('table') is not None:
            continue

        new_p_tag = soup.new_tag("p")
        if count_non_empty_row(tag) == 1:
            new_p_tag.string = tag.get_text().replace("\n", "")
        else:
            new_p_tag.string = f'\n\n[TABLE_REPLACED_{len(tables)}]\n\n'
            tables.append(tag)
        # replace_with() detaches the table but leaves it intact, so the tag
        # kept in `tables` still has its rows (decompose() would destroy it).
        tag.replace_with(new_p_tag)
    return soup

def preprocess_images(soup, images):
    """Replace every <img> in `soup` with a '[IMAGE_REPLACED_<k>]' paragraph.

    Args:
        soup: Parsed document; modified in place.
        images: List that receives the removed <img> tags; k is the tag's
            index in this list.

    Returns:
        The same `soup` object.
    """
    for tag in soup.find_all('img'):
        new_p_tag = soup.new_tag("p")
        new_p_tag.string = f'[IMAGE_REPLACED_{len(images)}]'
        # replace_with() actually removes the image from the tree while
        # leaving the tag stored in `images` intact.
        tag.replace_with(new_p_tag)
        images.append(tag)
    return soup
               
def preprocess_inline_new_lines(soup):
    """
    Replace '\\n' with ' ' in the direct text children of inline elements,
    skipping elements whose inline style sets a 'white-space: pre*' value.

    Only plain text nodes are rewritten; comments and other special string
    nodes are left untouched so they do not turn into visible text.
    The soup is modified in place and returned.
    """
    inline_elements = ['span', 'p', 'font', 'a', 'b', 'i', 'em', 'strong', 'u', 'small', 'sub', 'sup']

    for tag in soup.find_all(inline_elements):
        style = tag.attrs.get('style', '')
        # Normalise case and spacing so 'white-space:pre' and
        # 'WHITE-SPACE : pre-wrap' are recognised as well.
        if 'white-space:pre' in style.lower().replace(' ', ''):
            continue
        # Iterate over a snapshot because replace_with() edits tag.contents.
        for content in list(tag.contents):
            if type(content) is NavigableString and '\n' in content:
                content.replace_with(content.replace('\n', ' '))
    return soup

def preprocess_paragraph_new_lines(soup):
    """
    Append a non-breaking space, a space and '\\n\\n' to every non-empty
    block-level element whose last child does not already end with '\\n\\n'.

    The soup is modified in place and returned.
    """
    # 'ul', 'ol' and 'li' are intentionally left out of this list.
    newline_tags = ['div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr',
                    'blockquote', 'pre', 'dl', 'dt', 'dd',
                    'table', 'form', 'address', 'figure', 'figcaption']
    for block in soup.find_all(newline_tags):
        if not block.contents:
            # Empty element: nothing to terminate.
            continue
        if str(block.contents[-1]).endswith('\n\n'):
            # Already terminated.
            continue
        block.append(NavigableString('\u00A0 \n\n'))
    return soup
    


In [None]:
%%time

from bs4 import BeautifulSoup
import re
from tqdm.notebook import tqdm  
from bs4 import BeautifulSoup, NavigableString, Tag

# Define the file path
file_path = "sp500_ten_years_documents.json"

# Load the JSON file back into a list of dictionaries
with open(file_path, 'r') as json_file:
    document_dictionaries = json.load(json_file)

# Pick one filing at random and show which one was chosen.
sample = random.choice(document_dictionaries)
url = sample['form_link']

print(sample['company_name'], sample['year'])
print(url)

USER_AGENT = 'test@test1.com'
headers = {'User-Agent': USER_AGENT}
# Pass the headers explicitly so this request is guaranteed to use the
# User-Agent defined right above.
soup = get_soup(url, headers=headers)

print('Preparing Soup')
pages_soup = extract_pages(soup)
print('Pages Soup Ready')


In [5]:
%%time

import unstructured
from unstructured.cleaners.core import *
from IPython.display import HTML
from collections import Counter



# Opening marker -> matching closing marker.
marker_dict = {
    '[BOLD]': '[BOLD_END]',
    '[ITALIC]': '[ITALIC_END]',
    # Add more markers as needed
}

# Flat list of all markers: each opener immediately followed by its closer.
marker_list = [token for pair in marker_dict.items() for token in pair]


def select_one_marker(text, marker, marker_dict=marker_dict):
    """Keep only `marker` (and its closer) in `text`, dropping all other markers.

    If `marker` does not occur in `text`, no marker is removed. In every case
    the result is stripped and runs of whitespace are collapsed to one space.

    Raises:
        KeyError: if `marker` occurs in `text` but is not a key of `marker_dict`.
    """
    if marker in text:
        remaining = dict(marker_dict)
        remaining.pop(marker)  # raises KeyError for an unknown opener
        for unwanted in [*remaining.keys(), *remaining.values()]:
            text = text.replace(unwanted, '')
    return re.sub(r'\s+', ' ', text.strip())


def clean_markers(text, marker_dict=marker_dict):
    """Remove redundant formatting markers from `text`.

    Three passes, in this order:
      1. opener + optional whitespace + its closer (an empty span) is deleted;
      2. closer + optional whitespace + the same opener is replaced by a space,
         which merges two adjacent spans of the same kind;
      3. `remove_consecutive_markers` handles closer/opener/re-opener sequences.
    """
    escaped_pairs = [
        (re.escape(opener), re.escape(closer))
        for opener, closer in marker_dict.items()
    ]

    # Pass 1: empty spans.
    for esc_open, esc_close in escaped_pairs:
        text = re.sub(esc_open + r"\s*" + esc_close, "", text)

    # Pass 2: a span that closes and immediately reopens.
    for esc_open, esc_close in escaped_pairs:
        text = re.sub("(" + esc_close + r")\s*(" + esc_open + ")", " ", text)

    # Pass 3.
    return remove_consecutive_markers(
        text, list(marker_dict.keys()), list(marker_dict.values()), marker_dict
    )

def remove_consecutive_markers(text, openers, closers, marker_dict):
    """Collapse 'closer, opener, re-opener' runs down to the middle opener.

    For each closer '[X_END]' and each opener '[Y]', the sequence
    '[X_END]' + optional whitespace + '[Y]' + optional whitespace + '[X]'
    is replaced by '[Y]' alone. `marker_dict` is accepted but not used.
    """
    for closer in closers:
        # '[X_END]' -> '[X]': drop the trailing '_END]' and close the bracket.
        own_opener = closer[:-5] + ']'
        for opener in openers:
            if opener == closer:  # Skip if it's the same marker
                continue
            sequence = re.compile(
                "(" + re.escape(closer) + r")\s*("
                + re.escape(opener) + r")\s*"
                + re.escape(own_opener)
            )
            # Keep only the captured opener (group 2).
            text = sequence.sub(lambda found: found.group(2), text)
    return text


def is_removable_entry(text, marker_dict=marker_dict):
    """Return True if `text`, without markers, is exactly a boilerplate heading.

    Markers are removed, the rest is stripped and upper-cased, and the result
    is compared against a fixed set of table-of-contents / index / part titles.
    """
    keywords = {
        "TOC","TABLE OF CONTENT", "TABLE OF CONTENTS", "INDEX",
        "INDEX TO FINANCIAL STATEMENTS",
        "PART I","PART II","PART III","PART I.","PART II.","PART III.",
        "PART 1","PART 2","PART 3","PART 1.","PART 2.","PART 3."}

    for marker in (token for pair in marker_dict.items() for token in pair):
        text = text.replace(marker, '')

    return text.strip().upper() in keywords

    



def remove_numbers_and_substrings(text):
    """Delete standalone 1-3 digit numbers and page-label words from `text`.

    The words removed (case-insensitively, on word boundaries) are listed in
    `substrings_to_remove`. The result is stripped of surrounding whitespace;
    inner whitespace is left as is.
    """
    # Remove numbers up to three digits
    without_numbers = re.sub(r'\b\d{1,3}\b', '', text)

    substrings_to_remove = ['page', 'number', 'of', 'nr', 'nr.', 'n', 'n.']
    alternatives = '|'.join(map(re.escape, substrings_to_remove))
    word_pattern = re.compile(r'\b(?:' + alternatives + r')\b', re.IGNORECASE)

    return word_pattern.sub('', without_numbers).strip()

def remove_numbers_and_words(text):
    """Delete standalone 1-3 digit numbers and the words page/number/of/nr/n.

    Matching is case-insensitive and limited to whole words. Afterwards runs
    of whitespace are collapsed to a single space and the result is stripped.
    """
    token_pattern = re.compile(r'\b(\d{1,3}|page|number|of|nr|n)\b', re.IGNORECASE)
    whitespace_run = re.compile(r'\s+')

    without_tokens = token_pattern.sub('', text)
    return whitespace_run.sub(' ', without_tokens).strip()


def remove_headers(pages, n=3, threshold=5):
    """Drop repeated header lines from the top of each page.

    The first `n` strings of every page are masked with `mask_string` and
    counted across all pages. A string among a page's first `n` is removed
    when its masked form was counted more than `threshold` times.

    Args:
        pages: List of pages, each a list of strings.
        n: How many leading strings per page are header candidates.
        threshold: A masked string must occur more than this many times.

    Returns:
        A new list of pages; the input lists are not modified.
    """
    # Count masked candidates over all pages.
    occurrences = {}
    for page in pages:
        for candidate in page[:n]:
            key = mask_string(candidate)
            occurrences[key] = occurrences.get(key, 0) + 1

    repeated = {key for key, count in occurrences.items() if count > threshold}

    # Rebuild each page: filtered head followed by the untouched remainder.
    cleaned_pages = []
    for page in pages:
        kept_head = [
            candidate for candidate in page[:n]
            if mask_string(candidate) not in repeated
        ]
        cleaned_pages.append(kept_head + list(page[n:]))

    return cleaned_pages


def mask_string(string):
    """Reduce a header/footer candidate to a form that ignores page numbering.

    Item headings and table/image placeholders are returned unchanged so that
    they are never grouped together as a repeated header or footer. Leading
    bold/italic markers are ignored when checking for these.

    Every other string is lower-cased, then the words page/number/nr/or, all
    digits, spaces, '|' and the roman-numeral letters i, v, x, l, c are removed.

    Args:
        string: The candidate text.

    Returns:
        The original string for exempt entries, otherwise the masked string.
    """
    lowered = string.lower()

    # The exemption must be decided before masking: masking strips 'i', 'c'
    # and 'l', after which 'item' or '[image_' can no longer be recognised.
    unmarked = re.sub(r'^(?:\s*\[(?:bold|italic)(?:_end)?\])+\s*', '', lowered)
    if unmarked.startswith(('item', '[tab', '[image_', '[img')):
        return string

    s = re.sub(r'\b(page|number|nr|or)\b', '', lowered)
    s = re.sub(r'\d{1,3}', '', s)
    # Character class: space, pipe and the roman-numeral letters.
    s = re.sub(r'[ |ivxlc]', '', s)
    return s

def remove_footers(pages, n=3, threshold=5):
    """Drop repeated footer lines from the bottom of each page.

    The last `n` strings of every page are masked with `mask_string` and
    counted across all pages. A string among a page's last `n` is removed when
    its masked form was counted more than `threshold` times. Removal is done
    by position, so an identical string earlier on the page is never deleted
    in place of the footer.

    Args:
        pages: List of pages, each a list of strings.
        n: How many trailing strings per page are footer candidates; values
            <= 0 mean no candidates.
        threshold: A masked string must occur more than this many times.

    Returns:
        A new list of pages; the input lists are not modified.
    """
    def split_tail(page):
        # Body = everything before the footer zone, tail = the footer zone.
        tail_start = max(len(page) - n, 0) if n > 0 else len(page)
        return list(page[:tail_start]), list(page[tail_start:])

    # Step 1: count masked footer candidates over all pages.
    string_count = {}
    for page in pages:
        _, tail = split_tail(page)
        for original_string in tail:
            masked_string = mask_string(original_string)
            string_count[masked_string] = string_count.get(masked_string, 0) + 1

    # Step 2: masked strings seen more often than the threshold.
    strings_above_threshold = {
        string for string, count in string_count.items() if count > threshold
    }

    # Step 3: rebuild each page, filtering only its footer zone.
    cleaned_pages = []
    for page in pages:
        body, tail = split_tail(page)
        kept_tail = [
            original_string for original_string in tail
            if mask_string(original_string) not in strings_above_threshold
        ]
        cleaned_pages.append(body + kept_tail)

    return cleaned_pages

def clean_list(text):
    """Normalise list markers at the start of each line of `text`.

    Ordered markers ('1.', '12.') are kept and followed by exactly one space.
    Unordered markers (*, -, + and the bullet glyphs in the pattern) are
    replaced by '- '. A number followed by a period and another digit
    (e.g. '3.5') is treated as a decimal, not as a list marker.

    Args:
        text: The text to normalise; may contain several lines.

    Returns:
        The text with normalised list markers.
    """
    # Markers are matched at the start of the string or right after a newline.
    # (?!\d) keeps decimals such as '3.5 million' from being split.
    pattern = r'(^|\n)(\d+\.(?!\d)\s*|[\*\-\+\•\•\•\■\□\○\●\◇\◆\▷\◁]\s*)'

    def replacer(match):
        start, marker = match.groups()
        # An ordered marker always begins with a digit. Testing the whole
        # marker with isdigit() fails because it still contains the period.
        if marker[0].isdigit():
            return f'{start}{marker.rstrip()} '
        return f'{start}- '

    return re.sub(pattern, replacer, text, flags=re.MULTILINE)
def preprocess_page_text_sections(text):
    """Split a page's text on blank lines and clean each resulting section.

    Sections that are empty or flagged by `is_removable_entry` are dropped.
    Each kept section then goes through, in order: blank-line normalisation,
    strip, collapsing of whitespace runs, `replace_unicode_quotes`,
    `clean_markers`, `clean_extra_whitespace`, removal of whitespace before
    punctuation and quote characters, and `clean_list`.

    Returns:
        A list of cleaned section strings.
    """
    # Standardize leading and trailing newlines.
    text = text.strip() + '\n\n'

    blank_line = re.compile(r'\s*\n\s*\n\s*')
    whitespace_run = re.compile(r'\s{2,}')
    space_before_punct = re.compile(r'\s*([,.;:?!\'"\)])')

    clean_text = []
    for chunk in text.split('\n\n'):
        if not chunk.strip() or is_removable_entry(chunk):
            continue
        section = blank_line.sub('\n\n', chunk).strip()
        section = whitespace_run.sub(' ', section)
        section = replace_unicode_quotes(section)
        section = clean_markers(section)
        section = clean_extra_whitespace(section)
        section = space_before_punct.sub(r'\1', section)
        clean_text.append(clean_list(section))

    return clean_text

def preproces_pages(page_soup):
    """Turn per-page soups into lists of cleaned text sections.

    For each page the text is extracted, non-breaking spaces become plain
    spaces, runs of three or more line breaks are reduced to a blank line, and
    the text is split into sections by `preprocess_page_text_sections`.
    Repeated headers and footers are then removed across all pages.

    Args:
        page_soup: Iterable of BeautifulSoup objects, one per page.

    Returns:
        A list of pages, each a list of section strings.
    """
    pages = []
    # Iterate over the argument, not a notebook-level global, so the function
    # processes exactly what the caller passed in.
    for page in page_soup:
        text = page.get_text(strip=False)
        text = re.sub(r'\xa0', ' ', text)
        text = re.sub(r'(\s?\n){1}(\s?\n){2,}', '\n\n', text)
        pages.append(preprocess_page_text_sections(text))

    pages = remove_headers(pages)
    pages = remove_footers(pages)

    return pages

    
# Clean every page, then flatten the pages into one ordered list of sections.
pages = preproces_pages(pages_soup)

sections = [
    section_text
    for page_sections in pages
    for section_text in page_sections
]

print(url)

https://www.sec.gov/Archives/edgar/data/906345/000090634519000006/cpt1231201810k.htm
CPU times: user 63.2 ms, sys: 6.96 ms, total: 70.2 ms
Wall time: 75.8 ms


In [6]:
def ignore_markers(text, marker_dict=marker_dict):
    """Return `text` with all opening and closing markers removed.

    The result is stripped and runs of whitespace are collapsed to one space.
    """
    stripped = text
    for opener, closer in marker_dict.items():
        stripped = stripped.replace(opener, '').replace(closer, '')
    return re.sub(r'\s+', ' ', stripped.strip())

def aggregate_lists(sections, marker_dict=marker_dict):
    """Merge runs of consecutive list items into a single section.

    A section counts as a list item when, with markers ignored and leading
    whitespace dropped, it starts with '- '. Consecutive list items are joined
    with '\\n' using their original (marked) text; all other sections are kept
    as they are.

    Args:
        sections: Ordered list of section strings.
        marker_dict: Opening marker -> closing marker mapping.

    Returns:
        A new list of sections with list items aggregated.
    """
    def is_list_item(section):
        # Each section is judged on its own text.
        return ignore_markers(section, marker_dict).lstrip().startswith('- ')

    processed_sections = []
    previous_was_item = False
    for section in sections:
        current_is_item = is_list_item(section)
        if current_is_item and previous_was_item:
            # Continue the list started by the preceding section(s).
            processed_sections[-1] += '\n' + section
        else:
            processed_sections.append(section)
        previous_was_item = current_is_item
    return processed_sections
    
    
    
# Merge consecutive list-item sections. Note: this overwrites `sections` with
# the aggregated version.
sections = aggregate_lists(sections, marker_dict=marker_dict)

    



In [7]:
%%time 

import torch
import pandas as pd
from transformers import BertModel, BertTokenizer, AutoModel, AutoTokenizer
import logging

print(url)

# Canonical item label -> standardized heading prefixes that identify it.
# Prefixes are upper-case and end with a space, matching the output of
# `item_standardization`.
item_dict = {
    'Item 1': ['ITEM 1 '],
    'Item 1 and 2': ['ITEM 1 AND 2 ', 'ITEM 1 & 2 ', 'ITEMS 1 AND 2 ', 'ITEMS 1 & 2 '],
    'Item 1A': ['ITEM 1A ', 'ITEM 1 A '],
    'Item 1B': ['ITEM 1B ', 'ITEM 1 B '],
    'Item 2': ['ITEM 2 '],
    'Item 3': ['ITEM 3 '],
    'Item 4': ['ITEM 4 '],
    'Item 5': ['ITEM 5 '],
    'Item 6': ['ITEM 6 '],
    'Item 7': ['ITEM 7 '],
    'Item 7A': ['ITEM 7A ', 'ITEM 7 A '],
    'Item 8': ['ITEM 8 '],
    'Item 9': ['ITEM 9 '],
    'Item 9A': ['ITEM 9A ', 'ITEM 9 A '],
    'Item 9B': ['ITEM 9B ', 'ITEM 9 B '],
    'Item 9C': ['ITEM 9C ', 'ITEM 9 C '],
    'Item 10': ['ITEM 10 '],
    'Item 11': ['ITEM 11 '],
    'Item 12': ['ITEM 12 '],
    'Item 13': ['ITEM 13 '],
    'Item 14': ['ITEM 14 '],
    'Item 15': ['ITEM 15 '],
    'Item 16': ['ITEM 16 '],
}

# Every prefix from item_dict in one flat list, in dictionary order.
item_notations = [
    notation
    for notations in item_dict.values()
    for notation in notations
]

def item_standardization(text):
    """Normalise a heading for item matching.

    The characters . , - / _ ( ) [ ] { } and backslash become spaces, runs of
    whitespace collapse to one space, and the result is upper-cased.
    """
    spaced = re.sub(r'[.,\-/_\(\)\[\]{}\\]', ' ', text)
    return re.sub(r'\s+', ' ', spaced).upper()

def assign_title_levels(df):
    """Assign a numeric 'title_level' to every row from its formatting flags.

    Levels, applied in this order (later rules overwrite earlier ones):
      9  fully bold, not fully italic, upper-case text
      8  fully bold, not fully italic, not upper-case
      7  fully bold and fully italic
      6  fully italic only
      5  starts bold (not italic), not fully bold/italic
      4  starts bold and italic, not fully bold/italic
      3  starts italic (not bold), not fully bold/italic
    Rows with level >= 7 get type 'title'; rows still typed 'list' get level 0;
    rows with a non-null 'item' get level 11; remaining rows get level 2.

    The DataFrame is modified in place and returned.
    """
    bold_all = df['bold_all'] == True
    not_bold_all = df['bold_all'] == False
    italic_all = df['italic_all'] == True
    not_italic_all = df['italic_all'] == False
    bold_start = df['bold_start'] == True
    not_bold_start = df['bold_start'] == False
    italic_start = df['italic_start'] == True
    not_italic_start = df['italic_start'] == False
    upper_case = df['text'].str.isupper()

    level_rules = [
        (9, bold_all & not_italic_all & upper_case),
        (8, bold_all & not_italic_all & ~upper_case),
        (7, bold_all & italic_all),
        (6, not_bold_all & italic_all),
        (5, not_bold_all & not_italic_all & bold_start & not_italic_start),
        (4, not_bold_all & not_italic_all & bold_start & italic_start),
        (3, not_bold_all & not_italic_all & not_bold_start & italic_start),
    ]
    for level, rule_mask in level_rules:
        df.loc[rule_mask, 'title_level'] = level

    # Set type based on title_level, then apply the overrides.
    df.loc[df['title_level'] >= 7, 'type'] = 'title'
    df.loc[df['type'] == 'list', 'title_level'] = 0
    df.loc[~df['item'].isna(), 'title_level'] = 11
    df['title_level'] = df['title_level'].fillna(2)
    return df


def find_item_key(text, item_dict):
    """
    Find the item whose notation matches the beginning of the standardized text.

    When several notations match, the longest one wins. This keeps
    'ITEM 1 AND 2 ...' or 'ITEM 1 A ...' from being claimed by the shorter
    'ITEM 1 ' prefix just because it appears first in the dictionary.

    Args:
        text: Raw heading text.
        item_dict: Mapping of item label -> list of standardized prefixes.

    Returns:
        The matching item label, or NaN if no notation matches.
    """
    standardized_text = item_standardization(text)  # Standardize the text

    best_key = np.nan  # NaN if no match is found
    best_length = 0
    for key, notations in item_dict.items():
        for notation in notations:
            if len(notation) > best_length and standardized_text.startswith(notation):
                best_key, best_length = key, len(notation)
    return best_key

def process_text_for_list_element(text, marker_dict=marker_dict):
    """Tag aggregated list sections with a '[LIST_ELEMENT]' prefix.

    A section qualifies when it begins with optional markers, optional
    whitespace, a dash and whitespace (or begins with ' - '), and also
    contains a newline. For such a section all markers are removed and
    '[LIST_ELEMENT]' is prepended; any other text is returned unchanged.
    """
    markers = [*marker_dict.keys(), *marker_dict.values()]
    all_markers_pattern = '|'.join(map(re.escape, markers))

    starts_like_list = re.match(rf'^({all_markers_pattern})*\s*-\s+| - ', text)
    spans_lines = starts_like_list and re.search(rf'(\n)({all_markers_pattern})*', text)
    if not spans_lines:
        return text

    return '[LIST_ELEMENT]' + re.sub(all_markers_pattern, '', text)


def word_count(text):
    """Count the maximal runs of ASCII letters and digits in `text`.

    Every other character acts as a separator.
    """
    return len(re.findall(r'[a-zA-Z0-9]+', text))

# Count tokens with the supplied tokenizer (a BERT tokenizer in this notebook).
def token_count(text, tokenizer):
    """Return how many tokens `tokenizer.tokenize(text)` produces."""
    return len(tokenizer.tokenize(text))

def assign_id(df):
    """Number the rows relative to the first 'Item 1' marker.

    The first row typed 'ItemMarker' whose item is 'Item 1' or 'Item 1 and 2'
    gets id 1 and the following rows count upwards. Rows before it get
    negative ids, starting at -1 on the first row and decreasing.

    The DataFrame is modified in place and returned.

    Raises:
        IndexError: if no such marker row exists.
    """
    is_first_item_marker = (
        df['item'].isin(['Item 1', 'Item 1 and 2']) & (df['type'] == 'ItemMarker')
    )
    condition_index = df.index[is_first_item_marker][0]

    # .loc slices are inclusive, so this also touches the marker row; the
    # second assignment then overwrites it with id 1.
    df.loc[:condition_index, 'id'] = range(-1, -condition_index - 2, -1)
    df.loc[condition_index:, 'id'] = range(1, len(df) - condition_index + 1)

    return df

def assign_parent_id(df):
    """Link each row to the nearest preceding row with a higher title level.

    'parent_id' is set to the 'id' of that row, or left as None when no
    earlier row has a strictly greater 'title_level'.

    The DataFrame is modified in place and returned.
    """
    df['parent_id'] = None

    for i in reversed(df.index):
        level = df.loc[i, 'title_level']
        # Walk upwards from the row just above `i` and stop at the first
        # row that outranks it.
        parent_label = next(
            (j for j in reversed(df.index[:i]) if df.loc[j, 'title_level'] > level),
            None,
        )
        if parent_label is not None:
            # Store the parent's 'id', not its index label.
            df.loc[i, 'parent_id'] = df.loc[parent_label, 'id']

    return df

def create_dataframe(sections, marker_list=marker_list):
    """Build the section-level DataFrame used for structure detection.

    Each section becomes one row with its plain text, the original marked
    text, bold/italic flags, the 10-K item it belongs to, a title level,
    token/word/character counts, and id / parent_id links. A synthetic
    'Content before First Item.' row is inserted at the top.

    Args:
        sections: Ordered list of section strings (markers still included).
        marker_list: Flat list of all marker strings to strip from list rows.

    Returns:
        A DataFrame with the columns id, parent_id, type, text, marked_text,
        item, title_level, token_count, word_count, char_count and the six
        bold_*/italic_* flags.

    Raises:
        ValueError: if an item heading is detected more than once, or ten or
            fewer distinct items are found.
    """
    tokenizer = AutoTokenizer.from_pretrained('ProsusAI/finbert')
    list_prefix = '[LIST_ELEMENT]'

    # Prepare data for pandas
    data = []
    for string in sections:
        list_text = process_text_for_list_element(string)
        if list_text.startswith(list_prefix):
            for m in marker_list:
                list_text = list_text.replace(m, '')
            data.append({'text': list_text[len(list_prefix):], 'type': 'list',
                         'marked_text': string.replace(list_prefix, '')})
            continue
        data.append({'text': ignore_markers(string), 'type': 'NarrativeText', 'marked_text': string})

    # Create DataFrame from the list of dictionaries
    df = pd.DataFrame(data)

    def marker_flags(text, opener, closer):
        # (whole text is one span, text starts with the span, marker occurs at all)
        isolated = select_one_marker(text, opener)
        starts = isolated.startswith(opener)
        spans_all = starts and isolated.endswith(closer) and text.count(opener) == 1
        return spans_all, starts, text.count(opener) > 0

    # Create marks variables
    # TODO POSITION OF ALL MARKED TEXT
    for prefix, opener, closer in (('bold', '[BOLD]', '[BOLD_END]'),
                                   ('italic', '[ITALIC]', '[ITALIC_END]')):
        flags = [marker_flags(text, opener, closer) for text in df['marked_text']]
        df[f'{prefix}_all'] = [flag[0] for flag in flags]
        df[f'{prefix}_start'] = [flag[1] for flag in flags]
        df[f'{prefix}_has_any'] = [flag[2] for flag in flags]

    # Identify Items: only fully bold rows can be item headings.
    df['item'] = df.apply(lambda x: find_item_key(x['text'], item_dict) if x['bold_all'] else np.nan, axis=1)
    if len(df[~df['item'].isna()]) != df['item'].nunique() or df['item'].nunique() <= 10:
        raise ValueError("item labelling error ", (df['item'].value_counts()))

    # Create title levels and items
    df = assign_title_levels(df)
    df.loc[~df['item'].isna(), 'type'] = 'ItemMarker'
    # Series.ffill() replaces the deprecated fillna(method='ffill').
    df['item'] = df['item'].ffill()
    df['item'] = df['item'].fillna('PreText')
    df.loc[df['text'].str.startswith('[TABLE_'), 'type'] = 'table'
    df.loc[df['text'].str.startswith('[IMAGE_'), 'type'] = 'image'

    # Create Token Counter
    # suppress log for token size over 512
    transformers_logger = logging.getLogger("transformers")
    original_logging_level = transformers_logger.getEffectiveLevel()
    transformers_logger.setLevel(logging.ERROR)  # Suppress warnings
    try:
        df['token_count'] = df['text'].apply(lambda t: token_count(t, tokenizer))
    finally:
        # Restore the original logging level even if tokenization fails.
        transformers_logger.setLevel(original_logging_level)

    # Other Counters
    df['word_count'] = df['text'].apply(word_count)
    df['char_count'] = df['text'].apply(len)

    # Add a header row for the text that precedes the first item. Its counts
    # are computed from its text so they cannot drift from it.
    pretext_text = 'Content before First Item.'
    new_row_values = {'type': 'ItemMarker', 'text': pretext_text,
                      'marked_text': '[BOLD][ITALIC] Content before First Item . [BOLD_END][ITALIC_END]',
                      'item': 'PreText', 'title_level': 11,
                      'token_count': token_count(pretext_text, tokenizer),
                      'word_count': word_count(pretext_text),
                      'char_count': len(pretext_text),
                      'bold_all': True, 'bold_start': True, 'bold_has_any': True,
                      'italic_all': True, 'italic_start': True, 'italic_has_any': True}

    new_row_df = pd.DataFrame([new_row_values], columns=df.columns)
    df = pd.concat([new_row_df, df]).reset_index(drop=True)

    df = assign_id(df)
    df = assign_parent_id(df)

    # reorder columns
    df = df[['id', 'parent_id', 'type', 'text', 'marked_text', 'item', 'title_level',
             'token_count', 'word_count', 'char_count',
             'bold_all', 'bold_start', 'bold_has_any', 'italic_all', 'italic_start', 'italic_has_any',
             ]]

    return df

# Build the section-level DataFrame from the aggregated sections.
df = create_dataframe(sections)


  torch.utils._pytree._register_pytree_node(


https://www.sec.gov/Archives/edgar/data/906345/000090634519000006/cpt1231201810k.htm
CPU times: user 906 ms, sys: 263 ms, total: 1.17 s
Wall time: 1.4 s


In [9]:

def replace_markers(text):
    """Convert text markers to HTML: bold -> <b>, italic -> <i>, newline -> <br>."""
    substitutions = (
        ('[BOLD]', '<b>'),
        ('[BOLD_END]', '</b>'),
        ('[ITALIC]', '<i>'),
        ('[ITALIC_END]', '</i>'),
        ('\n', '<br>'),
    )
    for marker, html in substitutions:
        text = text.replace(marker, html)
    return text

def rebuild_html(s):
    """Return a one-element list holding `s` as HTML wrapped in a <div>.

    Markers and newlines are converted by `replace_markers`.
    """
    return ['<div>' + replace_markers(s) + '</div>']
    
        
# One single-element HTML list per aggregated section.
sections_html = [
    rebuild_html(section_text)
    for section_text in aggregate_lists(sections)
]






## Future cleaning notes
- table preprocessing
    - one column table to multiple strings
    - two column 
        - if first one is list char (one or 2 non whitespaces or number, abcd..) 
        - if first one is text and have one element each row first column and if not
    - check if they have headers

- store links to images and tables

Part, Items (mind item 1 and 2 can be merged)





### Examples of documents with issues


https://www.sec.gov/Archives/edgar/data/8670/000000867022000038/adp-20220630.htm

double column text at 'A major natural disaster or catastrophic event could have a materially adverse effect on our business, financial'