In [2]:
import re
from fuzzywuzzy import fuzz
from bs4 import BeautifulSoup

def loadFile(filePath):
    """Read the file at ``filePath`` as UTF-8 text and return its entire contents.

    Args:
        filePath: Path of the file to open for reading.

    Returns:
        The complete file contents as a single string.
    """
    with open(filePath, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents

def preProcessText(content):
    """Turn HTML markup into plain text and apply some initial clean-up.

    Steps, in order:
      1. Parse ``content`` with BeautifulSoup's ``html.parser`` and extract the
         text, joining fragments with an empty separator.
      2. Blank out lines that contain nothing but digits (standalone page numbers).
      3. Collapse runs of blank / whitespace-only lines into one blank line.
      4. Strip leading and trailing whitespace from the result.

    Args:
        content: HTML markup as a string.

    Returns:
        The cleaned plain text.
    """
    rawText = BeautifulSoup(content, "html.parser").get_text(separator="")

    # A line made up only of digits (optionally padded with whitespace) is
    # treated as a page number and removed.
    withoutPageNumbers = re.sub(r'^\s*\d+\s*$', '', rawText, flags=re.MULTILINE)

    # Squeeze consecutive newlines (with any whitespace between them) down to
    # exactly one empty line.
    collapsed = re.sub(r'\n\s*\n+', '\n\n', withoutPageNumbers)

    return collapsed.strip()

def removeTableOfContents(text):
    """Remove a table-of-contents block and stray "table of contents" labels.

    The block is located heuristically: it starts at the first
    case-insensitive occurrence of "Table of Content(s)" or "Contents" and
    runs up to (not including) the first later occurrence of "Introduction",
    "Chapter N", "Section N", "Part N" or "Page N". Afterwards every remaining
    "table of contents" label (with or without whitespace between the words)
    is deleted and the result is stripped.

    Args:
        text: Plain text to clean.

    Returns:
        ``text`` unchanged when no table-of-contents heading is found at all;
        otherwise the cleaned, stripped text. When a heading exists but no end
        marker follows it, no block is cut out, but the leftover labels are
        still removed.
    """
    # IGNORECASE already covers the upper-case spellings; the optional "s"
    # keeps the singular "Table of Content" heading recognised.
    tocStartPattern = re.compile(r'Table of Contents?|Contents', re.IGNORECASE)
    tocEndPattern = re.compile(
        r'Introduction|Chapter \d+|Section \d+|Part \d+|Page \d+', re.IGNORECASE
    )
    # One unbounded pattern handles spaced, concatenated and word-embedded
    # labels alike; a word-bounded variant would match a strict subset of it.
    tocReferencePattern = re.compile(r'table\s*of\s*contents?', re.IGNORECASE)

    tocStartMatch = tocStartPattern.search(text)
    if tocStartMatch is None:  # Nothing resembling a table of contents
        return text

    tocStartIndex = tocStartMatch.start()

    # Only cut a block out when its end can be located; the leftover-label
    # clean-up below runs either way.
    tocEndMatch = tocEndPattern.search(text, tocStartIndex)
    if tocEndMatch is not None:
        text = text[:tocStartIndex] + text[tocEndMatch.start():]

    return tocReferencePattern.sub('', text).strip()

def _lineScore(line, phrase):
    """Score how well ``line`` matches the heading ``phrase`` (0-100)."""
    # partial_ratio compares the shorter string against the best window of the
    # longer one, so a short line such as "the" or "merger" scores 100 against
    # any phrase containing it. Partial matching is only meaningful when the
    # line is long enough to contain the phrase; otherwise compare whole strings.
    if len(line) < len(phrase):
        return fuzz.ratio(line, phrase)
    return fuzz.partial_ratio(line, phrase)


def extractSection(text, startCandidates, stopCandidates, startThreshold=95, stopThreshold=80):
    """Extract the lines between a fuzzy-matched start heading and stop heading.

    ``text`` is split on newlines. The first line whose score against any of
    ``startCandidates`` exceeds ``startThreshold`` opens the section; the first
    later line whose score against any of ``stopCandidates`` exceeds
    ``stopThreshold`` closes it. Comparison is case-insensitive and ignores
    surrounding whitespace on the line; blank lines never match.

    Args:
        text: The document text to search.
        startCandidates: Phrases that may mark the beginning of the section.
        stopCandidates: Phrases that may mark the line just after the section.
        startThreshold: Score a line must exceed to count as the start heading.
        stopThreshold: Score a line must exceed to count as the stop heading.

    Returns:
        The section (start line included, stop line excluded) joined with
        newlines and stripped, or ``None`` when either heading is not found.
    """
    # Lower-case the candidate phrases once instead of once per line.
    startNeedles = [phrase.lower() for phrase in startCandidates]
    stopNeedles = [phrase.lower() for phrase in stopCandidates]

    lines = text.split("\n")
    startIndex = None

    for i, line in enumerate(lines):
        normalized = line.strip().lower()
        if not normalized:
            continue

        if startIndex is None:
            if any(_lineScore(normalized, needle) > startThreshold for needle in startNeedles):
                startIndex = i
            # The start line itself is never tested as a stop line.
            continue

        if any(_lineScore(normalized, needle) > stopThreshold for needle in stopNeedles):
            return "\n".join(lines[startIndex:i]).strip()

    return None

# Headings that may open the section of interest.
startPhrases = [
    "Background of the transaction",
    "Background of the merger",
    "Background of the offer",
    "background of the acquisition",
]

# Headings that may mark the line right after the section.
stopPhrases = [
    "Reasons for the Transactions",
    "Reasons for the merger",
    "Reasons for the offer",
    "Reasons for the acquisition",
]

# Load the sample document, convert it to plain text, then drop its table of contents.
text = loadFile("./SampleData/sample4.txt")
cleanedText = removeTableOfContents(preProcessText(text))

# Keep characters 50,000 up to (not including) 1,000,000.
# NOTE(review): the upper bound looks chosen to stay under a 1,000,000-character
# parser limit, and the lower bound to skip front matter - confirm both.
truncatedText = cleanedText[50_000:1_000_000]  # Chop off the table of contents

backgroundSection = extractSection(truncatedText, startPhrases, stopPhrases)
print(backgroundSection)

ValueError: [E088] Text of length 3802588 exceeds maximum of 1000000. The parser and NER models require roughly 1GB of temporary memory per 100,000 characters in the input. This means long texts may cause memory allocation errors. If you're not using the parser or NER, it's probably safe to increase the `nlp.max_length` limit. The limit is in number of characters, so you can check whether your inputs are too long by checking `len(text)`.