In [1]:
import re
import spacy
from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz
# from sentence_transformers import SentenceTransformer, util
# import numpy as np
import torch
import sys

# Name of the torch device to use: "cuda" when torch reports a usable GPU,
# otherwise "cpu". NOTE(review): nothing in the visible code reads `device`;
# confirm a later cell uses it before relying on it.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load spaCy model
# prefer_gpu() is called before spacy.load() so the GPU preference is already
# set when the pipeline is created (presumably it falls back to CPU when no
# GPU is available -- see the spaCy docs).
spacy.prefer_gpu()
# Shared pipeline object; extractSection() calls it to split text into sentences.
nlp = spacy.load("en_core_web_sm")

def loadFile(filePath):
    """Read the file at ``filePath`` as UTF-8 text and return its full contents.

    Args:
        filePath: Path of the file to read.

    Returns:
        The entire file as a single string.
    """
    with open(filePath, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents

# Preprocess the text
def preProcessText(content):
    """Turn raw (possibly HTML) content into cleaned plain text.

    The markup is parsed with BeautifulSoup's ``html.parser`` and flattened to
    text using a newline separator. Lines made up only of digits (plus
    optional surrounding whitespace) are then removed as page numbers, runs of
    blank lines are collapsed to a single empty line, and the result is
    stripped of leading and trailing whitespace.

    Args:
        content: The raw document text or HTML.

    Returns:
        The cleaned plain-text string.
    """
    plainText = BeautifulSoup(content, "html.parser").get_text(separator="\n")

    # Digit-only lines are treated as standalone page numbers and deleted.
    standaloneNumber = re.compile(r'^\s*\d+\s*$', re.MULTILINE)
    withoutPageNumbers = standaloneNumber.sub('', plainText)

    # Squash every run of empty / whitespace-only lines down to one blank line.
    collapsed = re.sub(r'\n\s*\n+', '\n\n', withoutPageNumbers)

    return collapsed.strip()

def removeTableOfContents(text):
    """Cut a table-of-contents block out of ``text``.

    The block is taken to start at the first case-insensitive occurrence of a
    contents heading ("Table of Contents", "Contents", ...) and to end just
    before the next occurrence, at or after that heading, of "Introduction"
    or "Chapter/Section/Part/Page <number>". When either marker is missing
    the input is returned untouched (and unstripped).

    After the cut, any remaining "table of contents" phrases are deleted and
    the result is stripped of surrounding whitespace.

    Args:
        text: The document text.

    Returns:
        The text with the table of contents removed, or ``text`` itself when
        no table of contents could be delimited.
    """
    headingRe = re.compile(r'(Table of Contents|Contents|TABLE OF CONTENT|CONTENTS)', re.IGNORECASE)
    bodyStartRe = re.compile(r'(Introduction|Chapter \d+|Section \d+|Part \d+|Page \d+)', re.IGNORECASE)

    heading = headingRe.search(text)
    if heading is None:
        # Nothing that looks like a contents heading.
        return text

    cutFrom = heading.start()
    bodyStart = bodyStartRe.search(text, cutFrom)
    if bodyStart is None:
        # A heading was found but nothing marks where the real body resumes.
        return text

    # Keep everything before the heading and everything from the body marker on.
    remainder = text[:cutFrom] + text[bodyStart.start():]

    # Two passes, in this order: first whole-word occurrences, then any
    # occurrence at all (including ones embedded in longer tokens).
    leftoverPatterns = (
        re.compile(r'\btable\s*of\s*contents?\b|\btableofcontents?\b', flags=re.IGNORECASE),
        re.compile(r'(?i)table\s*of\s*contents?|tableofcontents?'),
    )
    for leftover in leftoverPatterns:
        remainder = leftover.sub('', remainder)

    return remainder.strip()

def removeBinaryLikeData(text):
    """Drop lines whose share of symbol characters is above 10%.

    A "symbol" is any character that is neither a word character nor
    whitespace. Each line is judged on its own; empty lines are always kept.
    The surviving lines are re-joined with ``"\\n"``.

    Args:
        text: The text to filter.

    Returns:
        The text with symbol-heavy lines removed.
    """
    symbolRe = re.compile(r'[^\w\s]')

    def looksBinary(line):
        # Empty lines have no density to measure, so they are never dropped.
        if not line:
            return False
        return len(symbolRe.findall(line)) / len(line) > 0.1

    return "\n".join(line for line in text.splitlines() if not looksBinary(line))

def _findPhrase(sentence, candidates):
    """Return the first phrase in ``candidates`` that ``sentence`` matches, else ``None``.

    A phrase matches when it occurs in the sentence (case-insensitively) or
    when its fuzzy partial ratio against the sentence is above 95.
    """
    loweredSentence = sentence.lower()
    for candidate in candidates:
        loweredCandidate = candidate.lower()
        if not loweredCandidate:
            # An empty phrase is contained in every string; never treat it as a hit.
            continue
        if loweredCandidate in loweredSentence:
            return candidate
        # partial_ratio aligns the SHORTER string inside the longer one, so a
        # tiny sentence such as "the" would score 100 against any phrase that
        # contains it. Only fuzzy-match sentences long enough to hold the phrase.
        if (len(loweredSentence) >= len(loweredCandidate)
                and fuzz.partial_ratio(loweredSentence, loweredCandidate) > 95):
            return candidate
    return None


def extractSection(text, startCandidates, stopCandidates):
    """Extract the section that starts at one heading and ends before another.

    ``text`` is split into sentences with the module-level spaCy pipeline
    ``nlp``. The section begins at the first sentence matching any phrase in
    ``startCandidates`` and ends just before the first later sentence matching
    any phrase in ``stopCandidates``. If no stop phrase is found, the section
    runs to the end of the text.

    The opening sentence is tidied up: text in front of the heading (left over
    from the previous section) and any leading lines of two characters or
    fewer directly after the heading are dropped, and the sentence is rebuilt
    as ``<heading>\\n\\n<rest of the sentence>``.

    Args:
        text: The document text to search.
        startCandidates: Iterable of heading phrases that may open the section.
        stopCandidates: Iterable of heading phrases that close the section.
            May be empty or ``None`` to disable the stop search.

    Returns:
        The section's sentences joined with newlines, or ``None`` when no
        start phrase is found.
    """
    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents]

    # Locate the sentence that opens the section.
    startIndex = -1
    matchedSC = None
    for i, sentence in enumerate(sentences):
        matchedSC = _findPhrase(sentence, startCandidates)
        if matchedSC:
            startIndex = i
            break

    # No start heading found.
    if startIndex == -1:
        return None

    # Locate the sentence that closes the section. The search starts after
    # the opening sentence so the heading sentence can never end its own section.
    stopIndex = len(sentences)
    for i in range(startIndex + 1, len(sentences)):
        if _findPhrase(sentences[i], stopCandidates or ()):
            stopIndex = i
            break

    section = sentences[startIndex:stopIndex]

    # Find the heading case-insensitively, mirroring how it was matched; a
    # case-sensitive split would miss e.g. an upper-cased heading.
    heading = re.search(re.escape(matchedSC), section[0], flags=re.IGNORECASE)
    if heading is not None:
        trailingLines = section[0][heading.end():].split("\n")
        # Skip stray one- or two-character lines right after the heading, but
        # keep every line from the first real one onward so a multi-line
        # opening sentence is not truncated.
        firstReal = next(
            (k for k, line in enumerate(trailingLines) if len(line) > 2),
            None,
        )
        if firstReal is None:
            section[0] = matchedSC
        else:
            section[0] = matchedSC + "\n\n" + "\n".join(trailingLines[firstReal:])
    # Otherwise the heading was only a fuzzy match with no literal occurrence
    # to cut at, so the opening sentence is left as it is.

    return "\n".join(section)

# Load and preprocess the file
# Pipeline: read the raw sample -> strip markup / page numbers / extra blank
# lines -> remove the table of contents.
text = loadFile("./SampleData/sample4.txt")
cleanedText = preProcessText(text)
cleanedText = removeTableOfContents(cleanedText)
# Symbol-density filtering is currently disabled.
# cleanedText = removeBinaryLikeData(cleanedText)

# print(cleanedText);
# Keep characters 50,000 .. 999,999 only: at most 950,000 characters reach
# spaCy, and the first 50,000 characters of the cleaned text are discarded.
# NOTE(review): presumably the skipped prefix is front matter and the upper
# bound keeps the input under spaCy's default `nlp.max_length` -- confirm both
# offsets suit documents other than sample4.
truncatedText = cleanedText[50000:1000000]  # Shrink to manageable size for spaCy

# Heading phrases that may open the section to extract
# (passed to extractSection as startCandidates).
startPhrases = [
    "Background of the Transaction",
    "Background of the Merger",
    "Background of the Offer",
    "Background of the Acquisition",
    "Background of the Agreement"
]

# Heading phrases of the section expected to follow
# (passed to extractSection as stopCandidates).
stopPhrases = [
    "Reasons for the Transaction",
    "Reasons for the Merger",
    "Reasons for the Offer",
    "Reasons for the Acquisition"
]

# Extract the background section
# extractSection returns None when no start phrase is found, in which case
# this prints "None".
backgroundSection = extractSection(truncatedText, startPhrases, stopPhrases)
print(backgroundSection)


Background of the Transaction

VMware’s senior management and the VMware board of directors regularly review VMware’s performance, strategy, competitive position, opportunities and prospects in light of current business and economic environments and developments in the software industry and the opportunities and challenges facing participants in the industry.
These reviews have included consideration by VMware’s senior management and the VMware board of directors of potential strategic alternatives, including acquisitions, business combinations and other strategic transactions.
However, VMware’s recent focus, both before and after the VMware spin-off on November 1, 2021, had been on executing its standalone plan.
On April 26, 2022, the assistant to Hock E. Tan, the President and Chief Executive Officer of Broadcom, contacted the assistant to Michael S. Dell, the Chairman of the VMware board of directors and a large VMware 


stockholder, to request a meeting between Mr. Tan and Mr. Del