In [2]:
import re
import spacy
from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

def loadFile(filePath):
    """Return the full contents of the file at ``filePath`` as a string.

    The file is opened in text mode and decoded as UTF-8; it is closed
    again before the function returns.
    """
    with open(filePath, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents

# Preprocess the text
def preProcessText(content):
    """Strip markup from ``content`` and tidy the remaining plain text.

    The input is parsed with BeautifulSoup's "html.parser" and reduced to
    its text. Lines made up solely of digits (with optional surrounding
    whitespace) are then blanked out as page numbers, runs of blank lines
    are collapsed to a single blank line, and the result is trimmed at
    both ends.
    """
    plainText = BeautifulSoup(content, "html.parser").get_text(separator="")

    # A digits-only line is treated as a standalone page number. Because
    # \s also matches newlines, the match may swallow adjacent blank lines.
    withoutPageNumbers = re.sub(
        r'^\s*\d+\s*$', '', plainText, flags=re.MULTILINE
    )

    # Squeeze any run of empty / whitespace-only lines into one blank line.
    collapsed = re.sub(r'\n\s*\n+', '\n\n', withoutPageNumbers)

    return collapsed.strip()

def removeTableOfContents(text):
    """Cut a table-of-contents span out of ``text``.

    The span starts at the first case-insensitive occurrence of
    "Table of Contents" / "Contents" / "Table of Content" and ends just
    before the first following "Introduction", "Chapter N", "Section N",
    "Part N" or "Page N" marker. If either marker is missing, ``text`` is
    returned exactly as given (not stripped).

    After the cut, every remaining "table of contents" mention is deleted
    and the result is stripped of leading/trailing whitespace.
    """
    startMarker = re.search(
        r'(Table of Contents|Contents|TABLE OF CONTENT|CONTENTS)',
        text,
        re.IGNORECASE,
    )
    if startMarker is None:
        # Nothing that looks like a table-of-contents heading.
        return text

    cutFrom = startMarker.start()

    # The end marker is only looked for at or after the heading.
    endMarkerPattern = re.compile(
        r'(Introduction|Chapter \d+|Section \d+|Part \d+|Page \d+)',
        re.IGNORECASE,
    )
    endMarker = endMarkerPattern.search(text, cutFrom)
    if endMarker is None:
        # Heading found but no recognisable end: leave the text alone.
        return text

    cutTo = endMarker.start()
    remainder = text[:cutFrom] + text[cutTo:]

    # First pass removes whole-word mentions; the second pass repeats the
    # match without word-boundary anchors, so it also catches the phrase
    # when it is glued to neighbouring word characters.
    remainder = re.sub(
        r'\btable\s*of\s*contents?\b|\btableofcontents?\b',
        '',
        remainder,
        flags=re.IGNORECASE,
    )
    remainder = re.sub(
        r'(?i)table\s*of\s*contents?|tableofcontents?', '', remainder
    )

    return remainder.strip()

def removeBinaryLikeData(text, maxSymbolRatio=0.05):
    """Drop lines whose share of symbol characters exceeds ``maxSymbolRatio``.

    A "symbol" is any character that is neither a word character (letter,
    digit or underscore) nor whitespace. For each line the ratio
    ``symbols / len(line)`` is computed; lines strictly above the
    threshold are discarded, all others (including empty lines) are kept.

    Args:
        text: The text to filter. It is split with ``str.splitlines``.
        maxSymbolRatio: Highest symbol ratio a line may have and still be
            kept. The default of 0.05 is the original hard-coded value and
            is strict: ordinary punctuated prose such as "Hello, world."
            (2 symbols in 13 characters) is dropped. Pass a larger value
            to keep such lines.

    Returns:
        The surviving lines joined with "\\n". Because of the split/join
        round trip, a trailing newline in ``text`` is not preserved.
    """
    # Compiled once rather than looked up again for every line.
    symbolPattern = re.compile(r'[^\w\s]')

    def isBinaryLike(line):
        # Empty lines have no ratio to compute and are always kept.
        if not line:
            return False
        return len(symbolPattern.findall(line)) / len(line) > maxSymbolRatio

    return "\n".join(
        line for line in text.splitlines() if not isBinaryLike(line)
    )

def extractSection(text, startCandidates, stopCandidates):
    """Return the sentences between a start phrase and a stop phrase.

    ``text`` is split into sentences with the module-level spaCy pipeline
    ``nlp``. The section begins at the first sentence that matches any of
    ``startCandidates`` and ends just before the first later sentence that
    matches any of ``stopCandidates``. A sentence matches a phrase when it
    contains the phrase case-insensitively, or when
    ``fuzz.partial_ratio`` of the two lower-cased strings is above the
    threshold (95 for start phrases, 85 for stop phrases).

    Returns:
        The selected sentences joined with single spaces and stripped, or
        ``None`` when no start sentence or no subsequent stop sentence is
        found.
    """
    sentences = [span.text for span in nlp(text).sents]

    def mentions(loweredSentence, phrases, minScore):
        # Plain substring containment first; the fuzzy score is only
        # consulted when no phrase is contained verbatim.
        if any(phrase.lower() in loweredSentence for phrase in phrases):
            return True
        return any(
            fuzz.partial_ratio(loweredSentence, phrase.lower()) > minScore
            for phrase in phrases
        )

    # Phase 1: locate the first sentence that opens the section.
    first = next(
        (
            pos
            for pos, sentence in enumerate(sentences)
            if mentions(sentence.lower(), startCandidates, 95)
        ),
        None,
    )
    if first is None:
        return None

    # Phase 2: locate the first closing sentence strictly after the opener.
    last = next(
        (
            pos
            for pos in range(first + 1, len(sentences))
            if mentions(sentences[pos].lower(), stopCandidates, 85)
        ),
        None,
    )
    if last is None:
        return None

    # The stop sentence itself is excluded from the result.
    return " ".join(sentences[first:last]).strip()

# Driver: load one sample document, clean it, and print its
# "Background of the ..." section.
# Load and preprocess the file
text = loadFile("./SampleData/sample3.txt")
cleanedText = preProcessText(text)
cleanedText = removeTableOfContents(cleanedText)
cleanedText = removeBinaryLikeData(cleanedText)
# Keep only characters 50,000 through 999,999 of the cleaned text.
# NOTE(review): both offsets are hard-coded; a cleaned document shorter than
# 50,000 characters yields an empty string here, and a section that starts
# before offset 50,000 can no longer be found.
truncatedText = cleanedText[50000:1000000]  # Skip the leading part (intended to drop the table of contents)

# Headings that mark the beginning of the section to extract
# (matched case-insensitively by extractSection).
startPhrases = [
    "Background of the transaction",
    "Background of the merger",
    "Background of the offer",
    "background of the acquisition"
]

# Headings that mark the first sentence after the section; the matching
# sentence itself is not included in the extracted text.
stopPhrases = [
    "Reasons for the Transactions",
    "Reasons for the merger",
    "Reasons for the offer",
    "Reasons for the acquisition"
]

# Extract the background section
# extractSection returns None when no start/stop pair is found, in which
# case this prints "None".
backgroundSection = extractSection(truncatedText, startPhrases, stopPhrases)
print(backgroundSection)


Background of the Merger

   The terms and conditions of the merger agreement and merger are the result
of arm's length negotiations between representatives of Chordiant and
representatives of Prime Response. Set forth below is a summary of the
background of these negotiations.

    Chordiant and Prime Response have been familiar with each other's businesses
for a number of years. Senior executives of the two companies have previously
encountered one another in a variety of business and industry settings.

    Throughout the summer and early fall of 2000, Prime Response engaged in
discussions with a number of companies concerning potential business
combinations or other partnering transactions.

 Officer, met with representatives of Thomas Weisel Partners, to explore
strategic partnership options, including a possible marketing partnership with
some of Thomas Weisel Partners' clients.

 Chairman of the Chordiant board of directors called Mr. Boni to explore
strategic partnership option