In [1]:
import re
import spacy
from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz
from sentence_transformers import SentenceTransformer, util
import numpy as np
import torch
import sys

# Run the sentence-embedding model on CUDA when torch reports it as available,
# otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Module-level embedding model; calculate_ngram_probabilities reads this global.
model = SentenceTransformer("all-MiniLM-L6-v2", device=device)

# Load spaCy model
# prefer_gpu() is called before the pipeline is loaded; extractSection uses the
# resulting `nlp` global to split text into sentences.
spacy.prefer_gpu()
nlp = spacy.load("en_core_web_sm")

def loadFile(filePath):
    """Return the entire contents of the UTF-8 text file at ``filePath``."""
    with open(filePath, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents

# Preprocess the text
def preProcessText(content):
    """Strip markup from ``content`` and tidy the remaining plain text.

    The input is parsed with BeautifulSoup's ``html.parser`` and flattened to
    text using an empty separator between text nodes. Lines that contain only
    digits (stand-alone page numbers) are replaced with nothing, runs of blank
    lines are collapsed to a single blank line, and the result is stripped of
    leading and trailing whitespace.

    Parameters:
    - content (str): Raw document text, possibly containing HTML markup.

    Returns:
    - str: The cleaned plain text.
    """
    plain = BeautifulSoup(content, "html.parser").get_text(separator="")

    # A line made up solely of digits (plus optional whitespace) is treated as
    # a page number and dropped.
    plain = re.sub(r'^\s*\d+\s*$', '', plain, flags=re.MULTILINE)

    # Squash any run of blank lines down to exactly one blank line.
    plain = re.sub(r'\n\s*\n+', '\n\n', plain)

    return plain.strip()

def removeTableOfContents(text):
    """Cut a table-of-contents span out of ``text``.

    The span runs from the first case-insensitive match of a contents heading
    ("Table of Contents", "Contents", ...) up to, but not including, the first
    later match of a body marker ("Introduction", "Chapter N", "Section N",
    "Part N" or "Page N"). If either marker is missing, ``text`` is returned
    untouched. Otherwise the span is removed, any remaining
    "table of contents" mentions are deleted, and the result is stripped.

    Parameters:
    - text (str): Document text to clean.

    Returns:
    - str: The text without the detected table-of-contents span.
    """
    heading = re.search(
        r'(Table of Contents|Contents|TABLE OF CONTENT|CONTENTS)',
        text,
        re.IGNORECASE,
    )
    if heading is None:
        # Nothing that looks like a contents heading.
        return text

    cut_from = heading.start()

    body_marker = re.compile(
        r'(Introduction|Chapter \d+|Section \d+|Part \d+|Page \d+)',
        re.IGNORECASE,
    )
    body_start = body_marker.search(text, cut_from)
    if body_start is None:
        # Heading found but no recognisable end; leave the text alone.
        return text

    # Keep everything before the heading and everything from the body marker on.
    remainder = text[:cut_from] + text[body_start.start():]

    # Two passes over leftover references: first whole-word matches, then any
    # occurrence regardless of word boundaries.
    remainder = re.sub(
        r'\btable\s*of\s*contents?\b|\btableofcontents?\b',
        '',
        remainder,
        flags=re.IGNORECASE,
    )
    remainder = re.sub(r'(?i)table\s*of\s*contents?|tableofcontents?', '', remainder)

    return remainder.strip()

def removeBinaryLikeData(text, maxSymbolRatio=0.05):
    """Drop lines whose share of symbol characters is too high.

    A "symbol" is any character that is neither a word character (``\\w``,
    which includes the underscore) nor whitespace. For each non-empty line the
    ratio ``symbols / len(line)`` is computed; lines with a ratio strictly
    greater than ``maxSymbolRatio`` are discarded. Empty lines are always kept.

    Parameters:
    - text (str): Text to filter. It is split with ``str.splitlines``.
    - maxSymbolRatio (float): Highest symbol ratio a line may have and still
      be kept. Defaults to 0.05, the threshold that was previously hard-coded.

    Returns:
    - str: The surviving lines joined with ``"\\n"``. Because the text is split
      and re-joined, original line terminators are normalised to ``"\\n"`` and
      a trailing newline is not preserved.
    """
    # Compile once instead of re-resolving the pattern for every line.
    symbol_pattern = re.compile(r'[^\w\s]')

    kept_lines = [
        line
        for line in text.splitlines()
        # `not line` guards the division and keeps blank lines as-is.
        if not line or len(symbol_pattern.findall(line)) / len(line) <= maxSymbolRatio
    ]
    return "\n".join(kept_lines)

def calculate_ngram_probabilities(sentences, n=2):
    """
    Score how well each sentence fits with the n-1 sentences before it.

    Every sentence is embedded with the module-level ``model``. For sentence
    ``i`` the embeddings of sentences ``i-n+1 .. i-1`` are averaged into a
    context vector, and the cosine similarity between that vector and the
    sentence's own embedding, clamped to [0, 1], is reported as its
    "probability".

    Sentences that do not have a full context window (the first n-1) are
    assigned 1.0. When no comparison is possible at all (empty input, a
    window size of zero or less (n <= 1), or fewer sentences than the window
    needs) the function returns 1.0 for every sentence without running the
    embedding model.

    Parameters:
    - sentences (list): List of sentences in the text.
    - n (int): N-gram size (e.g., 2 for bigram, 3 for trigram).

    Returns:
    - probabilities (list): One float in [0, 1] per input sentence.
    """
    total = len(sentences)
    context_size = n - 1

    # No sentence can be compared against a full, non-empty context window:
    # skip the (expensive) encode call and avoid averaging an empty tensor,
    # which would produce NaN.
    if context_size < 1 or total <= context_size:
        return [1.0] * total

    # Generate embeddings for all sentences
    sentence_embeddings = model.encode(sentences, convert_to_tensor=True)

    # The leading sentences lack enough context; assume they belong.
    probabilities = [1.0] * context_size

    for i in range(context_size, total):
        # Mean of the previous n-1 sentence embeddings acts as the context.
        context_embedding = torch.mean(
            sentence_embeddings[i - context_size:i], dim=0
        )

        similarity = util.cos_sim(
            context_embedding, sentence_embeddings[i]
        ).item()

        # Cosine similarity lies in [-1, 1]; clamp it into [0, 1] and keep the
        # result a float in every branch.
        probabilities.append(max(0.0, min(1.0, similarity)))

    return probabilities

def extractSection(text, startCandidates, stopCandidates, n=2):
    """Return the sentences of ``text`` from the first start-phrase hit onward.

    ``text`` is split into sentences with the module-level spaCy pipeline. The
    first sentence that either contains one of ``startCandidates`` (compared
    case-insensitively) or scores above 95 on ``fuzz.partial_ratio`` against
    one of them marks the start of the section. Every sentence from there to
    the end of the text is returned.

    As a side effect, the n-gram probability of each returned sentence
    (see ``calculate_ngram_probabilities``) is printed to stdout.

    Parameters:
    - text (str): Text to search.
    - startCandidates (list): Phrases that may open the section.
    - stopCandidates (list): Accepted for interface compatibility; this
      implementation does not use it, so the section is never cut short.
    - n (int): N-gram size forwarded to ``calculate_ngram_probabilities``.

    Returns:
    - list of sentence strings, or None when no start phrase is found.
    """
    sentences = [span.text for span in nlp(text).sents]

    def opens_section(sentence):
        # Exact substring test first, fuzzy comparison only as a fallback.
        lowered = sentence.lower()
        if any(sc.lower() in lowered for sc in startCandidates):
            return True
        return any(
            fuzz.partial_ratio(lowered, sc.lower()) > 95 for sc in startCandidates
        )

    # Position of the first matching sentence, or -1 when nothing matches.
    start_index = next(
        (pos for pos, sentence in enumerate(sentences) if opens_section(sentence)),
        -1,
    )
    if start_index == -1:
        return None

    section = sentences[start_index:]

    # Calculate n-gram probabilities and report them sentence by sentence.
    probabilities = calculate_ngram_probabilities(section, n=n)
    for number, (sentence, prob) in enumerate(zip(section, probabilities), start=1):
        print(f"Sentence {number}: '{sentence}'")
        print(f"Probability of same section: {prob:.4f}\n")

    return section



# Load the uploaded file
# Pipeline: read the raw file, strip markup and page numbers, cut out the table
# of contents, then drop symbol-heavy lines.
text = loadFile("./SampleData/sample4.txt")
cleanedText = preProcessText(text)
cleanedText = removeTableOfContents(cleanedText)
cleanedText = removeBinaryLikeData(cleanedText)
# Keeps characters 50,000 up to (not including) 1,000,000; anything in the
# first 50,000 characters is therefore never searched.
truncatedText = cleanedText[50000:1000000]  # Shrink to manageable size for spaCy

# Headings that may open the section of interest (matched case-insensitively
# by extractSection).
startPhrases = [
    "Background of the transaction",
    "Background of the merger",
    "Background of the offer",
    "background of the acquisition"
]

# Headings that would close the section. NOTE: extractSection accepts this list
# but does not use it, so these phrases currently have no effect.
stopPhrases = [
    "Reasons for the Transactions",
    "Reasons for the merger",
    "Reasons for the offer",
    "Reasons for the acquisition"
]

# Extract the background section
# n=6 means each sentence is compared against the mean of the 5 before it.
backgroundSection = extractSection(truncatedText, startPhrases, stopPhrases, n=6)
print("Extracted Section:", backgroundSection)


  from .autonotebook import tqdm as notebook_tqdm


Sentence 1: 'Background of the Transactions VMware’s senior management and the VMware board of directors regularly review VMware’s performance, strategy, competitive position, opportunities and prospects in light of current business and economic environments and developments in the software industry and the opportunities and challenges facing participants in the industry.'
Probability of same section: 1.0000

Sentence 2: 'These reviews have included consideration by VMware’s senior management and the VMware board of directors of potential strategic alternatives, including acquisitions, business combinations and other strategic transactions.'
Probability of same section: 1.0000

Sentence 3: 'However, VMware’s recent focus, both before and after the VMware spin-off on November 1, 2021, had been on executing its standalone plan.'
Probability of same section: 1.0000

Sentence 4: 'On April 26, 2022, the assistant to Hock E. Tan, the President and Chief Executive Officer of Broadcom, contact