In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import time
import json
import sys
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Event
import re
from tqdm import tqdm # Progress bar for visualizing data cleaning progress
from fuzzywuzzy import fuzz
import os
import spacy
import torch

# Load spaCy model
if torch.cuda.is_available():
    spacy.prefer_gpu();

nlp = spacy.load("en_core_web_sm");

# Global termination flag
terminationEvent = Event();

# Defined global variables
csvFile = "../truncatedData.csv";
minDate = datetime(2001, 1, 1);
formTypes = ["PREM14A", "S-4", "SC 14D9", "SC TO-T"];
# mainIndex = 2;
maxNumOfThreads = os.cpu_count(); # Assuming Window system

# Read the CSV file and extract the date & both merging companies (index base)
filedDate = pd.read_csv(csvFile, header=None).iloc[:, 1].tolist();
companyAList = pd.read_csv(csvFile, header=None).iloc[:, 2].tolist();
companyBList = pd.read_csv(csvFile, header=None).iloc[:, 3].tolist();

# Phrases for locating start/end point of the background section
startPhrases = [
    "Background of the transaction",
    "Background of the merger",
    "Background of the offer",
    "Background of the acquisition",
    "Background of the Offer and the Merger"
]

stopPhrases = [
    "Reasons for the Transactions",
    "Reasons for the merger",
    "Reasons for the offer",
    "Reasons for the acquisition"
]

In [3]:
# Acquire the constraint of a given date.
# Pad 2 months backward and forward for constraint.
def getDateConstraints(date):
    originalDate = datetime.strptime(date, "%m/%d/%Y");

    # Define the lower-bound date
    lbMonth = originalDate.month - 2;
    if (lbMonth <= 0): # Case: Wrap to previous year
        lbMonth += 12;
        lbYear = originalDate.year - 1;
    else: # Case: Still on current year
        lbYear = originalDate.year;

    # Construct lower-bound date
    try:
        lowerBoundDate = originalDate.replace(year=lbYear, month=lbMonth);
    except ValueError: # Catch potential error i.e. feb. 30 not existing
        lowerBoundDate = originalDate.replace(year=lbYear, month=lbMonth, day=1);

    # Ensure the new date does not go below the minimum date
    if (lowerBoundDate < minDate):
        lowerBoundDate = minDate;

    
    # Define the upper-bound date
    ubMonth = originalDate.month + 2;
    if (ubMonth > 12): # Case: Wrap to next year
        ubMonth -= 12;
        ubYear = originalDate.year + 1;
    else: # Case: Still on current year
        ubYear = originalDate.year;

    # Construct upper-bound date
    try:
        upperBoundDate = originalDate.replace(year=ubYear, month=ubMonth);
    except ValueError: # Catch potential error i.e. feb. 30 not existing
        upperBoundDate = originalDate.replace(year=ubYear, month=ubMonth + 1, day=1);

    return [lowerBoundDate, upperBoundDate];

In [4]:
# Get the list of CIKs for the merging companies
def getCIKS(searchCompany, pairCompany, dateLB, dateUB, formTypes):
    restructName = searchCompany.replace(" ", "%20");
    
    url = f"https://efts.sec.gov/LATEST/search-index?q={restructName}&dateRange=custom&category=custom&startdt={dateLB}&enddt={dateUB}&forms={formTypes}";

    # Create a request that mimics browser activity
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
        "Referer": "https://www.sec.gov/"
    }

    # Request the search query & acquire the DOM elements
    response = requests.get(url, headers=headers);
    if (response.status_code != 200):
        print("FATAL: getDocumentJson response yielded an error!");
        sys.exit(response.status_code);
    
    data = response.json();
    totalValue = data["hits"]["total"]["value"];

    if (totalValue <= 0):
        return None;
    
    # Formulate the list of entities for CIK lookup
    entityList = [];
    for entities in data["aggregations"]["entity_filter"]["buckets"]:
        entityList.append(entities["key"]);

    # Acquire the CIK for the given company using combined fuzzy matching techniques
    threshold = 90
    filteredMatch = [
        entity for entity in entityList if fuzz.partial_ratio(pairCompany.lower(), entity.lower()) > threshold
    ];
    
    # Extract the CIK from the filtered match
    cikList = [];
    for entity in filteredMatch:
        cikList.append(re.search(r'\(CIK (\d+)\)', entity).group(1));

    return cikList if cikList else None;

In [5]:
# Acquire all the json documents for the given companies with CIK filter
def getCIKDocumentJson(searchCompany, pairCompany, dateLB, dateUB, formTypes):
    # Remove parantheses content from the company names
    searchCompany = re.sub(r'\(.*\)', '', searchCompany).strip();
    pairCompany = re.sub(r'\(.*\)', '', pairCompany).strip();

    # We will try and acquire the cikList for the first company;
    # If the cikList is None, we will try and acquire the cikList for the second company.
    cikList = getCIKS(searchCompany, pairCompany, dateLB, dateUB, formTypes);
    if (cikList == None):
        cikList = getCIKS(pairCompany, searchCompany, dateLB, dateUB, formTypes);
    
    if (cikList == None):
        return None;

    """
        - Fetch data for each CIK concurrently
        - We do not need to verify if the hit returns nothing as if the entity is not found,
            the CIK will not be present in the list.
    """
    restructName = searchCompany.replace(" ", "%20");

    urls = [f"https://efts.sec.gov/LATEST/search-index?q={restructName}&dateRange=custom&category=custom&startdt={dateLB}&enddt={dateUB}&forms={formTypes}&filter_ciks={cik}" for cik in cikList];
    
    # Create a request that mimics browser activity
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
        "Referer": "https://www.sec.gov/"
    }

    # Fetch the json data for each CIK
    if len(urls) == 1: # Case: Single URL; no threads required
        response = requests.get(urls[0], headers=headers);
        if (response.status_code != 200):
            print("FATAL: getDocumentJson response yielded an error!");
            sys.exit(response.status_code);
        
        result = response.json();
        mergedHits = result["hits"]["hits"] if result and "hits" in result and "hits" in result["hits"] else [];
    else: # Case: Multiple URLs; use threads for concurrent fetching
        with ThreadPoolExecutor() as executor:
            results = list(executor.map(lambda url: requests.get(url, headers=headers), urls));
        
        # Merge the results into a single list
        mergedHits = [];
        for response in results:
            if (response.status_code != 200):
                print("FATAL: getDocumentJson response yielded an error!");
                sys.exit(response.status_code);

            result = response.json();
            if result and "hits" in result and "hits" in result["hits"]:
                mergedHits.extend(result["hits"]["hits"]);

    return mergedHits if mergedHits else None;

In [6]:
"""
    - No documents were found associated with the CIKs.
    - We will let fuzzy match determine if the company is present in the document.
        - Return a truncated list of documents for both companies without the cik filter.
    - Basically throwing a dart at the board and hoping it hits the target if no cik filtering is found.
"""
def getDocumentJson(searchCompany, pairCompany, dateLB, dateUB, formTypes):
    # Remove parantheses content from the company names
    searchCompany = re.sub(r'\(.*\)', '', searchCompany).strip();
    pairCompany = re.sub(r'\(.*\)', '', pairCompany).strip();

    restructSearch = searchCompany.replace(" ", "%20");
    restructPair = pairCompany.replace(" ", "%20");

    urls = [
        f"https://efts.sec.gov/LATEST/search-index?q={restructSearch}&dateRange=custom&category=custom&startdt={dateLB}&enddt={dateUB}&forms={formTypes}",
        f"https://efts.sec.gov/LATEST/search-index?q={restructPair}&dateRange=custom&category=custom&startdt={dateLB}&enddt={dateUB}&forms={formTypes}"
    ];
    
    # Create a request that mimics browser activity
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
        "Referer": "https://www.sec.gov/"
    }

    # Fetch the json data for each company using threads for concurrent fetching
    with ThreadPoolExecutor() as executor:
        results = list(executor.map(lambda url: requests.get(url, headers=headers), urls));
    
    # Merge the results into a single list
    mergedHits = [];
    for response in results:
        if (response.status_code != 200):
            print("FATAL: getDocumentJson response yielded an error!");
            sys.exit(response.status_code);

        result = response.json();
        if result and "hits" in result and "hits" in result["hits"]:
            mergedHits.extend(result["hits"]["hits"]);

    return mergedHits if mergedHits else None;

In [7]:
# Formulate the source document links from the search result json
def getSourceLinks(documentJson):
    # Formulate all source document file links
    sourceLinks = [];

    # Iterate through each json object and construct the source document file links
    for document in documentJson:
        try:
            # Get the CIK id or if there is multiple, then acquire the last one
            validatedCik = None;
            ciks = document["_source"]["ciks"];
            if ciks:
                lastCIK = ciks[-1];
                validatedCik = lastCIK;

            # Remove leading zeros from the CIK
            validatedCik.lstrip('0');
        
            # Acquire normal adsh & adsh without the "-" character
            adsh = document["_source"]["adsh"];
            truncatedADSH = document["_source"]["adsh"].replace("-", "");
            
            sourceLinks.append(f"https://www.sec.gov/Archives/edgar/data/{validatedCik}/{truncatedADSH}/{adsh}.txt");
        except KeyError as e:
            print(f"Missing key in document: {e}, result: {document}");
            continue; # Skip the document if there is a missing key; logged for further investigation

    return sourceLinks;

In [8]:
def loadFileFromURL(url):
    # Create a request that mimics browser activity
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9"
    }

    response = requests.get(url, headers=headers);
    if (response.status_code == 200):
        return response.text;
    else:
        print("FATAL: Failed to load document via url.");
        sys.exit(response.status_code);

def preProcessText(content):
    soup = BeautifulSoup(content, "html.parser");
    text = soup.get_text(separator="\n");

    # Remove standalone page numbers
    pageNumPattern = re.compile(r'^\s*\d+\s*$', re.MULTILINE);
    text = re.sub(pageNumPattern, '', text);

    # Remove extra newline characters
    text = re.sub(r'\n\s*\n+', '\n\n', text);

    return text.strip();

def extractFirstWord(companyName):
    clean_name = re.sub(r"\(.*?\)", "", companyName);  # Remove parentheses content
    return clean_name.split()[0];

def checkCompaniesInDocument(url, companyNames):
    if (terminationEvent.is_set()):
        return None, False  # Exit early if thread termination is triggered
    
    rawText = loadFileFromURL(url);
    if (not rawText):  # If we cannot load the document
        return "", False;

    cleanedText = preProcessText(rawText);
    lowerText = cleanedText.lower();

    # Check if both company names are present as whole words
    foundCompanies = [name for name in companyNames if re.search(r'\b' + re.escape(name) + r'\b', lowerText)];
    
    # Return the cleanedText if both company names are found, else False
    return cleanedText, len(foundCompanies) == len(companyNames);

In [9]:
def removeTableOfContents(text):
    # Regular expression patterns for table of contents
    tocStartPattern = re.compile(r'(Table of Contents|Contents|TABLE OF CONTENT|CONTENTS)', re.IGNORECASE);
    tocEndPattern = re.compile(r'(Introduction|Chapter \d+|Section \d+|Part \d+|Page \d+)', re.IGNORECASE);

    # Find the start of the table of contents
    tocStartMatch = tocStartPattern.search(text);
    if (not tocStartMatch):  # No table of contents found
        return text;

    tocStartIndex = tocStartMatch.start();

    # Find the end of the table of contents
    tocEndMatch = tocEndPattern.search(text, tocStartIndex);
    if (not tocEndMatch):  # No end of table of contents found
        return text;

    tocEndIndex = tocEndMatch.start();

    # Remove the table of contents section
    cleanedText = text[:tocStartIndex] + text[tocEndIndex:];

    # Remove any remaining table of contents references
    cleanedText = re.sub(r'\btable\s*of\s*contents?\b|\btableofcontents?\b', '', cleanedText, flags=re.IGNORECASE);
    cleanedText = re.sub(r'(?i)table\s*of\s*contents?|tableofcontents?', '', cleanedText);

    return cleanedText.strip();

In [10]:
def extractSection(text, startCandidates):
    doc = nlp(text);
    sentences = [sent.text for sent in doc.sents];

    startIndex = -1;

    # Locate the start of the desired background section
    for i, sentence in enumerate(sentences):
        match = next(
            (sc for sc in startCandidates if sc.lower() in sentence.lower() or fuzz.partial_ratio(sentence.lower(), sc.lower()) > 95),
            None
        );
        if match:
            startIndex = i;
            break;
    
    # No "Background" section found
    if (startIndex == -1 or match is None):
        return None;
    
    section = sentences[startIndex:];

    return "\n".join(section);

In [11]:
def get_current_timestamp():
    return datetime.now().strftime("%Y-%m-%d %H:%M:%S");

def logMessage(message):
    with open(f"../logs.txt", "a", encoding="utf-8") as file:
        file.write(message + "\n");

    print(message);

In [12]:
for mainIndex in range(11,12):
    print("Processing index: ", mainIndex, "; Companies: ", companyAList[mainIndex], " & ", companyBList[mainIndex]);

    constraintDates = getDateConstraints(filedDate[mainIndex]);
    lbDate, ubDate = constraintDates;
    restructLB = f"{lbDate.year}-{lbDate.month:02}-{lbDate.day:02}";
    restructUB = f"{ubDate.year}-{ubDate.month:02}-{ubDate.day:02}";
    restructForms = "%2C".join(formTypes).replace(" ", "%20");

    # Find the documents with CIK filtering
    results = getCIKDocumentJson(companyAList[mainIndex], companyBList[mainIndex], restructLB, restructUB, restructForms);
    if (results == None): # Acquire all documents within our guess
        results = getDocumentJson(companyAList[mainIndex], companyBList[mainIndex], restructLB, restructUB, restructForms);

    # No documents found for our 2 companies
    if (results == None):
        logMessage(f"[{get_current_timestamp()}] [-] No documents found for: {companyAList[mainIndex]} & {companyBList[mainIndex]}");
        continue;

    # Extract the source document links
    sourceLinks = getSourceLinks(results);

    """
        - Here, we will verify that both company names are present in the document.
            - Reduces the amount of documents needed to be processed with NLP.
        - Next, if both company names are present, we will try and locate the "Background of the Merger"
        chronological timeline.
    """
    companyNames = [companyAList[mainIndex], companyBList[mainIndex]];
    companyNames = [extractFirstWord(name).lower() for name in companyNames];

    # Locate the documents with both company names present.
    foundData = False;
    terminationEvent.clear();
    with ThreadPoolExecutor(max_workers=maxNumOfThreads) as executor:
        futures = {executor.submit(checkCompaniesInDocument, url, companyNames): url for url in sourceLinks};

        for future in as_completed(futures):
            if (terminationEvent.is_set()):  # If background section is found already
                break;

            try:
                cleanedText, bothFound = future.result();
                print(bothFound);
                if bothFound:
                    # Additional preprocess cleaning
                    cleanedText = removeTableOfContents(cleanedText);
                    truncatedText = cleanedText[50000:1000000];  # Shrink to manageable size for spaCy
                    print(truncatedText)

                    backgroundSection = extractSection(truncatedText, startPhrases, stopPhrases);
                    if backgroundSection is None:
                        continue;
                    
                    # print(futures[future]);
                    # print(backgroundSection);

                    # Write the data to a file
                    foundData = True;
                    formatDocName = f"{companyAList[mainIndex].replace(' ', '_')}_&_{companyBList[mainIndex].replace(' ', '_')}";
                    with open(f"../DataSet/{formatDocName}.txt", "w", encoding="utf-8") as file:
                        file.write(f"URL: {futures[future]}\n\n");
                        file.write(backgroundSection);
                    
                    logMessage(f"[{get_current_timestamp()}] [+] Successfully created document for: {companyAList[mainIndex]} & {companyBList[mainIndex]}");

                    # Signal termination and exit
                    terminationEvent.set();
                    break;
            except Exception as e:
                url = futures[future];
                logMessage(f"[{get_current_timestamp()}] [-] Error processing {url}: {e}");

    if not foundData:
        logMessage(f"[{get_current_timestamp()}] [-] No background section found for index {mainIndex}: {companyAList[mainIndex]} & {companyBList[mainIndex]};");
        logMessage(f"\tDumping its document links:");
        for url in sourceLinks:
            logMessage(f"\t\t{url}");



Processing index:  11 ; Companies:  Dallas-Semiconductor Corp  &  Maxim Integrated Products Inc
False
False
[2025-01-30 19:52:03] [-] No background section found for index 11: Dallas-Semiconductor Corp & Maxim Integrated Products Inc;
	Dumping its document links:
		https://www.sec.gov/Archives/edgar/data/0000743316/000109581101500299/0001095811-01-500299.txt
		https://www.sec.gov/Archives/edgar/data/0000743316/000109581101500103/0001095811-01-500103.txt
