In [1]:
import calendar
import concurrent.futures
import json
import sys
import time
from datetime import datetime
from urllib.parse import quote

import pandas as pd
import requests
from bs4 import BeautifulSoup
# from tqdm import tqdm # Progress bar for visualizing data cleaning progress

# Defined global variables
csvFile = "./truncatedData.csv"  # input CSV; it is read below without a header row
cikFile = "./cik-lookup-data.txt"
minDate = datetime(2001, 1, 1)  # earliest date getDateConstraints may return as a lower bound
formTypes = ["PREM14A", "S-4", "SC 14D9", "SC TO-T"]  # form types sent in the search's "forms" filter

# Parse the CSV a single time (previously it was re-read for every column) and
# pull out, by zero-based column position: 1 = filing date, 2 and 3 = the two
# merging companies.
mergerData = pd.read_csv(csvFile, header=None)
filedDate = mergerData.iloc[:, 1].tolist()
companyAList = mergerData.iloc[:, 2].tolist()
companyBList = mergerData.iloc[:, 3].tolist()

In [2]:
# Acquire the constraint of a given date.
# Pad 2 months backward and forward for constraint.
def _shiftMonths(date, months):
    """Return `date` moved by `months` calendar months (negative = backward).

    If the target month has no such day (e.g. the 31st shifted into February),
    the day is clamped to the last day of the target month.
    """
    # Work on a zero-based running month count so year wrap-around is plain
    # integer arithmetic in both directions.
    monthIndex = date.year * 12 + (date.month - 1) + months
    year, zeroBasedMonth = divmod(monthIndex, 12)
    month = zeroBasedMonth + 1
    lastDay = calendar.monthrange(year, month)[1]
    return date.replace(year=year, month=month, day=min(date.day, lastDay))


def getDateConstraints(date, padMonths=2, floorDate=None):
    """Build the [lowerBound, upperBound] search window around a filing date.

    Parameters:
        date: the date as a string in "%m/%d/%Y" format (e.g. "06/15/2010").
        padMonths: number of calendar months to pad on each side (default 2).
        floorDate: earliest datetime allowed for the lower bound; when None,
            the module-level `minDate` is used.

    Returns:
        A two-element list [lowerBoundDate, upperBoundDate] of datetime objects.

    Raises:
        ValueError: if `date` does not match "%m/%d/%Y".

    Note: when the original day does not exist in a shifted month, the bound is
    clamped to that month's last day. Falling back to day 1 instead would cut
    the forward pad short by almost a month (12/31 -> 02/01).
    """
    originalDate = datetime.strptime(date, "%m/%d/%Y")
    if floorDate is None:
        floorDate = minDate

    # Lower bound: shift backward, but never below the floor date.
    lowerBoundDate = _shiftMonths(originalDate, -padMonths)
    if lowerBoundDate < floorDate:
        lowerBoundDate = floorDate

    # Upper bound: shift forward; no cap is applied on this side.
    upperBoundDate = _shiftMonths(originalDate, padMonths)

    return [lowerBoundDate, upperBoundDate]

In [3]:
# Acquire the CIK for further filtering in document look-up
def getCIKs(companyName, timeout=30):
    """Look up the CIK strings the SEC CIK-lookup page lists for a company name.

    Parameters:
        companyName: value posted as the "company" form field.
        timeout: seconds to wait for the server before requests raises a
            timeout error (default 30). Without it a stalled connection would
            block the notebook indefinitely.

    Returns:
        A list with the stripped text of every <a> element found inside the
        second <pre> element of the response page (may be empty).

    Exits (SystemExit via sys.exit):
        with the HTTP status code if the response is not 200;
        with 1 if the page contains fewer than two <pre> elements.
    """
    # Create a request that mimics browser activity
    requestHeaders = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
        "Referer": "https://www.sec.gov/search-filings/cik-lookup"
    }
    url = "https://www.sec.gov/cgi-bin/cik_lookup"

    # Request the search query & acquire the DOM elements
    response = requests.post(
        url,
        headers=requestHeaders,
        data={"company": companyName},
        timeout=timeout,
    )
    if response.status_code != 200:
        # Report before exiting, matching getDocumentJson's error handling.
        print("FATAL: getCIKs response yielded an error!")
        sys.exit(response.status_code)

    soup = BeautifulSoup(response.text, "html.parser")

    # The anchors are read from the second <pre> on the page; fewer than two
    # <pre> elements is treated as "no CIK found".
    preTags = soup.find_all("pre")
    if len(preTags) < 2:
        print("ERROR: CIK not found!")
        sys.exit(1)

    return [anchor.text.strip() for anchor in preTags[1].find_all("a")]

In [4]:
# Reconstruct the constraint date, company name, & form type for url parsing
constraintDates = getDateConstraints(filedDate[0])
lbDate = constraintDates[0]
ubDate = constraintDates[1]
# Dates go into the query string as YYYY-MM-DD.
restructLB = f"{lbDate:%Y-%m-%d}"
restructUB = f"{ubDate:%Y-%m-%d}"

# Percent-encode every reserved character, not only spaces: a company name
# containing "&", "#", "+" or "?" would otherwise break or truncate the query
# string it is interpolated into. For formTypes this yields the same text as
# before (spaces -> %20, separating commas -> %2C).
restructName = quote(companyAList[0], safe="")
restructForms = quote(",".join(formTypes), safe="")

# Create a request that mimics browser activity
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Referer": "https://www.sec.gov/"
}

In [5]:
# Acquire all the json documents for the given company
def getDocumentJson(cik, timeout=30):
    """Query the EDGAR full-text search endpoint for one CIK and return its hits.

    The query is assembled from the module-level restructName, restructLB,
    restructUB and restructForms values and sent with the module-level
    `headers`.

    Parameters:
        cik: CIK value placed in the "filter_ciks" query parameter.
        timeout: seconds to wait for the server before requests raises a
            timeout error (default 30). This function runs inside worker
            threads, so one stalled request would otherwise hang the pool.

    Returns:
        The list at data["hits"]["hits"] when the reported total is greater
        than zero, otherwise None.

    Exits (SystemExit via sys.exit):
        with the HTTP status code if the response is not 200.
    """
    url = f"https://efts.sec.gov/LATEST/search-index?q={restructName}&dateRange=custom&category=custom&startdt={restructLB}&enddt={restructUB}&forms={restructForms}&filter_ciks={cik}"

    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        print("FATAL: getDocumentJson response yielded an error!")
        sys.exit(response.status_code)

    hits = response.json()["hits"]
    if hits["total"]["value"] > 0:
        return hits["hits"]
    return None

# Fan the per-CIK look-ups out over a thread pool so the HTTPS requests overlap;
# `results` keeps one entry per CIK, in the order getCIKs returned them.
with concurrent.futures.ThreadPoolExecutor() as pool:
    results = [hits for hits in pool.map(getDocumentJson, getCIKs(companyBList[0]))]

# Stop when not a single CIK produced any documents
if not any(hits is not None for hits in results):
    print("ERROR: No document Json found!")
    sys.exit(1)

In [6]:
# Formulate all source document file links
def _buildSourceLink(hit):
    """Return the EDGAR .txt archive URL for one search hit.

    Returns None when the hit's "ciks" list is empty.
    Raises KeyError when "_source", "ciks" or "adsh" is missing from the hit.
    """
    source = hit["_source"]
    ciks = source["ciks"]
    if not ciks:
        return None

    # If there are multiple CIKs, use the last one. Strings are immutable, so
    # the stripped value must be kept; calling lstrip alone changes nothing.
    validatedCik = ciks[-1].lstrip("0")

    # The URL needs the adsh both as-is and without the "-" characters
    adsh = source["adsh"]
    truncatedADSH = adsh.replace("-", "")

    return f"https://www.sec.gov/Archives/edgar/data/{validatedCik}/{truncatedADSH}/{adsh}.txt"


sourceLinks = []

# `results` holds one entry per CIK, and an entry is None when that CIK had no
# hits. Walk every non-empty entry: reading only results[0] would crash when
# the first CIK came back empty and would drop the hits of all other CIKs.
for hits in results:
    if hits is None:
        continue
    for result in hits:
        try:
            link = _buildSourceLink(result)
        except KeyError as e:
            print(f"Missing key in result: {e}, result: {result}")
            continue

        if link is None:
            print(f"No CIK listed in result: {result}")
        elif link not in sourceLinks:
            # Skip repeats: the same filing may be returned for more than one CIK.
            sourceLinks.append(link)