In [1]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re
from bs4 import BeautifulSoup

In [2]:
def create_driver(headless=False):
    """Create a Chrome WebDriver configured for this notebook's scraping steps.

    Args:
        headless: When True, Chrome is started without a visible window.
            Defaults to False, which keeps the previous behaviour of opening
            a visible browser.

    Returns:
        The `webdriver.Chrome` instance built from the options below. The
        caller owns it and should call `quit()` on it when finished.
    """
    options = webdriver.ChromeOptions()

    if headless:
        # Previously only available by un-commenting a line in this function.
        options.add_argument("--headless=new")

    # Flags applied in both visible and headless mode.
    chrome_flags = (
        "--window-size=1920,1080",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-popup-blocking",
        "--disable-infobars",
        "--disable-extensions",
    )
    for flag in chrome_flags:
        options.add_argument(flag)

    return webdriver.Chrome(options=options)

def wait_for_element(driver, by, value, timeout=20):
    """Block until an element matching the locator is present in the DOM.

    Args:
        driver: The WebDriver to poll.
        by: Locator strategy (for example `By.XPATH`).
        value: Locator expression interpreted according to `by`.
        timeout: Maximum number of seconds to wait (default 20).

    Returns:
        Whatever `WebDriverWait.until` returns for the presence condition,
        i.e. the located element once it exists.
    """
    locator = (by, value)
    waiter = WebDriverWait(driver, timeout)
    element_is_present = EC.presence_of_element_located(locator)
    return waiter.until(element_is_present)

def wait_for_clickable(driver, by, value, timeout=20):
    """Block until an element matching the locator can be clicked.

    Args:
        driver: The WebDriver to poll.
        by: Locator strategy (for example `By.XPATH`).
        value: Locator expression interpreted according to `by`.
        timeout: Maximum number of seconds to wait (default 20).

    Returns:
        Whatever `WebDriverWait.until` returns for the clickable condition,
        i.e. the element once it is ready to be clicked.
    """
    locator = (by, value)
    waiter = WebDriverWait(driver, timeout)
    element_is_clickable = EC.element_to_be_clickable(locator)
    return waiter.until(element_is_clickable)

def extract_management_discussion(html_content):
    """Extract the "Item 2. Management's Discussion and Analysis ..." section.

    Despite the parameter name, the heading pattern only tolerates whitespace
    between words, so this is meant to be run on extracted page text (for
    example the result of `soup.get_text(separator='\\n')`), not on raw markup
    where tags may sit between the words.

    How the section is chosen:
      * Every occurrence of the heading is considered, not just the first.
        Filings usually repeat the heading in the table of contents, where it
        is followed only by a page number, so the first hit is rarely the
        real section.
      * Each occurrence runs up to the next "Item <n>" / "Part <n>" marker.
        A marker at the start of a line is preferred, because markers inside
        a sentence (e.g. "see Part I, Item 1A of the Form 10-K") are
        cross-references, not headings. When no line-start marker follows,
        the first marker anywhere is used, which matches the previous
        behaviour for text without line breaks.
      * The longest candidate is returned.

    Args:
        html_content: Text to search. `None` or an empty string is treated
            as "nothing to search".

    Returns:
        The section text, starting at the heading and ending just before the
        terminating marker (or at the end of the input when there is none),
        or the string "Section not found." when the heading never appears.
    """
    not_found = "Section not found."
    if not html_content:
        return not_found

    flags = re.IGNORECASE
    heading = re.compile(
        r'Item\s*2\.\s*Management[’\'`]?s\s*Discussion\s*and\s*Analysis\s*of'
        r'\s*Financial\s*Condition\s*and\s*Results\s*of\s*Operations',
        flags,
    )
    marker = r'(Item\s*\d+|Part\s*\d+)'
    marker_anywhere = re.compile(marker, flags)
    # `[^\S\n]*` allows indentation before the marker without crossing a line.
    marker_at_line_start = re.compile(r'^[^\S\n]*' + marker, flags | re.MULTILINE)

    best = None
    for found in heading.finditer(html_content):
        body_start = found.end()
        end = (
            marker_at_line_start.search(html_content, body_start)
            or marker_anywhere.search(html_content, body_start)
        )
        # Stop right before the marker text itself (group 1), as before.
        stop = end.start(1) if end else len(html_content)
        candidate = html_content[found.start():stop]
        if best is None or len(candidate) > len(best):
            best = candidate

    return best if best is not None else not_found

In [3]:
# What to look up: the company's CIK (kept as a string so the leading zeros
# are preserved) and the link text of the report to open later on.
cik = "0000320193"
report_text = "10-Q (Quarterly report)"

# Start the browser and load the EDGAR search page.
driver = create_driver()
driver.get("https://www.sec.gov/edgar/search/")

# Hold here (up to 20 s) until the browser's URL contains "sec.gov".
WebDriverWait(driver, 20).until(EC.url_contains("sec.gov"))
print(f"{cik}: Navigated to SEC EDGAR search page. Current URL: {driver.current_url}")

0000320193: Navigated to SEC EDGAR search page. Current URL: https://www.sec.gov/edgar/search/


In [4]:
# Drive the EDGAR search UI: search by CIK, filter by report category, open the
# report named by `report_text`, then switch to the tab that shows the document.
# The fixed sleeps between steps are kept from the original flow.

# Wait for the input field and enter the CIK number
input_button = wait_for_element(driver, By.XPATH, '//*[@id="entity-short-form"]')
input_button.clear()
input_button.send_keys(cik)
print(f"{cik}: Cleared the main page and entered CIK")

# Click the search button
search_button = wait_for_clickable(driver, By.XPATH, '//*[@id="search"]')
search_button.click()
print(f"{cik}: Clicked the search button")
time.sleep(3)

# Clear the text search bar
keywords_input = wait_for_element(driver, By.XPATH, '//*[@id="keywords"]')
keywords_input.clear()
print(f"{cik}: Text Search Cleared")

# Enter the CIK number in the full form search bar
full_form_input = wait_for_element(driver, By.XPATH, '//*[@id="entity-full-form"]')
full_form_input.send_keys(cik)
print(f"{cik}: Send CIK keys to search bar")

# Click the keyword input to free the cursor for the next steps
keywords_input = wait_for_clickable(driver, By.XPATH, '//*[@id="keywords"]')
keywords_input.click()
print(f"{cik}: Keywords input clicked")
time.sleep(3)

# Interact with the dropdown menu to select report category
category_select = wait_for_clickable(driver, By.XPATH, '//*[@id="category-select"]')
category_select.click()
print(f"{cik}: Clicked on report options")

# Choose "All Reports" option.
# NOTE(review): the option is picked by position (third list item), so this
# breaks if the page reorders the category list - confirm against the live page.
all_reports_option = wait_for_clickable(driver, By.XPATH, '//*[@id="category-type-grp"]/ul/li[3]')
all_reports_option.click()
print(f"{cik}: Selected All Reports option")

# Click the search button again to search for the specified report
search_button = wait_for_clickable(driver, By.XPATH, '//*[@id="search"]')
search_button.click()
print(f"{cik}: Clicked search button for reports")
time.sleep(3)

# Click on the specified report link.
# Every later step depends on this click, so after logging the failure the
# exception is re-raised; otherwise the cell would only fail further down
# with an unrelated timeout.
try:
    report_link = wait_for_clickable(driver, By.XPATH, f"//a[contains(text(), '{report_text}')]")
    report_link.click()
    print(f"{cik}: Clicked on the specified report link")
except Exception as e:
    print(f"No {report_text} found for {cik}: {e}")
    raise

time.sleep(3)

# Remember the current tab so the new one can be identified by its handle
# instead of by its position in `window_handles`.
original_window = driver.current_window_handle

# Open the document in another tab (same fail-fast handling as above).
try:
    open_file_button = wait_for_clickable(driver, By.XPATH, '//*[@id="open-file"]/button')
    open_file_button.click()
    print(f"{cik}: Opening the HTML report...")
except Exception as e:
    print(f"Failed to find or click the open file button: {e}")
    raise

time.sleep(3)

# Switch to the new tab: wait until a second window exists, then select the
# handle that is not the original one.
WebDriverWait(driver, 20).until(EC.number_of_windows_to_be(2))
new_window = next(
    handle for handle in driver.window_handles if handle != original_window
)
driver.switch_to.window(new_window)
print(f"{cik}: Switched to the new tab")

0000320193: Cleared the main page and entered CIK
0000320193: Clicked the search button
0000320193: Text Search Cleared
0000320193: Send CIK keys to search bar
0000320193: Keywords input clicked
0000320193: Clicked on report options
0000320193: Selected All Reports option
0000320193: Clicked search button for reports
0000320193: Clicked on the specified report link
0000320193: Opening the HTML report...
0000320193: Switched to the new tab


In [7]:
# The document tab was opened by a click, so nothing has waited for it to
# load yet. Wait until the browser reports the document as fully loaded
# before reading its markup.
WebDriverWait(driver, 20).until(
    lambda d: d.execute_script("return document.readyState") == "complete"
)

# Parse the rendered page and flatten it to text, one text node per line.
html = driver.page_source
soup = BeautifulSoup(html, 'lxml')
all_text = soup.get_text(separator='\n')

# Print a bounded preview instead of the whole filing; the full text stays
# available in `all_text` for later cells.
preview_chars = 2000
print(all_text[:preview_chars])
if len(all_text) > preview_chars:
    print(f"... [{len(all_text) - preview_chars:,} more characters not shown]")



aapl-20240330
false
2024
Q2
0000320193
--09-28
P1Y
P1Y
P1Y
P1Y
http://fasb.org/us-gaap/2023#MarketableSecuritiesCurrent http://fasb.org/us-gaap/2023#MarketableSecuritiesNoncurrent
http://fasb.org/us-gaap/2023#MarketableSecuritiesCurrent http://fasb.org/us-gaap/2023#MarketableSecuritiesNoncurrent
http://fasb.org/us-gaap/2023#LongTermDebtCurrent http://fasb.org/us-gaap/2023#LongTermDebtNoncurrent
http://fasb.org/us-gaap/2023#LongTermDebtCurrent http://fasb.org/us-gaap/2023#LongTermDebtNoncurrent
0000320193
2023-10-01
2024-03-30
0000320193
us-gaap:CommonStockMember
2023-10-01
2024-03-30
0000320193
aapl:A0.000Notesdue2025Member
2023-10-01
2024-03-30
0000320193
aapl:A0.875NotesDue2025Member
2023-10-01
2024-03-30
0000320193
aapl:A1.625NotesDue2026Member
2023-10-01
2024-03-30
0000320193
aapl:A2.000NotesDue2027Member
2023-10-01
2024-03-30
0000320193
aapl:A1.375NotesDue2029Member
2023-10-01
2024-03-30
0000320193
aapl:A3.050NotesDue2029Member
2023-10-01
2024-03-30
0000320193
aapl:A0.500Notesdu