In [1]:
import requests
import json
import pandas as pd
import time
from datetime import datetime
from bs4 import BeautifulSoup

# Step 1: Load ticker to CIK mapping
url = "https://www.sec.gov/files/company_tickers.json"
headers = {'User-Agent': 'YourName your@email.com'}  # SEC requires user-agent header

response = requests.get(url, headers=headers)
data = response.json()

# Convert to DataFrame for easier lookup
ticker_df = pd.DataFrame.from_dict(data, orient='index')
ticker_df['ticker'] = ticker_df['ticker'].str.upper()

# Create dictionary: Ticker -> CIK
ticker_to_cik = dict(zip(ticker_df['ticker'], ticker_df['cik_str']))

# Preview
print("Sample Mapping (Ticker to CIK):")
print(ticker_df.head())

# Function to get CIK from ticker
def get_cik(ticker):
    ticker = ticker.upper()
    cik = ticker_to_cik[ticker]
    return str(cik).zfill(10) if cik else None

# Test
print("\nCIK for AAPL:", get_cik("AAPL"))

Sample Mapping (Ticker to CIK):
   cik_str ticker           title
0   320193   AAPL      Apple Inc.
1  1045810   NVDA     NVIDIA CORP
2   789019   MSFT  MICROSOFT CORP
3  1018724   AMZN  AMAZON COM INC
4  1652044  GOOGL   Alphabet Inc.

CIK for AAPL: 0000320193


In [2]:


def fetch_8k_entries(cik, count=100, stock_ticker="UNKNOWN"):
    feed_url = f"https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={cik}&type=8-K&count={count}&output=atom"
    headers = {'User-Agent': 'YourName your@email.com'}

    response = requests.get(feed_url, headers=headers)
    soup = BeautifulSoup(response.content, 'xml')

    entries = []
    for entry in soup.find_all('entry'):
        title = entry.title.text
        filing_time = entry.updated.text
        filing_href = entry.link['href']

        # Fallback: If <company-name> tag is missing, use stock ticker
        company_tag = entry.find('company-name')
        company_name = company_tag.text if company_tag else f"CIK-{cik}"

        entries.append({
            'company_name': company_name,
            'stock_name': stock_ticker,
            'filing_time': filing_time,
            'filing_url': filing_href,
            'title': title
        })

    return entries

# Test it on AAPL again
apple_entries = fetch_8k_entries(get_cik("AAPL"), count=5, stock_ticker="AAPL")
pd.DataFrame(apple_entries)


Unnamed: 0,company_name,stock_name,filing_time,filing_url,title
0,CIK-0000320193,AAPL,2025-02-25T16:47:19-05:00,https://www.sec.gov/Archives/edgar/data/320193...,8-K - Current report
1,CIK-0000320193,AAPL,2025-01-30T16:30:23-05:00,https://www.sec.gov/Archives/edgar/data/320193...,8-K - Current report
2,CIK-0000320193,AAPL,2025-01-03T16:30:56-05:00,https://www.sec.gov/Archives/edgar/data/320193...,8-K - Current report
3,CIK-0000320193,AAPL,2024-10-31T16:30:25-04:00,https://www.sec.gov/Archives/edgar/data/320193...,8-K - Current report
4,CIK-0000320193,AAPL,2024-09-10T09:06:34-04:00,https://www.sec.gov/Archives/edgar/data/320193...,8-K - Current report
5,CIK-0000320193,AAPL,2024-08-26T17:20:33-04:00,https://www.sec.gov/Archives/edgar/data/320193...,8-K - Current report
6,CIK-0000320193,AAPL,2024-08-23T16:30:44-04:00,https://www.sec.gov/Archives/edgar/data/320193...,8-K - Current report
7,CIK-0000320193,AAPL,2024-08-01T16:30:26-04:00,https://www.sec.gov/Archives/edgar/data/320193...,8-K - Current report
8,CIK-0000320193,AAPL,2024-05-03T16:35:42-04:00,https://www.sec.gov/Archives/edgar/data/320193...,8-K - Current report
9,CIK-0000320193,AAPL,2024-05-02T16:30:34-04:00,https://www.sec.gov/Archives/edgar/data/320193...,8-K - Current report


In [3]:
def extract_main_document_text(filing_url):
    """
    Downloads the filing index page and retrieves the main filing document text.
    Returns plain text or None if anything fails.
    """
    headers = {'User-Agent': 'YourName your@email.com'}
    
    try:
        index_response = requests.get(filing_url, headers=headers)
        index_soup = BeautifulSoup(index_response.content, 'html.parser')
        
        table_rows = index_soup.select('table.tableFile tr')
        doc_link = None

        for row in table_rows:
            cells = row.find_all('td')
            if len(cells) >= 3:
                link_tag = cells[2].find('a')
                if link_tag and link_tag.has_attr('href'):
                    href = link_tag['href']
                    # Ignore viewer or index pages
                    if (
                        href.endswith(('.htm', '.html', '.txt')) and 
                        '-index' not in href and 
                        'ix?doc=' not in href
                    ):
                        doc_link = "https://www.sec.gov" + href
                        break

        if not doc_link:
            print(f"No valid document link found for {filing_url}")
            return None

        # Fetch the actual document content
        doc_response = requests.get(doc_link, headers=headers)
        doc_soup = BeautifulSoup(doc_response.content, 'html.parser')
        text = doc_soup.get_text(separator=' ', strip=True)

        return text

    except Exception as e:
        print(f"Error extracting filing text: {e}")
        return None

# Test it again
sample_url = apple_entries[1]['filing_url']
sample_text = extract_main_document_text(sample_url)
print(sample_text[:5000])


EX-99.1 2 a8-kex991q1202512282024.htm EX-99.1 Document Exhibit 99.1 Apple reports first quarter results All-time records for total company revenue and EPS Services revenue reaches new all-time high CUPERTINO, CALIFORNIA — Apple ® today announced financial results for its fiscal 2025 first quarter ended December 28, 2024. The Company posted quarterly revenue of $124.3 billion, up 4 percent year over year, and quarterly diluted earnings per share of $2.40, up 10 percent year over year. “Today Apple is reporting our best quarter ever, with revenue of $124.3 billion, up 4 percent from a year ago,” said Tim Cook, Apple’s CEO. “We were thrilled to bring customers our best-ever lineup of products and services during the holiday season. Through the power of Apple silicon, we’re unlocking new possibilities for our users with Apple Intelligence, which makes apps and experiences even better and more personal. And we’re excited that Apple Intelligence will be available in even more languages this 

In [4]:


def query_ollama_api(filing_text, model="mistral"):
    """
    Queries mistral locally via Ollama API and returns extracted product info.
    """
    prompt = f"""
You are analyzing a U.S. SEC Form 8-K filing. Your task is to identify whether the company is announcing a **new product or service**.

A new product can be:
- A physical or digital product
- A new service, software, or feature
- A major upgrade or generation of an existing product line

Only return a product if it is explicitly introduced or announced. Do not include vague mentions or general business updates.

Do NOT return anything related to:
- Financial results
- Executive changes
- Legal matters
- Business strategy without specific product details

Only include externally launched products or services available to customers. Do not include internal presentations or financial categories.
If multiple products are mentioned, return only the most prominent new product.

Output your answer in this strict JSON format:
```json
{{
  "new_product": "Product Name or null",
  "product_description": "Short one-sentence description (max 180 characters), or null"
}}

FILING TEXT:
\"\"\"
{filing_text[:5000]}
\"\"\"
"""

    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
    }

    try:
        response = requests.post("http://localhost:11434/api/generate", json=payload)
        response.raise_for_status()
        output = response.json()["response"]

        # Extract JSON part from LLM output
        start = output.find("{")
        end = output.rfind("}") + 1
        json_part = output[start:end]

        return json.loads(json_part)

    except Exception as e:
        print(f"Model API call failed: {e}")
        return {"new_product": None, "product_description": None}

In [5]:


def clean_null(value):
    return None if value in [None, "null", "Null", "NULL"] else value

def truncate_description(text, limit=180):
    if text and len(text) > limit:
        return text[:limit - 3] + "..."
    return text

def format_timestamp(iso_str):
    try:
        return datetime.fromisoformat(iso_str.replace("Z", "+00:00")).strftime("%Y-%m-%d %H:%M:%S")
    except Exception:
        return iso_str
        
# Processing file, feeding to Ollama model, and formating the response
def process_filings(entries, max_docs=100, output_csv="8k_product_releases.csv"):
    results = []
    for i, entry in enumerate(entries[:max_docs]):
        print(f"\n Processing filing {i+1}/{min(max_docs, len(entries))} for {entry['stock_name']}...")

        filing_text = extract_main_document_text(entry['filing_url'])
        if not filing_text:
            print("Skipping due to missing text.")
            continue

        response = query_ollama_api(filing_text)

        new_product = clean_null(response.get("new_product"))
        product_description = truncate_description(clean_null(response.get("product_description")))

        row = {
            "company_name": entry['company_name'],
            "stock_name": entry['stock_name'],
            "filing_time": format_timestamp(entry['filing_time']),
            "new_product": new_product,
            "product_description": product_description
        }

        print("Result:", row)
        results.append(row)

    # Save to CSV
    df = pd.DataFrame(results)
    df.to_csv(output_csv, index=False)
    print(f"\n Saved to {output_csv}")

    return results


In [6]:
# Run on Apple filings as test
apple_results = process_filings(apple_entries, max_docs=5)



 Processing filing 1/5 for AAPL...
Model API call failed: Expecting value: line 1 column 1 (char 0)
Result: {'company_name': 'CIK-0000320193', 'stock_name': 'AAPL', 'filing_time': '2025-02-25 16:47:19', 'new_product': None, 'product_description': None}

 Processing filing 2/5 for AAPL...
Result: {'company_name': 'CIK-0000320193', 'stock_name': 'AAPL', 'filing_time': '2025-01-30 16:30:23', 'new_product': 'Apple Intelligence', 'product_description': 'A new software feature that makes apps and experiences even better and more personal.'}

 Processing filing 3/5 for AAPL...
Model API call failed: Expecting value: line 1 column 1 (char 0)
Result: {'company_name': 'CIK-0000320193', 'stock_name': 'AAPL', 'filing_time': '2025-01-03 16:30:56', 'new_product': None, 'product_description': None}

 Processing filing 4/5 for AAPL...
Result: {'company_name': 'CIK-0000320193', 'stock_name': 'AAPL', 'filing_time': '2024-10-31 16:30:25', 'new_product': 'iPhone 16 lineup', 'product_description': 'A new 

In [7]:
# List of companies most likely to have new products announced in their 8-ks
high_product_activity_tickers = [
    "AAPL",   # Apple – phones, chips, OS, services
    "MSFT",   # Microsoft – Windows, Surface, Azure, Copilot
    "GOOG",   # Google/Alphabet – Pixel, AI tools, services
    "AMZN",   # Amazon – Echo, AWS, devices
    "TSLA",   # Tesla – vehicles, energy, software
    "META",   # Meta – VR/AR, social platforms
    "NVDA",   # Nvidia – GPUs, AI hardware/software
    "INTC",   # Intel – chips, fabs, new SKUs
    "AMD",    # AMD – CPU/GPU launches
    "NFLX",   # Netflix – content platform updates
    "CRM",    # Salesforce – new cloud/software tools
    "ADBE",   # Adobe – Creative Cloud, Firefly AI, etc.
    "ORCL",   # Oracle – SaaS & AI product suite
    "IBM",    # IBM – AI, WatsonX, infrastructure tools
    "UBER",   # Uber – platform features, mobility services
    "PLTR",   # Palantir – always announcing something mysterious
    "SNOW",   # Snowflake – cloud data platform upgrades
    "ROKU",   # Consumer tech drops
    "COIN",   # Coinbase – new crypto features/tools
]
tickers = [t for t in high_product_activity_tickers if t in ticker_to_cik]
print("Tickers selected:", tickers)

Tickers selected: ['AAPL', 'MSFT', 'GOOG', 'AMZN', 'TSLA', 'META', 'NVDA', 'INTC', 'AMD', 'NFLX', 'CRM', 'ADBE', 'ORCL', 'IBM', 'UBER', 'PLTR', 'SNOW', 'ROKU', 'COIN']


In [8]:
# Iterating through tickers and pulling first 10 8-k filings
all_entries = []

for ticker in tickers:
    cik = get_cik(ticker)
    if not cik:
        print(f" No CIK for {ticker}, skipping.")
        continue

    try:
        entries = fetch_8k_entries(cik, count=10, stock_ticker=ticker)
        entries = entries[:8]
        all_entries.extend(entries)
    except Exception as e:
        print(f" Error fetching for {ticker}: {e}")

In [9]:
# Process all collected 8-Ks
all_results = process_filings(all_entries, max_docs=150, output_csv="multi_ticker_8k_products.csv")



 Processing filing 1/150 for AAPL...
Model API call failed: Expecting value: line 1 column 1 (char 0)
Result: {'company_name': 'CIK-0000320193', 'stock_name': 'AAPL', 'filing_time': '2025-02-25 16:47:19', 'new_product': None, 'product_description': None}

 Processing filing 2/150 for AAPL...
Result: {'company_name': 'CIK-0000320193', 'stock_name': 'AAPL', 'filing_time': '2025-01-30 16:30:23', 'new_product': 'Apple Intelligence', 'product_description': 'A new software feature that makes apps and experiences even better and more personal, powered by Apple silicon.'}

 Processing filing 3/150 for AAPL...
Model API call failed: Expecting value: line 1 column 1 (char 0)
Result: {'company_name': 'CIK-0000320193', 'stock_name': 'AAPL', 'filing_time': '2025-01-03 16:30:56', 'new_product': None, 'product_description': None}

 Processing filing 4/150 for AAPL...
Result: {'company_name': 'CIK-0000320193', 'stock_name': 'AAPL', 'filing_time': '2024-10-31 16:30:25', 'new_product': 'iPhone 16 lineu