In [None]:
import os
import requests
import json
from bs4 import BeautifulSoup
import pandas as pd
import time
from datetime import datetime
import re
import openai

# ================== CONFIG OPTIONS ==================
# User-tunable constants read by the scraping and LLM helpers in this file.

# Lower-case substrings; a line of filing text that contains any of them is
# kept as potentially product-related.
NEW_PRODUCT_KEYWORDS = [
    "new product",
    "launch",
    "introduced",
    "release",
    "service",
    "technology",
    "solution",
    "innovation",
    "epyc",
]

# Upper bound on the number of characters of a document forwarded to the LLM.
MAX_TEXT_FOR_LLM = 4000

# Timeout, in seconds, passed to every HTTP request made to the SEC.
SEC_REQUEST_TIMEOUT = 60

# When True, a truncated copy of each LLM reply is printed for debugging.
PRINT_LLM_RESPONSE = True

##############################################################################
# 1) Read OPENAI_API_KEY from the environment
##############################################################################
# Hand the key (if any) to the openai module. A missing or empty value only
# triggers a warning here; it does not stop the script.
openai.api_key = os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
    print("[WARNING] No OPENAI_API_KEY found. Please export it before running.")

##############################################################################
# 2) Function to call ChatGPT using your provided client solution
##############################################################################
def call_chatgpt_llm(prompt, model="gpt-4o-mini", max_tokens=1000, temperature=0.7):
    """
    Send one user message to the OpenAI chat-completions endpoint.

    A fresh openai.OpenAI client is built on every call from the
    OPENAI_API_KEY environment variable.

    Arguments:
    - prompt       : Text sent as the single "user" message.
    - model        : Model name (default: 'gpt-4o-mini').
    - max_tokens   : Cap on the number of tokens in the reply.
    - temperature  : Sampling temperature forwarded to the API.

    Returns:
    The message content of the first choice, or None if anything raised
    (the exception is printed, never propagated).
    """
    try:
        # Everything stays inside the try so that client construction errors
        # are reported the same way as request errors.
        api_client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
        request_kwargs = {
            "messages": [{"role": "user", "content": prompt}],
            "model": model,
            "max_tokens": max_tokens,
            "temperature": temperature,
        }
        completion = api_client.chat.completions.create(**request_kwargs)
        first_choice = completion.choices[0]
        return first_choice.message.content
    except Exception as err:
        print("[call_chatgpt_llm] OpenAI API error:\n", err)
        return None

##############################################################################
# 3) Functions for SEC data fetching and processing
##############################################################################

def fetch_company_tickers(url="https://www.sec.gov/files/company_tickers.json"):
    """
    Download the SEC ticker file and build a ticker -> CIK lookup table.

    The JSON payload is expected to be an object whose values are records
    carrying "ticker" and "cik_str" fields; records missing either one are
    ignored.

    Returns a dictionary { TICKER_UPPERCASE: 'CIK_as_string' }.
    Raises Exception when the server answers with a non-200 status.
    """
    response = requests.get(
        url,
        headers={"User-Agent": "MyProductScraper/1.0 (test@example.com)"},
        timeout=SEC_REQUEST_TIMEOUT,
    )
    if response.status_code != 200:
        raise Exception("Error fetching ticker JSON data")

    mapping = {}
    for record in response.json().values():
        symbol = record.get("ticker")
        cik_value = record.get("cik_str")
        if not (symbol and cik_value):
            continue
        # Keys are upper-cased so lookups can be case-insensitive on the
        # caller's side; the CIK is stored as text.
        mapping[symbol.upper()] = str(cik_value)
    return mapping

def get_8k_filings(cik, count=20):
    """
    Retrieve up to 'count' recent 8-K filings for a CIK and locate their EX-99.1 exhibits.

    Steps:
    1. Build the EDGAR Atom feed URL for the company's 8-K filings and download it.
    2. For every <entry>, read its <filing-date> and <filing-href> (the filing index page).
    3. Download the index page and pick the first table row whose cell text
       mentions 'ex-99.1'; the first link in that row is taken as the exhibit.

    Arguments:
    - cik   : Company CIK, interpolated into the feed URL as-is.
    - count : Maximum number of feed entries to request (default: 20).

    Returns a list of dicts: [ { 'filing_date':..., 'filing_url':... }, ... ].
    Only filings with an EX-99.1 link are included; 'filing_date' is None when
    the feed entry carries no <filing-date>. Any request failure or non-200
    status is printed and results in that feed/entry being skipped, never raised.
    """
    # Imported locally so the block does not rely on a file-level import.
    from urllib.parse import urljoin

    headers = {"User-Agent": "MyProductScraper/1.0 (test@example.com)"}
    # Build the SEC feed URL specifying 8-K, the desired company, and entry count
    feed_url = (
        f"https://www.sec.gov/cgi-bin/browse-edgar?"
        f"action=getcompany&CIK={cik}&type=8-K&count={count}&output=atom"
    )
    print(f"[get_8k_filings] Checking feed URL => {feed_url}")

    try:
        resp = requests.get(feed_url, headers=headers, timeout=SEC_REQUEST_TIMEOUT)
    except requests.exceptions.RequestException as e:
        print(f"[get_8k_filings] Request error for {feed_url}: {e}")
        return []

    if resp.status_code != 200:
        print(f"[get_8k_filings] Non-200 status ({resp.status_code}). Returning empty.")
        return []

    # The feed is XML, so it is parsed with the XML flavour of the lxml parser.
    soup = BeautifulSoup(resp.content, "lxml-xml")
    entries = soup.find_all("entry")
    if not entries:
        print("[get_8k_filings] No <entry> tags found in Atom feed.")
        return []

    filings = []
    for i, entry in enumerate(entries, start=1):
        filing_date_tag = entry.find("filing-date")
        filing_date = filing_date_tag.text.strip() if filing_date_tag else None

        index_url_tag = entry.find("filing-href")
        index_url = index_url_tag.text.strip() if index_url_tag else None

        print(f"  - Entry #{i}: date={filing_date}, detail page => {index_url}")
        if not index_url:
            continue

        try:
            detail_resp = requests.get(index_url, headers=headers, timeout=SEC_REQUEST_TIMEOUT)
        except requests.exceptions.RequestException as e:
            print(f"    -> detail page error for {index_url}: {e}")
            continue

        if detail_resp.status_code != 200:
            print(f"    -> detail page status={detail_resp.status_code}, skipping.")
            continue

        # Scan the index page's table rows for the one describing EX-99.1.
        detail_soup = BeautifulSoup(detail_resp.content, "html.parser")
        product_doc_url = None
        for row in detail_soup.find_all("tr"):
            columns = row.find_all("td")
            if not columns:
                continue
            if not any("ex-99.1" in col.get_text(strip=True).lower() for col in columns):
                continue
            doc_link = row.find("a", href=True)
            if doc_link:
                # Resolve the href against the index page instead of blindly
                # prefixing the host: root-relative hrefs give the same result
                # as before, while absolute or document-relative hrefs no
                # longer produce a malformed URL.
                product_doc_url = urljoin(index_url, doc_link["href"])
                print(f"[get_8k_filings] Found EX-99.1 doc: {product_doc_url} for filing date={filing_date}")
                break

        if product_doc_url:
            filings.append({
                "filing_date": filing_date,
                "filing_url": product_doc_url
            })

    return filings

def fetch_filing_document(url):
    """
    Fetch the exhibit at 'url' and return its visible text.

    The response body is parsed as HTML and flattened with single spaces
    between text fragments. Returns None (after printing the reason) on a
    read timeout, any other request error, or a non-200 status.
    """
    print(f"[fetch_filing_document] Downloading EX-99.1 => {url}")
    request_headers = {"User-Agent": "MyProductScraper/1.0 (test@example.com)"}

    try:
        response = requests.get(url, headers=request_headers, timeout=SEC_REQUEST_TIMEOUT)
    except requests.exceptions.ReadTimeout:
        # Handled before the generic clause so it gets its own message.
        print("  -> Timed out while reading data from the SEC server. Skipping this filing.")
        return None
    except requests.exceptions.RequestException as err:
        print(f"  -> Request error: {err}")
        return None

    status = response.status_code
    if status != 200:
        print(f"  -> Non-200 status code: {status}. Returning None.")
        return None

    parsed_html = BeautifulSoup(response.content, "html.parser")
    return parsed_html.get_text(separator=" ", strip=True)

def extract_new_product_section(text, keywords=None, max_chars=None):
    """
    Reduce 'text' to the lines that look product-related.

    The text is split into lines; every line containing at least one keyword
    (case-insensitive substring match) is stripped and kept. The kept lines
    are joined with single spaces and truncated to 'max_chars'. If no line
    matches, the first 'max_chars' characters of the original text are
    returned as a fallback.

    Arguments:
    - text      : Document text to filter.
    - keywords  : Iterable of keyword strings; defaults to NEW_PRODUCT_KEYWORDS.
                  Keywords are lower-cased before matching, so their case
                  does not matter.
    - max_chars : Maximum length of the returned string; defaults to
                  MAX_TEXT_FOR_LLM.

    Returns the filtered (or fallback) text as a string.
    """
    if keywords is None:
        keywords = NEW_PRODUCT_KEYWORDS
    if max_chars is None:
        max_chars = MAX_TEXT_FOR_LLM

    # Lines are lower-cased for comparison, so the keywords must be too;
    # otherwise a keyword containing an upper-case letter could never match.
    needles = [kw.lower() for kw in keywords]

    candidate_lines = []
    for line in text.splitlines():
        lowered = line.lower()
        if any(needle in lowered for needle in needles):
            candidate_lines.append(line.strip())

    if candidate_lines:
        return " ".join(candidate_lines)[:max_chars]
    return text[:max_chars]

def parse_llm_json(llm_response):
    """
    Extract the first valid JSON object embedded in an LLM response string.

    Every '{' in the response is tried, left to right, as the start of a JSON
    value; the first position that decodes to a dictionary wins. Text before
    or after the object (explanations, Markdown code fences, stray braces,
    additional objects) is ignored.

    Arguments:
    - llm_response : Raw text returned by the LLM (may be None or empty).

    Returns the decoded dictionary, or None if the response is empty or
    contains no decodable JSON object.
    """
    if not llm_response:
        return None

    decoder = json.JSONDecoder()
    start = llm_response.find("{")
    while start != -1:
        try:
            # raw_decode parses one JSON value beginning at 'start' and
            # tolerates trailing text, unlike json.loads on a regex capture
            # that greedily runs to the last '}' in the response.
            parsed, _end = decoder.raw_decode(llm_response, start)
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict):
            return parsed
        start = llm_response.find("{", start + 1)
    return None

def fill_missing_keys(parsed_data, required_keys):
    """
    Guarantee that 'parsed_data' contains every key listed in 'required_keys'.

    Absent keys are added with the placeholder value 'N/A'; keys that already
    exist keep their current value. The dictionary is modified in place and
    the same object is returned.
    """
    for name in required_keys:
        # setdefault only writes when the key is not there yet.
        parsed_data.setdefault(name, "N/A")
    return parsed_data

def extract_product_info(filing_text, company_name, stock_ticker, max_retries=2):
    """
    Ask the ChatGPT LLM to pull new product/service details out of 'filing_text'.

    The text is first narrowed with extract_new_product_section(), embedded in
    a fixed prompt together with the company metadata, and sent to the model.
    The prompt requests a JSON object with exactly these keys:
      1) company_name
      2) stock_ticker
      3) new_product          ('N/A' when no product is found)
      4) product_description

    NOTE: the prompt also instructs the model to invent a plausible
    description when the filing gives none, so product_description is not
    guaranteed to be quoted from the filing.

    Arguments:
    - filing_text  : Plain text of the filing exhibit.
    - company_name : Company name included in the prompt metadata.
    - stock_ticker : Ticker included in the prompt metadata and in log lines.
    - max_retries  : Number of LLM attempts before giving up (default: 2).

    Returns the parsed dictionary (missing keys filled with 'N/A'), or None
    if every attempt failed or produced unparsable output.
    """
    relevant_text = extract_new_product_section(filing_text)

    # The prompt text is sent verbatim; doubled braces are literal braces.
    llm_prompt = f"""You are an assistant that extracts new product or service details from an SEC 8-K filing.
You must output exactly four keys in valid JSON (and nothing else): company_name, stock_ticker, new_product, product_description.

**New Product or Service**:
- Any newly introduced or discussed product, service, technology, solution, or offering in the text.
- If the text explicitly says \"new\", \"launched\", \"introduced\", \"unveiled\", etc., treat that as new_product.
- If there's a product name (e.g. \"EPYC 9005\", \"Cloud AI Suite 2.0\"), treat it as new_product.
- If no product or service is found, set new_product = \"N/A\".

**Product Description**:
- If the 8-K has a clear description, use it.
- If not, invent a plausible product description based on context.
- If new_product = \"N/A\", set product_description = \"N/A\".

**Output**:
- Return only valid JSON with exactly these four keys:
  {{
    \"company_name\": \"...\",
    \"stock_ticker\": \"...\",
    \"new_product\": \"...\",
    \"product_description\": \"...\"
  }}
- No extra text outside the JSON.

Filing text:
{relevant_text}

Metadata:
Company: {company_name}
Stock Ticker: {stock_ticker}

Return ONLY the JSON."""

    expected_fields = ["company_name", "stock_ticker", "new_product", "product_description"]

    for attempt_no in range(1, max_retries + 1):
        raw_reply = call_chatgpt_llm(llm_prompt, model="gpt-4o-mini", max_tokens=1000, temperature=0.7)

        if raw_reply is not None:
            if PRINT_LLM_RESPONSE:
                print(f"[extract_product_info] LLM raw response (truncated): {raw_reply[:500]}...")

            parsed = parse_llm_json(raw_reply)
            if isinstance(parsed, dict):
                # Success: pad any absent key and stop retrying.
                return fill_missing_keys(parsed, expected_fields)

            failure_msg = f"[extract_product_info] LLM didn't return valid JSON for {stock_ticker}, attempt #{attempt_no}"
        else:
            failure_msg = f"[extract_product_info] ChatGPT call returned None for {stock_ticker}, attempt #{attempt_no}"

        # Both failure modes are reported and followed by a one-second pause.
        print(failure_msg)
        time.sleep(1)

    return None

def _normalize_cik(raw_cik):
    """Return a CSV CIK cell as a digit string without a trailing '.0', or '' if it is missing/blank."""
    if pd.isna(raw_cik):
        return ""
    as_text = str(raw_cik).strip()
    if not as_text:
        return ""
    try:
        # Numeric cells are often read as floats (e.g. 66740.0).
        return str(int(float(as_text)))
    except (TypeError, ValueError, OverflowError):
        # Not a number: keep the text, dropping only a trailing ".0".
        return as_text[:-2] if as_text.endswith(".0") else as_text


def _parse_filing_date(date_str):
    """Parse a 'YYYY-MM-DD' filing date; missing or malformed dates sort last via a 1900-01-01 sentinel."""
    try:
        return datetime.strptime(date_str, "%Y-%m-%d")
    except (TypeError, ValueError):
        return datetime(1900, 1, 1)


def process_companies_from_csv(csv_path="TICKR&CIK.csv", output_csv="multi_sec_8k_product_releases.csv"):
    """
    Main pipeline:
    1) Reads a CSV containing ticker + CIK data ('Symbol', 'Company' and
       'CIK Number' headers are accepted as aliases for 'Ticker',
       'Company Name' and 'CIK').
    2) For each row, fetches up to 20 recent 8-K filings for that CIK; when
       the CSV has no CIK for the row, the SEC ticker map is used instead.
    3) Identifies an EX-99.1 doc from each filing and downloads it, newest
       filing first.
    4) Calls extract_product_info() to parse the doc with ChatGPT.
    5) If a product is found, records the information and moves on to the
       next company.
    6) Writes all results to 'output_csv' at the end (no file is written when
       nothing was found).

    Arguments:
    - csv_path   : Input CSV path; must contain a 'Ticker' (or 'Symbol') column.
    - output_csv : Destination CSV path for the extracted product records.

    Returns None. If the 'Ticker' column is missing, an error is printed and
    the function returns before any network access.
    """
    df = pd.read_csv(csv_path)

    # Map alternative headers onto the canonical ones, unless the canonical
    # column already exists.
    rename_map = {
        "Symbol": "Ticker",
        "Company": "Company Name",
        "CIK Number": "CIK"
    }
    applicable = {
        old_col: new_col
        for old_col, new_col in rename_map.items()
        if old_col in df.columns and new_col not in df.columns
    }
    if applicable:
        df = df.rename(columns=applicable)

    if "Ticker" not in df.columns:
        print("[ERROR] CSV must contain at least a 'Ticker' column.")
        return

    print("[INFO] Fetching fallback ticker->CIK map from SEC (may take a moment).")
    # Fallback mapping for rows whose CIK is missing from the CSV
    big_ticker_map = fetch_company_tickers()

    results = []
    for _, row in df.iterrows():
        # A missing cell must be skipped explicitly: str(NaN) would otherwise
        # become the bogus ticker "NAN".
        raw_ticker = row.get("Ticker")
        if pd.isna(raw_ticker):
            continue
        ticker = str(raw_ticker).strip().upper()
        if not ticker:
            continue

        # Fall back to the ticker when the name column is absent, NaN or blank.
        raw_name = row.get("Company Name")
        company_name = "" if pd.isna(raw_name) else str(raw_name).strip()
        if not company_name:
            company_name = ticker

        # Prefer the CSV's CIK; otherwise use the SEC ticker map.
        cik = _normalize_cik(row.get("CIK")) or big_ticker_map.get(ticker, "")
        if not cik:
            print(f"\n=== {company_name} ({ticker}) => No valid CIK. Skipping.")
            continue

        print(f"\n=== Processing {company_name} ({ticker}), CIK={cik} ===")
        filings = get_8k_filings(cik, count=20)
        if not filings:
            print(f"  -> No EX-99.1 8-K filings found for {ticker} (CIK={cik}).")
            continue

        # Newest filing first
        filings.sort(key=lambda f: _parse_filing_date(f["filing_date"]), reverse=True)

        # Look for a product mention in each filing until one is found
        found_product = False
        for fdict in filings:
            fdate = fdict["filing_date"]
            furl = fdict["filing_url"]
            if not furl:
                continue

            print(f"  -> Checking filing dated {fdate}, doc link => {furl}")
            doc_text = fetch_filing_document(furl)
            if not doc_text:
                print("     -> EX-99.1 text is empty or None, skipping.")
                continue

            info = extract_product_info(doc_text, company_name, ticker)
            if info is None:
                print("     -> ChatGPT call not successful. Skipping this 8-K.")
                continue

            # Treat null, blank and any-case "N/A" as "no product"; the model
            # does not always echo the placeholder exactly.
            product = info.get("new_product")
            no_product = product is None or str(product).strip().upper() in ("", "N/A")
            if not no_product:
                print(f"     => Found product mention => {info['new_product']}")
                results.append(info)
                found_product = True
                break

            print("     -> LLM says 'N/A' => continuing to next 8-K.")
            # Brief sleep to avoid spamming SEC or LLM with too many requests
            time.sleep(1)

        if not found_product:
            print(f"  -> No new product for {company_name} ({ticker}). (Skipping CSV output)")

    # If any products were found, write them to CSV
    if results:
        df_out = pd.DataFrame(results)
        df_out.to_csv(output_csv, index=False)
        print(f"\n[INFO] Found {len(results)} total product mentions. Saved => {output_csv}")
    else:
        print("\n[INFO] No product announcements found for any companies. No CSV output.")

    print("✅ Done! The script has finished processing.")


if __name__ == "__main__":
    # Run the pipeline with default CSV paths
    process_companies_from_csv("TICKR&CIK.csv", "multi_sec_8k_product_releases.csv")

[INFO] Fetching fallback ticker->CIK map from SEC (may take a moment).

=== Processing 3M (MMM), CIK=66740 ===
[get_8k_filings] Checking feed URL => https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=66740&type=8-K&count=20&output=atom
  - Entry #1: date=2025-03-13, detail page => https://www.sec.gov/Archives/edgar/data/66740/000110465925023448/0001104659-25-023448-index.htm
  - Entry #2: date=2025-02-26, detail page => https://www.sec.gov/Archives/edgar/data/66740/000006674025000019/0000066740-25-000019-index.htm
[get_8k_filings] Found EX-99.1 doc: https://www.sec.gov/Archives/edgar/data/66740/000006674025000019/a22625-8xkexhibit.htm for filing date=2025-02-26
  - Entry #3: date=2025-02-21, detail page => https://www.sec.gov/Archives/edgar/data/66740/000006674025000015/0000066740-25-000015-index.htm
  - Entry #4: date=2025-02-10, detail page => https://www.sec.gov/Archives/edgar/data/66740/000006674025000012/0000066740-25-000012-index.htm
[get_8k_filings] Found EX-99.1 doc