# Christopher Robles
# PID: 5685818
# Date: 3/26/2025
# LLM Document Analysis

In [5]:
# Install and import necessary libraries
import requests
from bs4 import BeautifulSoup
import re
import csv
from datetime import datetime
import pandas as pd
import os
import ollama
import json
import ast

In [6]:
def fetch_latest_10k(cik):
    """
    Fetch the latest 10-K filing for a given company using its CIK.

    Args:
    cik (str | int): The CIK (Central Index Key) of the company. It is
        converted to a string and left-padded with zeros to 10 digits.

    Returns:
    tuple: A tuple containing the 10-K URL and the filing date if found, else (None, None).
        Note that (None, None) is a non-empty tuple and therefore truthy, so
        callers must test the URL element rather than the tuple itself.
    """
    cik = str(cik).strip().zfill(10)
    url = f"https://data.sec.gov/submissions/CIK{cik}.json"
    headers = {"User-Agent": "Chris (ch379229@ucf.edu)"}

    try:
        # requests applies no timeout by default; without one a stalled
        # connection would block this call forever.
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        recent_filings = response.json()["filings"]["recent"]

        # The first matching entry is returned.
        # NOTE(review): assumes the API lists filings newest first - confirm.
        for i, form in enumerate(recent_filings["form"]):
            if form == "10-K":
                # The archive path uses the accession number without dashes
                accession_num = recent_filings["accessionNumber"][i].replace("-", "")
                primary_doc = recent_filings["primaryDocument"][i]
                filing_date = recent_filings["filingDate"][i]
                ten_k_url = f"https://www.sec.gov/Archives/edgar/data/{cik}/{accession_num}/{primary_doc}"
                print(f"Latest 10-K URL: {ten_k_url}\n")
                return ten_k_url, filing_date
    except (requests.RequestException, KeyError, IndexError, TypeError, ValueError) as e:
        # Network/HTTP failures and malformed or unexpected JSON payloads
        print(f"Error fetching SEC data: {e}")
        return None, None

    # If no 10-K filing is found, return None
    return None, None

In [7]:
def save_10k_to_file(url, output_file, save_as="html"):
    """
    Save the 10-K document from the given URL to a local file.

    Args:
    url (str): Address of the 10-K document to download.
    output_file (str): Destination path WITHOUT extension; ".html" or ".txt"
        is appended depending on save_as.
    save_as (str): "html" stores the raw response text. Any other value
        stores the plain text extracted from the HTML in a ".txt" file.

    Returns:
    str | None: The path that was written, or None if the download or the
        file write failed.
    """
    try:
        # requests applies no timeout by default; set one so a stalled
        # connection cannot hang the caller.
        response = requests.get(url, headers={"User-Agent": "ch379229@ucf.edu"}, timeout=30)
        response.raise_for_status()

        # Build the content BEFORE opening the file, so a parsing problem
        # cannot leave an empty file behind. Every non-"html" mode is treated
        # as plain text, matching the ".txt" extension it is given.
        if save_as == "html":
            extension = "html"
            content = response.text
        else:
            extension = "txt"
            soup = BeautifulSoup(response.text, 'html.parser')
            content = soup.get_text(separator="\n", strip=True)

        output_path = f"{output_file}.{extension}"
        with open(output_path, 'w', encoding='utf-8') as file:
            file.write(content)

        print(f"10-K document saved to {output_path}")
        return output_path
    except (requests.RequestException, OSError) as e:
        print(f"Error saving 10-K: {e}")
        return None

In [11]:
def extract_focused_section(text):
    """
    Return the slice of the 10-K text that begins at the first mention of an
    innovation/strategy-related keyword.

    Args:
    text (str): Plain text of the filing.

    Returns:
    str | None: Up to 3000 characters starting at the first keyword hit
        (case-insensitive), or None when no keyword occurs in the text.
    """
    topic_regex = r"(innovation|strategic initiative[s]?|research and development|future offering[s]?|pipeline|business strategy)"
    hit = re.search(topic_regex, text, re.IGNORECASE)

    # Guard clause: nothing relevant in this filing
    if hit is None:
        return None

    # The window starts at the keyword itself and runs forward 3000 characters
    begin = hit.start()
    return text[begin:begin + 3000]

In [13]:
def extract_company_and_ticker(text):
    """
    Pull the company name and stock ticker out of the 10-K text with regexes.

    Args:
    text (str): Plain text of the filing.

    Returns:
    tuple: (company_name, stock_ticker). Each element is "Unknown" when its
        pattern does not match.
    """
    def first_group_or_unknown(pattern):
        # Case-insensitive search; yields the stripped first capture group
        found = re.search(pattern, text, re.IGNORECASE)
        if found is None:
            return "Unknown"
        return found.group(1).strip()

    # Company name: the text on the line(s) following "Commission File Number"
    name_regex = r"(?:Commission File Number.*?[\r\n]+)([A-Z\s&]+(?:CORPORATION|INCORPORATED|COMPANY|LIMITED|LLC)?)"
    # Ticker: the first run of letters after "Trading Symbol" and a colon/whitespace
    symbol_regex = r"(?:Trading Symbol.*?[:\s]+)([A-Za-z]+)"

    return first_group_or_unknown(name_regex), first_group_or_unknown(symbol_regex)

In [15]:
def clean_llm_response(response):
    """
    Cleans the LLM response by extracting only the product names and descriptions,
    ignoring the think process or extra explanations.

    Args:
    response (str): Raw text returned by the model.

    Returns:
    list: The whitespace-stripped contents of every **bold** span outside any
        <think>...</think> block, in order of appearance, or
        ["No products found"] when there are none.
    """
    if not response:
        return ["No products found"]

    # Drop the reasoning block so bold text inside it is not reported.
    # NOTE(review): assumes the model wraps its reasoning in <think> tags - confirm.
    visible = re.sub(r"<think>.*?</think>", "", response, flags=re.DOTALL | re.IGNORECASE)

    # Consume the ** delimiters as part of the match. A lookaround-only pattern
    # leaves them available for the next match, which makes the text BETWEEN
    # two bold items ("**A** text **B**" -> " text ") count as an item too.
    bold_spans = re.findall(r"\*\*(.+?)\*\*", visible)
    products = [span.strip() for span in bold_spans if span.strip()]

    # Return a clean list of products or a placeholder if no matches are found
    return products if products else ["No products found"]

In [17]:
def ask_llm_for_field(query):
    """
    Send a single-turn prompt to the local "deepseek-r1:1.5b" model through
    ollama and return the cleaned-up answer.

    Args:
    query (str): The prompt, sent as one user message.

    Returns:
    The value produced by clean_llm_response for the model's reply text.
    """
    chat_messages = [{"role": "user", "content": query}]
    reply = ollama.chat(model="deepseek-r1:1.5b", messages=chat_messages)

    # Only the text of the reply is needed; post-process it before returning
    return clean_llm_response(reply.message.content)

In [19]:
def extract_information_10k(ten_k_url):
    """
    Extract company name, stock ticker, and product details from the 10-K document.

    Relies on get_company_info_by_cik and company_data, which are not defined
    in this section of the file and must be in scope at call time.

    Args:
    ten_k_url (str): URL of the 10-K document to download.

    Returns:
    tuple | None: (company_name, stock_ticker, new_product, product_description).
        company_name/stock_ticker are "Unknown" when no CIK is found in the
        text; new_product/product_description are "N/A" when no focused
        section is found, otherwise the values returned by ask_llm_for_field.
        None is returned when the document cannot be downloaded.
    """
    # Fetch the 10-K document content
    headers = {
        "User-Agent": "Chris Robles (ch379229@ucf.edu)",
        "Referer": "https://www.sec.gov"
    }
    try:
        # requests applies no timeout by default; set one so a stalled
        # connection cannot hang the caller.
        response = requests.get(ten_k_url, headers=headers, timeout=30)
    except requests.RequestException as e:
        # Connection-level failures take the same "return None" path as a
        # non-200 status instead of propagating to the caller.
        print("Error fetching 10-K document:", e)
        return None
    if response.status_code != 200:
        print("Error fetching 10-K document:", response.status_code)
        return None

    # Parse the document content
    soup = BeautifulSoup(response.content, 'html.parser')
    text = soup.get_text()

    # Step 1: Extract CIK number from the text using regex
    # (ten digits, optionally preceded by whitespace, right after "CIK:")
    cik_match = re.search(r"(?<=CIK:)\s*\d{10}", text)
    if cik_match:
        cik_str = cik_match.group().strip()

        # Look up the company name and ticker for this CIK in company_data
        company_name, stock_ticker = get_company_info_by_cik(cik_str, company_data)
    else:
        company_name, stock_ticker = "Unknown", "Unknown"

    # Step 2: Extract broader sections for product mentions
    focused_text = extract_focused_section(text)
    if not focused_text:
        print("No specific product-related discussions found in the filing.")
        return company_name, stock_ticker, "N/A", "N/A"

    # Step 3: Ask the LLM to extract details about product innovations
    new_product_query = f"Extract the names of innovations or future products from the following text:\n{focused_text}"
    new_product = ask_llm_for_field(new_product_query)

    product_description_query = f"Provide a short description (max 180 characters) of the future products from the following text:\n{focused_text}"
    product_description = ask_llm_for_field(product_description_query)

    return company_name, stock_ticker, new_product, product_description

In [21]:
def save_to_csv(data, filename="output.csv"):
    """
    Append rows to a CSV file, writing the header row first when needed.

    Args:
    data (iterable): Rows to append; each row is an iterable of the five
        column values in header order.
    filename (str): Path of the CSV file to append to.
    """
    header = ["Company Name", "Stock Name", "Filing Time", "New Product", "Product Description"]

    # A header is needed for a new file AND for one that exists but is empty
    # (e.g. created by an earlier run that failed before writing anything).
    needs_header = not os.path.isfile(filename) or os.path.getsize(filename) == 0

    with open(filename, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)

        if needs_header:
            writer.writerow(header)

        # Write data rows
        writer.writerows(data)

In [23]:
def main(cik="1174922"):
    """
    Fetch the latest 10-K for one company, extract the product information
    and append it as a row to the CSV output.

    Args:
    cik (str): The CIK (Central Index Key) of the company whose 10-K filing
        is fetched. Defaults to "1174922".
    """
    # previously used cik:
    # "789019", "320193", "1045810", "1326801", "1067983", "1730168", "1744489", "1101239", "1315098",
    # "1609711", "1818874", "1141391", "1108524", "858877", "1321655", "1075531", "1174922"
    ten_k_url, filing_time = fetch_latest_10k(cik)

    # fetch_latest_10k signals failure with (None, None), which is a truthy
    # tuple, so the URL itself has to be checked.
    if not ten_k_url:
        print(f"No 10-K filing found for CIK {cik}.")
        return

    # extract_information_10k returns None when the download fails; check
    # before unpacking so that case does not raise a TypeError.
    extracted = extract_information_10k(ten_k_url)
    if not extracted:
        return

    company_name, stock_name, new_product, product_description = extracted
    if company_name:
        data = [[company_name, stock_name, filing_time, new_product, product_description]]
        save_to_csv(data)

if __name__ == "__main__":
    # Run the pipeline only when this file is executed directly,
    # not when it is imported as a module.
    main()

Latest 10-K URL: https://www.sec.gov/Archives/edgar/data/0001174922/000117492225000039/wynn-20241231.htm



In [29]:
def _parse_product_list(raw):
    """Turn a 'New Product' cell into a list of products, tolerating non-list cells."""
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError):
        # Not a Python literal (e.g. the "N/A" placeholder): keep the cell as
        # a single product instead of aborting the whole conversion.
        return [raw]
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    # A scalar literal (e.g. a quoted string) is one product; iterating it
    # directly would split a string into single characters.
    return [parsed]


def process_csv(input_file, output_file):
    """
    Expand a CSV so that each product gets a row of its own.

    The 'New Product' column of the input is expected to hold the text form
    of a Python list. Each input row is repeated once per list element, with
    'New Product' replaced by that element; all other columns are copied
    unchanged. Rows whose list is empty produce no output rows.

    Args:
    input_file (str): Path of the CSV to read; it must have a header row
        containing a 'New Product' column.
    output_file (str): Path of the CSV to write (overwritten if it exists).
    """
    with open(input_file, 'r', newline='', encoding='utf-8') as infile:
        data = list(csv.DictReader(infile))

    new_data = []
    for row in data:
        for product in _parse_product_list(row['New Product']):
            new_row = row.copy()
            new_row['New Product'] = product
            new_data.append(new_row)

    with open(output_file, 'w', newline='', encoding='utf-8') as outfile:
        fieldnames = ['Company Name', 'Stock Name', 'Filing Time', 'New Product', 'Product Description']
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(new_data)

# Usage: expand output.csv (the default file written by save_to_csv) into
# final_output.csv with one row per product. This call sits at module level,
# outside the __main__ guard, so it runs whenever this code is executed.
process_csv('output.csv', 'final_output.csv')