In [1]:
import os
import openai
from sec_api import QueryApi
import boto3
from pdfminer.high_level import extract_text
from datetime import datetime
import uuid
import requests

from models.models import SECFiling

from dotenv import load_dotenv
load_dotenv()


sec_client = QueryApi(api_key=os.environ['SEC_API_KEY'])
openai.api_key = os.environ['OPENAI_API_KEY']

The company https://sec-api.io/ offers an API for retrieving SEC filings, including the files.

They have their own Python client for interacting with thier API. It's the QueryApi class.
There are a few examples of using it here in their documentation: https://sec-api.io/docs/query-api/python-example

In [3]:
def write_to_s3(bucket_name, file_name, object_name):
    s3_client = boto3.client('s3')
    s3_client.upload_file(file_name, bucket_name, object_name)
    s3_url = f'https://{bucket_name}.s3.amazonaws.com/{object_name}'
    return s3_url

def write_to_filings_table(sec_filing: SECFiling):
    """
    Write metadata about a document to a documents collection in DynamoDB
    Accepts an SECFiling object and writes it to the documents table
    """
    dynamodb = boto3.resource('dynamodb')
    table = dynamodb.Table('filings')

    item = sec_filing.dict()
    for key, value in item.items():
        if isinstance(value, datetime):
            item[key] = value.strftime('%Y-%m-%d %H:%M:%S')
        elif not isinstance(value, str):
            item[key] = str(value)

    table.put_item(Item=item)

def get_text_from_pdf(file_path):
    text = extract_text(file_path)
    return text

def get_filings(ticker, start_date, end_date, form_type, limit=10000):
    """
    state_date and end_date filter on the filedAt field. Should be in the format "YYYY-MM-DD"
    form_type is the type of filing to retrieve. For example, "10-Q" or "10-K" https://sec-api.io/list-of-sec-filing-types
    """
    # Retrieve filings metadata
    # iterate through the filings and download the filing as a PDF
    # use get_text_from_pdf to extract the text from the PDF
    # upload the PDF to S3
    # create a new SECFiling object and write it to the database

    query = {
        "query": { "query_string": { 
            "query": f"ticker:{ticker} AND filedAt:[{start_date} TO {end_date}] AND formType:\"{form_type}\"",
            "time_zone": "America/New_York"
        } },
        "from": "0",
        "size": f"{limit}",
        "sort": [{ "filedAt": { "order": "desc" } }]
    }

    headers = {'Authorization': os.environ['SEC_API_KEY']}
    filings = sec_client.get_filings(query)
    for filing in filings['filings']:
        cik = filing['cik']
        accession_number = filing['accessionNo'].replace('-', '')
        filename = filing['linkToTxt'].split('/')[-1]

        # # iterate through the documentFormatFiles and identify the document that represents the 10-Q
        # 'type': '10-Q'
        # and we want the documentUrl field from this document. This ithe the url that should be sent to the pdf generator.

        for doc in filing['documentFormatFiles']:
            if doc['type'] == form_type:
                doc_url = doc['documentUrl']
                break

        # download the filing as a PDF instead of a text file
        url = f'https://api.sec-api.io/filing-reader/?token={os.environ["SEC_API_KEY"]}&type=pdf&url={doc_url}'
        response = requests.get(url)
        filename = f'{filing["accessionNo"]}.pdf'
        with open(filename, 'wb') as f:
            f.write(response.content)

        object_name = f'{filing["ticker"]}/{filing["formType"]}/{filing["filedAt"]}/{filename}'
        s3_url = write_to_s3(bucket_name='dto-sec-filings', file_name=filename, object_name=object_name)
        os.remove(filename)
        sec_filing = SECFiling(
            id = str(uuid.uuid4()),
            accessionNo=filing['accessionNo'],
            cik=filing['cik'],
            ticker=filing['ticker'],
            companyName=filing['companyName'],
            companyNameLong=filing['companyNameLong'],
            formType=filing['formType'],
            periodOfReport=filing['periodOfReport'],
            description=filing['description'],
            filedAt=filing['filedAt'],
            linkToTxt=filing['linkToTxt'],
            linkToHtml=filing['linkToHtml'],
            linkToXbrl=filing['linkToXbrl'],
            linkToFilingDetails=filing['linkToFilingDetails'],
            s3Url=s3_url,
            secDocumentUrl=doc_url
        )
        write_to_filings_table(sec_filing)

In [12]:
get_filings('IBM', '2020-01-01', '2021-12-31', '10-Q', 1)

doc_url https://www.sec.gov/ix?doc=/Archives/edgar/data/51143/000155837021014734/ibm-20210930x10q.htm
200


In [None]:
# iterate through the documentFormatFiles and identify the document that represents the 10-Q
# 'type': '10-Q'
# and we want the documentUrl field from this document. This ithe the url that should be sent to the pdf generator.

In [6]:
# get the last 5 10-K filings for INTC
get_filings('INTC', '2017-01-01', '2024-12-31', '10-K', 5)


doc_url https://www.sec.gov/ix?doc=/Archives/edgar/data/50863/000005086324000010/intc-20231230.htm
200
doc_url https://www.sec.gov/ix?doc=/Archives/edgar/data/50863/000005086323000006/intc-20221231.htm
200
doc_url https://www.sec.gov/ix?doc=/Archives/edgar/data/50863/000005086322000007/intc-20211225.htm
200
doc_url https://www.sec.gov/ix?doc=/Archives/edgar/data/50863/000005086321000010/intc-20201226.htm
200
doc_url https://www.sec.gov/ix?doc=/Archives/edgar/data/50863/000005086320000011/a12282019q4-10kdocument.htm
200


In [7]:
# get the last 5 10-K filings for XOM
get_filings('XOM', '2017-01-01', '2024-12-31', '10-K', 1)

200


In [13]:
# get the last 5 10-K filings for Visa
get_filings('V', '2017-01-01', '2024-12-31', '10-K', 5)

doc_url https://www.sec.gov/ix?doc=/Archives/edgar/data/1403161/000140316123000099/v-20230930.htm
200
doc_url https://www.sec.gov/ix?doc=/Archives/edgar/data/1403161/000140316122000081/v-20220930.htm
200
doc_url https://www.sec.gov/ix?doc=/Archives/edgar/data/1403161/000140316121000060/v-20210930.htm
200
doc_url https://www.sec.gov/ix?doc=/Archives/edgar/data/1403161/000140316120000070/v-20200930.htm
200
doc_url https://www.sec.gov/ix?doc=/Archives/edgar/data/1403161/000140316119000050/v093019form10k.htm
200


In [11]:
# retrieve all the records from dynamodb

dynamodb = boto3.resource('dynamodb')
table = dynamodb.Table('filings')
response = table.scan()
items = response['Items']

In [None]:
# delete everything in the filings table
response = table.scan()
items = response['Items']
for item in items:
    table.delete_item(
        Key={
            'id': item['id']
        }
    )

In [35]:
# delete all rows from the filings table where 'objectKey' is not present
response = table.scan()
items = response['Items']
for item in items:
    if 'objectKey' not in item:
        table.delete_item(
            Key={
                'id': item['id']
            }
        )

In [7]:
import boto3

def retrieve_objects_from_s3(bucket_name, prefix=None):
    # Create an S3 client
    s3 = boto3.client('s3')

    # Specify optional parameters for list_objects_v2
    params = {'Bucket': bucket_name}
    if prefix:
        params['Prefix'] = prefix

    # List objects in the S3 bucket
    response = s3.list_objects_v2(**params)

    # Retrieve object keys from the response
    object_keys = [obj['Key'] for obj in response.get('Contents', [])]

    return object_keys

In [9]:
bucket_name = '<removed bucket name>'
prefix_filter = 'KO/'

objects = retrieve_objects_from_s3(bucket_name, prefix=prefix_filter)
for obj in objects:
    print(obj)

KO/10-K/2024-02-20T13:21:24-05:00/0000021344-24-000009.pdf
KO/10-Q/2023-07-27T11:49:13-04:00/0000021344-23-000048.pdf
KO/10-Q/2023-10-24T14:46:01-04:00/0000021344-23-000060.pdf


In [10]:
# Example usage:
prefix_filter = 'KO/10-Q/'

objects = retrieve_objects_from_s3(bucket_name, prefix=prefix_filter)
for obj in objects:
    print(obj)

KO/10-Q/2023-07-27T11:49:13-04:00/0000021344-23-000048.pdf
KO/10-Q/2023-10-24T14:46:01-04:00/0000021344-23-000060.pdf


In [15]:
import boto3

def generate_presigned_url(bucket_name, object_key, expiration=3600):
    """
    Generate a presigned URL for downloading a file from S3.

    Parameters:
        - bucket_name (str): The name of the S3 bucket.
        - object_key (str): The key of the object (file) in the S3 bucket.
        - expiration (int): The time, in seconds, until the presigned URL expires. Default is 3600 seconds (1 hour).

    Returns:
        str: The presigned URL for downloading the file.
    """
    s3 = boto3.client('s3')
    
    # Generate a presigned URL for the S3 object
    presigned_url = s3.generate_presigned_url(
        'get_object',
        Params={
            'Bucket': bucket_name,
            'Key': object_key
        },
        ExpiresIn=expiration
    )

    return presigned_url

In [None]:
object_key = 'KO/10-Q/2023-07-27T11:49:13-04:00/0000021344-23-000048.pdf'
presigned_url = generate_presigned_url(bucket_name, object_key)
print(presigned_url)

In [42]:
dynamodb = boto3.resource('dynamodb', region_name='us-east-1')
table = dynamodb.Table('filings')

# get a single item randomly and print out the id

response = table.scan()
items = response['Items']
for item in items:
    print(item['ticker'])
    print(item['id'])

GOOG
ab95b122-d0d9-46ca-addc-5113ab5e20fa
AXP
b5fd3fb0-7318-41da-81e0-7b0b7d5ba730
CMG
812982ec-5bc7-4289-9353-e2e89ee65045
GOOGL
70124223-5ba4-45f2-9511-61a9c7b2d285
GOOG
81d9e6f2-e49e-4346-b2a1-a797788ef2f7
ALL
6e78720d-948d-41ad-b533-e1c4ef1b13bd
CMG
30fd3c32-d21e-448c-b01b-8e2a280acd04
GOOGL
35482695-7746-42e1-a6a9-0c4c93620442
CMG
7f56541b-4af3-4e6c-8a7e-cdcb48d5364e
CMG
97778461-3eed-4248-9f0d-db7de5405bf4
CMG
b983ecd9-7811-4ae5-9bfe-47fb2f0553a8
CMG
b5a40fe1-e18e-4d8b-a32a-f23f872ae85d
GOOG
7855da6b-8349-44fd-a948-50612cde7d3f
GOOG
62837236-8cff-4194-bb7c-594afb2e01a7
CMG
ed346837-455d-4557-8c52-393e701ee31e
CMG
7ea81351-c630-42de-b829-9aac97faa44c
BX
e3e7d53e-3ec3-408b-b076-da90f1f8cb49
GOOG
3ebed636-3e06-42ff-b32c-4f566000e02e
GOOGL
586dc888-71c2-4a36-8236-98179807c960
BAC
bc3ba707-cf3f-4275-9d2a-10d3177c9186
AXP
df4e867e-6fb5-455f-a106-e665edeef9ac
GOOGL
ce3696b0-8ce9-426b-8e43-fc47754604bf
CMG
124a55b2-3644-453c-8d83-3cc8a3c73364
BAC
77ce1860-f163-4dcd-9992-18c92b6afaf6
CMG


In [44]:
dynamodb = boto3.resource('dynamodb', region_name='us-east-1')
table = dynamodb.Table('filings')

def get_forward_looking_statement_sentiments(form_id):
    response = table.get_item(
        Key={
            'id': form_id
        }
    )
    forward_looking_statement_sentiments = response['Item']['forwardStatementSentiment']
    return forward_looking_statement_sentiments

axp_form_id = 'b5fd3fb0-7318-41da-81e0-7b0b7d5ba730'
cmg_form_id = 'aace672c-4298-42a5-b20a-1eea79c93483'

axp_forward_looking_statement_sentiments = get_forward_looking_statement_sentiments(axp_form_id)
print(axp_forward_looking_statement_sentiments)


cmg_foward_looking_statement_sentiments = get_forward_looking_statement_sentiments(cmg_form_id)
print(cmg_foward_looking_statement_sentiments)

{'forwardStatements': [{'sentiment': 'neutral', 'text': 'American Express cards issued by us, as well as by\nthird-party banks and other institutions on the American Express network, can be used by Card Members to charge purchases at the\nmillions of merchants around the world that accept cards bearing our logo.'}, {'sentiment': 'positive', 'text': 'We also seek to drive greater usage of the American Express network\nby deepening merchant engagement and increasing Card Member awareness through initiatives such as our Shop Small campaigns\nand expanding our payment options such as through debit and B2B capabilities.'}, {'sentiment': 'neutral', 'text': 'The current Delta\ncobrand agreement runs through the end of 2029 and we expect to continue to make significant investments in this partnership.'}, {'sentiment': 'positive', 'text': 'Second, we seek to build on our strong position in commercial payments by evolving our card value propositions, further\ndifferentiating our corporate card a

KeyError: 'forwardStatementSentiment'