In [None]:
"""
Create DataFrames containing the URLs of 10-K reports.
"""

In [1]:
import os
import pickle
import time

import numpy as np
import pandas as pd

In [2]:
import json
import urllib.request

In [3]:
# Load the pickled S&P 500 table that maps each company to its CIK.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open('../data/sp_500_ciks.pickle', mode='rb') as pickle_file:
    sp_500_ciks = pickle.load(pickle_file)

In [4]:
# Inspect the column labels of the loaded table.
sp_500_ciks.columns

Index(['Symbol', 'Security', 'ticker', 'cik'], dtype='object')

In [5]:
# Distinct CIK values from the table; these are the identifiers queried below.
cik_column = sp_500_ciks['cik']
query_list = list(cik_column.unique())

In [6]:
len(query_list)
# Number of distinct CIKs: every company that was in the S&P 500 at some point in the last 10 years.
# Report links are pulled for all of them; the plan is to filter afterwards for the companies
# that were in the index during each date range.

589

In [8]:
# SEC API (sec-api.io) credentials.
# The token is taken from the SEC_API_TOKEN environment variable so the real
# secret does not have to be stored in the notebook. The literal below is only
# a placeholder fallback that keeps the cell runnable when the variable is unset.
TOKEN = os.environ.get("SEC_API_TOKEN", "--- REDACTED ----")
# API endpoint; the token is passed as a query-string parameter.
API = "https://api.sec-api.io?token=" + TOKEN

In [None]:
# Run for one filing year at a time (2011-2021).
# The API returned errors when more than 50 CIKs were queried in a single loop, even with a pause of a few seconds between requests,
# so the results were fetched and written out in batches of 50 CIKs.
# The last year was run in batches of 25 CIKs.


In [26]:
# Download 10-K filing metadata for every CIK in `query_list`, one batch at a
# time, and pickle one DataFrame per batch.
#
# Tunables for a run. Change these (not the loop body) to fetch another year.
FILING_YEAR = 2021            # used in the output directory and file names
FILED_FROM = "2021-01-01"     # start of the filedAt range sent to the API
FILED_TO = "2021-03-08"       # end of the filedAt range sent to the API
BATCH_SIZE = 25               # CIKs per output file
REQUEST_PAUSE_S = 2           # pause after each API request
BATCH_PAUSE_S = 5             # pause after each batch

# Fields copied from each filing record, in output-column order.
TOP_LEVEL_FIELDS = [
    'id', 'accessionNo', 'ticker', 'cik', 'companyName', 'companyNameLong',
    'formType', 'filedAt', 'linkToTxt', 'linkToHtml',
]
# Fields copied from the first element of the record's 'entities' list.
ENTITY_FIELDS = ['irsNo', 'fiscalYearEnd', 'sic']

# CIKs for which at least one returned filing record was missing an expected field.
errors_list = []

# The number of batches follows from len(query_list), so no CIK is skipped
# and no empty trailing batch is requested if the list length changes.
for i, start in enumerate(range(0, len(query_list), BATCH_SIZE)):
    stop = start + BATCH_SIZE

    # to save results
    file_list = []

    for cik_id in query_list[start:stop]:

        # define the filter parameters to send to the API
        payload = {
          "query": { "query_string": { "query": 'cik:{} AND filedAt:{{{} TO {}}} AND formType:"10-K"'.format(cik_id, FILED_FROM, FILED_TO) } },
          "from": "0",
          "size": "10",
          "sort": [{ "filedAt": { "order": "desc" } }]
        }

        # the request body must be JSON encoded as bytes
        jsondataasbytes = json.dumps(payload).encode('utf-8')

        # POST request; urllib fills in Content-Length from `data` itself
        req = urllib.request.Request(
            API,
            data=jsondataasbytes,
            headers={'Content-Type': 'application/json; charset=utf-8'},
        )

        # send the request and decode the JSON response; the context manager
        # closes the connection even if reading or decoding fails.
        # NOTE: a failed request raises and stops the run; batches that were
        # already pickled stay on disk.
        with urllib.request.urlopen(req) as response:
            filings = json.loads(response.read().decode("utf-8"))

        for info in filings['filings']:
            try:
                f = {key: info[key] for key in TOP_LEVEL_FIELDS}
                primary_entity = info['entities'][0]
                for key in ENTITY_FIELDS:
                    f[key] = primary_entity[key]
            except (KeyError, IndexError, TypeError):
                # Record is missing a field or has no entities: skip it and
                # remember the CIK. Unrelated errors are not swallowed.
                errors_list.append(cik_id)
            else:
                file_list.append(f)

        # Wait before the next request to stay under the API's rate limit
        time.sleep(REQUEST_PAUSE_S)

    print(start, stop, len(file_list))

    # Passing `columns` keeps the frame well-formed (and 'filedAt' present)
    # even when a batch returned no filings at all.
    file_urls_df = pd.DataFrame(file_list, columns=TOP_LEVEL_FIELDS + ENTITY_FIELDS)
    file_urls_df["filed_at"] = pd.to_datetime(file_urls_df.filedAt)

    with open('../data/{0}/year_{0}_P{1}_report_urls.pickle'.format(FILING_YEAR, i), 'wb') as to_write:
        pickle.dump(file_urls_df, to_write)

    time.sleep(BATCH_PAUSE_S)

print(errors_list)


0 25 22
25 50 19
50 75 18
75 100 20
100 125 17
125 150 20
150 175 21
175 200 20
200 225 22
225 250 20
250 275 17
275 300 22
300 325 18
325 350 21
350 375 22
375 400 16
400 425 19
425 450 19
450 475 20
475 500 22
500 525 14
525 550 16
550 575 16
575 600 8
[]
