In [1]:
import datetime
import numpy as np 
import pandas as pd 
from ratelimit import limits, sleep_and_retry
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import os
from collections import Counter
import re
import csv
from nltk.tokenize import word_tokenize

Get CIKs

In [2]:

QQQ_path = './update_and_only2025.csv'
# QQQ_path = './test.csv'


def _read_constituents(path):
    """Read the constituents CSV, retrying as ISO-8859-1 when it is not valid UTF-8."""
    try:
        return pd.read_csv(path, encoding='utf-8')
    except UnicodeDecodeError:
        return pd.read_csv(path, encoding='ISO-8859-1')


df = _read_constituents(QQQ_path)

# Deduplicate whole rows on CIK so each CIK stays paired with the Symbol from
# its own row. Dropping duplicates from the CIK column alone and zipping it
# against the full Symbol column shifts every ticker after the first repeated
# CIK onto the wrong company.
_unique_cik_rows = df.drop_duplicates(subset='CIK')

QQQ_cik = _unique_cik_rows['CIK'].tolist()
QQQ_ticker = df['Symbol'].tolist()  # every row's symbol, duplicates included
QQQ_cik_ticker = dict(zip(QQQ_cik, _unique_cik_rows['Symbol'].tolist()))




Download Reports

In [3]:
def submission_api(cik, ticker, doc_type, headers, start_date, end_date):
    """
    List candidate primary HTML documents using the SEC submissions endpoint.

    Only the ``filings.recent`` section of the submissions JSON is inspected.

    Parameters
    ----------
    cik : str
        CIK exactly as it should appear in the request URLs.
    ticker : str
        Ticker symbol; a file whose name contains it (lowercased) is a candidate.
    doc_type : str
        Form type to keep, e.g. '10-K' or '10-Q'.
    headers : dict
        HTTP headers sent with every request.
    start_date, end_date : datetime-like
        Inclusive filing-date window, compared against pandas Timestamps.

    Returns
    -------
    entries : list of tuple
        ``(document_url, form_type, filing_date)`` for every matching ``.htm`` file.
        Empty when the payload lists no filings.

    Raises
    ------
    ValueError
        If the submissions response body is not valid JSON.
    """
    # SEC submissions URL
    rss_url = f'https://data.sec.gov/submissions/CIK{cik}.json'

    # Retrieve the filing data from SEC
    sec_data = requests.get(url=rss_url, headers=headers)
    recent = sec_data.json().get('filings', {}).get('recent', {})

    # Accept the form as given and in upper case, mirroring get_sec_data.
    accepted_forms = {doc_type, doc_type.upper()}

    # A file is a candidate when its name contains the form type (with or
    # without the dash, e.g. '10-q' / '10q') or the ticker.
    doc_type_lower = doc_type.lower()
    name_markers = (doc_type_lower, doc_type_lower.replace('-', ''), ticker.lower())

    # .get(..., []) keeps a payload without 'filings'/'recent' from raising KeyError.
    filing_rows = zip(
        recent.get('accessionNumber', []),
        recent.get('filingDate', []),
        recent.get('form', []),
    )

    entries = []
    for raw_accession, raw_date, filing_type in filing_rows:
        # Check the cheap form filter first so dates are parsed only when needed.
        if filing_type not in accepted_forms:
            continue
        filing_date = pd.to_datetime(raw_date)
        if not (start_date <= filing_date <= end_date):
            continue

        accession_number = raw_accession.replace('-', '')
        filing_href = f"https://www.sec.gov/Archives/edgar/data/{cik}/{accession_number}/index.json"

        # Fetch the directory listing of this filing; skip it if unavailable.
        filing_response = requests.get(filing_href, headers=headers)
        if filing_response.status_code != 200:
            continue

        for file in filing_response.json()['directory']['item']:
            name = file['name']
            if not name.endswith('.htm'):
                continue
            # Heuristic exhibit filter: any name containing 'ex' is dropped.
            if 'ex' in name:
                continue
            if any(marker in name for marker in name_markers):
                html_href = f"https://www.sec.gov/Archives/edgar/data/{cik}/{accession_number}/{name}"
                entries.append((html_href, filing_type, filing_date))

    return entries


In [4]:
import requests
import time
from tenacity import retry, stop_after_attempt, wait_fixed
import pandas as pd

class LimitRequest:
    """
    Rate-limited, retrying GET helper for SEC endpoints.

    Use ``LimitRequest.get(url, headers)``; no instance is needed.
    """

    # Request budget: at most `calls` requests per `seconds`-second window.
    SEC_CALL_LIMIT = {'calls': 10, 'seconds': 1}

    # Decorator order (outermost first):
    #   staticmethod    - the helper takes no self/cls, so make that explicit.
    #   retry           - up to 5 attempts, 2 seconds apart, on any exception.
    #   sleep_and_retry - sleeps when the budget is exhausted instead of raising.
    #   limits          - enforces SEC_CALL_LIMIT, which was previously unused.
    @staticmethod
    @retry(stop=stop_after_attempt(5), wait=wait_fixed(2))  # Retry up to 5 times with a 2-second delay
    @sleep_and_retry
    @limits(calls=SEC_CALL_LIMIT['calls'], period=SEC_CALL_LIMIT['seconds'])
    def _call_sec(url, headers):
        """Perform one GET and return the response only when the status is 200."""
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Raise exception for 4xx/5xx responses
        if response.status_code != 200:
            # raise_for_status() ignores other non-error codes; raising here
            # makes them retry instead of silently returning None.
            raise requests.exceptions.HTTPError(
                f"Unexpected status {response.status_code} for {url}", response=response
            )
        return response

    @classmethod
    def get(cls, url, headers):
        """
        GET ``url`` with ``headers`` under the rate limit and retry policy.

        Returns the ``requests`` response object (status 200). When every
        attempt fails, the retry decorator raises its own error.
        """
        return cls._call_sec(url, headers)



def get_sec_data(cik, ticker, doc_type, headers, end_date, start_date):
    """
    List candidate primary HTML documents for a company's filings.

    Filings are discovered through the XBRL company-concept endpoint for
    ``us-gaap/AccountsPayableCurrent``. If that request fails or its body
    cannot be parsed, ``submission_api`` is used as a fallback.

    Parameters
    ----------
    cik : str
        CIK exactly as it should appear in the request URLs.
    ticker : str
        Ticker symbol; a file whose name contains it (lowercased) is a candidate.
    doc_type : str
        Form type to keep, e.g. '10-K'; compared upper-cased against each fact's form.
    headers : dict
        HTTP headers sent with every request.
    end_date, start_date : datetime-like or str
        Inclusive filing-date window (note the order: ``end_date`` comes first).
        Both are converted with ``pd.to_datetime``.

    Returns
    -------
    entries : list of tuple
        Unique ``(document_url, form_type, filing_date)`` tuples in discovery
        order. Empty when nothing matches or when both lookups fail.
    """
    start_date = pd.to_datetime(start_date)
    end_date = pd.to_datetime(end_date)

    # SEC XBRL data APIs
    xbrl_url = f'https://data.sec.gov/api/xbrl/companyconcept/CIK{cik}/us-gaap/AccountsPayableCurrent.json'
    try:
        # The request sits inside the try so connection errors also reach the fallback.
        sec_data = requests.get(url=xbrl_url, headers=headers)
        units = sec_data.json().get('units', {}).get('USD', [])
    except (ValueError, KeyError, requests.exceptions.RequestException) as e:
        print(f"Error: {e}")
        try:
            return submission_api(cik, ticker, doc_type, headers, start_date, end_date)
        except Exception as e:
            print(f"Error: {e}")
            # Both lookups failed. Return an empty result rather than falling
            # through to the loop below with `units` unbound.
            return []

    wanted_form = doc_type.upper()
    # A file is a candidate when its name contains the form type (with or
    # without the dash, e.g. '10-k' / '10k') or the ticker.
    doc_type_lower = doc_type.lower()
    name_markers = (doc_type_lower, "".join(doc_type_lower.split("-")), ticker.lower())

    entries = []
    # One filing usually contributes several facts, so the same accession
    # number repeats; fetch each filing's directory listing only once.
    seen_accessions = set()
    for unit in units:
        filing_date = pd.to_datetime(unit['filed'])
        filing_type = unit['form']
        filing_accn = unit['accn']

        if filing_type != wanted_form or not (start_date <= filing_date <= end_date):
            continue
        if filing_accn in seen_accessions:
            continue
        seen_accessions.add(filing_accn)

        accession_path = filing_accn.replace('-', '')
        filing_href = f"https://www.sec.gov/Archives/edgar/data/{cik}/{accession_path}/index.json"
        filing_response = requests.get(filing_href, headers=headers)
        if filing_response.status_code != 200:
            continue

        for file in filing_response.json()['directory']['item']:
            name = file['name']
            if not name.endswith('.htm'):
                continue
            # Heuristic exhibit filter: any name containing 'ex' is dropped.
            if 'ex' in name:
                continue
            if any(marker in name for marker in name_markers):
                html_href = f"https://www.sec.gov/Archives/edgar/data/{cik}/{accession_path}/{name}"
                entries.append((html_href, filing_type, filing_date))

    # Drop exact duplicates while keeping first-seen order.
    entries = list(dict.fromkeys(entries))

    return entries


            
        
    



In [5]:
def get_document_type(doc):
    """
    Extract the text that follows the first ``<TYPE>`` tag of a document.

    Parameters
    ----------
    doc : str
        The document string

    Returns
    -------
    doc_type : str
        The matched type text exactly as written in ``doc`` (case is not changed)

    Raises
    ------
    AttributeError
        If ``doc`` contains no matching ``<TYPE>`` tag.
    """
    # (?<=<TYPE>) is a positive lookbehind: the match must start right after
    # the tag, but the tag itself is not part of the result.
    # Reference: https://www.regular-expressions.info/lookaround.html
    # The match then takes word characters plus the rest of that line.
    tag_match = re.search(r'(?<=<TYPE>)\w+[^\n]+', doc)
    return tag_match.group(0)

In [6]:
def get_document_format(doc):
    """
    Classify a document by the extension of its first ``<FILENAME>`` entry.

    Parameters
    ----------
    doc : str
        The document string

    Returns
    -------
    doc_format : str or None
        'HTML' for names ending in .htm/.html, 'TXT' for names ending in .txt
        (checked case-insensitively), otherwise None

    Raises
    ------
    AttributeError
        If ``doc`` contains no matching ``<FILENAME>`` tag.
    """
    # The lookbehind anchors the match right after the tag; the name is
    # lowercased so the extension checks are case-insensitive.
    filename = re.search(r'(?<=<FILENAME>)\w+[^\n]+', doc).group(0).lower()

    extension_labels = (
        ('HTML', (".htm", ".html")),
        ('TXT', (".txt",)),
    )
    for label, suffixes in extension_labels:
        if filename.endswith(suffixes):
            return label
    return None

In [7]:

def get_documents(text):
    """
    Split a filing into its ``<DOCUMENT>`` sections.

    Parameters
    ----------
    text : str
        The full filing text

    Returns
    -------
    documents : list of str
        One string per section, running from the opening ``<DOCUMENT>`` tag
        (inclusive) up to, but excluding, its closing tag. When no complete
        section is found, the whole ``text`` is returned as the only element.
    """
    opening_offsets = [hit.start() for hit in re.finditer(r'<DOCUMENT>', text)]
    closing_offsets = [hit.start() for hit in re.finditer(r'<\/DOCUMENT>', text)]

    # Pair the i-th opening tag with the i-th closing tag.
    sections = [text[begin:end] for begin, end in zip(opening_offsets, closing_offsets)]

    # No tagged section (e.g. the filing is a bare XBRL/HTML page): keep it whole.
    return sections if sections else [text]

In [8]:
from tqdm import tqdm
def download_fillings(ciks_tickers, root_folder, doc_type, headers, end_date=None, start_date = '1990-01-01'):
    """
    Download the matching filing documents for every company and save them as HTML.

    Files are written to ``<root_folder>/<10-digit CIK>/<YYYY-MM-DD>.html``.

    Parameters
    ----------
    ciks_tickers : dict
        Mapping of CIK -> ticker symbol.
    root_folder : str
        Directory under which one sub-folder per CIK is created.
    doc_type : str
        Form type to download, e.g. '10-K'.
    headers : dict
        HTTP headers sent with every request.
    end_date : datetime-like or str, optional
        Last filing date to include. Defaults to the time of the call.
    start_date : datetime-like or str
        First filing date to include.

    Returns
    -------
    None
    """
    # Resolve the default per call. A `datetime.now()` default in the signature
    # is evaluated once, when the function is defined, and then goes stale.
    if end_date is None:
        end_date = datetime.datetime.now()

    for cik, ticker in ciks_tickers.items():

        cik = str(cik).zfill(10)
        report_info = get_sec_data(cik, ticker, doc_type, headers, end_date=end_date, start_date=start_date)

        # Skip companies with no matching filing.
        if not report_info:
            continue

        folder_path = os.path.join(root_folder, cik)
        os.makedirs(folder_path, exist_ok=True)

        for index_url, _ , file_date in tqdm(report_info, desc='Downloading {} Fillings'.format(cik), unit='filling'):
            file_date = file_date.strftime('%Y-%m-%d')

            file = LimitRequest.get(url=index_url, headers=headers)

            # NOTE(review): the name is built from the filing date only, so two
            # documents sharing a date overwrite each other. Confirm this is intended.
            file_name = os.path.join(folder_path, file_date + '.html')
            # An explicit encoding keeps the write from failing on platforms
            # whose default codec cannot represent the response text.
            with open(file_name, 'w', encoding='utf-8') as f:
                f.write(file.text)


Reports start from 2006; some 2005 reports do not contain Item 1A. <br>
This applies to 10-K reports only.

In [56]:
# Download every 10-K filed since 2011 for the companies in QQQ_cik_ticker.
root_folder = 'total_sp500_10k-html'
doc_type = '10-K'
headers = {'User-Agent': 'University of Edinburgh s2101369@ed.ac.uk'}
# Plain string: a trailing comma here would turn the value into a 1-tuple.
start_date = '2011-01-01'
end_date = datetime.datetime.now()
os.makedirs(root_folder, exist_ok=True)
download_fillings(QQQ_cik_ticker, root_folder,doc_type,headers,end_date=end_date,start_date=start_date)

Downloading 0001652044 Fillings: 100%|██████████| 10/10 [00:02<00:00,  3.96filling/s]


Error: Expecting value: line 1 column 1 (char 0)


Downloading 0001067983 Fillings: 100%|██████████| 8/8 [00:02<00:00,  3.28filling/s]
Downloading 0001321655 Fillings: 100%|██████████| 5/5 [00:01<00:00,  4.11filling/s]
Downloading 0001535527 Fillings: 100%|██████████| 6/6 [00:01<00:00,  3.23filling/s]


Error: Expecting value: line 1 column 1 (char 0)


Downloading 0001996810 Fillings: 100%|██████████| 1/1 [00:00<00:00,  3.71filling/s]


Error: Expecting value: line 1 column 1 (char 0)


Downloading 0001404912 Fillings: 100%|██████████| 11/11 [00:04<00:00,  2.29filling/s]


Error: Expecting value: line 1 column 1 (char 0)


Downloading 0000766704 Fillings: 100%|██████████| 8/8 [00:03<00:00,  2.31filling/s]


Error: Expecting value: line 1 column 1 (char 0)


Downloading 0001858681 Fillings: 100%|██████████| 2/2 [00:00<00:00,  2.02filling/s]
Downloading 0001327811 Fillings: 100%|██████████| 12/12 [00:04<00:00,  2.69filling/s]
Downloading 0001692819 Fillings: 100%|██████████| 2/2 [00:00<00:00,  2.41filling/s]
Downloading 0001375365 Fillings: 100%|██████████| 14/14 [00:05<00:00,  2.79filling/s]


Error: Expecting value: line 1 column 1 (char 0)


Downloading 0001811074 Fillings: 100%|██████████| 6/6 [00:01<00:00,  3.01filling/s]
Downloading 0001175454 Fillings: 100%|██████████| 9/9 [00:03<00:00,  2.66filling/s]
Downloading 0001609711 Fillings: 100%|██████████| 17/17 [00:07<00:00,  2.39filling/s]


Error: Expecting value: line 1 column 1 (char 0)


Downloading 0000910521 Fillings: 100%|██████████| 9/9 [00:03<00:00,  2.74filling/s]
Downloading 0001069202 Fillings: 100%|██████████| 14/14 [00:05<00:00,  2.59filling/s]


Error: Expecting value: line 1 column 1 (char 0)


Downloading 0000765880 Fillings: 100%|██████████| 7/7 [00:02<00:00,  2.37filling/s]


Error: Expecting value: line 1 column 1 (char 0)


Downloading 0001754301 Fillings: 100%|██████████| 3/3 [00:01<00:00,  2.34filling/s]
Downloading 0001564708 Fillings: 100%|██████████| 6/6 [00:02<00:00,  2.80filling/s]
Downloading 0001725057 Fillings: 100%|██████████| 6/6 [00:02<00:00,  2.54filling/s]
Downloading 0000922621 Fillings: 100%|██████████| 1/1 [00:00<00:00,  1.72filling/s]


Error: Expecting value: line 1 column 1 (char 0)


Downloading 0000014693 Fillings: 100%|██████████| 7/7 [00:04<00:00,  1.63filling/s]
Downloading 0001745999 Fillings: 100%|██████████| 4/4 [00:03<00:00,  1.05filling/s]
Downloading 0000798354 Fillings: 100%|██████████| 13/13 [00:04<00:00,  2.98filling/s]
Downloading 0000216228 Fillings: 100%|██████████| 14/14 [00:05<00:00,  2.35filling/s]
Downloading 0001163165 Fillings: 100%|██████████| 12/12 [00:05<00:00,  2.38filling/s]
Downloading 0000093410 Fillings: 100%|██████████| 13/13 [00:06<00:00,  1.90filling/s]
Downloading 0000064040 Fillings: 100%|██████████| 13/13 [00:03<00:00,  3.33filling/s]
Downloading 0000101829 Fillings: 100%|██████████| 8/8 [00:02<00:00,  3.77filling/s]
Downloading 0001618921 Fillings: 100%|██████████| 10/10 [00:03<00:00,  3.23filling/s]
Downloading 0001109357 Fillings: 100%|██████████| 7/7 [00:02<00:00,  2.41filling/s]
Downloading 0001130310 Fillings: 100%|██████████| 16/16 [00:06<00:00,  2.49filling/s]
Downloading 0000068505 Fillings: 100%|██████████| 15/15 [00:05

Error: Expecting value: line 1 column 1 (char 0)


Downloading 0000753308 Fillings: 100%|██████████| 9/9 [00:06<00:00,  1.47filling/s]
Downloading 0000004281 Fillings: 100%|██████████| 14/14 [00:05<00:00,  2.63filling/s]


Error: Expecting value: line 1 column 1 (char 0)


Downloading 0000060086 Fillings: 100%|██████████| 9/9 [00:03<00:00,  2.65filling/s]
Downloading 0000047217 Fillings: 100%|██████████| 13/13 [00:05<00:00,  2.37filling/s]
Downloading 0000004447 Fillings: 100%|██████████| 13/13 [00:04<00:00,  2.73filling/s]


Error: Expecting value: line 1 column 1 (char 0)


Downloading 0001281761 Fillings: 100%|██████████| 9/9 [00:02<00:00,  3.06filling/s]


Error: Expecting value: line 1 column 1 (char 0)


Downloading 0000036966 Fillings: 100%|██████████| 8/8 [00:02<00:00,  2.79filling/s]
Downloading 0001583708 Fillings: 100%|██████████| 3/3 [00:00<00:00,  4.31filling/s]
Downloading 0000026172 Fillings: 100%|██████████| 15/15 [00:08<00:00,  1.82filling/s]
Downloading 0000031791 Fillings: 100%|██████████| 13/13 [00:04<00:00,  2.92filling/s]
Downloading 0000008670 Fillings: 100%|██████████| 15/15 [00:05<00:00,  2.85filling/s]


Error: Expecting value: line 1 column 1 (char 0)


Downloading 0000019617 Fillings: 100%|██████████| 1/1 [00:00<00:00,  2.58filling/s]
Downloading 0000039899 Fillings: 100%|██████████| 13/13 [00:04<00:00,  2.97filling/s]
Downloading 0001059556 Fillings: 100%|██████████| 15/15 [00:04<00:00,  3.09filling/s]
Downloading 0000009389 Fillings: 100%|██████████| 15/15 [00:04<00:00,  3.27filling/s]
Downloading 0000024545 Fillings: 100%|██████████| 13/13 [00:04<00:00,  3.12filling/s]


Error: Expecting value: line 1 column 1 (char 0)


Downloading 0000086312 Fillings: 100%|██████████| 34/34 [00:11<00:00,  2.89filling/s]
Downloading 0000018926 Fillings: 100%|██████████| 13/13 [00:04<00:00,  2.74filling/s]


Error: Expecting value: line 1 column 1 (char 0)


Downloading 0000315293 Fillings: 100%|██████████| 15/15 [00:04<00:00,  3.16filling/s]


Error: Expecting value: line 1 column 1 (char 0)


Downloading 0000320335 Fillings: 100%|██████████| 10/10 [00:03<00:00,  2.75filling/s]
Downloading 0000701985 Fillings: 100%|██████████| 13/13 [00:04<00:00,  3.14filling/s]
Downloading 0000732712 Fillings: 100%|██████████| 15/15 [00:04<00:00,  3.55filling/s]
Downloading 0000732717 Fillings: 100%|██████████| 1752/1752 [11:21<00:00,  2.57filling/s]


Error: Expecting value: line 1 column 1 (char 0)


Downloading 0000895421 Fillings: 100%|██████████| 2/2 [00:00<00:00,  3.74filling/s]
Downloading 0000793952 Fillings: 100%|██████████| 15/15 [00:04<00:00,  3.16filling/s]


Error: Expecting value: line 1 column 1 (char 0)


Downloading 0000092230 Fillings: 100%|██████████| 5/5 [00:01<00:00,  3.61filling/s]
Downloading 0001701605 Fillings: 100%|██████████| 9/9 [00:02<00:00,  3.20filling/s]


Error: Expecting value: line 1 column 1 (char 0)


Downloading 0000316709 Fillings: 100%|██████████| 5/5 [00:01<00:00,  3.17filling/s]
Downloading 0000849399 Fillings: 100%|██████████| 11/11 [00:03<00:00,  3.24filling/s]
Downloading 0000712515 Fillings: 100%|██████████| 16/16 [00:05<00:00,  2.86filling/s]
Downloading 0000813828 Fillings: 100%|██████████| 11/11 [00:04<00:00,  2.69filling/s]
Downloading 0000794367 Fillings: 100%|██████████| 1365/1365 [09:38<00:00,  2.36filling/s]
Downloading 0000887396 Fillings: 100%|██████████| 6/6 [00:02<00:00,  2.92filling/s]
Downloading 0001466258 Fillings: 100%|██████████| 18/18 [00:05<00:00,  3.27filling/s]
Downloading 0001140859 Fillings: 100%|██████████| 10/10 [00:04<00:00,  2.24filling/s]


Error: Expecting value: line 1 column 1 (char 0)


Downloading 0001116132 Fillings: 100%|██████████| 14/14 [00:05<00:00,  2.53filling/s]
Downloading 0001136869 Fillings: 100%|██████████| 13/13 [00:07<00:00,  1.80filling/s]


Error: Expecting value: line 1 column 1 (char 0)


Downloading 0001156039 Fillings: 100%|██████████| 11/11 [00:03<00:00,  2.91filling/s]


Error: Expecting value: line 1 column 1 (char 0)


Downloading 0001138118 Fillings: 100%|██████████| 10/10 [00:02<00:00,  3.59filling/s]


Error: Expecting value: line 1 column 1 (char 0)


Downloading 0000052988 Fillings: 100%|██████████| 9/9 [00:03<00:00,  2.80filling/s]
Downloading 0001103982 Fillings: 100%|██████████| 10/10 [00:03<00:00,  3.27filling/s]
Downloading 0000202058 Fillings: 100%|██████████| 8/8 [00:02<00:00,  3.12filling/s]
Downloading 0000858470 Fillings: 100%|██████████| 10/10 [00:03<00:00,  3.00filling/s]
Downloading 0000072741 Fillings: 100%|██████████| 9/9 [00:06<00:00,  1.33filling/s]
Downloading 0001075531 Fillings: 100%|██████████| 10/10 [00:03<00:00,  2.61filling/s]
Downloading 0001521332 Fillings: 100%|██████████| 8/8 [00:02<00:00,  2.77filling/s]
Downloading 0001530721 Fillings: 100%|██████████| 7/7 [00:02<00:00,  2.72filling/s]
Downloading 0001326801 Fillings: 100%|██████████| 8/8 [00:02<00:00,  3.45filling/s]
Downloading 0001336917 Fillings: 100%|██████████| 8/8 [00:03<00:00,  2.60filling/s]
Downloading 0001670541 Fillings: 100%|██████████| 2/2 [00:00<00:00,  2.89filling/s]
Downloading 0001140536 Fillings: 100%|██████████| 7/7 [00:02<00:00,  3

Error: Expecting value: line 1 column 1 (char 0)


Downloading 0001095073 Fillings: 100%|██████████| 10/10 [00:05<00:00,  1.92filling/s]
Downloading 0000814547 Fillings: 100%|██████████| 10/10 [00:03<00:00,  3.03filling/s]
Downloading 0001327567 Fillings: 100%|██████████| 7/7 [00:02<00:00,  3.08filling/s]
Downloading 0001543151 Fillings: 100%|██████████| 1/1 [00:00<00:00,  3.90filling/s]


Error: Expecting value: line 1 column 1 (char 0)


Downloading 0000048898 Fillings: 100%|██████████| 8/8 [00:02<00:00,  2.93filling/s]
Downloading 0001069183 Fillings: 100%|██████████| 13/13 [00:05<00:00,  2.54filling/s]
Downloading 0001316835 Fillings: 100%|██████████| 11/11 [00:03<00:00,  2.99filling/s]
Downloading 0001145197 Fillings: 100%|██████████| 4/4 [00:01<00:00,  2.75filling/s]


Error: Expecting value: line 1 column 1 (char 0)


Downloading 0001393818 Fillings: 100%|██████████| 9/9 [00:02<00:00,  3.33filling/s]
Downloading 0001397187 Fillings: 100%|██████████| 9/9 [00:02<00:00,  3.08filling/s]


In [13]:
# Download every 10-Q filed since 2011 for the companies in QQQ_cik_ticker.
root_folder = 'total_sp500_10q-html'
doc_type = '10-Q'
headers = {'User-Agent': 'University of Edinburgh s2101367@ed.ac.uk'}
# Plain string: a trailing comma here would turn the value into a 1-tuple.
start_date = '2011-01-01'
end_date = datetime.datetime.now()
os.makedirs(root_folder, exist_ok=True)
download_fillings(QQQ_cik_ticker, root_folder,doc_type,headers,end_date=end_date,start_date=start_date)

KeyboardInterrupt: 

# Trash

In [None]:
# import requests
# import time
# from tenacity import retry, stop_after_attempt, wait_fixed
# import pandas as pd

# class LimitRequest:
#     SEC_CALL_LIMIT = {'calls': 10, 'seconds': 1}

#     @retry(stop=stop_after_attempt(5), wait=wait_fixed(2))  # Retry up to 5 times with a 2-second delay
#     def _call_sec(url, headers):
#         response = requests.get(url, headers=headers)
#         if response.status_code == 200:
#             return response
#         else:
#             response.raise_for_status()  # Raise exception for failed requests

#     @classmethod
#     def get(cls, url, headers):
#         return cls._call_sec(url, headers)



# def get_sec_data(cik, ticker, doc_type, headers, end_date, start_date, start, count):


#     start_date = pd.to_datetime(start_date)
#     end_date = pd.to_datetime(end_date)
    
#     # SEC submissions URL
#     rss_url = f'https://data.sec.gov/submissions/CIK{cik}.json'
#     print(rss_url)

#     # Retrieve the filing data from SEC
#     sec_data = requests.get(url=rss_url, headers=headers)

#     filings = sec_data.json().get('filings', {}).get('recent', {})


#     entries = []
#     # print('filings-form', filings['form'])
#     print('filing_date', filings['filingDate'][1000])
#     # print('doc_type', doc_type)
#     print('start_date', start_date)
#     print('end_date', end_date)
#     # Iterate over the filings and filter by type and date range
#     print('len', len(filings['accessionNumber']))
#     print('len-date', len(filings['filingDate']))

#     print('len-form', filings['form'][1000])

#     for i in range(len(filings['accessionNumber'])):
#         filing_date = pd.to_datetime(filings['filingDate'][i])
#         filing_type = filings['form'][i]

#         # if start_date <= filing_date <= end_date:


#         #     print('filing_date', filing_date)

#         if filing_type == doc_type and start_date <= filing_date <= end_date:


#             accession_number = filings['accessionNumber'][i].replace('-', '')
#             filing_href = f"https://www.sec.gov/Archives/edgar/data/{cik}/{accession_number}/index.json"
#             print('filing_href', filing_href)
#             # Fetch the specific filing details
#             filing_response = requests.get(filing_href, headers=headers)

#             if filing_response.status_code == 200:
#                 filing_json = filing_response.json()
#                 for file in filing_json['directory']['item']:

#                     if file['name'].endswith('.htm'):
#                         if doc_type.lower() in file['name'] or '10k' in file['name'] or ticker.lower() in file['name']:  # Find HTML document
#                             html_href = f"https://www.sec.gov/Archives/edgar/data/{cik}/{accession_number}/{file['name']}"
#                             # with open('output.txt', 'a') as f:
#                             #     f.write(html_href + "\n")

#                             entries.append((html_href, filing_type, filing_date))
#                             # break  # Stop after finding the HTML file
                    
#     return entries


In [None]:
# class LimitRequest(object):
#     SEC_CALL_LIMIT = {'calls': 10, 'seconds': 1}
#     @sleep_and_retry
#     @limits(calls=SEC_CALL_LIMIT['calls'], period=SEC_CALL_LIMIT['seconds'])
#     def _call_sec(url,headers):
#         return requests.get(url,headers=headers)
    
#     @classmethod
#     def get(cls,url,headers):
#         return cls._call_sec(url, headers)

# def get_sec_data(cik, doc_type, headers, end_date, start_date, start, count):
#     start_date = pd.to_datetime(start_date)
#     end_date = pd.to_datetime(end_date)
    
#     # Updated SEC API URL
#     rss_url = f'https://data.sec.gov/submissions/CIK{cik}.json'
    
#     # Set proper headers to identify yourself to the SEC
#     sec_data = requests.get(url=rss_url, headers=headers)
#     filings = sec_data.json()['filings']['recent']

    
#     entries = []
    
#     # Loop through the filings and filter by the document type and date range
#     for i in range(len(filings['accessionNumber'])):
#         filing_date = pd.to_datetime(filings['filingDate'][i])
#         filing_type = filings['form'][i]
        
#         if filing_type == doc_type and start_date <= filing_date <= end_date:
#             filing_href = f"https://www.sec.gov/Archives/edgar/data/{cik}/{filings['accessionNumber'][i].replace('-', '')}/index.json"
#             entries.append((filing_href, filing_type, filing_date))
    
#     return entries

In [None]:
# Outdated API call
# class LimitRequest(object):
#     SEC_CALL_LIMIT = {'calls': 10, 'seconds': 1}
#     @sleep_and_retry
#     @limits(calls=SEC_CALL_LIMIT['calls'], period=SEC_CALL_LIMIT['seconds'])
#     def _call_sec(url,headers):
#         return requests.get(url,headers=headers)
    
#     @classmethod
#     def get(cls,url,headers):
#         return cls._call_sec(url, headers)


# def get_sec_data(cik, doc_type, headers,end_date, start_date, start, count):
#     start_date = pd.to_datetime(start_date)
#     end_date = pd.to_datetime(end_date)
#     rss_url = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany' \
#         '&CIK={}&type={}&start={}&count={}&owner=exclude&output=atom' \
#         .format(cik, doc_type, start, count)
    
#     sec_data = LimitRequest.get(url = rss_url,headers=headers)
#     soup = BeautifulSoup(sec_data.content, 'xml')    
#     entries = [
#         (   entry.content.find('filing-href').getText(),
#             entry.content.find('filing-type').getText(),
#             entry.content.find('filing-date').getText())
#         for entry in soup.find_all('entry')
#         if pd.to_datetime(entry.content.find('filing-date').getText()) <= end_date and pd.to_datetime(entry.content.find('filing-date').getText()) >= start_date]  
#     return entries

In [None]:
# from bs4 import BeautifulSoup
# def get_document_xml(text):
#     # Parse the XBRL content
#     soup = BeautifulSoup(text, 'lxml')
#     body_text = soup.body.get_text()
    
#     return body_text


# headers = {'User-Agent': 'University of Edinburgh s2101368@ed.ac.uk'}
# url = "https://www.sec.gov/Archives/edgar/data/0000320193/000032019323000106/aapl-20230930.htm"
# url2 = "https://www.sec.gov/Archives/edgar/data/0000320193/000032019317000070/a10-k20179302017.htm"
# file  = LimitRequest.get(url=url2, headers=headers)
# with open('output1.html', 'w') as f:
#     f.write(file.text)

# # for document in get_documents(file.text):
# #     with open('output5.html', 'w') as f:
# #         f.write(document)


# # Tester

# root_folder = 'data'
# doc_type = '10-K'
# headers = {'User-Agent': 'University of Edinburgh s2101368@ed.ac.uk'}
# start_date = '2006-01-01',
# end_date = datetime.datetime.now()
# for idx, (cik, ticker) in enumerate(QQQ_cik_ticker.items()):

#         cik = str(cik).zfill(10)

#         repo = get_sec_data(cik, ticker, doc_type, headers, end_date, start_date)
#         print(repo)