In [10]:
import datetime
import numpy as np 
import pandas as pd 
from ratelimit import limits, sleep_and_retry
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import os
from collections import Counter
import re
import csv
from nltk.tokenize import word_tokenize

Get CIKs

In [11]:

QQQ_path = './update_and_only2025.csv'

# QQQ_path = './test.csv'

# Read the constituent list once, falling back to ISO-8859-1 only when the
# file is not valid UTF-8. Only the read differs between the two encodings,
# so the column extraction below is shared.
try:
    df = pd.read_csv(QQQ_path, encoding='utf-8')
except UnicodeDecodeError:
    df = pd.read_csv(QQQ_path, encoding='ISO-8859-1')

# De-duplicate whole rows by CIK so that every CIK stays paired with the
# symbol from its own (first) row. Zipping a de-duplicated CIK list against
# the full symbol list would shift the pairing after the first repeated CIK.
unique_cik_rows = df.drop_duplicates(subset='CIK')

QQQ_cik = unique_cik_rows['CIK'].tolist()      # unique CIKs, first-seen order
QQQ_ticker = df['Symbol'].tolist()             # every symbol in file order
QQQ_cik_ticker = dict(zip(QQQ_cik, unique_cik_rows['Symbol']))




Download Reports

In [12]:
class LimitRequest(object):
    """Rate-limited wrapper around ``requests.get``.

    All requests issued through :meth:`get` share one call budget, defined by
    ``SEC_CALL_LIMIT`` and enforced by the ``ratelimit`` decorators.
    """

    # Budget passed to ``limits``: at most 10 calls per 1-second period.
    # NOTE(review): presumably chosen to match SEC EDGAR's fair-access
    # guidance -- confirm before changing.
    SEC_CALL_LIMIT = {'calls': 10, 'seconds': 1}

    # ``staticmethod`` is required because the function takes no ``self``:
    # without it, calling through an instance would bind the instance as
    # ``url``. It must be the outermost decorator so the rate-limited wrapper
    # is what gets exposed.
    @staticmethod
    @sleep_and_retry
    @limits(calls=SEC_CALL_LIMIT['calls'], period=SEC_CALL_LIMIT['seconds'])
    def _call_sec(url, headers):
        """Perform the GET request; wrapped by the rate-limit decorators."""
        # NOTE(review): no timeout is passed, so a stalled connection can
        # block indefinitely -- consider adding one.
        return requests.get(url, headers=headers)

    @classmethod
    def get(cls, url, headers):
        """
        Fetch ``url`` with the given request headers, subject to the rate limit.

        Parameters
        ----------
        url : str
            The address to request
        headers : dict
            HTTP headers sent with the request

        Returns
        -------
        response : requests.Response
            The response returned by ``requests.get``
        """
        return cls._call_sec(url, headers)


def get_sec_data(cik, doc_type, headers,end_date, start_date, start, count):
    """
    Query the EDGAR company feed and list the filings inside a date range.

    Parameters
    ----------
    cik : str
        Company identifier placed in the ``CIK`` query parameter
    doc_type : str
        Filing type placed in the ``type`` query parameter
    headers : dict
        HTTP headers sent with the request
    end_date, start_date : date-like
        Inclusive bounds on the filing date; both go through ``pd.to_datetime``
    start : int
        Value of the ``start`` query parameter
    count : int
        Value of the ``count`` query parameter

    Returns
    -------
    entries : list of tuple
        One ``(filing_href, filing_type, filing_date)`` tuple of strings per
        feed entry whose filing date lies within the bounds
    """
    earliest = pd.to_datetime(start_date)
    latest = pd.to_datetime(end_date)

    rss_url = (
        'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany'
        '&CIK={}&type={}&start={}&count={}&owner=exclude&output=atom'
    ).format(cik, doc_type, start, count)

    response = LimitRequest.get(url=rss_url, headers=headers)
    feed = BeautifulSoup(response.content, 'xml')

    filings = []
    for entry in feed.find_all('entry'):
        # The date is checked first; the other fields are read only for
        # entries that pass the range filter.
        filed_on = pd.to_datetime(entry.content.find('filing-date').getText())
        if filed_on <= latest and filed_on >= earliest:
            filings.append((
                entry.content.find('filing-href').getText(),
                entry.content.find('filing-type').getText(),
                entry.content.find('filing-date').getText(),
            ))
    return filings

In [13]:
def get_document_type(doc):
    """
    Return the document type lowercased

    Parameters
    ----------
    doc : str
        The document string

    Returns
    -------
    doc_type : str
        The text following the first ``<TYPE>`` tag up to the end of that
        line, lowercased and with surrounding whitespace removed.
        An empty string when no ``<TYPE>`` tag followed by a word character
        is present.
    """

    # Regex explanation: a positive lookbehind anchors the match right after
    # the tag without including the tag itself.
    # (?<=a)b matches the b (and only the b) in "cab", but not in "bed" or "debt".
    # More reference : https://www.regular-expressions.info/lookaround.html
    #
    # "\w[^\n]*" accepts one-character types as well; the previous
    # "\w+[^\n]+" needed at least two characters on the line.
    type_regex = re.compile(r'(?<=<TYPE>)\w[^\n]*')
    match = type_regex.search(doc)
    if match is None:
        # No type tag: report "no type" instead of raising AttributeError.
        return ''
    # strip() removes a trailing "\r" (CRLF input) or padding that would make
    # equality checks against the expected type fail.
    return match.group(0).strip().lower()

In [14]:
def get_document_format(doc):
    """
    Classify a document by the extension of its ``<FILENAME>`` entry

    Parameters
    ----------
    doc : str
        The document string

    Returns
    -------
    doc_format : str or None
        'HTML' when the file name ends with ".htm" or ".html",
        'TXT' when it ends with ".txt",
        None for any other extension or when the document has no
        ``<FILENAME>`` tag followed by a word character
    """

    # Positive lookbehind: capture the rest of the line after the tag.
    format_regex = re.compile(r'(?<=<FILENAME>)\w[^\n]*')
    match = format_regex.search(doc)
    if match is None:
        # No file name tag: report "unknown" instead of raising AttributeError.
        return None

    # strip() removes a trailing "\r" or padding that would defeat endswith().
    file_name = match.group(0).strip().lower()
    if file_name.endswith((".htm", ".html")):
        return 'HTML'
    if file_name.endswith(".txt"):
        return 'TXT'
    return None

In [15]:

def get_documents(text):
    """
    Split a filing into its ``<DOCUMENT>`` sections

    Parameters
    ----------
    text : str
        The full filing text

    Returns
    -------
    documents : list of str
        One string per section, starting at its ``<DOCUMENT>`` tag and ending
        just before the paired ``</DOCUMENT>`` tag. The n-th opening tag is
        paired with the n-th closing tag; unpaired tags are ignored.
    """
    opening_tag = re.compile(r'<DOCUMENT>')
    closing_tag = re.compile(r'<\/DOCUMENT>')

    opening_offsets = (found.start() for found in opening_tag.finditer(text))
    closing_offsets = (found.start() for found in closing_tag.finditer(text))

    return [text[begin:stop] for begin, stop in zip(opening_offsets, closing_offsets)]

In [16]:
from tqdm import tqdm
def download_fillings(ciks, root_folder, doc_type, headers, end_date=None, start_date = '1990-01-01', start=0, count=60):
    """
    Download filings of one type for each CIK and save the matching documents

    For every CIK the filing list is fetched with ``get_sec_data``. Each filing
    of the requested type is downloaded as a full-text submission and split
    with ``get_documents``. Documents whose type equals ``doc_type`` are
    written to ``<root_folder>/<cik>/<filing date>.html`` or ``.txt``,
    depending on ``get_document_format``.

    Parameters
    ----------
    ciks : iterable
        Company identifiers; each one is converted to a 10-character,
        zero-padded string
    root_folder : str
        Folder under which one sub-folder per CIK is created
    doc_type : str
        Filing type to download (compared case-insensitively)
    headers : dict
        HTTP headers sent with every request
    end_date : date-like, optional
        Latest filing date to include. Defaults to the current time, taken
        when the function is called
    start_date : date-like
        Earliest filing date to include
    start : int
        Passed through to ``get_sec_data``
    count : int
        Passed through to ``get_sec_data``

    Notes
    -----
    A file is named after the filing date only, so a later document with the
    same date and format overwrites an earlier one.
    """
    # Resolve the default here rather than in the signature: a default of
    # ``datetime.datetime.now()`` is evaluated once, when the function is
    # defined, and would go stale in a long-running kernel.
    if end_date is None:
        end_date = datetime.datetime.now()

    doc_type = doc_type.lower()
    # File extension used for each format reported by get_document_format.
    extensions = {'HTML': '.html', 'TXT': '.txt'}

    for cik in ciks:
        cik = str(cik).zfill(10)
        report_info = get_sec_data(cik, doc_type, headers, end_date=end_date, start_date=start_date, start=start, count=count)
        # Skip companies with no filing of the requested type in the range.
        if not report_info:
            continue

        folder_path = os.path.join(root_folder, cik)
        os.makedirs(folder_path, exist_ok=True)

        for index_url, file_type, file_date in tqdm(report_info, desc='Downloading {} Fillings'.format(cik), unit='filling'):
            if file_type.lower() != doc_type:
                continue

            # Turn the index page address into the full-text submission
            # address; the second replace handles "-index.html".
            file_url = index_url.replace('-index.htm', '.txt').replace('.txtl', '.txt')
            file = LimitRequest.get(url=file_url, headers=headers)

            for document in get_documents(file.text):
                if get_document_type(document) != doc_type:
                    continue
                extension = extensions.get(get_document_format(document))
                if extension is None:
                    continue
                file_name = os.path.join(folder_path, file_date + extension)
                # Explicit encoding keeps the output independent of the
                # platform default; ``with`` closes the file on exit.
                with open(file_name, 'w', encoding='utf-8') as f:
                    f.write(document)

Reports start from 2006; some 2005 reports do not include Item 1A. <br>
This applies only to 10-K reports.

In [19]:
# Download settings for this run.
root_folder = 'total_sp500_10q-html'
doc_type = '10-Q'
headers = {'User-Agent': 'University of Edinburgh s2101367@ed.ac.uk'}
# Plain strings: a trailing comma after the literal would turn the value into
# a one-element tuple instead of a single date.
start_date = '2006-01-01'
end_date = '2012-01-01'

os.makedirs(root_folder, exist_ok=True)
# Iterating the CIK -> ticker mapping yields its keys, i.e. the CIKs.
download_fillings(QQQ_cik_ticker, root_folder, doc_type, headers, end_date=end_date, start_date=start_date)

Downloading 0001067983 Fillings: 100%|██████████| 1/1 [00:00<00:00,  1.89filling/s]
Downloading 0001404912 Fillings: 100%|██████████| 1/1 [00:00<00:00,  1.99filling/s]
Downloading 0001375365 Fillings: 100%|██████████| 2/2 [00:00<00:00,  2.36filling/s]
Downloading 0001175454 Fillings: 100%|██████████| 1/1 [00:00<00:00,  2.58filling/s]
Downloading 0001069202 Fillings: 100%|██████████| 1/1 [00:00<00:00,  2.14filling/s]
Downloading 0000765880 Fillings: 100%|██████████| 1/1 [00:00<00:00,  1.84filling/s]
Downloading 0000922621 Fillings: 100%|██████████| 1/1 [00:00<00:00,  1.56filling/s]
Downloading 0000014693 Fillings: 100%|██████████| 1/1 [00:00<00:00,  1.76filling/s]
Downloading 0000798354 Fillings: 100%|██████████| 1/1 [00:00<00:00,  1.89filling/s]
Downloading 0000216228 Fillings: 100%|██████████| 1/1 [00:01<00:00,  1.14s/filling]
Downloading 0000064040 Fillings: 100%|██████████| 1/1 [00:00<00:00,  2.21filling/s]
Downloading 0000101829 Fillings: 100%|██████████| 1/1 [00:00<00:00,  1.82fil

In [1]:
import pandas as pd
# Locations of the 10-Q and 10-K parquet files.
# NOTE(review): "dtm" in the paths presumably means document-term matrix -- confirm.
# NOTE(review): absolute, machine-specific paths; consider a configurable data directory.
path_10q = "/Users/apple/PROJECT/hons_project/data/SP500/10Q/dtm/part-00000-c037b728-f21e-443d-8148-ae09abea42f7-c000.snappy.parquet"
path_10k = "/Users/apple/PROJECT/hons_project/data/SP500/10K/dtm/part-00000-bbf3389a-a3ed-4008-babf-f89139cffa93-c000.snappy.parquet"

# Load each table and report its (rows, columns) shape.
df = pd.read_parquet(path_10q)
print(df.shape)

df_10k = pd.read_parquet(path_10k)
print(df_10k.shape)

(18386, 12387)
(7539, 13677)
