In [12]:
# Importing Libraries
import requests, json
from selenium import webdriver
from selenium.webdriver.common.keys import Keys ## all keyboard keys imported
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import os, time
from bs4 import BeautifulSoup
import urllib
import shutil
import pandas as pd
import copy
import warnings


In [13]:
class Scrape_Filing(webdriver.Chrome):
    def __init__(self, driver_path=None, teardown=False):
        if driver_path == None:
            driver_path = os.path.abspath(os.getcwd()) ## Gets current working directory
        self.driver_path = driver_path
        self.teardown = teardown
        os.environ['PATH'] += self.driver_path
        options = webdriver.ChromeOptions()
        #options.add_experimental_option('excludeSwitches', ['enable-logging']) ## stops the warnings related to reading file descriptors logs
        
        download_path = driver_path + '\\path_files'   # set path for when we download the filing to minimize requests sent to server when we parse it 
        self.download_path = download_path
        preferences = {
            "download.default_directory":download_path
            
        }
        options.add_experimental_option("prefs", preferences)
       
        super(Scrape_Filing, self).__init__(options=options) ## use super to instantiate the webdriver.chrome class
        self.implicitly_wait(15)
        # self.maximize_window()

    def __exit__(self, exc_type, exc_val, exc_tb): ## method to close the chrome window
        if self.teardown:
            self.quit()
            
    def find_file(self, name, path):
        for root, dirs, files in os.walk(path):
            if name in files:
                return os.path.join(root, name)
 
        
    def find_cik(self, company="BlackRock Inc."):
        self.get("https://www.sec.gov/edgar/searchedgar/cik.htm")
        company_element = self.find_element_by_id("company")
        company_element.send_keys(company)
        submit_element = self.find_element_by_class_name("search-button")
        submit_element.click() 
       
        try:

            table_element = self.find_element_by_css_selector("table[summary='Results of CIK Lookup']")
            rows = table_element.find_elements(By.TAG_NAME, "tr")

            td_row = rows[0].find_elements(By.TAG_NAME, "td") ## d_row has 2 rows
            
            pre_list = td_row[1].find_elements(By.TAG_NAME, "pre") ## there are 2 <pre> tags in 2nd rows
            cik = pre_list[1].find_elements(By.TAG_NAME, "a")[0].text ## we access the very first <pre> tag and the first a tags text is our cik 
            return cik
        except Exception as e:
            print("try a different name")
            
    def find_filing_address(self, cik, year=2020, quarter=1, only_10k=False): ## by default finds 10k, otherwise choose from quarter=1-4
        new_filename = f"master_{year}_{quarter}.txt"
        if self.find_file(new_filename, self.download_path): ## To prevent re downloading files, and make the program more efficient
            
            print("Found File")
        else:
        
            master_idx_url = f"https://www.sec.gov/Archives/edgar/full-index/{str(year)}/QTR{str(quarter)}/"
            self.get(master_idx_url)
            table_element = self.find_element_by_css_selector("table[summary='heding']")
            rows_element = table_element.find_elements(By.TAG_NAME, "tr")
            master_idx_link = rows_element[11].find_element(By.TAG_NAME, "a")
            master_idx_link.click()
            time.sleep(2) ## sleep to let file download


            filename = max([self.download_path + "\\" + f for f in os.listdir(self.download_path)],key=os.path.getctime)
            print(filename)

            shutil.move(filename,os.path.join(self.download_path,new_filename))
            
        
        stripped_cik = cik.lstrip("0") ## strip the preceding zeroes in the string cik
        add_10q = []
        add_10k = []
        with open(self.download_path + "\\" + new_filename) as fp:
            for line in fp:
                if stripped_cik in line:
                    if "10-Q" in line:
                        add_10q.append(line) 
                    if "10-K" in line:
                        add_10k.append(line)
                        
        if add_10k == [] and add_10q == []:
            print("No filings found")
            return None
                        
        return_address = []
        
        if only_10k == True:
            if add_10k == []:
                return None
            else:
                for line in add_10k:
                    return_address.append(line.split('|')[4])
                complete_address = "https://www.sec.gov/Archives/" + return_address[0].strip("\n")
                return complete_address ## for now only returns complete address of first filing we find that is either a 10-K or 10-Q
            
                        
        ## Single quarter archive has multiple 10-Q, or both 10-K and 10-Q reports for some company(ie. some international companies)
        for line in add_10q:
            return_address.append(line.split('|')[4])
            
        complete_address = "https://www.sec.gov/Archives/" + return_address[0].strip("\n")
        return complete_address ## for now only returns complete address of first filing we find that is either a 10-K or 10-Q
            
        
      
    def find_10k_address(self, cik, year): ## Since companies may release their 10k's in different quarters
        for i in range(1, 5):
            address_10k = self.find_filing_address(cik=cik, year=year, quarter=i, only_10k=True)
            if address_10k != None:
                return address_10k
        return None
                        

In [14]:
class ParseFiling():
    def __init__(self):
        self.filing = dict()
        self.filing['sec_header_content'] = {}
        self.filing['filing_documents'] = None
        self.test = 0
        
    def retrieve_filing(self, file_address):
        response = requests.get(file_address)
        filing = BeautifulSoup(response.content, 'lxml')
        sec_header_tag = filing.find('sec-header')
        

        
        display(sec_header_tag)
        
        # find condendsed consolidated statements of financial condition
        #<table border="0" cellspacing="0" cellpadding="0" style="margin:auto;border-collapse:collapse; width:100%;">
        print("huhhhshh")
        i = 0
#         for filing_document in filing.find('document'):
#             document_filename = filing_document.filename.find(text=True, recursive=False).strip()
#             #print(i, ": ", document_filename)
#             display(document_filename) 
#             i+=1
#             #master_document_dict[document_id]['document_filename'] = document_filename
            
        
                
                
    

In [16]:
BlackRock10k = Scrape_Filing()
#cik = BlackRock10k.find_cik("BlackRock Inc.")
cik = '0001364742'
file_address = BlackRock10k.find_filing_address(cik, year=2019, quarter=2)

Found File


In [30]:
Parse_BR = ParseFiling()
Parse_BR.retrieve_filing(file_address)


None

huhhhshh


In [7]:
cik

'0001364742'

In [17]:
MicrosoftCorp10k = Scrape_Filing()
ms_cik = MicrosoftCorp10k.find_cik("Microsoft Corp") # use abbreviations such as Corp, Ltd, Inc, etc.
print(ms_cik)
MicrosoftCorp10k.find_filing_address(ms_cik, 2019, 3)
MicrosoftCorp10k.find_filing_address(ms_cik, 2019, 4)


0000789019
Found File


IndexError: list index out of range

In [34]:
# general steel holdings
warnings.filterwarnings('ignore')
GS_10k = Scrape_Filing()
gs_cik = GS_10k.find_cik("GENERAL STEEL HOLDINGS INC")
# GS_10k.find_filing_address(gs_cik, 2019, 1)
# GS_10k.find_filing_address(gs_cik, 2019, 2)
# GS_10k.find_filing_address(gs_cik, 2019, 3)
# GS_10k.find_filing_address(gs_cik, 2019, 4)
GS_10k.find_10k_address(gs_cik, 2019)

Found File
Found File


'https://www.sec.gov/Archives/edgar/data/1239188/0001144204-19-017485.txt'

In [18]:
MicrosoftCorp10k = Scrape_Filing()
ms_cik = MicrosoftCorp10k.find_cik("Microsoft Corp")
print(ms_cik)
add_10k = MicrosoftCorp10k.find_10k_address(ms_cik, 2019)
print(add_10k)


0000789019
Found File
Found File
Found File
https://www.sec.gov/Archives/edgar/data/789019/0001564590-19-027952.txt


True

In [4]:
import os
os.path.abspath(os.getcwd())

'C:\\Users\\darsh\\personal-repo\\projects\\sec_scrape'