In [1]:
import pandas as pd
import re
from bs4 import BeautifulSoup
import unicodedata
import numpy as np
import requests
import json

In [2]:
file_path = "/Users/thesavage/Global_Key/Data/final_tickers.xlsx"

In [3]:
companies_df = pd.read_excel(file_path)

In [4]:
companies_df = companies_df.astype(str)
companies_df["SEC ID"] = companies_df["SEC ID"].apply(lambda x: x.zfill(10))

In [5]:
companies_df

Unnamed: 0,Ticker,Name,SEC ID,CountryCode
0,,,0001448983,CN
1,000029.SZ,ShenZen Special Economic Zone Real Estate,0980000109,CN
2,0123.HK,Yuexiu Property,0980000104,HK
3,0RDM.IL,ABN AMRO GROUP NV ABN AMRO GROU,0001699968,NL
4,5021.T,"Cosmo Energy Holdings Co., Ltd.",0001668393,JP
...,...,...,...,...
7412,ZURVY,ZURICH INSURANCE GROUP,0001277799,CH
7413,ZVO,Zovio Inc,0001305323,US
7414,ZY,"Zymergen, Inc.",0001645842,US
7415,ZYME,Zymeworks,0001403752,CA


In [6]:
us_companies = companies_df.loc[companies_df["CountryCode"] == "US"]

In [7]:
us_companies.tail(50)

Unnamed: 0,Ticker,Name,SEC ID,CountryCode
7337,XPL,Solitario Exploration & Royalty Corp,917225,US
7338,XPO,"Express-1 Expedited Solutions, Inc.",1166003,US
7339,XRAY,DENTSPLY INTERNATIONAL INC,818479,US
7341,XRX,Xerox Corp,108772,US
7342,XSPA,"XpresSpa Group, Inc.",1410428,US
7344,XTNT,"Xtant Medical Holdings, Inc.",1453593,US
7347,XYL,Xylem Inc.,1524472,US
7348,Y,Alleghany Corp,775368,US
7353,YELP,Yelp Incorporated,1345016,US
7354,YETI,YETI Holdings,1670592,US


In [8]:
header = {"User-Agent": "cflan1278@gmail.com"}

In [9]:
def get_cik(ticker, df):
    cik = df[df["Ticker"] == ticker]["SEC ID"].values[0]
    return cik

In [10]:
def get_endpoint(ticker, df, endpoint = "submissions"):
    cik = get_cik(ticker, df)
    endpoint_url = f"https://data.sec.gov/{endpoint}/CIK{cik}.json"
    return cik, endpoint_url

In [11]:
def filings_df(endpoint):
    edgar_json = requests.get(endpoint, headers = header).json()
    company_filings = edgar_json["filings"]["recent"]
    filings_df = pd.DataFrame(company_filings)
    return filings_df

In [12]:
def certain_file(file_type, df):
    return df[df["form"] == file_type] 

In [13]:
def create_request_link(cik, access_num, file_name):
    return f"https://www.sec.gov/Archives/edgar/data/{cik}/{access_num}/{file_name}"


In [14]:
def access_s_1(ticker):
    cik, endpoint = get_endpoint(ticker, us_companies)
    filings = filings_df(endpoint)
    s_1 = filings[filings["form"] == "S-1"].copy()
    s_1["accessionNumber"] = s_1.apply(lambda row: row["accessionNumber"].replace("-", ""), axis = 1)
    s_1["file_endpoint"] = s_1.apply(lambda row: create_request_link(cik, row['accessionNumber'], row['primaryDocument']), axis=1)
    request_url = s_1["file_endpoint"].values[-1]
    print(request_url)
    return requests.get(request_url, headers = header)


In [15]:
# found this on stack over flow just cleans the text more.


def restore_windows_1252_characters(restore_string):
    """
        Replace C1 control characters in the Unicode string s by the
        characters at the corresponding code points in Windows-1252,
        where possible.
    """

    def to_windows_1252(match):
        try:
            return bytes([ord(match.group(0))]).decode('windows-1252')
        except UnicodeDecodeError:
            # No character at the corresponding code point: remove it.
            return ''
        
    return re.sub(r'[\u0080-\u0099]', to_windows_1252, restore_string)

In [16]:
s_1_text = access_s_1("ACIA").content

https://www.sec.gov/Archives/edgar/data/0001651235/000119312515409344/d46988ds1.htm


In [17]:
def split_pages(sec_doc):
    soup = BeautifulSoup(sec_doc, "html.parser")
    text = soup.find("text")
    
    # find page breaks
    page_breaks = text.find_all("hr", {"width" : "100%"})
    # make a list of page breaks
    page_breaks = [str(page_break) for page_break in page_breaks]
    
    if len(page_breaks) > 0:
        #create pattern
        regex_pattern = "|".join(map(re.escape, page_breaks))

        #split document along breaks (this stores pages in a list)
        split_filing = re.split(regex_pattern, str(text))
    
    return split_filing

In [18]:
def clean_filing(split_filing):
    '''
    create pages for the documents (keys of dictionary)
    saves the table and normalized text of each corresponding page
    
    '''
    # this is our filing when populated
    # keys = page #, values = cleaned page content
    cleaned_filing = {}
    
    for index, page in enumerate(split_filing):
        page_num = index # create page number 
        
        # clean and normalize pages of filing
        # then store in above dictionary
        
        soup = BeautifulSoup(page, "html5") # adds tags and restructure html
        page_text = soup.html.body.get_text(" ", strip = True)
        
        normalized_text = restore_windows_1252_characters(unicodedata.normalize('NFKD', page_text)).replace("  ", " ")
        cleaned_filing[page_num] = normalized_text
        
    return cleaned_filing
    
        
# do the same for tables
    

In [19]:
# search for certain words and use regex expressions to find the managment page
search_words = {
    "key_words" : ["MANAGEMENT", "Chief Executive Officer", "Executive Officers", "Position", "Chief", "Name Age", "Director"],
    "regex" : []
}


In [20]:
def find_page(filing_dict, key_dict = 
              {
    "key_words" : ["MANAGEMENT", "Chief Executive Officer", "Executive Officers", "Position", "Chief", "Name Age", "Director"],
    "regex" : []
                  
}
             ):
    
    # dictionary whose key is page number
    # value is how may desired matches were on page
    num_matches = {}
    
    for page_num, content in filing_dict.items():
        num_matches[page_num] = 0
        key_words = key_dict["key_words"]
        # for each key word in the key word list
        # this list has the count for each word on each list
        counter = [content.count(word) for word in key_words]
        num_matches[page_num] += sum(counter) # assign the page number (key)
                                              # the num of word matches on that page
            
        # assuming the page with the most matches is the page we want to scrape
        
    return max(num_matches, key = num_matches.get)
        
    

In [21]:
def find_table(page_num, doc_split):
    """
    Given the matched page we will search this page for
    the executive managment table. We will clean and parse the
    table and return a df of that table.
    """
    
    # get rows fir the table
    page = doc_split[page_num]
    page_soup = BeautifulSoup(page, "html.parser")
    table_rows = page_soup.find("table").find_all("tr")
    
    # parse table 
    parsed_table = [
        [data.get_text(strip = True) for data in row]
        for row in table_rows
    ]
    
    # clean table more 
    cleaner_table = [
        [item for item in array if item != ""]
        for array in parsed_table
    ]
    
    # format table each array within array is a row of the table
    cleaned_table = [array for array in cleaner_table if len(array) != 0]
    
    # create dictionary that we can turn into a dataframe
    to_df_dict = {key : [] for key in cleaned_table[0]} # keys are the first item (array) in "cleaned table"
    
    # populate the dictionary
    for array in cleaned_table[1:]:
        for index, item in enumerate(array):
            to_df_dict[array[index]].append(item)
            
    # create and return df        
    df = pd.DataFrame(to_df_dict)
    
    
    return df

In [22]:
def exec_df(ticks):
    concat_list = []
    for tick in ticks:
        try:
            s1_text = access_s_1(tick).content
            split_s1 = split_pages(s1_text)
            cleaned_document = clean_filing(split_s1)
            page = find_page(cleaned_document)
            df = find_table(page, split_s1)
            concat_list.append(df)
            print(concat_list)
        except Exception as e:
            print(e)
            
            
    big_df = pd.concat(concat_list)
        
    big_df["company_tick"] = tick
        
        
        
    return big_df
exec_df(["ACIA"])           

https://www.sec.gov/Archives/edgar/data/0001651235/000119312515409344/d46988ds1.htm
'Murugesan “Raj” Shanmugaraj'


ValueError: No objects to concatenate

In [None]:
soup = BeautifulSoup(split[96], "html.parser")

In [None]:
tables = soup.find_all("table")
first_table = tables[0]
table_rows = first_table.find_all("tr")

In [None]:
parsed_table = [
    [data.get_text(strip = True) for data in row.find_all("td")]
    for row in table_rows
]

cleaner_table = [
    [item for item in array if item != ""]
    for array in parsed_table
]

cleaned_table = [array for array in cleaner_table if len(array) != 0]


In [None]:
parsed_table

In [None]:
cleaner_table

In [None]:
cleaned_table 

In [None]:
keys = cleaned_table[0]

to_df = {key : [] for key in keys}

contents = cleaned_table[1:]

for array in contents:
    for index, item in enumerate(array):
        to_df[keys[index]].append(item) 

to_df

In [None]:
pd.DataFrame(to_df)