In [3]:
pip install requests_html

Collecting requests_html
  Downloading requests_html-0.10.0-py3-none-any.whl (13 kB)
Collecting pyquery
  Downloading pyquery-1.4.3-py3-none-any.whl (22 kB)
Collecting pyppeteer>=0.0.14
  Downloading pyppeteer-1.0.2-py3-none-any.whl (83 kB)
[K     |████████████████████████████████| 83 kB 1.2 MB/s 
Collecting w3lib
  Downloading w3lib-1.22.0-py2.py3-none-any.whl (20 kB)
Collecting fake-useragent
  Downloading fake-useragent-0.1.11.tar.gz (13 kB)
Collecting parse
  Downloading parse-1.19.0.tar.gz (30 kB)
Collecting pyee<9.0.0,>=8.1.0
  Downloading pyee-8.2.2-py2.py3-none-any.whl (12 kB)
Collecting urllib3<2.0.0,>=1.25.8
  Downloading urllib3-1.26.9-py2.py3-none-any.whl (138 kB)
[K     |████████████████████████████████| 138 kB 13.1 MB/s 
Collecting websockets<11.0,>=10.0
  Downloading websockets-10.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (111 kB)
[K     |████████████████████████████████| 111 kB 42.6 MB/s 
Collecting cssselect>0

In [10]:
import time
import requests
import pandas as pd
import numpy as np
from urllib.error import HTTPError
from requests_html import HTMLSession


In [4]:
def get_cik(session, ticker):
    """Look up the SEC CIK code for a ticker by scraping sec.report.

    Parameters
    ----------
    session : object
        HTTP session exposing ``get(url, timeout=...)`` whose response offers
        ``html.xpath(...)`` (the notebook passes an ``HTMLSession``).
    ticker : str
        Ticker symbol; interpolated into the sec.report URL.

    Returns
    -------
    str
        The digit characters of the scraped CIK token, or ``''`` when the
        lookup fails (request error, unexpected page layout, ...).

    Notes
    -----
    Only ``Exception`` subclasses are swallowed, so ``KeyboardInterrupt`` and
    ``SystemExit`` still propagate and a long scrape can be interrupted.
    """
    print(f'Getting CIK code for {ticker}')

    cik_code = ""

    try:
        url = f'https://sec.report/Ticker/{ticker}'
        r = session.get(url, timeout=30)

        # The CIK is read from the third space-separated token of the first
        # <h2> matched by this absolute XPath; any layout change on the page
        # ends up in the generic handler below.
        heading = r.html.xpath('/html/body/div[1]/div/h2[1]')[0].text
        token = heading.split(' ')[2]
        # Keep only the digits (drops punctuation around the number).
        cik_code = "".join(filter(str.isdigit, token))

        print(f'{ticker} CIK: {cik_code}')
    except HTTPError as err:
        # NOTE(review): HTTPError is the urllib class imported at the top of
        # the notebook; a requests-based session presumably raises its own
        # exception types, which are handled by the generic branch - confirm.
        print(f'Requesting {ticker}: {err}')
    except Exception:
        print(f'Couldn\'t get CIK for {ticker}')

    return cik_code

In [5]:
class FormsNotFoundException(Exception):
    """Raised when neither an S-1 nor an F-1 form is found.

    NOTE(review): nothing in the visible notebook cells raises or catches this
    exception - confirm whether it is still needed.
    """


In [6]:
def get_submissions(cik_code):
    """Fetch the SEC EDGAR submissions JSON for a company.

    Parameters
    ----------
    cik_code : str
        CIK digits, with or without leading zeros. An empty string skips the
        request entirely.

    Returns
    -------
    dict or str
        The parsed JSON payload on success; ``''`` when ``cik_code`` is empty
        or the request / JSON decoding fails.

    Notes
    -----
    Only ``Exception`` subclasses are swallowed, so ``KeyboardInterrupt``
    still propagates.
    """
    print(f'Getting submissions for {cik_code}')
    subs = ''

    if cik_code == '':
        return subs

    # The submissions endpoint is addressed by the 10-digit CIK including
    # leading zeros, so pad shorter codes before building the URL.
    url = f'https://data.sec.gov/submissions/CIK{str(cik_code).zfill(10)}.json'

    # NOTE(review): the User-Agent and From headers carry different e-mail
    # addresses - confirm which contact is intended.
    headers = {
        'User-Agent': 'Mike Brover (mike.brover@gmail.com)',
        'From': 'john.testorsu@gmail.com'
    }

    try:
        res = requests.get(url, headers=headers, timeout=20)
        # Treat HTTP error statuses as failures instead of trying to decode
        # an error page as JSON.
        res.raise_for_status()
        subs = res.json()
    except Exception:
        print(f'Failed to get submissions for {cik_code}')

    return subs

In [7]:
def get_access_codes(submissions):
    """Pick the accession number and primary document of an S-1 / F-1 filing.

    Parameters
    ----------
    submissions : dict or str
        Submissions payload with parallel lists under
        ``['filings']['recent']`` (``'form'``, ``'accessionNumber'``,
        ``'primaryDocument'``), or ``''`` when no payload is available.

    Returns
    -------
    tuple of (str, str)
        ``(accession_number, primary_document)`` for the first entry in
        ``'form'`` whose name contains ``'S-1'`` or ``'F-1'``; the accession
        number has its dashes removed. ``('', '')`` when there is no payload,
        no matching form, or the payload cannot be read. The two values are
        always both set or both empty.

    Notes
    -----
    Matching is by substring, so variants such as ``'S-1/A'`` match, as does
    any other form name containing those characters (e.g. ``'S-11'``,
    ``'F-10'``).
    NOTE(review): confirm whether those extra matches are wanted.
    """
    accession_number = ''
    primary_document = ''

    if submissions == '':
        return accession_number, primary_document

    try:
        recent_filings = submissions['filings']['recent']

        # Position of the first matching form, or -1 when none matches.
        index = next(
            (
                i
                for i, form_name in enumerate(recent_filings['form'])
                if ('S-1' in form_name) or ('F-1' in form_name)
            ),
            -1,
        )

        if index == -1:
            print('Neither S-1 nor F-1 form was found')
            return accession_number, primary_document

        # Resolve both values before assigning either, so a failure on the
        # second lookup cannot leave a half-filled result.
        accession = recent_filings['accessionNumber'][index].replace('-', '')
        document = recent_filings['primaryDocument'][index]
        accession_number, primary_document = accession, document
    except Exception:
        # KeyboardInterrupt / SystemExit are deliberately not caught here.
        print('Failed to get access codes')

    return accession_number, primary_document

In [8]:
def get_report_url(cik_code, accession_number, primary_document):
    """Build the EDGAR archive URL for a filing's primary document.

    Returns the literal string ``'Not found'`` when any of the three
    components equals the empty string; otherwise the components are placed,
    in order, as path segments under the EDGAR ``data`` archive.
    """
    parts = (cik_code, accession_number, primary_document)
    if '' in parts:
        return 'Not found'
    return 'https://www.sec.gov/Archives/edgar/data/{}/{}/{}'.format(*parts)

In [12]:
!gdown --id 1dY2urwIk-1sdwFdXz8fZymxtfGqS0JeC

Downloading...
From: https://drive.google.com/uc?id=1dY2urwIk-1sdwFdXz8fZymxtfGqS0JeC
To: /content/data_features_001 (3).zip
  0% 0.00/57.1k [00:00<?, ?B/s]100% 57.1k/57.1k [00:00<00:00, 44.1MB/s]


In [13]:
from zipfile import ZipFile

# Unpack every member of the downloaded archive into the current working
# directory, then report completion.
file_name = '/content/data_features_001 (3).zip'

with ZipFile(file_name, mode='r') as archive:
    archive.extractall()
    print('Done')

Done


In [16]:
# Load the extracted feature table; the literal string 'NA' is read as missing.
df = pd.read_csv(
    '/content/data_features_001 (3).csv',
    index_col=None,
    na_values=['NA'],
)

In [None]:
# Resolve a filing URL for every symbol in the feature table and save the
# ticker -> URL mapping as a CSV.
tickers = df['Symbol'].to_numpy()

print(tickers)

report_urls = {}

# The context manager closes the HTTP session when the loop ends, including
# when it is interrupted part-way through.
with HTMLSession() as session:
    for i, ticker in enumerate(tickers):
        try:
            print(f'{i}/{len(tickers)} - {ticker}')
            cik_code = get_cik(session, ticker)
            sub_json = get_submissions(cik_code)
            accession_number, primary_document = get_access_codes(sub_json)
            report_url = get_report_url(cik_code, accession_number, primary_document)

            report_urls[ticker] = [report_url]
            print('------------------')
        except Exception as err:
            # Keep going on unexpected errors, but show what went wrong.
            # KeyboardInterrupt is not caught, so the run can be stopped.
            print(f'Failed to get url of {ticker}')
            print(f'{type(err).__name__}: {err}')

        # Throttle after every ticker, whether it succeeded or failed.
        time.sleep(0.2)

out_df = pd.DataFrame.from_dict(report_urls, orient='index', columns=['URL'])
print(out_df)
# NOTE(review): this path is under /content/drive, but no visible cell mounts
# Google Drive - confirm the mount happens before this cell runs.
out_df.to_csv('/content/drive/My Drive/IPO_Project/data/processed_data/report_urls.csv')