In [159]:
import pandas as pd
from bs4 import BeautifulSoup as BS
import requests
import pickle
import re

In [None]:
# 13F-HR, 13F-HR/A, 13F-NT, and 13F-NT/A XML

In [243]:
def get_13F_url(cik,date_prior):
    """
    Find the information-table document URL of a fund's 13F filing on SEC EDGAR.

    cik = str, the fund's CIK (surrounding whitespace is ignored)
    date_prior = str YYYYMMDD, sent to EDGAR as the `dateb` query parameter

    Returns the absolute document URL (str). Returns None, after printing a
    message, when no 13F filing row or no information-table document is found.
    """
    base_url = "https://www.sec.gov"

    # Stray whitespace around the CIK would otherwise end up inside the query string.
    cik = str(cik).strip()
    res = requests.get("https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=%s&dateb=%s&owner=exclude&count=100"%(cik,date_prior))
    soup = BS(res.content,"lxml")

    # Step 1: take the first filing row whose form-type cell looks like a 13F.
    # The first five <tr> elements of the page are not filing rows and are skipped.
    url = None
    for row in soup.findAll('tr')[5:]:
        cols = row.findAll('td')
        # Header/spacer rows have no (or too few) <td> cells; indexing them would raise.
        if len(cols) < 2:
            continue
        form_type = cols[0].text
        # '13-' is kept so the original matching rule is unchanged.
        if '13F' in form_type or '13-' in form_type:
            link = cols[1].find('a',href=True)
            if link is None:
                continue
            url = link['href']
            break

    if not url:
        print("No 13F Found")
        return

    # Step 2: on the filing's index page, pick the information-table document
    # that is offered as .html.
    res = requests.get(base_url + url)
    soup = BS(res.content,"lxml")
    doc_url = None
    for row in soup.findAll('tr')[1:]:
        cols = row.findAll('td')
        if len(cols) < 4:
            continue
        label = cols[3].text.lower().replace(' ','')
        if 'informationtable' in label and '.html' in cols[2].text:
            link = cols[2].find('a',href=True)
            if link is None:
                continue
            doc_url = link['href']
            break

    if not doc_url:
        print("No 13F Document Found")
        return

    return base_url + doc_url

def fund_to_13F_url_mapping(funds,date_prior):
    """
    Resolve the 13F document URL for every fund in `funds`.

    input: funds - dict mapping fund name -> fund CIK
           date_prior - str YYYYMMDD, forwarded unchanged to get_13F_url
    output: dict mapping fund name -> the value get_13F_url returned for that CIK
    """
    return {name: get_13F_url(cik,date_prior) for name, cik in funds.items()}

def cusip_to_ticker(cusip):
    """
    Look up the ticker for a CUSIP on the Fidelity symbol-lookup page.

    input: cusip - str, appended verbatim to the lookup URL
    output: text of the first hyperlink found in the table at index 7 of the response page
    """
    lookup_url = "https://quotes.fidelity.com/mmnet/SymLookup.phtml?reqforlookup=REQUESTFORLOOKUP&for=stock&by=cusip&criteria=" + cusip
    page = BS(requests.get(lookup_url).content,"lxml")
    # The lookup result is read from the eighth <table> on the page.
    result_table = page.findAll('table')[7]
    return result_table.find('a',href=True).text

def get_cusips(url_list):
    """
    Collect the distinct CUSIPs listed across a set of 13F information-table pages.

    input: url_list - despite the name, a dict; only its values (document URLs) are used.
           Entries whose URL is None or empty are skipped.
    output: set of str, the text of the third cell of every holdings row
    """
    cusips = set()
    for url in url_list.values():
        # get_13F_url returns None when no document was located; nothing to fetch then.
        if not url:
            continue

        res = requests.get(url)
        soup = BS(res.content,"lxml")
        # The first 11 <tr> elements are skipped (presumably the table's header rows).
        for row in soup.find_all('tr')[11:]:
            position = row.find_all('td')
            # Rows without a third cell cannot carry a CUSIP.
            if len(position) < 3:
                continue
            cusips.add(position[2].text)

    return cusips



def create_funds_to_stock_mapping(funds_url,cusip_tickers):
    """
    Parse every fund's 13F information table into a list of holdings.

    input: funds_url - dict mapping fund name -> 13F document URL
                       (None when no document was found for the fund)
           cusip_tickers - dict mapping CUSIP -> ticker, forwarded to parse_13F_doc_html
    output: dict mapping fund name -> list of holdings rows;
            funds without a URL map to an empty list
    """
    output = {}
    for fund, url in funds_url.items():
        # A missing URL must not abort the whole batch; record the fund with no holdings.
        if not url:
            print("Skipping %s: no 13F document url"%(fund,))
            output[fund] = []
            continue

        print("Parsing %s \nwith url %s"%(fund,url))
        output[fund] = parse_13F_doc_html(url,cusip_tickers)
        print("Parsed %s Rows"%(len(output[fund])))
    return output
        
    
def parse_13F_doc_html(url,cusip_tickers):
    """
    Parse the HTML rendering of a 13F information table.

    input: url - url of the 13F document
           cusip_tickers - dict mapping CUSIP -> ticker; a CUSIP missing from it raises KeyError
    output: list of dicts, one per holdings row, with keys NAME_OF_ISSUER, TICKER,
            TITLE_OF_CLASS, CUSIP, VALUE (int), SHARES (int) and PUT/CALL
    """
    res = requests.get(url)
    soup = BS(res.content,"lxml")
    output = []
    # The first 11 <tr> elements are skipped (presumably the table's header rows).
    for row in soup.find_all('tr')[11:]:
        position = row.find_all('td')
        # Cells 0-6 are read below; shorter rows are not holdings rows and would raise IndexError.
        if len(position) < 7:
            continue

        cusip = position[2].text
        output.append({
            "NAME_OF_ISSUER": position[0].text,
            "TICKER": cusip_tickers[cusip],
            "TITLE_OF_CLASS": position[1].text,
            "CUSIP": cusip,
            # Strip thousands separators and any other non-digits before converting.
            "VALUE": int(re.sub('[^0-9]+', '',position[3].text)),
            "SHARES": int(re.sub('[^0-9]+', '',position[4].text)),
            "PUT/CALL": position[6].text,
        })

    return output
        
    
def parse_13F_doc_xml(url,cusip_tickers):
    """
    Parse the XML form of a 13F information table.

    input: url - url of the 13F document
           cusip_tickers - dict mapping CUSIP -> ticker
    output: list of dicts, one per <infotable> entry whose CUSIP is present in
            cusip_tickers. Keys: cusip, name, ticker, and - when the element
            exists - value (int), shares (int), put/call.
            Entries without a CUSIP or with an unmapped CUSIP are omitted
            (they used to be emitted as empty dicts).
    """
    res = requests.get(url)
    soup = BS(res.content,"lxml")
    output = []

    for row in soup.findAll('infotable'):
        # Drop holdings that cannot be tied to a known ticker instead of appending {}.
        if not row.cusip or row.cusip.text not in cusip_tickers:
            continue

        cusip = row.cusip.text
        row_dict = {'cusip': cusip}
        if row.nameofissuer:
            row_dict['name'] = row.nameofissuer.text
        row_dict['ticker'] = cusip_tickers[cusip]
        if row.value:
            # Strip thousands separators and any other non-digits before converting.
            row_dict['value'] = int(re.sub('[^0-9]+', '',row.value.text))
        if row.sshprnamt:
            row_dict['shares'] = int(re.sub('[^0-9]+', '',row.sshprnamt.text))
        if row.putcall:
            row_dict['put/call'] = row.putcall.text

        output.append(row_dict)

    return output

In [128]:
# Fund name -> SEC CIK (zero-padded, 10 digits) used to look up each fund's filings on EDGAR.
fund_ciks = {'Bridgewater Associates':'0001350694',
 'Renaissance Technologies': '0001037389',
 # FIXME: same CIK as Renaissance Technologies above, so this entry resolves to
 # Renaissance's filing; replace with Man Group's own CIK.
 'Man Group': '0001037389',
 'Millennium Management': '0001273087',
 'Elliott Management': '0001048445',
 # The key keeps its trailing space so previously saved results still match;
 # the stray leading space that was in the CIK value has been removed.
 'BlackRock ': '0001364742',
 'Two Sigma Investments': '0001179392',
 'TCI Fund Management': '0001647251',
 'Citadel': '0001423053',
 'D.E. Shaw Group': '0001009207',
 'AQR Capital Management': '0001167557',
 'Davidson Kempner Capital Management': '0001595082',
 'Farallon Capital Management.': '0000909661',
 'Baupost Group': '0001061768',
 'Marshall Wace': '0001318757',
 'Capula Investment Management': '0001557017',
 'Canyon Capital': '0001074034',
 'Viking Global Investors': '0001103804',
 'Point72 Asset Management': '0001603466',
 'York Capital Management': '0001480532',
 'Element Capital Management': '0001535630',
 'Cevian Capital': '0001365341',
 'GoldenTree Asset Management': '0001278951',
 'Graham Capital Management': '0001315421',
 'Anchorage Capital Group': '0001300714',
 'King Street Capital Management': '0001218199',
 'Angelo Gordon': '0000860662',
 'D1 Capital Partners': '0001747057',
 'ExodusPoint Capital': '0001736225',
 'Pershing Square': '0001336528',
 'Lone Pine Capital': '0001061165',
}

In [114]:
# Persist the fund -> CIK mapping; the context manager closes (and flushes) the file.
with open("Fund CIKs.pickle","wb") as fh:
    pickle.dump(fund_ciks,fh)

In [210]:
# Resolve each fund's 13F information-table URL, passing 20210101 (YYYYMMDD) as the cutoff date.
funds_13F_url = fund_to_13F_url_mapping(fund_ciks,'20210101')

In [217]:
# Persist the fund -> 13F document URL mapping; the context manager closes (and flushes) the file.
with open("funds_13F_url.pickle","wb") as fh:
    pickle.dump(funds_13F_url,fh)

In [211]:
# Display the fund -> 13F document URL mapping.
funds_13F_url

{'Bridgewater Associates': 'https://www.sec.gov/Archives/edgar/data/1350694/000156761920019382/xslForm13F_X01/form13fInfoTable.xml',
 'Renaissance Technologies': 'https://www.sec.gov/Archives/edgar/data/1037389/000103738920000322/xslForm13F_X01/renaissance13Fq32020_holding.xml',
 'Man Group': 'https://www.sec.gov/Archives/edgar/data/1037389/000103738920000322/xslForm13F_X01/renaissance13Fq32020_holding.xml',
 'Millennium Management': 'https://www.sec.gov/Archives/edgar/data/1273087/000127308720000034/xslForm13F_X01/MLP_Filing_20200930_final.xml',
 'Elliott Management': 'https://www.sec.gov/Archives/edgar/data/1048445/000156761920019906/xslForm13F_X01/form13fInfoTable.xml',
 'BlackRock ': 'https://www.sec.gov/Archives/edgar/data/1364742/000142645920000005/xslForm13F_X01/form13fInfoTable.xml',
 'Two Sigma Investments': 'https://www.sec.gov/Archives/edgar/data/1179392/000091957420007141/xslForm13F_X01/infotable.xml',
 'TCI Fund Management': 'https://www.sec.gov/Archives/edgar/data/1647251

In [218]:
# Collect the distinct CUSIPs found across all the funds' information tables.
cusips = get_cusips(funds_13F_url)

In [224]:
# Persist the CUSIP set; the context manager closes (and flushes) the file.
with open("cusips.pickle","wb") as fh:
    pickle.dump(cusips,fh)

In [223]:
# Number of distinct CUSIPs collected.
len(cusips)

6834

In [234]:
# Look up a ticker for every CUSIP, one web request per CUSIP.
cusip_tickers = {}
for done, cusip in enumerate(cusips):

    # Every 100 lookups: report progress and checkpoint the partial mapping.
    # The context manager closes (and flushes) each checkpoint file.
    if not done%100:
        print("completed ", done)
        with open("cusip_tickers.pickle","wb") as fh:
            pickle.dump(cusip_tickers,fh)

    cusip_tickers[cusip] = cusip_to_ticker(cusip)

completed  0
completed  100
completed  200
completed  300
completed  400
completed  500
completed  600
completed  700
completed  800
completed  900
completed  1000
completed  1100
completed  1200
completed  1300
completed  1400
completed  1500
completed  1600
completed  1700
completed  1800
completed  1900
completed  2000
completed  2100
completed  2200
completed  2300
completed  2400
completed  2500
completed  2600
completed  2700
completed  2800
completed  2900
completed  3000
completed  3100
completed  3200
completed  3300
completed  3400
completed  3500
completed  3600
completed  3700
completed  3800
completed  3900
completed  4000
completed  4100
completed  4200
completed  4300
completed  4400
completed  4500
completed  4600
completed  4700
completed  4800
completed  4900
completed  5000
completed  5100
completed  5200
completed  5300
completed  5400
completed  5500
completed  5600
completed  5700
completed  5800
completed  5900
completed  6000
completed  6100
completed  6200
comp

In [236]:
# Count the CUSIPs whose lookup produced an empty (falsy) ticker.
len([t for t in cusip_tickers.values() if not t])

502

In [237]:
# Persist the complete CUSIP -> ticker mapping; the context manager closes (and flushes) the file.
with open("cusip_tickers.pickle","wb") as fh:
    pickle.dump(cusip_tickers,fh)

In [244]:
# Parse each fund's information table into a list of holdings dicts (prints progress per fund).
funds_to_stocks = create_funds_to_stock_mapping(funds_13F_url,cusip_tickers)

Parsing Bridgewater Associates 
with url https://www.sec.gov/Archives/edgar/data/1350694/000156761920019382/xslForm13F_X01/form13fInfoTable.xml
Parsed 430 Rows
Parsing Renaissance Technologies 
with url https://www.sec.gov/Archives/edgar/data/1037389/000103738920000322/xslForm13F_X01/renaissance13Fq32020_holding.xml
Parsed 3333 Rows
Parsing Man Group 
with url https://www.sec.gov/Archives/edgar/data/1037389/000103738920000322/xslForm13F_X01/renaissance13Fq32020_holding.xml
Parsed 3333 Rows
Parsing Millennium Management 
with url https://www.sec.gov/Archives/edgar/data/1273087/000127308720000034/xslForm13F_X01/MLP_Filing_20200930_final.xml
Parsed 4381 Rows
Parsing Elliott Management 
with url https://www.sec.gov/Archives/edgar/data/1048445/000156761920019906/xslForm13F_X01/form13fInfoTable.xml
Parsed 1 Rows
Parsing BlackRock  
with url https://www.sec.gov/Archives/edgar/data/1364742/000142645920000005/xslForm13F_X01/form13fInfoTable.xml
Parsed 45426 Rows
Parsing Two Sigma Investments 
w

In [248]:
# Inspect one fund's parsed holdings as a DataFrame.
pd.DataFrame(funds_to_stocks['Two Sigma Investments'])

Unnamed: 0,NAME_OF_ISSUER,TICKER,TITLE_OF_CLASS,CUSIP,VALUE,SHARES,PUT/CALL
0,1LIFE HEALTHCARE INC,ONEM,COM,68269G107,6721,237002,
1,1LIFE HEALTHCARE INC,ONEM,COM,68269G107,547,19300,Call
2,1LIFE HEALTHCARE INC,ONEM,COM,68269G107,346,12200,Put
3,21VIANET GROUP INC,VNET,SPONSORED ADS A,90138A103,1747,75431,
4,2U INC,TWOU,COM,90214J101,25205,744392,
...,...,...,...,...,...,...,...
3008,ZUMIEZ INC,ZUMZ,COM,989817101,586,21053,
3009,ZUORA INC,ZUO,COM CL A,98983V106,362,34990,
3010,ZYNGA INC,ZNGA,CL A,98986T108,44721,4903566,
3011,ZYNGA INC,ZNGA,CL A,98986T108,2394,262500,Call


In [249]:
# Persist the fund -> holdings mapping; the context manager closes (and flushes) the file.
with open("funds_to_stocks.pickle","wb") as fh:
    pickle.dump(funds_to_stocks,fh)

In [250]:
# Keep only the funds whose parsed table has more than 30 rows.
funds_to_include = [fund for fund, holdings in funds_to_stocks.items() if len(holdings)>30]

In [253]:
# Persist the list of retained fund names; the context manager closes (and flushes) the file.
with open("funds_to_include.pickle","wb") as fh:
    pickle.dump(funds_to_include,fh)