In [3]:
import nltk
import wrds
import pandas as pd

In [4]:
conn = wrds.Connection()

WRDS recommends setting up a .pgpass file.
Created .pgpass file successfully.
Loading library list...
Done


In [5]:
print(conn.list_libraries())

['aha_sample', 'ahasamp', 'auditsmp', 'auditsmp_all', 'block', 'block_all', 'boardex_trial', 'boardsmp', 'calcbench_trial', 'calcbnch', 'cboe', 'cboe_all', 'ciq', 'ciq_capstrct', 'ciq_common', 'ciq_keydev', 'ciq_pplintel', 'ciqsamp', 'ciqsamp_common', 'ciqsamp_transcripts', 'columnar', 'comp', 'comp_bank', 'comp_bank_daily', 'comp_execucomp', 'comp_filings', 'comp_global', 'comp_global_daily', 'comp_na_annual_all', 'comp_na_daily_all', 'comp_na_monthly_all', 'comp_segments_hist', 'comp_segments_hist_daily', 'compa', 'compb', 'compg', 'compm', 'compsamp_snapshot', 'compseg', 'contrib', 'contrib_ceo_turnover', 'contrib_char_returns', 'contrib_general', 'contrib_intangible_value', 'contrib_kpss', 'contrib_liva', 'contrib_shale', 'crsp', 'crsp_a_ccm', 'crsp_a_indexes', 'crsp_a_stock', 'crsp_a_treasuries', 'crsp_q_mutualfunds', 'csmar', 'csmar_financial', 'csmar_funds', 'csmar_trade', 'dealscan', 'djones', 'djones_all', 'dmef', 'dmef_all', 'doe', 'doe_all', 'etfg_samp', 'etfgsamp', 'eurekah

In [6]:
### Get S&P500 Index Membership from CRSP
### I opt for the monthly frequency of the data,
### but one can choose to work with crsp.dsp500list
### if more precise date range is needed.

sp500 = conn.raw_sql("""
                        select a.*, b.date, b.ret
                        from crsp.msp500list as a,
                        crsp.msf as b
                        where a.permno=b.permno
                        and b.date >= a.start and b.date<= a.ending
                        and b.date>='01/01/2000'
                        order by date;
                        """, date_cols=['start', 'ending', 'date'])


### Add Other Company Identifiers from CRSP.MSENAMES
### - You don't need this step if only PERMNO is required
### - This step aims to add TICKER, SHRCD, EXCHCD and etc.

mse = conn.raw_sql("""
                        select comnam, ncusip, namedt, nameendt,
                        permno, shrcd, exchcd, hsiccd, ticker
                        from crsp.msenames
                        """, date_cols=['namedt', 'nameendt'])

# if nameendt is missing then set to today date
mse['nameendt']=mse['nameendt'].fillna(pd.to_datetime('today'))

# Merge with SP500 data
sp500_full = pd.merge(sp500, mse, how = 'left', on = 'permno')

# Impose the date range restrictions
sp500_full = sp500_full.loc[(sp500_full.date>=sp500_full.namedt) \
                            & (sp500_full.date<=sp500_full.nameendt)]


### Add Other Company Identifiers from CRSP.MSENAMES
### - You don't need this step if only PERMNO is required
### - This step aims to add TICKER, SHRCD, EXCHCD and etc.

mse = conn.raw_sql("""
                        select comnam, ncusip, namedt, nameendt,
                        permno, shrcd, exchcd, hsiccd, ticker
                        from crsp.msenames
                        """, date_cols=['namedt', 'nameendt'])

# if nameendt is missing then set to today date
mse['nameendt']=mse['nameendt'].fillna(pd.to_datetime('today'))

# Merge with SP500 data
sp500_full = pd.merge(sp500, mse, how = 'left', on = 'permno')

# Impose the date range restrictions
sp500_full = sp500_full.loc[(sp500_full.date>=sp500_full.namedt) \
                            & (sp500_full.date<=sp500_full.nameendt)]


### Add Compustat Identifiers
### - Link with Compustat's GVKEY and IID if need to work with
###   fundamental data
### - Linkage is done through crsp.ccmxpf_linktable

ccm=conn.raw_sql("""
                  select gvkey, liid as iid, lpermno as permno,
                  linktype, linkprim, linkdt, linkenddt
                  from crsp.ccmxpf_linktable
                  where substr(linktype,1,1)='L'
                  and (linkprim ='C' or linkprim='P')
                  """, date_cols=['linkdt', 'linkenddt'])

# if linkenddt is missing then set to today date
ccm['linkenddt']=ccm['linkenddt'].fillna(pd.to_datetime('today'))

# Merge the CCM data with S&P500 data
# First just link by matching PERMNO
sp500ccm = pd.merge(sp500_full, ccm, how='left', on=['permno'])

# Then set link date bounds
sp500ccm = sp500ccm.loc[(sp500ccm['date']>=sp500ccm['linkdt'])\
                        &(sp500ccm['date']<=sp500ccm['linkenddt'])]

# Rearrange columns for final output

sp500ccm = sp500ccm.drop(columns=['namedt', 'nameendt', 'linktype', \
                                  'linkprim', 'linkdt', 'linkenddt'])
sp500ccm = sp500ccm[['date', 'permno', 'comnam', 'ncusip',\
                     'shrcd', 'exchcd', 'hsiccd', 'ticker', \
                     'gvkey', 'iid', 'start', 'ending', 'ret']]


### Add CIKs and Link with SEC Index Files using CIK

names = conn.raw_sql(""" select gvkey, cik, sic, naics, gind, gsubind from comp.names """)

# Merge sp500 constituents table with names table
sp500cik = pd.merge(sp500ccm, names, on='gvkey',  how='left')
sp500cik.head(20)

Unnamed: 0,date,permno,comnam,ncusip,shrcd,exchcd,hsiccd,ticker,gvkey,iid,start,ending,ret,cik,sic,naics,gind,gsubind
0,2000-01-31,40416.0,AVON PRODUCTS INC,05430310,11.0,1.0,2844.0,AVP,1920,1,1967-05-18,2015-03-20,-0.035985,8868,2844,325620,303020,30302010
1,2000-01-31,44062.0,SPRINGS INDUSTRIES INC,85178310,11.0,1.0,2221.0,SMI,9963,1,1967-06-29,2000-12-11,-0.089202,93102,2211,313210,252010,25201020
2,2000-01-31,26403.0,DISNEY WALT CO,25468710,11.0,1.0,4833.0,DIS,3980,1,1976-07-01,2022-03-31,0.241453,1744489,4888,515120,502020,50202010
3,2000-01-31,60628.0,FEDEX CORP,31428X10,11.0,1.0,4513.0,FDX,4598,1,1980-11-06,2022-03-31,-0.033588,1048911,4513,492110,203010,20301010
4,2000-01-31,69032.0,MORGAN STANLEY DEAN WITTER & CO,61744644,11.0,1.0,6282.0,MWD,12124,1,1995-09-22,2022-03-31,-0.069002,895421,6211,523110,402030,40203020
5,2000-01-31,21186.0,WESTVACO CORP,96154810,11.0,1.0,2631.0,W,11446,1,1957-03-01,2022-03-31,-0.159004,1159297,2631,322130,151030,15103020
6,2000-01-31,52978.0,HASBRO INC,41805610,11.0,1.0,3944.0,HAS,5518,1,1984-09-13,2022-03-31,-0.204752,46080,3944,339930,252020,25202010
7,2000-01-31,21371.0,CARDINAL HEALTH INC,14149Y10,11.0,1.0,5122.0,CAH,2751,1,1997-05-27,2022-03-31,-0.003916,721371,5122,424210,351020,35102010
8,2000-01-31,75333.0,BURLINGTON RESOURCES INC,12201410,11.0,1.0,1311.0,BR,15084,1,1993-11-23,2006-03-31,-0.030246,833320,1311,211111,101020,10102020
9,2000-01-31,23317.0,ENRON CORP,29356110,11.0,1.0,1311.0,ENE,6127,1,1951-09-06,2001-11-29,0.529577,1024401,5172,422720,551050,55105010


In [7]:
len(sp500cik)

133640

In [8]:
sp500_2020 = sp500cik.loc[sp500cik.date=='12/31/2020'][['date', 'permno',
                                                      'comnam',
                                               'ncusip', 'gvkey', 'iid', 'cik', 'ticker', 'sic', 'naics']]

In [9]:
sp500_2020

Unnamed: 0,date,permno,comnam,ncusip,gvkey,iid,cik,ticker,sic,naics
125642,2020-12-31,90373.0,DIGITAL REALTY TRUST INC,25386810,160991,01,0001297996,DLR,6798,531120
125643,2020-12-31,69032.0,MORGAN STANLEY DEAN WITTER & CO,61744644,012124,01,0000895421,MS,6211,523110
125644,2020-12-31,57665.0,NIKE INC,65410610,007906,01,0000320187,NKE,3021,316210
125645,2020-12-31,57568.0,BALL CORP,05849810,001988,01,0000009389,BLL,3411,332431
125646,2020-12-31,85072.0,RALPH LAUREN CORP,75121210,064891,01,0001037038,RL,2300,315
...,...,...,...,...,...,...,...,...,...,...
126138,2020-12-31,67598.0,HEALTHPEAK PROPERTIES INC,42250P10,013125,01,0000765880,PEAK,6798,531120
126139,2020-12-31,59328.0,INTEL CORP,45814010,006008,01,0000050863,INTC,3674,334413
126140,2020-12-31,14277.0,SCHLUMBERGER LTD,80685710,009465,01,0000087347,SLB,1389,213112
126141,2020-12-31,75591.0,IDEX CORP,45167R10,015267,01,0000832101,IEX,3561,333914


In [10]:
len(sp500_2020)

501

In [11]:
sp500_2021 = sp500cik.loc[sp500cik.date=='12/31/2021'][['date', 'permno',
                                                      'comnam',
                                               'ncusip', 'gvkey', 'iid', 'cik', 'ticker', 'sic', 'naics']]

In [12]:
sp500_2021

Unnamed: 0,date,permno,comnam,ncusip,gvkey,iid,cik,ticker,sic,naics
131643,2021-12-31,83621.0,ANSYS INC,03662Q10,063080,01,0001013462,ANSS,7372,511210
131644,2021-12-31,53065.0,INTERPUBLIC GROUP COS INC,46069010,006136,01,0000051644,IPG,7311,541810
131645,2021-12-31,80681.0,ESSEX PROPERTY TRUST INC,29717810,030293,01,0000920522,ESS,6798,531110
131646,2021-12-31,89071.0,ACCENTURE PLC IRELAND,G1151C10,143357,01,0001467373,ACN,8742,541611
131647,2021-12-31,73139.0,STRYKER CORP,86366710,010115,01,0000310764,SYK,3842,339113
...,...,...,...,...,...,...,...,...,...,...
132138,2021-12-31,93096.0,DOLLAR GENERAL CORP NEW,25667710,004016,02,0000029534,DG,5331,452319
132139,2021-12-31,19502.0,WALGREENS BOOTS ALLIANCE INC,93142710,011264,01,0001618921,WBA,5912,446110
132140,2021-12-31,69796.0,CONSTELLATION BRANDS INC,21036P10,002710,02,0000016918,STZ,2082,312120
132141,2021-12-31,62092.0,THERMO FISHER SCIENTIFIC INC,88355610,010530,01,0000097745,TMO,3826,334516


In [13]:
len(sp500_2021)

500

In [15]:
# get s&p 500 cik cilk numbers
sp500_cik = sp500_2020.cik

In [19]:
len(sp500_cik)

501

In [20]:
for i in sp500_cik:
    print(i)

0001297996
0000895421
0000320187
0000009389
0001037038
0001123360
0000059558
0001596783
0000352541
0001534701
0000815556
0001555280
0000821189
0000728535
0000020286
0000075362
0000910606
0001915657
0000310764
0001012100
0000103379
0001604778
0001045810
0001306830
0000915913
0001166691
0001108524
0000021344
0000783280
0000106640
0000879169
0000823768
0000920148
0001551182
0000319201
0000105770
0001336920
0001707925
0001492633
0000866787
0001053507
0000106535
0000051434
0000091142
0001378946
0001757898
0000745732
0000058492
0001043604
0000217346
0001489393
0000031462
0001116132
0000943819
0000096021
0001786842
0000029905
0000723254
0001091667
0000093556
0001358071
0000859737
0001013462
0000820313
0000060086
0000027904
0001701605
0001564708
0000066740
0001267238
0000091419
0000813828
0000006201
0000039911
0000091440
0000059478
0001442145
0001286681
0001090872
0001300514
0000036270
0001000228
0001067701
0000049826
0000872589
0000040704
0000702165
0000079879
0001043277
0000002488
0000354950

In [21]:
from sec_edgar_downloader import Downloader

In [22]:
dl = Downloader("data")


In [23]:
dl.get("10-Q", "AAPL", after="2020-01-01", before="2022-01-01")



6

In [24]:
from bs4 import BeautifulSoup

In [27]:
with open("data/sec-edgar-filings/AAPL/10-Q/0000320193-20-000010/filing-details.html") as fp:
    soup = BeautifulSoup(fp, "html.parser")

In [31]:
print(soup.get_text())




Document


27000000002400000000P1Y0false--09-26Q1202000003201930.000010.000011260000000012600000000444323600043849590004443236000438495900013600000020000000000.04650.0050.003502900000000P1YP1YP1Y

0000320193


2019-09-29
2019-12-28



0000320193

aapl:A0.000Notesdue2025Member



2019-09-29
2019-12-28



0000320193

aapl:A0.875NotesDue2025Member



2019-09-29
2019-12-28



0000320193

aapl:A1.375NotesDue2024Member



2019-09-29
2019-12-28



0000320193

us-gaap:CommonStockMember



2019-09-29
2019-12-28



0000320193

aapl:A3.050NotesDue2029Member



2019-09-29
2019-12-28



0000320193

aapl:A2.000NotesDue2027Member



2019-09-29
2019-12-28



0000320193

aapl:A1.625NotesDue2026Member



2019-09-29
2019-12-28



0000320193

aapl:A1.375NotesDue2029Member



2019-09-29
2019-12-28



0000320193

aapl:A3.600NotesDue2042Member



2019-09-29
2019-12-28



0000320193

aapl:A0.500Notesdue2031Member



2019-09-29
2019-12-28



0000320193

aapl:A1.000NotesDue2022Member



2019-09-29
2019-12-28

In [43]:
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [44]:
# removing the stop words from the 10 Q corpus
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/dennisfenchenko/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [50]:
ten_q_apple_corpus = soup.get_text()

ten_q_apple_corpus_tokens = word_tokenize(ten_q_apple_corpus)

stop_words = set(stopwords.words('english'))

ten_q_apple_filtered = [word for word in ten_q_apple_corpus_tokens if not
word.lower() in stop_words]

In [53]:
print("The length of the original 10Q apple corpus is " + str(len
(ten_q_apple_corpus)))
print("The length of the filtered 10Q apple corpus is " + str(len
(ten_q_apple_filtered)))

The length of the original 10Q apple corpus is 125649
The length of the filtered 10Q apple corpus is 14853
