In [None]:
"""
Annual 10-K filings for Q4 2019 are not in the Google Cloud Platform database

Text files of data available by quarter from SEC website

A few companies filed their 2019 reports in Q3 2020

prepare Q3 2020 data and extract the 2019 period_end_date reports
"""

In [1]:
import datetime as dt
import pickle

import numpy as np
import pandas as pd

In [2]:
# NUM holds the numeric facts: one row for each data point in the financial statements.
# The file is tab-separated; ddate is parsed into a datetime column while loading.
NUM_q = pd.read_csv(
    '../data/2020q3/num.txt',
    sep='\t',
    parse_dates=['ddate'],
)

In [3]:
# Row and column counts of the NUM frame as loaded.
NUM_q.shape

(2351640, 9)

In [4]:
# Preview the first rows of NUM with the column names as read from num.txt.
NUM_q.head()

Unnamed: 0,adsh,tag,version,coreg,ddate,qtrs,uom,value,footnote
0,0001661920-20-000055,EntityCommonStockSharesOutstanding,dei/2018,,2020-06-30,0,shares,24944890.0,
1,0001469709-20-000039,EntityCommonStockSharesOutstanding,dei/2018,,2020-06-30,0,shares,3109520000.0,
2,0001437557-20-000069,EntityCommonStockSharesOutstanding,dei/2018,,2020-06-30,0,shares,95122430.0,
3,0001628280-20-010192,EntityCommonStockSharesOutstanding,dei/2018,,2020-06-30,0,shares,103292500.0,
4,0001469709-20-000042,EntityCommonStockSharesOutstanding,dei/2018,,2020-06-30,0,shares,2901311.0,


In [5]:
# The SEC provides a detailed explanation of each field in the NUM table.
# Rename the SEC field names to the labels used in GCP. The rename is keyed by
# column name rather than by position, so each label stays attached to the right
# column even if the file's column order or column count differs from what is expected.
# Columns not listed here (adsh, version, coreg, value, footnote) keep their names.

NUM_q = NUM_q.rename(columns={
    'tag': 'measure_tag',
    'ddate': 'period_end_date',
    'qtrs': 'number_of_quarters',
    'uom': 'units',
})

In [6]:
# Preview NUM again to confirm the relabelled columns.
NUM_q.head()

Unnamed: 0,adsh,measure_tag,version,coreg,period_end_date,number_of_quarters,units,value,footnote
0,0001661920-20-000055,EntityCommonStockSharesOutstanding,dei/2018,,2020-06-30,0,shares,24944890.0,
1,0001469709-20-000039,EntityCommonStockSharesOutstanding,dei/2018,,2020-06-30,0,shares,3109520000.0,
2,0001437557-20-000069,EntityCommonStockSharesOutstanding,dei/2018,,2020-06-30,0,shares,95122430.0,
3,0001628280-20-010192,EntityCommonStockSharesOutstanding,dei/2018,,2020-06-30,0,shares,103292500.0,
4,0001469709-20-000042,EntityCommonStockSharesOutstanding,dei/2018,,2020-06-30,0,shares,2901311.0,


In [7]:
# Summary of NUM: column names, dtypes and memory usage.
NUM_q.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2351640 entries, 0 to 2351639
Data columns (total 9 columns):
 #   Column              Dtype         
---  ------              -----         
 0   adsh                object        
 1   measure_tag         object        
 2   version             object        
 3   coreg               object        
 4   period_end_date     datetime64[ns]
 5   number_of_quarters  int64         
 6   units               object        
 7   value               float64       
 8   footnote            object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(6)
memory usage: 161.5+ MB


In [8]:
# SUB (submissions) carries summary information about an entire EDGAR submission.
# The file is tab-separated; period is parsed as a date, and sic, ein, cik, fy and fp
# are requested as strings.

SUB_q = pd.read_csv(
    '../data/2020q3/sub.txt',
    sep='\t',
    parse_dates=['period'],
    dtype=dict.fromkeys(['sic', 'ein', 'cik', 'fy', 'fp'], 'str'),
)

In [9]:
# Row and column counts of the SUB frame as loaded.
SUB_q.shape

(14832, 36)

In [10]:
# Preview the first rows of SUB.
SUB_q.head()

Unnamed: 0,adsh,cik,name,sic,countryba,stprba,cityba,zipba,bas1,bas2,...,period,fy,fp,filed,accepted,prevrpt,detail,instance,nciks,aciks
0,0000006955-20-000039,6955,ENERPAC TOOL GROUP CORP,3590,US,WI,MENOMONEE FALLS,53051,ATTN: BRYAN JOHNSON,N86 W12500 WESTBROOK CROSSING,...,2020-05-31,2020.0,Q3,20200701,2020-07-01 14:13:00.0,0,1,epac10q5312020_htm.xml,1,
1,0000014846-20-000022,14846,BRT APARTMENTS CORP.,6798,US,NY,GREAT NECK,11021-3190,60 CUTTER MILL RD,SUITE 303,...,2020-06-30,,,20200701,2020-07-01 16:12:00.0,0,0,brt-20200630_htm.xml,1,
2,0000016918-20-000171,16918,"CONSTELLATION BRANDS, INC.",2080,US,NY,VICTOR,14564,207 HIGH POINT DRIVE,BUILDING 100,...,2020-06-30,,,20200701,2020-07-01 08:12:00.0,0,0,stz-20200630_htm.xml,1,
3,0000016918-20-000173,16918,"CONSTELLATION BRANDS, INC.",2080,US,NY,VICTOR,14564,207 HIGH POINT DRIVE,BUILDING 100,...,2020-05-31,2021.0,Q1,20200701,2020-07-01 13:59:00.0,0,1,stz-20200531_htm.xml,1,
4,0000027879-20-000012,27879,DELMARVA POWER & LIGHT CO /DE/,4931,US,DE,WILMINGTON,19899,800 KING ST,PO BOX 231,...,2020-06-30,,,20200701,2020-07-01 15:33:00.0,0,0,exc-20200701_htm.xml,1,


In [None]:
# List the distinct form types present in SUB.
# Only 10-K and 10-K/A are meant to be kept; this cell only inspects, it does not filter.

SUB_q.form.unique()

In [None]:
# List the distinct countryba values in SUB to check which companies are US located
# (presumably countryba is the business-address country -- confirm against the SEC field definitions).
# Non-US companies are to be dropped; this cell only inspects, it does not filter.

SUB_q.countryba.unique()

In [None]:
#SUB_q.columns

In [None]:
#SUB_q.info()

In [11]:
# Attach the SUB record to every NUM row by joining on adsh, as the SEC describes for
# combining these data sets; filtering like the GCP queries happens after the merge.
# how='left' keeps every NUM row. validate='many_to_one' makes pandas raise a MergeError
# if adsh is ever duplicated in SUB, instead of silently multiplying NUM rows.

all_q = pd.merge(NUM_q, SUB_q, how='left', on=['adsh'], validate='many_to_one')

In [12]:
# Row and column counts of the merged NUM + SUB frame.
all_q.shape

(2351640, 44)

In [None]:
#all_q.head()

In [None]:
#all_q.tail()

In [13]:
# Persist the wide Q3 2020 frame (NUM joined with SUB) to Q3_2020.pickle
# in the current working directory.

with open('Q3_2020.pickle', 'wb') as pickle_file:
    pickle.dump(all_q, pickle_file)