In [None]:
"""
Annual 10-K filings for Q4 2019 are not in the Google Cloud Platform database

Downloaded zip file from SEC website containing Q4 2019 10-K filings

there are 4 txt files and notes provided by SEC regarding file contents and the data
relationships to join the files

2 of the files NUM and SUB are needed  

the column names will need to be adjusted to be consistent with the GCP db field naming
"""

In [None]:
import datetime as dt
import pickle

import numpy as np
import pandas as pd

In [2]:
# Load the NUM table: numeric data, one row per data point in the
# financial statements. The file is tab-separated and `ddate` is parsed
# into datetimes on read.
NUM_q4 = pd.read_csv(
    '../data/2019q4/num.txt',
    sep='\t',
    parse_dates=['ddate'],
)

In [3]:
# Dimensions (rows, columns) of the loaded NUM table.
NUM_q4.shape

(2402955, 9)

In [4]:
# Preview the first rows of NUM as loaded from the file.
NUM_q4.head()

Unnamed: 0,adsh,tag,version,coreg,ddate,qtrs,uom,value,footnote
0,0001558370-19-008908,DerivativeNotionalAmount,invest/2013,FerrellgasLP,2018-05-31,0,USD,100000000.0,
1,0001558370-19-008908,DerivativeNotionalAmount,invest/2013,FerrellgasLP,2017-07-31,0,USD,175000000.0,
2,0001558370-19-008908,DerivativeNotionalAmount,invest/2013,FerrellgasLP,2012-05-31,0,USD,140000000.0,
3,0001558370-19-008908,DerivativeNotionalAmount,invest/2013,,2018-05-31,0,USD,100000000.0,
4,0001558370-19-008908,DerivativeNotionalAmount,invest/2013,,2018-04-30,0,USD,100000000.0,


In [5]:
# Rename the SEC NUM fields to the labels used in the GCP database.
# (A detailed explanation of each field is provided by the SEC.)
#
# The rename is keyed by the original column name rather than assigning a
# positional list to `.columns`, so a change in the source file's column
# order or count cannot silently attach the wrong label to a column.
# Columns not listed here (adsh, version, coreg, value, footnote) keep
# their names.
NUM_q4 = NUM_q4.rename(columns={
    'tag': 'measure_tag',
    'ddate': 'period_end_date',
    'qtrs': 'number_of_quarters',
    'uom': 'units',
})

In [6]:
# Preview NUM again to check the column labels after the rename step.
NUM_q4.head()

Unnamed: 0,adsh,measure_tag,version,coreg,period_end_date,number_of_quarters,units,value,footnote
0,0001558370-19-008908,DerivativeNotionalAmount,invest/2013,FerrellgasLP,2018-05-31,0,USD,100000000.0,
1,0001558370-19-008908,DerivativeNotionalAmount,invest/2013,FerrellgasLP,2017-07-31,0,USD,175000000.0,
2,0001558370-19-008908,DerivativeNotionalAmount,invest/2013,FerrellgasLP,2012-05-31,0,USD,140000000.0,
3,0001558370-19-008908,DerivativeNotionalAmount,invest/2013,,2018-05-31,0,USD,100000000.0,
4,0001558370-19-008908,DerivativeNotionalAmount,invest/2013,,2018-04-30,0,USD,100000000.0,


In [7]:
# Column dtypes and memory usage of the NUM table.
NUM_q4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2402955 entries, 0 to 2402954
Data columns (total 9 columns):
 #   Column              Dtype         
---  ------              -----         
 0   adsh                object        
 1   measure_tag         object        
 2   version             object        
 3   coreg               object        
 4   period_end_date     datetime64[ns]
 5   number_of_quarters  int64         
 6   units               object        
 7   value               float64       
 8   footnote            object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(6)
memory usage: 165.0+ MB


In [8]:
# Load the SUB (submissions) table: summary information about an entire
# EDGAR submission. The file is tab-separated; `period` is parsed into
# datetimes and the listed code/identifier columns are read as strings.
SUB_q4 = pd.read_csv(
    '../data/2019q4/sub.txt',
    sep='\t',
    parse_dates=['period'],
    dtype=dict.fromkeys(['sic', 'ein', 'cik', 'fy', 'fp'], 'str'),
)

In [9]:
# Dimensions (rows, columns) of the loaded SUB table.
SUB_q4.shape

(12664, 36)

In [10]:
# Preview the first rows of SUB.
SUB_q4.head()

Unnamed: 0,adsh,cik,name,sic,countryba,stprba,cityba,zipba,bas1,bas2,...,period,fy,fp,filed,accepted,prevrpt,detail,instance,nciks,aciks
0,0000018498-19-000043,18498,GENESCO INC,5661,US,TN,NASHVILLE,37217,GENESCO PK 1415 MURFREESBORO RD,,...,2019-09-30,,,20191001,2019-10-01 16:24:00.0,0,0,a8-krepurchase100119_htm.xml,1,
1,0000032604-19-000040,32604,EMERSON ELECTRIC CO,3600,US,MO,ST LOUIS,63136,8000 W FLORISSANT AVE,P O BOX 4100,...,2019-09-30,,,20191001,2019-10-01 06:58:00.0,0,0,form8-kreviewannouncem_htm.xml,1,
2,0000034088-19-000051,34088,EXXON MOBIL CORP,2911,US,TX,IRVING,75039-2298,5959 LAS COLINAS BLVD,,...,2019-09-30,,,20191001,2019-10-01 17:16:00.0,0,0,r8k100119_htm.xml,1,
3,0000037996-19-000071,37996,FORD MOTOR CO,3711,US,MI,DEARBORN,48126,ONE AMERICAN ROAD,,...,2019-09-30,,,20191001,2019-10-01 07:05:00.0,1,0,a8-kdated9x30x2019_htm.xml,1,
4,0000063754-19-000166,63754,MCCORMICK & CO INC,2090,US,MD,HUNT VALLEY,21031,24 SCHILLING ROAD,SUITE 1,...,2019-09-30,,,20191001,2019-10-01 07:22:00.0,0,0,mkc-8xkx08x31x2019_htm.xml,1,


In [11]:
# List the distinct form types present in this file.
# Plan: only the 10-K and 10-K/A forms are to be kept; this cell only
# inspects the values and does not filter anything.
SUB_q4['form'].unique()

array(['8-K', '10-Q', '10-Q/A', '10-K', '20-F', 'S-1', '10-K/A', 'S-1/A',
       '8-K/A', '6-K', 'S-4', 'S-4/A', '6-K/A', 'POS AM', 'F-1', '40-F/A',
       '20-F/A', '10-KT', '40-F', 'S-11/A', '8-K12B'], dtype=object)

In [12]:
# List the distinct country codes found in `countryba`.
# Plan: non-US companies will be dropped; this cell only inspects the
# values and does not filter anything.
SUB_q4['countryba'].unique()

array(['US', 'CA', 'CN', 'BM', 'GB', 'IL', 'BE', 'MO', 'KY', 'SK', 'AU',
       'IE', 'ZA', 'GR', 'SG', 'BR', 'NL', 'CH', 'JP', 'EC', 'PH', 'MY',
       'HK', 'TW', 'CY', 'GE', 'RS', 'CZ', 'UA', 'DE', 'PR', 'LU', 'BG',
       nan, 'FR', 'AR', 'JE', 'TH', 'SE', 'VI', 'VG', 'MD', 'CO', 'GU',
       'ID', 'PT', 'BS', 'AE', 'EE', 'DK', 'MX', 'AI', 'MC', 'MT', 'KR',
       'KH', 'TR', 'IN', 'IT', 'KG', 'DO'], dtype=object)

In [13]:
# Full list of SUB column names.
SUB_q4.columns

Index(['adsh', 'cik', 'name', 'sic', 'countryba', 'stprba', 'cityba', 'zipba',
       'bas1', 'bas2', 'baph', 'countryma', 'stprma', 'cityma', 'zipma',
       'mas1', 'mas2', 'countryinc', 'stprinc', 'ein', 'former', 'changed',
       'afs', 'wksi', 'fye', 'form', 'period', 'fy', 'fp', 'filed', 'accepted',
       'prevrpt', 'detail', 'instance', 'nciks', 'aciks'],
      dtype='object')

In [14]:
# Non-null counts and dtypes for each SUB column.
SUB_q4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12664 entries, 0 to 12663
Data columns (total 36 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   adsh        12664 non-null  object        
 1   cik         12664 non-null  object        
 2   name        12664 non-null  object        
 3   sic         12662 non-null  object        
 4   countryba   12649 non-null  object        
 5   stprba      11842 non-null  object        
 6   cityba      12649 non-null  object        
 7   zipba       12642 non-null  object        
 8   bas1        12649 non-null  object        
 9   bas2        5093 non-null   object        
 10  baph        12652 non-null  object        
 11  countryma   12604 non-null  object        
 12  stprma      11847 non-null  object        
 13  cityma      12604 non-null  object        
 14  zipma       12593 non-null  object        
 15  mas1        12600 non-null  object        
 16  mas2        5018 non-n

In [15]:
# Merge NUM and SUB on `adsh`, the join key given in the SEC notes.
# Filtering (mirroring how the GCP queries were filtered) is meant to
# happen after this merge.
#
# how='left' keeps every NUM row. validate='many_to_one' makes pandas
# raise a MergeError if `adsh` is not unique in SUB, which would otherwise
# silently multiply NUM rows in the merged frame.
all_q4 = pd.merge(
    NUM_q4,
    SUB_q4,
    how='left',
    on='adsh',
    validate='many_to_one',
)

In [16]:
# Dimensions (rows, columns) of the merged frame; with a left merge the
# row count equals NUM's as long as `adsh` is unique in SUB.
all_q4.shape

(2402955, 44)

In [None]:
#all_q4.head()

In [None]:
#all_q4.tail()

In [17]:
# Pickle out the wide merged Q4 2019 frame (NUM joined with SUB) to
# 'Q4_2019.pickle' in the current working directory.
# The `with` block closes the file handle even if the dump fails.
with open('Q4_2019.pickle', 'wb') as to_write:
    pickle.dump(all_q4, to_write)