In [None]:
"""
Annual 10-K filings for Q4 2019 are not in the Google Cloud Platform database

Text files of data available by quarter from SEC website

Many companies sent 2019 reports in Q1 2020 

Prepare the Q1 2020 data and extract the reports with a 2019 period_end_date
"""

In [1]:
import pandas as pd
import numpy as np
import datetime as dt

In [2]:
import pandas.io.sql as pd_sql

from pandasql import sqldf
def pysqldf(q):
    """Run the SQL query `q` with pandasql against this notebook's globals.

    PandaSQL needs to be able to reference the global variables already
    defined, so the module-level namespace is passed as the environment in
    which table names are resolved.

    Parameters
    ----------
    q : str
        SQL query text.

    Returns
    -------
    Whatever ``sqldf`` returns for the query.
    """
    return sqldf(q, globals())

In [3]:
import psycopg2 as pg

# Connection parameters for the Postgres database
connection_args = dict(host='localhost', dbname='bankruptcy', port=5432)

# Open the connection and switch it to autocommit mode
connection = pg.connect(**connection_args)
connection.autocommit = True

In [4]:
# The NUM data set contains numeric data, one row per data point in the
# financial statements. The file is tab-separated; 'ddate' is parsed as a date.
NUM_q = pd.read_csv(
    '../data/2020q1/num.txt',
    sep='\t',
    parse_dates=['ddate'],
)

In [5]:
# Dimensions (rows, columns) of the loaded NUM table
NUM_q.shape

(3005393, 9)

In [6]:
# Preview the first rows of the NUM table
NUM_q.head()

Unnamed: 0,adsh,tag,version,coreg,ddate,qtrs,uom,value,footnote
0,0000028823-20-000056,DecreaseInUnrecognizedTaxBenefitsIsReasonablyP...,us-gaap/2018,,2019-12-31,0,USD,7000000.0,
1,0000030697-20-000002,DecreaseInUnrecognizedTaxBenefitsIsReasonablyP...,us-gaap/2018,,2019-12-31,0,USD,1661000.0,
2,0000931148-20-000024,DecreaseInUnrecognizedTaxBenefitsIsReasonablyP...,us-gaap/2018,,2019-12-31,0,USD,200000.0,
3,0001281761-20-000010,DecreaseInUnrecognizedTaxBenefitsIsReasonablyP...,us-gaap/2018,,2019-12-31,0,USD,28000000.0,
4,0000055785-20-000016,DecreaseInUnrecognizedTaxBenefitsIsReasonablyP...,us-gaap/2018,,2019-12-31,0,USD,180000000.0,


In [7]:
# A detailed explanation of each field is provided by the SEC.
# Update the field names to match the GCP labels. The rename is done by name
# (not by assigning a positional list) so each label stays attached to the
# right column even if the column order of the input file changes.
NUM_q = NUM_q.rename(columns={
    'tag': 'measure_tag',
    'ddate': 'period_end_date',
    'qtrs': 'number_of_quarters',
    'uom': 'units',
})

In [8]:
# Preview the first rows again to check the column labels
NUM_q.head()

Unnamed: 0,adsh,measure_tag,version,coreg,period_end_date,number_of_quarters,units,value,footnote
0,0000028823-20-000056,DecreaseInUnrecognizedTaxBenefitsIsReasonablyP...,us-gaap/2018,,2019-12-31,0,USD,7000000.0,
1,0000030697-20-000002,DecreaseInUnrecognizedTaxBenefitsIsReasonablyP...,us-gaap/2018,,2019-12-31,0,USD,1661000.0,
2,0000931148-20-000024,DecreaseInUnrecognizedTaxBenefitsIsReasonablyP...,us-gaap/2018,,2019-12-31,0,USD,200000.0,
3,0001281761-20-000010,DecreaseInUnrecognizedTaxBenefitsIsReasonablyP...,us-gaap/2018,,2019-12-31,0,USD,28000000.0,
4,0000055785-20-000016,DecreaseInUnrecognizedTaxBenefitsIsReasonablyP...,us-gaap/2018,,2019-12-31,0,USD,180000000.0,


In [9]:
# Summary of the NUM table: column dtypes and memory usage
NUM_q.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3005393 entries, 0 to 3005392
Data columns (total 9 columns):
 #   Column              Dtype         
---  ------              -----         
 0   adsh                object        
 1   measure_tag         object        
 2   version             object        
 3   coreg               object        
 4   period_end_date     datetime64[ns]
 5   number_of_quarters  int64         
 6   units               object        
 7   value               float64       
 8   footnote            object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(6)
memory usage: 206.4+ MB


In [10]:
# The SUB (submissions) data set contains summary information about an entire
# EDGAR submission. The file is tab-separated; 'period' is parsed as a date and
# sic, ein, cik, fy and fp are read as strings.
SUB_q = pd.read_csv(
    '../data/2020q1/sub.txt',
    sep='\t',
    parse_dates=['period'],
    dtype=dict.fromkeys(['sic', 'ein', 'cik', 'fy', 'fp'], 'str'),
)

In [11]:
# Dimensions (rows, columns) of the loaded SUB table
SUB_q.shape

(13561, 36)

In [12]:
# Preview the first rows of the SUB table
SUB_q.head()

Unnamed: 0,adsh,cik,name,sic,countryba,stprba,cityba,zipba,bas1,bas2,...,period,fy,fp,filed,accepted,prevrpt,detail,instance,nciks,aciks
0,0000004977-20-000002,4977,AFLAC INC,6321,US,GA,COLUMBUS,31999,1932 WYNNTON RD,,...,2019-12-31,,,20200102,2020-01-02 07:45:00.0,0,0,afl8-kxdecember2019xot_htm.xml,1,
1,0000018498-20-000002,18498,GENESCO INC,5661,US,TN,NASHVILLE,37217,GENESCO PK 1415 MURFREESBORO RD,,...,2019-12-31,,,20200102,2020-01-02 17:05:00.0,0,0,a8-ktogast010220_htm.xml,1,
2,0000052795-20-000004,52795,ANIXTER INTERNATIONAL INC,5063,US,IL,GLENVIEW,60026,2301 PATRIOT BLVD,,...,2019-12-31,,,20200102,2020-01-02 06:38:00.0,1,0,a8-kxsecondamendedmerg_htm.xml,1,
3,0000082811-20-000002,82811,REGAL BELOIT CORP,3621,US,WI,BELOIT,53511,200 STATE ST,,...,2019-12-31,,,20200102,2020-01-02 09:09:00.0,0,0,a8-kdec272019cover_htm.xml,1,
4,0000091767-20-000003,91767,SONOCO PRODUCTS CO,2650,US,SC,HARTSVILLE,29551-0160,ONE NORTH SECOND ST,P O BOX 160,...,2019-12-31,,,20200102,2020-01-02 11:55:00.0,1,0,son-20191231_htm.xml,1,


In [None]:
# List the distinct form types included in this file.
# Only the 10-K and 10-K/A forms will be kept.

SUB_q['form'].unique()

In [None]:
# List the distinct countryba values to see which companies are US-located.
# Non-US companies will be dropped.

SUB_q['countryba'].unique()

In [None]:
# List every column label in the SUB table
SUB_q.columns

In [None]:
# Summary of the SUB table's columns and dtypes
SUB_q.info()

In [13]:
# Merge the NUM and SUB files on adsh (per the SEC's details on how to join
# these datasets), then filter the way the GCP queries were filtered.
# how='left' keeps every NUM row. validate='many_to_one' makes pandas raise a
# MergeError if adsh is duplicated in SUB_q, which would otherwise silently
# multiply NUM rows in the merged frame.

all_q = pd.merge(NUM_q, SUB_q, how='left', on=['adsh'], validate='many_to_one')

In [14]:
# Dimensions of the merged table; with a left merge the row count is expected
# to match NUM_q's as long as adsh is unique in SUB_q
all_q.shape

(3005393, 44)

In [None]:
#all_q.head()

In [None]:
#all_q.tail()

In [None]:
#all_q.columns

In [None]:
import pickle

# Serialize the merged frame to disk so it can be reloaded without repeating
# the load-and-merge steps
with open('Q1_2020.pickle', 'wb') as pickle_file:
    pickle.dump(all_q, pickle_file)