In [None]:
"""
Annual 10-K filings for Q4 2019 are not in the Google Cloud Platform database

Text files of data available by quarter from SEC website

Many companies sent 2019 reports in Q2 2020 

prepare Q2 2020 data and extract the 2019 period_end_date reports
"""

In [1]:
import pandas as pd
import numpy as np
import datetime as dt

In [2]:
import pandas.io.sql as pd_sql

from pandasql import sqldf
# pandasql looks up the tables named in a query in the namespace it is given,
# so hand it this notebook's globals: every DataFrame defined in a cell is then
# queryable by its variable name.
def pysqldf(q):
    """Run the SQL string ``q`` through pandasql against the notebook globals."""
    return sqldf(q, globals())

In [3]:
import psycopg2 as pg

# Settings for the local Postgres database (no credentials are passed here).
connection_args = dict(
    host='localhost',
    dbname='bankruptcy',
    port=5432,
)

# Open the connection and switch it to autocommit.
connection = pg.connect(**connection_args)
connection.autocommit = True

In [4]:
# NUM data set: numeric data, one row per data point in the financial statements.
# The file is tab-separated; `ddate` is parsed into datetimes while reading.
NUM_q = pd.read_csv(
    '../data/2020q2/num.txt',
    sep='\t',
    parse_dates=['ddate'],
)

In [5]:
NUM_q.shape

(2197732, 9)

In [6]:
NUM_q.head()

Unnamed: 0,adsh,tag,version,coreg,ddate,qtrs,uom,value,footnote
0,0001108827-20-000044,DeferredCompensationLiabilityClassifiedNoncurrent,us-gaap/2018,,2019-12-31,0,USD,26800000.0,
1,0001108827-20-000044,DeferredCompensationLiabilityClassifiedNoncurrent,us-gaap/2018,,2020-03-31,0,USD,21000000.0,
2,0001418135-20-000018,DeferredCompensationLiabilityClassifiedNoncurrent,us-gaap/2018,,2020-03-31,0,USD,31000000.0,
3,0001418135-20-000018,DeferredCompensationLiabilityClassifiedNoncurrent,us-gaap/2018,,2019-12-31,0,USD,40000000.0,
4,0001628280-20-006067,DeferredCompensationLiabilityClassifiedNoncurrent,us-gaap/2018,,2020-03-31,0,USD,2830000.0,


In [7]:
# A detailed explanation of each field is provided by the SEC.
# Relabel the columns, by position, so they match the GCP labels.
gcp_num_labels = [
    'adsh',
    'measure_tag',         # was: tag
    'version',
    'coreg',
    'period_end_date',     # was: ddate
    'number_of_quarters',  # was: qtrs
    'units',               # was: uom
    'value',
    'footnote',
]
NUM_q.columns = gcp_num_labels

In [8]:
NUM_q.head()

Unnamed: 0,adsh,measure_tag,version,coreg,period_end_date,number_of_quarters,units,value,footnote
0,0001108827-20-000044,DeferredCompensationLiabilityClassifiedNoncurrent,us-gaap/2018,,2019-12-31,0,USD,26800000.0,
1,0001108827-20-000044,DeferredCompensationLiabilityClassifiedNoncurrent,us-gaap/2018,,2020-03-31,0,USD,21000000.0,
2,0001418135-20-000018,DeferredCompensationLiabilityClassifiedNoncurrent,us-gaap/2018,,2020-03-31,0,USD,31000000.0,
3,0001418135-20-000018,DeferredCompensationLiabilityClassifiedNoncurrent,us-gaap/2018,,2019-12-31,0,USD,40000000.0,
4,0001628280-20-006067,DeferredCompensationLiabilityClassifiedNoncurrent,us-gaap/2018,,2020-03-31,0,USD,2830000.0,


In [9]:
NUM_q.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2197732 entries, 0 to 2197731
Data columns (total 9 columns):
 #   Column              Dtype         
---  ------              -----         
 0   adsh                object        
 1   measure_tag         object        
 2   version             object        
 3   coreg               object        
 4   period_end_date     datetime64[ns]
 5   number_of_quarters  int64         
 6   units               object        
 7   value               float64       
 8   footnote            object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(6)
memory usage: 150.9+ MB


In [10]:
# The SUB (submissions) data set contains summary information about an entire EDGAR submission.
# sic, ein, cik, fy and fp are read as strings; `period` is parsed as a date.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk. The chunked read emitted a warning the last time
# this cell ran (presumably a mixed-type DtypeWarning — confirm on re-run).

SUB_q = pd.read_csv('../data/2020q2/sub.txt', delimiter='\t',
                     parse_dates=['period'],
                     dtype={'sic':  'str', 'ein':'str', 'cik':'str', 'fy':'str', 'fp':'str'},
                     low_memory=False)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [11]:
SUB_q.shape

(16411, 36)

In [12]:
SUB_q.head()

Unnamed: 0,adsh,cik,name,sic,countryba,stprba,cityba,zipba,bas1,bas2,...,period,fy,fp,filed,accepted,prevrpt,detail,instance,nciks,aciks
0,0000006201-20-000051,6201,AMERICAN AIRLINES GROUP INC.,4512,US,TX,FORT WORTH,76155,1 SKYVIEW DRIVE,,...,2020-03-31,,,20200401,2020-04-01 16:15:00.0,0,0,a8k040120drawdownofrev_htm.xml,2,4515.0
1,0000018498-20-000016,18498,GENESCO INC,5661,US,TN,NASHVILLE,37217,GENESCO PK 1415 MURFREESBORO RD,,...,2020-01-31,2019.0,FY,20200401,2020-04-01 15:29:00.0,0,1,gcofy202010-kq4_htm.xml,1,
2,0000020286-20-000029,20286,CINCINNATI FINANCIAL CORP,6331,US,OH,FAIRFIELD,45014,6200 S GILMORE RD,,...,2020-03-31,,,20200401,2020-04-01 09:54:00.0,0,0,locationchange8-k04x01_htm.xml,1,
3,0000049196-20-000017,49196,HUNTINGTON BANCSHARES INC/MD,6021,US,OH,COLUMBUS,43287,HUNTINGTON CTR,41 S HIGH ST HC0917,...,2020-03-31,,,20200401,2020-04-01 16:53:00.0,0,0,hban-2020x04x01x8k_htm.xml,1,
4,0000074145-20-000014,74145,OKLAHOMA GAS & ELECTRIC CO,4911,US,OK,OKLAHOMA CITY,73101-0321,321 NORTH HARVEY,PO BOX 321,...,2020-03-31,,,20200401,2020-04-01 14:34:00.0,0,0,oge-20200401_htm.xml,1,


In [13]:
# check forms included in this file
# only keep the 10-K and 10-K/A

SUB_q.form.unique()

array(['8-K', '10-K', '10-Q', '10-K/A', '20-F', 'F-4', '8-K12B', 'POS AM',
       '8-K/A', 'S-1/A', '40-F/A', '20-F/A', '10-KT', '10-Q/A', 'S-4/A',
       '6-K', 'S-1', '40-F', 'F-1', 'POS EX', 'S-3/A', 'S-4', '6-K/A',
       'F-1/A', '10-KT/A', '10-12G', '8-K12B/A', 'S-11/A', '424B3'],
      dtype=object)

In [14]:
# check all US located companies
# will drop non-US companies

SUB_q.countryba.unique()

array(['US', 'CA', 'GR', 'CO', 'BM', 'IL', 'DE', 'GB', 'MC', 'CL', 'KY',
       'FR', 'RS', 'IT', 'CN', 'SG', 'TR', 'MH', 'NL', 'IE', 'BR', 'DK',
       'CY', 'LU', 'ZA', nan, 'MY', 'AN', 'PA', 'MX', 'CH', 'DO', 'HK',
       'PE', 'AR', 'TW', 'JP', 'PH', 'SK', 'GE', 'IN', 'CZ', 'PR', 'RU',
       'ID', 'AU', 'KR', 'AI', 'UY', 'BE', 'JO', 'NO', 'JE', 'TH', 'PT',
       'EE', 'AE', 'VG', 'NZ', 'VI', 'SE', 'GU', 'BS', 'UA', 'MT', 'BG'],
      dtype=object)

In [None]:
#SUB_q.columns

In [None]:
#SUB_q.info()

In [15]:
# Attach the submission record to every numeric data point. `adsh` is the key the
# SEC describes for combining the two data sets; the left join keeps all NUM rows.
# The same filters used in the GCP queries are applied in the cells that follow.

all_q = NUM_q.merge(SUB_q, how='left', on='adsh')

In [16]:
all_q.shape

(2197732, 44)

In [None]:
#all_q.head()

In [None]:
#all_q.tail()

In [None]:
#all_q.columns

In [18]:
import pickle

# Persist the merged NUM + SUB frame.
# NOTE(review): a later cell writes 'q2_2020.pickle'. On a case-insensitive
# filesystem both names resolve to the same file, so that later write would
# replace this one — confirm the two outputs are meant to be distinct.
with open('Q2_2020.pickle', 'wb') as pickle_file:
    pickle.dump(all_q, pickle_file)

In [None]:
# apply filters to keep only 10-K reports from US companies
# Step 1: business-address country is US.
# The merged NUM + SUB frame built in this notebook is `all_q`; `all_q1` is never
# defined here, so referencing it fails on a fresh kernel.
f1_all_q1 = all_q[all_q['countryba'] == 'US']
f1_all_q1.shape

In [None]:
# Step 2: keep annual reports only — form 10-K and its amendment 10-K/A.
annual_forms = ['10-K', '10-K/A']
is_annual_form = f1_all_q1['form'].isin(annual_forms)
f2_all_q1 = f1_all_q1[is_annual_form]
f2_all_q1.shape

In [None]:
# fp is Fiscal Period Focus.  The 10-K is to have a fiscal period focus of FY
f2_all_q1.fp.unique()

In [None]:
# Keep only the columns that have a counterpart among the GCP fields
# (they are renamed to the GCP labels in a later cell).
# NOTE(review): NUM's own `period_end_date` (ddate) is not carried forward, so
# values reported for different dates inside one submission are no longer
# distinguishable after this step — confirm that is intended.
gcp_source_cols = [
    'name', 'period', 'number_of_quarters', 'adsh', 'cik', 'ein',
    'sic', 'fy', 'form', 'filed', 'measure_tag', 'value', 'units',
]
f3_all_q1 = f2_all_q1[gcp_source_cols]
f3_all_q1.shape

In [None]:
# Keep submissions whose `period` falls after 2018-12-31.
period_cutoff = dt.datetime(2018, 12, 31)
after_cutoff = f3_all_q1['period'] > period_cutoff
f4_all_q1 = f3_all_q1[after_cutoff]

f4_all_q1.head()


In [None]:
f4_all_q1.shape

In [None]:
max(f4_all_q1.period)

In [None]:
f4_all_q1.period.value_counts()

In [None]:
## This looks good, companies > $75 MM would be required to file their annual report
## for FY ending 12/31/2019 within 90 days - by 3/31/2020

In [None]:
# Relabel the columns, by position, so they match the GCP labeling.
# The order must follow the column selection made when f3_all_q1 was built.
gcp_labels = [
    'company_name',        # was: name
    'period_end_date',     # was: period
    'number_of_quarters',
    'submission_number',   # was: adsh
    'central_index_key',   # was: cik
    'ein',
    'sic',
    'fiscal_year',         # was: fy
    'form',
    'date_filed',          # was: filed
    'measure_tag',
    'value',
    'units',
]
f4_all_q1.columns = gcp_labels

In [None]:
f4_all_q1.head()

In [None]:
f4_all_q1.company_name.nunique()

In [None]:
# Restrict the rows to the measure tags that were queried and downloaded from GCP.
# The query runs through pandasql against the notebook globals, so `f4_all_q1`
# in the FROM clause refers to the DataFrame of that name.
# NOTE(review): the result is a new frame returned by sqldf; column dtypes may not
# match f4_all_q1 (a later cell converts period_end_date "back to a datetime").

query = """ 
SELECT *
FROM f4_all_q1 as a 
where measure_tag in ('Assets',
       'AssetsCurrent', 'CashAndCashEquivalentsAtCarryingValue',
       'CashAndCashEquivalentsPeriodIncreaseDecrease', 'CommonStockValue',
       'CostsAndExpenses', 'CurrentAsset', 'DeferredIncomeTaxExpenseBenefit',
       'DeferredIncomeTaxLiabilities', 'DeferredIncomeTaxLiabilitiesNet',
       'DeferredIncomeTaxesAndTaxCredits', 'Depreciation',
       'EarningsPerShareBasic', 'EarningsPerShareDiluted', 'Goodwill',
       'GrossProfit', 'IncreaseDecreaseInInventories', 'InterestExpense',
       'InventoryNet', 'Liabilities', 'LiabilitiesAndStockholdersEquity',
       'LiabilitiesCurrent', 'LongTermDebt', 'LongTermDebtCurrent',
       'LongTermDebtMaturitiesRepaymentsOfPrincipalInNextTwelveMonths',
       'LongTermDebtNoncurrent', 'NetCashProvidedByUsedInFinancingActivities',
       'NetCashProvidedByUsedInInvestingActivities',
       'NetCashProvidedByUsedInOperatingActivities', 'NetIncomeLoss',
       'OperatingExpenses', 'OperatingIncomeLoss', 'ProfitLoss',
       'RepaymentsOfLongTermDebt', 'Revenues', 'SalesRevenueGoodsNet',
       'SalesRevenueNet', 'StockholdersEquity', 'TotalAsset',
       'WorkingCapital')
"""

f5_all_q1 = sqldf(query, globals())
f5_all_q1.head()



In [None]:
f5_all_q1.shape

In [None]:
# check all units in USD
f5_all_q1.units.unique()

In [None]:
# Keep only values reported in USD. EDA found that some non-US companies also file
# the US form (nothing prevents them from doing so), so other units can appear.

usd_only = f5_all_q1['units'] == 'USD'
f6_all_q1 = f5_all_q1[usd_only]
f6_all_q1.shape

In [None]:
# The frame is tall: one row per measurement.
# Pivot to wide: one row per submission key, one column per measure_tag.
# NOTE(review): no aggfunc is passed, so pivot_table's default aggregation applies
# to rows sharing the same key and tag; rows whose key columns contain NaN
# (e.g. a missing ein) are presumably dropped — confirm both are acceptable.
sec_key_cols = ['company_name', 'period_end_date', 'number_of_quarters',
                'submission_number', 'central_index_key',
                'ein', 'sic', 'fiscal_year', 'form',
                'date_filed']

wide_sec = (
    f6_all_q1
    .pivot_table(index=sec_key_cols, columns='measure_tag', values='value')
    .reset_index()
)
wide_sec.head()


In [None]:
wide_sec.shape

In [None]:
wide_sec.company_name.nunique()

In [None]:
#### Combine these Q2 2020 submissions with the Q1 2020 submissions

# pickle the wide_sec df
import pickle

with open('q2_2020.pickle', 'wb') as pickle_file:
    pickle.dump(wide_sec, pickle_file)

In [None]:
## LOOKING UP 2020 bankruptcies in 2019 sec data

In [None]:
# Look the company up in the merged NUM + SUB frame. That frame is named `all_q`
# in this notebook; `all_q1` is never defined here.
all_q[all_q['name'] == 'ASCENA RETAIL GROUP, INC.']

In [None]:
wide_sec[wide_sec.company_name == 'CHAPARRAL ENERGY, INC']

In [None]:
##################

In [None]:
# combine these annual reports with the GCP 2019 file download that contains 1/1/2019 - 9/30/2019 period end dates

In [None]:

# Pull every row of sec_all_usd with a period end date after 2018-12-31
# from the Postgres connection opened earlier.
query = """
SELECT *
FROM sec_all_usd
WHERE  period_end_date > '2018-12-31' 
;
"""

gcp_data = pd.read_sql(query, connection)
gcp_data.head()

In [None]:
gcp_data.shape

In [None]:
wide_sec

In [None]:
gcp_data.company_name.nunique()

In [None]:
gcp_data.columns

In [None]:
# Pivot the GCP data to wide: one row per submission key, one column per measure_tag.
# The key has one more column than the SEC-file pivot: fiscal_year_end.
gcp_key_cols = ['company_name', 'period_end_date', 'number_of_quarters',
                'submission_number', 'central_index_key',
                'ein', 'sic', 'fiscal_year_end', 'fiscal_year', 'form',
                'date_filed']

wide_gcp = (
    gcp_data
    .pivot_table(index=gcp_key_cols, columns='measure_tag', values='value')
    .reset_index()
)
wide_gcp.head()

In [None]:
wide_gcp.shape

In [None]:
# stack the 2 df with 2019 financial reports
# Both inputs come out of reset_index(), so each is labelled 0..n-1 and a plain
# concat would repeat index labels. ignore_index=True gives every stacked row a
# unique label, which the index-aligned `update` calls further down depend on.

sec_2019 = pd.concat([wide_gcp, wide_sec], axis=0, ignore_index=True)
sec_2019.shape

In [None]:
sec_2019[['company_name', 'period_end_date']].value_counts()

In [None]:
sec_2019[sec_2019['company_name'] == 'PLURALSIGHT, INC.']

In [None]:
# like with handling 2014-2018 data, will keep where quarters zero or 4
# then forward fill and back fill
# will filter to have only one row for company and period_end_date


In [None]:
# fill forward within groupby, then back fill within groupby
# (the back fill is done in the next cell)

# leave the NaN's to apply additional logics

# Measure columns to fill. This is the tag list used in the SQL filter earlier in
# the notebook, minus 'TotalAsset'.
# NOTE(review): every name listed here must exist as a column of sec_2019 — confirm.
cols = ['Assets',
       'AssetsCurrent', 'CashAndCashEquivalentsAtCarryingValue',
       'CashAndCashEquivalentsPeriodIncreaseDecrease', 'CommonStockValue',
       'CostsAndExpenses', 'CurrentAsset', 'DeferredIncomeTaxExpenseBenefit',
       'DeferredIncomeTaxLiabilities', 'DeferredIncomeTaxLiabilitiesNet',
       'DeferredIncomeTaxesAndTaxCredits', 'Depreciation',
       'EarningsPerShareBasic', 'EarningsPerShareDiluted', 'Goodwill',
       'GrossProfit', 'IncreaseDecreaseInInventories', 'InterestExpense',
       'InventoryNet', 'Liabilities', 'LiabilitiesAndStockholdersEquity',
       'LiabilitiesCurrent', 'LongTermDebt', 'LongTermDebtCurrent',
       'LongTermDebtMaturitiesRepaymentsOfPrincipalInNextTwelveMonths',
       'LongTermDebtNoncurrent', 'NetCashProvidedByUsedInFinancingActivities',
       'NetCashProvidedByUsedInInvestingActivities',
       'NetCashProvidedByUsedInOperatingActivities', 'NetIncomeLoss',
       'OperatingExpenses', 'OperatingIncomeLoss', 'ProfitLoss',
       'RepaymentsOfLongTermDebt', 'Revenues', 'SalesRevenueGoodsNet',
       'SalesRevenueNet', 'StockholdersEquity', 
       'WorkingCapital']


# Forward fill each measure within its (company_name, period_end_date) group and
# write the filled values back into sec_2019 in place.
sec_2019.update(sec_2019.groupby(['company_name', 'period_end_date'])[cols].ffill())

In [None]:

# Back fill the same measure columns (`cols`, defined in the previous cell) within
# each (company_name, period_end_date) group and write the result back in place.
sec_2019.update(sec_2019.groupby(['company_name', 'period_end_date'])[cols].bfill())

In [None]:
# like done for 2014 - 2018
# If assume companies that have some missing fields have balanced financials, can fill in the missing fields

In [None]:
f1_wide_sec = sec_2019.copy()

In [None]:
# fill in StockholdersEquity
# Where it is missing, derive it as LiabilitiesAndStockholdersEquity - Liabilities.
# The filled Series is assigned back to the column: calling
# fillna(..., inplace=True) on a selected column is chained assignment, which
# pandas warns about and which does not modify the frame under Copy-on-Write.

f1_wide_sec['StockholdersEquity'] = f1_wide_sec['StockholdersEquity'].fillna(
    f1_wide_sec['LiabilitiesAndStockholdersEquity'] - f1_wide_sec['Liabilities']
)

In [None]:
f1_wide_sec['StockholdersEquity'].isna().value_counts()

In [None]:
## fill in LiabilitiesAndStockholdersEquity
# Where it is missing, derive it as StockholdersEquity + Liabilities.
# The filled Series is assigned back to the column: calling
# fillna(..., inplace=True) on a selected column is chained assignment, which
# pandas warns about and which does not modify the frame under Copy-on-Write.

f1_wide_sec['LiabilitiesAndStockholdersEquity'] = f1_wide_sec['LiabilitiesAndStockholdersEquity'].fillna(
    f1_wide_sec['StockholdersEquity'] + f1_wide_sec['Liabilities']
)

In [None]:
f1_wide_sec['LiabilitiesAndStockholdersEquity'].isna().value_counts()

In [None]:
## fill in Liabilities
# Where it is missing, derive it as LiabilitiesAndStockholdersEquity - StockholdersEquity.
# The filled Series is assigned back to the column: calling
# fillna(..., inplace=True) on a selected column is chained assignment, which
# pandas warns about and which does not modify the frame under Copy-on-Write.

f1_wide_sec['Liabilities'] = f1_wide_sec['Liabilities'].fillna(
    f1_wide_sec['LiabilitiesAndStockholdersEquity'] - f1_wide_sec['StockholdersEquity']
)

In [None]:
f1_wide_sec['Liabilities'].isna().value_counts()

In [None]:
# The majority of companies use the AssetsCurrent field; a few used CurrentAsset.
# Fall back to CurrentAsset where AssetsCurrent is missing.
# The filled Series is assigned back to the column: calling
# fillna(..., inplace=True) on a selected column is chained assignment, which
# pandas warns about and which does not modify the frame under Copy-on-Write.

f1_wide_sec['AssetsCurrent'] = f1_wide_sec['AssetsCurrent'].fillna(f1_wide_sec['CurrentAsset'])

In [None]:
f1_wide_sec['Assets'].isna().value_counts()

In [None]:
## Different companies put Revenues in different fields
# Revenue_any takes the first available of Revenues, SalesRevenueNet and
# SalesRevenueGoodsNet, in that order.
# Each fill is assigned back to the column: calling fillna(..., inplace=True) on a
# selected column is chained assignment, which pandas warns about and which does
# not modify the frame under Copy-on-Write.

f1_wide_sec['Revenue_any'] = f1_wide_sec['Revenues']

f1_wide_sec['Revenue_any'] = f1_wide_sec['Revenue_any'].fillna(f1_wide_sec['SalesRevenueNet'])
f1_wide_sec['Revenue_any'] = f1_wide_sec['Revenue_any'].fillna(f1_wide_sec['SalesRevenueGoodsNet'])

In [None]:
f1_wide_sec.columns

In [None]:
f1_wide_sec.head()

In [None]:
# Order the rows by company, then by period end date.
f1_wide_sec = f1_wide_sec.sort_values(by=['company_name', 'period_end_date'])
f1_wide_sec.head()

#date_filed
# NOTE(review): the bare "date_filed" marker above was left by the author —
# possibly a reminder to also sort by filing date; confirm.

In [None]:
# Keep rows with Assets of at least $100MM;
# this is the threshold for the bankruptcy list from UCLA.

min_assets = 1e8
has_min_assets = f1_wide_sec['Assets'] >= min_assets
f2_wide_sec = f1_wide_sec.loc[has_min_assets]
f2_wide_sec.shape

In [None]:
f2_wide_sec.info()

In [None]:
# period_end_date back to a datetime
# f2_wide_sec was produced by boolean filtering of f1_wide_sec. Take an explicit
# copy before assigning a column so the write targets an independent frame
# (avoids pandas' SettingWithCopyWarning).

f2_wide_sec = f2_wide_sec.copy()
f2_wide_sec['period_end_date'] = pd.to_datetime(f2_wide_sec['period_end_date'])

In [None]:
# Keep one row per (company_name, period_end_date): the last row of each group,
# taken in the frame's current row order.
# NOTE(review): which filing ends up last within a group depends on the row order
# left by the earlier concat/sort — confirm it selects the intended report.
f3_wide_sec  = f2_wide_sec.groupby(['company_name', 'period_end_date']).tail(1)
f3_wide_sec.shape

In [None]:
f3_wide_sec['company_name'].value_counts()

In [None]:
f3_wide_sec[f3_wide_sec['company_name'] == 'ULTA BEAUTY, INC.']

In [None]:
f3_wide_sec.period_end_date.value_counts()

In [None]:
f3_wide_sec.company_name.nunique()