In [None]:
"""

Extra text files had to be downloaded from SEC website because
annual 10-K filing for Q4 2019 are not in the Google Cloud Platform database

these files were merged and prepared to be like extracts from Google Cloud SEC database

stack all pickled files together and further refine data to one 2019 report per company

saved prepared 2019 financial report data to local postgres db

TEST HOLD OUT DATA
"""

In [1]:
import pandas as pd
import numpy as np
import datetime as dt

In [2]:
import pandas.io.sql as pd_sql

from pandasql import sqldf
# PandaSQL needs to be able to reference the global variables already defined
pysqldf = lambda q: sqldf(q, globals())

In [3]:
import pickle

In [4]:
import psycopg2 as pg

# Postgres info to connect

connection_args = {
    'host': 'localhost',  
    'dbname': 'bankruptcy',  
    'port': 5432          
}

connection = pg.connect(**connection_args)
connection.autocommit = True   

In [5]:
with open('q4_2019.pickle', 'rb') as read_file:
    q4_2019 = pickle.load(read_file)
    
q4_2019.shape

(2402955, 44)

In [7]:
with open('q1_2020.pickle', 'rb') as read_file:
    q1_2020 = pickle.load(read_file)
    
q1_2020.shape

(3005393, 44)

In [16]:
combined = pd.concat([q4_2019,q1_2020], axis=0 )
combined.shape

(5408348, 44)

In [17]:
## stack with Q2 2020

with open('q2_2020.pickle', 'rb') as read_file:
    q2_2020 = pickle.load(read_file)
    
q2_2020.shape


(2197732, 44)

In [18]:
combined2 = pd.concat([combined,q2_2020], axis=0 )
combined2.shape

(7606080, 44)

In [19]:
## stack with Q3 2020

with open('Q3_2020.pickle', 'rb') as read_file:
    Q3_2020 = pickle.load(read_file)
    
Q3_2020.shape

(2351640, 44)

In [20]:
combined3 = pd.concat([combined2,Q3_2020], axis=0 )
combined3.shape

(9957720, 44)

In [21]:
combined3.columns

Index(['adsh', 'measure_tag', 'version', 'coreg', 'period_end_date',
       'number_of_quarters', 'units', 'value', 'footnote', 'cik', 'name',
       'sic', 'countryba', 'stprba', 'cityba', 'zipba', 'bas1', 'bas2', 'baph',
       'countryma', 'stprma', 'cityma', 'zipma', 'mas1', 'mas2', 'countryinc',
       'stprinc', 'ein', 'former', 'changed', 'afs', 'wksi', 'fye', 'form',
       'period', 'fy', 'fp', 'filed', 'accepted', 'prevrpt', 'detail',
       'instance', 'nciks', 'aciks'],
      dtype='object')

In [None]:
########################

In [28]:
# apply filters to keep only 10-K reports from US companies
f1_all_q1 = combined3[combined3['countryba'] == 'US']
f1_all_q1.shape

(8639330, 44)

In [29]:
f2_all_q1 = f1_all_q1[f1_all_q1['form'].isin(['10-K', '10-K/A'])]
f2_all_q1.shape

(2840043, 44)

In [30]:
# fp is Fiscal Period Focus.  The 10-K is to have a fiscal period focus of FY
f2_all_q1.fp.unique()

array(['FY', nan], dtype=object)

In [31]:
# keep only the columns that match GCP fields
# rename columns 

f3_all_q1 = f2_all_q1[['name', 'period', 'number_of_quarters', 'adsh', 'cik', 'ein', 
           'sic', 'fy', 'form', 'filed', 'measure_tag', 'value', 'units' ]]
f3_all_q1.shape

(2840043, 13)

In [32]:
f4_all_q1 = f3_all_q1[f3_all_q1['period'] > dt.datetime(2018,12,31)]

f4_all_q1.head()


Unnamed: 0,name,period,number_of_quarters,adsh,cik,ein,sic,fy,form,filed,measure_tag,value,units
0,FERRELLGAS PARTNERS L P,2019-07-31,0,0001558370-19-008908,922358,431698480,5900,2019,10-K,20191015,DerivativeNotionalAmount,100000000.0,USD
1,FERRELLGAS PARTNERS L P,2019-07-31,0,0001558370-19-008908,922358,431698480,5900,2019,10-K,20191015,DerivativeNotionalAmount,175000000.0,USD
2,FERRELLGAS PARTNERS L P,2019-07-31,0,0001558370-19-008908,922358,431698480,5900,2019,10-K,20191015,DerivativeNotionalAmount,140000000.0,USD
3,FERRELLGAS PARTNERS L P,2019-07-31,0,0001558370-19-008908,922358,431698480,5900,2019,10-K,20191015,DerivativeNotionalAmount,100000000.0,USD
4,FERRELLGAS PARTNERS L P,2019-07-31,0,0001558370-19-008908,922358,431698480,5900,2019,10-K,20191015,DerivativeNotionalAmount,100000000.0,USD


In [33]:
f4_all_q1.shape

(2812070, 13)

In [34]:
max(f4_all_q1.period)

Timestamp('2020-07-31 00:00:00')

In [35]:
f4_all_q1.period.value_counts()

2019-12-31    2272972
2019-09-30     122524
2020-06-30      95914
2020-03-31      77205
2020-01-31      73990
2019-10-31      29898
2020-04-30      21467
2019-08-31      21196
2020-05-31      19749
2019-07-31      15313
2019-06-30      14425
2019-11-30      14373
2020-02-29      14261
2020-07-31      12780
2019-03-31       3126
2019-04-30       1816
2019-02-28        739
2019-01-31        236
2019-05-31         86
Name: period, dtype: int64

In [36]:
# rename columns so will match GCP labeling

f4_all_q1.columns = ['company_name', 'period_end_date', 'number_of_quarters',
                    'submission_number', 'central_index_key', 'ein',
                    'sic', 'fiscal_year', 'form', 'date_filed',
                    'measure_tag', 'value', 'units']

In [37]:
f4_all_q1.head()

Unnamed: 0,company_name,period_end_date,number_of_quarters,submission_number,central_index_key,ein,sic,fiscal_year,form,date_filed,measure_tag,value,units
0,FERRELLGAS PARTNERS L P,2019-07-31,0,0001558370-19-008908,922358,431698480,5900,2019,10-K,20191015,DerivativeNotionalAmount,100000000.0,USD
1,FERRELLGAS PARTNERS L P,2019-07-31,0,0001558370-19-008908,922358,431698480,5900,2019,10-K,20191015,DerivativeNotionalAmount,175000000.0,USD
2,FERRELLGAS PARTNERS L P,2019-07-31,0,0001558370-19-008908,922358,431698480,5900,2019,10-K,20191015,DerivativeNotionalAmount,140000000.0,USD
3,FERRELLGAS PARTNERS L P,2019-07-31,0,0001558370-19-008908,922358,431698480,5900,2019,10-K,20191015,DerivativeNotionalAmount,100000000.0,USD
4,FERRELLGAS PARTNERS L P,2019-07-31,0,0001558370-19-008908,922358,431698480,5900,2019,10-K,20191015,DerivativeNotionalAmount,100000000.0,USD


In [38]:
f4_all_q1.company_name.nunique()

5058

In [None]:
#f4_all_q1[f4_all_q1.company_name == 'HORNBECK OFFSHORE SERVICES INC /LA']

In [39]:
# filter to only measurements queried and downloaded from GCP

query = """ 
SELECT *
FROM f4_all_q1 as a 
where measure_tag in ('Assets',
       'AssetsCurrent', 'CashAndCashEquivalentsAtCarryingValue',
       'CashAndCashEquivalentsPeriodIncreaseDecrease', 'CommonStockValue',
       'CostsAndExpenses', 'CurrentAsset', 'DeferredIncomeTaxExpenseBenefit',
       'DeferredIncomeTaxLiabilities', 'DeferredIncomeTaxLiabilitiesNet',
       'DeferredIncomeTaxesAndTaxCredits', 'Depreciation',
       'EarningsPerShareBasic', 'EarningsPerShareDiluted', 'Goodwill',
       'GrossProfit', 'IncreaseDecreaseInInventories', 'InterestExpense',
       'InventoryNet', 'Liabilities', 'LiabilitiesAndStockholdersEquity',
       'LiabilitiesCurrent', 'LongTermDebt', 'LongTermDebtCurrent',
       'LongTermDebtMaturitiesRepaymentsOfPrincipalInNextTwelveMonths',
       'LongTermDebtNoncurrent', 'NetCashProvidedByUsedInFinancingActivities',
       'NetCashProvidedByUsedInInvestingActivities',
       'NetCashProvidedByUsedInOperatingActivities', 'NetIncomeLoss',
       'OperatingExpenses', 'OperatingIncomeLoss', 'ProfitLoss',
       'RepaymentsOfLongTermDebt', 'Revenues', 'SalesRevenueGoodsNet',
       'SalesRevenueNet', 'StockholdersEquity', 'TotalAsset',
       'WorkingCapital')
"""

f5_all_q1 = pysqldf(query)
f5_all_q1.head()



Unnamed: 0,company_name,period_end_date,number_of_quarters,submission_number,central_index_key,ein,sic,fiscal_year,form,date_filed,measure_tag,value,units
0,OIL-DRI CORP OF AMERICA,2019-07-31 00:00:00.000000,4,0000074046-19-000049,74046,362048898,3990,2019,10-K,20191010,DeferredIncomeTaxesAndTaxCredits,7270000.0,USD
1,OIL-DRI CORP OF AMERICA,2019-07-31 00:00:00.000000,4,0000074046-19-000049,74046,362048898,3990,2019,10-K,20191010,DeferredIncomeTaxesAndTaxCredits,-406000.0,USD
2,"ASCENA RETAIL GROUP, INC.",2019-07-31 00:00:00.000000,4,0001498301-19-000092,1498301,300641353,5600,2019,10-K,20191010,DeferredIncomeTaxesAndTaxCredits,-371300000.0,USD
3,"ASCENA RETAIL GROUP, INC.",2019-07-31 00:00:00.000000,4,0001498301-19-000092,1498301,300641353,5600,2019,10-K,20191010,DeferredIncomeTaxesAndTaxCredits,-47100000.0,USD
4,"ASCENA RETAIL GROUP, INC.",2019-07-31 00:00:00.000000,4,0001498301-19-000092,1498301,300641353,5600,2019,10-K,20191010,DeferredIncomeTaxesAndTaxCredits,-10000000.0,USD


In [40]:
f5_all_q1.shape

(359251, 13)

In [41]:
# check all units in USD
f5_all_q1.units.unique()

array(['USD', 'EUR', 'SAR', 'ILS', 'CAD', 'CHF'], dtype=object)

In [42]:
# keep only USD,  as found in EDA, some non-US companies file the US form and no requirement they don't

f6_all_q1 = f5_all_q1[f5_all_q1['units'] == 'USD']
f6_all_q1.shape

(359190, 13)

In [43]:
# df is tall on each measurement 
# pivot to wide


wide_sec = f6_all_q1.pivot_table(index=['company_name','period_end_date', 'number_of_quarters',
                                              'submission_number','central_index_key',
                                             'ein', 'sic', 'fiscal_year', 'form',
                                             'date_filed'],
                        columns='measure_tag', values='value').reset_index()
wide_sec.head()


measure_tag,company_name,period_end_date,number_of_quarters,submission_number,central_index_key,ein,sic,fiscal_year,form,date_filed,...,NetIncomeLoss,OperatingExpenses,OperatingIncomeLoss,ProfitLoss,RepaymentsOfLongTermDebt,Revenues,SalesRevenueGoodsNet,SalesRevenueNet,StockholdersEquity,WorkingCapital
0,1 800 FLOWERS COM INC,2020-06-30 00:00:00.000000,0,0001437749-20-019622,1084869,113117311,5990,2020,10-K,20200911,...,,,,,,,,,334907000.0,
1,1 800 FLOWERS COM INC,2020-06-30 00:00:00.000000,4,0001437749-20-019622,1084869,113117311,5990,2020,10-K,20200911,...,44851670.0,490274000.0,55506670.0,,,171200000.0,,,,
2,"10X GENOMICS, INC.",2019-12-31 00:00:00.000000,0,0001193125-20-052640,1770787,455614458,3826,2019,10-K,20200227,...,,,,,,,,,-2211000.0,
3,"10X GENOMICS, INC.",2019-12-31 00:00:00.000000,1,0001193125-20-052640,1770787,455614458,3826,2019,10-K,20200227,...,-17967000.0,,,,,49025750.0,,,,
4,"10X GENOMICS, INC.",2019-12-31 00:00:00.000000,4,0001193125-20-052640,1770787,455614458,3826,2019,10-K,20200227,...,-54166000.0,174249700.0,-53237330.0,,704000.0,154430300.0,,,,


In [44]:
wide_sec.shape

(13434, 48)

In [45]:
wide_sec.company_name.nunique()

5049

In [None]:
##################

In [None]:
# combine these annual reports with GCP 2019 file download that contains 1/1/2019 - 9/30/2019 priod end dates

In [46]:

query = """
SELECT *
FROM sec_all_usd
WHERE  period_end_date > '2018-12-31' 
;
"""

gcp_data = pd_sql.read_sql(query, connection)
gcp_data.head()

Unnamed: 0,submission_number,company_name,measure_tag,period_end_date,value,units,number_of_quarters,version,central_index_key,ein,sic,fiscal_year_end,form,fiscal_year,fiscal_period_focus,date_filed,date_accepted
0,0000004457-19-000024,AMERCO /NV/,Assets,2019-03-31,11891710000.0,USD,0,us-gaap/2018,4457,880106815,7510,331,10-K,2018,FY,2019-05-29,2019-05-29 17:04:00 UTC
1,0000004457-19-000024,AMERCO /NV/,CashAndCashEquivalentsAtCarryingValue,2019-03-31,673701000.0,USD,0,us-gaap/2018,4457,880106815,7510,331,10-K,2018,FY,2019-05-29,2019-05-29 17:04:00 UTC
2,0000004457-19-000024,AMERCO /NV/,CashAndCashEquivalentsPeriodIncreaseDecrease,2019-03-31,-85687000.0,USD,4,us-gaap/2018,4457,880106815,7510,331,10-K,2018,FY,2019-05-29,2019-05-29 17:04:00 UTC
3,0000004457-19-000024,AMERCO /NV/,CostsAndExpenses,2019-03-31,3147720000.0,USD,4,us-gaap/2018,4457,880106815,7510,331,10-K,2018,FY,2019-05-29,2019-05-29 17:04:00 UTC
4,0000004457-19-000024,AMERCO /NV/,DeferredIncomeTaxLiabilities,2019-03-31,963273000.0,USD,0,us-gaap/2018,4457,880106815,7510,331,10-K,2018,FY,2019-05-29,2019-05-29 17:04:00 UTC


In [47]:
gcp_data.shape

(16495, 17)

In [48]:
gcp_data.company_name.nunique()

762

In [49]:
gcp_data.columns

Index(['submission_number', 'company_name', 'measure_tag', 'period_end_date',
       'value', 'units', 'number_of_quarters', 'version', 'central_index_key',
       'ein', 'sic', 'fiscal_year_end', 'form', 'fiscal_year',
       'fiscal_period_focus', 'date_filed', 'date_accepted'],
      dtype='object')

In [50]:
# pivot wide

wide_gcp = gcp_data.pivot_table(index=['company_name','period_end_date', 'number_of_quarters',
                                              'submission_number','central_index_key',
                                             'ein', 'sic', 'fiscal_year_end', 'fiscal_year', 'form',
                                             'date_filed'],
                        columns='measure_tag', values='value').reset_index()
wide_gcp.head()

measure_tag,company_name,period_end_date,number_of_quarters,submission_number,central_index_key,ein,sic,fiscal_year_end,fiscal_year,form,...,NetCashProvidedByUsedInInvestingActivities,NetCashProvidedByUsedInOperatingActivities,NetIncomeLoss,OperatingExpenses,OperatingIncomeLoss,ProfitLoss,RepaymentsOfLongTermDebt,Revenues,SalesRevenueNet,StockholdersEquity
0,1 800 FLOWERS COM INC,2019-06-30,0,0001437749-19-018360,1084869,113117311,5990,630,2019,10-K,...,,,,,,,,,,342711000.0
1,1 800 FLOWERS COM INC,2019-06-30,4,0001437749-19-018360,1084869,113117311,5990,630,2019,10-K,...,-32560000.0,78100000.0,34766000.0,481013000.0,45108000.0,,,,,
2,3AM TECHNOLOGIES INC,2019-05-31,0,0001078782-19-000724,1667615,0,3670,531,2019,10-K/A,...,,,,,,,,,,-29699.0
3,3AM TECHNOLOGIES INC,2019-05-31,4,0001078782-19-000724,1667615,0,3670,531,2019,10-K/A,...,,-25386.0,-34311.0,34311.0,,-34311.0,,,,
4,8I ENTERPRISES ACQUISITION CORP.,2019-07-31,0,0001493152-19-014287,1753648,0,6770,731,2019,10-K,...,,,,,,,,,,5000010.0


In [51]:
wide_gcp.shape

(2067, 44)

In [52]:
# stack the 2 df with 2019 financial reports

sec_2019 = pd.concat([wide_gcp, wide_sec], axis=0)
sec_2019.shape

(15501, 50)

In [53]:
sec_2019[['company_name', 'period_end_date']].value_counts()

company_name                          period_end_date           
HARTMAN VREIT XXI, INC.               2019-12-31 00:00:00.000000    15
USA TECHNOLOGIES INC                  2019-06-30 00:00:00.000000    12
FLOTEK INDUSTRIES INC/CN/             2019-12-31 00:00:00.000000     9
TRADEWEB MARKETS INC.                 2019-12-31 00:00:00.000000     8
ALBERTSONS COMPANIES, INC.            2020-02-29 00:00:00.000000     8
                                                                    ..
BRADY CORP                            2019-04-30 00:00:00            1
COMTECH TELECOMMUNICATIONS CORP /DE/  2019-04-30 00:00:00            1
AMERICAS CARMART INC                  2019-01-31 00:00:00            1
COMTECH TELECOMMUNICATIONS CORP /DE/  2019-01-31 00:00:00            1
MICROSOFT CORP                        2019-03-31 00:00:00            1
Length: 6000, dtype: int64

In [None]:
# like with handling 2014-2018 data, will keep where quarters zero or 4
# then forward fill and back fill
# will filter to have only one row for company and period_end_date


In [55]:

sec_df2 = sec_2019[sec_2019['number_of_quarters'].isin([0,4])]
sec_df2.shape

(11864, 50)

In [56]:
# fill forward within groupby, then back fill within groupby

# leave the NaN's to apply additional logics

cols = ['Assets',
       'AssetsCurrent', 'CashAndCashEquivalentsAtCarryingValue',
       'CashAndCashEquivalentsPeriodIncreaseDecrease', 'CommonStockValue',
       'CostsAndExpenses', 'CurrentAsset', 'DeferredIncomeTaxExpenseBenefit',
       'DeferredIncomeTaxLiabilities', 'DeferredIncomeTaxLiabilitiesNet',
       'DeferredIncomeTaxesAndTaxCredits', 'Depreciation',
       'EarningsPerShareBasic', 'EarningsPerShareDiluted', 'Goodwill',
       'GrossProfit', 'IncreaseDecreaseInInventories', 'InterestExpense',
       'InventoryNet', 'Liabilities', 'LiabilitiesAndStockholdersEquity',
       'LiabilitiesCurrent', 'LongTermDebt', 'LongTermDebtCurrent',
       'LongTermDebtMaturitiesRepaymentsOfPrincipalInNextTwelveMonths',
       'LongTermDebtNoncurrent', 'NetCashProvidedByUsedInFinancingActivities',
       'NetCashProvidedByUsedInInvestingActivities',
       'NetCashProvidedByUsedInOperatingActivities', 'NetIncomeLoss',
       'OperatingExpenses', 'OperatingIncomeLoss', 'ProfitLoss',
       'RepaymentsOfLongTermDebt', 'Revenues', 'SalesRevenueGoodsNet',
       'SalesRevenueNet', 'StockholdersEquity', 
       'WorkingCapital']


sec_df2.update(sec_df2.groupby(['company_name', 'period_end_date'])[cols].ffill())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = expressions.where(mask, this, that)


In [57]:

sec_df2.update(sec_df2.groupby(['company_name', 'period_end_date'])[cols].bfill())

In [None]:
# like done for 2014 - 2018
# If assume companies that have some missing fields have balanced financials, can fill in the missing fields

In [58]:
f1_wide_sec = sec_df2.copy()

In [59]:
# fill in StockholdersEquity 

f1_wide_sec['StockholdersEquity'].fillna(f1_wide_sec['LiabilitiesAndStockholdersEquity'] - f1_wide_sec['Liabilities'], inplace=True)

In [60]:
f1_wide_sec['StockholdersEquity'].isna().value_counts()

False    11593
True       271
Name: StockholdersEquity, dtype: int64

In [61]:
## fill in LiabilitiesAndStockholdersEquity

f1_wide_sec['LiabilitiesAndStockholdersEquity'].fillna(f1_wide_sec['StockholdersEquity'] + f1_wide_sec['Liabilities'], inplace=True)

In [62]:
f1_wide_sec['LiabilitiesAndStockholdersEquity'].isna().value_counts()

False    11767
True        97
Name: LiabilitiesAndStockholdersEquity, dtype: int64

In [63]:
## fill in Liabilities

f1_wide_sec['Liabilities'].fillna(f1_wide_sec['LiabilitiesAndStockholdersEquity'] - f1_wide_sec['StockholdersEquity'], inplace=True)

In [64]:
f1_wide_sec['Liabilities'].isna().value_counts()

False    11629
True       235
Name: Liabilities, dtype: int64

In [65]:
# majority of companies use the AssetsCurrent field.  Few used CurrentAsset

f1_wide_sec['AssetsCurrent'].fillna(f1_wide_sec['CurrentAsset'], inplace=True)

In [66]:
f1_wide_sec['Assets'].isna().value_counts()

False    11730
True       134
Name: Assets, dtype: int64

In [67]:
## Different companies put Revenues in different fields

f1_wide_sec['Revenue_any'] = f1_wide_sec['Revenues']

f1_wide_sec['Revenue_any'].fillna(f1_wide_sec['SalesRevenueNet'], inplace=True)
f1_wide_sec['Revenue_any'].fillna(f1_wide_sec['SalesRevenueGoodsNet'], inplace=True)

In [68]:
f1_wide_sec.columns

Index(['company_name', 'period_end_date', 'number_of_quarters',
       'submission_number', 'central_index_key', 'ein', 'sic',
       'fiscal_year_end', 'fiscal_year', 'form', 'date_filed', 'Assets',
       'AssetsCurrent', 'CashAndCashEquivalentsAtCarryingValue',
       'CashAndCashEquivalentsPeriodIncreaseDecrease', 'CommonStockValue',
       'CostsAndExpenses', 'CurrentAsset', 'DeferredIncomeTaxExpenseBenefit',
       'DeferredIncomeTaxLiabilities', 'DeferredIncomeTaxLiabilitiesNet',
       'DeferredIncomeTaxesAndTaxCredits', 'Depreciation',
       'EarningsPerShareBasic', 'EarningsPerShareDiluted', 'GrossProfit',
       'Liabilities', 'LiabilitiesAndStockholdersEquity', 'LiabilitiesCurrent',
       'LongTermDebt', 'LongTermDebtCurrent',
       'LongTermDebtMaturitiesRepaymentsOfPrincipalInNextTwelveMonths',
       'LongTermDebtNoncurrent', 'NetCashProvidedByUsedInFinancingActivities',
       'NetCashProvidedByUsedInInvestingActivities',
       'NetCashProvidedByUsedInOperatingActiv

In [69]:
f1_wide_sec.head()

measure_tag,company_name,period_end_date,number_of_quarters,submission_number,central_index_key,ein,sic,fiscal_year_end,fiscal_year,form,...,Revenues,SalesRevenueNet,StockholdersEquity,Goodwill,IncreaseDecreaseInInventories,InterestExpense,InventoryNet,SalesRevenueGoodsNet,WorkingCapital,Revenue_any
0,1 800 FLOWERS COM INC,2019-06-30 00:00:00,0,0001437749-19-018360,1084869,113117311,5990,630.0,2019,10-K,...,,,342711000.0,,,,,,,
1,1 800 FLOWERS COM INC,2019-06-30 00:00:00,4,0001437749-19-018360,1084869,113117311,5990,630.0,2019,10-K,...,,,342711000.0,,,,,,,
2,3AM TECHNOLOGIES INC,2019-05-31 00:00:00,0,0001078782-19-000724,1667615,0,3670,531.0,2019,10-K/A,...,,,-29699.0,,,,,,,
3,3AM TECHNOLOGIES INC,2019-05-31 00:00:00,4,0001078782-19-000724,1667615,0,3670,531.0,2019,10-K/A,...,,,-29699.0,,,,,,,
4,8I ENTERPRISES ACQUISITION CORP.,2019-07-31 00:00:00,0,0001493152-19-014287,1753648,0,6770,731.0,2019,10-K,...,,,5000010.0,,,,,,,


In [70]:
# sort df
f1_wide_sec.sort_values(by=['company_name', 'period_end_date'], inplace=True)
f1_wide_sec.head()

#date_filed

measure_tag,company_name,period_end_date,number_of_quarters,submission_number,central_index_key,ein,sic,fiscal_year_end,fiscal_year,form,...,Revenues,SalesRevenueNet,StockholdersEquity,Goodwill,IncreaseDecreaseInInventories,InterestExpense,InventoryNet,SalesRevenueGoodsNet,WorkingCapital,Revenue_any
0,1 800 FLOWERS COM INC,2019-06-30 00:00:00,0,0001437749-19-018360,1084869,113117311,5990,630.0,2019,10-K,...,,,342711000.0,,,,,,,
1,1 800 FLOWERS COM INC,2019-06-30 00:00:00,4,0001437749-19-018360,1084869,113117311,5990,630.0,2019,10-K,...,,,342711000.0,,,,,,,
0,1 800 FLOWERS COM INC,2020-06-30 00:00:00.000000,0,0001437749-20-019622,1084869,113117311,5990,,2020,10-K,...,171200000.0,,334907000.0,53003000.0,6956667.0,,95060500.0,,,171200000.0
1,1 800 FLOWERS COM INC,2020-06-30 00:00:00.000000,4,0001437749-20-019622,1084869,113117311,5990,,2020,10-K,...,171200000.0,,334907000.0,53003000.0,6956667.0,,95060500.0,,,171200000.0
2,"10X GENOMICS, INC.",2019-12-31 00:00:00.000000,0,0001193125-20-052640,1770787,455614458,3826,,2019,10-K,...,154430300.0,,-2211000.0,,4142000.0,2099667.0,11920000.0,,,154430300.0


In [71]:
# filter to where Assets >= 100MM
# this is the threshold for the bankruptcy list from UCLA

f2_wide_sec = f1_wide_sec[f1_wide_sec['Assets'] >= 1e8]
f2_wide_sec.shape

(7814, 51)

In [72]:
f2_wide_sec.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7814 entries, 0 to 13433
Data columns (total 51 columns):
 #   Column                                                         Non-Null Count  Dtype  
---  ------                                                         --------------  -----  
 0   company_name                                                   7814 non-null   object 
 1   period_end_date                                                7814 non-null   object 
 2   number_of_quarters                                             7814 non-null   int64  
 3   submission_number                                              7814 non-null   object 
 4   central_index_key                                              7814 non-null   object 
 5   ein                                                            7814 non-null   object 
 6   sic                                                            7814 non-null   object 
 7   fiscal_year_end                                            

In [73]:
# period_end_date back to a datetime

f2_wide_sec['period_end_date']= pd.to_datetime(f2_wide_sec['period_end_date'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  f2_wide_sec['period_end_date']= pd.to_datetime(f2_wide_sec['period_end_date'])


In [80]:
f3_wide_sec  = f2_wide_sec.groupby(['company_name', 'period_end_date']).tail(1)
f3_wide_sec.shape

(3874, 51)

In [75]:
f3_wide_sec['company_name'].value_counts()

CULP INC                       3
CARPENTER TECHNOLOGY CORP      2
PCSB FINANCIAL CORP            2
REX AMERICAN RESOURCES CORP    2
EXPRESS, INC.                  2
                              ..
TUSCAN HOLDINGS CORP.          1
CINEMARK HOLDINGS, INC.        1
PLUMAS BANCORP                 1
VERACYTE, INC.                 1
MIDDLESEX WATER CO             1
Name: company_name, Length: 3530, dtype: int64

In [76]:
f4_wide_sec = f3_wide_sec[f3_wide_sec['period_end_date'] < dt.datetime(2020,1,1)]

In [77]:
f4_wide_sec.shape

(3485, 51)

In [78]:
f4_wide_sec.company_name.nunique()

3484

In [81]:
#########
# add industry and industry division

query_ind = "SELECT * FROM sic_codes;"

sic_codes = pd_sql.read_sql(query_ind, connection)
sic_codes.columns

Index(['SIC Code', 'Industry', 'Unnamed: 2'], dtype='object')

In [82]:
ind_wide_sec = pd.merge(f4_wide_sec, sic_codes, how='left', left_on='sic', right_on='SIC Code' )

In [83]:
ind_wide_sec.head()

Unnamed: 0,company_name,period_end_date,number_of_quarters,submission_number,central_index_key,ein,sic,fiscal_year_end,fiscal_year,form,...,Goodwill,IncreaseDecreaseInInventories,InterestExpense,InventoryNet,SalesRevenueGoodsNet,WorkingCapital,Revenue_any,SIC Code,Industry,Unnamed: 2
0,1 800 FLOWERS COM INC,2019-06-30,4,0001437749-19-018360,1084869,113117311,5990,630.0,2019,10-K,...,,,,,,,,5990,"Retail-Retail Stores, NEC",
1,"10X GENOMICS, INC.",2019-12-31,4,0001193125-20-052640,1770787,455614458,3826,,2019,10-K,...,,4142000.0,2099667.0,11920000.0,,,154430300.0,3826,Laboratory Analytical Instruments,
2,"1347 PROPERTY INSURANCE HOLDINGS, INC.",2019-12-31,4,0001493152-20-005206,1591890,461119100,6331,,2019,10-K,...,,,,,,,,6331,"Fire, Marine & Casualty Insurance",
3,"1895 BANCORP OF WISCONSIN, INC.",2019-12-31,4,0001564590-20-014188,1751692,0,6036,,2019,10-K,...,,,4583000.0,,,,,6036,"Savings Institutions, Not Federally Chartered",
4,1LIFE HEALTHCARE INC,2019-12-31,4,0001564590-20-013666,1404123,760707204,8011,,2019,10-K,...,21301000.0,158000.0,704000.0,3521500.0,,,,8011,Services-Offices & Clinics of Doctors of Medicine,


In [84]:
ind_wide_sec.shape

(3485, 54)

In [85]:
# add overall industry division discription

In [86]:
ind_wide_sec['div_code'] = ind_wide_sec['sic'].str[:2]

In [87]:
query_div = "SELECT * FROM sic_divisions;"

sic_div = pd_sql.read_sql(query_div, connection)
sic_div.columns

Index(['div_code', 'Division'], dtype='object')

In [88]:

div_wide_sec = pd.merge(ind_wide_sec, sic_div, how='left', on='div_code')

div_wide_sec.shape

(3485, 56)

In [89]:
div_wide_sec['Division'].value_counts()

Manufacturing                                                         1122
Finance, Insurance and Real Estate                                    1078
Services                                                               481
Transportation, Communications, Electric, Gas and Sanitary service     310
Retail Trade                                                           190
Mining                                                                 158
Wholesale Trade                                                         90
Construction                                                            44
Agriculture, Forestry and Fishing                                       12
Name: Division, dtype: int64

In [90]:
div_wide_sec['Division'].count()

3485

In [None]:
## then to db table? 

In [91]:
from sqlalchemy import create_engine
engine = create_engine('postgresql://amybutler:localhost@localhost:5432/bankruptcy')

In [92]:
div_wide_sec.to_sql('sec_prep_2019', engine, index=False, if_exists='replace')

In [93]:
#check table saved and can be queried
query = 'SELECT * FROM sec_prep_2019 LIMIT 5;'
df = pd.read_sql(query, engine)

df.head()

Unnamed: 0,company_name,period_end_date,number_of_quarters,submission_number,central_index_key,ein,sic,fiscal_year_end,fiscal_year,form,...,InterestExpense,InventoryNet,SalesRevenueGoodsNet,WorkingCapital,Revenue_any,SIC Code,Industry,Unnamed: 2,div_code,Division
0,1 800 FLOWERS COM INC,2019-06-30,4,0001437749-19-018360,1084869,113117311,5990,630.0,2019,10-K,...,,,,,,5990,"Retail-Retail Stores, NEC",,59,Retail Trade
1,"10X GENOMICS, INC.",2019-12-31,4,0001193125-20-052640,1770787,455614458,3826,,2019,10-K,...,2099667.0,11920000.0,,,154430300.0,3826,Laboratory Analytical Instruments,,38,Manufacturing
2,"1347 PROPERTY INSURANCE HOLDINGS, INC.",2019-12-31,4,0001493152-20-005206,1591890,461119100,6331,,2019,10-K,...,,,,,,6331,"Fire, Marine & Casualty Insurance",,63,"Finance, Insurance and Real Estate"
3,"1895 BANCORP OF WISCONSIN, INC.",2019-12-31,4,0001564590-20-014188,1751692,0,6036,,2019,10-K,...,4583000.0,,,,,6036,"Savings Institutions, Not Federally Chartered",,60,"Finance, Insurance and Real Estate"
4,1LIFE HEALTHCARE INC,2019-12-31,4,0001564590-20-013666,1404123,760707204,8011,,2019,10-K,...,704000.0,3521500.0,,,,8011,Services-Offices & Clinics of Doctors of Medicine,,80,Services


In [94]:

with open('sec_test_data.pickle', 'wb') as to_write:
    pickle.dump(wide_sec, to_write)