In [None]:
"""
Prepare the training/validation set from SEC data with period end dates
between 1/1/2014 and 12/31/2018.

Filter the SEC data to companies with assets > $100MM, because the initial
target list consists of bankruptcies of large public companies with more
than this amount of assets.

Examine the number of missing data points in the SEC data.

Pickle the filtered df.

"""

In [1]:
import psycopg2 as pg
import pandas as pd
import pandas.io.sql as pd_sql
import numpy as np
import datetime

In [2]:
from pandasql import sqldf
# pandasql resolves table names by looking up variables, so hand it this
# notebook's global namespace (where the already-defined DataFrames live).
def pysqldf(q):
    """Run the SQL string `q` through pandasql against the notebook's global variables."""
    return sqldf(q, globals())

In [3]:
# Connection parameters for the local Postgres database named 'bankruptcy'
connection_args = dict(host='localhost', dbname='bankruptcy', port=5432)

# Open the connection and turn autocommit on
connection = pg.connect(**connection_args)
connection.autocommit = True
                              

In [4]:
# Load every column of sec_wide_all for periods ending before 2019.
# All 2019 data is held out for the test set.

query = """
SELECT *
FROM sec_wide_all
WHERE  period_end_date < '2019-01-01' 
;
"""

sec_data = pd.read_sql(query, con=connection)
sec_data.head()


Unnamed: 0,company_name,period_end_date,submission_number,central_index_key,ein,sic,fiscal_year_end,fiscal_year,form,date_filed,...,OperatingIncomeLoss,ProfitLoss,RepaymentsOfLongTermDebt,Revenues,SalesRevenueNet,StockholdersEquity,TotalAsset,Industry,div_code,Division
0,"'MKTG, INC.'",2014-03-31 00:00:00.000000,0001019056-14-000881,886475,61340408,7310,331,2013,10-K,2014-06-27 00:00:00.000000,...,2830011.0,2293272.0,,,,12190322.0,,Services-Advertising,73,Services
1,"024 PHARMA, INC.",2015-12-31 00:00:00.000000,0001683168-17-000653,1307969,201862731,3089,1231,2016,10-K,2017-03-24 00:00:00.000000,...,,,,,,137994.0,,"Plastics Products, NEC",30,Manufacturing
2,"024 PHARMA, INC.",2016-12-31 00:00:00.000000,0001683168-17-000653,1307969,201862731,3089,1231,2016,10-K,2017-03-24 00:00:00.000000,...,,,,1079541.0,,759035.0,,"Plastics Products, NEC",30,Manufacturing
3,1 800 FLOWERS COM INC,2014-06-30 00:00:00.000000,0001437749-14-016921,1084869,113117311,5990,630,2014,10-K,2014-09-12 00:00:00.000000,...,23706000.0,14675000.0,,,756345000.0,183199000.0,,"Retail-Retail Stores, NEC",59,Retail Trade
4,1 800 FLOWERS COM INC,2014-06-30 00:00:00.000000,0001437749-15-017184,1084869,113117311,5990,630,2015,10-K,2015-09-11 00:00:00.000000,...,23706000.0,14675000.0,,,756345000.0,183228000.0,,"Retail-Retail Stores, NEC",59,Retail Trade


In [5]:
# List every column returned by the query to see which fields are available
sec_data.columns

Index(['company_name', 'period_end_date', 'submission_number',
       'central_index_key', 'ein', 'sic', 'fiscal_year_end', 'fiscal_year',
       'form', 'date_filed', 'Assets', 'AssetsCurrent',
       'CashAndCashEquivalentsAtCarryingValue',
       'CashAndCashEquivalentsPeriodIncreaseDecrease', 'CommonStockValue',
       'CostsAndExpenses', 'CurrentAsset', 'DeferredIncomeTaxExpenseBenefit',
       'DeferredIncomeTaxLiabilities', 'DeferredIncomeTaxLiabilitiesNet',
       'DeferredIncomeTaxesAndTaxCredits', 'Depreciation',
       'EarningsPerShareBasic', 'EarningsPerShareDiluted', 'GrossProfit',
       'Liabilities', 'LiabilitiesAndStockholdersEquity', 'LiabilitiesCurrent',
       'LongTermDebt', 'LongTermDebtCurrent',
       'LongTermDebtMaturitiesRepaymentsOfPrincipalInNextTwelveMonths',
       'LongTermDebtNoncurrent', 'NetCashProvidedByUsedInFinancingActivities',
       'NetCashProvidedByUsedInInvestingActivities',
       'NetCashProvidedByUsedInOperatingActivities', 'NetIncomeLoss

In [6]:
# Per-column non-null counts and dtypes, to gauge how much data is missing
sec_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173062 entries, 0 to 173061
Data columns (total 47 columns):
 #   Column                                                         Non-Null Count   Dtype  
---  ------                                                         --------------   -----  
 0   company_name                                                   173062 non-null  object 
 1   period_end_date                                                173062 non-null  object 
 2   submission_number                                              173062 non-null  object 
 3   central_index_key                                              173062 non-null  object 
 4   ein                                                            173062 non-null  object 
 5   sic                                                            173062 non-null  object 
 6   fiscal_year_end                                                173062 non-null  int64  
 7   fiscal_year                                    

In [24]:
# Names of companies with at least one filing where "Assets" is empty;
# used to check whether they report total assets under a different field
sec_data.loc[sec_data['Assets'].isna(), 'company_name'].unique()


array(['024 PHARMA, INC.', '1 800 FLOWERS COM INC', '12 RETECH CORP', ...,
       'ZYNEX INC', 'ZYNGA INC', 'ZZLL INFORMATION TECHNOLOGY, INC'],
      dtype=object)

In [21]:
# Export every ZYNGA INC filing, ordered by period end date, for manual review
examine_one = (
    sec_data
    .loc[sec_data['company_name'].eq('ZYNGA INC')]
    .sort_values(by='period_end_date')
)
examine_one.to_excel('examine_one_co.xlsx', index=False)

# Finding: ZYNGA keeps re-submitting the annual report for a previous end_date.
# The data is not different, just less complete than what was originally submitted.

In [23]:
# Second company to inspect: export its filings ordered by period end date
examine_two = (
    sec_data
    .loc[sec_data['company_name'].eq('ZZLL INFORMATION TECHNOLOGY, INC')]
    .sort_values(by='period_end_date')
)
examine_two.to_excel('examine_two_co.xlsx', index=False)

# Finding: ZZLL submitted updated information


In [25]:
# Third company to inspect: 12 RETECH CORP, filings ordered by period end date
examine_3 = (
    sec_data
    .loc[sec_data['company_name'].eq('12 RETECH CORP')]
    .sort_values(by='period_end_date')
)
examine_3.to_excel('examine_3_co.xlsx', index=False)

# Finding: the prior period_end is re-submitted the following year,
# but no new information is submitted with it.


In [40]:
#############

In [None]:
# Filter the SEC data to companies with over $100MM in assets,
# because the bankruptcy list only covers companies above this amount

In [32]:

# Keep only filings that report more than $100MM in total assets (rows with a
# missing Assets value fail the comparison and are dropped), restricted to the
# fields selected for the model.  Plain pandas filtering replaces the pandasql
# query, so the whole frame is no longer copied into SQLite and the cell no
# longer depends on variables being resolved through globals().
MIN_ASSETS = 100_000_000  # $100MM threshold matching the bankruptcy target list

MODEL_COLUMNS = [
    'company_name', 'period_end_date', 'date_filed', 'Assets', 'AssetsCurrent',
    'Liabilities', 'LiabilitiesCurrent', 'NetIncomeLoss',
    'LiabilitiesAndStockholdersEquity', 'StockholdersEquity',
    'OperatingIncomeLoss', 'Revenues', 'EarningsPerShareBasic',
    'EarningsPerShareDiluted', 'Division',
]

over_100MM = (
    sec_data
    .loc[sec_data['Assets'] > MIN_ASSETS, MODEL_COLUMNS]
    .reset_index(drop=True)  # fresh 0..n-1 index, as the SQL result had
)


In [33]:
# Rows and columns remaining after the asset filter
over_100MM.shape

(37265, 15)

In [34]:
# Non-null counts per selected field after the asset filter
over_100MM.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37265 entries, 0 to 37264
Data columns (total 15 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   company_name                      37265 non-null  object 
 1   period_end_date                   37265 non-null  object 
 2   date_filed                        37265 non-null  object 
 3   Assets                            37265 non-null  float64
 4   AssetsCurrent                     24474 non-null  float64
 5   Liabilities                       27473 non-null  float64
 6   LiabilitiesCurrent                24444 non-null  float64
 7   NetIncomeLoss                     33473 non-null  float64
 8   LiabilitiesAndStockholdersEquity  34558 non-null  float64
 9   StockholdersEquity                32473 non-null  float64
 10  OperatingIncomeLoss               27249 non-null  float64
 11  Revenues                          16673 non-null  float64
 12  Earn

In [35]:
# Count, for each filing, how many of its fields are not null.
# DataFrame.count(axis=1) is the vectorised equivalent of a row-wise apply.
# An existing value_count column is excluded first, so re-running this cell
# does not count the helper column itself and inflate every total by one.
over_100MM['value_count'] = (
    over_100MM
    .drop(columns='value_count', errors='ignore')
    .count(axis=1)
)
over_100MM.head()

Unnamed: 0,company_name,period_end_date,date_filed,Assets,AssetsCurrent,Liabilities,LiabilitiesCurrent,NetIncomeLoss,LiabilitiesAndStockholdersEquity,StockholdersEquity,OperatingIncomeLoss,Revenues,EarningsPerShareBasic,EarningsPerShareDiluted,Division,value_count
0,1 800 FLOWERS COM INC,2014-06-30 00:00:00.000000,2014-09-12 00:00:00.000000,267569000.0,91818000.0,81451000.0,74307000.0,,267569000.0,183199000.0,23706000.0,,0.24,0.23,Retail Trade,13
1,1 800 FLOWERS COM INC,2014-06-30 00:00:00.000000,2015-09-11 00:00:00.000000,267569000.0,91818000.0,81451000.0,74307000.0,,267569000.0,183228000.0,23706000.0,,0.24,0.23,Retail Trade,13
2,1 800 FLOWERS COM INC,2015-06-30 00:00:00.000000,2015-09-11 00:00:00.000000,501946000.0,159968000.0,291690000.0,123607000.0,,501946000.0,208449000.0,37617000.0,,0.31,0.3,Retail Trade,13
3,1 800 FLOWERS COM INC,2015-06-30 00:00:00.000000,2016-09-16 00:00:00.000000,497073000.0,155095000.0,286817000.0,123607000.0,,497073000.0,208449000.0,37617000.0,,0.31,0.3,Retail Trade,13
4,1 800 FLOWERS COM INC,2016-06-30 00:00:00.000000,2016-09-16 00:00:00.000000,506514000.0,166659000.0,263928000.0,120861000.0,,506514000.0,242586000.0,43282000.0,,0.57,0.55,Retail Trade,13


In [41]:
# Shape check now that the value_count column has been added
over_100MM.shape

(37265, 16)

In [42]:
# Number of distinct companies before de-duplication
over_100MM.company_name.nunique()

5376

In [43]:
# Number of distinct period end dates before de-duplication
over_100MM.period_end_date.nunique()

60

In [38]:
# Order the filings so that, within each (company_name, period_end_date) pair,
# the report with the most populated fields (highest value_count) comes last;
# when reports are equally complete, the one filed later comes last.
over_100MM = over_100MM.sort_values(
    by=['company_name', 'period_end_date', 'value_count', 'date_filed']
)
over_100MM.head(2)

Unnamed: 0,company_name,period_end_date,date_filed,Assets,AssetsCurrent,Liabilities,LiabilitiesCurrent,NetIncomeLoss,LiabilitiesAndStockholdersEquity,StockholdersEquity,OperatingIncomeLoss,Revenues,EarningsPerShareBasic,EarningsPerShareDiluted,Division,value_count
0,1 800 FLOWERS COM INC,2014-06-30 00:00:00.000000,2014-09-12 00:00:00.000000,267569000.0,91818000.0,81451000.0,74307000.0,,267569000.0,183199000.0,23706000.0,,0.24,0.23,Retail Trade,13
1,1 800 FLOWERS COM INC,2014-06-30 00:00:00.000000,2015-09-11 00:00:00.000000,267569000.0,91818000.0,81451000.0,74307000.0,,267569000.0,183228000.0,23706000.0,,0.24,0.23,Retail Trade,13


In [44]:
# Keep exactly one row per company and period_end_date: the filing with the
# most populated fields and, among equally complete filings, the one filed last.
# The sort is done here as well, so the result does not depend on another cell
# having left over_100MM in the right order before this one runs.
DEDUP_KEYS = ['company_name', 'period_end_date']

filtered_over100MM = (
    over_100MM
    .sort_values(by=DEDUP_KEYS + ['value_count', 'date_filed'])
    .groupby(DEDUP_KEYS)
    .tail(1)
)

# Sanity check: no (company, period) pair may appear more than once.
assert not filtered_over100MM.duplicated(subset=DEDUP_KEYS).any(), \
    "duplicate (company_name, period_end_date) rows remain after de-duplication"

In [45]:
# Rows left after keeping one row per (company_name, period_end_date)
filtered_over100MM.shape

(20479, 16)

In [46]:
# Distinct companies after de-duplication
filtered_over100MM.company_name.nunique()

5376

In [47]:
# Distinct period end dates after de-duplication
filtered_over100MM.period_end_date.nunique()

60

In [48]:
# Non-null counts per field in the de-duplicated frame
filtered_over100MM.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20479 entries, 1 to 37262
Data columns (total 16 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   company_name                      20479 non-null  object 
 1   period_end_date                   20479 non-null  object 
 2   date_filed                        20479 non-null  object 
 3   Assets                            20479 non-null  float64
 4   AssetsCurrent                     14205 non-null  float64
 5   Liabilities                       16163 non-null  float64
 6   LiabilitiesCurrent                14194 non-null  float64
 7   NetIncomeLoss                     18571 non-null  float64
 8   LiabilitiesAndStockholdersEquity  20136 non-null  float64
 9   StockholdersEquity                18504 non-null  float64
 10  OperatingIncomeLoss               14959 non-null  float64
 11  Revenues                          9686 non-null   float64
 12  Earn

In [51]:
# Serialize the de-duplicated frame to SEC_filtered_over100MM.pickle
import pickle

with open('SEC_filtered_over100MM.pickle', mode='wb') as pickle_file:
    pickle.dump(filtered_over100MM, pickle_file)