In [None]:
"""
Query the cleaned and prepared SEC data, which contains companies with
assets >= $100MM. That threshold is used because the initial target list
covers bankruptcies of large public companies with more than this amount
of assets.

Merge the SEC data with the prepared bankruptcy list.

TRAINING AND VALIDATION DATA
"""

In [1]:
import psycopg2 as pg
import pandas as pd
import pandas.io.sql as pd_sql
import numpy as np
import datetime

In [2]:
from pandasql import sqldf


def pysqldf(q):
    """Run the SQL text ``q`` through pandasql.

    ``globals()`` is handed to ``sqldf`` so the query can reference the
    variables already defined at notebook level by name.
    """
    return sqldf(q, globals())

In [3]:
# Postgres connection settings (database 'bankruptcy' on localhost)
connection_args = dict(host='localhost', dbname='bankruptcy', port=5432)

# Open the session and switch autocommit on for it
connection = pg.connect(**connection_args)
connection.autocommit = True
                              

In [4]:
# Pull every column of sec_wide_table for periods ending before 2019-01-01.
# All 2019 data is held out for the test set, so it is excluded here.

query = """
SELECT *
FROM sec_wide_table
WHERE  period_end_date < '2019-01-01' 
;
"""

sec_data = pd.read_sql(sql=query, con=connection)
sec_data.head()


Unnamed: 0,company_name,period_end_date,number_of_quarters,submission_number,central_index_key,ein,sic,fiscal_year_end,fiscal_year,form,...,SalesRevenueNet,StockholdersEquity,WorkingCapital,FY_end,FY_end_day,FY_end_month,keep_row,Industry,div_code,Division
0,1 800 FLOWERS COM INC,2014-06-30 00:00:00.000000,0,0001437749-17-015969,1084869,113117311,5990,630,2017,10-K,...,756345000.0,183228000.0,,630,30,6,1,"Retail-Retail Stores, NEC",59,Retail Trade
1,1 800 FLOWERS COM INC,2015-06-30 00:00:00.000000,0,0001437749-18-017027,1084869,113117311,5990,630,2018,10-K,...,1121506000.0,208449000.0,,630,30,6,1,"Retail-Retail Stores, NEC",59,Retail Trade
2,1 800 FLOWERS COM INC,2016-06-30 00:00:00.000000,0,0001437749-19-018360,1084869,113117311,5990,630,2019,10-K,...,1173024000.0,242586000.0,,630,30,6,1,"Retail-Retail Stores, NEC",59,Retail Trade
3,1 800 FLOWERS COM INC,2017-06-30 00:00:00.000000,4,0001437749-19-018360,1084869,113117311,5990,630,2019,10-K,...,1193625000.0,282239000.0,,630,30,6,1,"Retail-Retail Stores, NEC",59,Retail Trade
4,1 800 FLOWERS COM INC,2018-06-30 00:00:00.000000,4,0001437749-19-018360,1084869,113117311,5990,630,2019,10-K,...,,314904000.0,,630,30,6,1,"Retail-Retail Stores, NEC",59,Retail Trade


In [5]:
# Show the column labels returned by the query
sec_data.columns

Index(['company_name', 'period_end_date', 'number_of_quarters',
       'submission_number', 'central_index_key', 'ein', 'sic',
       'fiscal_year_end', 'fiscal_year', 'form', 'date_filed', 'Assets',
       'AssetsCurrent', 'CashAndCashEquivalentsAtCarryingValue',
       'CashAndCashEquivalentsPeriodIncreaseDecrease', 'CommonStockValue',
       'CostsAndExpenses', 'DeferredIncomeTaxExpenseBenefit',
       'DeferredIncomeTaxLiabilities', 'DeferredIncomeTaxLiabilitiesNet',
       'DeferredIncomeTaxesAndTaxCredits', 'Depreciation',
       'EarningsPerShareBasic', 'EarningsPerShareDiluted', 'Goodwill',
       'GrossProfit', 'IncreaseDecreaseInInventories', 'InterestExpense',
       'InventoryNet', 'Liabilities', 'LiabilitiesAndStockholdersEquity',
       'LiabilitiesCurrent', 'LongTermDebt', 'LongTermDebtCurrent',
       'LongTermDebtMaturitiesRepaymentsOfPrincipalInNextTwelveMonths',
       'LongTermDebtNoncurrent', 'NetCashProvidedByUsedInFinancingActivities',
       'NetCashProvidedByUs

In [6]:
# Per-column dtype and non-null counts for the queried frame
sec_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19897 entries, 0 to 19896
Data columns (total 56 columns):
 #   Column                                                         Non-Null Count  Dtype  
---  ------                                                         --------------  -----  
 0   company_name                                                   19897 non-null  object 
 1   period_end_date                                                19897 non-null  object 
 2   number_of_quarters                                             19897 non-null  int64  
 3   submission_number                                              19897 non-null  object 
 4   central_index_key                                              19897 non-null  object 
 5   ein                                                            19897 non-null  object 
 6   sic                                                            19897 non-null  object 
 7   fiscal_year_end                                           

In [7]:
# (rows, columns) of the queried frame; the row count is the baseline to compare after the merge
sec_data.shape

(19897, 56)

In [8]:
# Load the bankruptcy list used to label the target.
# Sheet 'LIST' contains the 2015-2019 bankruptcies, with company names that were
# adjusted to be exactly like those in the SEC filings.

brd_list = pd.read_excel(io='debtor_list_ein_lookup.xlsx', sheet_name='LIST')

In [9]:
# Derive the two label columns in one step:
#   FULL_NAME       - upper-cased copy of name_in_sec_data, used as the join key
#   bankruptcy_date - copy of date_filed under a name that will not clash with
#                     the SEC frame's own date_filed column
brd_list = brd_list.assign(
    FULL_NAME=lambda frame: frame['name_in_sec_data'].str.upper(),
    bankruptcy_date=lambda frame: frame['date_filed'],
)
brd_list.head()

Unnamed: 0,debtor_name,date_filed,chapter_filing,ein,name_in_sec_data,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,FULL_NAME,bankruptcy_date
0,"21st Century Oncology Holdings, Inc.",2017-05-25,Chapter 11,261747745.0,"21st Century Oncology Holdings, Inc.",,,,,"21ST CENTURY ONCOLOGY HOLDINGS, INC.",2017-05-25
1,Aceto Corporation,2019-02-19,Chapter 11,111720520.0,ACETO CORP,,,,,ACETO CORP,2019-02-19
2,Adeptus Health Inc.,2017-04-19,Chapter 11,465037387.0,Adeptus Health Inc.,,,,,ADEPTUS HEALTH INC.,2017-04-19
3,"Aeropostale, Inc.",2016-05-04,Chapter 11,311443880.0,AEROPOSTALE INC,,,,,AEROPOSTALE INC,2016-05-04
4,Allied Nevada Gold Corp.,2015-03-10,Chapter 11,0.0,Allied Nevada Gold Corp.,,,,,ALLIED NEVADA GOLD CORP.,2015-03-10


In [10]:
# (rows, columns) of the bankruptcy list
brd_list.shape

(132, 11)

In [11]:
# Number of distinct join keys; if it equals the row count above, no name is repeated
brd_list['FULL_NAME'].nunique()

132

In [12]:
# Keep only the join key and the bankruptcy date for the merge
label_columns = ['FULL_NAME', 'bankruptcy_date']
brd_labels = brd_list.loc[:, label_columns]

In [13]:
# Labelling idea: if an SEC row's company_name appears in the FULL_NAME list, target = 1; otherwise target = 0

In [14]:
# Left-join the bankruptcy labels onto every SEC row by exact company name.
# SEC rows whose company is not on the bankruptcy list are kept, with missing
# values in FULL_NAME and bankruptcy_date.
# validate='many_to_one' makes pandas raise a MergeError if FULL_NAME is ever
# duplicated in brd_labels; without it, a duplicate name would silently
# duplicate the matching SEC rows and inflate the training data.
add_brd = pd.merge(
    sec_data,
    brd_labels,
    how='left',
    left_on='company_name',
    right_on='FULL_NAME',
    validate='many_to_one',
)


In [15]:
# Preview the merged frame; FULL_NAME and bankruptcy_date are the appended columns
add_brd.head()

Unnamed: 0,company_name,period_end_date,number_of_quarters,submission_number,central_index_key,ein,sic,fiscal_year_end,fiscal_year,form,...,WorkingCapital,FY_end,FY_end_day,FY_end_month,keep_row,Industry,div_code,Division,FULL_NAME,bankruptcy_date
0,1 800 FLOWERS COM INC,2014-06-30 00:00:00.000000,0,0001437749-17-015969,1084869,113117311,5990,630,2017,10-K,...,,630,30,6,1,"Retail-Retail Stores, NEC",59,Retail Trade,,NaT
1,1 800 FLOWERS COM INC,2015-06-30 00:00:00.000000,0,0001437749-18-017027,1084869,113117311,5990,630,2018,10-K,...,,630,30,6,1,"Retail-Retail Stores, NEC",59,Retail Trade,,NaT
2,1 800 FLOWERS COM INC,2016-06-30 00:00:00.000000,0,0001437749-19-018360,1084869,113117311,5990,630,2019,10-K,...,,630,30,6,1,"Retail-Retail Stores, NEC",59,Retail Trade,,NaT
3,1 800 FLOWERS COM INC,2017-06-30 00:00:00.000000,4,0001437749-19-018360,1084869,113117311,5990,630,2019,10-K,...,,630,30,6,1,"Retail-Retail Stores, NEC",59,Retail Trade,,NaT
4,1 800 FLOWERS COM INC,2018-06-30 00:00:00.000000,4,0001437749-19-018360,1084869,113117311,5990,630,2019,10-K,...,,630,30,6,1,"Retail-Retail Stores, NEC",59,Retail Trade,,NaT


In [16]:
# Row count should equal sec_data's if the left join did not duplicate any rows
add_brd.shape

(19897, 58)

In [18]:
# Preview only the rows that matched a company on the bankruptcy list
has_bankruptcy = add_brd['bankruptcy_date'].notna()
add_brd.loc[has_bankruptcy].head()

Unnamed: 0,company_name,period_end_date,number_of_quarters,submission_number,central_index_key,ein,sic,fiscal_year_end,fiscal_year,form,...,WorkingCapital,FY_end,FY_end_day,FY_end_month,keep_row,Industry,div_code,Division,FULL_NAME,bankruptcy_date
26,"21ST CENTURY ONCOLOGY HOLDINGS, INC.",2014-12-31 00:00:00.000000,4,0001047469-16-015101,1503518,261747745,8011,1231,2015,10-K,...,,1231,31,12,1,Services-Offices & Clinics of Doctors of Medicine,80,Services,"21ST CENTURY ONCOLOGY HOLDINGS, INC.",2017-05-25
27,"21ST CENTURY ONCOLOGY HOLDINGS, INC.",2015-12-31 00:00:00.000000,4,0001047469-16-015101,1503518,261747745,8011,1231,2015,10-K,...,,1231,31,12,1,Services-Offices & Clinics of Doctors of Medicine,80,Services,"21ST CENTURY ONCOLOGY HOLDINGS, INC.",2017-05-25
205,ACETO CORP,2014-06-30 00:00:00.000000,0,0001144204-17-057835,2034,111720520,5122,630,2017,10-K/A,...,,630,30,6,1,"Wholesale-Drugs, Proprietaries & Druggists' Su...",51,Wholesale Trade,ACETO CORP,2019-02-19
206,ACETO CORP,2015-06-30 00:00:00.000000,0,0001144204-18-051414,2034,111720520,5122,630,2018,10-K,...,,630,30,6,1,"Wholesale-Drugs, Proprietaries & Druggists' Su...",51,Wholesale Trade,ACETO CORP,2019-02-19
207,ACETO CORP,2016-06-30 00:00:00.000000,4,0001144204-18-051414,2034,111720520,5122,630,2018,10-K,...,,630,30,6,1,"Wholesale-Drugs, Proprietaries & Druggists' Su...",51,Wholesale Trade,ACETO CORP,2019-02-19


In [61]:
#add_brd.info()

In [19]:
# Convert period_end_date back to datetime values so it can be subtracted
# from bankruptcy_date in the following cell

add_brd['period_end_date'] = add_brd['period_end_date'].pipe(pd.to_datetime)


In [20]:
## Goal: target = 1 when the bankruptcy date falls within 1 year of period_end_date.

## This cell adds the gap between the two dates:
##   time_delta - bankruptcy_date minus period_end_date, as a timedelta
##   days       - the same gap expressed in whole days (missing when there is no bankruptcy)

add_brd = add_brd.assign(
    time_delta=lambda frame: frame['bankruptcy_date'].sub(frame['period_end_date']),
    days=lambda frame: frame['time_delta'].dt.days,
)
add_brd.head()

Unnamed: 0,company_name,period_end_date,number_of_quarters,submission_number,central_index_key,ein,sic,fiscal_year_end,fiscal_year,form,...,FY_end_day,FY_end_month,keep_row,Industry,div_code,Division,FULL_NAME,bankruptcy_date,time_delta,days
0,1 800 FLOWERS COM INC,2014-06-30,0,0001437749-17-015969,1084869,113117311,5990,630,2017,10-K,...,30,6,1,"Retail-Retail Stores, NEC",59,Retail Trade,,NaT,NaT,
1,1 800 FLOWERS COM INC,2015-06-30,0,0001437749-18-017027,1084869,113117311,5990,630,2018,10-K,...,30,6,1,"Retail-Retail Stores, NEC",59,Retail Trade,,NaT,NaT,
2,1 800 FLOWERS COM INC,2016-06-30,0,0001437749-19-018360,1084869,113117311,5990,630,2019,10-K,...,30,6,1,"Retail-Retail Stores, NEC",59,Retail Trade,,NaT,NaT,
3,1 800 FLOWERS COM INC,2017-06-30,4,0001437749-19-018360,1084869,113117311,5990,630,2019,10-K,...,30,6,1,"Retail-Retail Stores, NEC",59,Retail Trade,,NaT,NaT,
4,1 800 FLOWERS COM INC,2018-06-30,4,0001437749-19-018360,1084869,113117311,5990,630,2019,10-K,...,30,6,1,"Retail-Retail Stores, NEC",59,Retail Trade,,NaT,NaT,


In [21]:
# Label a row 1 when the bankruptcy was filed 0 to 365 days (inclusive) after the
# period end; every other row, including those with no bankruptcy date, gets 0
add_brd['target'] = np.where(add_brd['days'].between(0, 365), 1, 0)



In [22]:
# Re-inspect the matched rows, now with time_delta, days and target attached
matched_rows = add_brd['bankruptcy_date'].notna()
add_brd.loc[matched_rows].head()

Unnamed: 0,company_name,period_end_date,number_of_quarters,submission_number,central_index_key,ein,sic,fiscal_year_end,fiscal_year,form,...,FY_end_month,keep_row,Industry,div_code,Division,FULL_NAME,bankruptcy_date,time_delta,days,target
26,"21ST CENTURY ONCOLOGY HOLDINGS, INC.",2014-12-31,4,0001047469-16-015101,1503518,261747745,8011,1231,2015,10-K,...,12,1,Services-Offices & Clinics of Doctors of Medicine,80,Services,"21ST CENTURY ONCOLOGY HOLDINGS, INC.",2017-05-25,876 days,876.0,0
27,"21ST CENTURY ONCOLOGY HOLDINGS, INC.",2015-12-31,4,0001047469-16-015101,1503518,261747745,8011,1231,2015,10-K,...,12,1,Services-Offices & Clinics of Doctors of Medicine,80,Services,"21ST CENTURY ONCOLOGY HOLDINGS, INC.",2017-05-25,511 days,511.0,0
205,ACETO CORP,2014-06-30,0,0001144204-17-057835,2034,111720520,5122,630,2017,10-K/A,...,6,1,"Wholesale-Drugs, Proprietaries & Druggists' Su...",51,Wholesale Trade,ACETO CORP,2019-02-19,1695 days,1695.0,0
206,ACETO CORP,2015-06-30,0,0001144204-18-051414,2034,111720520,5122,630,2018,10-K,...,6,1,"Wholesale-Drugs, Proprietaries & Druggists' Su...",51,Wholesale Trade,ACETO CORP,2019-02-19,1330 days,1330.0,0
207,ACETO CORP,2016-06-30,4,0001144204-18-051414,2034,111720520,5122,630,2018,10-K,...,6,1,"Wholesale-Drugs, Proprietaries & Druggists' Su...",51,Wholesale Trade,ACETO CORP,2019-02-19,964 days,964.0,0


In [23]:
# Preview the positively labelled rows
is_positive = add_brd['target'].eq(1)
add_brd.loc[is_positive].head()

Unnamed: 0,company_name,period_end_date,number_of_quarters,submission_number,central_index_key,ein,sic,fiscal_year_end,fiscal_year,form,...,FY_end_month,keep_row,Industry,div_code,Division,FULL_NAME,bankruptcy_date,time_delta,days,target
209,ACETO CORP,2018-06-30,4,0001144204-18-051414,2034,111720520,5122,630,2018,10-K,...,6,1,"Wholesale-Drugs, Proprietaries & Druggists' Su...",51,Wholesale Trade,ACETO CORP,2019-02-19,234 days,234.0,1
405,AEROPOSTALE INC,2016-01-31,4,0001168213-16-000111,1168213,311443880,5600,131,2015,10-K,...,1,1,Retail-Apparel & Accessory Stores,56,Retail Trade,AEROPOSTALE INC,2016-05-04,94 days,94.0,1
729,ALLIED NEVADA GOLD CORP.,2014-12-31,4,0001376610-15-000004,1376610,0,1040,1231,2014,10-K,...,12,1,Gold and Silver Ores,10,Mining,ALLIED NEVADA GOLD CORP.,2015-03-10,69 days,69.0,1
779,"ALPHA NATURAL RESOURCES, INC.",2014-12-31,4,0001301063-15-000015,1301063,421638663,1221,1231,2014,10-K,...,12,1,Bituminous Coal & Lignite Surface Mining,12,Mining,"ALPHA NATURAL RESOURCES, INC.",2015-08-03,215 days,215.0,1
794,"ALTA MESA HOLDINGS, LP",2018-12-31,4,0001518403-19-000021,1518403,203565150,1311,1231,2018,10-K,...,12,1,Crude Petroleum & Natural Gas,13,Mining,"ALTA MESA HOLDINGS, LP",2019-09-11,254 days,254.0,1


In [24]:
# Class balance of the label
add_brd['target'].value_counts()

0    19797
1      100
Name: target, dtype: int64

In [25]:
# Distinct bankruptcy-list names that matched at least one SEC row
# (compare with the size of the bankruptcy list to spot unmatched names)
add_brd['FULL_NAME'].nunique()

127

In [26]:
# Distinct companies present in the merged frame
add_brd['company_name'].nunique()

5373

In [27]:
## Write the full labelled frame to a workbook for further manual review

add_brd.to_excel(excel_writer='review_full_df.xlsx', index=False)

In [28]:
# Persist the labelled add_brd frame as the training/validation data set
import pickle

with open('training_data.pickle', 'wb') as pickle_file:
    pickle.Pickler(pickle_file).dump(add_brd)