In [None]:
"""
query SEC data cleaned and prepared 
containing companies with assets >= $100MM
as initial target list is bankruptcies for large public 
companies with over this amount of assets

Merge with prepared bankruptcy list

TRAINING AND VALIDATION DATA
"""

In [27]:
import psycopg2 as pg
import pandas as pd
import pandas.io.sql as pd_sql
import numpy as np
import datetime

In [28]:
from pandasql import sqldf
# PandaSQL needs to be able to reference the global variables already defined (namely, `df`)
pysqldf = lambda q: sqldf(q, globals())

In [29]:
# Postgres info to connect

connection_args = {
    'host': 'localhost',  
    'dbname': 'bankruptcy',  
    'port': 5432          
}

connection = pg.connect(**connection_args)
connection.autocommit = True   
                              

In [30]:
# few fields for first initial model
# all 2019 data held out for test set

query = """
SELECT *
FROM sec_prep_data
WHERE  period_end_date < '2019-01-01' 
;
"""

sec_data = pd_sql.read_sql(query, connection)
sec_data.head()


Unnamed: 0,company_name,period_end_date,number_of_quarters,submission_number,central_index_key,ein,sic,fiscal_year_end,fiscal_year,form,...,RepaymentsOfLongTermDebt,Revenues,SalesRevenueGoodsNet,SalesRevenueNet,StockholdersEquity,WorkingCapital,Revenue_any,Industry,div_code,Division
0,1 800 FLOWERS COM INC,2014-06-30 00:00:00.000000,0,0001437749-17-015969,1084869,113117311,5990,630,2017,10-K,...,,,,756345000.0,183228000.0,,756345000.0,"Retail-Retail Stores, NEC",59,Retail Trade
1,1 800 FLOWERS COM INC,2015-06-30 00:00:00.000000,0,0001437749-18-017027,1084869,113117311,5990,630,2018,10-K,...,,,,1121506000.0,208449000.0,,1121506000.0,"Retail-Retail Stores, NEC",59,Retail Trade
2,1 800 FLOWERS COM INC,2016-06-30 00:00:00.000000,0,0001437749-19-018360,1084869,113117311,5990,630,2019,10-K,...,,1173024000.0,,1173024000.0,242586000.0,,1173024000.0,"Retail-Retail Stores, NEC",59,Retail Trade
3,1 800 FLOWERS COM INC,2017-06-30 00:00:00.000000,4,0001437749-19-018360,1084869,113117311,5990,630,2019,10-K,...,,1193625000.0,,1193625000.0,282239000.0,,1193625000.0,"Retail-Retail Stores, NEC",59,Retail Trade
4,1 800 FLOWERS COM INC,2018-06-30 00:00:00.000000,4,0001437749-19-018360,1084869,113117311,5990,630,2019,10-K,...,,1151921000.0,,,314904000.0,,1151921000.0,"Retail-Retail Stores, NEC",59,Retail Trade


In [31]:
sec_data.columns

Index(['company_name', 'period_end_date', 'number_of_quarters',
       'submission_number', 'central_index_key', 'ein', 'sic',
       'fiscal_year_end', 'fiscal_year', 'form', 'date_filed', 'Assets',
       'AssetsCurrent', 'CashAndCashEquivalentsAtCarryingValue',
       'CashAndCashEquivalentsPeriodIncreaseDecrease', 'CommonStockValue',
       'CostsAndExpenses', 'DeferredIncomeTaxExpenseBenefit',
       'DeferredIncomeTaxLiabilities', 'DeferredIncomeTaxLiabilitiesNet',
       'DeferredIncomeTaxesAndTaxCredits', 'Depreciation',
       'EarningsPerShareBasic', 'EarningsPerShareDiluted', 'Goodwill',
       'GrossProfit', 'IncreaseDecreaseInInventories', 'InterestExpense',
       'InventoryNet', 'Liabilities', 'LiabilitiesAndStockholdersEquity',
       'LiabilitiesCurrent', 'LongTermDebt', 'LongTermDebtCurrent',
       'LongTermDebtMaturitiesRepaymentsOfPrincipalInNextTwelveMonths',
       'LongTermDebtNoncurrent', 'NetCashProvidedByUsedInFinancingActivities',
       'NetCashProvidedByUs

In [32]:
sec_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19897 entries, 0 to 19896
Data columns (total 53 columns):
 #   Column                                                         Non-Null Count  Dtype  
---  ------                                                         --------------  -----  
 0   company_name                                                   19897 non-null  object 
 1   period_end_date                                                19897 non-null  object 
 2   number_of_quarters                                             19897 non-null  int64  
 3   submission_number                                              19897 non-null  object 
 4   central_index_key                                              19897 non-null  object 
 5   ein                                                            19897 non-null  object 
 6   sic                                                            19897 non-null  object 
 7   fiscal_year_end                                           

In [33]:
sec_data.shape

(19897, 53)

In [34]:
# add the labels for the target
# sheet LIST contains 2015-2019 bankruptcies where company names where adjusted 
# to be exactly like in SEC filings

brd_list = pd.read_excel('debtor_list_ein_lookup.xlsx', sheet_name='FULL_LIST')
#brd_labels

In [35]:
brd_list['FULL_NAME'] = brd_list['name_in_sec_data'].str.upper()
brd_list['bankruptcy_date'] = brd_list['date_filed']
brd_list.head()

Unnamed: 0,debtor_name,date_filed,chapter_filing,name_in_sec_data,Unnamed: 4,Unnamed: 5,Unnamed: 6,FULL_NAME,bankruptcy_date
0,First Mariner Bancorp,2014-02-10,Chapter 11,First Mariner Bancorp,,,,FIRST MARINER BANCORP,2014-02-10
1,USEC Inc.,2014-03-05,Chapter 11,USEC INC,,,,USEC INC,2014-03-05
2,"Global Geophysical Services, Inc.",2014-03-25,Chapter 11,GLOBAL GEOPHYSICAL SERVICES INC,,,,GLOBAL GEOPHYSICAL SERVICES INC,2014-03-25
3,James River Coal Company,2014-04-07,Chapter 11,"JAMES RIVER GROUP HOLDINGS, LTD.",,,,"JAMES RIVER GROUP HOLDINGS, LTD.",2014-04-07
4,Momentive Performance Materials Inc.,2014-04-13,Chapter 11,Momentive Performance Materials Inc.,,,,MOMENTIVE PERFORMANCE MATERIALS INC.,2014-04-13


In [36]:
brd_list.shape

(201, 9)

In [37]:
brd_list['FULL_NAME'].nunique()

194

In [38]:
brd_labels = brd_list[['FULL_NAME', 'bankruptcy_date']]

In [39]:
add_brd = pd.merge(sec_data, brd_labels, how='left', left_on='company_name', right_on='FULL_NAME')


In [40]:
add_brd.head()

Unnamed: 0,company_name,period_end_date,number_of_quarters,submission_number,central_index_key,ein,sic,fiscal_year_end,fiscal_year,form,...,SalesRevenueGoodsNet,SalesRevenueNet,StockholdersEquity,WorkingCapital,Revenue_any,Industry,div_code,Division,FULL_NAME,bankruptcy_date
0,1 800 FLOWERS COM INC,2014-06-30 00:00:00.000000,0,0001437749-17-015969,1084869,113117311,5990,630,2017,10-K,...,,756345000.0,183228000.0,,756345000.0,"Retail-Retail Stores, NEC",59,Retail Trade,,NaT
1,1 800 FLOWERS COM INC,2015-06-30 00:00:00.000000,0,0001437749-18-017027,1084869,113117311,5990,630,2018,10-K,...,,1121506000.0,208449000.0,,1121506000.0,"Retail-Retail Stores, NEC",59,Retail Trade,,NaT
2,1 800 FLOWERS COM INC,2016-06-30 00:00:00.000000,0,0001437749-19-018360,1084869,113117311,5990,630,2019,10-K,...,,1173024000.0,242586000.0,,1173024000.0,"Retail-Retail Stores, NEC",59,Retail Trade,,NaT
3,1 800 FLOWERS COM INC,2017-06-30 00:00:00.000000,4,0001437749-19-018360,1084869,113117311,5990,630,2019,10-K,...,,1193625000.0,282239000.0,,1193625000.0,"Retail-Retail Stores, NEC",59,Retail Trade,,NaT
4,1 800 FLOWERS COM INC,2018-06-30 00:00:00.000000,4,0001437749-19-018360,1084869,113117311,5990,630,2019,10-K,...,,,314904000.0,,1151921000.0,"Retail-Retail Stores, NEC",59,Retail Trade,,NaT


In [41]:
add_brd.shape

(19919, 55)

In [42]:
add_brd[add_brd['bankruptcy_date'].notnull()].head()

Unnamed: 0,company_name,period_end_date,number_of_quarters,submission_number,central_index_key,ein,sic,fiscal_year_end,fiscal_year,form,...,SalesRevenueGoodsNet,SalesRevenueNet,StockholdersEquity,WorkingCapital,Revenue_any,Industry,div_code,Division,FULL_NAME,bankruptcy_date
26,"21ST CENTURY ONCOLOGY HOLDINGS, INC.",2014-12-31 00:00:00.000000,4,0001047469-16-015101,1503518,261747745,8011,1231,2015,10-K,...,,,-493439000.0,,,Services-Offices & Clinics of Doctors of Medicine,80,Services,"21ST CENTURY ONCOLOGY HOLDINGS, INC.",2017-05-25
27,"21ST CENTURY ONCOLOGY HOLDINGS, INC.",2015-12-31 00:00:00.000000,4,0001047469-16-015101,1503518,261747745,8011,1231,2015,10-K,...,,,-700952000.0,,,Services-Offices & Clinics of Doctors of Medicine,80,Services,"21ST CENTURY ONCOLOGY HOLDINGS, INC.",2017-05-25
70,"AAC HOLDINGS, INC.",2014-12-31 00:00:00.000000,0,0001564590-18-003044,1606180,352496142,8093,1231,2017,10-K,...,,,97474000.0,,,"Services-Specialty Outpatient Facilities, NEC",80,Services,"AAC HOLDINGS, INC.",2020-06-20
71,"AAC HOLDINGS, INC.",2015-12-31 00:00:00.000000,0,0001564590-19-011552,1606180,352496142,8093,1231,2018,10-K,...,,,141654000.0,,,"Services-Specialty Outpatient Facilities, NEC",80,Services,"AAC HOLDINGS, INC.",2020-06-20
72,"AAC HOLDINGS, INC.",2016-12-31 00:00:00.000000,4,0001564590-19-011552,1606180,352496142,8093,1231,2018,10-K,...,,,165106000.0,,,"Services-Specialty Outpatient Facilities, NEC",80,Services,"AAC HOLDINGS, INC.",2020-06-20


In [43]:
#add_brd.info()

In [44]:
#put back to a datetime variable

add_brd['period_end_date'] = pd.to_datetime(add_brd['period_end_date'])


In [45]:
## if bankruptcy date is within 1 year of period_end_date , then target = 1

## add column that calculates difference in 2 dates

add_brd['time_delta'] =  add_brd['bankruptcy_date'] - add_brd['period_end_date'] 
add_brd['days'] = add_brd["time_delta"].dt.days
add_brd.head()

Unnamed: 0,company_name,period_end_date,number_of_quarters,submission_number,central_index_key,ein,sic,fiscal_year_end,fiscal_year,form,...,StockholdersEquity,WorkingCapital,Revenue_any,Industry,div_code,Division,FULL_NAME,bankruptcy_date,time_delta,days
0,1 800 FLOWERS COM INC,2014-06-30,0,0001437749-17-015969,1084869,113117311,5990,630,2017,10-K,...,183228000.0,,756345000.0,"Retail-Retail Stores, NEC",59,Retail Trade,,NaT,NaT,
1,1 800 FLOWERS COM INC,2015-06-30,0,0001437749-18-017027,1084869,113117311,5990,630,2018,10-K,...,208449000.0,,1121506000.0,"Retail-Retail Stores, NEC",59,Retail Trade,,NaT,NaT,
2,1 800 FLOWERS COM INC,2016-06-30,0,0001437749-19-018360,1084869,113117311,5990,630,2019,10-K,...,242586000.0,,1173024000.0,"Retail-Retail Stores, NEC",59,Retail Trade,,NaT,NaT,
3,1 800 FLOWERS COM INC,2017-06-30,4,0001437749-19-018360,1084869,113117311,5990,630,2019,10-K,...,282239000.0,,1193625000.0,"Retail-Retail Stores, NEC",59,Retail Trade,,NaT,NaT,
4,1 800 FLOWERS COM INC,2018-06-30,4,0001437749-19-018360,1084869,113117311,5990,630,2019,10-K,...,314904000.0,,1151921000.0,"Retail-Retail Stores, NEC",59,Retail Trade,,NaT,NaT,


In [46]:
add_brd['target'] = np.where( abs(add_brd['days']) <= 548 , 1, 0)

In [47]:
add_brd[add_brd['bankruptcy_date'].notnull()].head()

Unnamed: 0,company_name,period_end_date,number_of_quarters,submission_number,central_index_key,ein,sic,fiscal_year_end,fiscal_year,form,...,WorkingCapital,Revenue_any,Industry,div_code,Division,FULL_NAME,bankruptcy_date,time_delta,days,target
26,"21ST CENTURY ONCOLOGY HOLDINGS, INC.",2014-12-31,4,0001047469-16-015101,1503518,261747745,8011,1231,2015,10-K,...,,,Services-Offices & Clinics of Doctors of Medicine,80,Services,"21ST CENTURY ONCOLOGY HOLDINGS, INC.",2017-05-25,876 days,876.0,0
27,"21ST CENTURY ONCOLOGY HOLDINGS, INC.",2015-12-31,4,0001047469-16-015101,1503518,261747745,8011,1231,2015,10-K,...,,,Services-Offices & Clinics of Doctors of Medicine,80,Services,"21ST CENTURY ONCOLOGY HOLDINGS, INC.",2017-05-25,511 days,511.0,1
70,"AAC HOLDINGS, INC.",2014-12-31,0,0001564590-18-003044,1606180,352496142,8093,1231,2017,10-K,...,,,"Services-Specialty Outpatient Facilities, NEC",80,Services,"AAC HOLDINGS, INC.",2020-06-20,1998 days,1998.0,0
71,"AAC HOLDINGS, INC.",2015-12-31,0,0001564590-19-011552,1606180,352496142,8093,1231,2018,10-K,...,,,"Services-Specialty Outpatient Facilities, NEC",80,Services,"AAC HOLDINGS, INC.",2020-06-20,1633 days,1633.0,0
72,"AAC HOLDINGS, INC.",2016-12-31,4,0001564590-19-011552,1606180,352496142,8093,1231,2018,10-K,...,,,"Services-Specialty Outpatient Facilities, NEC",80,Services,"AAC HOLDINGS, INC.",2020-06-20,1267 days,1267.0,0


In [48]:
add_brd[add_brd['target'] == 1].head()

Unnamed: 0,company_name,period_end_date,number_of_quarters,submission_number,central_index_key,ein,sic,fiscal_year_end,fiscal_year,form,...,WorkingCapital,Revenue_any,Industry,div_code,Division,FULL_NAME,bankruptcy_date,time_delta,days,target
27,"21ST CENTURY ONCOLOGY HOLDINGS, INC.",2015-12-31,4,0001047469-16-015101,1503518,261747745,8011,1231,2015,10-K,...,,,Services-Offices & Clinics of Doctors of Medicine,80,Services,"21ST CENTURY ONCOLOGY HOLDINGS, INC.",2017-05-25,511 days,511.0,1
74,"AAC HOLDINGS, INC.",2018-12-31,4,0001564590-19-011552,1606180,352496142,8093,1231,2018,10-K,...,,,"Services-Specialty Outpatient Facilities, NEC",80,Services,"AAC HOLDINGS, INC.",2020-06-20,537 days,537.0,1
209,ACETO CORP,2018-06-30,4,0001144204-18-051414,2034,111720520,5122,630,2018,10-K,...,,,"Wholesale-Drugs, Proprietaries & Druggists' Su...",51,Wholesale Trade,ACETO CORP,2019-02-19,234 days,234.0,1
293,ADEPTUS HEALTH INC.,2015-12-31,4,0001558370-16-003672,1602367,465037387,8060,1231,2015,10-K,...,,364687000.0,Services-Hospitals,80,Services,ADEPTUS HEALTH INC.,2017-04-19,475 days,475.0,1
403,AEROPOSTALE INC,2015-01-31,4,0001168213-16-000111,1168213,311443880,5600,131,2015,10-K,...,,1838663000.0,Retail-Apparel & Accessory Stores,56,Retail Trade,AEROPOSTALE INC,2016-05-04,459 days,459.0,1


In [49]:
add_brd.target.value_counts()

0    19681
1      238
Name: target, dtype: int64

In [50]:
add_brd.FULL_NAME.nunique()

184

In [51]:
add_brd.company_name.nunique()

5373

In [52]:
## examine further

add_brd.to_excel('review_training_df.xlsx', index=False)

In [53]:
# pickle the model_data df
import pickle

with open('training_data.pickle', 'wb') as to_write:
    pickle.dump(add_brd, to_write)