In [None]:
"""
query SEC data cleaned and prepared for 2019 and saved in postgres db

Merge with prepared bankruptcy list

TEST HOLD OUT DATA
"""

In [60]:
import psycopg2 as pg
import pandas as pd
import pandas.io.sql as pd_sql
import numpy as np
import datetime

In [61]:
from pandasql import sqldf
# PandaSQL needs to be able to reference the global variables already defined (namely, `df`)
pysqldf = lambda q: sqldf(q, globals())

In [62]:
# Postgres info to connect

connection_args = {
    'host': 'localhost',  
    'dbname': 'bankruptcy',  
    'port': 5432          
}

connection = pg.connect(**connection_args)
connection.autocommit = True   
                              

In [63]:
# few fields for first initial model
# all 2019 data held out for test set

query = """
SELECT *
FROM sec_prep_2019
;
"""

sec_data = pd_sql.read_sql(query, connection)
sec_data.head()


Unnamed: 0,company_name,period_end_date,number_of_quarters,submission_number,central_index_key,ein,sic,fiscal_year_end,fiscal_year,form,...,InterestExpense,InventoryNet,SalesRevenueGoodsNet,WorkingCapital,Revenue_any,SIC Code,Industry,Unnamed: 2,div_code,Division
0,1 800 FLOWERS COM INC,2019-06-30,4,0001437749-19-018360,1084869,113117311,5990,630.0,2019,10-K,...,,,,,,5990,"Retail-Retail Stores, NEC",,59,Retail Trade
1,"10X GENOMICS, INC.",2019-12-31,4,0001193125-20-052640,1770787,455614458,3826,,2019,10-K,...,2099667.0,11920000.0,,,154430300.0,3826,Laboratory Analytical Instruments,,38,Manufacturing
2,"1347 PROPERTY INSURANCE HOLDINGS, INC.",2019-12-31,4,0001493152-20-005206,1591890,461119100,6331,,2019,10-K,...,,,,,,6331,"Fire, Marine & Casualty Insurance",,63,"Finance, Insurance and Real Estate"
3,"1895 BANCORP OF WISCONSIN, INC.",2019-12-31,4,0001564590-20-014188,1751692,0,6036,,2019,10-K,...,4583000.0,,,,,6036,"Savings Institutions, Not Federally Chartered",,60,"Finance, Insurance and Real Estate"
4,1LIFE HEALTHCARE INC,2019-12-31,4,0001564590-20-013666,1404123,760707204,8011,,2019,10-K,...,704000.0,3521500.0,,,,8011,Services-Offices & Clinics of Doctors of Medicine,,80,Services


In [64]:
sec_data.columns

Index(['company_name', 'period_end_date', 'number_of_quarters',
       'submission_number', 'central_index_key', 'ein', 'sic',
       'fiscal_year_end', 'fiscal_year', 'form', 'date_filed', 'Assets',
       'AssetsCurrent', 'CashAndCashEquivalentsAtCarryingValue',
       'CashAndCashEquivalentsPeriodIncreaseDecrease', 'CommonStockValue',
       'CostsAndExpenses', 'CurrentAsset', 'DeferredIncomeTaxExpenseBenefit',
       'DeferredIncomeTaxLiabilities', 'DeferredIncomeTaxLiabilitiesNet',
       'DeferredIncomeTaxesAndTaxCredits', 'Depreciation',
       'EarningsPerShareBasic', 'EarningsPerShareDiluted', 'GrossProfit',
       'Liabilities', 'LiabilitiesAndStockholdersEquity', 'LiabilitiesCurrent',
       'LongTermDebt', 'LongTermDebtCurrent',
       'LongTermDebtMaturitiesRepaymentsOfPrincipalInNextTwelveMonths',
       'LongTermDebtNoncurrent', 'NetCashProvidedByUsedInFinancingActivities',
       'NetCashProvidedByUsedInInvestingActivities',
       'NetCashProvidedByUsedInOperatingActiv

In [65]:
sec_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3485 entries, 0 to 3484
Data columns (total 56 columns):
 #   Column                                                         Non-Null Count  Dtype         
---  ------                                                         --------------  -----         
 0   company_name                                                   3485 non-null   object        
 1   period_end_date                                                3485 non-null   datetime64[ns]
 2   number_of_quarters                                             3485 non-null   int64         
 3   submission_number                                              3485 non-null   object        
 4   central_index_key                                              3485 non-null   object        
 5   ein                                                            3485 non-null   object        
 6   sic                                                            3485 non-null   object        
 7

In [66]:
sec_data.shape

(3485, 56)

In [67]:
# add the labels for the target
# sheet LIST contains 2015-2019 bankruptcies where company names where adjusted 
# to be exactly like in SEC filings

brd_list = pd.read_excel('debtor_list_ein_lookup.xlsx', sheet_name='FULL_LIST')
#brd_labels

In [68]:
brd_list['FULL_NAME'] = brd_list['name_in_sec_data'].str.upper()
brd_list['bankruptcy_date'] = brd_list['date_filed']
brd_list.head()

Unnamed: 0,debtor_name,date_filed,chapter_filing,name_in_sec_data,Unnamed: 4,Unnamed: 5,Unnamed: 6,FULL_NAME,bankruptcy_date
0,First Mariner Bancorp,2014-02-10,Chapter 11,First Mariner Bancorp,,,,FIRST MARINER BANCORP,2014-02-10
1,USEC Inc.,2014-03-05,Chapter 11,USEC INC,,,,USEC INC,2014-03-05
2,"Global Geophysical Services, Inc.",2014-03-25,Chapter 11,GLOBAL GEOPHYSICAL SERVICES INC,,,,GLOBAL GEOPHYSICAL SERVICES INC,2014-03-25
3,James River Coal Company,2014-04-07,Chapter 11,"JAMES RIVER GROUP HOLDINGS, LTD.",,,,"JAMES RIVER GROUP HOLDINGS, LTD.",2014-04-07
4,Momentive Performance Materials Inc.,2014-04-13,Chapter 11,Momentive Performance Materials Inc.,,,,MOMENTIVE PERFORMANCE MATERIALS INC.,2014-04-13


In [69]:
brd_list.shape

(201, 9)

In [70]:
brd_list['FULL_NAME'].nunique()

194

In [None]:
# Chaparral Energy filed in 2016 and 2020
# Ultra Petroleum Corp filed in 2016 and 2020
# American Apparel
# Geophysical
# Halcon Resources
# Paragon Offshore

In [71]:
brd_labels = brd_list[['FULL_NAME', 'bankruptcy_date']]

In [72]:
add_brd = pd.merge(sec_data, brd_labels, how='left', left_on='company_name', right_on='FULL_NAME')


In [73]:
add_brd.head()

Unnamed: 0,company_name,period_end_date,number_of_quarters,submission_number,central_index_key,ein,sic,fiscal_year_end,fiscal_year,form,...,SalesRevenueGoodsNet,WorkingCapital,Revenue_any,SIC Code,Industry,Unnamed: 2,div_code,Division,FULL_NAME,bankruptcy_date
0,1 800 FLOWERS COM INC,2019-06-30,4,0001437749-19-018360,1084869,113117311,5990,630.0,2019,10-K,...,,,,5990,"Retail-Retail Stores, NEC",,59,Retail Trade,,NaT
1,"10X GENOMICS, INC.",2019-12-31,4,0001193125-20-052640,1770787,455614458,3826,,2019,10-K,...,,,154430300.0,3826,Laboratory Analytical Instruments,,38,Manufacturing,,NaT
2,"1347 PROPERTY INSURANCE HOLDINGS, INC.",2019-12-31,4,0001493152-20-005206,1591890,461119100,6331,,2019,10-K,...,,,,6331,"Fire, Marine & Casualty Insurance",,63,"Finance, Insurance and Real Estate",,NaT
3,"1895 BANCORP OF WISCONSIN, INC.",2019-12-31,4,0001564590-20-014188,1751692,0,6036,,2019,10-K,...,,,,6036,"Savings Institutions, Not Federally Chartered",,60,"Finance, Insurance and Real Estate",,NaT
4,1LIFE HEALTHCARE INC,2019-12-31,4,0001564590-20-013666,1404123,760707204,8011,,2019,10-K,...,,,,8011,Services-Offices & Clinics of Doctors of Medicine,,80,Services,,NaT


In [74]:
add_brd.shape

(3487, 58)

In [75]:
add_brd[add_brd['bankruptcy_date'].notnull()].head()

Unnamed: 0,company_name,period_end_date,number_of_quarters,submission_number,central_index_key,ein,sic,fiscal_year_end,fiscal_year,form,...,SalesRevenueGoodsNet,WorkingCapital,Revenue_any,SIC Code,Industry,Unnamed: 2,div_code,Division,FULL_NAME,bankruptcy_date
104,AKORN INC,2019-12-31,4,0001628280-20-002314,3116,720717400,2834,,2019,10-K,...,,-505500000.0,,2834,Pharmaceutical Preparations,,28,Manufacturing,AKORN INC,2020-05-20
147,"ALTA MESA RESOURCES, INC. /DE",2019-12-31,4,0001690769-20-000032,1690769,814433840,1311,,2019,10-K,...,,,373538800.0,1311,Crude Petroleum & Natural Gas,,13,Mining,"ALTA MESA RESOURCES, INC. /DE",2019-09-11
259,ARCH COAL INC,2019-12-31,4,0001628280-20-001344,1037676,430921172,1221,,2019,10-K,...,,,,1221,Bituminous Coal & Lignite Surface Mining,,12,Mining,ARCH COAL INC,2016-01-11
287,"ASCENA RETAIL GROUP, INC.",2019-07-31,4,0001498301-19-000092,1498301,300641353,5600,,2019,10-K,...,,,5559033000.0,5600,Retail-Apparel & Accessory Stores,,56,Retail Trade,"ASCENA RETAIL GROUP, INC.",2020-07-23
461,"BONANZA CREEK ENERGY, INC.",2019-12-31,4,0001509589-20-000011,1509589,611630631,1311,,2019,10-K,...,,,,1311,Crude Petroleum & Natural Gas,,13,Mining,"BONANZA CREEK ENERGY, INC.",2017-01-04


In [50]:
#add_brd.info()

In [76]:
#put back to a datetime variable

add_brd['period_end_date'] = pd.to_datetime(add_brd['period_end_date'])


In [77]:
## if bankruptcy date is within 1 year of period_end_date , then target = 1

## add column that calculates difference in 2 dates

add_brd['time_delta'] =  add_brd['bankruptcy_date'] - add_brd['period_end_date'] 
add_brd['days'] = add_brd["time_delta"].dt.days
add_brd.head()

Unnamed: 0,company_name,period_end_date,number_of_quarters,submission_number,central_index_key,ein,sic,fiscal_year_end,fiscal_year,form,...,Revenue_any,SIC Code,Industry,Unnamed: 2,div_code,Division,FULL_NAME,bankruptcy_date,time_delta,days
0,1 800 FLOWERS COM INC,2019-06-30,4,0001437749-19-018360,1084869,113117311,5990,630.0,2019,10-K,...,,5990,"Retail-Retail Stores, NEC",,59,Retail Trade,,NaT,NaT,
1,"10X GENOMICS, INC.",2019-12-31,4,0001193125-20-052640,1770787,455614458,3826,,2019,10-K,...,154430300.0,3826,Laboratory Analytical Instruments,,38,Manufacturing,,NaT,NaT,
2,"1347 PROPERTY INSURANCE HOLDINGS, INC.",2019-12-31,4,0001493152-20-005206,1591890,461119100,6331,,2019,10-K,...,,6331,"Fire, Marine & Casualty Insurance",,63,"Finance, Insurance and Real Estate",,NaT,NaT,
3,"1895 BANCORP OF WISCONSIN, INC.",2019-12-31,4,0001564590-20-014188,1751692,0,6036,,2019,10-K,...,,6036,"Savings Institutions, Not Federally Chartered",,60,"Finance, Insurance and Real Estate",,NaT,NaT,
4,1LIFE HEALTHCARE INC,2019-12-31,4,0001564590-20-013666,1404123,760707204,8011,,2019,10-K,...,,8011,Services-Offices & Clinics of Doctors of Medicine,,80,Services,,NaT,NaT,


In [82]:
#add_brd['target'] = np.where((add_brd['days'] >= 0) & (add_brd['days'] <=365) , 1, 0)
add_brd['target'] = np.where(add_brd['bankruptcy_date'] >= datetime.datetime(2020,1,1), 1, 0)

In [83]:
add_brd[add_brd['bankruptcy_date'].notnull()].head()

Unnamed: 0,company_name,period_end_date,number_of_quarters,submission_number,central_index_key,ein,sic,fiscal_year_end,fiscal_year,form,...,SIC Code,Industry,Unnamed: 2,div_code,Division,FULL_NAME,bankruptcy_date,time_delta,days,target
104,AKORN INC,2019-12-31,4,0001628280-20-002314,3116,720717400,2834,,2019,10-K,...,2834,Pharmaceutical Preparations,,28,Manufacturing,AKORN INC,2020-05-20,141 days,141.0,1
147,"ALTA MESA RESOURCES, INC. /DE",2019-12-31,4,0001690769-20-000032,1690769,814433840,1311,,2019,10-K,...,1311,Crude Petroleum & Natural Gas,,13,Mining,"ALTA MESA RESOURCES, INC. /DE",2019-09-11,-111 days,-111.0,0
259,ARCH COAL INC,2019-12-31,4,0001628280-20-001344,1037676,430921172,1221,,2019,10-K,...,1221,Bituminous Coal & Lignite Surface Mining,,12,Mining,ARCH COAL INC,2016-01-11,-1450 days,-1450.0,0
287,"ASCENA RETAIL GROUP, INC.",2019-07-31,4,0001498301-19-000092,1498301,300641353,5600,,2019,10-K,...,5600,Retail-Apparel & Accessory Stores,,56,Retail Trade,"ASCENA RETAIL GROUP, INC.",2020-07-23,358 days,358.0,1
461,"BONANZA CREEK ENERGY, INC.",2019-12-31,4,0001509589-20-000011,1509589,611630631,1311,,2019,10-K,...,1311,Crude Petroleum & Natural Gas,,13,Mining,"BONANZA CREEK ENERGY, INC.",2017-01-04,-1091 days,-1091.0,0


In [84]:
add_brd[add_brd['target'] == 1].head()

Unnamed: 0,company_name,period_end_date,number_of_quarters,submission_number,central_index_key,ein,sic,fiscal_year_end,fiscal_year,form,...,SIC Code,Industry,Unnamed: 2,div_code,Division,FULL_NAME,bankruptcy_date,time_delta,days,target
104,AKORN INC,2019-12-31,4,0001628280-20-002314,3116,720717400,2834,,2019,10-K,...,2834,Pharmaceutical Preparations,,28,Manufacturing,AKORN INC,2020-05-20,141 days,141.0,1
287,"ASCENA RETAIL GROUP, INC.",2019-07-31,4,0001498301-19-000092,1498301,300641353,5600,,2019,10-K,...,5600,Retail-Apparel & Accessory Stores,,56,Retail Trade,"ASCENA RETAIL GROUP, INC.",2020-07-23,358 days,358.0,1
482,BRIGGS & STRATTON CORP,2019-06-30,4,0000014195-19-000027,14195,390182330,3510,630.0,2019,10-K,...,3510,Engines & Turbines,,35,Manufacturing,BRIGGS & STRATTON CORP,2020-07-20,386 days,386.0,1
537,CALIFORNIA RESOURCES CORP,2019-12-31,4,0001609253-20-000066,1609253,465670947,1311,,2019,10-K,...,1311,Crude Petroleum & Natural Gas,,13,Mining,CALIFORNIA RESOURCES CORP,2020-07-15,197 days,197.0,1
565,CARBO CERAMICS INC,2019-12-31,4,0001564590-20-025870,1009672,721100013,3290,,2019,10-K,...,3290,"Abrasive, Asbestos & Misc Nonmetallic Mineral ...",,32,Manufacturing,CARBO CERAMICS INC,2020-03-29,89 days,89.0,1


In [85]:
add_brd.target.value_counts()

0    3439
1      48
Name: target, dtype: int64

In [None]:
# Searched for filings in SEC EDGAR
# Three companies did not send a 2019 report to SEC

# Neiman Marcus
# Centric Brands
# AAC Holdings, INC

In [86]:
add_brd.company_name.nunique()

3484

In [58]:
## examine further

add_brd.to_excel('review_test_df.xlsx', index=False)

In [87]:
# pickle the model_data df
import pickle

with open('test_data.pickle', 'wb') as to_write:
    pickle.dump(add_brd, to_write)