In [None]:
"""
merge filtered training/validation set of SEC data 
with bankruptcy list for training/validation set

apply logic to label target as where company
that filed for bankruptcy within 1 year of filing
their annual 10-K report with SEC

"""

In [1]:
import psycopg2 as pg
import pandas as pd
import pandas.io.sql as pd_sql
import numpy as np
import datetime
import pickle

In [2]:
# the cleaned and filtered SEC data

with open('SEC_filtered_over100MM.pickle', 'rb') as read_file:
    filtered_over100MM = pickle.load(read_file)
    
filtered_over100MM.head()

Unnamed: 0,company_name,period_end_date,date_filed,Assets,AssetsCurrent,Liabilities,LiabilitiesCurrent,NetIncomeLoss,LiabilitiesAndStockholdersEquity,StockholdersEquity,OperatingIncomeLoss,Revenues,EarningsPerShareBasic,EarningsPerShareDiluted,Division,value_count
1,1 800 FLOWERS COM INC,2014-06-30 00:00:00.000000,2015-09-11 00:00:00.000000,267569000.0,91818000.0,81451000.0,74307000.0,,267569000.0,183228000.0,23706000.0,,0.24,0.23,Retail Trade,13
3,1 800 FLOWERS COM INC,2015-06-30 00:00:00.000000,2016-09-16 00:00:00.000000,497073000.0,155095000.0,286817000.0,123607000.0,,497073000.0,208449000.0,37617000.0,,0.31,0.3,Retail Trade,13
4,1 800 FLOWERS COM INC,2016-06-30 00:00:00.000000,2016-09-16 00:00:00.000000,506514000.0,166659000.0,263928000.0,120861000.0,,506514000.0,242586000.0,43282000.0,,0.57,0.55,Retail Trade,13
7,1 800 FLOWERS COM INC,2017-06-30 00:00:00.000000,2018-09-14 00:00:00.000000,552470000.0,257402000.0,270231000.0,125175000.0,,552470000.0,,46359000.0,1193625000.0,0.68,0.65,Retail Trade,13
9,1 800 FLOWERS COM INC,2018-06-30 00:00:00.000000,2019-09-13 00:00:00.000000,570889000.0,273021000.0,255985000.0,124799000.0,40791000.0,570889000.0,314904000.0,41048000.0,,0.63,0.61,Retail Trade,14


In [6]:
# add the labels for the target
brd_labels = pd.read_excel('debtor_list_ein_lookup.xlsx')
brd_labels.head()

Unnamed: 0,debtor_name,date_filed,chapter_filing,ein,name_in_sec_data,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8
0,First Mariner Bancorp,2014-02-10,Chapter 11,521834860.0,First Mariner Bancorp,,,,
1,USEC Inc.,2014-03-05,Chapter 11,522107911.0,USEC INC,,,,
2,MModal Inc.,2014-03-20,Chapter 11,,,,,,
3,Dolan Company,2014-03-23,Chapter 11,,,,,,
4,"Global Geophysical Services, Inc.",2014-03-25,Chapter 11,,GLOBAL GEOPHYSICAL SERVICES INC,,,,


In [4]:
mask = (brd_labels['date_filed'] < datetime.datetime(2020, 1, 1) ) & (brd_labels['name_in_sec_data'].notna())

labels_tv = brd_labels[mask]
labels_tv.shape

(151, 9)

In [7]:
labels_tv['FULL_NAME'] = labels_tv.loc[:,'name_in_sec_data'].str.upper()
labels_tv.head()

Unnamed: 0,debtor_name,date_filed,chapter_filing,ein,name_in_sec_data,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,FULL_NAME
0,First Mariner Bancorp,2014-02-10,Chapter 11,521834860.0,First Mariner Bancorp,,,,,FIRST MARINER BANCORP
1,USEC Inc.,2014-03-05,Chapter 11,522107911.0,USEC INC,,,,,USEC INC
4,"Global Geophysical Services, Inc.",2014-03-25,Chapter 11,,GLOBAL GEOPHYSICAL SERVICES INC,,,,,GLOBAL GEOPHYSICAL SERVICES INC
5,James River Coal Company,2014-04-07,Chapter 11,980585280.0,"JAMES RIVER GROUP HOLDINGS, LTD.",,,,,"JAMES RIVER GROUP HOLDINGS, LTD."
7,Momentive Performance Materials Inc.,2014-04-13,Chapter 11,205748297.0,Momentive Performance Materials Inc.,,,,,MOMENTIVE PERFORMANCE MATERIALS INC.


In [8]:
brd_match_list = labels_tv[['date_filed', 'FULL_NAME']]
brd_match_list.head()

Unnamed: 0,date_filed,FULL_NAME
0,2014-02-10,FIRST MARINER BANCORP
1,2014-03-05,USEC INC
4,2014-03-25,GLOBAL GEOPHYSICAL SERVICES INC
5,2014-04-07,"JAMES RIVER GROUP HOLDINGS, LTD."
7,2014-04-13,MOMENTIVE PERFORMANCE MATERIALS INC.


In [9]:
brd_match_list.tail()

Unnamed: 0,date_filed,FULL_NAME
156,2019-10-03,EP ENERGY CORP
158,2019-10-21,NOBILIS HEALTH CORP.
159,2019-11-12,DEAN FOODS CO
160,2019-11-18,APPROACH RESOURCES INC
161,2019-12-27,"MELINTA THERAPEUTICS, INC. /NEW/"


In [10]:
brd_match_list.shape

(151, 2)

In [11]:
brd_match_list['FULL_NAME'].nunique()

143

In [13]:
# which companies are listed more than once
brd_match_list[brd_match_list.duplicated()]

Unnamed: 0,date_filed,FULL_NAME
32,2015-07-15,"WALTER ENERGY, INC."
75,2016-07-27,"ATLAS RESOURCE PARTNERS, L.P."


In [14]:
brd_match_list[brd_match_list.FULL_NAME == 'WALTER ENERGY, INC.']

Unnamed: 0,date_filed,FULL_NAME
29,2015-07-15,"WALTER ENERGY, INC."
32,2015-07-15,"WALTER ENERGY, INC."


In [15]:
brd_match_list[brd_match_list.FULL_NAME == 'ATLAS RESOURCE PARTNERS, L.P.']

Unnamed: 0,date_filed,FULL_NAME
73,2016-07-27,"ATLAS RESOURCE PARTNERS, L.P."
75,2016-07-27,"ATLAS RESOURCE PARTNERS, L.P."


In [17]:
# drop the 2 duplicated rows
brd_match_list.drop_duplicates(inplace=True)
brd_match_list.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  brd_match_list.drop_duplicates(inplace=True)


(149, 2)

In [18]:
# brd list was manually adjusted so that it would match company names as in the SEC data
add_brd = pd.merge(filtered_over100MM, brd_match_list, how='left', left_on='company_name', right_on='FULL_NAME')

In [19]:
add_brd.head()

Unnamed: 0,company_name,period_end_date,date_filed_x,Assets,AssetsCurrent,Liabilities,LiabilitiesCurrent,NetIncomeLoss,LiabilitiesAndStockholdersEquity,StockholdersEquity,OperatingIncomeLoss,Revenues,EarningsPerShareBasic,EarningsPerShareDiluted,Division,value_count,date_filed_y,FULL_NAME
0,1 800 FLOWERS COM INC,2014-06-30 00:00:00.000000,2015-09-11 00:00:00.000000,267569000.0,91818000.0,81451000.0,74307000.0,,267569000.0,183228000.0,23706000.0,,0.24,0.23,Retail Trade,13,NaT,
1,1 800 FLOWERS COM INC,2015-06-30 00:00:00.000000,2016-09-16 00:00:00.000000,497073000.0,155095000.0,286817000.0,123607000.0,,497073000.0,208449000.0,37617000.0,,0.31,0.3,Retail Trade,13,NaT,
2,1 800 FLOWERS COM INC,2016-06-30 00:00:00.000000,2016-09-16 00:00:00.000000,506514000.0,166659000.0,263928000.0,120861000.0,,506514000.0,242586000.0,43282000.0,,0.57,0.55,Retail Trade,13,NaT,
3,1 800 FLOWERS COM INC,2017-06-30 00:00:00.000000,2018-09-14 00:00:00.000000,552470000.0,257402000.0,270231000.0,125175000.0,,552470000.0,,46359000.0,1193625000.0,0.68,0.65,Retail Trade,13,NaT,
4,1 800 FLOWERS COM INC,2018-06-30 00:00:00.000000,2019-09-13 00:00:00.000000,570889000.0,273021000.0,255985000.0,124799000.0,40791000.0,570889000.0,314904000.0,41048000.0,,0.63,0.61,Retail Trade,14,NaT,


In [23]:
# rename a couple columns

add_brd.rename(columns={'date_filed_x':'date_filed_10K', 'date_filed_y': 'date_bankruptcy'}, inplace=True)

In [29]:
add_brd[add_brd['date_bankruptcy'].notnull()].company_name.nunique()

129

In [32]:
type(add_brd['period_end_date'][0])

str

In [33]:
add_brd['period_end_date'] = pd.to_datetime(add_brd['period_end_date'])


In [48]:
## if date_bankruptcy is within 1 year of period_end_date , then target = 1

## add column that calculates difference in 2 dates

add_brd['time_delta'] = add_brd['date_bankruptcy'] - add_brd['period_end_date']

add_brd.head()

Unnamed: 0,company_name,period_end_date,date_filed_10K,Assets,AssetsCurrent,Liabilities,LiabilitiesCurrent,NetIncomeLoss,LiabilitiesAndStockholdersEquity,StockholdersEquity,...,Revenues,EarningsPerShareBasic,EarningsPerShareDiluted,Division,value_count,date_bankruptcy,FULL_NAME,time_delta,days,target
0,1 800 FLOWERS COM INC,2014-06-30,2015-09-11 00:00:00.000000,267569000.0,91818000.0,81451000.0,74307000.0,,267569000.0,183228000.0,...,,0.24,0.23,Retail Trade,13,NaT,,NaT,,0
1,1 800 FLOWERS COM INC,2015-06-30,2016-09-16 00:00:00.000000,497073000.0,155095000.0,286817000.0,123607000.0,,497073000.0,208449000.0,...,,0.31,0.3,Retail Trade,13,NaT,,NaT,,0
2,1 800 FLOWERS COM INC,2016-06-30,2016-09-16 00:00:00.000000,506514000.0,166659000.0,263928000.0,120861000.0,,506514000.0,242586000.0,...,,0.57,0.55,Retail Trade,13,NaT,,NaT,,0
3,1 800 FLOWERS COM INC,2017-06-30,2018-09-14 00:00:00.000000,552470000.0,257402000.0,270231000.0,125175000.0,,552470000.0,,...,1193625000.0,0.68,0.65,Retail Trade,13,NaT,,NaT,,0
4,1 800 FLOWERS COM INC,2018-06-30,2019-09-13 00:00:00.000000,570889000.0,273021000.0,255985000.0,124799000.0,40791000.0,570889000.0,314904000.0,...,,0.63,0.61,Retail Trade,14,NaT,,NaT,,0


In [49]:
add_brd['days'] = add_brd["time_delta"].dt.days

In [50]:
add_brd['target'] = np.where((add_brd['days'] >= 0) & (add_brd['days'] <=365) , 1, 0)

In [51]:
add_brd[add_brd['date_bankruptcy'].notnull()][:5]

Unnamed: 0,company_name,period_end_date,date_filed_10K,Assets,AssetsCurrent,Liabilities,LiabilitiesCurrent,NetIncomeLoss,LiabilitiesAndStockholdersEquity,StockholdersEquity,...,Revenues,EarningsPerShareBasic,EarningsPerShareDiluted,Division,value_count,date_bankruptcy,FULL_NAME,time_delta,days,target
26,"21ST CENTURY ONCOLOGY HOLDINGS, INC.",2014-03-31,2016-08-23 00:00:00.000000,1309599000.0,192214000.0,1397526000.0,221366000.0,-30829000.0,1309599000.0,-148107000.0,...,,,,Services,11,2017-05-25,"21ST CENTURY ONCOLOGY HOLDINGS, INC.",1151 days,1151.0,0
27,"21ST CENTURY ONCOLOGY HOLDINGS, INC.",2014-06-30,2016-08-23 00:00:00.000000,1125807000.0,201092000.0,1423081000.0,234463000.0,-210238000.0,1125807000.0,-359017000.0,...,,,,Services,11,2017-05-25,"21ST CENTURY ONCOLOGY HOLDINGS, INC.",1060 days,1060.0,0
28,"21ST CENTURY ONCOLOGY HOLDINGS, INC.",2014-09-30,2016-08-23 00:00:00.000000,1197942000.0,324449000.0,1282101000.0,231336000.0,-87883000.0,1197942000.0,-449525000.0,...,,,,Services,11,2017-05-25,"21ST CENTURY ONCOLOGY HOLDINGS, INC.",968 days,968.0,0
29,"21ST CENTURY ONCOLOGY HOLDINGS, INC.",2014-12-31,2016-08-23 00:00:00.000000,1153444000.0,254548000.0,1255775000.0,198732000.0,-192816000.0,1153444000.0,-493439000.0,...,,,,Services,11,2017-05-25,"21ST CENTURY ONCOLOGY HOLDINGS, INC.",876 days,876.0,0
30,"21ST CENTURY ONCOLOGY HOLDINGS, INC.",2015-03-31,2016-08-23 00:00:00.000000,1160587000.0,252893000.0,1276980000.0,229141000.0,-16667000.0,1160587000.0,-530007000.0,...,,,,Services,11,2017-05-25,"21ST CENTURY ONCOLOGY HOLDINGS, INC.",786 days,786.0,0


In [59]:
add_brd[add_brd['target'] == 1][:5]

Unnamed: 0,company_name,period_end_date,date_filed_10K,Assets,AssetsCurrent,Liabilities,LiabilitiesCurrent,NetIncomeLoss,LiabilitiesAndStockholdersEquity,StockholdersEquity,...,Revenues,EarningsPerShareBasic,EarningsPerShareDiluted,Division,value_count,date_bankruptcy,FULL_NAME,time_delta,days,target
218,ACETO CORP,2018-06-30,2018-09-28 00:00:00.000000,767024000.0,502627000.0,671739000.0,302518000.0,-175423500.0,767024000.0,95285000.0,...,,-8.98,-4.98,Wholesale Trade,14,2019-02-19,ACETO CORP,234 days,234.0,1
423,AEROPOSTALE INC,2016-01-31,2016-05-20 00:00:00.000000,354383000.0,227406000.0,,175715000.0,-79298000.0,354383000.0,-35646000.0,...,,-0.995,-0.995,Retail Trade,13,2016-05-04,AEROPOSTALE INC,94 days,94.0,1
760,ALLIED NEVADA GOLD CORP.,2014-12-31,2015-03-27 00:00:00.000000,941238000.0,298847000.0,663680000.0,592376000.0,-490072000.0,941238000.0,277558000.0,...,,-4.62,-4.62,Mining,14,2015-03-10,ALLIED NEVADA GOLD CORP.,69 days,69.0,1
810,"ALPHA NATURAL RESOURCES, INC.",2014-12-31,2015-02-26 00:00:00.000000,10736150000.0,1876314000.0,7749348000.0,1009549000.0,,10736150000.0,2986800000.0,...,2678846000.0,-2.25,-2.25,Mining,14,2015-08-03,"ALPHA NATURAL RESOURCES, INC.",215 days,215.0,1
827,"ALTA MESA RESOURCES, INC. /DE",2018-12-31,2019-08-27 00:00:00.000000,1357830000.0,142917000.0,1141703000.0,956383000.0,-1518220000.0,1357830000.0,-29393000.0,...,336466000.0,-8.53,-8.53,Mining,15,2019-09-11,"ALTA MESA RESOURCES, INC. /DE",254 days,254.0,1


In [53]:
add_brd.target.value_counts()

0    20395
1       98
Name: target, dtype: int64

In [55]:
add_brd.columns

Index(['company_name', 'period_end_date', 'date_filed_10K', 'Assets',
       'AssetsCurrent', 'Liabilities', 'LiabilitiesCurrent', 'NetIncomeLoss',
       'LiabilitiesAndStockholdersEquity', 'StockholdersEquity',
       'OperatingIncomeLoss', 'Revenues', 'EarningsPerShareBasic',
       'EarningsPerShareDiluted', 'Division', 'value_count', 'date_bankruptcy',
       'FULL_NAME', 'time_delta', 'days', 'target'],
      dtype='object')

In [66]:
model_data = add_brd[['company_name', 'period_end_date', 'date_bankruptcy', 
                      'days', 'Assets', 'AssetsCurrent',
                      'Liabilities', 'LiabilitiesCurrent', 'NetIncomeLoss', 
                      'LiabilitiesAndStockholdersEquity', 'StockholdersEquity', 
                      'OperatingIncomeLoss', 'Revenues', 'EarningsPerShareBasic', 
                      'EarningsPerShareDiluted', 'Division', 'target']].reset_index(drop=True)
model_data.head()

Unnamed: 0,company_name,period_end_date,date_bankruptcy,days,Assets,AssetsCurrent,Liabilities,LiabilitiesCurrent,NetIncomeLoss,LiabilitiesAndStockholdersEquity,StockholdersEquity,OperatingIncomeLoss,Revenues,EarningsPerShareBasic,EarningsPerShareDiluted,Division,target
0,1 800 FLOWERS COM INC,2014-06-30,NaT,,267569000.0,91818000.0,81451000.0,74307000.0,,267569000.0,183228000.0,23706000.0,,0.24,0.23,Retail Trade,0
1,1 800 FLOWERS COM INC,2015-06-30,NaT,,497073000.0,155095000.0,286817000.0,123607000.0,,497073000.0,208449000.0,37617000.0,,0.31,0.3,Retail Trade,0
2,1 800 FLOWERS COM INC,2016-06-30,NaT,,506514000.0,166659000.0,263928000.0,120861000.0,,506514000.0,242586000.0,43282000.0,,0.57,0.55,Retail Trade,0
3,1 800 FLOWERS COM INC,2017-06-30,NaT,,552470000.0,257402000.0,270231000.0,125175000.0,,552470000.0,,46359000.0,1193625000.0,0.68,0.65,Retail Trade,0
4,1 800 FLOWERS COM INC,2018-06-30,NaT,,570889000.0,273021000.0,255985000.0,124799000.0,40791000.0,570889000.0,314904000.0,41048000.0,,0.63,0.61,Retail Trade,0


In [67]:
model_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20493 entries, 0 to 20492
Data columns (total 17 columns):
 #   Column                            Non-Null Count  Dtype         
---  ------                            --------------  -----         
 0   company_name                      20493 non-null  object        
 1   period_end_date                   20493 non-null  datetime64[ns]
 2   date_bankruptcy                   463 non-null    datetime64[ns]
 3   days                              463 non-null    float64       
 4   Assets                            20493 non-null  float64       
 5   AssetsCurrent                     14219 non-null  float64       
 6   Liabilities                       16170 non-null  float64       
 7   LiabilitiesCurrent                14208 non-null  float64       
 8   NetIncomeLoss                     18584 non-null  float64       
 9   LiabilitiesAndStockholdersEquity  20150 non-null  float64       
 10  StockholdersEquity                18517 non-nu

In [68]:
# pickle the model_data df

with open('model_data.pickle', 'wb') as to_write:
    pickle.dump(model_data, to_write)