In [None]:
"""
Prepare the held-out test data using the same pre-processing that was applied to the training/validation data.
"""

In [49]:
import pickle
import pandas as pd
import numpy as np

In [50]:
from sklearn.preprocessing import OneHotEncoder

In [51]:
### the 2019 test data
# NOTE: pickle.load can execute arbitrary code while unpickling, so only
# open pickle files that come from a trusted source.

with open('test_data.pickle', 'rb') as read_file:
    test_data = pickle.load(read_file)

# preview the first rows of the loaded frame
test_data.head()

Unnamed: 0,company_name,period_end_date,number_of_quarters,submission_number,central_index_key,ein,sic,fiscal_year_end,fiscal_year,form,...,SIC Code,Industry,Unnamed: 2,div_code,Division,FULL_NAME,bankruptcy_date,time_delta,days,target
0,1 800 FLOWERS COM INC,2019-06-30,4,0001437749-19-018360,1084869,113117311,5990,630.0,2019,10-K,...,5990,"Retail-Retail Stores, NEC",,59,Retail Trade,,NaT,NaT,,0
1,"10X GENOMICS, INC.",2019-12-31,4,0001193125-20-052640,1770787,455614458,3826,,2019,10-K,...,3826,Laboratory Analytical Instruments,,38,Manufacturing,,NaT,NaT,,0
2,"1347 PROPERTY INSURANCE HOLDINGS, INC.",2019-12-31,4,0001493152-20-005206,1591890,461119100,6331,,2019,10-K,...,6331,"Fire, Marine & Casualty Insurance",,63,"Finance, Insurance and Real Estate",,NaT,NaT,,0
3,"1895 BANCORP OF WISCONSIN, INC.",2019-12-31,4,0001564590-20-014188,1751692,0,6036,,2019,10-K,...,6036,"Savings Institutions, Not Federally Chartered",,60,"Finance, Insurance and Real Estate",,NaT,NaT,,0
4,1LIFE HEALTHCARE INC,2019-12-31,4,0001564590-20-013666,1404123,760707204,8011,,2019,10-K,...,8011,Services-Offices & Clinics of Doctors of Medicine,,80,Services,,NaT,NaT,,0


In [52]:
# Put large dollar amounts in millions: each listed column is divided by 1e6
# and stored under a new "<name>_MM" column, so the values handed to the model
# are much smaller in magnitude. The raw columns are left untouched.
DOLLARS_PER_MILLION = 1e6

# Raw dollar-denominated columns, in the order their scaled counterparts are
# appended to test_data.
DOLLAR_COLUMNS = [
    'Assets',
    'AssetsCurrent',
    'Liabilities',
    'LiabilitiesCurrent',
    'NetIncomeLoss',
    'StockholdersEquity',
    'OperatingIncomeLoss',
    'Revenues',
    'CashAndCashEquivalentsAtCarryingValue',
    'CommonStockValue',
    'Goodwill',
    'GrossProfit',
    'InterestExpense',
    'Revenue_any',
    'SalesRevenueGoodsNet',
    'SalesRevenueNet',
    'NetCashProvidedByUsedInFinancingActivities',
    'NetCashProvidedByUsedInInvestingActivities',
    'NetCashProvidedByUsedInOperatingActivities',
    'LongTermDebtMaturitiesRepaymentsOfPrincipalInNextTwelveMonths',
    'LongTermDebtNoncurrent',
    'LongTermDebt',
    'RepaymentsOfLongTermDebt',
    'Depreciation',
]

# The scaled liabilities columns are spelled "Libilities". Later cells select
# them by exactly these names, so the spelling is preserved rather than fixed.
MM_NAME_OVERRIDES = {
    'Liabilities': 'Libilities_MM',
    'LiabilitiesCurrent': 'LibilitiesCurrent_MM',
}

for source_col in DOLLAR_COLUMNS:
    scaled_col = MM_NAME_OVERRIDES.get(source_col, source_col + '_MM')
    test_data[scaled_col] = test_data[source_col] / DOLLARS_PER_MILLION

In [53]:
# accounting ratios, computed from the raw (unscaled) statement columns
assets = test_data['Assets']
liabilities = test_data['Liabilities']
equity = test_data['StockholdersEquity']
net_income = test_data['NetIncomeLoss']
gross_profit = test_data['GrossProfit']

# NOTE(review): by the usual textbook definitions Liabilities / Equity is the
# debt-to-equity ratio and Liabilities / Assets is the debt ratio, so these two
# column names look swapped. Left as-is because this notebook is meant to mirror
# the training pre-processing -- confirm before renaming.
test_data['debt_ratio'] = liabilities / equity
test_data['debt_equity_ratio'] = liabilities / assets
test_data['current_ratio'] = test_data['AssetsCurrent'] / test_data['LiabilitiesCurrent']
test_data['leverage'] = assets / equity
test_data['return_on_equity'] = net_income / equity
test_data['return_on_assets'] = net_income / assets

# additional ratios (equity share of assets, gross profit on assets,
# asset coverage of liabilities, gross profit plus depreciation over liabilities)
test_data['ratio_1'] = equity / assets
test_data['ratio_2'] = gross_profit / assets
test_data['ratio_3'] = assets / liabilities
test_data['ratio_4'] = (gross_profit + test_data['Depreciation']) / liabilities

In [54]:
# fill infinity values with NaN
# (a ratio whose denominator is 0 evaluates to +/-inf; the frame is modified in place)

test_data.replace([np.inf, -np.inf], np.nan, inplace=True)

In [55]:
# sanity check: (rows, columns) after adding the scaled and ratio columns
test_data.shape

(3487, 95)

In [56]:
def add_deviation_feature(X, feature, category):
    """Add a within-group z-score of ``feature`` to ``X`` as a new column.

    For every row, the new value is
    ``(X[feature] - group mean) / group standard deviation``, where the group
    is the set of rows sharing the same value of ``category``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame to extend. It is modified in place.
    feature : str
        Name of the numeric column to standardise.
    category : str
        Name of the column whose values define the groups.

    Returns
    -------
    None
        The result is written to ``X[feature + '_Dev_' + category]``.

    Notes
    -----
    The standard deviation is pandas' default sample standard deviation, so a
    group with a single row yields NaN. A group with zero spread divides by
    zero; callers are expected to clean up any resulting inf/NaN values.
    """
    grouped = X.groupby(category)[feature]

    # transform() broadcasts each group's statistic back onto that group's
    # rows, so both results line up with X row by row. The built-in 'mean' and
    # 'std' aggregations avoid calling a Python lambda once per group.
    group_mean = grouped.transform('mean')
    group_std = grouped.transform('std')

    # number of group standard deviations each value lies from its group mean
    X[feature + '_Dev_' + category] = (X[feature] - group_mean) / group_std

In [57]:
# Add a "<feature>_Dev_Division" column for each of these features, i.e. the
# feature expressed relative to the rows that share the same Division value.
for feature_name in (
    'debt_ratio',
    'debt_equity_ratio',
    'current_ratio',
    'leverage',
    'return_on_equity',
    'return_on_assets',
    'ratio_2',
    'EarningsPerShareDiluted',
):
    add_deviation_feature(test_data, feature_name, 'Division')

In [58]:
# fill infinity values with NaN
# (the deviation features divide by a group standard deviation, which can be 0;
#  the frame is modified in place)

test_data.replace([np.inf, -np.inf], np.nan, inplace=True)

In [59]:
## split into 3 parts to handle missing values
## part 1: identifier, metadata and target columns -- carried over as they
## are, with no NaN filling applied

identifier_columns = [
    'company_name', 'FULL_NAME', 'period_end_date', 'submission_number',
    'central_index_key', 'ein', 'sic', 'fiscal_year_end', 'fiscal_year',
    'form', 'date_filed', 'Industry', 'div_code', 'Division',
    'bankruptcy_date', 'days', 'target',
]

test_ready_1 = test_data[identifier_columns]

In [60]:
## RF will not accept NaNs
## part 2: the dollar amounts in millions; every NaN is replaced with the
## sentinel value -999999 (note: a sentinel, not zero)

amount_columns = [
    'Assets_MM', 'AssetsCurrent_MM',
    'Libilities_MM', 'LibilitiesCurrent_MM', 'NetIncomeLoss_MM',
    'StockholdersEquity_MM', 'OperatingIncomeLoss_MM', 'Revenues_MM',
    'CashAndCashEquivalentsAtCarryingValue_MM', 'CommonStockValue_MM',
    'Goodwill_MM', 'GrossProfit_MM', 'InterestExpense_MM', 'Revenue_any_MM',
    'SalesRevenueGoodsNet_MM', 'SalesRevenueNet_MM',
    'NetCashProvidedByUsedInFinancingActivities_MM',
    'NetCashProvidedByUsedInInvestingActivities_MM',
    'NetCashProvidedByUsedInOperatingActivities_MM',
    'LongTermDebtMaturitiesRepaymentsOfPrincipalInNextTwelveMonths_MM',
    'LongTermDebtNoncurrent_MM', 'LongTermDebt_MM',
    'RepaymentsOfLongTermDebt_MM',
]

test_ready_2 = test_data[amount_columns].fillna(-999999)


In [61]:

## part 3: ratios, per-share figures and the per-Division deviation features;
## every NaN is replaced with the sentinel value -999999
ratio_columns = [
    'debt_ratio', 'debt_equity_ratio',
    'current_ratio', 'leverage', 'return_on_equity', 'return_on_assets',
    'ratio_1', 'ratio_2', 'ratio_3', 'ratio_4', 'EarningsPerShareBasic',
    'EarningsPerShareDiluted', 'debt_ratio_Dev_Division',
    'debt_equity_ratio_Dev_Division', 'current_ratio_Dev_Division',
    'leverage_Dev_Division', 'return_on_equity_Dev_Division',
    'return_on_assets_Dev_Division', 'ratio_2_Dev_Division',
    'EarningsPerShareDiluted_Dev_Division',
]

test_ready_3 = test_data[ratio_columns].fillna(-999999)

In [62]:
# put the three parts back together side by side (axis=1 joins columns, aligned on the row index)
test_ready = pd.concat([test_ready_1, test_ready_2, test_ready_3], axis=1)

In [63]:
# sanity check: (rows, columns) of the recombined frame
test_ready.shape

(3487, 60)

In [64]:
# number of distinct company names; compare with the row count to spot repeated names
test_ready.company_name.nunique()

3484

In [65]:
# one hot encoding for the industry division
# NOTE(review): the encoder is fitted on the test data itself, so the output
# columns are whichever divisions occur in this file. If the training data was
# encoded with a different set of divisions the columns will not line up --
# confirm, or reuse the encoder that was fitted on the training data.

div_code = test_ready[['Division']]

# scikit-learn renamed `sparse` to `sparse_output` (1.2) and later removed the
# old keyword; try the new name first and fall back for older installations.
try:
    one = OneHotEncoder(sparse_output=False)
except TypeError:
    one = OneHotEncoder(sparse=False)
cats = one.fit_transform(div_code)

# `get_feature_names` was superseded by `get_feature_names_out` (1.0) and later
# removed; both return names of the form "Division_<category>".
if hasattr(one, 'get_feature_names_out'):
    columns = one.get_feature_names_out(['Division'])
else:
    columns = one.get_feature_names(['Division'])

# keep the original row index so the dummies can be concatenated column-wise
div_code_df = pd.DataFrame(cats, columns=columns, index=div_code.index)

div_code_df.head()

Unnamed: 0,"Division_Agriculture, Forestry and Fishing",Division_Construction,"Division_Finance, Insurance and Real Estate",Division_Manufacturing,Division_Mining,Division_Retail Trade,Division_Services,"Division_Transportation, Communications, Electric, Gas and Sanitary service",Division_Wholesale Trade
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [66]:
# append the one-hot Division columns to the prepared features (column-wise, aligned on the row index)
test_rfm_ready = pd.concat([test_ready, div_code_df], axis=1)
# sanity check: (rows, columns) of the final frame
test_rfm_ready.shape

(3487, 69)

In [67]:
# list the final column names
test_rfm_ready.columns

Index(['company_name', 'FULL_NAME', 'period_end_date', 'submission_number',
       'central_index_key', 'ein', 'sic', 'fiscal_year_end', 'fiscal_year',
       'form', 'date_filed', 'Industry', 'div_code', 'Division',
       'bankruptcy_date', 'days', 'target', 'Assets_MM', 'AssetsCurrent_MM',
       'Libilities_MM', 'LibilitiesCurrent_MM', 'NetIncomeLoss_MM',
       'StockholdersEquity_MM', 'OperatingIncomeLoss_MM', 'Revenues_MM',
       'CashAndCashEquivalentsAtCarryingValue_MM', 'CommonStockValue_MM',
       'Goodwill_MM', 'GrossProfit_MM', 'InterestExpense_MM', 'Revenue_any_MM',
       'SalesRevenueGoodsNet_MM', 'SalesRevenueNet_MM',
       'NetCashProvidedByUsedInFinancingActivities_MM',
       'NetCashProvidedByUsedInInvestingActivities_MM',
       'NetCashProvidedByUsedInOperatingActivities_MM',
       'LongTermDebtMaturitiesRepaymentsOfPrincipalInNextTwelveMonths_MM',
       'LongTermDebtNoncurrent_MM', 'LongTermDebt_MM',
       'RepaymentsOfLongTermDebt_MM', 'debt_ratio', 'debt

In [68]:
# pickle is already imported in the first import cell; this repeat import is harmless
import pickle

# write the prepared test frame to disk as a pickle file in the working directory
with open('test_rfm_ready.pickle', 'wb') as to_write:
    pickle.dump(test_rfm_ready, to_write)