In [1]:
import numpy as np
import pandas as pd
import pickle

In [2]:
# read data
data = pd.read_csv('data.csv')

data.head()

Unnamed: 0,Instrument,Total Debt,Accounts Payable (CF),NAICS Industry Group Name,Inventories (CF),Delisted Quote Flag,Retained Earnings (Accumulated Deficit),Net Debt Incl. Pref.Stock & Min.Interest,Total Current Assets,Total Liabilities,...,Tangible Book Value Per Share,Net Sales,Operating Income,Instrument Is Active Flag,Tangible Book Value - Utility,"Total Assets, Reported",Depreciation And Amortization,Total Current Liabilities,Year,Bankrupt
0,910655,1866000.0,-235000.0,Motor Vehicle Body and Trailer Manufacturing,-9289000.0,1.0,49741000.0,-8580000.0,82689000.0,23250000.0,...,8.43993,360326000.0,40075000.0,0.0,87124000.0,117739000.0,413000.0,19209000.0,2000,1
1,1116521,10000000.0,-47000000.0,Communications Equipment Manufacturing,81000000.0,1.0,,-184000000.0,3043000000.0,2422000000.0,...,5.59129,8268000000.0,369000000.0,0.0,1556000000.0,4239000000.0,,1597000000.0,2000,1
2,895651,377000.0,,Offices of Other Health Practitioners,289000.0,1.0,-18251000.0,-1483000.0,9479000.0,13186000.0,...,0.4067,54971000.0,-1056000.0,0.0,3622000.0,32145000.0,,6303000.0,2000,1
3,1398702,,,Tobacco Manufacturing,,0.0,,,,,...,,,,1.0,,,,,2000,1
4,1434621,,,Nondepository Credit Intermediation,,0.0,,,,,...,,,,1.0,,,,,2000,1


In [3]:
print('Unique CIKs:', len(data.Instrument.unique()))

Unique CIKs: 15221


In [30]:
data.columns

Index(['Instrument', 'Total Debt', 'Accounts Payable (CF)',
       'NAICS Industry Group Name', 'Inventories (CF)', 'Delisted Quote Flag',
       'Retained Earnings (Accumulated Deficit)',
       'Net Debt Incl. Pref.Stock & Min.Interest', 'Total Current Assets',
       'Total Liabilities', 'Company Common Name', 'CIK Number',
       'Total Equity', 'Tangible Book Value - Banks',
       'Tangible Book Value - Insurance',
       'Net Income/Starting Line, Cumulative',
       'Tangible Book Value - Reported', 'Cash and Short Term Investments',
       'Market Value for Company', 'Tangible Book Value Per Share',
       'Net Sales', 'Operating Income', 'Instrument Is Active Flag',
       'Tangible Book Value - Utility', 'Total Assets, Reported',
       'Depreciation And Amortization', 'Total Current Liabilities', 'Year',
       'Bankrupt'],
      dtype='object')

In [34]:
# replace all NAN values with 0 and non-NANs with 1
data2 = data.notnull().astype('int')
data2.Instrument, data2.Year, data2.Bankrupt = data.Instrument, data.Year, data.Bankrupt

data2['index'] = data.index

data2 = data2.groupby(['Instrument', 'Year', 'Bankrupt', 'index']).sum().reset_index()
data2['noNANs'] = data2.sum(axis=1) - data2['Instrument'] - data2['Year'] - data2['Bankrupt'] - data2['index']
data2.Instrument, data2.Year, data2.Bankrupt, data2.index = data.Instrument, data.Year, data.Bankrupt, data.index
data2 = data2[['index', 'Instrument','Year','Bankrupt', 'noNANs']].copy()

data2

Unnamed: 0,index,Instrument,Year,Bankrupt,noNANs
0,25155,910655,2000,1,22
1,41232,1116521,2000,1,22
2,57309,895651,2000,1,22
3,73386,1398702,2000,1,22
4,89463,1434621,2000,1,22
5,105540,1367311,2000,1,22
6,121617,1124608,2000,1,22
7,137694,902281,2000,1,22
8,153771,773318,2000,1,22
9,169848,1379905,2000,1,22


In [None]:
# first I need to remove the duplicate columns for Instrument-Year

# Secondly, There are Bankrupt companies recorded with label 0 (healthy)
# for years before backruptcy which somehow fucked everything up

# So you have to generate Bankrupt and healthy lists and also CIK-year dic
# again and check the results with the files in the backup folders

In [None]:
# check CIK with duble-label
test = data[['Instrument','Bankrupt']].copy()
test2 = test.groupby(['Instrument']).count()

test3 = test2[test2.Bankrupt > 20]

test3.Bankrupt.value_counts()

In [None]:
test2[test2.Bankrupt == 120]

In [8]:
data[(data.Instrument == 1525494) & (data.Year == 2019)]

Unnamed: 0,Instrument,Total Debt,Accounts Payable (CF),NAICS Industry Group Name,Inventories (CF),Delisted Quote Flag,Retained Earnings (Accumulated Deficit),Net Debt Incl. Pref.Stock & Min.Interest,Total Current Assets,Total Liabilities,...,Tangible Book Value Per Share,Net Sales,Operating Income,Instrument Is Active Flag,Tangible Book Value - Utility,"Total Assets, Reported",Depreciation And Amortization,Total Current Liabilities,Year,Bankrupt
333357,1525494,852427000.0,40503000.0,Other Information Services,,0.0,-1689683000.0,-2152171000.0,6381061000.0,8673061000.0,...,8.19758,10068780000.0,1796465000.0,1.0,2415542000.0,28266510000.0,,5600948000.0,2019,0


## Healthy Companies Data

In [None]:
# extract the healthy dataframe
healthy_raw = data[data['Bankrupt'] == 0]

healthy_raw.head()

In [None]:
for col in healthy_raw.columns:
    print(col)

print()
print(len(healthy_raw.columns))

In [None]:
# drop non-quantitative columns
healthy = healthy_raw.drop(columns=['NAICS Industry Group Name', 'Delisted Quote Flag', 'Company Common Name',
                                    'CIK Number', 'Instrument Is Active Flag', 'Bankrupt'])

healthy.head()

In [None]:
# replace all NAN values with 0 and non-NANs with 1
healthy = healthy.notnull().astype('int')

# except for the instruments and the year
healthy['Instrument'] = healthy_raw['Instrument']
healthy['Year'] = healthy_raw['Year']

print(healthy.shape)

healthy.head()

In [None]:
for col in healthy.columns:
    print(col)

print()
print(len(healthy.columns))

In [None]:
healthy = healthy.groupby(['Instrument', 'Year']).sum().reset_index()
healthy['noNANs'] = healthy.sum(axis=1) - healthy['Instrument'] - healthy['Year']
healthy = healthy[['Instrument', 'Year', 'noNANs']]

healthy.head()

In [None]:
healthy.noNANs.max()

## Bankrupt Companies Dates Data

In [None]:
# read the bankruptcy dates
bankrupt = pd.read_csv('bankruptcy_dates.csv', index_col=0)
# sort by date
bankrupt.sort_values(by=['Date'], inplace=True)

bankrupt

In [None]:
# keep only bankrupt companies that went bankrupt after 2006
bankrupt = bankrupt[bankrupt.Date > 2005]
dates = bankrupt.groupby(['Date']).count()

dates

# The Situation

So here we have 14 years, in each a specific number of bankrupt comapnies. Now we want to extract 5 healthy companies for each bankrupt one. Plus, this data should be collected from a period of year-7 to year-5.

So for example, we need data for 93 * 5 healthy companies in years 2004 to 2008 (inclusive), for 93 companies that went bankrupt in 2010.


On the other hand, we have a dataframe of healthy companies CIK code for each year (2000-2019) and a column showing the number of non-NAN values for each row. A no-NANs equal to 21 indicate that the company for that year has full data.
Now our goal is to pick those 5 company for each bankrupt companies from ones with the highest no-NANs values.



## The Strategy

For each time period, we calculate the sum of no-NANs for that period and pick the n maximums; n being the number of companiets that went bankrupt in year+1. So we have to do it 14 times (2007-2019).

In [None]:
healthy.head()

In [None]:
# write a function for the task

def extract_healthy(df, year, n):
    '''
    The function takes the year in which n companies went bankrupt,
    and from the healthy dataframe, returns a CIK lists of 5 * n
    healthy companies which 6 years to 2 years before that year has
    the fullest dataset.
    '''
    
    # first remove all the extra rows for years not in the period
    df_temp = df[(df.Year < year-1) & (df.Year > year-7)]
    # group the dataframe by instrument, sum over the no-NANs
    df_temp = df_temp.groupby(['Instrument']).sum()
    # drop the now useless year column
    df_temp.drop(columns=['Year'], inplace=True)
    # descending-sort the companies based on noNANs magnitude
    df_temp.sort_values(by=['noNANs'], ascending=False, inplace=True)
    # cut the first n * 5 ones
    df_temp = df_temp[:5*n]
    # make a list of their CIK cides
    CIK_list = df_temp.index.to_list()
    # drop the healthy companies already picked from the df
     
    
    return CIK_list
    

In [None]:
# keep the original df safe, just in case
healthy_copy = healthy.copy()

# create a dic storing the CIK lists for different years
CIK_year = {}

# loop over all available years in our dataset
for year in range(2019, 2005, -1):
    # take the number of companies went bankrupt in that year
    n = int(dates[dates.index == year]['Company'])
    # take the CIK list
    CIK_list = extract_healthy(healthy_copy, year, n)
    # add the list to the dictionary
    CIK_year[year] = CIK_list
    

print(CIK_year)

In [None]:
# write the dictionary
with open('CIK_year.txt', 'wb') as f:
    pickle.dump(CIK_year, f)

## Second Strategy

This time I will create a dataframe, with the rows being healthy companies, and the columns being the 5-year periods, so to calculate the fullness of each healthy company for each period for an optimal selection.

In [None]:
healthy.head()

In [None]:
def score_cal(df, year):
        '''
        The function takes the healthy df and a year, and calculate the
        healthy companies' scores (#non-NAN-values) for 6 years to 2
        years before that year, and returns it in a dataseries.
        '''
        
        # first remove all the extra rows for years not in the period
        df_temp = df[(df.Year < year-1) & (df.Year > year-7)]
        # group the dataframe by instrument, sum over the no-NANs
        df_temp = df_temp.groupby(['Instrument']).sum()

        return df_temp.noNANs


    
def extract_healthy_2(df):
    '''
    This function takes a healthy dataframe and and returns a
    dataframe stating each company's score (non-NAN values) for
    a period.
    '''

    # create a new dataframe to store each company score per period
    df_out = df.groupby(['Instrument']).sum()
    
    for year in range(2019, 2005, -1):
        # set the column title
        title = str(year) + ':(' + str(year-6) + '-' + str(year-2) + ')'
        df_out[title] = np.around((score_cal(df, year) / 105 * 100), decimals=2)
        
    df_out.drop(columns=['Year', 'noNANs'], inplace=True)
        
    return df_out


In [None]:
dfout = extract_healthy_2(healthy)

dfout

## Final Decision

In [None]:
# for the first step, remove the CIK with no filed/recorded 10-K forms
# read the csv file
cik_no10K = pd.read_csv('cik_without_10k.csv', header=None)

bad_cik = cik_no10K[0].to_list()

cik_no10K.head()

In [None]:
# just check a list of full rows companies, plus one totally empty one
check_list = [98840, 98222, 1001082, 1000697, 1000229, 1000180]

for cik in check_list:
    if cik in bad_cik:
        print(cik, 'is a bad CIK!')

print(len(bad_cik), 'bad CIK codes!')
healthy_fullness = dfout.reset_index()
healthy_fullness.head()

In [None]:
# check the CIK formats in dataframe and dictionary
# to make sure they're comparable
print(type(bad_cik[1]))
print(type(healthy_fullness.Instrument[1]))


In [None]:
# calculate the number of unique CIK in the healthy list
len(healthy_fullness.Instrument.unique())

# drop companies with no filed 10-K form
healthy = healthy_fullness[~healthy_fullness.Instrument.isin(bad_cik)]
# check to make sure everything is okay and under control
print(len(healthy_fullness), '-', len(set(bad_cik)), '=', len(healthy))

healthy.head()

In [None]:
# create a new dictionary to save the fullest
# healthy companies for each period
cik_codes = {2000:[], 2001:[], 2002:[], 2003:[], 2004:[],
             2005:[], 2006:[], 2007:[], 2008:[], 2009:[],
             2010:[], 2011:[], 2012:[], 2013:[], 2014:[],
             2015:[], 2016:[], 2017:[]}

def CIK_picking(df, year, n):
    '''
    this function takes the healthy fullness dataframe,
    a year, and the number of bankrupt companies in the year (n),
    and produces a list including 5n healthy companies with
    the fullest rows in the period 7 to 2 years before that year,
    plus CIK of bankrupt companies in that year.
    The function, then, adds these CIK codes to the cik_codes'
    keys for each year in the period of 7-2 years before that year.
    And finally write the dictionary in a pickle file.
    '''
    
    # descending-sort the companies for that year
    col = str(year) + ':(' + str(year-6) + '-' + str(year-2) + ')'
    df.sort_values(by=[col], ascending=False, inplace=True)
    # make a list of their CIK cides
    CIK_list = df.Instrument.to_list()
    # cut the first n * 5 ones
    max_list = CIK_list[:5*n]
    # add bankrupt CIK in the 'year' to the list
    final_list = max_list + bankrupt[bankrupt.Date == year].CIK.to_list()
    
    for yr in range(year-6, year-1):
        cik_codes[yr] += final_list


# run the healthy_picking function for 14 years (2006-19)
for year in range(2006, 2020):
    # take the number of companies went bankrupt in that year
    n = dates.Company[year]
    # add the CIK lists to the cik_codes dictionary
    CIK_picking(healthy, year, n)

In [None]:
# Clear each cik_codes dictionary's key's list to make sure
# there ain't repetitive elements (CIKs) in them

# create a list, to store all CIKs
all_CIKs = []

for year in range(2000, 2018):
    cik_codes[year] = list(set(cik_codes[year]))
    all_CIKs += cik_codes[year]
    
# write the dictionary
with open('CIK_year.txt', 'wb') as f:
    pickle.dump(cik_codes, f)
    

# remove repetitive CIKs
all_CIKs = list(set(all_CIKs))
# the number of CIKs - the total number of bankrupt CIKs
# to check the number of healthy CIKs in our data
print('Final number of healthy companies in our dataset:', len(all_CIKs) - dates.Company.sum())
print('And the number of bankrupt companies in our dataset:', dates.Company.sum())