In [1]:
import os
import pandas as pd

In [2]:
# read the Compustat-extracted lists of
# healthy and bankrupt companies
folder = '/Users/user/Documents/Bankruptcy/bankruptcy/data/compustat/'
bankrupt = pd.read_csv(str(folder+'list_bankrupt.csv'), dtype=object)
healthy = pd.read_csv(str(folder+'list_healthy.csv'), dtype=object)


****

# 1. Merging Bankrupt Companies Data

In [3]:
#remove the glitch column
bankrupt = bankrupt.drop(['Unnamed: 0'], axis=1)

bankrupt.head()

Unnamed: 0,Identifier,Company,Data Deletion Date,Deletion Reason,Ticker,CUSIP,CIK
0,1367,Amber Resources Company of C,08/31/2012,2.0,3AMBE,023184203,276750
1,2033,Fairchild Corp. (The),11/01/2011,2.0,FCHDQ,303698104,9779
2,4049,Constar International Inc,06/01/2011,2.0,CNSTQ,21036U206,29806
3,4352,Energy Conversion Devices In,09/04/2012,2.0,ENERQ,292659109,32878
4,4768,Fleetwood Enterprises Inc.,08/23/2010,2.0,FLTWQ,339099103,314132


## 1.1. ISIN

In [4]:
# ----------------------------------------------------------------------
# work with ISIN codes converted from CUSIP
# ----------------------------------------------------------------------
# read the bankrupt companies CUSIP to ISIN convert-table csv file
bankrupt_csp2isn = pd.read_csv('bankrupt_csp2isn.csv', dtype=object)
bankrupt_csp2isn.rename(columns={'Unnamed: 0':'CUSIP'}, inplace=True)

print(bankrupt_csp2isn.head(), '\n\n')

# check the compatibility of CUSIP columns in two dataframe, before merging
for i in range(len(bankrupt)):
    if bankrupt.iloc[i][5] != bankrupt_csp2isn.iloc[i][0]:
        print('WARNING: There is inconsistency at row:', i)
        

# ----------------------------------------------------------------------
# work with ISIN codes converted from Ticker
# ----------------------------------------------------------------------
# read the bankrupt companies Ticker to ISIN convert-table csv file
bankrupt_tic2isn = pd.read_csv('bankrupt_tic2isn.csv', dtype=object)
bankrupt_tic2isn.rename(columns={'Unnamed: 0':'Ticker'}, inplace=True)

print(bankrupt_tic2isn.head())

# check the compatibility of Ticker columns in two dataframe, before merging
for i in range(len(bankrupt)):
    if bankrupt.iloc[i][4] != bankrupt_tic2isn.iloc[i][0]:
        print('WARNING: There is inconsistency at row:', i)
        

       CUSIP          ISIN error
0  023184203  US0231842032   NaN
1  303698104  US3036981047   NaN
2  21036U206  US21036U2069   NaN
3  292659109  US2926591098   NaN
4  339099103  US3390991038   NaN 


  Ticker          ISIN                    error
0  3AMBE           NaN  No best match available
1  FCHDQ  US3036981047                      NaN
2  CNSTQ           NaN  No best match available
3  ENERQ  US2926591098                      NaN
4  FLTWQ  US3390991038                      NaN


In [5]:
# attach the ISIN columns to the bankrupt dataframe
bankrupt['csp2ISIN'] = bankrupt_csp2isn['ISIN']
bankrupt['tic2ISIN'] = bankrupt_tic2isn['ISIN']

bankrupt.head()

Unnamed: 0,Identifier,Company,Data Deletion Date,Deletion Reason,Ticker,CUSIP,CIK,csp2ISIN,tic2ISIN
0,1367,Amber Resources Company of C,08/31/2012,2.0,3AMBE,023184203,276750,US0231842032,
1,2033,Fairchild Corp. (The),11/01/2011,2.0,FCHDQ,303698104,9779,US3036981047,US3036981047
2,4049,Constar International Inc,06/01/2011,2.0,CNSTQ,21036U206,29806,US21036U2069,
3,4352,Energy Conversion Devices In,09/04/2012,2.0,ENERQ,292659109,32878,US2926591098,US2926591098
4,4768,Fleetwood Enterprises Inc.,08/23/2010,2.0,FLTWQ,339099103,314132,US3390991038,US3390991038


In [6]:
# check compatibility between two ISIN columns
inconsistency = 0

# add a single column for ISIN, and
bankrupt['ISIN'] = ''
# a column to indicate there was inconsistency in converting symbols
bankrupt['ISINc'] = ''

for index, row in bankrupt.iterrows():
    # in case of inconsistency
    if row['csp2ISIN'] != row['tic2ISIN']:
        # count the number of inconsistencies
        inconsistency += 1
        print('At', row['Company'], '\n', row['csp2ISIN'], 'is not the same as', row['tic2ISIN'], '\n')
        # indicate there was an inconsistency,
        # to be careful about that data sample
        row['ISINc'] = 1
        
        # fill ISIN with a non-NAN value if possible
        # in case one of or both values are NAN
        if row['csp2ISIN'] != row['csp2ISIN']:
            row['ISIN'] = row['tic2ISIN']
        elif row['tic2ISIN'] != row['tic2ISIN']:
            row['ISIN'] = row['csp2ISIN']
        # and in case both are non-NANs
        else:
            row['ISIN'] = str(row['csp2ISIN']) + '-' + str(row['tic2ISIN'])
            
    # in case of consistency        
    else:
        row['ISIN'] = row['csp2ISIN']
        # indicate consistency
        row['ISINc'] = 0

print(inconsistency, 'case(s) of inconsistency!')

# remove the extra *ISIN columns
bankrupt = bankrupt.drop(['csp2ISIN', 'tic2ISIN'], axis=1)


At Amber Resources Company of C 
 US0231842032 is not the same as nan 

At Constar International Inc 
 US21036U2069 is not the same as nan 

At Keystone Camera Products 
 US4933971032 is not the same as nan 

At Scientific Radio Systems Inc 
 US8087831041 is not the same as nan 

At Waterford Wedgwood PLC 
 US9415133019 is not the same as nan 

At Hi-Def Enterprise Inc 
 nan is not the same as nan 

At Hayes Lemmerz International 
 nan is not the same as US4207814033 

At Dura Automotive Systems Inc 
 nan is not the same as US26632V1026 

At Kasten Chase Applied Researc 
 CA4859061018 is not the same as nan 

At Interdent Inc 
 US45865R2085 is not the same as nan 

At Cano Petroleum Inc 
 US1378011068 is not the same as nan 

At New Century Energy Corp 
 US64360E1091 is not the same as nan 

At Cervus Financial Group Inc 
 CA15712V1085 is not the same as nan 

At Leader Mining International 
 CA52169T2039 is not the same as nan 

At Cygnal Technologies Corp 
 CA2329751028 is not the sa

In [7]:
bankrupt.head()

Unnamed: 0,Identifier,Company,Data Deletion Date,Deletion Reason,Ticker,CUSIP,CIK,ISIN,ISINc
0,1367,Amber Resources Company of C,08/31/2012,2.0,3AMBE,023184203,276750,US0231842032,1
1,2033,Fairchild Corp. (The),11/01/2011,2.0,FCHDQ,303698104,9779,US3036981047,0
2,4049,Constar International Inc,06/01/2011,2.0,CNSTQ,21036U206,29806,US21036U2069,1
3,4352,Energy Conversion Devices In,09/04/2012,2.0,ENERQ,292659109,32878,US2926591098,0
4,4768,Fleetwood Enterprises Inc.,08/23/2010,2.0,FLTWQ,339099103,314132,US3390991038,0


## 1.2. RIC

In [8]:
# ----------------------------------------------------------------------
# work with RIC codes converted from CUSIP
# ----------------------------------------------------------------------
# read the bankrupt companies CUSIP to RIC convert-table csv file
bankrupt_csp2ric = pd.read_csv('bankrupt_csp2ric.csv', dtype=object)
bankrupt_csp2ric.rename(columns={'Unnamed: 0':'CUSIP'}, inplace=True)

print(bankrupt_csp2ric.head(), '\n\n')

# check the compatibility of CUSIP columns in two dataframe, before merging
for i in range(len(bankrupt)):
    if bankrupt.iloc[i][5] != bankrupt_csp2ric.iloc[i][0]:
        print('WARNING: There is inconsistency at row:', i)
        

# ----------------------------------------------------------------------
# work with RIC codes converted from Ticker
# ----------------------------------------------------------------------
# read the bankrupt companies Ticker to ISIN convert-table csv file
bankrupt_tic2ric = pd.read_csv('bankrupt_tic2ric.csv', dtype=object)
bankrupt_tic2ric.rename(columns={'Unnamed: 0':'Ticker'}, inplace=True)

print(bankrupt_tic2ric.head())

# check the compatibility of Ticker columns in two dataframe, before merging
for i in range(len(bankrupt)):
    if bankrupt.iloc[i][4] != bankrupt_tic2ric.iloc[i][0]:
        print('WARNING: There is inconsistency at row:', i)
        


       CUSIP  RIC                    error
0  023184203  NaN  No best match available
1  303698104  NaN  No best match available
2  21036U206  NaN  No best match available
3  292659109  NaN  No best match available
4  339099103  NaN  No best match available 


  Ticker           RIC                    error
0  3AMBE           NaN  No best match available
1  FCHDQ  FCHDQ.PK^K11                      NaN
2  CNSTQ           NaN  No best match available
3  ENERQ  ENERQ.PK^I12                      NaN
4  FLTWQ  FLTWQ.PK^H10                      NaN


In [9]:
# attach the RIC columns to the bankrupt dataframe
bankrupt['csp2RIC'] = bankrupt_csp2ric['RIC']
bankrupt['tic2RIC'] = bankrupt_tic2ric['RIC']

bankrupt.head()


Unnamed: 0,Identifier,Company,Data Deletion Date,Deletion Reason,Ticker,CUSIP,CIK,ISIN,ISINc,csp2RIC,tic2RIC
0,1367,Amber Resources Company of C,08/31/2012,2.0,3AMBE,023184203,276750,US0231842032,1,,
1,2033,Fairchild Corp. (The),11/01/2011,2.0,FCHDQ,303698104,9779,US3036981047,0,,FCHDQ.PK^K11
2,4049,Constar International Inc,06/01/2011,2.0,CNSTQ,21036U206,29806,US21036U2069,1,,
3,4352,Energy Conversion Devices In,09/04/2012,2.0,ENERQ,292659109,32878,US2926591098,0,,ENERQ.PK^I12
4,4768,Fleetwood Enterprises Inc.,08/23/2010,2.0,FLTWQ,339099103,314132,US3390991038,0,,FLTWQ.PK^H10


In [10]:
# check compatibility between two RIC columns
inconsistency = 0

# add a single column for RIC, and
bankrupt['RIC'] = ''
# a column to indicate there was inconsistency in converting symbols
bankrupt['RICc'] = ''

for index, row in bankrupt.iterrows():
    # in case of inconsistency
    if row['csp2RIC'] != row['tic2RIC']:
        # count the number of inconsistencies
        inconsistency += 1
        print('At', row['Company'], '\n', row['csp2RIC'], 'is not the same as', row['tic2RIC'], '\n')
        # indicate there was an inconsistency,
        # to be careful about that data sample
        row['RICc'] = 1
        
        # fill ISIN with a non-NAN value if possible
        # in case one of or both values are NAN
        if row['csp2RIC'] != row['csp2RIC']:
            row['RIC'] = row['tic2RIC']
        elif row['tic2RIC'] != row['tic2RIC']:
            row['RIC'] = row['csp2RIC']
            
        # and in case both are non-NANs
        else:
            row['RIC'] = str(row['csp2RIC']) + '-' + str(row['tic2RIC'])
            
    # in case of consistency        
    else:
        row['RIC'] = row['csp2RIC']
        # indicate consistency
        row['RICc'] = 0

print(inconsistency, 'case(s) of inconsistency!')

# remove the extra *RIC columns
bankrupt = bankrupt.drop(['csp2RIC', 'tic2RIC'], axis=1)


At Amber Resources Company of C 
 nan is not the same as nan 

At Fairchild Corp. (The) 
 nan is not the same as FCHDQ.PK^K11 

At Constar International Inc 
 nan is not the same as nan 

At Energy Conversion Devices In 
 nan is not the same as ENERQ.PK^I12 

At Fleetwood Enterprises Inc. 
 nan is not the same as FLTWQ.PK^H10 

At Frozen Food Express Industri 
 nan is not the same as nan 

At Great Atlantic & Pacific Tea 
 nan is not the same as GAPTQ.PK^C12 

At Interstate Bakeries Corp 
 nan is not the same as IBCIQ.PK^B09 

At Keystone Camera Products 
 KYC.QAD^I02 is not the same as nan 

At Florida Gaming Corp 
 nan is not the same as nan 

At Penn Traffic Co 
 nan is not the same as PTFCQ.PK^J11 

At Ronson Corp 
 nan is not the same as RONCQ.PK^B12 

At Scientific Radio Systems Inc 
 SCRD.PK^H09 is not the same as nan 

At Standard Register Co (The) 
 nan is not the same as nan 

At Enesco Group Inc 
 nan is not the same as ENCZQ.PK^L10 

At Bombay Co Inc. (The) 
 nan is not the

In [11]:
bankrupt.head()

Unnamed: 0,Identifier,Company,Data Deletion Date,Deletion Reason,Ticker,CUSIP,CIK,ISIN,ISINc,RIC,RICc
0,1367,Amber Resources Company of C,08/31/2012,2.0,3AMBE,023184203,276750,US0231842032,1,,1
1,2033,Fairchild Corp. (The),11/01/2011,2.0,FCHDQ,303698104,9779,US3036981047,0,FCHDQ.PK^K11,1
2,4049,Constar International Inc,06/01/2011,2.0,CNSTQ,21036U206,29806,US21036U2069,1,,1
3,4352,Energy Conversion Devices In,09/04/2012,2.0,ENERQ,292659109,32878,US2926591098,0,ENERQ.PK^I12,1
4,4768,Fleetwood Enterprises Inc.,08/23/2010,2.0,FLTWQ,339099103,314132,US3390991038,0,FLTWQ.PK^H10,1


# 2. Merging Healthy Companies Data

In [13]:
#remove the glitch column
healthy = healthy.drop(['Unnamed: 0'], axis=1)

healthy.head()

Unnamed: 0,Identifier,Company,Data Deletion Date,Deletion Reason,Ticker,CUSIP,CIK
0,1004,AAR Corp,.,,AIR,000361105,1750
1,1013,ADC Telecommunications Inc.,12/10/2010,1.0,ADCT,000886309,61478
2,1021,AFP Imaging Corp,09/15/2014,7.0,IWKS,001058205,319126
3,1034,Alpharma Inc.,12/31/2008,1.0,ALO.2,020813101,730469
4,1045,American Airlines Group Inc,.,,AAL,02376R102,6201


## 2.1. ISIN

In [30]:
# ----------------------------------------------------------------------
# work with ISIN codes converted from CUSIP
# ----------------------------------------------------------------------
# read the bankrupt companies CUSIP to ISIN convert-table csv files
healthy_csp2isn1 = pd.read_csv('healthy_csp2isn1.csv', dtype=object)
healthy_csp2isn2 = pd.read_csv('healthy_csp2isn2.csv', dtype=object)
healthy_csp2isn3 = pd.read_csv('healthy_csp2isn3.csv', dtype=object)
healthy_csp2isn4 = pd.read_csv('healthy_csp2isn4.csv', dtype=object)
healthy_csp2isn5 = pd.read_csv('healthy_csp2isn5.csv', dtype=object)
healthy_csp2isn6 = pd.read_csv('healthy_csp2isn6.csv', dtype=object)

# concatenate the dataframes
healthy_csp2isn = pd.concat([healthy_csp2isn1, healthy_csp2isn2, healthy_csp2isn3,
                             healthy_csp2isn4, healthy_csp2isn5, healthy_csp2isn6])

healthy_csp2isn.rename(columns={'Unnamed: 0':'CUSIP'}, inplace=True)

# check whether there are NAN values in CUSIP column
# there should not be, cause it is already taken care of,
# but still.... cause that's important and would be catastrophic
# for merging results
print(healthy_csp2isn.isnull().any()) # return False in case of no NANs
print()
print(healthy_csp2isn.head(), '\n\n')


# ----------------------------------------------------------------------
# work with ISIN codes converted from Ticker
# ----------------------------------------------------------------------
# read the bankrupt companies Ticker to ISIN convert-table csv files
healthy_tic2isn1 = pd.read_csv('healthy_tic2isn1.csv', dtype=object)
healthy_tic2isn2 = pd.read_csv('healthy_tic2isn2.csv', dtype=object)
healthy_tic2isn3 = pd.read_csv('healthy_tic2isn3.csv', dtype=object)
healthy_tic2isn4 = pd.read_csv('healthy_tic2isn4.csv', dtype=object)
healthy_tic2isn5 = pd.read_csv('healthy_tic2isn5.csv', dtype=object)
healthy_tic2isn6 = pd.read_csv('healthy_tic2isn6.csv', dtype=object)

# concatenate the dataframes
healthy_tic2isn = pd.concat([healthy_tic2isn1, healthy_tic2isn2, healthy_tic2isn3,
                             healthy_tic2isn4, healthy_tic2isn5, healthy_tic2isn6])

healthy_tic2isn.rename(columns={'Unnamed: 0':'Ticker'}, inplace=True)

# check whether there are NAN values in CUSIP column
# there should not be, cause it is already taken care of,
# but still.... cause that's important and would be catastrophic
# for merging results
print(healthy_tic2isn.isnull().any()) # return False in case of no NANs
print()
print(healthy_tic2isn.head(), '\n\n')


# ----------------------------------------------------------------------
# insert ISIN columns of those two dataframe in the healthy dataframe
# ----------------------------------------------------------------------
# create two ISIN columns in the original healthy dataframe
healthy['cspISIN'] = ''
healthy['ticISIN'] = ''


calc = 0
for index, row in healthy_csp2isn.iterrows():
    for i in range(len(healthy)):
        calc += 1
        if calc%40000 == 0:
            print('*')
        if row['CUSIP'] == healthy.iloc[i][5]:
            healthy['cspISIN'] = row['ISIN']
            break
            
'''
for index, row in healthy_tic2isn.iterrows():
    for i in range(len(healthy)):
        if row['Ticker'] == healthy.iloc[i][5]:
            healthy['ticISIN'] = row['Ticker']
            break
'''
healthy.head()

CUSIP    False
ISIN      True
error     True
dtype: bool

       CUSIP          ISIN error
0  000361105  US0003611052   NaN
1  000886309  US0008863096   NaN
2  001058205  US0010582056   NaN
3  020813101  US0208131013   NaN
4  02376R102  US02376R1023   NaN 


Ticker    False
ISIN       True
error      True
dtype: bool

  Ticker          ISIN                    error
0    AIR  US0003611052                      NaN
1   ADCT           NaN  No best match available
2   IWKS  US0010582056                      NaN
3  ALO.2           NaN  No best match available
4    AAL  US02376R1023                      NaN 


*
*
*


KeyboardInterrupt: 

****

In [None]:
# save the final csv file of bankrupt companies
bankrupt.to_csv('final_bankrupt_list.csv')
healthy.to_csv('final_healthy_list.csv')