In [13]:
import os
import pickle
import string

import numpy as np
import pandas as pd

In [14]:
# Folder that holds the Compustat lists written by the previous conversion.
# Windows location (active):
path = (
    'D:\\studyproject\\bankruptcy\\data\\Eikon\\Identifiers_Mapping\\'
    '0.Ticker_CUSIP-to-ISIN_RIC\\'
)
# macOS location -- uncomment to use it instead of the Windows one:
# path = ('/Users/user/Documents/Bankruptcy/bankruptcy/data/Eikon/'
#         'Identifiers_Mapping/0.Ticker_CUSIP-to-ISIN_RIC/')

# Load both lists. dtype=object stops pandas from parsing the columns as
# numbers; the first CSV column becomes the index.
read_options = dict(dtype=object, index_col=[0])
bankrupt = pd.read_csv(path + '0.bankrupt_list.csv', **read_options)
healthy = pd.read_csv(path + '0.healthy_list.csv', **read_options)

print(bankrupt.head())
healthy.head()

  Identifier                       Company Data Deletion Date Deletion Reason  \
0     001367  Amber Resources Company of C         08/31/2012             2.0   
1     002033         Fairchild Corp. (The)         11/01/2011             2.0   
2     004049     Constar International Inc         06/01/2011             2.0   
3     004352  Energy Conversion Devices In         09/04/2012             2.0   
4     004768    Fleetwood Enterprises Inc.         08/23/2010             2.0   

  Ticker      CUSIP         CIK          ISIN ISINc           RIC RICc  
0  3AMBE  023184203  0000276750  US0231842032     1           NaN    0  
1  FCHDQ  303698104  0000009779  US3036981047     0  FCHDQ.PK^K11    1  
2  CNSTQ  21036U206  0000029806  US21036U2069     1           NaN    0  
3  ENERQ  292659109  0000032878  US2926591098     0  ENERQ.PK^I12    1  
4  FLTWQ  339099103  0000314132  US3390991038     0  FLTWQ.PK^H10    1  


Unnamed: 0,Identifier,Company,Data Deletion Date,Deletion Reason,Ticker,CUSIP,CIK,ISIN,ISINc,RIC,RICc
0,1004,AAR Corp,.,,AIR,000361105,1750,US0003611052,0,AIR,0
1,1013,ADC Telecommunications Inc.,12/10/2010,1.0,ADCT,000886309,61478,US0008863096,1,,0
2,1021,AFP Imaging Corp,09/15/2014,7.0,IWKS,001058205,319126,US0010582056,0,IWKS.PK,1
3,1034,Alpharma Inc.,12/31/2008,1.0,ALO.2,020813101,730469,US0208131013,1,,0
4,1045,American Airlines Group Inc,.,,AAL,02376R102,6201,US02376R1023,0,AAL.O - AAL.Z,3


### Extracting the Currency Column from the Raw Data

In [15]:
# The currency a company's shares are traded in is used further down as an
# acceptable approximation of where the company is headquartered, which
# determines the first two letters of its ISIN code.

# Compustat raw-data folder (Windows)
path = 'D:\\studyproject\\bankruptcy\\data\\Compustat\\'

# Both chunks are fixed-width text files; every column is kept as object.
identifier, currency = (
    pd.read_fwf(path + chunk_name, dtype=object)
    for chunk_name in ('chunk_1.rtf', 'chunk_2.rtf')
)


In [16]:
# Pair every company name (conml) with the currency code (curcd) found at the
# same row position of the second chunk.
# NOTE(review): this assumes chunk_1 and chunk_2 are row-aligned -- confirm.
country = identifier[['conml']].rename(columns={'conml': 'Company'})
country['Country'] = currency['curcd'].values

# Collapse to one row per company; max() keeps the alphabetically largest
# currency code when a company appears with more than one.
country = country.groupby(['Company']).max().reset_index()

# Map currencies to 2-letter country abbreviations. The result is assigned
# back explicitly: an in-place replace on the `country.Country` accessor acts
# on a temporary object under pandas Copy-on-Write and would leave `country`
# unchanged.
country['Country'] = country['Country'].replace(['USD', 'CAD'], ['US', 'CA'])

country.head()

Unnamed: 0,Company,Country
0,01 Communique Laboratory Inc,CA
1,0373849 B C Ltd,CA
2,1-800 Contacts Inc,US
3,1-800-FLOWERS.COM Inc,US
4,111 Inc,US


In [17]:
# Attach the Country column to both lists by matching on the company name.
# The inner join keeps only companies that are present in `country`.
bankrupt, healthy = (
    frame.merge(country, how='inner', on=['Company'])
    for frame in (bankrupt, healthy)
)


### Implementing the Conversion Code

In [18]:
# implement the CUSIP to ISIN formula

def c2i(CUSIP, country_code):
    '''
    Build an ISIN from a CUSIP and a country code.

    The ISIN is ``country_code + CUSIP + check digit``. The check digit is
    obtained by expanding every letter into its two-digit value (A=10 ...
    Z=35), then, starting from the rightmost digit, doubling every other
    digit, summing the digits of all results and taking the amount needed
    to reach the next multiple of 10.

    Parameters
    ----------
    CUSIP : str
        The CUSIP (digits and upper-case letters).
    country_code : str
        The country prefix, e.g. 'US' or 'CA'. The prefix itself takes part
        in the check-digit computation, so any upper-case code is handled,
        not only 'US' and 'CA'.

    Returns
    -------
    str
        ``country_code + CUSIP`` followed by the computed check digit.
    '''
    # Expand the whole code (prefix included) into single decimal digits.
    # 'US' expands to 3,0,2,8 and 'CA' to 1,2,1,0.
    digits = []
    for ch in country_code + CUSIP:
        if ch.isdecimal():
            digits.append(int(ch))
        else:
            # letters map to 10..35 and contribute two digits each
            digits.extend(divmod(ord(ch) - 55, 10))

    # Walk from the rightmost digit: positions 0, 2, 4, ... are doubled and
    # contribute the sum of the digits of the doubled value; the remaining
    # positions contribute their plain value.
    total = 0
    for position, digit in enumerate(reversed(digits)):
        if position % 2 == 0:
            doubled = 2 * digit
            total += doubled if doubled <= 9 else doubled // 10 + doubled % 10
        else:
            total += digit

    final_digit = (10 - total % 10) % 10
    return country_code + CUSIP + str(final_digit)
    

### Applying the Conversion Function to a Dataframe

In [19]:
# convert lists CUSIPs to ISINs
def createISIN(df):
    '''
    Add a 'cusipISIN' column holding the ISIN derived from each row's CUSIP.

    The dataframe is modified in place: the new column is inserted at
    position 9. A row gets NaN when its 'CUSIP' or 'Country' is missing,
    otherwise the result of ``c2i(CUSIP, Country)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'CUSIP' and 'Country' and have at least
        nine columns.

    Raises
    ------
    ValueError
        If the dataframe already has a 'cusipISIN' column (raised by
        ``DataFrame.insert``), e.g. when the function is run twice.
    '''
    # Build the whole column first and insert it once. Assigning to the row
    # objects yielded by iterrows() is not guaranteed to reach the dataframe,
    # because those rows may be copies.
    converted = [
        np.nan if (pd.isna(cusip) or pd.isna(country_code))
        else c2i(cusip, country_code)
        for cusip, country_code in zip(df['CUSIP'], df['Country'])
    ]
    # dtype=object keeps the column a string column even when every value
    # is missing
    df.insert(9, 'cusipISIN', pd.Series(converted, index=df.index, dtype=object))
    

### Check the Consistency

In [20]:
def _reconcile_isin(isin, isinc, cusip_isin):
    '''
    Compare one existing ISIN entry with the CUSIP-derived ISIN and return
    the updated ``(ISIN, ISINc)`` pair for that row.
    '''
    # No ISIN at all: adopt the CUSIP-derived code, but flag that it comes
    # from a single conversion only (case 1 of the original logic).
    if pd.isna(isin):
        return cusip_isin, '1'

    # ISINc = 0: no inconsistency recorded so far
    if isinc == '0':
        # 2. both codes agree, nothing to do
        if isin == cusip_isin:
            return isin, isinc
        # the new code differs: keep both; two of the three conversions
        # still agreed, hence the value .5
        return isin + ' - ' + cusip_isin, '.5'

    # 3. the ISIN comes from only one of the two earlier conversions
    if isinc == '1':
        # confirmed by the new code: assume consistency
        if isin == cusip_isin:
            return isin, '0'
        # otherwise record the new code as well and raise the level
        return isin + ' - ' + cusip_isin, '3'

    # 4. serious inconsistency: the entry already holds two different codes
    # joined by ' - '. partition() also copes with an entry that has no
    # separator (the second part is then empty).
    first, _, second = isin.partition(' - ')
    if first == cusip_isin:
        return first, '1'
    if second == cusip_isin:
        return second, '1'
    # three different codes: keep them all and raise the level to 4
    return isin + ' - ' + cusip_isin, '4'


def checkConsistency(df):
    '''
    Reconcile the 'ISIN' column with the CUSIP-derived 'cusipISIN' column.

    The dataframe is modified in place: 'ISIN' and 'ISINc' are rewritten.
    Rows whose 'cusipISIN' is missing are left untouched. For the other
    rows the outcome depends on the current 'ISINc' value:

    * '0' and no ISIN     -> ISIN = cusipISIN, ISINc = '1'
    * '0' and codes agree -> unchanged
    * '0' and codes differ-> both codes joined by ' - ', ISINc = '.5'
    * '1' and codes agree -> ISINc = '0'
    * '1' and codes differ-> both codes joined by ' - ', ISINc = '3'
    * anything else       -> the entry is split at the first ' - '; if one
      part equals cusipISIN it is kept with ISINc = '1', otherwise
      cusipISIN is appended and ISINc = '4'

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'ISIN', 'ISINc' and 'cusipISIN', with
        'ISINc' stored as strings.
    '''
    updated_isin = []
    updated_isinc = []
    for isin, isinc, cusip_isin in zip(df['ISIN'], df['ISINc'], df['cusipISIN']):
        # nothing to compare against when the conversion produced no code
        if not pd.isna(cusip_isin):
            isin, isinc = _reconcile_isin(isin, isinc, cusip_isin)
        updated_isin.append(isin)
        updated_isinc.append(isinc)

    # Write the results back as whole columns. Assigning to the row objects
    # yielded by iterrows() is not guaranteed to reach the dataframe.
    df['ISIN'] = pd.Series(updated_isin, index=df.index, dtype=object)
    df['ISINc'] = pd.Series(updated_isinc, index=df.index, dtype=object)


### Applying the Two Functions to the Two Dataframes

In [21]:
# Baseline: distribution of the ISIN inconsistency flags in both dataframes
# before the CUSIP-based conversion is applied.
print(
    'Inconsistency status of the bankrupt dataframe:',
    bankrupt['ISINc'].value_counts(),
    '',
    'Inconsistency status of the healthy dataframe:',
    sep='\n',
)
healthy['ISINc'].value_counts()

Inconsistency status of the bankrupt dataframe:
0    93
1    19
Name: ISINc, dtype: int64

Inconsistency status of the healthy dataframe:


0    12778
1     8149
3      213
Name: ISINc, dtype: int64

In [22]:
# First derive ISINs from the CUSIPs of both dataframes, then compare the
# new ISINs with the ones already present.
for step in (createISIN, checkConsistency):
    for frame in (bankrupt, healthy):
        step(frame)

In [23]:
# Distribution of the ISIN inconsistency flags after the second
# (CUSIP-based) conversion technique has been applied.
print(
    'Inconsistency status of the bankrupt dataframe:',
    bankrupt['ISINc'].value_counts(),
    '',
    'Inconsistency status of the healthy dataframe:',
    sep='\n',
)
healthy['ISINc'].value_counts()

Inconsistency status of the bankrupt dataframe:
0     104
3       4
.5      3
1       1
Name: ISINc, dtype: int64

Inconsistency status of the healthy dataframe:


0     17889
1      1599
3       871
.5      761
4        20
Name: ISINc, dtype: int64

In [24]:
# The helper column is not part of the saved result.
bankrupt = bankrupt.drop(columns=['cusipISIN'])
healthy = healthy.drop(columns=['cusipISIN'])

# Write both dataframes as CSV files (without the index) into the current
# working directory.
# NOTE(review): '1.heathy_list.csv' looks like a misspelling of "healthy";
# the name is kept as is because other notebooks may read it -- confirm.
for frame, file_name in (
    (bankrupt, '1.bankrupt_list.csv'),
    (healthy, '1.heathy_list.csv'),
):
    frame.to_csv(file_name, index=False)

### Extract ISIN and RIC Lists 
#### For the Next Round of Conversions

In [None]:
# extract and locally save 4 lists of RICs and ISINs for bankrupt and healthy companies

def _split_codes(codes):
    '''
    Flatten a column of codes into a plain list with one code per element.

    Missing values are skipped. An entry that holds several conflicting
    codes joined by ' - ' contributes each of its codes separately.
    '''
    flat = []
    for entry in codes.dropna():
        flat.extend(code for code in entry.split(' - ') if code)
    return flat


def _save_list(codes, file_name):
    '''Pickle the list `codes` into the 'Lists' folder as `file_name`.'''
    with open('Lists/' + file_name, 'wb') as f:
        pickle.dump(codes, f)


# create a folder to save the list-pickles
os.makedirs('Lists', exist_ok=True)

# The ISINc / RICc flags are strings here (the lists were read with
# dtype=object), so a comparison with the integer 3 never matches and joined
# entries would be exported as one code. Splitting every entry on ' - '
# covers all conflict levels, whatever the flag value.
bankrupt_ISIN = _split_codes(bankrupt['ISIN'])
bankrupt_RIC = _split_codes(bankrupt['RIC'])
healthy_ISIN = _split_codes(healthy['ISIN'])
healthy_RIC = _split_codes(healthy['RIC'])

# and save the lists (pickle format, despite the .txt extension)
_save_list(bankrupt_ISIN, 'bankrupt_ISIN.txt')
_save_list(bankrupt_RIC, 'bankrupt_RIC.txt')
_save_list(healthy_ISIN, 'healthy_ISIN.txt')
_save_list(healthy_RIC, 'healthy_RIC.txt')
    