In [1]:
import os
import numpy as np
import pandas as pd

# chunk_1.rtf

In [2]:
# Load the first fixed-width export into a DataFrame.
# 'gvkey' is requested as object dtype, which is intended to stop pandas
# from parsing the code as a number and losing its leading zeros.
data1_raw = pd.read_fwf(
    'chunk_1.rtf',
    dtype={'gvkey': object},
)

# Preview the first rows of the raw frame.
data1_raw.head()

Unnamed: 0,gvkey,datadate,fyear,conml,Unnamed: 4,dldte,dlrsn,conm,Unnamed: 8
0,,,,,,,,,
1,1004.0,05/31/2005,2004.0,AAR Corp,,.,,AAR CORP,\
2,1004.0,05/31/2006,2005.0,AAR Corp,,.,,AAR CORP,\
3,1004.0,05/31/2007,2006.0,AAR Corp,,.,,AAR CORP,\
4,1004.0,05/31/2008,2007.0,AAR Corp,,.,,AAR CORP,\


In [3]:
# Build the working frame from the raw one in a single chain, leaving
# data1_raw untouched.
data1 = (
    data1_raw
    .drop(0, axis=0)                        # row 0 is blank in the raw read
    .reset_index(drop=True)                 # renumber the rows from 0
    .drop('Unnamed: 8', axis=1)             # trailing blank column
    .rename(columns={'Unnamed: 4': 'lia'})  # name the header-less fifth column
)

# Preview the tidied frame.
data1.head()

Unnamed: 0,gvkey,datadate,fyear,conml,lia,dldte,dlrsn,conm
0,1004,05/31/2005,2004,AAR Corp,,.,,AAR CORP
1,1004,05/31/2006,2005,AAR Corp,,.,,AAR CORP
2,1004,05/31/2007,2006,AAR Corp,,.,,AAR CORP
3,1004,05/31/2008,2007,AAR Corp,,.,,AAR CORP
4,1004,05/31/2009,2008,AAR Corp,,.,,AAR CORP


In [4]:
# Derive the cleaning frame from data1:
#   1. remove the four columns that are not needed at this stage;
#   2. add an explicit 0/1 bankruptcy flag, set to 1 where 'dlrsn' equals 2.
data1_clean = (
    data1
    .drop(['datadate', 'fyear', 'lia', 'conm'], axis=1)
    .assign(isBankrupt=lambda frame: np.where(frame['dlrsn'] == 2, 1, 0))
)

# Replace the terse source headers with readable ones. The assignment is
# positional, so the order must match the remaining columns.
data1_clean.columns = [
    'Identifier',
    'Company',
    'Data Deletion Date',
    'Deletion Reason',
    'isBankrupt',
]

# Preview the cleaned frame.
data1_clean.head()

Unnamed: 0,Identifier,Company,Data Deletion Date,Deletion Reason,isBankrupt
0,1004,AAR Corp,.,,0
1,1004,AAR Corp,.,,0
2,1004,AAR Corp,.,,0
3,1004,AAR Corp,.,,0
4,1004,AAR Corp,.,,0


****

# chunk_2.rtf

In [5]:
# Load the second fixed-width export into a DataFrame.
# 'cik' is requested as object dtype, which is intended to stop pandas
# from parsing the code as a number and losing its leading zeros.
data2_raw = pd.read_fwf(
    'chunk_2.rtf',
    dtype={'cik': object},
)

# Preview the first rows of the raw frame.
data2_raw.head()

Unnamed: 0,tic,cusip,cik,exchg,consol,indfmt,datafmt,popsrc,curcd,costat
0,,,,,,,,,,
1,AIR,361105.0,1750.0,11.0,C,INDL,STD,D,USD,A \
2,AIR,361105.0,1750.0,11.0,C,INDL,STD,D,USD,A \
3,AIR,361105.0,1750.0,11.0,C,INDL,STD,D,USD,A \
4,AIR,361105.0,1750.0,11.0,C,INDL,STD,D,USD,A \


In [6]:
# Build the working frame from the raw one, leaving data2_raw untouched:
# discard the blank row labelled 0, then renumber the remaining rows from 0.
data2 = (
    data2_raw
    .drop(0, axis=0)
    .reset_index(drop=True)
)

# Preview the tidied frame.
data2.head()

Unnamed: 0,tic,cusip,cik,exchg,consol,indfmt,datafmt,popsrc,curcd,costat
0,AIR,361105,1750,11,C,INDL,STD,D,USD,A \
1,AIR,361105,1750,11,C,INDL,STD,D,USD,A \
2,AIR,361105,1750,11,C,INDL,STD,D,USD,A \
3,AIR,361105,1750,11,C,INDL,STD,D,USD,A \
4,AIR,361105,1750,11,C,INDL,STD,D,USD,A \


In [7]:
# Derive the cleaning frame from data2 by removing the seven columns that
# are not needed at this stage.
data2_clean = data2.drop(
    ['exchg', 'consol', 'indfmt', 'datafmt', 'popsrc', 'costat', 'curcd'],
    axis=1,
)

# Replace the terse source headers with readable ones. The assignment is
# positional, so the order must match the remaining columns.
data2_clean.columns = [
    'Ticker',
    'CUSIP',
    'CIK',
]

# Preview the cleaned frame.
data2_clean.head()

Unnamed: 0,Ticker,CUSIP,CIK
0,AIR,361105,1750
1,AIR,361105,1750
2,AIR,361105,1750
3,AIR,361105,1750
4,AIR,361105,1750


****

# Merging

In [8]:
# Sanity check before the column-wise merge: the two cleaned frames are
# joined purely by row position, so they must have the same number of rows.
# On a mismatch, stop here rather than only printing a message; otherwise the
# positional concat that follows would pad the shorter frame with NaN and
# produce misaligned records.
if len(data1_clean) != len(data2_clean):
    raise ValueError(
        'Row count mismatch: data1_clean has {} rows but data2_clean has {}; '
        'the frames cannot be merged by position.'.format(
            len(data1_clean), len(data2_clean)))

print('Everything is ready for "Merging Mission."')
        

Everything is ready for "Merging Mission."


In [9]:
# Place the two cleaned frames side by side. Both indexes are reset first so
# that rows are paired strictly by position.
data_clean = pd.concat(
    [
        data1_clean.reset_index(drop=True),
        data2_clean.reset_index(drop=True),
    ],
    axis=1,
)

# Preview the merged frame.
data_clean.head()

Unnamed: 0,Identifier,Company,Data Deletion Date,Deletion Reason,isBankrupt,Ticker,CUSIP,CIK
0,1004,AAR Corp,.,,0,AIR,361105,1750
1,1004,AAR Corp,.,,0,AIR,361105,1750
2,1004,AAR Corp,.,,0,AIR,361105,1750
3,1004,AAR Corp,.,,0,AIR,361105,1750
4,1004,AAR Corp,.,,0,AIR,361105,1750


In [10]:
# Collapse the repeated rows into one row per 'Identifier', the only code
# treated as a reliable unique key here.
# NOTE(review): max() is taken independently for every column, so this
# presumably relies on the other columns being constant within each
# identifier -- confirm against the source data.
rows_by_identifier = data_clean.groupby(['Identifier'])
data = rows_by_identifier.max().reset_index()

# Preview the final frame.
data.head()

Unnamed: 0,Identifier,Company,Data Deletion Date,Deletion Reason,isBankrupt,Ticker,CUSIP,CIK
0,1004,AAR Corp,.,,0,AIR,000361105,1750
1,1013,ADC Telecommunications Inc.,12/10/2010,1.0,0,ADCT,000886309,61478
2,1021,AFP Imaging Corp,09/15/2014,7.0,0,IWKS,001058205,319126
3,1034,Alpharma Inc.,12/31/2008,1.0,0,ALO.2,020813101,730469
4,1045,American Airlines Group Inc,.,,0,AAL,02376R102,6201


****

# Writing

In [11]:
# Persist the collapsed frame as CSV. The default settings are kept, so the
# row index is written as the first column.
data.to_csv(path_or_buf='compustat.csv')

In [12]:
# Healthy companies: rows whose flag is 0, renumbered from 0, with the
# flag column removed since it is constant within this subset.
healthy = (
    data[data['isBankrupt'] == 0]
    .reset_index(drop=True)
    .drop('isBankrupt', axis=1)
)
# Save the list and report its size.
healthy.to_csv('list_healthy.csv')
print('\nNow, we have a list of', len(healthy), 'healthy companies,')


# Bankrupt companies: same treatment for rows whose flag is 1.
bankrupt = (
    data[data['isBankrupt'] == 1]
    .reset_index(drop=True)
    .drop('isBankrupt', axis=1)
)
# Save the list and report its size.
bankrupt.to_csv('list_bankrupt.csv')
print('and another list of', len(bankrupt), 'bankrupt companies.')


Now, we have a list of 20783 healthy companies,
and another list of 112 bankrupt companies.
