IBES, CRSP, COMPUSTAT data @author Tim Copeland

In [358]:
import pandas as pd
import numpy as np
from datetime import date, datetime, timedelta

Cleaning constituents file

In [359]:
#use compustat to download sp500 constituents from 1970 to 2016.
cons = pd.read_csv('data/cons_70_17.csv', low_memory=False)

In [360]:
#delete duplicate rows
cons = cons[~cons.duplicated()]

#delete rows with empty ticker/company_name
cons = cons.dropna(subset=['co_tic'])

cons = cons.sort_values(['co_tic'])

In [361]:
cons.head()

Unnamed: 0,gvkey,gvkeyx,from,thru,conm,tic,co_conm,co_tic,co_cusip
374,10507,3,19640331,20071009.0,S&P 500 Comp-Ltd,I0003,ENERGY FUTURE HOLDINGS CORP,0033A,873168108
1253,12144,3,20020722,20050811.0,S&P 500 Comp-Ltd,I0003,SUNGARD DATA SYSTEMS INC,0139A,867363103
296,2316,3,19640331,19941220.0,S&P 500 Comp-Ltd,I0003,HEXION INC,0141A,428300107
1147,30095,3,19991116,20030925.0,S&P 500 Comp-Ltd,I0003,QUINTILES TRANSNATIONAL CORP,0573B,748767100
484,7148,3,19681231,19750831.0,S&P 500 Comp-Ltd,I0003,MCCRORY CORP,1219A,579865007


In [362]:
cons.describe()

Unnamed: 0,gvkey,gvkeyx,from,thru
count,1629.0,1629.0,1629.0,1124.0
mean,25184.907919,3.0,19858300.0,19951160.0
std,44781.70422,0.0,181025.1,135729.3
min,1010.0,3.0,19640330.0,19700230.0
25%,4839.0,3.0,19650330.0,19840630.0
50%,8488.0,3.0,19850930.0,19970800.0
75%,15084.0,3.0,20010800.0,20070220.0
max,316056.0,3.0,20171010.0,20171010.0


In [363]:
print(len(cons['co_conm'].unique()))
print(len(cons['co_conm']))

1543
1629


In [364]:
cons.to_csv('data/clean_cons_70_17.csv')

In [365]:
#plaintext of gvkeys
gvkeys = np.sort(cons['gvkey'].unique())
np.savetxt('gvkey.txt', gvkeys[None,:], delimiter="\r\n", fmt="%s")
print(len(gvkeys))

1543


Working with CRSP data

In [366]:
#use gvkeys to download gvkey -> lpermno link table from crsp. Read the resulting table.
Compustat_CRSP_link_table = pd.read_csv('link_tables/Compustat_CRSP_link_table.csv')

In [367]:
print(len(gvkeys) - len(Compustat_CRSP_link_table['gvkey'].unique()))

25


In [368]:
#missing gvkeys
gvkeys = pd.Series(gvkeys)
missing_gvkey = gvkeys[~gvkeys.isin(Compustat_CRSP_link_table['gvkey'].unique())]
missing_gvkey

200       3039
214       3165
319       4199
380       4799
557       6537
752       8430
774       8617
881       9746
1112     13353
1129     14097
1157     15448
1252     27800
1265     28192
1275     28719
1281     29004
1287     29155
1301     29819
1325     31774
1328     32106
1329     32154
1422    122147
1530    186915
1531    186932
1539    265975
1540    266101
dtype: int64

In [369]:
#plaintext of lpermnos
lpermnos = np.sort(Compustat_CRSP_link_table['LPERMNO'].unique())
np.savetxt('lpermno.txt', lpermnos[None,:], delimiter="\r\n", fmt="%s")
print(len(lpermnos))

1727


In [370]:
dtype_dic = {'PERMNO': np.int64,
             'date': np.int64, 
             'NCUSIP': str, 
             'TICKER': object, 
             'COMNAM': str, 
             'CUSIP': object, 
             'DIVAMT': np.float64,
             'BIDLO': np.float64, 
             'ASKHI':np.float64, 
             'PRC': np.float64, #np.int32, 
             'VOL': np.float64, #np.int32, 
             'RET': object, #np.int32,  
             'SHROUT': np.float64, #np.int32, 
             'OPENPRC': np.float64, 
             'NUMTRD': np.float64, #np.int32, 
            }

In [371]:
#use lpermnos to download crsp data. Read the table.
crsp = pd.read_csv('data/crsp_sp500_1970_2016.csv', chunksize=100000, low_memory = False,dtype=dtype_dic)

In [372]:
#save to hdf5 file (Makes it easier to access by slice later on)
hdf = pd.HDFStore('data/crsp.h5', mode = 'a')
for i in crsp: 
    hdf.append('df', i, data_columns=True, format = 'table')

In [373]:
hdf.close()

IBES data

In [374]:
#use crsp lpermnos to download crsp lpermno->ibes ticker link table
CRSP_IBES_link_table = pd.read_csv('link_tables/CRSP_IBES_link_table.csv')

In [375]:
print(len(lpermnos) - len(CRSP_IBES_link_table['PERMNO'].unique()))

99


In [376]:
#missing lpermnos (that's a lot!!)
lpermnos = pd.Series(lpermnos)
missing_lpermnos = lpermnos[~lpermnos.isin(CRSP_IBES_link_table['PERMNO'].unique())]
missing_lpermnos

0       10006
17      10276
25      10436
30      10495
32      10524
40      10735
41      10751
52      11041
54      11068
56      11092
70      11543
76      11658
78      11690
80      11746
82      11826
87      11949
104     12124
106     12167
107     12191
110     12343
112     12346
117     12466
128     12669
133     12837
143     13063
154     13311
159     13522
165     13637
179     13964
184     14066
        ...  
893     48119
915     49592
938     51027
971     53357
993     55386
996     56012
1040    59221
1052    59475
1086    61890
1105    63343
1156    67791
1167    69163
1201    75038
1206    75181
1212    75255
1277    77058
1295    77459
1388    80411
1398    80783
1457    83715
1502    85658
1509    85904
1548    87030
1583    88663
1603    89155
1609    89223
1651    90379
1658    90562
1662    90740
1688    91518
Length: 141, dtype: int64

In [377]:
ibtic = np.sort(CRSP_IBES_link_table['TICKER'].unique())
np.savetxt('tic.txt', ibtic[None,:], delimiter="\r\n", fmt="%s")
print(len(ibtic))

1527


In [378]:
estimates = pd.read_csv('data/estimates.csv')
estimates.head()

Unnamed: 0,OFTIC,TICKER,CUSIP,CNAME,ACTDATS,ANALYS,FPI,MEASURE,VALUE,FPEDATS
0,FDC,00VP,32008D10,FIRST DATA,20160114,79876,1,EPS,0.34,20151231
1,FDC,00VP,32008D10,FIRST DATA,20160212,149615,1,EPS,1.55,20161231
2,FDC,00VP,32008D10,FIRST DATA,20160419,149615,1,EPS,1.49,20161231
3,FDC,00VP,32008D10,FIRST DATA,20160425,10258,1,EPS,1.85,20161231
4,FDC,00VP,32008D10,FIRST DATA,20160425,10258,1,EPS,1.14,20161231


In [379]:
estimates.describe()

Unnamed: 0,ACTDATS,ANALYS,FPI,VALUE,FPEDATS
count,125134.0,125134.0,125134.0,125134.0,125134.0
mean,20100060.0,78310.285022,1.0,-11.155375,20101050.0
std,48164.37,45128.259632,0.0,1214.927431,47936.13
min,19921010.0,0.0,1.0,-111300.0,19921230.0
25%,20070810.0,45147.0,1.0,0.92,20071230.0
50%,20110120.0,79582.0,1.0,2.01,20110630.0
75%,20140420.0,113313.75,1.0,3.62,20141230.0
max,20170720.0,188608.0,1.0,45000.0,20180530.0


In [380]:
actuals = pd.read_csv('data/actuals.csv')
actuals.head()

Unnamed: 0,TICKER,CUSIP,OFTIC,CNAME,PENDS,MEASURE,PDICITY,ANNDATS,ANNTIMS,ACTDATS,ACTTIMS,VALUE,CURR_ACT
0,A,2742010.0,A,AMERN MEDIC BLDG,19881231,EPS,ANN,19890401,0:00:00,19890401,0:00:00,-0.47,USD
1,A,2742010.0,A,AMERN MEDIC BLDG,19891231,EPS,ANN,19900418,0:00:00,19900418,0:00:00,-0.09,USD
2,A,2742010.0,A,AMERN MEDIC BLDG,19901231,EPS,ANN,19920403,0:00:00,19920403,0:00:00,0.03,USD
3,AA,,,,19741231,EPS,ANN,19760115,0:00:00,19760115,0:00:00,,USD
4,AA,,,,19751231,EPS,ANN,19760219,0:00:00,19760219,0:00:00,,USD


In [381]:
#drop actuals rows with missing earnings values
actuals = actuals.dropna(axis=0, subset=['VALUE'])

In [382]:
actuals.describe()

Unnamed: 0,PENDS,ANNDATS,ACTDATS,VALUE
count,13531.0,13531.0,13531.0,13531.0
mean,20000870.0,20007480.0,20007490.0,1.443553
std,93749.98,93718.22,93717.36,9.323321
min,19761230.0,19800120.0,19800120.0,-554.7998
25%,19921230.0,19930210.0,19930210.0,0.395
50%,20001230.0,20010120.0,20010120.0,1.15
75%,20081230.0,20090120.0,20090130.0,2.27
max,20170530.0,20170630.0,20170630.0,200.5


In [383]:
#given a date (in YYYYMMDD format), returns quarter number
def quarter_num(date):
    a = datetime.strptime(str(date), '%Y%m%d')
    return (a.month-1)//3

quarter_num('20060801')

2

In [384]:
#add quarternum column to actuals and estimates
estimates['quarternum'] = estimates['FPEDATS'].apply(quarter_num)
actuals['quarternum']   = actuals['ANNDATS'].apply(quarter_num)

In [385]:
#save clean estimates
estimates.to_csv('ibes/clean_estimates.csv')
actuals.to_csv('ibes/clean_actuals.csv')

In [386]:
#analyst forecast and actual earnings dataframes
df_aforecast   = estimates.set_index(['TICKER', 'FPEDATS', 'quarternum', 'ANALYS'])['VALUE']
df_actual_ern = actuals.set_index(['TICKER', 'ANNDATS', 'quarternum'])['VALUE']

In [387]:
df_aforecast.to_csv('ibes/df_aforecast.csv')
df_actual_ern.to_csv('ibes/df_actual_ern.csv')

Generate price distribution dataframe

In [388]:
#fill na's in cons thru with max date (assumed to be nov 1st 2017)
max_date = 20171101.0
cons['thru'] = cons['thru'].fillna(max_date)
cons['thru'] = pd.to_numeric(cons['thru'])
cons['from'] = pd.to_numeric(cons['from'])

In [389]:
#clean up missing values in CRSP_IBES_linktable
CRSP_IBES_link_table = CRSP_IBES_link_table.fillna(method='ffill')

In [390]:
class df_handler:
    #actuals = actuals dataframe
    def __init__(self,actuals, X=30): 
        self.actuals = actuals
        self.X = X #denotes range of prices [-X days, +Xdays] from target date
        
    def gen_df(self):
        
        prc_range = [str(i) for i in range(-X,X+1)]
        col_names = ['TICKER', 'ANNDATS', 'quarternum'] + prc_range
        df = pd.DataFrame(columns = col_names)
        for i in range(0,len(self.actuals['TICKER'])):
            row = self.actuals.iloc[i]
            if self.check_SP500(row): 
                data = [row['TICKER'] , row['ANNDATS'] , row['quarternum']] + [i for i in self.crsp_prices(row)]
                tmp = dict(zip(col_names,data))
                df = df.append(tmp, ignore_index = True) 
            
        return df
        
        
    #function that accepts a row of actuals data and checks if this earnings belongs in the S&P500 at that time or not.
    def check_SP500(self,actual_entry):
        dt = actual_entry['ACTDATS']

        #match this ibtic to corresponding lpermno via CRSP_IBES_link_table
        ibtic   = actual_entry['TICKER']
        lpermno = CRSP_IBES_link_table[CRSP_IBES_link_table['TICKER'] == ibtic]['PERMNO'].unique()
        if lpermno.size == 0: return False

        #use lpermno to match with compustat gvkey using Compustat_CRSP_link_table. "lpermno[0]" the zero index is because 
        #    .unique( ) returns a numpy array of size 1. We need it as a number instead. Dangerous!!
        gvkey = Compustat_CRSP_link_table[Compustat_CRSP_link_table['LPERMNO'] == lpermno[0]]['gvkey'].unique()
        if gvkey.size == 0: return False

        #use gvkey to access corresponding date ranges in constituents file
        dt_from = cons[cons['gvkey'] == gvkey[0]]['from']
        dt_thru = cons[cons['gvkey'] == gvkey[0]]['thru']

        #check if actual earnings announcement date is within the (or one of the) date ranges
        for i in range(0,len(dt_from)):
            if dt >= dt_from.values[i] and dt <= dt_thru.values[i]: return True

        return False #no match - return false
    
    #given a row of actuals data, match it to crsp data and generate [-X,X] days of price
    #data. Prices are normalized such that the earnings date price is 1.
    #     NOTE: there are duplicate dates entries for the same permno. No idea why this is. I just deleted them.
    #This part can definitely be sped up. Some redundant operations..
    #  1. linking ibtics and permnos
    #  2. slicing crsp and manipulating slice.
    def crsp_prices(self,actual_entry):
        dt = actual_entry['ACTDATS']
        X = self.X

        #match this ibtic to corresponding lpermno via CRSP_IBES_link_table
        ibtic   = actual_entry['TICKER']
        lpermno = CRSP_IBES_link_table[CRSP_IBES_link_table['TICKER'] == ibtic]['PERMNO'].unique()
        if lpermno.size == 0: return []

        #grab crsp prices associated with this ticker
        crsp = pd.read_hdf('data/crsp.h5', 'df', where='PERMNO in ' + str(lpermno[0]))
                    
        #delete duplicate rows
        crsp = crsp[~crsp.duplicated()]
        
        #delete duplicate dates (Why do these exist in the first place? Is this correct??)
        crsp = crsp[~crsp['date'].duplicated()]
        
        #delete entries with missing dates
        crsp = crsp.dropna(subset=['date'])
        
        #find date price date closest to earnings date
        data     = crsp.sort_values(['date']).reset_index(drop=True)
        prc_date = pd.Index(data['date']).get_loc(dt, method='nearest')
        
        #take slice of dates X days after dt and X days before dt 
        date_range = [datetime.strptime(str(data['date'][prc_date]), '%Y%m%d') + timedelta(days=j) for j in range(-X, X+1)]
        prices = []
        for i in date_range:
            tar_prc = data[data['date'] == int(i.strftime('%Y%m%d'))]['PRC']

            if tar_prc.empty: prices.append('NaN')
            else:             prices.append(tar_prc.values[0])
                
        #normalize prices such that target date = 1 (ie each price / target date price)
        for i in range(0,len(prices)): 
            if type(prices[i]) == str: continue
            else:                      prices[i] = prices[i]/prices[X]
                       
        return prices

In [391]:
#test check_SP500
test = df_handler(actuals)
print(actuals.iloc[0])
print(test.check_SP500(actuals.iloc[0])) #should be false
print(' ')
print(actuals.iloc[8])
print(test.check_SP500(actuals.iloc[8])) #should be true

TICKER                       A
CUSIP                 02742010
OFTIC                        A
CNAME         AMERN MEDIC BLDG
PENDS                 19881231
MEASURE                    EPS
PDICITY                    ANN
ANNDATS               19890401
ANNTIMS                0:00:00
ACTDATS               19890401
ACTTIMS                0:00:00
VALUE                    -0.47
CURR_ACT                   USD
quarternum                   1
Name: 0, dtype: object
False
 
TICKER              AA
CUSIP         02224910
OFTIC               AA
CNAME            ALCOA
PENDS         19881231
MEASURE            EPS
PDICITY            ANN
ANNDATS       19890119
ANNTIMS        0:00:00
ACTDATS       19890119
ACTTIMS        0:00:00
VALUE           3.6525
CURR_ACT           USD
quarternum           0
Name: 10, dtype: object
True


In [392]:
#test crsp_rets
print(test.crsp_prices(actuals.iloc[8])) 

[0.90204081632653066, 0.88775510204081631, 0.88367346938775515, 0.88979591836734695, 'NaN', 'NaN', 'NaN', 0.89183673469387759, 0.89591836734693875, 0.90816326530612246, 0.91428571428571426, 'NaN', 'NaN', 'NaN', 0.91224489795918362, 0.92448979591836733, 0.9408163265306122, 0.9408163265306122, 'NaN', 'NaN', 0.94897959183673475, 0.95714285714285718, 0.97959183673469385, 0.97959183673469385, 0.97346938775510206, 'NaN', 'NaN', 0.97551020408163269, 0.97755102040816322, 0.99591836734693873, 1.0, 61.875, 'NaN', 'NaN', 61.75, 61.25, 61.25, 61.625, 62.875, 'NaN', 'NaN', 62.875, 64.625, 64.875, 63.875, 63.5, 'NaN', 'NaN', 62.875, 63.875, 64.25, 63.375, 61.5, 'NaN', 'NaN', 61.375, 61.25, 61.125, 62.0, 63.0, 'NaN']


In [393]:
#generate dataframe
df_prc_dist = test.gen_df()

In [394]:
df_prc_dist.head()

Unnamed: 0,TICKER,ANNDATS,quarternum,-30,-29,-28,-27,-26,-25,-24,...,21,22,23,24,25,26,27,28,29,30
0,AA,19840124,0,,,1.0084,1.0056,1.0084,1.0056,,...,38.5,38.875,39.25,39.0,,,,38.375,37.75,37.25
1,AA,19850121,0,,,0.970395,,0.967105,0.963816,0.970395,...,38.875,39.0,39.125,38.75,38.125,,,,38.125,37.875
2,AA,19860122,0,1.01967,1.03279,,1.03607,1.03934,,,...,45.375,44.625,44.5,,,,44.75,43.75,43.875,45.75
3,AA,19870122,0,0.860317,0.857143,,0.857143,,,0.857143,...,43.0,44.125,,,,45.0,43.875,43.375,43.75,
4,AA,19880122,0,1.18475,1.18768,,,,1.13196,1.12317,...,44.625,,,,46.5,45.625,44.875,45.25,,


In [395]:
df_prc_dist.describe()

Unnamed: 0,0
count,7640.0
mean,1.0
std,0.0
min,1.0
25%,1.0
50%,1.0
75%,1.0
max,1.0


In [396]:
df_prc_dist.to_csv('df_prc_dist.csv')