In [43]:
import wrds
import pandas as pd
import pytz
import datetime as dt
import pandas_market_calendars as mcal

In [44]:
# Open the WRDS session; the query helpers in this notebook reach it through
# the module-level name `db` (they call `db.raw_sql`).
# NOTE(review): the username is hard-coded, so anyone else re-running the
# notebook has to edit this line first.
db = wrds.Connection(wrds_username = "connorwz")

Loading library list...
Done


In [18]:
def SP500_CRSP_data(year_range):
    """Build a daily CRSP panel for S&P 500 constituents over a span of years.

    Two WRDS queries are issued through the module-level ``db`` connection:
    ``crsp.dsp500list`` joined to ``crsp.dsf`` (restricted to the dates on
    which each permno is inside its ``start``/``ending`` membership window),
    and ``crsp.dsenames`` to attach the company name valid on each date.

    Parameters
    ----------
    year_range : sequence
        The first element is the first calendar year and the last element is
        the last calendar year (both inclusive). Elements must be convertible
        with ``int``.

    Returns
    -------
    pandas.DataFrame
        Columns ``permno``, ``date``, ``comnam``, ``ret``, ``openprc``,
        ``prc`` (absolute value of the CRSP price field) and ``CO_ret``,
        computed as ``(prc - openprc) / openprc``.

    Raises
    ------
    ValueError, TypeError
        If a year cannot be converted with ``int``.
    """
    # Coerce to int so only plain year numbers are ever formatted into the SQL text.
    first_year, last_year = int(year_range[0]), int(year_range[-1])
    sp_500_query = f"""SELECT a.*, b.date, b.ret, b.prc, b.openprc
                        FROM crsp.dsp500list as a,
                        crsp.dsf as b
                        WHERE a.permno=b.permno
                        and b.date >= a.start and b.date<= a.ending
                        and b.date>='01/01/{first_year}' and b.date<='12/31/{last_year}'
                        order by date;"""
    sp_500 = db.raw_sql(sp_500_query,date_cols=['start', 'ending', 'date'])
    dse = db.raw_sql("""
                        select comnam,ncusip, namedt, nameendt,permno
                        from crsp.dsenames
                        """, date_cols=['namedt', 'nameendt'])
    # A missing name end date is filled with today's date so the window
    # comparison below still works for those rows.
    dse['nameendt'] = dse['nameendt'].fillna(pd.to_datetime('today'))

    with_names = pd.merge(sp_500, dse, how='left', on='permno')
    # Keep only the name record whose validity window contains the trading date.
    in_name_window = (with_names['date'] >= with_names['namedt']) \
        & (with_names['date'] <= with_names['nameendt'])
    sp500_full = with_names.loc[
        in_name_window, ["permno", "date", "comnam", "ret", "openprc", "prc"]
    ].reset_index(drop=True)

    # `assign` returns new frames, so nothing is written onto a slice of
    # `with_names`; `.abs()` is the vectorised form of `apply(abs)`.
    sp500_full = sp500_full.assign(prc=sp500_full["prc"].abs())
    sp500_full = sp500_full.assign(
        CO_ret=(sp500_full["prc"] - sp500_full["openprc"]) / sp500_full["openprc"]
    )
    return sp500_full

In [19]:
# permno -> RavenPack entity id lookup table, read from the working directory.
mapping_file = pd.read_csv("SP500_Mapping_file.csv")
# Comma-separated, single-quoted entity ids, ready to drop into a SQL `IN (...)` list.
SP500_entity_id_str = ','.join(
    [f"'{entity_id}'" for entity_id in mapping_file["rp_entity_id"].tolist()]
)
mapping_file.head()

Unnamed: 0,permno,rp_entity_id
0,10104,D6489C
1,10107,228D42
2,10138,2F94A5
3,10145,FF6644
4,10516,2B7A40


In [20]:
def SP500_RP_headline_data(year_range):
    """Fetch RavenPack headlines for the mapped entities, one query per year.

    For every year from ``year_range[0]`` to ``year_range[-1]`` (inclusive)
    the table ``rpna.rpa_djpr_equities_<year>`` is queried through the
    module-level ``db`` connection, restricted to the entity ids listed in
    the module-level ``SP500_entity_id_str``.

    Parameters
    ----------
    year_range : sequence
        The first element is the first year and the last element is the last
        year. Elements must be convertible with ``int``.

    Returns
    -------
    pandas.DataFrame
        Columns ``timestamp_utc``, ``rp_entity_id`` and ``headline``, with one
        row kept per ``(rp_entity_id, headline)`` pair. The frame is empty
        (but has those columns) when the year span contains no years.
    """
    first_year, last_year = int(year_range[0]), int(year_range[-1])

    # Collect the yearly results and concatenate once; growing a DataFrame
    # with concat inside the loop re-copies everything fetched so far.
    yearly_frames = []
    for year in range(first_year, last_year + 1):
        RP_year_query =f"""
                        SELECT DISTINCT timestamp_utc,rp_entity_id,headline
                        FROM rpna.rpa_djpr_equities_{year}
                        WHERE rp_entity_id IN ({SP500_entity_id_str})
                        """
        yearly_frames.append(db.raw_sql(RP_year_query))

    if not yearly_frames:
        return pd.DataFrame(columns=["timestamp_utc", "rp_entity_id", "headline"])

    RP_df = pd.concat(yearly_frames, axis=0)
    # The first occurrence in fetch order is kept. The queries have no
    # ORDER BY, so which timestamp survives for a repeated headline follows
    # the row order returned by the database.
    return RP_df.drop_duplicates(subset=["rp_entity_id", "headline"])

In [42]:

def contem_ret(year_range):
    """Pair each RavenPack headline with the CRSP return of its contemporaneous trading day.

    A headline's contemporaneous day is the first NYSE trading day whose
    16:00 America/New_York mark is at or after the headline's timestamp
    (timestamps are read as UTC and converted to New York time). Every
    trading day is given a 16:00 close; early-close sessions are not
    special-cased. Headlines stamped after the last such close inside the
    requested years are dropped.

    Parameters
    ----------
    year_range : sequence
        The first element is the first year and the last element is the last
        year (both inclusive); forwarded unchanged to ``SP500_CRSP_data`` and
        ``SP500_RP_headline_data``.

    Returns
    -------
    pandas.DataFrame
        Columns ``date``, ``rp_entity_id``, ``comnam``, ``ret`` and
        ``headline``; one row per ``(rp_entity_id, headline)`` pair, with
        rows containing any missing value removed.
    """
    first_year, last_year = int(year_range[0]), int(year_range[-1])

    # Create financial dataframe from crsp and link to entity id
    sp500_crsp = SP500_CRSP_data(year_range)
    sp500_crsp_rpid = sp500_crsp.merge(mapping_file, on="permno", how="inner")
    sp500_crsp_rpid = sp500_crsp_rpid[["date", "rp_entity_id", "comnam", "ret"]]

    # Create RavenPack headline data; a fresh positional index keeps the
    # boolean/positional selections below unambiguous.
    sp500_rp = SP500_RP_headline_data(year_range).reset_index(drop=True)
    timestamp_ny = (
        pd.DatetimeIndex(pd.to_datetime(sp500_rp["timestamp_utc"]))
        .tz_localize("UTC")
        .tz_convert("America/New_York")
    )

    # Trading days as naive midnights, and their 16:00 New York closes.
    nyse = mcal.get_calendar("NYSE")
    trading_dates = nyse.valid_days(
        start_date=f"{first_year}-01-01", end_date=f"{last_year}-12-31"
    ).tz_localize(None)
    closing_times = trading_dates.tz_localize("America/New_York") + dt.timedelta(hours=16)

    # Position of the first close >= timestamp for every headline at once
    # (binary search) instead of rescanning the whole calendar per row.
    close_pos = closing_times.searchsorted(timestamp_ny, side="left")
    has_close = close_pos < len(closing_times)  # False: no close left in range

    sp500_rp_contem_ret = (
        sp500_rp.loc[has_close, ["rp_entity_id", "headline"]]
        .assign(contem_ret_date=trading_dates[close_pos[has_close]])
        .dropna()
    )

    # Merge crsp dataframe with RP dataframe
    sp500_crsp_rp_contem_ret = pd.merge(
        sp500_crsp_rpid,
        sp500_rp_contem_ret,
        left_on=["date", "rp_entity_id"],
        right_on=["contem_ret_date", "rp_entity_id"],
        how="inner",
    ).drop(columns="contem_ret_date")

    return (
        sp500_crsp_rp_contem_ret
        .drop_duplicates(subset=["rp_entity_id", "headline"])
        .dropna()
    )

In [45]:
def future_ret(year_range):
    """Pair each RavenPack headline with the next return that starts after it.

    For every headline the first 09:00 and the first 16:00 America/New_York
    marks on NYSE trading days at or after the timestamp are located:

    * both on the same trading day (headline arrived before that day's
      09:00 mark): the headline gets that day's ``CO_ret``;
    * otherwise (headline arrived between a day's 09:00 and 16:00 marks):
      the headline gets ``ret`` of the next trading day.

    Headlines with no such marks left inside the requested years are dropped.

    NOTE(review): the opening mark is 09:00, which is earlier than the
    regular 09:30 NYSE open -- presumably a deliberate cut-off; confirm.

    Parameters
    ----------
    year_range : sequence
        The first element is the first year and the last element is the last
        year (both inclusive); forwarded unchanged to ``SP500_CRSP_data`` and
        ``SP500_RP_headline_data``.

    Returns
    -------
    pandas.DataFrame
        Columns ``date``, ``rp_entity_id``, ``comnam``, ``headline`` and
        ``future_ret``; one row per ``(rp_entity_id, headline)`` pair, with
        rows containing any missing value removed.
    """
    first_year, last_year = int(year_range[0]), int(year_range[-1])

    # Create financial dataframe from crsp and link to entity id
    sp500_crsp = SP500_CRSP_data(year_range)
    sp500_crsp_rpid = sp500_crsp.merge(mapping_file, on="permno", how="inner")

    # Create RavenPack headline data; a fresh positional index keeps the
    # boolean/positional selections below unambiguous.
    sp500_rp = SP500_RP_headline_data(year_range).reset_index(drop=True)
    timestamp_ny = (
        pd.DatetimeIndex(pd.to_datetime(sp500_rp["timestamp_utc"]))
        .tz_localize("UTC")
        .tz_convert("America/New_York")
    )

    # Trading days as naive midnights plus their 09:00 / 16:00 New York marks.
    nyse = mcal.get_calendar("NYSE")
    trading_dates = nyse.valid_days(
        start_date=f"{first_year}-01-01", end_date=f"{last_year}-12-31"
    ).tz_localize(None)
    trading_midnights_ny = trading_dates.tz_localize("America/New_York")
    opening_times = trading_midnights_ny + dt.timedelta(hours=9)
    closing_times = trading_midnights_ny + dt.timedelta(hours=16)

    # First opening / closing mark >= timestamp for all headlines at once
    # (binary search) instead of two calendar scans per row.
    open_pos = opening_times.searchsorted(timestamp_ny, side="left")
    close_pos = closing_times.searchsorted(timestamp_ny, side="left")
    n_sessions = len(trading_dates)
    has_session = (open_pos < n_sessions) & (close_pos < n_sessions)

    sp500_rp = sp500_rp.loc[has_session].assign(
        future_ret_date=trading_dates[open_pos[has_session]],
        # True when the next opening and next closing fall on the same day.
        bool_CO_ret=(open_pos == close_pos)[has_session],
    )

    # Merge crsp dataframe with RP dataframe
    sp500_crsp_rp_future_ret = sp500_crsp_rpid.merge(
        sp500_rp,
        left_on=["date", "rp_entity_id"],
        right_on=["future_ret_date", "rp_entity_id"],
        how="inner",
    )
    sp500_crsp_rp_future_ret = sp500_crsp_rp_future_ret[
        ["date", "rp_entity_id", "comnam", "CO_ret", "ret", "headline", "bool_CO_ret"]
    ].drop_duplicates(subset=["rp_entity_id", "headline"])

    # CO_ret where the flag is set, ret everywhere else.
    use_co_ret = sp500_crsp_rp_future_ret["bool_CO_ret"].astype(bool)
    sp500_crsp_rp_future_ret = sp500_crsp_rp_future_ret.assign(
        future_ret=sp500_crsp_rp_future_ret["CO_ret"].where(
            use_co_ret, sp500_crsp_rp_future_ret["ret"]
        )
    ).drop(columns=["CO_ret", "ret", "bool_CO_ret"])

    # Rows are already unique on (rp_entity_id, headline), so no further
    # de-duplication is needed; only incomplete rows are removed.
    return sp500_crsp_rp_future_ret.dropna()

In [46]:
# Headline / future-return pairs for calendar year 2020 only.
a = future_ret((2020,2020))

In [50]:
# Headline / contemporaneous-return pairs for calendar year 2020 only.
b = contem_ret((2020,2020))

In [49]:
# (rows, columns) of the future-return dataset built by future_ret.
a.shape

(1445236, 5)

In [51]:
# (rows, columns) of the contemporaneous-return dataset built by contem_ret.
b.shape

(1445782, 5)