# Initialization

In [592]:
import wrds
import pandas as pd
import pytz
import datetime as dt
import pandas_market_calendars as mcal

In [593]:
db = wrds.Connection(wrds_username = "connorwz")

Loading library list...
Done


# Find SP500 constituents in 2023

In [594]:
sp500 = db.raw_sql("""
                        select a.*, b.date, b.ret, b.prc, b.openprc
                        from crsp.dsp500list as a,
                        crsp.dsf as b
                        where a.permno=b.permno
                        and b.date >= a.start and b.date<= a.ending
                        and b.date>='01/01/2023' and b.date<='12/31/2023'
                        order by date;
                        """, date_cols=['start', 'ending', 'date'])

In [595]:
dse = db.raw_sql("""
                        select comnam,ncusip, namedt, nameendt,permno
                        from crsp.dsenames
                        """, date_cols=['namedt', 'nameendt'])

# if nameendt is missing then set to today date
dse['nameendt']=dse['nameendt'].fillna(pd.to_datetime('today'))

In [596]:
# Merge with SP500 data
sp500_full_2023 = pd.merge(sp500, dse, how = 'left', on = 'permno')

# Impose the date range restrictions
sp500_full_2023 = sp500_full_2023.loc[(sp500_full_2023.date>=sp500_full_2023.namedt) \
                            & (sp500_full_2023.date<=sp500_full_2023.nameendt)]
sp500_full_2023.reset_index(inplace = True,drop = True)

In [597]:
sp500_full_2023 = sp500_full_2023[["permno","date","comnam","ret","openprc","prc"]]

## Dirty data check

In [598]:
# Negative sign means bid/ask average
sp500_full_2023[sp500_full_2023.prc <0]

Unnamed: 0,permno,date,comnam,ret,openprc,prc
23150,11786,2023-03-10,S V B FINANCIAL GROUP,-0.628725,,-39.37
55029,76841,2023-06-09,BIOGEN INC,0.025965,,-316.89999


In [599]:
# For 76841, bid/ask average represents closing price for a stock 
# with similar numerical values of adjacent days 
sp500_full_2023[sp500_full_2023.permno == 76841].loc[sp500_full_2023.date.isin(["2023-06-08","2023-06-09","2023-06-12"])]

  sp500_full_2023[sp500_full_2023.permno == 76841].loc[sp500_full_2023.date.isin(["2023-06-08","2023-06-09","2023-06-12"])]


Unnamed: 0,permno,date,comnam,ret,openprc,prc
54410,76841,2023-06-08,BIOGEN INC,0.013054,310.69,308.88
55029,76841,2023-06-09,BIOGEN INC,0.025965,,-316.89999
55737,76841,2023-06-12,BIOGEN INC,-0.011013,319.76001,313.41


In [600]:
# For 11786, this seems to be something wrong
sp500_full_2023[sp500_full_2023.permno == 11786][-3:]

Unnamed: 0,permno,date,comnam,ret,openprc,prc
22458,11786,2023-03-08,S V B FINANCIAL GROUP,0.001645,266.85999,267.82999
22939,11786,2023-03-09,S V B FINANCIAL GROUP,-0.604077,176.55,106.04
23150,11786,2023-03-10,S V B FINANCIAL GROUP,-0.628725,,-39.37


In [601]:
# 0 means neither closing price nor bid/ask average exists 
sp500_full_2023.prc[sp500_full_2023.prc == 0]

Series([], Name: prc, dtype: float64)

In [602]:
# There are some companies without open price which caused negative prices.
sp500_full_2023[sp500_full_2023.openprc.isna()]

Unnamed: 0,permno,date,comnam,ret,openprc,prc
23150,11786,2023-03-10,S V B FINANCIAL GROUP,-0.628725,,-39.37
55029,76841,2023-06-09,BIOGEN INC,0.025965,,-316.89999


In [603]:
# No dirty data for open price
(sp500_full_2023.openprc <= 0).sum()

0

In [604]:
# Nan values 
sp500_full_2023[sp500_full_2023.isna().sum(axis = 1) >0]

Unnamed: 0,permno,date,comnam,ret,openprc,prc
828,23570,2023-01-04,G E HEALTHCARE TECHNOLOGIES INC,,54.13,60.49
23150,11786,2023-03-10,S V B FINANCIAL GROUP,-0.628725,,-39.37
55029,76841,2023-06-09,BIOGEN INC,0.025965,,-316.89999
62527,23942,2023-07-03,FORTREA HOLDINGS INC,,33.8,36.84
63204,23944,2023-07-05,PHINIA INC,,29.89,36.75
94120,24174,2023-10-02,VERALTO CORP,,83.11,85.12
94163,24175,2023-10-02,W K KELLOGG CO,,13.8,13.35


In [605]:
# There is no duplicated values 
sp500_full_2023.duplicated().sum()

0

# Get opening-closing returns and closing-closing returns for SP500 constituents in 2023

In [606]:
# get closing-open return
sp500_full_2023["prc"] = sp500_full_2023["prc"].apply(abs)
sp500_full_2023["CO_ret"] = (sp500_full_2023['prc'] - sp500_full_2023['openprc'])/sp500_full_2023['openprc']

# Link CRSP to RavenPack

In [607]:
mapping_file_query = """ 
                    SELECT DISTINCT 
                    a.permno, b.rp_entity_id 
                    FROM (SELECT * FROM crsp.dse WHERE ncusip IS NOT NULL) as a,
                    rpna.wrds_company_names as b
                    WHERE a.ncusip=substr(b.isin,3,8)
"""
mapping_file = db.raw_sql(mapping_file_query)

In [608]:
# There are companies mapped to more than one entity_id
mapping_file.groupby("permno").filter(lambda sub:sub.shape[0]>1).sort_values("permno")

Unnamed: 0,permno,rp_entity_id
1581,10066,179A00
571,10066,2E83D7
10538,10082,686DB1
13893,10082,229150
11921,10560,C8C45A
...,...,...
11032,90722,D553E8
1319,92010,3E387A
1863,92010,133899
12498,92685,A1C951


In [609]:
# All RP's headlines in 2023 
RP_2023_query = """SELECT rpa_date_utc,timestamp_utc,rp_entity_id,entity_name,headline
                FROM rpna.rpa_djpr_equities_2023
            """
RP_2023 = db.raw_sql(RP_2023_query)

In [610]:
# Remove those entity_ids which don't exist in RP
mapping_file = mapping_file[mapping_file.rp_entity_id.isin(RP_2023.rp_entity_id.unique())]

In [611]:
# Drop those companies which don't exist in RP
sp500_2023_RPid = sp500_full_2023.merge(mapping_file, on = "permno", how = "inner")

In [615]:
len(sp500_2023_RPid.permno.unique()) < len(sp500_full_2023.permno.unique())

True

In [612]:
# No permno is redundantly mapped
sp500_2023_RPid.groupby("permno").filter(lambda sub:len(sub.rp_entity_id.unique())>1)

Unnamed: 0,permno,date,comnam,ret,openprc,prc,CO_ret,rp_entity_id


In [613]:
print(sp500_2023_RPid[["permno","date","comnam","rp_entity_id"]].isna().sum().sum())
print(sp500_2023_RPid.duplicated().sum())

0
0


# Link headlines based on contemporaneous returns

In [619]:
sp500_2023_RPid_backup = sp500_2023_RPid.copy()
RP_2023_backup = RP_2023.copy()

In [623]:
# Drop duplicates in consecutive days
RP_2023 = RP_2023.drop_duplicates(("rp_entity_id","headline"))

In [625]:
# Drop those non-sp500 constituents' headlines
RP_2023 = RP_2023[RP_2023.rp_entity_id.isin(sp500_2023_RPid.rp_entity_id.unique())]

In [627]:
RP_2023.set_index("timestamp_utc",inplace= True)
RP_2023["timestamp_NY"] = pd.to_datetime(RP_2023.index).tz_localize("UTC").tz_convert("America/New_York")

In [628]:
nyse = mcal.get_calendar("NYSE")
nyse_trading_2023 = nyse.valid_days(start_date="2022-12-31",end_date="2023-12-31").tz_localize(None).tz_localize("America/New_York")

In [629]:
nyse_trading_2023_closing = nyse_trading_2023 + dt.timedelta(hours = 16)

In [630]:
def contem_ret_date(timestamp):
    later_time = nyse_trading_2023_closing[nyse_trading_2023_closing >= timestamp]
    return later_time[0].date() if not later_time.empty else None

In [631]:
RP_2023['contem_ret_date'] = RP_2023["timestamp_NY"].apply(contem_ret_date)

In [632]:
RP_2023.reset_index(inplace = True)
RP_2023_contem_ret = RP_2023[["contem_ret_date","rp_entity_id","headline"]]
RP_2023_contem_ret = RP_2023_contem_ret.dropna()
sp500_2023_RPid_contem_ret = sp500_2023_RPid[["date","rp_entity_id","comnam","ret"]]
RP_2023_contem_ret.contem_ret_date = pd.to_datetime(RP_2023_contem_ret.contem_ret_date)
SP500_RP_contem_ret_2023 = pd.merge(sp500_2023_RPid_contem_ret,RP_2023_contem_ret,left_on=["date","rp_entity_id"],\
                                    right_on=["contem_ret_date","rp_entity_id"],how = "inner").drop(columns = "contem_ret_date")

In [633]:
SP500_RP_contem_ret_2023.head()

Unnamed: 0,date,rp_entity_id,comnam,ret,headline
0,2023-01-03,BDD12C,WHIRLPOOL CORP,0.016047,"Daikin Acquires Williams Distributing, a Distr..."
1,2023-01-03,652E62,P G & E CORP,-0.03567,PG&E Cut to Neutral From Buy by Jefferies
2,2023-01-03,652E62,P G & E CORP,-0.03567,PG&E Price Target Announced at $17.00/Share by...
3,2023-01-03,652E62,P G & E CORP,-0.03567,MW PG&E downgraded to neutral from buy at UBS
4,2023-01-03,652E62,P G & E CORP,-0.03567,Correction to PG&E Ratings Headlines


In [634]:
SP500_RP_contem_ret_2023.duplicated().sum()
SP500_RP_contem_ret_2023.isna().sum().sum()

0

# Link headlines based on future returns

In [640]:
nyse_trading_2023_opening = nyse_trading_2023 + dt.timedelta(hours = 9)
# 1 stands for CO_ret
def future_ret_date(timestamp):
    later_opening = nyse_trading_2023_opening[nyse_trading_2023_opening>=timestamp]
    later_closing = nyse_trading_2023_closing[nyse_trading_2023_closing>=timestamp]
    if (not later_opening.empty) & (not later_closing.empty):
        next_opening = later_opening[0]
        next_closing = later_closing[0]
        if next_opening.date() == next_closing.date():
            return [next_opening.date(),1]
        else: 
            return [next_opening.date(),0]
    else:
        return [None,None]

In [651]:
RP_2023_future_ret_date = RP_2023.apply(lambda row:future_ret_date(row['timestamp_NY']),axis = 1, result_type="expand")

In [658]:
RP_2023_future_ret_date = RP_2023_future_ret_date.rename(columns = {0:"future_ret_date",1:"bool_CO_ret"}) 
RP_2023 = pd.concat([RP_2023,RP_2023_future_ret_date],axis = 1)

**Duplicate's problem**

In [737]:
sp500_2023_RPid.head()

Unnamed: 0,permno,date,comnam,ret,openprc,prc,CO_ret,rp_entity_id
0,25419,2023-01-03,WHIRLPOOL CORP,0.016047,144.12,143.73,-0.002706,BDD12C
1,13688,2023-01-03,P G & E CORP,-0.03567,15.99,15.68,-0.019387,652E62
2,92402,2023-01-03,M S C I INC,-0.008621,469.57001,461.16,-0.01791,3ED92D
3,63467,2023-01-03,BROWN & BROWN INC,-0.005617,57.06,56.65,-0.007185,C598D7
4,81736,2023-01-03,RESMED INC,0.004709,210.07001,209.11,-0.00457,434F38


In [738]:
sp500_2023_RPid.groupby("permno").filter(lambda sub:len(sub.rp_entity_id.unique()) > 1)

Unnamed: 0,permno,date,comnam,ret,openprc,prc,CO_ret,rp_entity_id


In [735]:
RP_2023.head()

Unnamed: 0,timestamp_utc,rpa_date_utc,rp_entity_id,entity_name,headline,timestamp_NY,contem_ret_date,future_ret_date,bool_CO_ret
0,2023-01-01 02:00:05.447,2023-01-01,0157B1,Amazon.com Inc.,New Film Release: Cryptid Horror Movie Brings ...,2022-12-31 21:00:05.447000-05:00,2023-01-03,2023-01-03,1.0
1,2023-01-01 07:00:11.264,2023-01-01,0157B1,Amazon.com Inc.,The Craziest Moments From the Longest Tech Boo...,2023-01-01 02:00:11.264000-05:00,2023-01-03,2023-01-03,1.0
2,2023-01-01 07:00:11.275,2023-01-01,0157B1,Amazon.com Inc.,The Craziest Moments From the Longest Tech Boo...,2023-01-01 02:00:11.275000-05:00,2023-01-03,2023-01-03,1.0
3,2023-01-01 10:30:00.085,2023-01-01,0157B1,Amazon.com Inc.,The Year Big Tech Stocks Fell From Glory -- WSJ,2023-01-01 05:30:00.085000-05:00,2023-01-03,2023-01-03,1.0
4,2023-01-01 13:00:00.055,2023-01-01,0157B1,Amazon.com Inc.,Robust Job and Wage Growth Showed Signs of Coo...,2023-01-01 08:00:00.055000-05:00,2023-01-03,2023-01-03,1.0


In [736]:
RP_2023.duplicated(("rp_entity_id","headline")).sum()

0

In [746]:
RP_2023.future_ret_date = pd.to_datetime(RP_2023.future_ret_date)
SP500_RP_future_ret_2023 = sp500_2023_RPid.merge(RP_2023,left_on=["date","rp_entity_id"],right_on = ["future_ret_date","rp_entity_id"],how = "inner")
SP500_RP_future_ret_2023 = SP500_RP_future_ret_2023[["date","rp_entity_id","comnam","CO_ret","ret","headline","bool_CO_ret"]]

In [747]:
SP500_RP_future_ret_2023.head()

Unnamed: 0,date,rp_entity_id,comnam,CO_ret,ret,headline,bool_CO_ret
0,2023-01-03,652E62,P G & E CORP,-0.019387,-0.03567,PG&E Cut to Neutral From Buy by Jefferies,1.0
1,2023-01-03,652E62,P G & E CORP,-0.019387,-0.03567,PG&E Price Target Announced at $17.00/Share by...,1.0
2,2023-01-03,434F38,RESMED INC,-0.00457,0.004709,ResMed Announces Participation in the 41st Ann...,1.0
3,2023-01-03,434F38,RESMED INC,-0.00457,0.004709,Sydney Closing Stock Prices -2-,1.0
4,2023-01-03,434F38,RESMED INC,-0.00457,0.004709,Press Release: ResMed Announces Participation ...,1.0


In [748]:
# Where do these duplicates come from
SP500_RP_future_ret_2023.duplicated(["rp_entity_id","headline"]).sum()

62584

In [749]:
# Examples
SP500_RP_future_ret_2023[SP500_RP_future_ret_2023.duplicated(["rp_entity_id","headline"],keep = False)]

Unnamed: 0,date,rp_entity_id,comnam,CO_ret,ret,headline,bool_CO_ret
1383,2023-01-03,4A6F00,ALPHABET INC,-0.005191,0.010087,MW 'Our social skills are like any other muscl...,1.0
1384,2023-01-03,4A6F00,ALPHABET INC,-0.005191,0.010087,New Film Release: Cryptid Horror Movie Brings ...,1.0
1385,2023-01-03,4A6F00,ALPHABET INC,-0.005191,0.010087,Astounding 2023 New Year's Resolution: Longtim...,1.0
1386,2023-01-03,4A6F00,ALPHABET INC,-0.005191,0.010087,Credefi Scores Major Milestone in Partnership ...,1.0
1387,2023-01-03,4A6F00,ALPHABET INC,-0.005191,0.010087,Press Release: Credefi Scores Major Milestone ...,1.0
...,...,...,...,...,...,...,...
1745616,2023-12-29,4A6F00,ALPHABET INC,0.001777,-0.002477,Gen AI Hype Grips Telecom Industry as Telcos U...,1.0
1745617,2023-12-29,4A6F00,ALPHABET INC,0.001777,-0.002477,"Hankyung.com: ""AI technology to feed the Korea...",1.0
1745618,2023-12-29,4A6F00,ALPHABET INC,0.001777,-0.002477,Dow Jones Futures Dip Ahead Of Final Session O...,1.0
1745619,2023-12-29,4A6F00,ALPHABET INC,0.001777,-0.002477,Valmet Oyj: Valmet Oyj: Notification according...,1.0


In [750]:
SP500_RP_future_ret_2023 = SP500_RP_future_ret_2023.drop_duplicates(subset = ["rp_entity_id","headline"])
SP500_RP_future_ret_2023["future_ret"] = SP500_RP_future_ret_2023.apply(lambda row: row["CO_ret"] if row["bool_CO_ret"] \
                                                                        else (row["ret"] if not row["bool_CO_ret"]  else None), axis = 1)
SP500_RP_future_ret_2023 = SP500_RP_future_ret_2023.drop(columns = ['CO_ret','ret','bool_CO_ret'])

In [751]:
SP500_RP_future_ret_2023.head()

Unnamed: 0,date,rp_entity_id,comnam,headline,future_ret
0,2023-01-03,652E62,P G & E CORP,PG&E Cut to Neutral From Buy by Jefferies,-0.019387
1,2023-01-03,652E62,P G & E CORP,PG&E Price Target Announced at $17.00/Share by...,-0.019387
2,2023-01-03,434F38,RESMED INC,ResMed Announces Participation in the 41st Ann...,-0.00457
3,2023-01-03,434F38,RESMED INC,Sydney Closing Stock Prices -2-,-0.00457
4,2023-01-03,434F38,RESMED INC,Press Release: ResMed Announces Participation ...,-0.00457


In [752]:
# NA comes from None opening prices
print(SP500_RP_future_ret_2023.duplicated().sum())
print(SP500_RP_future_ret_2023.isna().sum().sum())

0
148
