# Initialization

In [47]:
import wrds
import pandas as pd
import pytz
import datetime as dt
import pandas_market_calendars as mcal

In [48]:
db = wrds.Connection(wrds_username = "connorwz")

Loading library list...
Done


# Find SP500 constituents in 2023

In [49]:
sp500 = db.raw_sql("""
                        select a.*, b.date, b.ret, b.prc, b.openprc
                        from crsp.dsp500list as a,
                        crsp.dsf as b
                        where a.permno=b.permno
                        and b.date >= a.start and b.date<= a.ending
                        and b.date>='01/01/2023' and b.date<='12/31/2023'
                        order by date;
                        """, date_cols=['start', 'ending', 'date'])

In [50]:
dse = db.raw_sql("""
                        select comnam,ncusip, namedt, nameendt,permno
                        from crsp.dsenames
                        """, date_cols=['namedt', 'nameendt'])

# if nameendt is missing then set to today date
dse['nameendt']=dse['nameendt'].fillna(pd.to_datetime('today'))

In [51]:
# Merge with SP500 data
sp500_full_2023 = pd.merge(sp500, dse, how = 'left', on = 'permno')

# Impose the date range restrictions
sp500_full_2023 = sp500_full_2023.loc[(sp500_full_2023.date>=sp500_full_2023.namedt) \
                            & (sp500_full_2023.date<=sp500_full_2023.nameendt)]
sp500_full_2023.reset_index(inplace = True,drop = True)

In [52]:
sp500_full_2023 = sp500_full_2023[["permno","date","comnam","ret","openprc","prc"]]

## Dirty data check

In [53]:
# Negative sign means bid/ask average
sp500_full_2023[sp500_full_2023.prc <0]

Unnamed: 0,permno,date,comnam,ret,openprc,prc
23497,11786,2023-03-10,S V B FINANCIAL GROUP,-0.628725,,-39.37
54846,76841,2023-06-09,BIOGEN INC,0.025965,,-316.89999


In [54]:
# For 76841, bid/ask average represents closing price for a stock 
# with similar numerical values of adjacent days 
sp500_full_2023[sp500_full_2023.permno == 76841].loc[sp500_full_2023.date.isin(["2023-06-08","2023-06-09","2023-06-12"])]

  sp500_full_2023[sp500_full_2023.permno == 76841].loc[sp500_full_2023.date.isin(["2023-06-08","2023-06-09","2023-06-12"])]


Unnamed: 0,permno,date,comnam,ret,openprc,prc
54784,76841,2023-06-08,BIOGEN INC,0.013054,310.69,308.88
54846,76841,2023-06-09,BIOGEN INC,0.025965,,-316.89999
55503,76841,2023-06-12,BIOGEN INC,-0.011013,319.76001,313.41


In [55]:
# For 11786, this seems to be something wrong
sp500_full_2023[sp500_full_2023.permno == 11786][-3:]

Unnamed: 0,permno,date,comnam,ret,openprc,prc
22415,11786,2023-03-08,S V B FINANCIAL GROUP,0.001645,266.85999,267.82999
23058,11786,2023-03-09,S V B FINANCIAL GROUP,-0.604077,176.55,106.04
23497,11786,2023-03-10,S V B FINANCIAL GROUP,-0.628725,,-39.37


In [56]:
# 0 means neither closing price nor bid/ask average exists 
sp500_full_2023.prc[sp500_full_2023.prc == 0]

Series([], Name: prc, dtype: float64)

In [57]:
# There are some companies without open price which caused negative prices.
sp500_full_2023[sp500_full_2023.openprc.isna()]

Unnamed: 0,permno,date,comnam,ret,openprc,prc
23497,11786,2023-03-10,S V B FINANCIAL GROUP,-0.628725,,-39.37
54846,76841,2023-06-09,BIOGEN INC,0.025965,,-316.89999


In [58]:
# No dirty data for open price
(sp500_full_2023.openprc <= 0).sum()

0

In [59]:
# Nan values 
sp500_full_2023[sp500_full_2023.isna().sum(axis = 1) >0]

Unnamed: 0,permno,date,comnam,ret,openprc,prc
668,23570,2023-01-04,G E HEALTHCARE TECHNOLOGIES INC,,54.13,60.49
23497,11786,2023-03-10,S V B FINANCIAL GROUP,-0.628725,,-39.37
54846,76841,2023-06-09,BIOGEN INC,0.025965,,-316.89999
62718,23942,2023-07-03,FORTREA HOLDINGS INC,,33.8,36.84
63335,23944,2023-07-05,PHINIA INC,,29.89,36.75
94168,24175,2023-10-02,W K KELLOGG CO,,13.8,13.35
94208,24174,2023-10-02,VERALTO CORP,,83.11,85.12


In [60]:
# There is no duplicated values 
sp500_full_2023.duplicated().sum()

0

# Get opening-closing returns and closing-closing returns for SP500 constituents in 2023

In [61]:
# get closing-open return
sp500_full_2023["prc"] = sp500_full_2023["prc"].apply(abs)
sp500_full_2023["CO_ret"] = (sp500_full_2023['prc'] - sp500_full_2023['openprc'])/sp500_full_2023['openprc']

# Link CRSP to RavenPack

In [62]:
mapping_file = pd.read_csv("Mapping_file.csv")
# Drop those companies which don't exist in RP
sp500_2023_RPid = sp500_full_2023.merge(mapping_file, on = "permno", how = "inner")

In [63]:
print(sp500_2023_RPid[["permno","date","comnam","rp_entity_id"]].isna().sum().sum())
print(sp500_2023_RPid.duplicated().sum())

0
0


# Link headlines based on contemporaneous returns

In [64]:
# All RP's headlines in 2023 
RP_2023_query = """SELECT rpa_date_utc,timestamp_utc,rp_entity_id,entity_name,headline
                FROM rpna.rpa_djpr_equities_2023
            """
RP_2023 = db.raw_sql(RP_2023_query)

In [65]:
sp500_2023_RPid_backup = sp500_2023_RPid.copy()
RP_2023_backup = RP_2023.copy()

In [66]:
# Drop duplicates in consecutive days
RP_2023 = RP_2023.drop_duplicates(("rp_entity_id","headline"))

In [67]:
# Drop those non-sp500 constituents' headlines
RP_2023 = RP_2023[RP_2023.rp_entity_id.isin(sp500_2023_RPid.rp_entity_id.unique())]

In [68]:
sp500_2023_RPid.head()

Unnamed: 0,permno,date,comnam,ret,openprc,prc,CO_ret,rp_entity_id
0,39917,2023-01-03,WEYERHAEUSER CO,-0.004516,31.25,30.86,-0.01248,FF4BA4
1,82642,2023-01-03,LAUDER ESTEE COS INC,0.023377,256.17999,253.91,-0.008861,14ED2B
2,21178,2023-01-03,LOCKHEED MARTIN CORP,-0.018418,483.39001,477.53,-0.012123,96F126
3,66800,2023-01-03,AMERICAN INTERNATIONAL GROUP INC,-0.004902,63.45,62.93,-0.008195,0BC29E
4,64653,2023-01-03,PUBLIC STORAGE,-0.02145,281.07999,274.17999,-0.024548,AFEC35


In [69]:
RP_2023.head()

Unnamed: 0,rpa_date_utc,timestamp_utc,rp_entity_id,entity_name,headline
3,2023-01-01,2023-01-01 02:00:05.447,0157B1,Amazon.com Inc.,New Film Release: Cryptid Horror Movie Brings ...
4,2023-01-01,2023-01-01 07:00:11.264,0157B1,Amazon.com Inc.,The Craziest Moments From the Longest Tech Boo...
5,2023-01-01,2023-01-01 07:00:11.275,0157B1,Amazon.com Inc.,The Craziest Moments From the Longest Tech Boo...
6,2023-01-01,2023-01-01 10:30:00.085,0157B1,Amazon.com Inc.,The Year Big Tech Stocks Fell From Glory -- WSJ
7,2023-01-01,2023-01-01 13:00:00.055,0157B1,Amazon.com Inc.,Robust Job and Wage Growth Showed Signs of Coo...


In [70]:
RP_2023.rpa_date_utc = pd.to_datetime(RP_2023.rpa_date_utc)
sp500_2023_RPid.merge(RP_2023,left_on = ["date","rp_entity_id"], right_on = ["rpa_date_utc","rp_entity_id"]).head()

Unnamed: 0,permno,date,comnam,ret,openprc,prc,CO_ret,rp_entity_id,rpa_date_utc,timestamp_utc,entity_name,headline
0,39917,2023-01-03,WEYERHAEUSER CO,-0.004516,31.25,30.86,-0.01248,FF4BA4,2023-01-03,2023-01-03 21:42:37.069,Weyerhaeuser Co.,New York Closing Stocks
1,39917,2023-01-03,WEYERHAEUSER CO,-0.004516,31.25,30.86,-0.01248,FF4BA4,2023-01-03,2023-01-03 21:53:55.832,Weyerhaeuser Co.,Extra Space Storage Inc. Stock Outperforms Com...
2,39917,2023-01-03,WEYERHAEUSER CO,-0.004516,31.25,30.86,-0.01248,FF4BA4,2023-01-03,2023-01-03 22:02:55.434,Weyerhaeuser Co.,International Paper Co. Stock Outperforms Mark...
3,39917,2023-01-03,WEYERHAEUSER CO,-0.004516,31.25,30.86,-0.01248,FF4BA4,2023-01-03,2023-01-03 22:33:56.797,Weyerhaeuser Co.,"Weyerhaeuser Co. Stock Falls Tuesday, Underper..."
4,82642,2023-01-03,LAUDER ESTEE COS INC,0.023377,256.17999,253.91,-0.008861,14ED2B,2023-01-03,2023-01-03 13:00:04.072,Estee Lauder Cos. Inc.,Veganuary 2023 sign-ups at record rate - one p...


In [71]:
RP_2023.set_index("timestamp_utc",inplace= True)
RP_2023["timestamp_NY"] = pd.to_datetime(RP_2023.index).tz_localize("UTC").tz_convert("America/New_York")

In [72]:
nyse = mcal.get_calendar("NYSE")
nyse_trading_2023 = nyse.valid_days(start_date="2022-12-31",end_date="2023-12-31").tz_localize(None).tz_localize("America/New_York")

In [73]:
nyse_trading_2023_closing = nyse_trading_2023 + dt.timedelta(hours = 16)

In [74]:
def contem_ret_date(timestamp):
    later_time = nyse_trading_2023_closing[nyse_trading_2023_closing >= timestamp]
    return later_time[0].date() if not later_time.empty else None

In [75]:
RP_2023['contem_ret_date'] = RP_2023["timestamp_NY"].apply(contem_ret_date)

In [76]:
RP_2023.reset_index(inplace = True)
RP_2023_contem_ret = RP_2023[["contem_ret_date","rp_entity_id","headline"]]
RP_2023_contem_ret = RP_2023_contem_ret.dropna()
sp500_2023_RPid_contem_ret = sp500_2023_RPid[["date","rp_entity_id","comnam","ret"]]
RP_2023_contem_ret.contem_ret_date = pd.to_datetime(RP_2023_contem_ret.contem_ret_date)
SP500_RP_contem_ret_2023 = pd.merge(sp500_2023_RPid_contem_ret,RP_2023_contem_ret,left_on=["date","rp_entity_id"],\
                                    right_on=["contem_ret_date","rp_entity_id"],how = "inner").drop(columns = "contem_ret_date")

In [77]:
SP500_RP_contem_ret_2023.head()

Unnamed: 0,date,rp_entity_id,comnam,ret,headline
0,2023-01-03,14ED2B,LAUDER ESTEE COS INC,0.023377,Steady Stream of Multimillion Dollar Gifts Pro...
1,2023-01-03,14ED2B,LAUDER ESTEE COS INC,0.023377,Veganuary 2023 sign-ups at record rate - one p...
2,2023-01-03,14ED2B,LAUDER ESTEE COS INC,0.023377,Estee Lauder Is Maintained at Overweight by We...
3,2023-01-03,14ED2B,LAUDER ESTEE COS INC,0.023377,Estee Lauder Price Target Raised to $275.00/Sh...
4,2023-01-03,96F126,LOCKHEED MARTIN CORP,-0.018418,Airborne Sonar Global Market Report 2022: Tech...


In [78]:
SP500_RP_contem_ret_2023.duplicated().sum()
SP500_RP_contem_ret_2023.isna().sum().sum()

0

# Link headlines based on future returns

In [79]:
nyse_trading_2023_opening = nyse_trading_2023 + dt.timedelta(hours = 9)
# 1 stands for CO_ret
def future_ret_date(timestamp):
    later_opening = nyse_trading_2023_opening[nyse_trading_2023_opening>=timestamp]
    later_closing = nyse_trading_2023_closing[nyse_trading_2023_closing>=timestamp]
    if (not later_opening.empty) & (not later_closing.empty):
        next_opening = later_opening[0]
        next_closing = later_closing[0]
        if next_opening.date() == next_closing.date():
            return [next_opening.date(),1]
        else: 
            return [next_opening.date(),0]
    else:
        return [None,None]

In [80]:
RP_2023_future_ret_date = RP_2023.apply(lambda row:future_ret_date(row['timestamp_NY']),axis = 1, result_type="expand")

In [81]:
RP_2023_future_ret_date = RP_2023_future_ret_date.rename(columns = {0:"future_ret_date",1:"bool_CO_ret"}) 
RP_2023 = pd.concat([RP_2023,RP_2023_future_ret_date],axis = 1)

**Duplicate's problem**

In [82]:
sp500_2023_RPid.head()

Unnamed: 0,permno,date,comnam,ret,openprc,prc,CO_ret,rp_entity_id
0,39917,2023-01-03,WEYERHAEUSER CO,-0.004516,31.25,30.86,-0.01248,FF4BA4
1,82642,2023-01-03,LAUDER ESTEE COS INC,0.023377,256.17999,253.91,-0.008861,14ED2B
2,21178,2023-01-03,LOCKHEED MARTIN CORP,-0.018418,483.39001,477.53,-0.012123,96F126
3,66800,2023-01-03,AMERICAN INTERNATIONAL GROUP INC,-0.004902,63.45,62.93,-0.008195,0BC29E
4,64653,2023-01-03,PUBLIC STORAGE,-0.02145,281.07999,274.17999,-0.024548,AFEC35


In [83]:
sp500_2023_RPid.groupby("permno").filter(lambda sub:len(sub.rp_entity_id.unique()) > 1)

Unnamed: 0,permno,date,comnam,ret,openprc,prc,CO_ret,rp_entity_id


In [84]:
RP_2023.head()

Unnamed: 0,timestamp_utc,rpa_date_utc,rp_entity_id,entity_name,headline,timestamp_NY,contem_ret_date,future_ret_date,bool_CO_ret
0,2023-01-01 02:00:05.447,2023-01-01,0157B1,Amazon.com Inc.,New Film Release: Cryptid Horror Movie Brings ...,2022-12-31 21:00:05.447000-05:00,2023-01-03,2023-01-03,1.0
1,2023-01-01 07:00:11.264,2023-01-01,0157B1,Amazon.com Inc.,The Craziest Moments From the Longest Tech Boo...,2023-01-01 02:00:11.264000-05:00,2023-01-03,2023-01-03,1.0
2,2023-01-01 07:00:11.275,2023-01-01,0157B1,Amazon.com Inc.,The Craziest Moments From the Longest Tech Boo...,2023-01-01 02:00:11.275000-05:00,2023-01-03,2023-01-03,1.0
3,2023-01-01 10:30:00.085,2023-01-01,0157B1,Amazon.com Inc.,The Year Big Tech Stocks Fell From Glory -- WSJ,2023-01-01 05:30:00.085000-05:00,2023-01-03,2023-01-03,1.0
4,2023-01-01 13:00:00.055,2023-01-01,0157B1,Amazon.com Inc.,Robust Job and Wage Growth Showed Signs of Coo...,2023-01-01 08:00:00.055000-05:00,2023-01-03,2023-01-03,1.0


In [85]:
RP_2023.duplicated(("rp_entity_id","headline")).sum()

0

In [86]:
RP_2023.future_ret_date = pd.to_datetime(RP_2023.future_ret_date)
SP500_RP_future_ret_2023 = sp500_2023_RPid.merge(RP_2023,left_on=["date","rp_entity_id"],right_on = ["future_ret_date","rp_entity_id"],how = "inner")
SP500_RP_future_ret_2023 = SP500_RP_future_ret_2023[["date","rp_entity_id","comnam","CO_ret","ret","headline","bool_CO_ret"]]

In [87]:
SP500_RP_future_ret_2023.head()

Unnamed: 0,date,rp_entity_id,comnam,CO_ret,ret,headline,bool_CO_ret
0,2023-01-03,14ED2B,LAUDER ESTEE COS INC,-0.008861,0.023377,Steady Stream of Multimillion Dollar Gifts Pro...,1.0
1,2023-01-03,14ED2B,LAUDER ESTEE COS INC,-0.008861,0.023377,Veganuary 2023 sign-ups at record rate - one p...,1.0
2,2023-01-03,96F126,LOCKHEED MARTIN CORP,-0.012123,-0.018418,Airborne Sonar Global Market Report 2022: Tech...,1.0
3,2023-01-03,AFEC35,PUBLIC STORAGE,-0.024548,-0.02145,Here Are Barron's 12 Best Income Investments f...,1.0
4,2023-01-03,DD682D,ASSURANT INC,0.01434,0.018071,"At CES 2023, Leading Research Firm Parks Assoc...",1.0


In [88]:
# Where do these duplicates come from
SP500_RP_future_ret_2023.duplicated(["rp_entity_id","headline"]).sum()

62584

In [89]:
# Examples
SP500_RP_future_ret_2023[SP500_RP_future_ret_2023.duplicated(["rp_entity_id","headline"],keep = False)]

Unnamed: 0,date,rp_entity_id,comnam,CO_ret,ret,headline,bool_CO_ret
147,2023-01-03,DD1BA1,NEWS CORP NEW,-0.004308,0.002711,"California Storm Floods Roads, Cuts Power on N...",1.0
154,2023-01-03,DD1BA1,NEWS CORP NEW,-0.003811,0.005494,"California Storm Floods Roads, Cuts Power on N...",1.0
155,2023-01-03,7BFF81,FOX CORP,-0.003516,-0.003866,Divided Government Set to Return With Start of...,1.0
156,2023-01-03,7BFF81,FOX CORP,-0.003516,-0.003866,Kevin McCarthy's House Speaker Bid Teeters Ahe...,1.0
157,2023-01-03,7BFF81,FOX CORP,-0.003516,-0.003866,Global View: The World According to Tom Cotto...,1.0
...,...,...,...,...,...,...,...
1748680,2023-12-29,4A6F00,ALPHABET INC,0.000430,-0.003851,Gen AI Hype Grips Telecom Industry as Telcos U...,1.0
1748681,2023-12-29,4A6F00,ALPHABET INC,0.000430,-0.003851,"Hankyung.com: ""AI technology to feed the Korea...",1.0
1748682,2023-12-29,4A6F00,ALPHABET INC,0.000430,-0.003851,Dow Jones Futures Dip Ahead Of Final Session O...,1.0
1748683,2023-12-29,4A6F00,ALPHABET INC,0.000430,-0.003851,Valmet Oyj: Valmet Oyj: Notification according...,1.0


In [90]:
SP500_RP_future_ret_2023 = SP500_RP_future_ret_2023.drop_duplicates(subset = ["rp_entity_id","headline"])
SP500_RP_future_ret_2023["future_ret"] = SP500_RP_future_ret_2023.apply(lambda row: row["CO_ret"] if row["bool_CO_ret"] \
                                                                        else (row["ret"] if not row["bool_CO_ret"]  else None), axis = 1)
SP500_RP_future_ret_2023 = SP500_RP_future_ret_2023.drop(columns = ['CO_ret','ret','bool_CO_ret'])

In [91]:
SP500_RP_future_ret_2023.head()

Unnamed: 0,date,rp_entity_id,comnam,headline,future_ret
0,2023-01-03,14ED2B,LAUDER ESTEE COS INC,Steady Stream of Multimillion Dollar Gifts Pro...,-0.008861
1,2023-01-03,14ED2B,LAUDER ESTEE COS INC,Veganuary 2023 sign-ups at record rate - one p...,-0.008861
2,2023-01-03,96F126,LOCKHEED MARTIN CORP,Airborne Sonar Global Market Report 2022: Tech...,-0.012123
3,2023-01-03,AFEC35,PUBLIC STORAGE,Here Are Barron's 12 Best Income Investments f...,-0.024548
4,2023-01-03,DD682D,ASSURANT INC,"At CES 2023, Leading Research Firm Parks Assoc...",0.01434


In [92]:
# NA comes from None opening prices
print(SP500_RP_future_ret_2023.duplicated().sum())
print(SP500_RP_future_ret_2023.isna().sum().sum())

0
148
