# Initialization

In [68]:
import wrds
import pandas as pd
import pytz
import datetime as dt
import pandas_market_calendars as mcal

In [337]:
db = wrds.Connection(wrds_username = "connorwz")

Loading library list...
Done


# Find SP500 constituents in 2023

In [350]:
sp500 = db.raw_sql("""
                        select a.*, b.date, b.ret, b.prc, b.openprc
                        from crsp.dsp500list as a,
                        crsp.dsf as b
                        where a.permno=b.permno
                        and b.date >= a.start and b.date<= a.ending
                        and b.date>='01/01/2023' and b.date<='12/31/2023'
                        order by date;
                        """, date_cols=['start', 'ending', 'date'])

In [352]:
dse = db.raw_sql("""
                        select comnam,ncusip, namedt, nameendt,permno
                        from crsp.dsenames
                        """, date_cols=['namedt', 'nameendt'])

# if nameendt is missing then set to today date
dse['nameendt']=dse['nameendt'].fillna(pd.to_datetime('today'))

In [353]:
# Merge with SP500 data
sp500_full_2023 = pd.merge(sp500, dse, how = 'left', on = 'permno')

# Impose the date range restrictions
sp500_full_2023 = sp500_full_2023.loc[(sp500_full_2023.date>=sp500_full_2023.namedt) \
                            & (sp500_full_2023.date<=sp500_full_2023.nameendt)]
sp500_full_2023.reset_index(inplace = True,drop = True)

In [355]:
sp500_full_2023 = sp500_full_2023[["permno","date","comnam","ret","openprc","prc"]]

## Dirty data check

In [357]:
# Negative sign means bid/ask average
sp500_full_2023[sp500_full_2023.prc <0]

Unnamed: 0,permno,date,comnam,ret,openprc,prc
23233,11786,2023-03-10,S V B FINANCIAL GROUP,-0.628725,,-39.37
54957,76841,2023-06-09,BIOGEN INC,0.025965,,-316.89999


In [358]:
# For 76841, bid/ask average represents closing price for a stock 
# with similar numerical values of adjacent days 
sp500_full_2023[sp500_full_2023.permno == 76841].loc[sp500_full_2023.date.isin(["2023-06-08","2023-06-09","2023-06-12"])]

  sp500_full_2023[sp500_full_2023.permno == 76841].loc[sp500_full_2023.date.isin(["2023-06-08","2023-06-09","2023-06-12"])]


Unnamed: 0,permno,date,comnam,ret,openprc,prc
54419,76841,2023-06-08,BIOGEN INC,0.013054,310.69,308.88
54957,76841,2023-06-09,BIOGEN INC,0.025965,,-316.89999
55586,76841,2023-06-12,BIOGEN INC,-0.011013,319.76001,313.41


In [359]:
# For 11786, this seems to be something wrong
sp500_full_2023[sp500_full_2023.permno == 11786][-3:]

Unnamed: 0,permno,date,comnam,ret,openprc,prc
22186,11786,2023-03-08,S V B FINANCIAL GROUP,0.001645,266.85999,267.82999
22842,11786,2023-03-09,S V B FINANCIAL GROUP,-0.604077,176.55,106.04
23233,11786,2023-03-10,S V B FINANCIAL GROUP,-0.628725,,-39.37


In [360]:
# 0 means neither closing price nor bid/ask average exists 
sp500_full_2023.prc[sp500_full_2023.prc == 0]

Series([], Name: prc, dtype: float64)

In [361]:
# There are some companies without open price which caused negative prices.
sp500_full_2023[sp500_full_2023.openprc.isna()]

Unnamed: 0,permno,date,comnam,ret,openprc,prc
23233,11786,2023-03-10,S V B FINANCIAL GROUP,-0.628725,,-39.37
54957,76841,2023-06-09,BIOGEN INC,0.025965,,-316.89999


In [362]:
# No dirty data for open price
(sp500_full_2023.openprc <= 0).sum()

0

In [363]:
# Nan values 
sp500_full_2023[sp500_full_2023.isna().sum(axis = 1) >0]

Unnamed: 0,permno,date,comnam,ret,openprc,prc
619,23570,2023-01-04,G E HEALTHCARE TECHNOLOGIES INC,,54.13,60.49
23233,11786,2023-03-10,S V B FINANCIAL GROUP,-0.628725,,-39.37
54957,76841,2023-06-09,BIOGEN INC,0.025965,,-316.89999
62788,23942,2023-07-03,FORTREA HOLDINGS INC,,33.8,36.84
63178,23944,2023-07-05,PHINIA INC,,29.89,36.75
94442,24175,2023-10-02,W K KELLOGG CO,,13.8,13.35
94443,24174,2023-10-02,VERALTO CORP,,83.11,85.12


In [364]:
# There is no duplicated values 
sp500_full_2023.duplicated().sum()

0

# Get opening-closing returns and closing-closing returns for SP500 constituents in 2023

In [365]:
# get closing-open return
sp500_full_2023["prc"] = sp500_full_2023["prc"].apply(abs)
sp500_full_2023["CO_ret"] = (sp500_full_2023['prc'] - sp500_full_2023['openprc'])/sp500_full_2023['openprc']

# Link CRSP to RavenPack

In [367]:
mapping_file_query = """ 
                    SELECT DISTINCT 
                    a.permno, b.rp_entity_id 
                    FROM (SELECT * FROM crsp.dse WHERE ncusip IS NOT NULL) as a,
                    rpna.wrds_company_names as b
                    WHERE a.ncusip=substr(b.isin,3,8)
"""
mapping_file = db.raw_sql(mapping_file_query)

In [368]:
# There are companies mapped to more than one entity_id
mapping_file.groupby("permno").filter(lambda sub:sub.shape[0]>1).sort_values("permno")

Unnamed: 0,permno,rp_entity_id
1581,10066,179A00
571,10066,2E83D7
10538,10082,686DB1
13893,10082,229150
11921,10560,C8C45A
...,...,...
11032,90722,D553E8
1319,92010,3E387A
1863,92010,133899
12498,92685,A1C951


In [296]:
# All RP's headlines in 2023 
RP_2023_query = """SELECT rpa_date_utc,timestamp_utc,rp_entity_id,entity_name,headline
                FROM rpna.rpa_djpr_equities_2023
            """
RP_2023 = db.raw_sql(RP_2023_query)

In [369]:
# Remove those entity_ids which don't exist in RP
mapping_file = mapping_file[mapping_file.rp_entity_id.isin(RP_2023.rp_entity_id.unique())]

In [370]:
sp500_2023_RPid = sp500_full_2023.merge(mapping_file, on = "permno", how = "inner")

In [371]:
# No permno is redundantly mapped
sp500_2023_RPid.groupby("permno").filter(lambda sub:len(sub.rp_entity_id.unique())>1)

Unnamed: 0,permno,date,comnam,ret,openprc,prc,CO_ret,rp_entity_id


In [372]:
print(sp500_2023_RPid[["permno","date","comnam","rp_entity_id"]].isna().sum().sum())
print(sp500_2023_RPid.duplicated().sum())

0
0


# Link headlines based on contemporaneous returns

In [417]:
sp500_2023_RPid_backup = sp500_2023_RPid.copy()
RP_2023_backup = RP_2023.copy()

In [427]:
sp500_2023_RPid.head()

Unnamed: 0,permno,date,comnam,ret,openprc,prc,CO_ret,rp_entity_id
0,69649,2023-01-03,RAYMOND JAMES FINANCIAL INC,-0.006926,107.3,106.11,-0.01109,B3CB74
1,17005,2023-01-03,C V S HEALTH CORP,-0.003005,91.72,92.91,0.012974,69CE71
2,21936,2023-01-03,PFIZER INC,0.00039,51.01,51.26,0.004901,267718
3,12476,2023-01-03,TARGA RESOURCES CORP,-0.038776,73.26,70.65,-0.035627,EAEBF3
4,25419,2023-01-03,WHIRLPOOL CORP,0.016047,144.12,143.73,-0.002706,BDD12C


In [511]:
RP_2023.head()

Unnamed: 0,timestamp_utc,rpa_date_utc,rp_entity_id,entity_name,headline
0,2023-01-01 10:25:31.105,2023-01-01,00194C,Continental Controls Ltd.,BSE: Continental Controls Ltd. - Closure Of Tr...
1,2023-01-01 10:25:31.435,2023-01-01,00194C,Continental Controls Ltd.,BSE: Continental Controls Ltd. - Closure Of Tr...
2,2023-01-01 08:31:38.519,2023-01-01,01316B,Bilibili Inc.,"Bilibili Hosts Annual New Year's Eve Gala - ""T..."
3,2023-01-01 02:00:05.447,2023-01-01,0157B1,Amazon.com Inc.,New Film Release: Cryptid Horror Movie Brings ...
4,2023-01-01 07:00:11.264,2023-01-01,0157B1,Amazon.com Inc.,The Craziest Moments From the Longest Tech Boo...


In [512]:
# Drop duplicates in consecutive days
RP_2023 = RP_2023.drop_duplicates(("rp_entity_id","headline"))

In [517]:
RP_2023.set_index("timestamp_utc",inplace= True)
RP_2023["timestamp_NY"] = pd.to_datetime(RP_2023.index).tz_localize("UTC").tz_convert("America/New_York")

In [533]:
nyse = mcal.get_calendar("NYSE")
nyse_trading_2023 = nyse.valid_days(start_date="2022-12-31",end_date="2023-12-31").tz_localize(None).tz_localize("America/New_York")

In [539]:
nyse_trading_2023_closing = nyse_trading_2023 + dt.timedelta(hours = 16)

In [570]:
def contem_ret_date(timestamp):
    later_time = nyse_trading_2023_closing[nyse_trading_2023_closing >= timestamp]
    return later_time[0].date() if not later_time.empty else None

In [572]:
RP_2023['contem_ret_date'] = RP_2023["timestamp_NY"].apply(contem_ret_date)

In [581]:
RP_2023.reset_index(inplace = True)
RP_2023_contem_ret = RP_2023[["contem_ret_date","rp_entity_id","headline"]]
RP_2023_contem_ret = RP_2023_contem_ret.dropna()
sp500_2023_RPid_contem_ret = sp500_2023_RPid[["date","rp_entity_id","comnam","ret"]]

In [588]:
RP_2023_contem_ret.contem_ret_date = pd.to_datetime(RP_2023_contem_ret.contem_ret_date)
SP500_RP_contem_ret_2023 = pd.merge(sp500_2023_RPid_contem_ret,RP_2023_contem_ret,left_on=["date","rp_entity_id"],\
                                    right_on=["contem_ret_date","rp_entity_id"],how = "inner").drop(columns = "contem_ret_date")

In [589]:
SP500_RP_contem_ret_2023.head()

Unnamed: 0,date,rp_entity_id,comnam,ret,headline
0,2023-01-03,B3CB74,RAYMOND JAMES FINANCIAL INC,-0.006926,Chewy Looks to Open More Automated Fulfillment...
1,2023-01-03,B3CB74,RAYMOND JAMES FINANCIAL INC,-0.006926,Chewy Looks to Open More Automated Fulfillment...
2,2023-01-03,B3CB74,RAYMOND JAMES FINANCIAL INC,-0.006926,"Armed With a Big War Chest, Steward Partners A..."
3,2023-01-03,B3CB74,RAYMOND JAMES FINANCIAL INC,-0.006926,"Chmn Madar Registers 7,600 Of Inter Parfums In..."
4,2023-01-03,B3CB74,RAYMOND JAMES FINANCIAL INC,-0.006926,Southwest Used to Be the Best Run Airline. Wha...


In [591]:
SP500_RP_contem_ret_2023.duplicated().sum()
SP500_RP_contem_ret_2023.isna().sum().sum()

0

# Seperating line

In [None]:
# Drop same headlines happening in consecutive days
RP_2023 = RP_2023.drop_duplicates("")

In [26]:
def next_workday(date):
  if date.weekday() >= 4:
    return date+dt.timedelta(days = 7-date.weekday())
  else:
    return date+dt.timedelta(days = 1)
def headline_ret_date(ET_time):
  if ET_time.weekday()>=5:
    return next_workday(ET_time).date()
  else:
    closing_time = ET_time.replace(hour = 16,minute =30)
    if ET_time < closing_time:
      return ET_time.date()
    else:
      return next_workday(ET_time).date()

In [27]:
RP_2023_test = RP_2023.copy()
RP_2023_test.head()

Unnamed: 0,timestamp_utc,rp_entity_id,headline
0,2023-01-01 10:25:31.105,00194C,BSE: Continental Controls Ltd. - Closure Of Tr...
1,2023-01-01 10:25:31.435,00194C,BSE: Continental Controls Ltd. - Closure Of Tr...
2,2023-01-01 08:31:38.519,01316B,"Bilibili Hosts Annual New Year's Eve Gala - ""T..."
3,2023-01-01 02:00:05.447,0157B1,New Film Release: Cryptid Horror Movie Brings ...
4,2023-01-01 07:00:11.264,0157B1,The Craziest Moments From the Longest Tech Boo...


In [28]:
RP_2023_test = RP_2023_test.rename(columns= {"timestamp_utc":"timestamp"})
RP_2023_test.set_index("timestamp",inplace = True)
RP_2023_test.index = pd.to_datetime(RP_2023_test.index).tz_localize("UTC").tz_convert(pytz.timezone("America/New_York"))
RP_2023_test.head()

Unnamed: 0_level_0,rp_entity_id,headline
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-01-01 05:25:31.105000-05:00,00194C,BSE: Continental Controls Ltd. - Closure Of Tr...
2023-01-01 05:25:31.435000-05:00,00194C,BSE: Continental Controls Ltd. - Closure Of Tr...
2023-01-01 03:31:38.519000-05:00,01316B,"Bilibili Hosts Annual New Year's Eve Gala - ""T..."
2022-12-31 21:00:05.447000-05:00,0157B1,New Film Release: Cryptid Horror Movie Brings ...
2023-01-01 02:00:11.264000-05:00,0157B1,The Craziest Moments From the Longest Tech Boo...


In [29]:
RP_2023_test.reset_index(inplace = True)
# Associate headlines' times to returns' times 
RP_2023_test.timestamp = RP_2023_test.timestamp.apply(lambda x:headline_ret_date(x))

In [30]:
RP_2023_test = RP_2023_test.rename(columns = {"timestamp":"date_ret"})

In [31]:
# There are some duplicated headlines after associating to returns'dates which should
# not be dropped. For example: same headlines on Saturday and Monday go to Monday together
RP_2023_test.duplicated().sum()

2541131

In [32]:
# NO NA values
RP_2023_test.isna().sum().sum()

0

In [33]:
RP_2023_test.date_ret = pd.to_datetime(RP_2023_test.date_ret)
sp500_2023_ret_headline = sp500_2023_RPid.merge(RP_2023_test, how = "inner", left_on = ["date","RP_ENTITY_ID"],right_on = ["date_ret","rp_entity_id"])

In [34]:
# There are duplicates because of duplicates in headline data frame after associated with returns'
# dates which should be kept
sp500_2023_ret_headline.duplicated().sum()

378922

In [35]:
# sp500_2023_ret_headline = sp500_2023_ret_headline.drop_duplicates()
sp500_2023_ret_headline = sp500_2023_ret_headline[["date","permno","ret","headline"]]
sp500_2023_ret_headline.head()

Unnamed: 0,date,permno,ret,headline
0,2023-01-03,22592,0.021264,The Worldwide Industrial Food and Beverages Fi...
1,2023-01-03,22592,0.021264,Europe OTC Braces & Supports Market Report 202...
2,2023-01-03,22592,0.021264,$16.6 Billion Worldwide Ceramic Matrix Composi...
3,2023-01-03,22592,0.021264,MIMEDX Appoints Ricci S. Whitlow as Chief Oper...
4,2023-01-03,22592,0.021264,3M Tries to Contain Legal Battles Over 'Foreve...
