In [1]:
import wrds
import pandas as pd
import pytz
import datetime as dt

In [2]:
db = wrds.Connection()

WRDS recommends setting up a .pgpass file.
Created .pgpass file successfully.
You can create this file yourself at any time with the create_pgpass_file() function.
Loading library list...
Done


In [3]:
sp500 = db.raw_sql("""
                        select a.*, b.date, b.prc, b.openprc, b.ret
                        from crsp.dsp500list as a,
                        crsp.dsf as b
                        where a.permno=b.permno
                        and b.date >= a.start and b.date<= a.ending
                        and b.date>='01/01/2023' and b.date<='12/31/2023'
                        order by date;
                        """, date_cols=['start', 'ending', 'date'])

In [4]:
dse = db.raw_sql("""
                        select comnam,ncusip, namedt, nameendt,permno
                        from crsp.dsenames
                        """, date_cols=['namedt', 'nameendt'])

# if nameendt is missing then set to today date
dse['nameendt']=dse['nameendt'].fillna(pd.to_datetime('today'))

In [5]:
# Merge with SP500 data
sp500_full_2023 = pd.merge(sp500, dse, how = 'left', on = 'permno')

# Impose the date range restrictions
sp500_full_2023 = sp500_full_2023.loc[(sp500_full_2023.date>=sp500_full_2023.namedt) \
                            & (sp500_full_2023.date<=sp500_full_2023.nameendt)]
sp500_full_2023.reset_index(inplace = True,drop = True)

In [6]:
sp500_full_2023 = sp500_full_2023[["permno","date","comnam","openprc","prc","ret"]]

In [7]:
# Negative sign means bid/ask average
sp500_full_2023[sp500_full_2023.prc <0]

Unnamed: 0,permno,date,comnam,openprc,prc,ret
23244,11786,2023-03-10,S V B FINANCIAL GROUP,,-39.37,-0.628725
55299,76841,2023-06-09,BIOGEN INC,,-316.89999,0.025965


In [8]:
# For 76841, bid/ask average represents closing price for a stock 
#with similar numerical values of adjacent days 
sp500_full_2023[sp500_full_2023.permno == 76841].loc[sp500_full_2023.date.isin(["2023-06-08","2023-06-09","2023-06-12"])]

  sp500_full_2023[sp500_full_2023.permno == 76841].loc[sp500_full_2023.date.isin(["2023-06-08","2023-06-09","2023-06-12"])]


Unnamed: 0,permno,date,comnam,openprc,prc,ret
54667,76841,2023-06-08,BIOGEN INC,310.69,308.88,0.013054
55299,76841,2023-06-09,BIOGEN INC,,-316.89999,0.025965
55348,76841,2023-06-12,BIOGEN INC,319.76001,313.41,-0.011013


In [9]:
# For 11786, this seems to be something wrong
sp500_full_2023[sp500_full_2023.permno == 11786]

Unnamed: 0,permno,date,comnam,openprc,prc,ret
298,11786,2023-01-03,S V B FINANCIAL GROUP,232.17,225.22,-0.021378
939,11786,2023-01-04,S V B FINANCIAL GROUP,230.10001,240.06,0.065891
1371,11786,2023-01-05,S V B FINANCIAL GROUP,235.71001,232.59,-0.031117
1988,11786,2023-01-06,S V B FINANCIAL GROUP,237.12,245.78999,0.056752
2048,11786,2023-01-09,S V B FINANCIAL GROUP,247.96001,249.42999,0.014809
2762,11786,2023-01-10,S V B FINANCIAL GROUP,246.46001,252.67999,0.01303
3095,11786,2023-01-11,S V B FINANCIAL GROUP,255.31,254.99001,0.009142
3656,11786,2023-01-12,S V B FINANCIAL GROUP,259.19,253.82001,-0.004588
4314,11786,2023-01-13,S V B FINANCIAL GROUP,248.06,252.73,-0.004294
4951,11786,2023-01-17,S V B FINANCIAL GROUP,254.08,259.98999,0.028726


In [10]:
# 0 means neither closing price nor bid/ask average exists 
sp500_full_2023.prc[sp500_full_2023.prc == 0]

Series([], Name: prc, dtype: float64)

In [11]:
# There are some companies without open price which caused negative prices. Drop them
sp500_full_2023[sp500_full_2023.openprc.isna()]

Unnamed: 0,permno,date,comnam,openprc,prc,ret
23244,11786,2023-03-10,S V B FINANCIAL GROUP,,-39.37,-0.628725
55299,76841,2023-06-09,BIOGEN INC,,-316.89999,0.025965


In [12]:
# No dirty data for open price
(sp500_full_2023.openprc <= 0).sum()

0

In [13]:
# There are some companies without returns also
sp500_full_2023[sp500_full_2023.isna().sum(axis = 1) >0]

Unnamed: 0,permno,date,comnam,openprc,prc,ret
575,23570,2023-01-04,G E HEALTHCARE TECHNOLOGIES INC,54.13,60.49,
23244,11786,2023-03-10,S V B FINANCIAL GROUP,,-39.37,-0.628725
55299,76841,2023-06-09,BIOGEN INC,,-316.89999,0.025965
62794,23942,2023-07-03,FORTREA HOLDINGS INC,33.8,36.84,
63253,23944,2023-07-05,PHINIA INC,29.89,36.75,
94166,24175,2023-10-02,W K KELLOGG CO,13.8,13.35,
94169,24174,2023-10-02,VERALTO CORP,83.11,85.12,


In [14]:
# There is no duplicated values 
sp500_full_2023.duplicated().sum()

0

In [15]:
sp500_full_2023 = sp500_full_2023.dropna()

In [17]:
mapping_file = pd.read_csv("mapping_file_2023_sp500.csv")
sp500_2023_RPid = pd.merge(sp500_full_2023,mapping_file,left_on = "permno",right_on = "PERMNO",how = "left")

In [18]:
# There are some companies which are not included in RavenPack
sp500_2023_RPid[sp500_2023_RPid.isna().sum(axis = 1) !=0].groupby(["permno","comnam"]).groups.keys()

dict_keys([(12084, 'N X P SEMICONDUCTORS N V'), (12345, 'LYONDELLBASELL INDUSTRIES N V'), (13103, 'APTIV PLC'), (13586, 'PENTAIR PLC'), (14297, 'ALLEGION PLC'), (18143, 'LINDE PLC'), (18143, 'LINDE PLC NEW'), (18724, 'AMCOR PLC'), (23570, 'G E HEALTHCARE TECHNOLOGIES INC'), (23876, 'KENVUE INC'), (23942, 'FORTREA HOLDINGS INC'), (24174, 'VERALTO CORP'), (79145, 'ROYAL CARIBBEAN CRUISES LTD')])

In [19]:
# This are companies which are not included in RavenPack
dse[dse.permno.isin([12084,12345,13103,13586,14297,18143,18724,23570,23876,23942,24174,79145])]

Unnamed: 0,comnam,ncusip,namedt,nameendt,permno
7129,N X P SEMICONDUCTORS N V,N6596X10,2010-08-06,2023-12-29,12084
8008,LYONDELLBASELL INDUSTRIES N V,N5374510,2010-10-14,2014-01-07,12345
8009,LYONDELLBASELL INDUSTRIES N V,N5374510,2014-01-08,2016-12-18,12345
8010,LYONDELLBASELL INDUSTRIES N V,N5374510,2016-12-19,2020-04-06,12345
8011,LYONDELLBASELL INDUSTRIES N V,N5374510,2020-04-07,2021-03-30,12345
8012,LYONDELLBASELL INDUSTRIES N V,N5374510,2021-03-31,2022-07-18,12345
8013,LYONDELLBASELL INDUSTRIES N V,N5374510,2022-07-19,2023-09-18,12345
8014,LYONDELLBASELL INDUSTRIES N V,N5374510,2023-09-19,2023-12-29,12345
9797,DELPHI AUTOMOTIVE PLC,G2782310,2011-11-17,2016-01-11,13103
9798,DELPHI AUTOMOTIVE PLC,G2782310,2016-01-12,2017-12-04,13103


In [20]:
# There aren't duplicated companies
sp500_2023_RPid.duplicated().sum()

0

In [21]:
# Drop those companies not matched to RavenPack
sp500_2023_RPid = sp500_2023_RPid.dropna()

In [22]:
sql_query = """select rpa_date_utc,timestamp_utc,rp_entity_id,headline
                from rpna.rpa_djpr_equities_2023
            """
RP_2023 = db.raw_sql(sql_query)

In [23]:
RP_2023_test_duplicate = RP_2023.copy()
RP_2023_test_duplicate = RP_2023_test_duplicate.drop(columns = "timestamp_utc")
RP_2023_test_duplicate.rpa_date_utc = pd.to_datetime(RP_2023_test_duplicate.rpa_date_utc)
RP_2023_test_duplicate_lag1 = RP_2023_test_duplicate.copy()
RP_2023_test_duplicate_lag1.rpa_date_utc = RP_2023_test_duplicate_lag1.rpa_date_utc + dt.timedelta(days = 1)
str_RP_2023_test_duplicate = RP_2023_test_duplicate.map(lambda x:str(x))
str_RP_2023_test_duplicate_lag1 = RP_2023_test_duplicate_lag1.map(lambda x:str(x))
str_RP_2023_test_duplicate_joined = str_RP_2023_test_duplicate.apply(lambda x: "".join(x),axis = 1)
str_RP_2023_test_duplicate_lag1_joined = str_RP_2023_test_duplicate_lag1.apply(lambda x: "".join(x),axis = 1)

In [24]:
# Total number of same headlines happending in consecutive days
str_RP_2023_test_duplicate_joined.isin(str_RP_2023_test_duplicate_lag1_joined).sum()

1677138

In [25]:
# Remove all same headlines happending in consecutive days
RP_2023 = RP_2023[~(str_RP_2023_test_duplicate_joined.isin(str_RP_2023_test_duplicate_lag1_joined))]
RP_2023 = RP_2023.drop(columns = "rpa_date_utc")

In [26]:
def next_workday(date):
  if date.weekday() >= 4:
    return date+dt.timedelta(days = 7-date.weekday())
  else:
    return date+dt.timedelta(days = 1)
def headline_ret_date(ET_time):
  if ET_time.weekday()>=5:
    return next_workday(ET_time).date()
  else:
    closing_time = ET_time.replace(hour = 16,minute =30)
    if ET_time < closing_time:
      return ET_time.date()
    else:
      return next_workday(ET_time).date()

In [27]:
RP_2023_test = RP_2023.copy()
RP_2023_test.head()

Unnamed: 0,timestamp_utc,rp_entity_id,headline
0,2023-01-01 10:25:31.105,00194C,BSE: Continental Controls Ltd. - Closure Of Tr...
1,2023-01-01 10:25:31.435,00194C,BSE: Continental Controls Ltd. - Closure Of Tr...
2,2023-01-01 08:31:38.519,01316B,"Bilibili Hosts Annual New Year's Eve Gala - ""T..."
3,2023-01-01 02:00:05.447,0157B1,New Film Release: Cryptid Horror Movie Brings ...
4,2023-01-01 07:00:11.264,0157B1,The Craziest Moments From the Longest Tech Boo...


In [28]:
RP_2023_test = RP_2023_test.rename(columns= {"timestamp_utc":"timestamp"})
RP_2023_test.set_index("timestamp",inplace = True)
RP_2023_test.index = pd.to_datetime(RP_2023_test.index).tz_localize("UTC").tz_convert(pytz.timezone("America/New_York"))
RP_2023_test.head()

Unnamed: 0_level_0,rp_entity_id,headline
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-01-01 05:25:31.105000-05:00,00194C,BSE: Continental Controls Ltd. - Closure Of Tr...
2023-01-01 05:25:31.435000-05:00,00194C,BSE: Continental Controls Ltd. - Closure Of Tr...
2023-01-01 03:31:38.519000-05:00,01316B,"Bilibili Hosts Annual New Year's Eve Gala - ""T..."
2022-12-31 21:00:05.447000-05:00,0157B1,New Film Release: Cryptid Horror Movie Brings ...
2023-01-01 02:00:11.264000-05:00,0157B1,The Craziest Moments From the Longest Tech Boo...


In [29]:
RP_2023_test.reset_index(inplace = True)
# Associate headlines' times to returns' times 
RP_2023_test.timestamp = RP_2023_test.timestamp.apply(lambda x:headline_ret_date(x))

In [30]:
RP_2023_test = RP_2023_test.rename(columns = {"timestamp":"date_ret"})

In [31]:
# There are some duplicated headlines after associating to returns'dates which should
# not be dropped. For example: same headlines on Saturday and Monday go to Monday together
RP_2023_test.duplicated().sum()

2541131

In [32]:
# NO NA values
RP_2023_test.isna().sum().sum()

0

In [33]:
RP_2023_test.date_ret = pd.to_datetime(RP_2023_test.date_ret)
sp500_2023_ret_headline = sp500_2023_RPid.merge(RP_2023_test, how = "inner", left_on = ["date","RP_ENTITY_ID"],right_on = ["date_ret","rp_entity_id"])

In [34]:
# There are duplicates because of duplicates in headline data frame after associated with returns'
# dates which should be kept
sp500_2023_ret_headline.duplicated().sum()

378922

In [35]:
# sp500_2023_ret_headline = sp500_2023_ret_headline.drop_duplicates()
sp500_2023_ret_headline = sp500_2023_ret_headline[["date","permno","ret","headline"]]
sp500_2023_ret_headline.head()

Unnamed: 0,date,permno,ret,headline
0,2023-01-03,22592,0.021264,The Worldwide Industrial Food and Beverages Fi...
1,2023-01-03,22592,0.021264,Europe OTC Braces & Supports Market Report 202...
2,2023-01-03,22592,0.021264,$16.6 Billion Worldwide Ceramic Matrix Composi...
3,2023-01-03,22592,0.021264,MIMEDX Appoints Ricci S. Whitlow as Chief Oper...
4,2023-01-03,22592,0.021264,3M Tries to Contain Legal Battles Over 'Foreve...
