In [1]:
import wrds
import pandas as pd
import numpy as np
import pytz
import datetime as dt
import pandas_market_calendars as mcal
import re
import collections
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [2]:
db = wrds.Connection(wrds_username = "kevinlin5549")

Loading library list...
Done


In [3]:
def SP500_CRSP_data(year_range):
    first_year, last_year = str(year_range[0]),str(year_range[1])
    sp_500_query = f"""SELECT a.*, b.date, b.ret, b.prc, b.openprc
                        FROM crsp.dsp500list as a,
                        crsp.dsf as b
                        WHERE a.permno=b.permno
                        and b.date >= a.start and b.date<= a.ending
                        and b.date>='01/01/{first_year}' and b.date<='12/31/{last_year}'
                        order by date;"""
    sp_500 = db.raw_sql(sp_500_query,date_cols=['start', 'ending', 'date'])
    dse = db.raw_sql("""
                        select comnam,ncusip, namedt, nameendt,permno
                        from crsp.dsenames
                        """, date_cols=['namedt', 'nameendt'])
    dse['nameendt']=dse['nameendt'].fillna(pd.to_datetime('today'))
    sp500_full = pd.merge(sp_500, dse, how = 'left', on = 'permno')
    sp500_full = sp500_full.loc[(sp500_full.date>=sp500_full.namedt) \
                                & (sp500_full.date<=sp500_full.nameendt)]
    sp500_full.reset_index(inplace = True,drop = True)
    sp500_full = sp500_full[["permno","date","comnam","ret","openprc","prc"]]
    sp500_full["prc"] = sp500_full["prc"].apply(abs)
    sp500_full["CO_ret"] = (sp500_full['prc'] - sp500_full['openprc'])/sp500_full['openprc']
    return sp500_full

mapping_file = pd.read_csv("SP500_Mapping_file.csv")
SP500_entity_id_str = ','.join(f"'{id}'" for id in list(mapping_file.rp_entity_id)) 
mapping_file.head()

def SP500_RP_headline_data(year_range):
    years = range(int(year_range[0]),int(year_range[-1])+1)
    years_str = [str(year) for year in years]
    RP_df = pd.DataFrame()
    for year in years_str:
        RP_year_query =f"""
                        SELECT DISTINCT timestamp_utc,rp_entity_id,headline,css,relevance
                        FROM rpna.rpa_djpr_equities_{year}
                        WHERE rp_entity_id IN ({SP500_entity_id_str})
                        """
        RP_df = pd.concat((RP_df,db.raw_sql(RP_year_query)),axis = 0)
    RP_df = RP_df.drop_duplicates(("rp_entity_id","headline"))
    return RP_df

def contem_ret(year_range):
    first_year,last_year = int(year_range[0]),int(year_range[-1])

    # Create financial dataframe from crsp and link to entity id
    sp500_crsp = SP500_CRSP_data(year_range)
    sp500_crsp_rpid =  sp500_crsp.merge(mapping_file, on = "permno", how = "inner")

    # Create RavenPack headline data and map the timestamp to contemporaneous return date
    sp500_rp =  SP500_RP_headline_data(year_range)
    sp500_rp.set_index("timestamp_utc",inplace= True)
    sp500_rp["timestamp_NY"] = pd.to_datetime(sp500_rp.index).tz_localize("UTC").tz_convert("America/New_York")
    sp500_rp = sp500_rp.reset_index()
    sp500_rp['index'] = sp500_rp.index
    nyse = mcal.get_calendar("NYSE")
    nyse_tradingdays= nyse.valid_days(start_date=f"{str(first_year)}-01-01",end_date=f"{str(last_year)}-12-31")\
        .tz_localize(None).tz_localize("America/New_York")
    nyse_tradingdays_closing = nyse_tradingdays + dt.timedelta(hours = 16)
    trading_days_df = pd.DataFrame({'trading_close': nyse_tradingdays_closing, 'contem_ret_date': nyse_tradingdays_closing.date})

    # Use merge_asof to align the headlines with the trading close times
    sp500_rp = pd.merge_asof(sp500_rp.sort_values('timestamp_NY'), trading_days_df,
                            left_on='timestamp_NY', right_on='trading_close',
                            direction='forward')
    sp500_rp = sp500_rp.sort_values("index")
    sp500_rp
    sp500_rp = sp500_rp[['timestamp_utc', 'rp_entity_id', 'headline', 'css', 'relevance', 'timestamp_NY', 'contem_ret_date']]
    sp500_rp = sp500_rp.reset_index()
    sp500_rp.drop(columns = ["index"],inplace = True)

    # Merge crsp dataframe with RP dataframe
    sp500_rp_contem_ret = sp500_rp[["contem_ret_date","rp_entity_id","headline", "css", "relevance"]]
    sp500_rp_contem_ret = sp500_rp_contem_ret.dropna()
    sp500_rp_contem_ret.contem_ret_date = pd.to_datetime(sp500_rp_contem_ret.contem_ret_date)
    sp500_crsp_rpid = sp500_crsp_rpid[["date","rp_entity_id","comnam","ret"]]
    sp500_crsp_rp_contem_ret = pd.merge(sp500_crsp_rpid,sp500_rp_contem_ret,left_on=["date","rp_entity_id"],\
                                        right_on=["contem_ret_date","rp_entity_id"],how = "inner").drop(columns = "contem_ret_date")
    
    # sp500_crsp_rp_contem_ret.drop_duplicates(("rp_entity_id","headline"),inplace=True)
    sp500_crsp_rp_contem_ret.dropna(inplace=True)
    return sp500_crsp_rp_contem_ret

def future_ret(year_range):
    first_year,last_year = int(year_range[0]),int(year_range[-1])

    # Create financial dataframe from crsp and link to entity id
    sp500_crsp = SP500_CRSP_data(year_range)
    sp500_crsp_rpid =  sp500_crsp.merge(mapping_file, on = "permno", how = "inner")

    # Create RavenPack headline data and map the timestamp to future return date
    sp500_rp =  SP500_RP_headline_data(year_range)
    sp500_rp.set_index("timestamp_utc",inplace= True)
    sp500_rp["timestamp_NY"] = pd.to_datetime(sp500_rp.index).tz_localize("UTC").tz_convert("America/New_York")
    nyse = mcal.get_calendar("NYSE")
    nyse_tradingdays= nyse.valid_days(start_date=f"{str(first_year)}-01-01",end_date=f"{str(last_year)}-12-31")\
        .tz_localize(None).tz_localize("America/New_York")
    nyse_tradingdays_opening = nyse_tradingdays + dt.timedelta(hours = 9)
    nyse_tradingdays_closing = nyse_tradingdays + dt.timedelta(hours = 16)
    def future_ret_date(timestamp):
        later_opening = nyse_tradingdays_opening[nyse_tradingdays_opening>=timestamp]
        later_closing = nyse_tradingdays_closing[nyse_tradingdays_closing>=timestamp]
        if (not later_opening.empty) & (not later_closing.empty):
            next_opening = later_opening[0]
            next_closing = later_closing[0]
            if next_opening.date() == next_closing.date():
                return [next_opening.date(),1]
            else: 
                return [next_opening.date(),0]
        else:
            return [None,None]
    sp500_rp_future_ret_date = sp500_rp.apply(lambda row:future_ret_date(row['timestamp_NY']),axis = 1, result_type="expand")
    sp500_rp_future_ret_date = sp500_rp_future_ret_date.rename(columns = {0:"future_ret_date",1:"bool_CO_ret"}) 
    sp500_rp = pd.concat([sp500_rp,sp500_rp_future_ret_date],axis = 1)

    # Merge crsp dataframe with RP dataframe
    sp500_rp.future_ret_date = pd.to_datetime(sp500_rp.future_ret_date)
    sp500_crsp_rp_future_ret = sp500_crsp_rpid.merge(sp500_rp,left_on=["date","rp_entity_id"],right_on = ["future_ret_date","rp_entity_id"],how = "inner")
    sp500_crsp_rp_future_ret = sp500_crsp_rp_future_ret[["date","rp_entity_id","comnam","CO_ret","ret","headline","bool_CO_ret"]]
    
    sp500_crsp_rp_future_ret = sp500_crsp_rp_future_ret.drop_duplicates(subset = ["rp_entity_id","headline"])
    sp500_crsp_rp_future_ret["future_ret"] = sp500_crsp_rp_future_ret.apply(lambda row: row["CO_ret"] if row["bool_CO_ret"] \
                                                                            else (row["ret"] if not row["bool_CO_ret"]  else None), axis = 1)
    sp500_crsp_rp_future_ret = sp500_crsp_rp_future_ret.drop(columns = ['CO_ret','ret','bool_CO_ret'])
    # sp500_crsp_rp_future_ret.drop_duplicates(inplace = True)
    sp500_crsp_rp_future_ret.dropna(inplace = True)
    
    return sp500_crsp_rp_future_ret

# Remove individual commenter first
def remove_useless(headline):
    end_pattern = re.search(r" (-+)? By|-[0-9]+-|- ?\b[A-Z][a-z]{2}\b \d{1,2}|(null)|--?\s\w{1,}(\s\w{1,})?$|-- Barrons.com|researchandmarkets.com|by\s\w{1,}(\s\w{1,})?$|>[A-Z]{2,4}", headline, flags=re.IGNORECASE)
    if end_pattern:
        headline = headline[:end_pattern.start()]
    return headline

def remove_number_day(headline):
    headline = re.sub(r'\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\b', '', headline, flags=re.IGNORECASE)
    headline = re.sub(r'\bMay\b', '', headline)
    headline = re.sub(r'\b(?:Mon(?:day)?|Tue(?:sday)?|Wed(?:nesday)?|Thu(?:rsday)?|Fri(?:day)?|Sat(?:urday)?|Sun(?:day)?)\b', '', headline, flags=re.IGNORECASE)
    headline = re.sub(r'\b(Sunday|sunday|Sun)\b', '', headline)
    headline = re.sub(r"\d*\.?\d+[Bb]|bn", "bln", headline)
    headline = re.sub(r"\d*\.?\d+[Mm]|mn", "mln", headline)
    headline = re.sub(r"\d*\.?\d+[Kk]", "k", headline)
    headline = re.sub(r'\b\d+(\.\d+)?(?!\s*%)\b', '', headline)
    return headline


# Tokenize the headline into tokens which are alphanumeric words including period . 
def custom_tokenizer(headline):
    tokens = re.findall(r"\b[a-zA-z0-9\.][a-zA-z0-9\.]+\b",headline.lower())  
    return tokens

# Remove tokens according to the principles 
def custom_processor(headline, remove_words):
    tokens = custom_tokenizer(headline)
    new_tokens = [token for token in tokens if token not in remove_words]
    return " ".join(new_tokens)

def get_user_selected_words(words_top, bool):
    if bool:
        print("Top words in headlines:")
        for word in words_top:
            print(word)
        print("\nPlease enter the words you want to delete, separated by commas:")
        user_input = input().strip()
        selected_words = [word.strip() for word in user_input.split(",") if word.strip() in words_top]
    else:
        selected_words = []
    return selected_words

def clean_heandline(df, relevance_threshold, top_words_num, select_manually = True):
    df = df[df['relevance'] >= relevance_threshold]
    df.loc[:,"vocab_con_headline"] = df.loc[:,"headline"].apply(remove_useless)
    df.loc[:,"vocab_con_headline"] = df.loc[:,"vocab_con_headline"].apply(remove_number_day)

    #clean the stop words
    stop_words = list(ENGLISH_STOP_WORDS)
    words_list = ["release", "press", "pgr", "mw", "llp", "corp", "live", "corporation","plc", "factset",
                     "llc", "group", "target", "blog", "st", "chart", "update", "dir", "barron", "pbulletin"]
    stop_words.extend(words_list)
    df.loc[:,"vocab_con_headline"] = df.loc[:,"vocab_con_headline"].apply(lambda headline: custom_processor(headline, stop_words))

    # Remove words that appear only once and the top 100 words
    remove_words = set()
    # Create a vocab dictionary with the words and their counts
    vocab_con_headlines = df.loc[:,"vocab_con_headline"].tolist()
    vocab = collections.Counter()
    for headline in vocab_con_headlines:
        vocab.update(custom_tokenizer(headline))
    
    words_once = [word for word,count in vocab.items() if count ==1]
    remove_words.update(words_once)
    top_count = sorted(vocab.values(),reverse = True)[top_words_num]
    words_top = [word for word,count in vocab.items() if count >=top_count]
    user_selected_words = get_user_selected_words(words_top, select_manually)
    remove_words.update(user_selected_words)
    
    df.loc[:,"vocab_con_headline"] = df.loc[:,"vocab_con_headline"].apply(lambda headline: custom_processor(headline, remove_words))
    df = df.drop(columns = ["relevance"])
    df.dropna(inplace=True)
    df = df[df.vocab_con_headline != ""]
    return df

In [4]:
df_folder = '/shared/share_tm-finance'
datatype = "contem"
relevance_threshold = 75
top_words_num = 150
select_manually = False

for i in range(2014,2024):
    year_range = (i, i)
    if datatype == "contem":
        df = contem_ret(year_range)
    else:
        df = future_ret(year_range)

    
    # save processed datafram
    df_cleaned = clean_heandline(df, relevance_threshold, top_words_num, select_manually)
    print("cleaned ", i)
    df_cleaned.to_csv(df_folder+f"/Processed_df_Sentiment/One_year_window/{type}_{i}_senti.csv".format(type = datatype, year = i), index = False)

    #save embeddings
    red_headlines = df_cleaned.vocab_con_headline.tolist()
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = embedding_model.encode(red_headlines, show_progress_bar = True)
    embeddings = np.save(df_folder+f"/Embeddings_with_Sentiment/One_year_window/{type}_{i}_senti_embeddings.npy".format(type = datatype, year = i), embeddings)
    

    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:,"vocab_con_headline"] = df.loc[:,"headline"].apply(remove_useless)


cleaned  2014


Batches: 100%|██████████| 10332/10332 [03:03<00:00, 56.24it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:,"vocab_con_headline"] = df.loc[:,"headline"].apply(remove_useless)


cleaned  2015


Batches: 100%|██████████| 9870/9870 [02:50<00:00, 57.86it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:,"vocab_con_headline"] = df.loc[:,"headline"].apply(remove_useless)


cleaned  2016


Batches: 100%|██████████| 10069/10069 [02:58<00:00, 56.50it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:,"vocab_con_headline"] = df.loc[:,"headline"].apply(remove_useless)


cleaned  2017


Batches: 100%|██████████| 10432/10432 [03:04<00:00, 56.65it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:,"vocab_con_headline"] = df.loc[:,"headline"].apply(remove_useless)


cleaned  2018


Batches: 100%|██████████| 10226/10226 [02:56<00:00, 57.98it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:,"vocab_con_headline"] = df.loc[:,"headline"].apply(remove_useless)


cleaned  2019


Batches: 100%|██████████| 10792/10792 [03:06<00:00, 57.87it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:,"vocab_con_headline"] = df.loc[:,"headline"].apply(remove_useless)


cleaned  2020


Batches: 100%|██████████| 12393/12393 [03:35<00:00, 57.41it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:,"vocab_con_headline"] = df.loc[:,"headline"].apply(remove_useless)


cleaned  2021


Batches: 100%|██████████| 12172/12172 [03:33<00:00, 57.08it/s] 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:,"vocab_con_headline"] = df.loc[:,"headline"].apply(remove_useless)


cleaned  2022


Batches: 100%|██████████| 12318/12318 [03:39<00:00, 56.19it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:,"vocab_con_headline"] = df.loc[:,"headline"].apply(remove_useless)


cleaned  2023


Batches: 100%|██████████| 12967/12967 [03:53<00:00, 55.44it/s]


In [None]:
# df_folder = '/shared/share_tm-finance'
# datatype = "contem"
# i = 2023
# df_cleaned_grid=pd.read_csv(df_folder+"/Processed_df_Sentiment/One_year_window/{type}_{year}_senti.csv".format(type = datatype, year = i))
