In [114]:
import numpy as np
import pandas as pd 
import requests
from bs4 import BeautifulSoup
import yfinance as yf
import seaborn as sns

In [115]:
import os
print(os.getcwd())

# Directory containing the CSV files
directory = './../data/raw/'

# List to hold dataframes
dfs = []

# Loop through files in the directory
for filename in os.listdir(directory):
    if filename.startswith('wsj'):
        filepath = os.path.join(directory, filename)
        df = pd.read_csv(filepath)
        dfs.append(df)

# Merge all dataframes
merged_df = pd.concat(dfs, ignore_index=True)
print(merged_df)

/Users/emreyuce/Desktop/spy_eft_prediction/notebooks
                                                Headline  \
0      ObituariesDoreen M. Frasca Thrived in Muni Bon...   
1      The Saturday EssayLessons Learned From a Fiery...   
2      Pro VC New MoneySelf-Driving Trucking Startup ...   
3      Review & OutlookOpinion: Alvin Bragg’s Politic...   
4      Risk & Compliance JournalKirstjen Nielsen Join...   
...                                                  ...   
16810  Pro BankruptcyInstant Pot’s Slower Sales Tip G...   
16811  Asia EconomyJapan Ministers Ask BOJ to Help Co...   
16812  BusinessElizabeth Holmes Takes the Stand in Cr...   
16813  GolfGolf’s Fastest-Rising Star Doesn’t Have Hi...   
16814  Paid ProgramWhy an Open Architecture Approach ...   

                                 Date  
0      September 30, 2022 10:02 am ET  
1        January 10, 2025 09:00 pm ET  
2       February 10, 2021 01:26 pm ET  
3          March 19, 2023 05:14 pm ET  
4          April 27, 2022 

In [116]:
merged_df.head()

Unnamed: 0,Headline,Date
0,ObituariesDoreen M. Frasca Thrived in Muni Bon...,"September 30, 2022 10:02 am ET"
1,The Saturday EssayLessons Learned From a Fiery...,"January 10, 2025 09:00 pm ET"
2,Pro VC New MoneySelf-Driving Trucking Startup ...,"February 10, 2021 01:26 pm ET"
3,Review & OutlookOpinion: Alvin Bragg’s Politic...,"March 19, 2023 05:14 pm ET"
4,Risk & Compliance JournalKirstjen Nielsen Join...,"April 27, 2022 07:00 am ET"


In [117]:
merged_df[merged_df.duplicated()].sort_values('Headline').value_counts()

Headline                                                                                                                                                                                                                                                                                                 Date                         
Paid Program                                                                                                                                                                                                                                                                                             Tarih bulunamadı.                59
Middle EastSaudi Crown Prince Tangles With Sovereign Wealth Fund Over How to Invest Oil RichesPrince Mohammed bin Salman has vexed financial officials at times as he pushes pet projects on the Public Investment Fund, including a deal with Jared Kushner.                                            January 3, 2023 01:14 pm ET      17
Bookshe

In [118]:
merged_df = merged_df[merged_df['Date'] != 'Tarih bulunamadı.']
merged_df.drop_duplicates(inplace=True, keep='first')

merged_df.shape

(4007, 2)

In [119]:

merged_df['Date'] = merged_df['Date'].str.replace(" ET", "")

# Convert the string to datetime
merged_df['Date'] = pd.to_datetime(merged_df['Date'], format="%B %d, %Y %I:%M %p")

In [120]:
merged_df['Date'].isna().sum()

0

In [121]:
merged_df = merged_df.sort_values('Date', ascending=False).reset_index(drop=True)

In [122]:
merged_df.head()

Unnamed: 0,Headline,Date
0,PoliticsVenezuela Releases Six Americans to Tr...,2025-01-31 20:39:00
1,BusinessAmericans Don’t Eat Enough Bacon. The ...,2025-01-31 20:00:00
2,"PoliticsVaccine Information, Transgender Refer...",2025-01-31 19:03:00
3,StocksTexas Stock Exchange Startup Asks SEC to...,2025-01-31 18:00:00
4,Review & OutlookOpinion: The Dumbest Trade War...,2025-01-31 17:41:00


In [123]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from typing import Tuple 
device = "cuda:0" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert").to(device)
labels = ["positive", "negative", "neutral"]

def estimate_sentiment(news):
    tokens = tokenizer(news, return_tensors="pt", padding=True).to(device)

    result = model(tokens["input_ids"], attention_mask=tokens["attention_mask"])[
        "logits"
    ]
    result = torch.nn.functional.softmax(torch.sum(result, 0), dim=-1)
    probability = result
    
    return probability.tolist()

In [124]:
def weighted_average(sentiment_probs, weights):
    """
    sentiment_probs: A list or tensor with probabilities for [positive, negative, neutral] classes.
    weights: A list or tensor with corresponding weights for [positive, negative, neutral] classes.
    
    Returns the weighted average sentiment score.
    """
    # Ensure inputs are numpy arrays for calculation
    sentiment_probs = np.array(sentiment_probs)
    weights = np.array(weights)
    
    # Calculate the weighted average
    weighted_avg = np.sum(sentiment_probs * weights)
    
    return weighted_avg

In [125]:
if not os.path.exists('./../data/processed/merged_df.csv'):    
    merged_df['score'] = merged_df['Headline'].apply(lambda x: weighted_average(estimate_sentiment(x), [0.5,-0.5, 0.2]))
    merged_df.to_csv('./../data/processed/merged_df.csv', index=False)
else:
    pass


In [126]:
merged_df.head()

Unnamed: 0,Headline,Date,score
0,PoliticsVenezuela Releases Six Americans to Tr...,2025-01-31 20:39:00,-0.229036
1,BusinessAmericans Don’t Eat Enough Bacon. The ...,2025-01-31 20:00:00,0.224647
2,"PoliticsVaccine Information, Transgender Refer...",2025-01-31 19:03:00,-0.332774
3,StocksTexas Stock Exchange Startup Asks SEC to...,2025-01-31 18:00:00,0.414284
4,Review & OutlookOpinion: The Dumbest Trade War...,2025-01-31 17:41:00,-0.182685


In [127]:
df_news = merged_df

In [128]:
df_news.rename(columns={'Date':'Datetime'}, inplace=True)

In [129]:
df_news['Date'] = df_news['Datetime'].dt.date
df_news['time'] = df_news['Datetime'].dt.time

In [130]:
df_news['after_hours'] = df_news['time'].apply(lambda x: x > pd.to_datetime('16:00:00').time())

In [131]:
# findout weekends and holidays
from pandas.tseries.holiday import USFederalHolidayCalendar

cal = USFederalHolidayCalendar()
holidays = cal.holidays(start='2020-01-01', end='2025-01-01').to_pydatetime()

# use holidays and weekends to create a column for market open
df_news['market_open'] = df_news['Date'].apply(lambda x: x not in holidays and x.weekday() < 5)
#name the day of the week
df_news['day_name'] = df_news['Date'].apply(lambda x: x.strftime('%A'))

df_news.head()


Unnamed: 0,Headline,Datetime,score,Date,time,after_hours,market_open,day_name
0,PoliticsVenezuela Releases Six Americans to Tr...,2025-01-31 20:39:00,-0.229036,2025-01-31,20:39:00,True,True,Friday
1,BusinessAmericans Don’t Eat Enough Bacon. The ...,2025-01-31 20:00:00,0.224647,2025-01-31,20:00:00,True,True,Friday
2,"PoliticsVaccine Information, Transgender Refer...",2025-01-31 19:03:00,-0.332774,2025-01-31,19:03:00,True,True,Friday
3,StocksTexas Stock Exchange Startup Asks SEC to...,2025-01-31 18:00:00,0.414284,2025-01-31,18:00:00,True,True,Friday
4,Review & OutlookOpinion: The Dumbest Trade War...,2025-01-31 17:41:00,-0.182685,2025-01-31,17:41:00,True,True,Friday


In [132]:
while df_news[df_news['market_open'] == False].shape[0] > 0:
    df_news['Date'] = df_news.apply(lambda x: x['Date'] + pd.DateOffset(days=1) if x['market_open'] == False else x['Date'], axis=1)
    df_news['market_open'] = df_news['Date'].apply(lambda x: x not in holidays and x.weekday() < 5)

In [133]:
df_news['day_name'] = df_news['Date'].apply(lambda x: x.strftime('%A'))
df_news.drop(columns=['after_hours'], inplace=True)

df_news.head()

Unnamed: 0,Headline,Datetime,score,Date,time,market_open,day_name
0,PoliticsVenezuela Releases Six Americans to Tr...,2025-01-31 20:39:00,-0.229036,2025-01-31,20:39:00,True,Friday
1,BusinessAmericans Don’t Eat Enough Bacon. The ...,2025-01-31 20:00:00,0.224647,2025-01-31,20:00:00,True,Friday
2,"PoliticsVaccine Information, Transgender Refer...",2025-01-31 19:03:00,-0.332774,2025-01-31,19:03:00,True,Friday
3,StocksTexas Stock Exchange Startup Asks SEC to...,2025-01-31 18:00:00,0.414284,2025-01-31,18:00:00,True,Friday
4,Review & OutlookOpinion: The Dumbest Trade War...,2025-01-31 17:41:00,-0.182685,2025-01-31,17:41:00,True,Friday


In [134]:
# findout weekends and holidays
from pandas.tseries.holiday import USFederalHolidayCalendar

cal = USFederalHolidayCalendar()
holidays = cal.holidays(start='2020-01-01', end='2025-01-01').to_pydatetime()

# use holidays and weekends to create a column for market open
df_news['market_open'] = df_news['Date'].apply(lambda x: x not in holidays and x.weekday() < 5)
#name the day of the week
df_news['day_name'] = df_news['Date'].apply(lambda x: x.strftime('%A'))

df_news.head()

Unnamed: 0,Headline,Datetime,score,Date,time,market_open,day_name
0,PoliticsVenezuela Releases Six Americans to Tr...,2025-01-31 20:39:00,-0.229036,2025-01-31,20:39:00,True,Friday
1,BusinessAmericans Don’t Eat Enough Bacon. The ...,2025-01-31 20:00:00,0.224647,2025-01-31,20:00:00,True,Friday
2,"PoliticsVaccine Information, Transgender Refer...",2025-01-31 19:03:00,-0.332774,2025-01-31,19:03:00,True,Friday
3,StocksTexas Stock Exchange Startup Asks SEC to...,2025-01-31 18:00:00,0.414284,2025-01-31,18:00:00,True,Friday
4,Review & OutlookOpinion: The Dumbest Trade War...,2025-01-31 17:41:00,-0.182685,2025-01-31,17:41:00,True,Friday


In [135]:
df_news['after_hours'] = df_news['time'].apply(lambda x: x > pd.to_datetime('16:00:00').time())

In [136]:
df_news.drop(columns=['time'], inplace=True)
# if afte_hours shift the date to the next day
df_news['Date'] = df_news.apply(lambda x: (x['Date'] + pd.DateOffset(days=1)).date() if x['after_hours'] else x['Date'], axis=1)
df_news[df_news['after_hours']]

Unnamed: 0,Headline,Datetime,score,Date,market_open,day_name,after_hours
0,PoliticsVenezuela Releases Six Americans to Tr...,2025-01-31 20:39:00,-0.229036,2025-02-01,True,Friday,True
1,BusinessAmericans Don’t Eat Enough Bacon. The ...,2025-01-31 20:00:00,0.224647,2025-02-01,True,Friday,True
2,"PoliticsVaccine Information, Transgender Refer...",2025-01-31 19:03:00,-0.332774,2025-02-01,True,Friday,True
3,StocksTexas Stock Exchange Startup Asks SEC to...,2025-01-31 18:00:00,0.414284,2025-02-01,True,Friday,True
4,Review & OutlookOpinion: The Dumbest Trade War...,2025-01-31 17:41:00,-0.182685,2025-02-01,True,Friday,True
...,...,...,...,...,...,...,...
3978,WSJ ProPro Bankruptcy Data TablesFirm Retentio...,2021-02-08 17:48:00,0.181562,2021-02-09,True,Monday,True
3989,WSJ ProPro BankruptcyPunch Bowl Secures Ch. 11...,2021-02-04 19:59:00,0.317218,2021-02-05,True,Thursday,True
3990,U.S.Giuliani Associate Faces SEC LawsuitThe co...,2021-02-04 16:59:00,-0.249280,2021-02-05,True,Thursday,True
4005,The AmericasOpinion: How AMLO Is Like Venezuel...,2021-01-31 17:22:00,0.201279,2021-02-02,True,Monday,True


In [137]:
while df_news[df_news['market_open'] == False].shape[0] > 0:
    df_news['Date'] = df_news.apply(lambda x: x['Date'] + pd.DateOffset(days=1) if x['market_open'] == False else x['Date'], axis=1)
    df_news['market_open'] = df_news['Date'].apply(lambda x: x not in holidays and x.weekday() < 5)

In [138]:
df_news['day_name'] = df_news['Date'].apply(lambda x: x.strftime('%A'))
df_news.drop(columns=['after_hours'], inplace=True)

df_news.head()

Unnamed: 0,Headline,Datetime,score,Date,market_open,day_name
0,PoliticsVenezuela Releases Six Americans to Tr...,2025-01-31 20:39:00,-0.229036,2025-02-01,True,Saturday
1,BusinessAmericans Don’t Eat Enough Bacon. The ...,2025-01-31 20:00:00,0.224647,2025-02-01,True,Saturday
2,"PoliticsVaccine Information, Transgender Refer...",2025-01-31 19:03:00,-0.332774,2025-02-01,True,Saturday
3,StocksTexas Stock Exchange Startup Asks SEC to...,2025-01-31 18:00:00,0.414284,2025-02-01,True,Saturday
4,Review & OutlookOpinion: The Dumbest Trade War...,2025-01-31 17:41:00,-0.182685,2025-02-01,True,Saturday


In [139]:
df_news.head()

Unnamed: 0,Headline,Datetime,score,Date,market_open,day_name
0,PoliticsVenezuela Releases Six Americans to Tr...,2025-01-31 20:39:00,-0.229036,2025-02-01,True,Saturday
1,BusinessAmericans Don’t Eat Enough Bacon. The ...,2025-01-31 20:00:00,0.224647,2025-02-01,True,Saturday
2,"PoliticsVaccine Information, Transgender Refer...",2025-01-31 19:03:00,-0.332774,2025-02-01,True,Saturday
3,StocksTexas Stock Exchange Startup Asks SEC to...,2025-01-31 18:00:00,0.414284,2025-02-01,True,Saturday
4,Review & OutlookOpinion: The Dumbest Trade War...,2025-01-31 17:41:00,-0.182685,2025-02-01,True,Saturday


In [140]:
df_news.market_open.value_counts()

market_open
True    4007
Name: count, dtype: int64

In [141]:
df_news['day_name'] = df_news['Date'].apply(lambda x: x.strftime('%A'))

df_news.head()

Unnamed: 0,Headline,Datetime,score,Date,market_open,day_name
0,PoliticsVenezuela Releases Six Americans to Tr...,2025-01-31 20:39:00,-0.229036,2025-02-01,True,Saturday
1,BusinessAmericans Don’t Eat Enough Bacon. The ...,2025-01-31 20:00:00,0.224647,2025-02-01,True,Saturday
2,"PoliticsVaccine Information, Transgender Refer...",2025-01-31 19:03:00,-0.332774,2025-02-01,True,Saturday
3,StocksTexas Stock Exchange Startup Asks SEC to...,2025-01-31 18:00:00,0.414284,2025-02-01,True,Saturday
4,Review & OutlookOpinion: The Dumbest Trade War...,2025-01-31 17:41:00,-0.182685,2025-02-01,True,Saturday


In [146]:
pd.to_datetime(df_news['Date']).dt.day_name()

0       Saturday
1       Saturday
2       Saturday
3       Saturday
4       Saturday
          ...   
4002      Monday
4003      Monday
4004      Monday
4005     Tuesday
4006     Tuesday
Name: Date, Length: 4007, dtype: object