Abdikarim Awil Ali                                                  
abdal16@student.sdu.dk
# A. S&P 500 and financial news

Loading and inspecting the dataset

In [27]:
import pandas as pd 

df = pd.read_csv('data/sp500_headlines_2008_2024.csv')
df_copy = df.copy()

# making sure Date column is in datetime format
df_copy['Date'] = pd.to_datetime(df_copy['Date'])
df_copy = df_copy.sort_values("Date")

print(df_copy.columns)
df_copy.info()
df_copy.head()


Index(['Title', 'Date', 'CP'], dtype='object')
<class 'pandas.core.frame.DataFrame'>
Index: 19127 entries, 0 to 19126
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Title   19127 non-null  object        
 1   Date    19127 non-null  datetime64[ns]
 2   CP      19127 non-null  float64       
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 597.7+ KB


Unnamed: 0,Title,Date,CP
0,"JPMorgan Predicts 2008 Will Be ""Nothing But Net""",2008-01-02,1447.16
1,Dow Tallies Biggest First-session-of-year Poin...,2008-01-02,1447.16
2,2008 predictions for the S&P 500,2008-01-02,1447.16
3,"U.S. Stocks Higher After Economic Data, Monsan...",2008-01-03,1447.16
4,U.S. Stocks Climb As Hopes Increase For More F...,2008-01-07,1416.18


I want to look at the next days market movement. 

I will give a 1 if the market went up the next day and 0 if not 

In [28]:
df_copy["CP_next_day"] = df_copy["CP"].shift(-1)

df_copy["target_up"] = (df_copy["CP_next_day"] > df_copy["CP"]).astype(int)

df_copy = df_copy.dropna(subset=["CP_next_day"])

Since my prediction target is daily then I must operate at a daily level.

Therefor I am aggregating the headlines to per day.

In [29]:
daily_df = (
    df_copy
    .groupby("Date")
    .agg({
        "Title": " ".join,
        "CP": "first",
        "CP_next_day": "first",
        "target_up": "first"
    })
    .reset_index()
)

desired_columns = [ "Title","Date", "CP", "CP_next_day", "target_up"]
daily_df = daily_df[desired_columns]

daily_df.head()


Unnamed: 0,Title,Date,CP,CP_next_day,target_up
0,"JPMorgan Predicts 2008 Will Be ""Nothing But Ne...",2008-01-02,1447.16,1447.16,0
1,"U.S. Stocks Higher After Economic Data, Monsan...",2008-01-03,1447.16,1416.18,0
2,U.S. Stocks Climb As Hopes Increase For More F...,2008-01-07,1416.18,1409.13,0
3,How Investing in Intangibles -- Like Employee ...,2008-01-09,1409.13,1409.13,0
4,U.S. Stocks Zigzag Higher As Bernanke Speech S...,2008-01-10,1420.33,1310.5,0


The next step is to preprocessing the text 

In [30]:
import spacy

nlp = spacy.load("en_core_web_sm")

sample_text = daily_df.loc[0, "Title"]
doc = nlp(sample_text)

[token.text.lower() for token in doc[:20]]

['jpmorgan',
 'predicts',
 '2008',
 'will',
 'be',
 '"',
 'nothing',
 'but',
 'net',
 '"',
 'dow',
 'tallies',
 'biggest',
 'first',
 '-',
 'session',
 '-',
 'of',
 '-',
 'year']

I will extract sentiment from financial news headlines using afinn_sentiment()


In [31]:
afinn = pd.read_csv(
    "https://raw.githubusercontent.com/fnielsen/afinn/master/afinn/data/AFINN-111.txt",
    sep="\t",
    header=None,
    names=["word", "score"]
)
afinn_dict = dict(zip(afinn["word"], afinn["score"]))


def afinn_sentiment(text):
    doc = nlp(text)
    sentiment_score = 0
    for token in doc:
        word = token.text.lower()
        if word in afinn_dict:
            sentiment_score += afinn_dict[word]
    return sentiment_score

daily_df["sentiment_score"] = daily_df["Title"].apply(afinn_sentiment)

daily_df[["Title","Date","target_up", "sentiment_score"]].head()



Unnamed: 0,Title,Date,target_up,sentiment_score
0,"JPMorgan Predicts 2008 Will Be ""Nothing But Ne...",2008-01-02,0,-1
1,"U.S. Stocks Higher After Economic Data, Monsan...",2008-01-03,0,0
2,U.S. Stocks Climb As Hopes Increase For More F...,2008-01-07,0,3
3,How Investing in Intangibles -- Like Employee ...,2008-01-09,0,2
4,U.S. Stocks Zigzag Higher As Bernanke Speech S...,2008-01-10,0,0


The bag of words method

In [32]:
from collections import Counter

def bow_tokenize(text):
    d = nlp(text)
    w = [
        token.text.lower()
        for token in d
        if token.is_alpha and not token.is_stop
    ]
    return w

positive_words = {
    "gain", "gains", "up", "rise", "rises", "growth",
    "profit", "profits", "strong", "record", "increase"
}

negative_words = {
    "loss", "losses", "down", "fall", "falls", "decline", "declines",
    "weak", "drop", "drops", "dropped", "crisis", "sink", "lower"
}


def bow_sentiment(text):
    words = bow_tokenize(text)
    counts = Counter(words)

    score = 0
    for word, freq in counts.items():
        if word in positive_words:
            score += freq
        elif word in negative_words:
            score -= freq

    return score



daily_df["bow_sentiment_score"] = daily_df["Title"].apply(bow_sentiment)

daily_df[["Title", "Date", "target_up", "bow_sentiment_score"]].head()

bow_predictions = (daily_df["bow_sentiment_score"] > 0).astype(int)





# The baseline --> past market direction

In [33]:
daily_df["prev_direction"] = daily_df["target_up"].shift(1)

daily_df = daily_df.dropna(subset=["prev_direction"])

daily_df.head()

Unnamed: 0,Title,Date,CP,CP_next_day,target_up,sentiment_score,bow_sentiment_score,prev_direction
1,"U.S. Stocks Higher After Economic Data, Monsan...",2008-01-03,1447.16,1416.18,0,0,0,0.0
2,U.S. Stocks Climb As Hopes Increase For More F...,2008-01-07,1416.18,1409.13,0,3,1,0.0
3,How Investing in Intangibles -- Like Employee ...,2008-01-09,1409.13,1409.13,0,2,0,0.0
4,U.S. Stocks Zigzag Higher As Bernanke Speech S...,2008-01-10,1420.33,1310.5,0,0,0,0.0
5,It's a Black Monday as stock markets tank in e...,2008-01-22,1310.5,1310.5,0,0,-1,0.0


# Baseline predictor

In [34]:
import numpy as np

# Baseline predictions (no text)
y_true = daily_df["target_up"].astype(int)

baseline_predictions = daily_df["prev_direction"].astype(int)

baseline_test = baseline_predictions == y_true
baseline_accuracy = np.mean(baseline_test)

print(f"Baseline accuracy: {baseline_accuracy:.2%}")


# Sentiment-based predictions
sentiment_predictions = (daily_df["sentiment_score"] > 0).astype(int)

sentiment_test = sentiment_predictions == y_true
sentiment_accuracy = np.mean(sentiment_test)

print(f"Sentiment-based accuracy: {sentiment_accuracy:.2%}")


# Bag of Words-based predictions
bow_predictions = bow_predictions.loc[daily_df.index]
bow_test = bow_predictions == y_true
bow_accuracy = np.mean(bow_test)

print(f"BoW sentiment accuracy: {bow_accuracy:.2%}")


print("\nAccuracy comparison")
print("-------------------")
print(f"Baseline (no text): {baseline_accuracy:.2%}")
print(f"With sentiment:    {sentiment_accuracy:.2%}")
print(f"With BoW sentiment: {bow_accuracy:.2%}")


Baseline accuracy: 86.25%
Sentiment-based accuracy: 50.43%
BoW sentiment accuracy: 72.02%

Accuracy comparison
-------------------
Baseline (no text): 86.25%
With sentiment:    50.43%
With BoW sentiment: 72.02%
