In [46]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from tqdm.auto import tqdm

In [47]:
df = pd.read_csv("/content/finbert_sentiment_results.csv")   # must contain 'full_text' and 'url'


In [48]:
df.head()

Unnamed: 0,source,title,url,published,content,full_text,sentiment,confidence
0,CNBC,"Apple's WWDC underwhelms on AI, but software g...",https://www.cnbc.com/2025/06/09/apple-wwdc-und...,"Tue, 10 Jun 2025 18:00:01 GMT",Apple's annual developer conference on Monday ...,"Apple's WWDC underwhelms on AI, but software g...",Neutral,0.9999
1,CNBC,Google overhauls internal learning platform to...,https://www.cnbc.com/2025/06/10/google-grow-le...,"Tue, 10 Jun 2025 18:46:31 GMT",Google is overhauling a popular internal learn...,Google overhauls internal learning platform to...,Neutral,0.8436
2,CNBC,Inflation readings and Treasury auctions to te...,https://www.cnbc.com/2025/06/10/inflation-data...,"Tue, 10 Jun 2025 18:10:22 GMT",Key readings on inflation combined with two cr...,Inflation readings and Treasury auctions to te...,Neutral,0.9379
3,CNBC,California asks judge to quickly block Trump t...,https://www.cnbc.com/2025/06/10/trump-marines-...,"Tue, 10 Jun 2025 18:53:29 GMT",California Gov. Gavin Newsom on Tuesday asked ...,California asks judge to quickly block Trump t...,Neutral,0.9792
4,CNBC,RFK Jr.'s firing of CDC vaccine panel undermin...,https://www.cnbc.com/2025/06/10/rfk-jr-firing-...,"Tue, 10 Jun 2025 18:55:54 GMT",Health and Human Services Secretary Robert F. ...,RFK Jr.'s firing of CDC vaccine panel undermin...,Neutral,0.9806


In [49]:
df["full_text"] = df["full_text"].astype(str)

In [50]:
MODEL_NAME = "abhay2727/Bert-NER-Finance"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model     = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)

In [51]:
import torch
from transformers import pipeline

ner_pipe = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
    device=0 if torch.cuda.is_available() else -1
)

Device set to use cpu


In [52]:
tqdm.pandas(desc="Extracting Entities")
def extract_entities(text):
    ents = ner_pipe(text[:512])
    return [(e["word"], e["entity_group"]) for e in ents]

df["entities"] = df["full_text"].progress_apply(extract_entities)

Extracting Entities:   0%|          | 0/90 [00:00<?, ?it/s]

In [20]:
df.to_csv("news_with_entities.csv", index=False)
print(df[["url", "entities"]].head())

                                                 url  \
0  https://www.cnbc.com/2025/06/09/apple-wwdc-und...   
1  https://www.cnbc.com/2025/06/10/google-grow-le...   
2  https://www.cnbc.com/2025/06/10/inflation-data...   
3  https://www.cnbc.com/2025/06/10/trump-marines-...   
4  https://www.cnbc.com/2025/06/10/rfk-jr-firing-...   

                                            entities  
0  [(Apple, ORG), (W, ORG), (##WD, ORG), (##C, OR...  
1  [(Google, ORG), (Google, ORG), (C, ORG), (##NB...  
2  [(Treasury, ORG), (Treasury, ORG), (Bureau of ...  
3  [(California, LOC), (Trump, PER), (LA, LOC), (...  
4  [(RF, PER), (##K Jr., PER), (CD, ORG), (##C, O...  


In [53]:
import pandas as pd
import yfinance as yf
from datetime import timedelta
from tqdm.auto import tqdm

In [54]:
df.head()

Unnamed: 0,source,title,url,published,content,full_text,sentiment,confidence,entities
0,CNBC,"Apple's WWDC underwhelms on AI, but software g...",https://www.cnbc.com/2025/06/09/apple-wwdc-und...,"Tue, 10 Jun 2025 18:00:01 GMT",Apple's annual developer conference on Monday ...,"Apple's WWDC underwhelms on AI, but software g...",Neutral,0.9999,"[(Apple, ORG), (W, ORG), (##WD, ORG), (##C, OR..."
1,CNBC,Google overhauls internal learning platform to...,https://www.cnbc.com/2025/06/10/google-grow-le...,"Tue, 10 Jun 2025 18:46:31 GMT",Google is overhauling a popular internal learn...,Google overhauls internal learning platform to...,Neutral,0.8436,"[(Google, ORG), (Google, ORG), (C, ORG), (##NB..."
2,CNBC,Inflation readings and Treasury auctions to te...,https://www.cnbc.com/2025/06/10/inflation-data...,"Tue, 10 Jun 2025 18:10:22 GMT",Key readings on inflation combined with two cr...,Inflation readings and Treasury auctions to te...,Neutral,0.9379,"[(Treasury, ORG), (Treasury, ORG), (Bureau of ..."
3,CNBC,California asks judge to quickly block Trump t...,https://www.cnbc.com/2025/06/10/trump-marines-...,"Tue, 10 Jun 2025 18:53:29 GMT",California Gov. Gavin Newsom on Tuesday asked ...,California asks judge to quickly block Trump t...,Neutral,0.9792,"[(California, LOC), (Trump, PER), (LA, LOC), (..."
4,CNBC,RFK Jr.'s firing of CDC vaccine panel undermin...,https://www.cnbc.com/2025/06/10/rfk-jr-firing-...,"Tue, 10 Jun 2025 18:55:54 GMT",Health and Human Services Secretary Robert F. ...,RFK Jr.'s firing of CDC vaccine panel undermin...,Neutral,0.9806,"[(RF, PER), (##K Jr., PER), (CD, ORG), (##C, O..."


In [55]:
data = pd.read_csv("finbert_sentiment_results.csv")
data["published"] = pd.to_datetime(data["published"], errors="coerce")
data = data.dropna(subset=["published"]).reset_index(drop=True)

In [62]:
ASSET_SYMBOL = "^GSPC"
WINDOW_HOURS = 24
CHANGE_THRESHOLD = 0.005  # 0.5%


In [67]:
import yfinance as yf
from datetime import timedelta
import pandas as pd

def label_market_reaction(ts):
    if pd.isnull(ts):
        return "UNKNOWN"
    start = ts.date()
    end   = (ts + timedelta(hours=WINDOW_HOURS)).date()
    data = yf.download(ASSET_SYMBOL, start=start, end=end, interval="1h", progress=False)
    if data.empty or len(data) < 2:
        data = yf.download(ASSET_SYMBOL, start=start, end=end, interval="1d", progress=False)
    if data.empty or len(data) < 2:
        return "UNKNOWN"
    if not data["Close"].empty:
        init = data["Close"].iloc[0].item()
        final = data["Close"].iloc[-1].item()
        change = (final - init) / init
        if change > CHANGE_THRESHOLD:   return "UP"
        if change < -CHANGE_THRESHOLD:  return "DOWN"
        return "NEUTRAL"
    return "UNKNOWN"

In [68]:
from tqdm.auto import tqdm
tqdm.pandas(desc="Labeling Market Reaction")

data["market_label"] = data["published"].progress_apply(label_market_reaction)


Labeling Market Reaction:   0%|          | 0/30 [00:00<?, ?it/s]

In [79]:
print(data["market_label"].value_counts())
data.to_csv("labeled_news_market_data.csv", index=False)

market_label
NEUTRAL    30
Name: count, dtype: int64


In [80]:
entities_df = pd.read_csv("news_with_entities.csv")                # 'url', 'full_text', 'entities'
sentiment_df = pd.read_csv("finbert_sentiment_results.csv")       # 'url', 'sentiment', 'confidence'
market_df    = pd.read_csv("/content/labeled_news_market_data.csv")

In [81]:
df_combined = entities_df.merge(
    sentiment_df[['url', 'sentiment', 'confidence']],
    on='url', how='left'
).merge(
    market_df[['url', 'market_label']],
    on='url', how='left'
)

In [82]:
final_csv = "final_financial_news_dataset.csv"
df_combined.to_csv(final_csv, index=False)
print(f" Final combined dataset saved to '{final_csv}'")

print(df_combined.info())
print(df_combined.head())

 Final combined dataset saved to 'final_financial_news_dataset.csv'
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90 entries, 0 to 89
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   source        90 non-null     object 
 1   title         90 non-null     object 
 2   url           90 non-null     object 
 3   published     30 non-null     object 
 4   content       75 non-null     object 
 5   full_text     90 non-null     object 
 6   sentiment_x   90 non-null     object 
 7   confidence_x  90 non-null     float64
 8   entities      90 non-null     object 
 9   sentiment_y   90 non-null     object 
 10  confidence_y  90 non-null     float64
 11  market_label  30 non-null     object 
dtypes: float64(2), object(10)
memory usage: 8.6+ KB
None
  source                                              title  \
0   CNBC  Apple's WWDC underwhelms on AI, but software g...   
1   CNBC  Google overhauls internal learnin