In [17]:
pip install requests beautifulsoup4 pandas newspaper3k feedparser lxml


Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl.metadata (11 kB)
Collecting feedparser
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting cssselect>=0.9.2 (from newspaper3k)
  Downloading cssselect-1.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting tldextract>=2.0.1 (from newspaper3k)
  Downloading tldextract-5.3.0-py3-none-any.whl.metadata (11 kB)
Collecting feedfinder2>=0.0.4 (from newspaper3k)
  Downloading feedfinder2-0.0.4.tar.gz (3.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jieba3k>=0.35.1 (from newspaper3k)
  Downloading jieba3k-0.35.1.zip (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m46.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tinysegmenter==0.3 (from newspaper3k)
  Downloading tinysegmenter-0.3.tar.gz (16 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sgmllib3k (from feedp

In [18]:
pip install requests beautifulsoup4 trafilatura feedparser pandas lxml


Collecting trafilatura
  Downloading trafilatura-2.0.0-py3-none-any.whl.metadata (12 kB)
Collecting courlan>=1.3.2 (from trafilatura)
  Downloading courlan-1.3.2-py3-none-any.whl.metadata (17 kB)
Collecting htmldate>=1.9.2 (from trafilatura)
  Downloading htmldate-1.9.3-py3-none-any.whl.metadata (10 kB)
Collecting justext>=3.0.1 (from trafilatura)
  Downloading justext-3.0.2-py2.py3-none-any.whl.metadata (7.3 kB)
Collecting tld>=0.13 (from courlan>=1.3.2->trafilatura)
  Downloading tld-0.13.1-py2.py3-none-any.whl.metadata (10 kB)
Collecting dateparser>=1.1.2 (from htmldate>=1.9.2->trafilatura)
  Downloading dateparser-1.2.1-py3-none-any.whl.metadata (29 kB)
Collecting lxml_html_clean (from lxml[html_clean]>=4.4.2->justext>=3.0.1->trafilatura)
  Downloading lxml_html_clean-0.4.2-py3-none-any.whl.metadata (2.4 kB)
Downloading trafilatura-2.0.0-py3-none-any.whl (132 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.6/132.6 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m


In [24]:
import requests
from bs4 import BeautifulSoup
import feedparser
import trafilatura
import pandas as pd
from datetime import datetime
import time

headers = {'User-Agent': 'Mozilla/5.0'}
all_articles = []

def get_article_text(url):
    try:
        downloaded = trafilatura.fetch_url(url)
        if downloaded:
            return trafilatura.extract(downloaded)
    except:
        return None
    return None

def fetch_reuters(pages=3):
    print("Fetching Reuters...")
    for page in range(1, pages+1):
        url = f"https://www.reuters.com/markets/?view=page&page={page}"
        soup = BeautifulSoup(requests.get(url, headers=headers).text, "lxml")
        articles = soup.find_all("article")

        for a in articles:
            h3 = a.find("h3")
            link = a.find("a", href=True)
            if h3 and link:
                title = h3.text.strip()
                full_url = "https://www.reuters.com" + link['href']
                all_articles.append({
                    'source': 'Reuters',
                    'title': title,
                    'url': full_url,
                    'published': None,
                    'content': get_article_text(full_url)
                })

def fetch_cnbc():
    print("Fetching CNBC...")
    rss_url = "https://www.cnbc.com/id/100003114/device/rss/rss.html"
    feed = feedparser.parse(rss_url)
    for entry in feed.entries[:50]:
        all_articles.append({
            'source': 'CNBC',
            'title': entry.title,
            'url': entry.link,
            'published': entry.get('published', None),
            'content': get_article_text(entry.link)
        })

def fetch_investing(pages=3):
    print("Fetching Investing.com...")
    for page in range(1, pages+1):
        url = f"https://www.investing.com/news/economy/{page}"
        soup = BeautifulSoup(requests.get(url, headers=headers).text, "lxml")
        news_list = soup.select("div.textDiv a.title")

        for news in news_list:
            title = news.text.strip()
            link = "https://www.investing.com" + news["href"]
            all_articles.append({
                'source': 'Investing.com',
                'title': title,
                'url': link,
                'published': None,
                'content': get_article_text(link)
            })

def fetch_yahoo_finance():
    print("Fetching Yahoo Finance...")
    url = "https://finance.yahoo.com/"
    soup = BeautifulSoup(requests.get(url, headers=headers).text, "lxml")
    seen = set()
    for a in soup.find_all('a', href=True):
        href = a['href']
        if '/news/' in href and href not in seen:
            seen.add(href)
            title = a.text.strip()
            full_url = 'https://finance.yahoo.com' + href
            all_articles.append({
                'source': 'Yahoo Finance',
                'title': title,
                'url': full_url,
                'published': None,
                'content': get_article_text(full_url)
            })
            if len(seen) >= 50:
                break

def fetch_marketwatch():
    print("Fetching MarketWatch...")
    url = "https://www.marketwatch.com/"
    soup = BeautifulSoup(requests.get(url, headers=headers).text, "lxml")
    for a in soup.select('a.article__headline')[:50]:
        title = a.text.strip()
        link = a.get('href')
        if link and title:
            all_articles.append({
                'source': 'MarketWatch',
                'title': title,
                'url': link,
                'published': None,
                'content': get_article_text(link)
            })

def fetch_forbes():
    print("Fetching Forbes...")
    url = "https://www.forbes.com/business/"
    soup = BeautifulSoup(requests.get(url, headers=headers).text, "lxml")
    articles = soup.select('a[data-ga-track]')
    count = 0
    for a in articles:
        link = a.get('href')
        title = a.text.strip()
        if link and title and "https://www.forbes.com/sites/" in link:
            all_articles.append({
                'source': 'Forbes',
                'title': title,
                'url': link,
                'published': None,
                'content': get_article_text(link)
            })
            count += 1
        if count >= 50:
            break

def fetch_business_insider():
    print("Fetching Business Insider...")
    url = "https://www.businessinsider.com/"
    soup = BeautifulSoup(requests.get(url, headers=headers).text, "lxml")
    articles = soup.select('a[data-analytics-link]')
    seen = set()
    for a in articles:
        link = a.get('href')
        title = a.text.strip()
        if link and title and "/article" in link and link not in seen:
            full_url = "https://www.businessinsider.com" + link if link.startswith('/') else link
            seen.add(full_url)
            all_articles.append({
                'source': 'Business Insider',
                'title': title,
                'url': full_url,
                'published': None,
                'content': get_article_text(full_url)
            })
        if len(seen) >= 50:
            break

# Run all scrapers
print("Starting enhanced scraping...")
fetch_reuters()
fetch_cnbc()
fetch_investing()
fetch_yahoo_finance()
fetch_marketwatch()
fetch_forbes()
fetch_business_insider()

# Save to CSV
df = pd.DataFrame(all_articles)
df.drop_duplicates(subset='url', inplace=True)

now = datetime.now().strftime("%Y-%m-%d_%H-%M")
csv_name = f"financial_news_{now}.csv"
df.to_csv(f"csv_name", index=False)

# Show summary
df.info(), csv_name


Starting enhanced scraping...
Fetching Reuters...
Fetching CNBC...
Fetching Investing.com...
Fetching Yahoo Finance...


ERROR:trafilatura.downloads:download error: https://finance.yahoo.comhttps://www.yahoo.com/news/ HTTPSConnectionPool(host='finance.yahoo.comhttps', port=443): Max retries exceeded with url: //www.yahoo.com/news/ (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7fe5b7003690>: Failed to resolve 'finance.yahoo.comhttps' ([Errno -2] Name or service not known)"))
ERROR:trafilatura.downloads:download error: https://finance.yahoo.comhttps://www.yahoo.com/news/us/ HTTPSConnectionPool(host='finance.yahoo.comhttps', port=443): Max retries exceeded with url: //www.yahoo.com/news/us/ (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7fe5b5d0d9d0>: Failed to resolve 'finance.yahoo.comhttps' ([Errno -2] Name or service not known)"))
ERROR:trafilatura.downloads:download error: https://finance.yahoo.comhttps://www.yahoo.com/news/politics/ HTTPSConnectionPool(host='finance.yahoo.comhttps', port=443): Max retries exceeded with url: //www.yahoo

Fetching MarketWatch...
Fetching Forbes...


ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://www.forbes.com/sites/kenny-rivera/


Fetching Business Insider...
<class 'pandas.core.frame.DataFrame'>
Index: 115 entries, 0 to 117
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   source     115 non-null    object
 1   title      115 non-null    object
 2   url        115 non-null    object
 3   published  30 non-null     object
 4   content    75 non-null     object
dtypes: object(5)
memory usage: 5.4+ KB


(None, 'financial_news_2025-06-10_19-00.csv')

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 117 entries, 0 to 119
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   source     117 non-null    object
 1   title      117 non-null    object
 2   url        117 non-null    object
 3   published  30 non-null     object
 4   content    76 non-null     object
dtypes: object(5)
memory usage: 5.5+ KB


In [25]:
df_clean = df.dropna(subset=['content'])


df_clean['content'] = df_clean['content'].str.strip()
df_clean['title'] = df_clean['title'].str.strip()


df_clean = df_clean.drop_duplicates(subset=['url'])
df_clean = df_clean.drop_duplicates(subset=['title'])

df_clean['content'] = df_clean['content'].apply(lambda x: x.encode('ascii', errors='ignore').decode())


df_clean.reset_index(drop=True, inplace=True)


print(df_clean.info())
print(df_clean[['source', 'title']].head())


clean_csv = f"/mnt/data/cleaned_news_{datetime.now().strftime('%Y-%m-%d_%H-%M')}.csv"



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75 entries, 0 to 74
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   source     75 non-null     object
 1   title      75 non-null     object
 2   url        75 non-null     object
 3   published  30 non-null     object
 4   content    75 non-null     object
dtypes: object(5)
memory usage: 3.1+ KB
None
  source                                              title
0   CNBC  Apple's WWDC underwhelms on AI, but software g...
1   CNBC  Google overhauls internal learning platform to...
2   CNBC  Inflation readings and Treasury auctions to te...
3   CNBC  California asks judge to quickly block Trump t...
4   CNBC  RFK Jr.'s firing of CDC vaccine panel undermin...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['content'] = df_clean['content'].str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['title'] = df_clean['title'].str.strip()


In [26]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75 entries, 0 to 74
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   source     75 non-null     object
 1   title      75 non-null     object
 2   url        75 non-null     object
 3   published  30 non-null     object
 4   content    75 non-null     object
dtypes: object(5)
memory usage: 3.1+ KB


FinBERT Sentiment Classification on Financial News

In [2]:
pip install transformers accelerate datasets


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.wh

In [6]:
!pip install -U transformers




In [9]:
pip install torch




In [11]:
pip install --upgrade "transformers[torch]>=4.5.0"




In [13]:
import transformers
print(transformers.__version__)  # should be >= 4.5.0

from transformers import pipeline
sent = pipeline("sentiment-analysis", model="yiyanghkust/finbert-tone")
print(sent("The Federal Reserve announced a rate cut today."))


4.52.4


ImportError: cannot import name 'pipeline' from 'transformers' (/usr/local/lib/python3.11/dist-packages/transformers/__init__.py)

In [None]:
def get_sentiment(text):
    try:
        result = pipe(text[:512])[0]
        return result["label"], round(result["score"], 4)
    except Exception as e:
        print(f"Error: {e}")
        return "NEUTRAL", 0.0


In [None]:
sample_text = df["full_text"].iloc[0]
print(pipe(sample_text[:512]))


In [51]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline
import torch
import pandas as pd
from tqdm import tqdm

model_name = "yiyanghkust/finbert-tone"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)


device = 0 if torch.cuda.is_available() else -1


pipe = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    return_all_scores=False,
    function_to_apply="softmax",
    device=device,
    truncation=True,
    max_length=512,
    batch_size=16,
    top_k=None
)


Device set to use cpu


In [52]:
def preprocess_text(row):
    text = ""
    if pd.notnull(row.get("title")):
        text += row["title"].strip()
    if pd.notnull(row.get("content")):
        text += " " + row["content"].strip()
    return text.replace("\n", " ").replace("  ", " ")

df["full_text"] = df.apply(preprocess_text, axis=1)
df.drop_duplicates(subset="full_text", inplace=True)
df = df[df["full_text"].str.len() > 30]  # Filter too-short entries


In [53]:
tqdm.pandas(desc="Analyzing Sentiment")

def get_sentiment(text):
    try:
        result = pipe(text[:512])[0]
        return result["label"], round(result["score"], 4)
    except Exception:
        return "NEUTRAL", 0.0

# Apply in batches
df[["sentiment", "confidence"]] = df["full_text"].progress_apply(
    lambda x: pd.Series(get_sentiment(x))
)


Analyzing Sentiment: 100%|██████████| 94/94 [00:47<00:00,  1.98it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[["sentiment", "confidence"]] = df["full_text"].progress_apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[["sentiment", "confidence"]] = df["full_text"].progress_apply(


In [54]:
print(df[["source", "title", "sentiment", "confidence"]].head())


  source                                              title sentiment  \
0   CNBC  Apple's WWDC underwhelms on AI, but software g...   NEUTRAL   
1   CNBC  Trump authorizes deploying 2,000 more National...   NEUTRAL   
2   CNBC  China’s rare-earth mineral squeeze puts defens...   NEUTRAL   
3   CNBC  Temu and Shein are pivoting to Europe in face ...   NEUTRAL   
4   CNBC  RFK Jr. removes all members of CDC panel advis...   NEUTRAL   

   confidence  
0         0.0  
1         0.0  
2         0.0  
3         0.0  
4         0.0  


In [41]:
df.head(20)

Unnamed: 0,source,title,url,published,content,full_text,sentiment,confidence,market_label
0,CNBC,"Apple's WWDC underwhelms on AI, but software g...",https://www.cnbc.com/2025/06/09/apple-wwdc-und...,2025-06-09 23:42:58,Apple's annual developer conference on Monday ...,"Apple's WWDC underwhelms on AI, but software g...",NEUTRAL,0.0,UNKNOWN
1,CNBC,"Trump authorizes deploying 2,000 more National...",https://www.cnbc.com/2025/06/09/trump-sued-nat...,2025-06-10 01:06:55,President Donald Trump has authorized deployin...,"Trump authorizes deploying 2,000 more National...",NEUTRAL,0.0,UNKNOWN
2,CNBC,RFK Jr. removes all members of CDC panel advis...,https://www.cnbc.com/2025/06/09/rfk-jr-cdc-pan...,2025-06-09 21:31:52,Health and Human Services Secretary Robert F. ...,RFK Jr. removes all members of CDC panel advis...,NEUTRAL,0.0,UNKNOWN
3,CNBC,Crypto CEO accused of laundering $500 million ...,https://www.cnbc.com/2025/06/09/crypto-russia-...,2025-06-09 20:51:15,Federal prosecutors in Brooklyn have charged t...,Crypto CEO accused of laundering $500 million ...,NEUTRAL,0.0,UNKNOWN
4,CNBC,Disney to pay Comcast $438.7 million to take f...,https://www.cnbc.com/2025/06/09/disney-comcast...,2025-06-09 21:29:43,Disney has agreed to pay Comcast $438.7 millio...,Disney to pay Comcast $438.7 million to take f...,NEUTRAL,0.0,UNKNOWN
5,CNBC,"Trump, CEOs promote savings plans for newborns",https://www.cnbc.com/2025/06/09/trump-accounts...,2025-06-09 21:59:12,Dell Technologies pledged Monday to provide $1...,"Trump, CEOs promote savings plans for newborns...",NEUTRAL,0.0,UNKNOWN
6,CNBC,"FDA approves Merck’s RSV shot for infants, ram...",https://www.cnbc.com/2025/06/09/fda-approves-m...,2025-06-09 20:13:02,The Food and Drug Administration on Monday app...,"FDA approves Merck’s RSV shot for infants, ram...",NEUTRAL,0.0,UNKNOWN
7,CNBC,Photos show Waymo vehicles on fire during LA p...,https://www.cnbc.com/2025/06/09/photos-waymo-f...,2025-06-09 21:51:45,Several Waymo autonomous vehicles were set abl...,Photos show Waymo vehicles on fire during LA p...,NEUTRAL,0.0,UNKNOWN
8,CNBC,China and U.S. set to continue trade talks as ...,https://www.cnbc.com/2025/06/09/china-and-us-t...,2025-06-10 02:15:46,U.S.-China trade talks were set to continue in...,China and U.S. set to continue trade talks as ...,NEUTRAL,0.0,UNKNOWN
9,CNBC,Jim Cramer says Circle Internet stock is 'too ...,https://www.cnbc.com/2025/06/09/jim-cramer-say...,2025-06-09 23:00:17,CNBC's Jim Cramer on Monday examined Circle In...,Jim Cramer says Circle Internet stock is 'too ...,NEUTRAL,0.0,UNKNOWN


In [42]:
df["sentiment"].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
NEUTRAL,30


In [17]:
pip install yfinance ta pandas tqdm


Collecting ta
  Downloading ta-0.11.0.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: ta
  Building wheel for ta (setup.py) ... [?25l[?25hdone
  Created wheel for ta: filename=ta-0.11.0-py3-none-any.whl size=29412 sha256=84facce8c3e48cfd4f6c5881b655e733dbcf89b2610ea967f2da12864402950e
  Stored in directory: /root/.cache/pip/wheels/a1/d7/29/7781cc5eb9a3659d032d7d15bdd0f49d07d2b24fec29f44bc4
Successfully built ta
Installing collected packages: ta
Successfully installed ta-0.11.0


In [18]:
import yfinance as yf
import pandas as pd
from datetime import datetime, timedelta
from tqdm import tqdm

# For this example, let’s assume a single asset like S&P 500
ASSET_SYMBOL = "^GSPC"  # S&P 500 index
WINDOW_HOURS = 24
CHANGE_THRESHOLD = 0.005  # 0.5% up/down


In [26]:
df["published"] = pd.to_datetime(df["published"], errors="coerce")  # Force conversion, invalid → NaT
df = df.dropna(subset=["published"])  # Remove rows with invalid timestamps


In [31]:
def label_market_reaction(news_time, symbol="SPY", threshold=0.01):
    if pd.isnull(news_time):
        return "UNKNOWN"

    try:
        start_date = news_time.date()
        end_date = (news_time + timedelta(days=2)).date()  # Allow wider window

        # Try 1h interval first
        data = yf.download(symbol, start=start_date, end=end_date, interval="1h", progress=False)
        if data.empty or len(data) < 2:
            # Fallback to 1d interval
            data = yf.download(symbol, start=start_date, end=end_date, interval="1d", progress=False)

        if data.empty or len(data) < 2:
            return "UNKNOWN"

        initial_price = data["Close"].iloc[0]
        final_price = data["Close"].iloc[-1]
        change = (final_price - initial_price) / initial_price

        if change > threshold:
            return "UP"
        elif change < -threshold:
            return "DOWN"
        else:
            return "NEUTRAL"
    except Exception as e:
        print(f"Error at {news_time}: {e}")
        return "UNKNOWN"


In [35]:
from tqdm import tqdm
tqdm.pandas(desc="Labeling Market Reaction")

df["market_label"] = df["published"].progress_apply(lambda ts: label_market_reaction(ts))


Labeling Market Reaction:   7%|▋         | 2/30 [00:00<00:01, 15.20it/s]ERROR:yfinance:
1 Failed download:
ERROR:yfinance:['SPY']: YFPricesMissingError('possibly delisted; no price data found  (1h 2025-06-10 -> 2025-06-12)')


Error at 2025-06-09 23:42:58: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().


Labeling Market Reaction:  13%|█▎        | 4/30 [00:00<00:04,  6.32it/s]

Error at 2025-06-09 21:31:52: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().


Labeling Market Reaction:  20%|██        | 6/30 [00:01<00:04,  5.35it/s]

Error at 2025-06-09 20:51:15: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
Error at 2025-06-09 21:29:43: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().


Labeling Market Reaction:  27%|██▋       | 8/30 [00:01<00:03,  5.64it/s]

Error at 2025-06-09 21:59:12: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
Error at 2025-06-09 20:13:02: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().


Labeling Market Reaction:  30%|███       | 9/30 [00:01<00:03,  5.33it/s]ERROR:yfinance:
1 Failed download:
ERROR:yfinance:['SPY']: YFPricesMissingError('possibly delisted; no price data found  (1h 2025-06-10 -> 2025-06-12)')


Error at 2025-06-09 21:51:45: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().


Labeling Market Reaction:  37%|███▋      | 11/30 [00:01<00:03,  5.21it/s]

Error at 2025-06-09 23:00:17: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().


Labeling Market Reaction:  43%|████▎     | 13/30 [00:02<00:04,  4.22it/s]

Error at 2025-06-09 18:18:38: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
Error at 2025-06-09 19:06:48: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().


Labeling Market Reaction:  50%|█████     | 15/30 [00:02<00:02,  5.19it/s]

Error at 2025-06-09 16:33:37: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
Error at 2025-06-09 16:14:29: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().


Labeling Market Reaction:  57%|█████▋    | 17/30 [00:03<00:02,  5.13it/s]

Error at 2025-06-09 19:51:46: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
Error at 2025-06-09 16:48:37: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().


Labeling Market Reaction:  60%|██████    | 18/30 [00:03<00:02,  5.48it/s]

Error at 2025-06-09 20:09:01: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().


Labeling Market Reaction:  63%|██████▎   | 19/30 [00:03<00:02,  5.10it/s]

Error at 2025-06-09 16:25:38: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().


Labeling Market Reaction:  70%|███████   | 21/30 [00:04<00:01,  5.25it/s]

Error at 2025-06-09 18:10:43: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
Error at 2025-06-09 15:38:34: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().


Labeling Market Reaction:  77%|███████▋  | 23/30 [00:04<00:01,  6.94it/s]

Error at 2025-06-09 20:01:18: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
Error at 2025-06-09 19:36:54: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
Error at 2025-06-09 20:56:46: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().


Labeling Market Reaction:  90%|█████████ | 27/30 [00:04<00:00,  8.82it/s]

Error at 2025-06-09 15:39:12: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
Error at 2025-06-09 12:09:01: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
Error at 2025-06-09 17:06:38: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().


Labeling Market Reaction:  97%|█████████▋| 29/30 [00:04<00:00,  7.62it/s]

Error at 2025-06-09 19:40:37: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
Error at 2025-06-09 10:30:01: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().


Labeling Market Reaction: 100%|██████████| 30/30 [00:05<00:00,  5.85it/s]

Error at 2025-06-09 13:30:01: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
Error at 2025-06-09 13:30:01: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().





In [36]:
print(df[["title", "published", "market_label"]].head(20))


                                                title           published  \
0   Apple's WWDC underwhelms on AI, but software g... 2025-06-09 23:42:58   
1   Trump authorizes deploying 2,000 more National... 2025-06-10 01:06:55   
2   RFK Jr. removes all members of CDC panel advis... 2025-06-09 21:31:52   
3   Crypto CEO accused of laundering $500 million ... 2025-06-09 20:51:15   
4   Disney to pay Comcast $438.7 million to take f... 2025-06-09 21:29:43   
5      Trump, CEOs promote savings plans for newborns 2025-06-09 21:59:12   
6   FDA approves Merck’s RSV shot for infants, ram... 2025-06-09 20:13:02   
7   Photos show Waymo vehicles on fire during LA p... 2025-06-09 21:51:45   
8   China and U.S. set to continue trade talks as ... 2025-06-10 02:15:46   
9   Jim Cramer says Circle Internet stock is 'too ... 2025-06-09 23:00:17   
10  Supreme Court just gave DOGE access to Social ... 2025-06-09 18:18:38   
11  Trump, Harvard battle over student visas could... 2025-06-09 19:06:48   

In [38]:
df["market_label"].value_counts()

Unnamed: 0_level_0,count
market_label,Unnamed: 1_level_1
UNKNOWN,30


In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30 entries, 0 to 29
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   source        30 non-null     object        
 1   title         30 non-null     object        
 2   url           30 non-null     object        
 3   published     30 non-null     datetime64[ns]
 4   content       30 non-null     object        
 5   full_text     30 non-null     object        
 6   sentiment     30 non-null     object        
 7   confidence    30 non-null     float64       
 8   market_label  30 non-null     object        
dtypes: datetime64[ns](1), float64(1), object(7)
memory usage: 2.3+ KB
