In [1]:
!pip install yfinance transformers newsapi-python wordcloud plotly --quiet

import pandas as pd
import yfinance as yf
import matplotlib.pyplot as plt
import plotly.express as px
import requests
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch.nn.functional as F
from datetime import datetime, timedelta
from wordcloud import WordCloud


In [2]:
# Load FinBERT (tone classification: positive, negative, neutral)
tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")
model = AutoModelForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone")
model.eval()

def get_sentiment_probs(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    probs = F.softmax(outputs.logits, dim=1).squeeze().tolist()
    return probs  # [neg, neu, pos]


In [4]:
# SET YOUR API KEY HERE
newsapi_key = "9c9b055b9586410b82de1d8775c449e7"

stock_name = "Apple"
from_date = datetime.today() - timedelta(days=14)
to_date = datetime.today() - timedelta(days=1)

params = {
    "q": stock_name,
    "language": "en",
    "from": from_date.strftime('%Y-%m-%d'),
    "to": to_date.strftime('%Y-%m-%d'),
    "sortBy": "publishedAt",
    "pageSize": 100,
    "apiKey": newsapi_key
}

news_url = "https://newsapi.org/v2/everything"
response = requests.get(news_url, params=params)

articles = response.json().get("articles", [])
df = pd.DataFrame(articles)[["title", "publishedAt"]].dropna()
df["date"] = pd.to_datetime(df["publishedAt"]).dt.date

print(f"✅ {len(df)} headlines fetched from NewsAPI.")
df.head()


✅ 96 headlines fetched from NewsAPI.


Unnamed: 0,title,publishedAt,date
0,Olivia Munn Says She Contemplated Stepping Awa...,2025-04-12T23:59:42Z,2025-04-12
1,"Apple is now exempt from Trump’s tariffs, here...",2025-04-12T23:55:19Z,2025-04-12
2,Art imitates life as the cast and crew of Myth...,2025-04-12T23:46:20Z,2025-04-12
3,Android Auto May Soon Work with Smart Glasses,2025-04-12T23:39:18Z,2025-04-12
4,"Trump Adds Tariff Exemptions for Smartphones, ...",2025-04-12T23:35:17Z,2025-04-12


In [5]:
# Run FinBERT sentiment scoring
sentiment_scores = df["title"].apply(get_sentiment_probs)
df["Neg_Prob"] = [p[0] for p in sentiment_scores]
df["Neu_Prob"] = [p[1] for p in sentiment_scores]
df["Pos_Prob"] = [p[2] for p in sentiment_scores]

# Assign top sentiment label
df["Sentiment_Label"] = df[["Neg_Prob", "Neu_Prob", "Pos_Prob"]].idxmax(axis=1).str.replace("_Prob", "")
df.head()


Unnamed: 0,title,publishedAt,date,Neg_Prob,Neu_Prob,Pos_Prob,Sentiment_Label
0,Olivia Munn Says She Contemplated Stepping Awa...,2025-04-12T23:59:42Z,2025-04-12,0.999978,1e-06,2.1e-05,Neg
1,"Apple is now exempt from Trump’s tariffs, here...",2025-04-12T23:55:19Z,2025-04-12,0.99852,2.4e-05,0.001456,Neg
2,Art imitates life as the cast and crew of Myth...,2025-04-12T23:46:20Z,2025-04-12,0.99994,1.5e-05,4.5e-05,Neg
3,Android Auto May Soon Work with Smart Glasses,2025-04-12T23:39:18Z,2025-04-12,0.999983,6e-06,1.2e-05,Neg
4,"Trump Adds Tariff Exemptions for Smartphones, ...",2025-04-12T23:35:17Z,2025-04-12,0.973774,0.020707,0.005519,Neg


In [7]:
# Average daily sentiment
daily_sentiment = df.groupby("date").agg({
    "Pos_Prob": "mean",
    "Neg_Prob": "mean"
}).reset_index()

# Use margin to define direction
margin = 0.1

def predict_direction(row):
    diff = row["Pos_Prob"] - row["Neg_Prob"]
    if diff > margin:
        return "up"
    elif diff < -margin:
        return "down"
    else:
        return "neutral"

daily_sentiment["Prediction"] = daily_sentiment.apply(predict_direction, axis=1)
daily_sentiment.head()


Unnamed: 0,date,Pos_Prob,Neg_Prob,Prediction
0,2025-04-12,0.060203,0.839152,down


In [13]:
ticker = "AAPL"

# ✅ Download WITHOUT group_by
stock_data = yf.download(ticker, start=from_date, end=to_date + timedelta(days=2), auto_adjust=True)

# ✅ Flatten MultiIndex if exists
if isinstance(stock_data.columns, pd.MultiIndex):
    stock_data.columns = [col[0] for col in stock_data.columns]

# ✅ Show what we have
print("Flattened Columns:", stock_data.columns.tolist())

# ✅ Reset index and convert date
stock_data = stock_data.reset_index()
stock_data["Date"] = pd.to_datetime(stock_data["Date"]).dt.date

# ✅ Calculate next day's close
stock_data["Next_Close"] = stock_data["Close"].shift(-1)

# ✅ Drop rows with NaNs in required columns
stock_data = stock_data.dropna(subset=["Close", "Next_Close"])

# ✅ Define real movement label
def get_real_movement(row):
    if row["Next_Close"] > row["Close"]:
        return "up"
    elif row["Next_Close"] < row["Close"]:
        return "down"
    else:
        return "neutral"

stock_data["Real_Movement"] = stock_data.apply(get_real_movement, axis=1)

# ✅ Final cleaned price_df
price_df = stock_data[["Date", "Real_Movement"]]
price_df.head()



[*********************100%***********************]  1 of 1 completed

Flattened Columns: ['Close', 'High', 'Low', 'Open', 'Volume']





Unnamed: 0,Date,Real_Movement
0,2025-03-31,up
1,2025-04-01,up
2,2025-04-02,down
3,2025-04-03,down
4,2025-04-04,down


In [14]:
# ✅ Merge sentiment predictions with real market movement
merged = pd.merge(daily_sentiment, price_df, left_on="date", right_on="Date", how="inner")

# ✅ Drop rows with missing predictions or movement
merged = merged.dropna(subset=["Prediction", "Real_Movement"])

# ✅ Debugging info
print("🧪 Merged Shape:", merged.shape)
print("🔹 Prediction Sample:", merged["Prediction"].head().tolist())
print("🔹 Real Movement Sample:", merged["Real_Movement"].head().tolist())
print("📋 Data Types:\n", merged.dtypes)

# ✅ Row-wise comparison to avoid Series mismatch
merged["Correct"] = merged.apply(lambda row: row["Prediction"] == row["Real_Movement"], axis=1)

# ✅ Calculate accuracy (excluding 'neutral')
non_neutral = merged[merged["Prediction"] != "neutral"]
accuracy = non_neutral["Correct"].mean() * 100 if not non_neutral.empty else 0.0

print(f"\n✅ Prediction Accuracy (excluding 'neutral'): {accuracy:.2f}%")
merged[["date", "Prediction", "Real_Movement", "Correct"]]


🧪 Merged Shape: (0, 6)
🔹 Prediction Sample: []
🔹 Real Movement Sample: []
📋 Data Types:
 date              object
Pos_Prob         float64
Neg_Prob         float64
Prediction        object
Date              object
Real_Movement     object
dtype: object

✅ Prediction Accuracy (excluding 'neutral'): 0.00%


Unnamed: 0,date,Prediction,Real_Movement,Correct


In [19]:
# ✅ Check available dates
print("🗓 Sentiment dates:", daily_sentiment["date"].unique())
print("📈 Stock market dates:", price_df["Date"].unique())

# ✅ Keep only dates that exist in both news & stock market
valid_dates = price_df["Date"].unique()
daily_sentiment_filtered = daily_sentiment[daily_sentiment["date"].isin(valid_dates)]

# ✅ Merge on matching trading days
merged = pd.merge(daily_sentiment_filtered, price_df, left_on="date", right_on="Date", how="inner")

# ✅ Drop any residual NaNs
merged = merged.dropna(subset=["Prediction", "Real_Movement"])

# ✅ Row-wise comparison
merged["Correct"] = merged.apply(lambda row: row["Prediction"] == row["Real_Movement"], axis=1)

# ✅ Accuracy excluding 'neutral'
non_neutral = merged[merged["Prediction"] != "neutral"]
accuracy = non_neutral["Correct"].mean() * 100 if not non_neutral.empty else 0.0

# ✅ Final output
print(f"\n✅ Final Accuracy (excluding 'neutral'): {accuracy:.2f}%")
print("✅ Merged on these dates:", merged['date'].tolist())

# ✅ Display result table
merged[["date", "Prediction", "Real_Movement", "Correct"]]


🗓 Sentiment dates: [datetime.date(2025, 4, 12)]
📈 Stock market dates: [datetime.date(2025, 3, 31) datetime.date(2025, 4, 1)
 datetime.date(2025, 4, 2) datetime.date(2025, 4, 3)
 datetime.date(2025, 4, 4) datetime.date(2025, 4, 7)
 datetime.date(2025, 4, 8) datetime.date(2025, 4, 9)
 datetime.date(2025, 4, 10)]

✅ Final Accuracy (excluding 'neutral'): 0.00%
✅ Merged on these dates: []


Unnamed: 0,date,Prediction,Real_Movement,Correct
