# Validate Sentiment Analysis

This notebook validates the FinBERT sentiment analysis for financial news.

In [None]:
import os
import sys
from datetime import datetime, timedelta, timezone
from glob import glob

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Add the src directory to the import path.
# The path is resolved to an absolute one so a later change of working
# directory cannot break imports, and the append is skipped when the cell is
# re-run so sys.path does not accumulate duplicate entries.
SRC_DIR = os.path.abspath('../src')
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

# Import local modules
from nlp.finbert_sentiment import FinBERTSentiment
from etl.news_sentiment import compute_news_sentiment, aggregate_sentiment_by_symbol

# Notebook-wide plotting defaults: default figure size and seaborn grid style
plt.rcParams.update({'figure.figsize': (12, 8)})
sns.set_style('whitegrid')

## 1. Test FinBERT Sentiment Analyzer

First, let's test the FinBERT sentiment analyzer on some sample financial news.

In [None]:
# Initialize the FinBERT sentiment analyzer
analyzer = FinBERTSentiment()

# Sample financial news headlines used as a quick smoke test
sample_news = [
    "Apple reports record quarterly revenue, beating analyst expectations.",
    "Microsoft shares plunge after disappointing earnings report.",
    "Google announces new AI features for its search engine.",
    "Amazon faces regulatory scrutiny over antitrust concerns.",
    "Tesla's stock remains volatile amid production challenges."
]

# Analyze sentiment. Each entry of `results` is used below as a dict that
# carries the input under "text" next to the per-label values.
results = analyzer.analyze_with_text(sample_news)

# Display results
for result in results:
    # Build a text-free view instead of popping "text" off the entry, so that
    # printing does not mutate `results` and the loop can be re-run safely.
    probabilities = {key: value for key, value in result.items() if key != "text"}
    text = result["text"]

    # Label = key with the highest value; score = positive minus negative
    sentiment_label = max(probabilities, key=probabilities.get)
    sentiment_score = probabilities.get("positive", 0) - probabilities.get("negative", 0)

    print(f"Text: {text}")
    print(f"Sentiment: {sentiment_label} (score: {sentiment_score:.2f})")
    print(f"Probabilities: {probabilities}")
    print()

In [None]:
# Tabulate the analyzer output and derive, per headline:
#   text            - the headline itself (taken positionally from sample_news)
#   sentiment_score - positive minus negative
#   sentiment_label - whichever of the three label columns is largest
results_df = pd.DataFrame(results).assign(
    text=sample_news,
    sentiment_score=lambda frame: frame["positive"] - frame["negative"],
    sentiment_label=lambda frame: frame[["positive", "neutral", "negative"]].idxmax(axis=1),
)

# Show the table
results_df

In [None]:
# Bar chart of the net sentiment score for each sample headline
fig, ax = plt.subplots(figsize=(12, 6))
positions = range(len(results_df))

ax.bar(positions, results_df["sentiment_score"])
ax.set_xticks(positions)
ax.set_xticklabels(results_df["text"], rotation=45, ha="right")

ax.set_title("Sentiment Scores for Sample News")
ax.set_xlabel("News")
ax.set_ylabel("Sentiment Score (positive - negative)")

# Faint zero line separating net-positive from net-negative headlines
ax.axhline(y=0, color="r", linestyle="-", alpha=0.3)

fig.tight_layout()
plt.show()

In [None]:
# Grouped bar chart: three bars (positive / neutral / negative) per headline
plt.figure(figsize=(12, 6))
x = range(len(results_df))
width = 0.25

# Each series is shifted by one bar width so the three bars sit side by side
for offset, column, label in (
    (-width, "positive", "Positive"),
    (0, "neutral", "Neutral"),
    (width, "negative", "Negative"),
):
    plt.bar([i + offset for i in x], results_df[column], width=width, label=label)

plt.xticks(x, results_df["text"], rotation=45, ha="right")
plt.title("Sentiment Probabilities for Sample News")
plt.xlabel("News")
plt.ylabel("Probability")
plt.legend()
plt.tight_layout()
plt.show()

## 2. Generate Sample News Data

Let's generate some sample news data for testing the news sentiment batch job.

In [None]:
# Generate sample news data: one positive, one negative and one neutral
# article per symbol, in that order.
symbols = ["AAPL", "MSFT", "GOOGL", "AMZN", "META"]


def make_news_item(title, content, source):
    """Build one news record, stamped with the current local time."""
    return {
        "title": title,
        "content": content,
        "published_at": datetime.now().isoformat(),
        "source": source,
    }


sample_news_data = []
for symbol in symbols:
    sample_news_data.extend([
        # Positive news
        make_news_item(
            f"{symbol} reports strong quarterly earnings, beating expectations.",
            f"{symbol} announced today that its quarterly earnings exceeded analyst expectations, driven by strong product sales and growth in services revenue.",
            "Financial Times",
        ),
        # Negative news
        make_news_item(
            f"{symbol} shares drop after disappointing guidance.",
            f"{symbol} shares fell today after the company provided weaker-than-expected guidance for the next quarter, citing macroeconomic headwinds and supply chain challenges.",
            "Wall Street Journal",
        ),
        # Neutral news
        make_news_item(
            f"{symbol} announces new product launch date.",
            f"{symbol} has announced that its new product will be launched next month, as previously scheduled. The company expects the product to be well-received by consumers.",
            "Bloomberg",
        ),
    ])

# Collect the records into a DataFrame and preview it
news_df = pd.DataFrame(sample_news_data)
news_df.head()

In [None]:
# Make sure the target directory for the sample news exists
sample_news_dir = "../data/raw/news/sample"
os.makedirs(sample_news_dir, exist_ok=True)

# Write the sample articles as a Parquet file (row index omitted)
sample_news_path = "/".join((sample_news_dir, "sample_news.parquet"))
news_df.to_parquet(sample_news_path, index=False)

print(f"Saved sample news data to {sample_news_path}")

## 3. Run News Sentiment Batch Job

Now, let's run the news sentiment batch job on the sample news data.

In [None]:
# Settings for the batch job: read the sample Parquet file written above,
# write results under the batch-features directory, and restrict to `symbols`.
batch_job_args = dict(
    source_path=sample_news_path,
    output_path="../data/features/batch/sentiment",
    symbols=symbols,
    batch_size=8,
)

# Run news sentiment batch job
sentiment_df = compute_news_sentiment(**batch_job_args)

# Preview the first rows of the result
sentiment_df.head()

In [None]:
# Aggregate sentiment by symbol
agg_df = aggregate_sentiment_by_symbol(sentiment_df)

# Fail fast with a clear message if the aggregate lacks any column that the
# plotting and merge cells further down read, instead of hitting a KeyError
# in the middle of a figure.
required_columns = {
    "symbol", "sentiment_score", "positive", "neutral", "negative", "mention_count"
}
missing_columns = required_columns - set(agg_df.columns)
assert not missing_columns, (
    f"Aggregated sentiment is missing columns: {sorted(missing_columns)}"
)

# Display the aggregated results
agg_df

In [None]:
# Bar chart of the aggregated sentiment score for each symbol
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x="symbol", y="sentiment_score", data=agg_df, ax=ax)

ax.set_title("Sentiment Scores by Symbol")
ax.set_xlabel("Symbol")
ax.set_ylabel("Sentiment Score (positive - negative)")

# Faint zero line separating net-positive from net-negative symbols
ax.axhline(y=0, color="r", linestyle="-", alpha=0.3)
ax.grid(True)
plt.show()

In [None]:
# Grouped bar chart: three bars (positive / neutral / negative) per symbol
plt.figure(figsize=(12, 6))
x = range(len(agg_df))
width = 0.25

# Each series is shifted by one bar width so the three bars sit side by side
for offset, column, label in (
    (-width, "positive", "Positive"),
    (0, "neutral", "Neutral"),
    (width, "negative", "Negative"),
):
    plt.bar([i + offset for i in x], agg_df[column], width=width, label=label)

plt.xticks(x, agg_df["symbol"])
plt.title("Sentiment Probabilities by Symbol")
plt.xlabel("Symbol")
plt.ylabel("Probability")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Bar chart of the mention count recorded for each symbol
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x="symbol", y="mention_count", data=agg_df, ax=ax)

ax.set_title("Mention Count by Symbol")
ax.set_xlabel("Symbol")
ax.set_ylabel("Mention Count")
ax.grid(True)
plt.show()

## 4. Validate Feast Integration

Let's validate the integration with the Feast feature store.

In [None]:
# Feast is an optional dependency: record whether it can be imported so the
# next cell can skip the feature-store check when it is absent.
try:
    from feast import FeatureStore
except ImportError:
    feast_available = False
    print("Feast is not available. Please install it with 'pip install feast'.")
else:
    feast_available = True

In [None]:
if feast_available:
    # Initialize Feast feature store
    fs = FeatureStore(repo_path="../infra/feast/feature_repo")
    
    # Apply feature definitions
    !cd ../infra/feast/feature_repo && feast apply
    
    # Materialize features
    !cd ../infra/feast/feature_repo && feast materialize-incremental $(date +%Y-%m-%d)
    
    # Create entity DataFrame
    entity_df = pd.DataFrame({
        "symbol": symbols,
        "event_timestamp": [datetime.now()] * len(symbols)
    })
    
    # Get features from Feast
    features = fs.get_online_features(
        features=[
            "news_sentiment:positive",
            "news_sentiment:neutral",
            "news_sentiment:negative",
            "news_sentiment:mention_count"
        ],
        entity_rows=[{"symbol": symbol} for symbol in symbols]
    ).to_df()
    
    # Display features
    print("Features from Feast:")
    display(features)

## 5. Combine Sentiment with Price Data

Let's combine the sentiment data with price data to see if there's any correlation.

In [None]:
# Combine the per-symbol sentiment aggregate with the most recent price data,
# if a processed price file is available.
price_data_path = "../data/processed/training_data.parquet"
if os.path.exists(price_data_path):
    # Load price data; the raw frame is kept under its own name so re-running
    # later steps never works on an already-filtered frame.
    price_raw = pd.read_parquet(price_data_path)

    # Make sure a "date" column exists. When only "timestamp" is present, the
    # rows are also sorted chronologically so that "last" below really picks
    # the most recent close.
    if "date" in price_raw.columns:
        price_dated = price_raw
    elif "timestamp" in price_raw.columns:
        parsed_ts = pd.to_datetime(price_raw["timestamp"])
        price_dated = (
            price_raw
            .assign(timestamp=parsed_ts, date=parsed_ts.dt.date)
            .sort_values("timestamp", kind="stable")
        )
    else:
        price_dated = None

    # Filter to the latest date *for each symbol*. Filtering on the single
    # global maximum would drop every symbol whose last row is older than it.
    if price_dated is None:
        price_latest = price_raw
    else:
        latest_date_per_row = price_dated.groupby("symbol")["date"].transform("max")
        price_latest = price_dated[price_dated["date"] == latest_date_per_row]

    # Group by symbol: last close and total volume on that latest date
    price_df = (
        price_latest
        .groupby("symbol")
        .agg({"close": "last", "volume": "sum"})
        .reset_index()
    )

    # Merge with sentiment data (only symbols present on both sides)
    combined_df = pd.merge(agg_df, price_df, on="symbol", how="inner")

    # Display the combined data
    print("Combined sentiment and price data:")
    display(combined_df)

    if combined_df.empty:
        # Nothing to plot or correlate when no symbol appears in both frames
        print("No symbols are present in both the sentiment and price data.")
    else:
        # Plot sentiment score vs price
        fig, ax = plt.subplots(figsize=(12, 6))
        ax.scatter(combined_df["sentiment_score"], combined_df["close"])

        # Add labels for each point
        for row in combined_df.itertuples(index=False):
            ax.annotate(row.symbol, (row.sentiment_score, row.close))

        ax.set_title("Sentiment Score vs Close Price")
        ax.set_xlabel("Sentiment Score")
        ax.set_ylabel("Close Price")
        ax.grid(True)
        plt.show()

        # Calculate correlation; it is undefined for fewer than two points
        if len(combined_df) >= 2:
            correlation = combined_df["sentiment_score"].corr(combined_df["close"])
            print(f"Correlation between sentiment score and close price: {correlation:.4f}")
        else:
            print("Need at least two symbols to compute a correlation.")
else:
    print(f"Price data not found at {price_data_path}")

## 6. Summary and Next Steps

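This notebook exercises the FinBERT sentiment analysis for financial news end to end: the sentiment analyzer scores sample headlines, and the batch job extracts symbols from the sample articles and computes sentiment scores per symbol. Treat the validation as successful only if the outputs above match expectations — for example, the analyzer should label the clearly positive and negative sample headlines correctly, and every sample symbol should appear in the aggregated results.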

Next steps:
1. Collect real financial news data from sources like NewsAPI, Bloomberg, or Reuters
2. Run the sentiment analysis batch job on a regular schedule (e.g., daily)
3. Incorporate sentiment features into the time-series model
4. Analyze the correlation between sentiment and price movements over time
5. Develop trading strategies based on sentiment signals