In [16]:
import pandas as pd
import numpy as np
from transformers import pipeline
import yfinance as yf
import pandas as pd
from datetime import datetime

In [2]:
news_df = pd.read_csv('data/finnhub_news.csv')

In [3]:
news_df.head()

Unnamed: 0,date,symbol,headline,summary,source,url
0,2025-02-20,AAPL,Tracking Ray Dalio's Bridgewater Associates 13...,Bridgewater Associates' Q3 moves: Portfolio va...,SeekingAlpha,https://finnhub.io/api/news?id=f3260216083646e...
1,2025-02-20,AAPL,Madison Sustainable Equity Fund Q4 2024 Invest...,The S&P 500 ended 2024 with a 25% gain for the...,SeekingAlpha,https://finnhub.io/api/news?id=03de51df45863f6...
2,2025-02-20,AAPL,Broadcom Is Threatening AMD's Data Center Oppo...,"Explore Advanced Micro Devices, Inc.'s journey...",SeekingAlpha,https://finnhub.io/api/news?id=a8f77b9fa84d454...
3,2025-02-20,AAPL,Apple's iPhone 16E Strategy,"Nabila Popal, senior research director at the ...",Finnhub,https://finnhub.io/api/news?id=a8bd7172af2bd15...
4,2025-02-20,AAPL,Apple’s iPhone 16e Is a Big Moment for the Com...,Apple’s iPhone 16e Is a Big Moment for the Com...,MarketWatch,https://finnhub.io/api/news?id=1a3f05459e859d2...


In [4]:
pipe = pipeline("text-classification", model="ProsusAI/finbert")

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Device set to use mps:0


In [11]:
# Convert summaries to list first, then process each with FinBERT
summaries = news_df['summary'].fillna('').astype(str).tolist()

# Create lists to store sentiment scores and labels
sentiment_scores = []
sentiment_labels = []

# Process each summary individually
for summary in summaries:
    result = pipe.predict(summary)[0]
    label = result['label']
    score = result['score']
    
    # Store the label
    sentiment_labels.append(label)
    
    # Calculate the modified score based on label
    if label == 'positive':
        sentiment_scores.append(score)
    elif label == 'negative':
        sentiment_scores.append(score * -1)
    else:  # neutral
        sentiment_scores.append(score / 10)

# Add both the scores and labels to the DataFrame
news_df['summary_sentiment'] = sentiment_scores
news_df['sentiment_label'] = sentiment_labels

In [13]:
# Convert headlines to list first, then process each with FinBERT
headlines = news_df['headline'].fillna('').astype(str).tolist()

# Create lists to store sentiment scores and labels
headline_sentiment_scores = []
headline_sentiment_labels = []

# Process each headline individually
for headline in headlines:
    result = pipe.predict(headline)[0]
    label = result['label']
    score = result['score']
    
    # Store the label
    headline_sentiment_labels.append(label)
    
    # Calculate the modified score based on label
    if label == 'positive':
        headline_sentiment_scores.append(score)
    elif label == 'negative':
        headline_sentiment_scores.append(score * -1)
    else:  # neutral
        headline_sentiment_scores.append(score / 10)

# Add both the scores and labels to the DataFrame
news_df['headline_sentiment'] = headline_sentiment_scores
news_df['headline_sentiment_label'] = headline_sentiment_labels

In [20]:
news_df['date']

0       2025-02-20
1       2025-02-20
2       2025-02-20
3       2025-02-20
4       2025-02-20
           ...    
2830    2025-01-17
2831    2025-01-16
2832    2025-01-16
2833    2025-01-16
2834    2025-01-15
Name: date, Length: 2835, dtype: object

In [21]:
# First, let's examine the date distribution in the original DataFrame
print("Date value counts:")
print(news_df['date'].value_counts().head(10))

# Check data types and potential issues
print("\nDate column data type:", news_df['date'].dtype)
print("Number of unique dates:", news_df['date'].nunique())
print("Date range:", news_df['date'].min(), "to", news_df['date'].max())

# Ensure dates are in the correct format
# Convert the date column to datetime if it's not already
news_df['date'] = pd.to_datetime(news_df['date'])

# Filter to our expected date range (2025-01-15 to 2025-02-20)
filtered_news_df = news_df[
    (news_df['date'] >= '2025-01-15') & 
    (news_df['date'] <= '2025-02-20')
]

print("\nAfter filtering:")
print("Number of rows:", len(filtered_news_df))
print("Number of unique dates:", filtered_news_df['date'].nunique())
print("Date range:", filtered_news_df['date'].min(), "to", filtered_news_df['date'].max())

# Now create daily sentiment with the filtered data
daily_sentiment_df = filtered_news_df.groupby(filtered_news_df['date'].dt.date).agg({
    'summary_sentiment': 'mean',
    'headline_sentiment': 'mean'
}).reset_index()

# Rename columns for clarity
daily_sentiment_df = daily_sentiment_df.rename(columns={
    'summary_sentiment': 'avg_summary_sentiment',
    'headline_sentiment': 'avg_headline_sentiment'
})

# Calculate overall average sentiment
daily_sentiment_df['avg_overall_sentiment'] = (
    daily_sentiment_df['avg_summary_sentiment'] + 
    daily_sentiment_df['avg_headline_sentiment']
) / 2

print("\nDaily sentiment DataFrame:")
print("Shape:", daily_sentiment_df.shape)
print(daily_sentiment_df.head())

Date value counts:
date
2025-02-05    194
2025-02-19    181
2025-02-10    169
2025-02-18    161
2025-02-14    159
2025-02-13    155
2025-02-11    144
2025-02-20    141
2025-02-07    139
2025-01-15    136
Name: count, dtype: int64

Date column data type: object
Number of unique dates: 41
Date range: 2020-07-22 to 2025-02-20

After filtering:
Number of rows: 2828
Number of unique dates: 37
Date range: 2025-01-15 00:00:00 to 2025-02-20 00:00:00

Daily sentiment DataFrame:
Shape: (37, 4)
         date  avg_summary_sentiment  avg_headline_sentiment  \
0  2025-01-15               0.305992                0.243975   
1  2025-01-16               0.137057                0.111931   
2  2025-01-17               0.023022               -0.003805   
3  2025-01-18               0.252645                0.046424   
4  2025-01-19               0.200463                0.301068   

   avg_overall_sentiment  
0               0.274983  
1               0.124494  
2               0.009608  
3               0.

In [17]:
start_date = "2025-01-01"
end_date = "2025-02-20"

# Download VIX data
vix_data = yf.download("^VIX", start=start_date, end=end_date)

# Extract needed columns
vix_df = vix_data[["Open", "High", "Low", "Close", "Volume"]].copy()

# Convert index to a date column and reset index
vix_df["date"] = vix_df.index.date
vix_df = vix_df.reset_index(drop=True)

# Reorder columns to have date first
vix_df = vix_df[["date", "Open", "High", "Low", "Close", "Volume"]]

# Print the first few rows to verify
print(vix_df.head())

YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  1 of 1 completed

Price         date       Open       High        Low      Close Volume
Ticker                   ^VIX       ^VIX       ^VIX       ^VIX   ^VIX
0       2025-01-02  17.209999  19.500000  16.959999  17.930000      0
1       2025-01-03  17.660000  17.940001  16.110001  16.129999      0
2       2025-01-06  16.770000  16.870001  15.710000  16.040001      0
3       2025-01-07  16.480000  18.900000  15.790000  17.820000      0
4       2025-01-08  17.910000  19.500000  17.370001  17.700001      0



