In [2]:
import yfinance
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import finhub_scraper as scraper
from finhub_scraper import fetch_company_news_finhub
import transformers
import os
import requests
import pandas as pd
from datetime import datetime, timedelta
from typing import Optional
import finnhub

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


Goal: recreate the approach described in this paper: https://pmc.ncbi.nlm.nih.gov/articles/PMC9955765/

In [3]:
# The Finnhub API key is read from the FINNHUB_API_KEY environment variable and must
# never be hard-coded in the notebook. Configure it outside the notebook (shell export,
# a .env loader, or a secrets manager) before starting the kernel.
# NOTE(review): a literal key used to be assigned on this line; treat that key as
# leaked and revoke/rotate it with the provider.
if not os.environ.get("FINNHUB_API_KEY"):
    print("FINNHUB_API_KEY is not set; configure it before fetching Finnhub news.")

In [16]:
class dataFetcher:
    """Download price history and company news for a single stock ticker.

    Prices are requested through yfinance and news through the Finnhub client.
    The Finnhub key is read from the ``FINNHUB_API_KEY`` environment variable.
    """

    # Text fields copied from every Finnhub news item into the returned frame.
    _NEWS_TEXT_FIELDS = ("headline", "url", "summary", "source", "related")

    def __init__(self, ticker: str):
        self.ticker = ticker
        # Most recent price frame returned by fetch_financial_data (None until then).
        self.data = None

    def fetch_financial_data(self, start_date=None, end_date=None,
                             period=None, interval: str = "1d") -> pd.DataFrame:
        """Fetch historical price data for ``self.ticker``.

        Two query modes are supported by one method (previously two methods with
        the same name existed and the second silently replaced the first):

        * date range: ``fetch_financial_data("2024-10-01", "2024-10-31")``
        * look-back period: ``fetch_financial_data(period="1mo")``

        Args:
            start_date: Range start (e.g. 'YYYY-MM-DD'); takes precedence over ``period``.
            end_date: Range end (e.g. 'YYYY-MM-DD').
            period: yfinance look-back period, used only when no dates are given.
            interval: Bar size forwarded to yfinance.

        Returns:
            The frame returned by ``yfinance.Ticker(...).history``; an empty
            DataFrame if the request raises. The frame is also stored on
            ``self.data``.

        Raises:
            ValueError: if neither a date range nor a period is supplied.
        """
        if start_date is None and end_date is None and period is None:
            raise ValueError("Provide start_date/end_date or period.")

        if start_date is not None or end_date is not None:
            query = {"start": start_date, "end": end_date}
        else:
            query = {"period": period}

        try:
            # A single request: Ticker.history returns flat OHLCV columns for one ticker.
            hist = yfinance.Ticker(self.ticker).history(interval=interval, **query)
        except Exception as e:
            print(f"Error fetching data for {self.ticker}: {e}")
            return pd.DataFrame()

        self.data = hist
        return hist

    @staticmethod
    def plot_financial_data(data: pd.DataFrame, ticker: str):
        """Plot the 'Close' column of ``data`` against its index.

        Declared as a static method so that both ``dataFetcher.plot_financial_data(df, t)``
        and ``fetcher.plot_financial_data(df, t)`` work (the method takes no ``self``).
        """
        if data.empty:
            print("No data to plot.")
            return

        plt.figure(figsize=(10, 5))
        sns.lineplot(data=data, x=data.index, y='Close')
        plt.title(f"{ticker} Closing Prices")
        plt.xlabel("Date")
        plt.ylabel("Closing Price (USD)")
        plt.grid(True)
        plt.show()

    def fetch_company_news_finhub(self, _from_date: str, _end_date: str) -> pd.DataFrame:
        """Fetch company news from Finnhub for ``self.ticker`` between two dates.

        Args:
            _from_date: Start date in 'YYYY-MM-DD' format.
            _end_date: End date in 'YYYY-MM-DD' format.

        Returns:
            pandas.DataFrame with columns
            ['published_date', 'headline', 'url', 'summary', 'source', 'related'],
            sorted newest first. ``published_date`` is a 'YYYY-MM-DD HH:MM:SS'
            string (or None when the item has no timestamp); missing text fields
            become empty strings.

        Raises:
            ValueError: if the FINNHUB_API_KEY environment variable is not set.
            Exception: any error raised by the Finnhub client is printed and re-raised.
        """
        api_key = os.getenv("FINNHUB_API_KEY")
        if not api_key:
            raise ValueError("FINNHUB_API_KEY environment variable is not set.")

        symbol = self.ticker.upper()
        client = finnhub.Client(api_key=api_key)
        columns = ["published_date", *self._NEWS_TEXT_FIELDS]
        records = []

        try:
            result = client.company_news(symbol, _from=_from_date, to=_end_date)
            for item in result or []:
                stamp = item.get("datetime")
                # fromtimestamp renders the value in the machine's local time zone.
                # NOTE(review): presumably `datetime` is a UNIX timestamp in seconds - confirm.
                published = (
                    datetime.fromtimestamp(stamp).strftime("%Y-%m-%d %H:%M:%S")
                    if stamp else None
                )
                record = {"published_date": published}
                for field in self._NEWS_TEXT_FIELDS:
                    # .get() tolerates items that omit a field; None becomes "".
                    record[field] = item.get(field) or ""
                records.append(record)
        except Exception as e:
            print(f"Error fetching Finnhub news for {symbol}: {e}")
            raise

        df = pd.DataFrame(records, columns=columns)
        # Newest first; rows without a timestamp sort last.
        return df.sort_values(by="published_date", ascending=False).reset_index(drop=True)






In [5]:
def calculate_rsi(data: pd.DataFrame, rsi_period: int = 14) -> pd.Series:
    """Return the Relative Strength Index of ``data['Close']`` for every row.

    Up-moves and down-moves are averaged with a trailing simple moving average
    of ``rsi_period`` rows (shorter windows are used at the start of the series).
    Rows where both averages are zero, including the first row, yield NaN.
    """
    change = data["Close"].diff()

    # Split the change into non-negative up-moves and down-moves; the leading
    # NaN produced by diff() counts as "no move".
    up_moves = change.clip(lower=0).fillna(0)
    down_moves = change.clip(upper=0).abs().fillna(0)

    def _trailing_mean(moves: pd.Series) -> pd.Series:
        return moves.rolling(window=rsi_period, min_periods=1).mean()

    relative_strength = _trailing_mean(up_moves) / _trailing_mean(down_moves)
    return 100 - (100 / (1 + relative_strength))


In [20]:
SAMPLE_TICKERS = ["AAPL", "MSFT", "GOOGL", "AMZN", "TSLA"]

fetcher = dataFetcher("MSFT")

# Daily prices for October 2024, downloaded directly with yfinance for the
# fetcher's ticker (fetcher.fetch_financial_data is not used here).
sample_data = yfinance.download(
    fetcher.ticker,
    start="2024-10-01",
    end="2024-10-31",
)
# TODO: add an RSI column once prices load: sample_data['RSI'] = calculate_rsi(sample_data)

# Finnhub news for the same ticker and date window.
article_data = fetcher.fetch_company_news_finhub(
    _from_date="2024-10-01",
    _end_date="2024-10-31",
)

# Goal: merge the two datasets so the stock price at each date carries the
# sentiment score for that date, or for whichever date is closest.
article_data.head()

[*********************100%***********************]  1 of 1 completed

1 Failed download:
['MSFT']: YFRateLimitError('Too Many Requests. Rate limited. Try after a while.')


Unnamed: 0,published_date,headline,url,summary,source,related
0,2024-10-31 16:58:00,"Amazon.com Inc., Intel share gains contribute ...",https://finnhub.io/api/news?id=3efab5104e488d2...,"Amazon.com Inc., Intel share gains contribute ...",MarketWatch,MSFT
1,2024-10-31 16:30:07,Microsoft Stock Slides 5.6% In October Despite...,https://finnhub.io/api/news?id=0adb52f6a636a64...,"Enterprise software juggernaut Microsoft, like...",Yahoo,MSFT
2,2024-10-31 15:56:54,Amazon CEO Says GenAI Is Growing Three Times F...,https://finnhub.io/api/news?id=51947d0de87a25b...,The growth of generative artificial intelligen...,Yahoo,MSFT
3,2024-10-31 15:30:48,"AI Stocks: Tech Giants, Cloud Titans Face 'Sho...",https://finnhub.io/api/news?id=6a86fd373ba2617...,"AI Stocks: Tech Giants, Cloud Titans Face 'Sho...",DowJones,MSFT
4,2024-10-31 14:28:00,ChatGPT's New Web Search Tools Bring Live Upda...,https://finnhub.io/api/news?id=ef778f42ad9ec3f...,OpenAI expands ChatGPT with new web search too...,Yahoo,MSFT


In [21]:
# Sanity check: (rows, columns) of the downloaded price frame; zero rows means
# the download returned no data.
sample_data.shape

(0, 6)

In [11]:
def set_article_index_to_datetime(article_df: pd.DataFrame, date_col: str = "published_date") -> pd.DataFrame:
    """Return a copy of ``article_df`` indexed by a sorted, tz-naive DatetimeIndex.

    Args:
        article_df: Article frame; may be None or empty.
        date_col: Column holding the publication timestamp. When the column is
            absent, the existing index is parsed instead.

    Returns:
        A new frame whose index holds the parsed timestamps in ascending order.
        Rows whose timestamp cannot be parsed are dropped. Time-zone-aware
        values are converted to UTC and the zone is removed; naive values keep
        their wall-clock time. An empty DataFrame is returned for None/empty input.
    """
    if article_df is None or article_df.empty:
        return pd.DataFrame()

    df = article_df.copy()
    if date_col in df.columns:
        # utc=True gives one consistent dtype even when rows carry different
        # UTC offsets; tz_convert(None) then drops the zone explicitly instead
        # of relying on a swallowed exception.
        stamps = pd.to_datetime(df[date_col], errors="coerce", utc=True).dt.tz_convert(None)
        df[date_col] = stamps
        df = df.loc[stamps.notna()].set_index(date_col)
    else:
        df.index = pd.to_datetime(df.index, errors="coerce", utc=True).tz_convert(None)
        df = df.loc[~df.index.isna()].copy()

    return df.sort_index()

In [None]:
def combine_headlines_with_financial(financial_df: pd.DataFrame,
                                    article_df: pd.DataFrame,
                                    date_col: str = "published_date",
                                    headline_col: str = "headline",
                                    max_days_diff: int = 1) -> pd.DataFrame:
    """Attach each price row to the headlines of the nearest article date.

    Articles are grouped per calendar day (headlines joined with " || ") and
    matched to the price rows with ``pd.merge_asof(direction="nearest")``.

    Args:
        financial_df: Price frame indexed by date; may be None or empty.
        article_df: Article frame passed to ``set_article_index_to_datetime``.
        date_col: Article timestamp column.
        headline_col: Article headline column (treated as empty text if absent).
        max_days_diff: Maximum distance, in days, between a price date and the
            article date it may be matched with.

    Returns:
        The price rows sorted by date on a tz-naive index, with a ``headlines``
        column (missing where no article date lies within the tolerance) and,
        when articles exist, a ``date`` column holding the matched article date.
        An empty DataFrame is returned when ``financial_df`` is None or empty.
    """
    if financial_df is None or financial_df.empty:
        return pd.DataFrame()

    fin = financial_df.copy()
    fin.index = pd.to_datetime(fin.index)
    # Article dates are tz-naive, and merge_asof refuses keys of different types,
    # so a tz-aware price index must be made naive first. tz_localize(None) keeps
    # the wall-clock date of each bar.
    if getattr(fin.index, "tz", None) is not None:
        fin.index = fin.index.tz_localize(None)
    fin = fin.sort_index()

    art = set_article_index_to_datetime(article_df, date_col=date_col)
    if art.empty:
        fin["headlines"] = pd.NA
        return fin

    art_daily = art.copy()
    art_daily["_date"] = art_daily.index.normalize()
    if headline_col not in art_daily.columns:
        art_daily[headline_col] = ""

    # One row per calendar day with every headline of that day in one string.
    agg = (
        art_daily.groupby("_date")[headline_col]
        .apply(lambda s: " || ".join(s.dropna().astype(str)))
        .reset_index()
        .rename(columns={"_date": "date", headline_col: "headlines"})
    )
    agg["date"] = pd.to_datetime(agg["date"])
    agg = agg.sort_values("date")

    fin_reset = fin.reset_index()
    left_date_col = fin_reset.columns[0]
    fin_reset = fin_reset.sort_values(left_date_col)

    # NOTE(review): "nearest" may pick an article date after the price date;
    # confirm that is acceptable if the result feeds a predictive model.
    merged = pd.merge_asof(
        fin_reset,
        agg,
        left_on=left_date_col,
        right_on="date",
        direction="nearest",
        tolerance=pd.Timedelta(days=max_days_diff),
    )

    merged = merged.set_index(left_date_col)
    merged.index = pd.to_datetime(merged.index)

    if "headlines" not in merged.columns:
        merged["headlines"] = pd.NA

    return merged