In [2]:
import os
import requests, time, pytz
from datetime import datetime, timedelta
import pandas as pd
from tqdm import tqdm

### Retrieving news for individual companies

In [None]:
# Tickers to download and the consecutive one-year windows to request.
# Each window is a (start, end) pair of MMDDYYYY strings.
tickers = ['NVDA', 'GOOG', 'AMZN', 'AAPL', 'MSFT', 'SPOT', 'TSLA','JPM', 'GS', 'LMT']
periods = [("10162021", "10152022"),("10162022", "10152023"),("10162023", "10152024"),("10162024", "10162025")]

# The API token must not live in the notebook: read it from the environment.
# (A token that was ever committed should be rotated.)
api = os.environ.get("STOCKNEWS_API_KEY", "")
if not api:
    print("WARNING: STOCKNEWS_API_KEY is not set; news requests will be sent without a token.")
base_url = "https://stocknewsapi.com/api/v1"

# Create the directory the download loop actually writes its CSV files to.
os.makedirs("../data/news_data", exist_ok=True)

def fetch_news(ticker, start_date, end_date, api_key, timeout=30):
    """Fetch historical news for one ticker between start_date and end_date.

    Pages through the endpoint at the module-level ``base_url`` (100 items per
    page) until a page comes back with no ``"data"`` entries.

    Parameters
    ----------
    ticker : str
        Symbol sent as the ``tickers`` query parameter.
    start_date, end_date : str
        Window bounds; sent joined as ``"{start_date}-{end_date}"`` in the
        ``date`` query parameter.
    api_key : str
        Token sent as the ``token`` query parameter.
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per item returned by the API; empty when the first page
        contains no data.

    Raises
    ------
    requests.HTTPError
        If a page request comes back with a 4xx/5xx status.
    requests.Timeout
        If a page request exceeds ``timeout``.
    """
    query = {
        "tickers": ticker,
        "items": 100,
        "date": f"{start_date}-{end_date}",
        "token": api_key,
    }
    news = []
    page = 1

    while True:
        # Build the per-page parameters without mutating the shared query.
        resp = requests.get(base_url, params={**query, "page": page}, timeout=timeout)
        # Fail loudly on HTTP errors instead of treating them as "no more data".
        resp.raise_for_status()
        data = resp.json().get("data", [])
        if not data:
            break
        news.extend(data)
        page += 1

    return pd.DataFrame(news)


# Download every (ticker, period) window and store each one as its own CSV.
# Make sure the target directory exists first; to_csv does not create it.
os.makedirs("../data/news_data", exist_ok=True)

for ticker in tqdm(tickers, desc="Progress"):
    for start_date, end_date in periods:
        df = fetch_news(ticker, start_date, end_date, api)
        csv_path = f"../data/news_data/{ticker}_news_{start_date}_{end_date}.csv"
        df.to_csv(csv_path, index=False)

Progress:   0%|          | 0/10 [00:00<?, ?it/s]

Progress: 100%|██████████| 10/10 [06:25<00:00, 38.51s/it]


In [3]:
# Redefined here so this cell can run without executing the download cell.
tickers = ['NVDA', 'GOOG', 'AMZN', 'AAPL', 'MSFT', 'SPOT', 'TSLA','JPM', 'GS', 'LMT']
periods = [("10162021", "10152022"),("10162022", "10152023"),("10162023", "10152024"),("10162024", "10162025")]
dfs = {}  # ticker -> all of its news rows across every period

for ticker in tickers:
    df_list = []
    for start_date, end_date in periods:
        path = f"../data/news_data/{ticker}_news_{start_date}_{end_date}.csv"
        try:
            df = pd.read_csv(path)
        except pd.errors.EmptyDataError:
            # An empty DataFrame saved with to_csv has no header row, which
            # read_csv cannot parse; treat that window as having no articles.
            continue
        if not df.empty:
            # Parse the publication time as UTC, then express it in New York
            # time so the later time-of-day cutoff uses exchange-local hours.
            df["Date"] = pd.to_datetime(df["date"], utc = True)
            df["Date"] = df["Date"].dt.tz_convert("America/New_York")
            df_list.append(df)

    if not df_list:
        # pd.concat would raise an opaque "No objects to concatenate" here.
        raise ValueError(f"No news rows found for {ticker} in any period")
    df_merged = pd.concat(df_list, ignore_index=True)
    dfs[ticker] = df_merged

# An example news DataFrame
dfs['NVDA'].head(5)

Unnamed: 0,news_url,image_url,title,text,source_name,date,topics,sentiment,type,tickers,Date
0,https://www.fool.com/investing/2022/10/15/a-qu...,https://cdn.snapi.dev/images/v1/5/h/b/semi1-15...,A Quick Update on Nvidia's Gaming and Automoti...,Nvidia stock was down roughly 5% on Friday.,The Motley Fool,"Sat, 15 Oct 2022 10:09:00 -0400",['podcast'],Neutral,Article,['NVDA'],2022-10-15 10:09:00-04:00
1,https://www.fool.com/investing/2022/10/15/an-i...,https://cdn.snapi.dev/images/v1/p/u/x/computer...,An Investor's Look at Semiconductors,"""They are in everything from the remote contro...",The Motley Fool,"Sat, 15 Oct 2022 07:20:00 -0400",['podcast'],Neutral,Article,"['AAPL', 'AMAT', 'AMZN', 'ASML', 'CDNS', 'GFS'...",2022-10-15 07:20:00-04:00
2,https://investorplace.com/2022/10/take-advanta...,https://cdn.snapi.dev/images/v1/i/m/9/semi16-1...,Take Advantage of AMD Warning and Bargain Buy ...,"On Friday, Oct. 7, Advanced Micro Devices (NAS...",InvestorPlace,"Fri, 14 Oct 2022 11:31:45 -0400",[],Positive,Article,['NVDA'],2022-10-14 11:31:45-04:00
3,https://seekingalpha.com/article/4546523-china...,https://cdn.snapi.dev/images/v1/q/7/i/semi5-15...,China Chip Export Restriction Analysis - AMD V...,Semiconductor stocks are now trading at levels...,Seeking Alpha,"Fri, 14 Oct 2022 02:12:22 -0400",['paylimitwall'],Negative,Article,"['AMD', 'NVDA']",2022-10-14 02:12:22-04:00
4,https://247wallst.com/investing/2022/10/13/the...,https://cdn.snapi.dev/images/v1/s/h/l/catalog-...,These Were The Five Best And Worst Performing ...,"The third quarter started with a relief rally,...",24/7 Wall Street,"Thu, 13 Oct 2022 19:50:57 -0400",[],Neutral,Article,"['AMZN', 'KO', 'LLY', 'META', 'NVDA', 'PFE', '...",2022-10-13 19:50:57-04:00


Since we aim to predict whether the closing price on a given day is higher than that of the previous day, we will use market news published from the after-hours of the previous day up to 3 p.m. on the current day. For example, for 2025-10-01, the relevant news items are those released between 4 p.m. on 2025-09-30 and 3 p.m. on 2025-10-01. We label this window with the date 2025-09-30, i.e. the day before the session (2025-10-01) whose price movement we are trying to predict. We believe that the sentiment expressed in news during this period has the greatest influence on the closing price.

In [4]:
def trading_date(ts):
    """Return the date label under which a news timestamp is aggregated.

    An article published at or after 16:00 is labelled with its own calendar
    date; an article published earlier in the day is labelled with the
    previous calendar date. The label is therefore the day *before* the
    session the article is used to predict.

    NOTE(review): the only cutoff applied is 16:00, so articles published
    between 15:00 and 16:00 are labelled like any other daytime article.
    Weekends and market holidays are not treated specially.

    Parameters
    ----------
    ts : pandas.Timestamp or anything accepted by ``pandas.to_datetime``
        Publication time; the time-of-day test uses its wall-clock hour.

    Returns
    -------
    datetime.date
        The label date.
    """
    if not isinstance(ts, pd.Timestamp):
        ts = pd.to_datetime(ts)

    published_on = ts.date()
    # Work on calendar dates rather than adding 24h to the timestamp: on a
    # tz-aware Timestamp, "+ timedelta(days=1)" is absolute time, so a late
    # evening article before a spring-forward DST change would land two
    # calendar days later and receive the wrong label.
    if ts.hour >= 16:
        return published_on
    return published_on - timedelta(days=1)

# Replace each article's timestamp with its aggregation date and keep only the
# columns needed downstream, ordered by that date.
for ticker in tickers:
    df = dfs[ticker]
    # Relabel only while "Date" still holds timestamps. After the first run it
    # holds plain dates, and pushing those through trading_date again would
    # shift every label back by one more day each time the cell is re-run.
    if pd.api.types.is_datetime64_any_dtype(df["Date"]):
        # assign() builds a new frame instead of overwriting the loaded one.
        df = df.assign(Date=df["Date"].apply(trading_date))
    cols = ['Date','tickers','title','text','source_name','sentiment']
    new_df = df[cols].sort_values("Date").reset_index(drop=True)
    dfs[ticker] = new_df

dfs['NVDA']

Unnamed: 0,Date,tickers,title,text,source_name,sentiment
0,2021-10-17,"['BABA', 'CRM', 'NVDA', 'PYPL', 'TSLA', 'V']",8 Hot Stocks With the Potential to Join the El...,"At one point, it didn't seem like there would ...",InvestorPlace,Positive
1,2021-10-17,"['NVDA', 'TDOC']",Better Growth Stock: Nvidia or Teladoc Health?,Both have exceptional growth opportunities.,The Motley Fool,Positive
2,2021-10-17,['NVDA'],This Is What Whales Are Betting On NVIDIA,A whale with a lot of money to spend (and poss...,Benzinga,Positive
3,2021-10-18,"['NVDA', 'RKT']","Top Stock Picks for Week of October 18, 2021",A Stock Exhibiting Above Average Growth in Fin...,Zacks Investment Research,Positive
4,2021-10-18,['NVDA'],Why NVIDIA Can Thrive Through a Downturn,Slower economic growth shouldn't stall this bu...,The Motley Fool,Negative
...,...,...,...,...,...,...
21828,2025-10-15,['NVDA'],Why Nvidia stock is climbing fast after TSMC's...,Nvidia stock (NASDAQ: NVDA) saw a notable rise...,Invezz,Positive
21829,2025-10-15,['NVDA'],Nvidia (NVDA) is a Top-Ranked Growth Stock: Sh...,"Whether you're a value, growth, or momentum in...",Zacks Investment Research,Positive
21830,2025-10-15,"['AMZN', 'GOOG', 'HSAI', 'MBLY', 'NVDA', 'QS',...",7 Driverless Vehicle Stocks That Could Set You...,The driverless vehicle market has the potentia...,The Motley Fool,Positive
21831,2025-10-15,"['MU', 'NFLX', 'NVDA', 'TMUS']","Wall Street Analysts are Bullish on NVDA, MU, ...",The image featured for this article is © onein...,24/7 Wall Street,Positive


### Compute the sentiment scores

In [5]:
sentiment_scores = {}  # ticker -> per-date sentiment summary
sentiment_map = {"Positive": 1, "Neutral": 0, "Negative": -1}

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("../data/sentiment_scores", exist_ok=True)

for ticker in tickers:
    df = dfs[ticker]
    # Labels missing from sentiment_map become NaN; the mean skips them and
    # "count" does not count them.
    df["sentiment_score"] = df["sentiment"].map(sentiment_map)
    daily_sentiment = (
        df.groupby("Date")
        .agg(
            sentiment_mean=("sentiment_score", "mean"),
            article_count=("sentiment_score", "count"),
        )
        .reset_index()
    )
    sentiment_scores[ticker] = daily_sentiment
    csv_path = f"../data/sentiment_scores/{ticker}_sentiment.csv"
    daily_sentiment.to_csv(csv_path, index=False)

In [6]:
# Sanity check: read one ticker's saved sentiment file back from disk.
ticker = 'AAPL'
sentiment_csv = f"../data/sentiment_scores/{ticker}_sentiment.csv"
pd.read_csv(sentiment_csv)

Unnamed: 0,Date,sentiment_mean,article_count
0,2021-10-15,1.000000,3
1,2021-10-16,0.000000,1
2,2021-10-17,0.625000,32
3,2021-10-18,0.470588,17
4,2021-10-19,0.888889,9
...,...,...,...
1440,2025-10-12,1.000000,8
1441,2025-10-13,0.000000,5
1442,2025-10-14,0.500000,26
1443,2025-10-15,0.421053,19
