In [3]:
import os
import sys
import pandas as pd

# The parent of the current working directory is treated as the project
# root. It has to be on sys.path so that the project's `scripts` package
# can be imported from this notebook.
ROOT_DIR = os.path.abspath("..")
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)

# Input and output locations, all derived from the project root.
DATA_DIR = os.path.join(ROOT_DIR, "data")
OUTPUT_DIR = os.path.join(DATA_DIR, "processed")
NEWS_CSV = os.path.join(DATA_DIR, "cleaned_news.csv")  # adjust if the file is named differently

# Symbols whose price data is loaded and aligned with the news.
TICKERS = [
    "AAPL",
    "AMZN",
    "GOOG",
    "META",
    "MSFT",
    "NVDA",
    "TSLA",
]

from scripts.news_stock_alignment import (
    load_price_data,
    load_cleaned_news,
    filter_to_overlap,
    align_news_with_prices,
    save_aligned_data,
)


In [4]:
# Load the price data for every symbol in TICKERS through the project helper.
prices = load_price_data(TICKERS, DATA_DIR, date_col="Date")

# Preview the first rows.
print("Price data sample:")
display(prices.head())

# Report the span of dates covered by the price data.
price_dates = prices["Date"]
print("\nPrice date range:", price_dates.min(), "→", price_dates.max())



Price data sample:


Unnamed: 0,Date,Ticker,Close,High,Low,Open,Volume
0,2009-01-02,AAPL,2.721686,2.730385,2.554037,2.57563,746015200
1,2009-01-05,AAPL,2.836553,2.884539,2.780469,2.794266,1181608400
2,2009-01-06,AAPL,2.789767,2.914229,2.770872,2.877641,1289310400
3,2009-01-07,AAPL,2.729484,2.77417,2.70699,2.753477,753048800
4,2009-01-08,AAPL,2.780169,2.793666,2.700393,2.71209,673500800



Price date range: 2009-01-02 00:00:00 → 2023-12-29 00:00:00


In [5]:
# The news data identifies companies through a 'stock' column, so the price
# frame must use the same column name. The check makes the cell safe to
# re-run: once 'stock' exists nothing is renamed again.
if "stock" in prices.columns:
    print("'stock' column already present in prices.")
elif "Ticker" in prices.columns:
    prices = prices.rename(columns={"Ticker": "stock"})
    print("Renamed 'Ticker' -> 'stock' in prices.")
else:
    # Neither spelling is available, so the alignment cannot proceed.
    raise KeyError("Neither 'Ticker' nor 'stock' columns found in prices.")

# Show the result so the column names can be verified by eye.
display(prices.head())
print("columns:", prices.columns.tolist())

Renamed 'Ticker' -> 'stock' in prices.


Unnamed: 0,Date,stock,Close,High,Low,Open,Volume
0,2009-01-02,AAPL,2.721686,2.730385,2.554037,2.57563,746015200
1,2009-01-05,AAPL,2.836553,2.884539,2.780469,2.794266,1181608400
2,2009-01-06,AAPL,2.789767,2.914229,2.770872,2.877641,1289310400
3,2009-01-07,AAPL,2.729484,2.77417,2.70699,2.753477,753048800
4,2009-01-08,AAPL,2.780169,2.793666,2.700393,2.71209,673500800


columns: ['Date', 'stock', 'Close', 'High', 'Low', 'Open', 'Volume']


In [6]:
# Load the cleaned news file. Adjust date_col / ticker_col if the CSV uses
# different column names.
news = load_cleaned_news(NEWS_CSV, date_col="Date", ticker_col="stock")

# Preview the first rows.
print("News data sample:")
display(news.head())

# Report the span of dates covered by the news data.
news_dates = news["Date"]
print("\nNews date range:", news_dates.min(), "→", news_dates.max())


News data sample:


Unnamed: 0.1,Date,stock,Unnamed: 0,headline,url,publisher
0,2020-06-05,A,0,Stocks That Hit 52-Week Highs On Friday,https://www.benzinga.com/news/20/06/16190091/s...,Benzinga Insights
1,2020-06-03,A,1,Stocks That Hit 52-Week Highs On Wednesday,https://www.benzinga.com/news/20/06/16170189/s...,Benzinga Insights
2,2020-05-26,A,2,71 Biggest Movers From Friday,https://www.benzinga.com/news/20/05/16103463/7...,Lisa Levin
3,2020-05-22,A,3,46 Stocks Moving In Friday's Mid-Day Session,https://www.benzinga.com/news/20/05/16095921/4...,Lisa Levin
4,2020-05-22,A,4,B of A Securities Maintains Neutral on Agilent...,https://www.benzinga.com/news/20/05/16095304/b...,Vick Meyer



News date range: 2009-02-14 00:00:00 → 2020-06-11 00:00:00


In [7]:
# Restrict both frames to their shared date window. The helper returns the
# filtered frames together with the window bounds.
prices_f, news_f, overlap_window = filter_to_overlap(prices, news, date_col="Date")
start, end = overlap_window

print("Overlapping date window:", start, "→", end)
for label, frame in (("price", prices_f), ("news", news_f)):
    print(f"Filtered {label} shape:", frame.shape)


Overlapping date window: 2009-02-14 00:00:00 → 2020-06-11 00:00:00
Filtered price shape: (18785, 7)
Filtered news shape: (1407328, 6)


In [8]:
# Quick check of the columns available in the filtered news frame.
news_columns = news_f.columns.tolist()
print(news_columns)


['Date', 'stock', 'Unnamed: 0', 'headline', 'url', 'publisher']


In [9]:
# Options for joining the news rows with the price rows.
alignment_options = {
    "date_col": "Date",
    "ticker_col": "stock",
    "price_close_col": "Close",  # closing-price column from the price CSV
    "how": "inner",              # keep only rows that have both price and news
}
aligned = align_news_with_prices(prices_f, news_f, **alignment_options)

# Preview the joined result and report its size.
print("Aligned data sample:")
display(aligned.head())

print("\nAligned shape:", aligned.shape)


Aligned data sample:


Unnamed: 0.1,Date,stock,Unnamed: 0,headline,url,publisher,ClosePrice
0,2020-06-10,AAPL,7120,Tech Stocks And FAANGS Strong Again To Start D...,https://www.benzinga.com/government/20/06/1622...,JJ Kinahan,85.566071
1,2020-06-10,AAPL,7121,10 Biggest Price Target Changes For Wednesday,https://www.benzinga.com/analyst-ratings/price...,Lisa Levin,85.566071
2,2020-06-10,AAPL,7122,"Benzinga Pro's Top 5 Stocks To Watch For Wed.,...",https://www.benzinga.com/short-sellers/20/06/1...,Benzinga Newsdesk,85.566071
3,2020-06-10,AAPL,7123,"Deutsche Bank Maintains Buy on Apple, Raises P...",https://www.benzinga.com/news/20/06/16219873/d...,Benzinga Newsdesk,85.566071
4,2020-06-10,AAPL,7124,Apple To Let Users Trade In Their Mac Computer...,https://www.benzinga.com/news/20/06/16218697/a...,Neer Varshney,85.566071



Aligned shape: (6703, 7)


In [10]:
# Write the aligned news+price frame to a CSV file inside OUTPUT_DIR.
output_path = os.path.join(OUTPUT_DIR, "news_price_aligned.csv")

# Make sure the destination folder exists before writing. Whether
# save_aligned_data creates it is not visible from this notebook, and on a
# fresh checkout the folder may be missing. exist_ok=True makes this a
# no-op when the folder is already there, so the cell stays re-runnable.
os.makedirs(OUTPUT_DIR, exist_ok=True)

save_aligned_data(aligned, output_path, index=False)

print(f"Aligned news+price data saved to:\n{output_path}")


Aligned news+price data saved to:
c:\Users\filimon.hailemariam\Downloads\Price Movement -Week1\Predicting-Price-Moves-with-News-Sentiment\data\processed\news_price_aligned.csv
