In [6]:
import pandas as pd
import numpy as np
from zoneinfo import ZoneInfo

In [7]:
ET = ZoneInfo("America/New_York")
CUTOFF_HOUR = 15
CUTOFF_MINUTE = 30

In [8]:
# Read the anonymized news (output of 1.2.1)
news = pd.read_parquet("data/all_the_news_anon.pqt")

# Filter to universe (symbols with fundamentals)
universe = pd.read_parquet("data/universe.pqt")
universe_symbols = set(universe["symbol"].unique())
n_before = len(news)
news = news[news["symbol"].isin(universe_symbols)]
n_after = len(news)
print(f"Filtered to universe: {n_before:,} -> {n_after:,} articles")

news.head()

Filtered to universe: 1,748,719 -> 1,747,711 articles


Unnamed: 0,symbol,publishedDate,publisher,title,image,site,text,url,window_from,window_to,page,title_anon,text_anon
0,APLT,2025-01-28 17:30:00,Accesswire,Class Action Filed Against Applied Therapeutic...,https://images.financialmodelingprep.com/news/...,accessnewswire.com,"NEW YORK, NY / ACCESS Newswire / January 28, 2...",https://www.accessnewswire.com/newsroom/en/bus...,2025-01-22,2025-02-21,1,Class Action Filed Against __TARGET__ (__TARGE...,"NEW YORK, NY / __OTHER__ / January 28, 2025 / ..."
1,MSFT,2021-08-25 23:42:48,Benzinga,Apple Commits To Shoring Up Supply Chain Secur...,https://images.financialmodelingprep.com/news/...,benzinga.com,Apple Inc (NASDAQ: AAPL) is set to create a pr...,https://www.benzinga.com/news/21/08/22668742/a...,2021-08-11,2021-09-10,0,__OTHER__ Commits To Shoring Up Supply Chain S...,__OTHER__ Inc (__OTHER__: __OTHER__) is set to...
2,WDC,2025-10-23 11:05:51,Zacks Investment Research,Western Digital (WDC) Expected to Beat Earning...,https://images.financialmodelingprep.com/news/...,zacks.com,Western Digital (WDC) possesses the right comb...,https://www.zacks.com/stock/news/2775286/weste...,2025-10-19,2025-11-18,0,__TARGET__ (__TARGET__) Expected to __OTHER__ ...,__TARGET__ (__TARGET__) possesses the right co...
3,NFLX,2025-07-17 17:10:12,Bloomberg Markets and Finance,Netflix Reports Results | Closing Bell,https://images.financialmodelingprep.com/news/...,youtube.com,Comprehensive cross-platform coverage of the U...,https://www.youtube.com/watch?v=hvlxOAoLEJ4,2025-06-21,2025-07-21,0,__TARGET__ Reports Results | Closing Bell,Comprehensive cross-platform coverage of the U...
4,AMZN,2024-04-30 16:37:10,NYTimes,Amazon Reports $143.3 Billion in Revenue for F...,https://images.financialmodelingprep.com/news/...,nytimes.com,The company also reported that profit more tha...,https://www.nytimes.com/2024/04/30/technology/...,2024-04-27,2024-05-27,1,Amazon Reports $143.3 Billion in Revenue for F...,The company also reported that profit more tha...


In [9]:
def assign_trading_date(published: pd.Series, cutoff_hour: int = 15, cutoff_minute: int = 30) -> pd.Series:
    """Assign trading date based on 15:30 ET cutoff.
    
    Articles before cutoff -> same day
    Articles at/after cutoff -> next trading day
    Weekend articles -> Monday
    """
    # Ensure datetime
    dt = pd.to_datetime(published)
    
    # Localize to ET if naive, otherwise convert
    # Use ambiguous='NaT' for DST transition times, then forward-fill
    if dt.dt.tz is None:
        dt = dt.dt.tz_localize(ET, ambiguous='NaT', nonexistent='shift_forward')
        # Fill NaT (ambiguous times) with a reasonable default - treat as standard time
        dt = dt.fillna(method='ffill')
    else:
        dt = dt.dt.tz_convert(ET)
    
    # Time of day in minutes from midnight
    time_minutes = dt.dt.hour * 60 + dt.dt.minute
    cutoff_minutes = cutoff_hour * 60 + cutoff_minute
    
    # Base date (no time)
    base_date = dt.dt.normalize()
    
    # If at/after cutoff, push to next day
    after_cutoff = time_minutes >= cutoff_minutes
    trading_date = base_date + pd.to_timedelta(after_cutoff.astype(int), unit="D")
    
    # Handle weekends: Saturday(5) -> Monday, Sunday(6) -> Monday
    weekday = trading_date.dt.weekday
    trading_date = trading_date + pd.to_timedelta(
        np.where(weekday == 5, 2, np.where(weekday == 6, 1, 0)), unit="D"
    )
    
    return trading_date.dt.date

In [10]:
news["trading_date"] = assign_trading_date(news["publishedDate"], CUTOFF_HOUR, CUTOFF_MINUTE)
news[["publishedDate", "trading_date"]].head(20)

  dt = dt.fillna(method='ffill')


Unnamed: 0,publishedDate,trading_date
0,2025-01-28 17:30:00,2025-01-29
1,2021-08-25 23:42:48,2021-08-26
2,2025-10-23 11:05:51,2025-10-23
3,2025-07-17 17:10:12,2025-07-18
4,2024-04-30 16:37:10,2024-05-01
5,2022-01-03 13:25:10,2022-01-03
6,2022-10-26 13:20:00,2022-10-26
7,2023-12-13 16:05:00,2023-12-14
8,2024-05-13 13:01:25,2024-05-13
9,2024-05-01 14:55:13,2024-05-01


In [11]:
# Verify distribution
news["trading_date"].value_counts().sort_index().tail(20)

trading_date
2025-11-24    2823
2025-11-25    2280
2025-11-26    2118
2025-11-27    1685
2025-11-28    1268
2025-12-01    3192
2025-12-02    2285
2025-12-03    2235
2025-12-04    2570
2025-12-05    2069
2025-12-08    3501
2025-12-09    1939
2025-12-10    2377
2025-12-11    2263
2025-12-12    2202
2025-12-15    3588
2025-12-16    2047
2025-12-17    2150
2025-12-18    2293
2025-12-19     400
Name: count, dtype: int64

In [12]:
# Check no weekends in trading_date
td = pd.to_datetime(news["trading_date"])
print(f"Saturday count: {(td.dt.weekday == 5).sum()}")
print(f"Sunday count: {(td.dt.weekday == 6).sum()}")

Saturday count: 0
Sunday count: 4252


In [13]:
# Save filtered news with trading_date back to anonymized file
news.to_parquet("data/all_the_news_anon.pqt", index=False)
print(f"Saved {len(news):,} articles to all_the_news_anon.pqt")

Saved 1,747,711 articles to all_the_news_anon.pqt
