In [None]:
import pandas as pd
import numpy as np
from zoneinfo import ZoneInfo

In [None]:
# Intraday cutoff (hour and minute) passed to assign_trading_date below.
CUTOFF_HOUR = 15
CUTOFF_MINUTE = 30

# Timezone in which the cutoff is evaluated.
ET = ZoneInfo("America/New_York")

In [None]:
# Load the news table from parquet and preview the first rows.
news = pd.read_parquet("data/all_the_news.pqt")
news.head()

In [None]:
def assign_trading_date(
    published: pd.Series,
    cutoff_hour: int = 15,
    cutoff_minute: int = 30,
    tz: "str | ZoneInfo" = "America/New_York",
) -> pd.Series:
    """Map publication timestamps to the trading date they belong to.

    Rules (all evaluated on wall-clock time in ``tz``):

    * published before the cutoff      -> same calendar day
    * published at/after the cutoff    -> next calendar day
    * result lands on Saturday/Sunday  -> rolled forward to Monday

    Parameters
    ----------
    published : pd.Series
        Publication timestamps (datetime64 or anything ``pd.to_datetime``
        accepts). Timezone-naive values are interpreted as already being
        wall-clock time in ``tz``; timezone-aware values are converted to ``tz``.
    cutoff_hour, cutoff_minute : int
        Cutoff time of day. Only hour and minute are compared, so 15:30:00 counts
        as "at/after" and 15:29:59 as "before" for the default cutoff.
    tz : str or tzinfo, default "America/New_York"
        Timezone in which the cutoff is evaluated.

    Returns
    -------
    pd.Series
        ``datetime.date`` objects (object dtype) on the same index as
        ``published``; missing timestamps stay missing (NaT).

    Notes
    -----
    Only weekends are skipped. Market holidays are NOT handled, so a result can
    still fall on a day the exchange is closed.
    """
    dt = pd.to_datetime(published)

    # Do all arithmetic on *naive* wall-clock timestamps. Adding a Timedelta to
    # tz-aware values adds absolute 24h steps, which on a DST fall-back day
    # (25 hours long) lands at 23:00 of the same date and leaves weekend
    # articles stuck on Sunday. Skipping tz_localize for naive input also avoids
    # errors on nonexistent/ambiguous local times around DST changes.
    if dt.dt.tz is not None:
        dt = dt.dt.tz_convert(tz).dt.tz_localize(None)

    # Minutes since local midnight, compared against the cutoff.
    minutes_of_day = dt.dt.hour * 60 + dt.dt.minute
    cutoff_minutes = cutoff_hour * 60 + cutoff_minute
    after_cutoff = minutes_of_day >= cutoff_minutes  # NaT rows compare False

    # Calendar date at midnight, pushed one day forward when at/after the cutoff.
    trading_date = dt.dt.normalize() + pd.to_timedelta(after_cutoff.astype(int), unit="D")

    # Roll weekends forward: Saturday (5) -> +2 days, Sunday (6) -> +1 day.
    weekday = trading_date.dt.weekday
    weekend_shift = np.where(weekday == 5, 2, np.where(weekday == 6, 1, 0))
    trading_date = trading_date + pd.to_timedelta(weekend_shift, unit="D")

    return trading_date.dt.date

In [None]:
# Derive the trading date for every article, then eyeball it next to the raw timestamp.
trading_dates = assign_trading_date(
    news["publishedDate"],
    cutoff_hour=CUTOFF_HOUR,
    cutoff_minute=CUTOFF_MINUTE,
)
news["trading_date"] = trading_dates
news[["publishedDate", "trading_date"]].head(20)

In [None]:
# Sanity check: article counts for the 20 most recent trading dates.
articles_per_date = news["trading_date"].value_counts()
articles_per_date.sort_index().tail(20)

In [None]:
# Both counts should be zero: trading dates are rolled off Saturday (5) and Sunday (6).
td = pd.to_datetime(news["trading_date"])
weekday_codes = td.dt.weekday
saturday_hits = (weekday_codes == 5).sum()
sunday_hits = (weekday_codes == 6).sum()
print(f"Saturday count: {saturday_hits}")
print(f"Sunday count: {sunday_hits}")

In [None]:
# Write the frame (now including trading_date) back to the same parquet file it
# was loaded from, without the index. NOTE: this overwrites the input in place.
news.to_parquet("data/all_the_news.pqt", index=False)