In [None]:
import pandas as pd
# Load the news export; the "date" column is parsed to datetime on read.
df_news = pd.read_csv("News1year.csv", parse_dates=["date"])

# Normalise ticker symbols: upper-case them, then turn literal dots into
# dashes (e.g. "BRK.B" -> "BRK-B"). regex=False keeps '.' from being treated
# as a regex wildcard.
ticker_upper = df_news["ticker"].str.upper()
df_news["ticker"] = ticker_upper.str.replace('.', '-', regex=False)

# Lagged features: within each ticker, in date order, take the value from the
# previous row. shift(1) is row-based, so the lag is the previous available
# news row for that ticker (not necessarily the previous calendar day), and
# the first row of every ticker is NaN.
df_news = df_news.sort_values(["ticker", "date"])
prev_rows = df_news.groupby("ticker")[["average_sentiment", "comment_volume"]].shift(1)
df_news["news_sentiment_lag1"] = prev_rows["average_sentiment"]
df_news["news_volume_lag1"] = prev_rows["comment_volume"]
print(df_news.shape)

# Keep only the join keys and the two lagged features.
lag_columns = ["date", "ticker", "news_sentiment_lag1", "news_volume_lag1"]
df_news_lag = df_news[lag_columns]


In [None]:
# Quick look at the lagged news features before joining: shape, first rows,
# and how many lag values are NaN.
print(df_news_lag.shape)
print(df_news_lag.head())
print(f"NaNs?:\n{df_news_lag[['news_sentiment_lag1', 'news_volume_lag1']].isna().sum()}")

# Load the Yahoo/Reddit dataset; "date" is parsed to datetime so it is
# comparable with the datetime "date" key in df_news_lag.
df_yr = pd.read_csv('yahooredditcombined.csv', parse_dates = ['date'])

# Left join: every df_yr row is kept, and rows with no matching
# (date, ticker) in df_news_lag get NaN in the two news columns.
# validate="many_to_one" makes pandas raise a MergeError if df_news_lag holds
# more than one row for a (date, ticker) pair, instead of silently duplicating
# df_yr rows in the merged frame.
combined = df_yr.merge(
    df_news_lag,
    on=["date", "ticker"],
    how="left",
    validate="many_to_one",
)

In [None]:
# Overview of the merged frame and the NaN counts of the two news lag columns.
print(combined.shape)
print(combined.head(100))
lag_nan_counts = combined[['news_sentiment_lag1', 'news_volume_lag1']].isna().sum()
print(f"NaNs?:\n{lag_nan_counts}")

# Rows where the lagged news sentiment is NaN after the merge.
sentiment_is_missing = combined["news_sentiment_lag1"].isna()
missing_news = combined[sentiment_is_missing]
print(missing_news[["date", "ticker"]].head(10))

# Number of such rows per ticker, largest first.
rows_missing_by_ticker = missing_news.groupby("ticker").size()
missing_per_ticker = rows_missing_by_ticker.sort_values(ascending=False)

print("Tickers with most missing news sentiment:")
print(missing_per_ticker.head(10))




In [None]:
# Flag rows whose lagged news sentiment is NaN (1 = missing, 0 = present).
# The flag must be derived BEFORE the fill below. The guard keeps this cell
# safe to re-run: once the NaNs have been replaced, recomputing the flag would
# overwrite it with all zeros.
if "news_missing" not in combined.columns:
    combined["news_missing"] = combined["news_sentiment_lag1"].isna().astype(int)

# === Fill missing values with 0.0 for models that can't handle NaNs (e.g. logistic regression) ===
news_lag_cols = ["news_sentiment_lag1", "news_volume_lag1"]
combined[news_lag_cols] = combined[news_lag_cols].fillna(0.0)

In [None]:
# Final NaN check: per-column NaN counts, their grand total, and a preview of
# the frame.
nan_counts = combined.isna().sum()
print("Any NaNs left?", nan_counts.sum())
print(nan_counts)
print(combined.head())


In [None]:
# Write the merged dataset to CSV; index=False leaves the row index out of
# the file.
output_path = 'FINAL_1YEAR_DATA.csv'
combined.to_csv(output_path, index=False)