In [None]:
import pandas as pd
# Load the news table ("date" parsed as datetimes), standardize tickers
# (upper-case, '.' replaced by '-', for BRK-B) and order rows by ticker, then date.
df_news = (
    pd.read_csv("News1year.csv", parse_dates=["date"])
    .assign(ticker=lambda frame: frame["ticker"].str.upper().str.replace('.', '-', regex=False))
    .sort_values(["ticker", "date"])
)

# Lagged features: within each ticker, take the value from the previous row.
# The first row of every ticker has no predecessor, so its lag is NaN.
prev_row = df_news.groupby("ticker")[["average_sentiment", "comment_volume"]].shift(1)
df_news["news_sentiment_lag1"] = prev_row["average_sentiment"]
df_news["news_volume_lag1"] = prev_row["comment_volume"]
print(df_news.shape)

# Keep only the merge keys and the two lagged columns.
df_news_lag = df_news.loc[:, ["date", "ticker", "news_sentiment_lag1", "news_volume_lag1"]]


(2988, 6)


In [None]:
# Preview the lagged news frame: its shape, its first rows, and the NaN count per lag column.
print(df_news_lag.shape, df_news_lag.head(), sep="\n")
print(f"NaNs?:\n{df_news_lag[['news_sentiment_lag1', 'news_volume_lag1']].isna().sum()}")

# Load the second feature table, parsing "date" as datetimes so it matches the news keys.
df_yr = pd.read_csv('yahooredditcombined.csv', parse_dates=['date'])

# Left join: every row of df_yr is kept; rows without a matching (date, ticker)
# in the news frame get NaN in the news lag columns.
combined = pd.merge(df_yr, df_news_lag, how="left", on=["date", "ticker"])

(2988, 4)
         date ticker  news_sentiment_lag1  news_volume_lag1
0  2024-02-02   AAPL                  NaN               NaN
12 2024-02-05   AAPL             0.462060              30.0
21 2024-02-06   AAPL             0.492700              40.0
32 2024-02-07   AAPL             0.384226              66.0
42 2024-02-08   AAPL             0.169687              78.0
NaNs?:
news_sentiment_lag1    15
news_volume_lag1       15
dtype: int64


In [None]:
print(combined.shape)
print(combined.head(100))
print(f"NaNs?:\n{combined[['news_sentiment_lag1', 'news_volume_lag1']].isna().sum()}")

# Rows of the merged table that have no lagged news sentiment.
no_news_mask = combined["news_sentiment_lag1"].isna()
missing_news = combined.loc[no_news_mask]
print(missing_news.loc[:, ["date", "ticker"]].head(10))

# Count those rows per ticker, largest count first.
rows_per_ticker = missing_news.groupby("ticker").size()
missing_per_ticker = rows_per_ticker.sort_values(ascending=False)

print("Tickers with most missing news sentiment:")
print(missing_per_ticker.head(10))




(4892, 9)
         date ticker  RealizedVol_3d    Target  reddit_sentiment_lag1  \
0  2024-02-05   AAPL        0.016407  0.009968               0.252806   
1  2024-02-05   AMZN        0.051287  0.043985               0.318450   
2  2024-02-05  BRK-B        0.010539  0.005626               0.000000   
3  2024-02-05   COST        0.012018  0.006546               0.288800   
4  2024-02-05   GOOG        0.045971  0.002081               0.663400   
..        ...    ...             ...       ...                    ...   
95 2024-02-13   COST        0.004214  0.005826               0.489267   
96 2024-02-13   GOOG        0.015175  0.019438               0.000000   
97 2024-02-13  GOOGL        0.015625  0.020004               0.000000   
98 2024-02-13     HD        0.003959  0.014348               0.485400   
99 2024-02-13     KO        0.003552  0.004404              -0.030533   

    reddit_volume_lag1  reddit_sentiment_missing  news_sentiment_lag1  \
0                 33.0                  

In [None]:
# Flag rows that have no lagged news sentiment (1 = missing, 0 = present) BEFORE the
# NaNs are overwritten below, so the information is not lost by the fill.
news_is_missing = combined["news_sentiment_lag1"].isna()
if "news_missing" in combined.columns:
    # Re-running this cell: the NaNs were already replaced by 0.0 on the previous run,
    # so isna() is now all False. Keep the rows flagged earlier instead of silently
    # resetting the indicator to all zeros.
    news_is_missing = news_is_missing | combined["news_missing"].astype(bool)
combined["news_missing"] = news_is_missing.astype(int)

# === Fill missing values with 0.0 for models that can't handle NaNs (e.g. logistic regression) ===
combined["news_sentiment_lag1"] = combined["news_sentiment_lag1"].fillna(0.0)
combined["news_volume_lag1"] = combined["news_volume_lag1"].fillna(0.0)

In [None]:
# Per-column NaN tally; a grand total of 0 means the frame has no missing values.
nan_per_column = combined.isna().sum()
print("Any NaNs left?", nan_per_column.sum())
print(nan_per_column)
print(combined.head())


Any NaNs left? 0
date                        0
ticker                      0
RealizedVol_3d              0
Target                      0
reddit_sentiment_lag1       0
reddit_volume_lag1          0
reddit_sentiment_missing    0
news_sentiment_lag1         0
news_volume_lag1            0
news_missing                0
dtype: int64
        date ticker  RealizedVol_3d    Target  reddit_sentiment_lag1  \
0 2024-02-05   AAPL        0.016407  0.009968               0.252806   
1 2024-02-05   AMZN        0.051287  0.043985               0.318450   
2 2024-02-05  BRK-B        0.010539  0.005626               0.000000   
3 2024-02-05   COST        0.012018  0.006546               0.288800   
4 2024-02-05   GOOG        0.045971  0.002081               0.663400   

   reddit_volume_lag1  reddit_sentiment_missing  news_sentiment_lag1  \
0                33.0                         0             0.462060   
1                12.0                         0             0.360583   
2                 0.0

In [None]:
# Write the merged feature table to disk; index=False keeps the row index out of the file.
combined.to_csv(
    path_or_buf='FINAL_1YEAR_DATA.csv',
    index=False,
)