In [None]:
import pandas as pd


In [None]:
# 1) Read the stock and Reddit CSV files, parsing "date" on load.
df_stock = pd.read_csv("yfinance_final.csv", parse_dates=["date"])
df_reddit = pd.read_csv("clean_reddit_data.csv", parse_dates=["date"])

# Apply the same clean-up to both frames so they share join keys.
for frame in (df_stock, df_reddit):
    # Explicit conversion guarantees a datetime dtype for the date column.
    frame["date"] = pd.to_datetime(frame["date"])
    # Upper-case the symbols and swap "." for "-" (yfinance-style tickers).
    frame["ticker"] = frame["ticker"].str.upper().str.replace(".", "-", regex=False)

# 2) Keep only rows inside the study window (both bounds inclusive).
start, end = "2024-02-02", "2025-04-08"
mask_stock = df_stock["date"].between(start, end)
mask_reddit = df_reddit["date"].between(start, end)

df_stock = df_stock[mask_stock].reset_index(drop=True)
df_reddit = df_reddit[mask_reddit].reset_index(drop=True)

# Report the size of each filtered frame: stock first, then Reddit.
print("shapes: ")
for frame in (df_stock, df_reddit):
    print(frame.shape)


In [None]:
# 3) Build the full ticker x date grid.
# Dates are the distinct stock dates (sorted); tickers are those seen on Reddit.
all_dates = df_stock["date"].drop_duplicates().sort_values().to_frame()
tickers = df_reddit["ticker"].unique()
grid_index = pd.MultiIndex.from_product(
    [tickers, all_dates["date"]], names=["ticker", "date"]
)
full_index = grid_index.to_frame(index=False)

# 4) Left-join the Reddit rows onto the grid and order by ticker, then date.
# Grid cells without a matching Reddit row keep NaN; nothing is filled here.
df_reddit_full = (
    full_index
    .merge(df_reddit, how="left", on=["ticker", "date"])
    .sort_values(["ticker", "date"])
)


In [None]:
# Forward-filling sentiment and volume is currently switched off (kept below,
# commented out), so grid days without Reddit data remain NaN.
#df_reddit_full["average_sentiment_score"] = df_reddit_full.groupby("ticker")["average_sentiment_score"].ffill()
#df_reddit_full["comment_volume"] = df_reddit_full.groupby("ticker")["comment_volume"].ffill()

# 5) Lag both Reddit features by one grid row (t-1) within each ticker.
lagged = (
    df_reddit_full
    .groupby("ticker")[["average_sentiment_score", "comment_volume"]]
    .shift(1)
)
df_reddit_full["comment_sentiment_lag1"] = lagged["average_sentiment_score"]
df_reddit_full["comment_volume_lag1"] = lagged["comment_volume"]

# 6) Only the join keys and the lagged columns are carried into the merge.
lag_columns = ["date", "ticker", "comment_sentiment_lag1", "comment_volume_lag1"]
df_reddit_lag = df_reddit_full.loc[:, lag_columns]

# 7) Attach the lagged Reddit features to every stock row (left join).
df_merged = pd.merge(df_stock, df_reddit_lag, how="left", on=["date", "ticker"])

# 8) Inspect the result and count stock rows with no lagged Reddit value.
print(df_merged.head(100))
nan_counts = df_merged[['comment_sentiment_lag1', 'comment_volume_lag1']].isna().sum()
print(f"NaNs after merge:\n{nan_counts}")

In [None]:
# Remove MRK: it has only one data point.
df_merged = df_merged[df_merged["ticker"] != "MRK"]
# Tickers are upper-cased when the data is loaded, so the check must look for
# "MRK"; a lowercase "mrk" can never match and would always report False.
print("MRK in df_merged:", df_merged["ticker"].eq("MRK").any())

# Rows that still lack a lagged sentiment value (computed once, reused below).
missing = df_merged[df_merged["comment_sentiment_lag1"].isna()]
print(missing["ticker"].value_counts().head(10))  # Most affected tickers
print(missing.groupby("ticker")["date"].min().head(10))  # First missing date per ticker

# How many rows, and which ticker/date pairs, are affected.
print(missing[["date", "ticker"]].drop_duplicates().head(20))
print(f"Missing rows: {len(missing)} / {len(df_merged)}")

# Distinct dates in the lagged Reddit grid, the stock data and the merged frame.
print(df_reddit_lag["date"].nunique())
print(df_stock["date"].nunique())
print("merged: ")
print(df_merged["date"].nunique())

# Check for ticker-format mismatches between the two sources.
stock_tickers = set(df_stock["ticker"])
reddit_tickers = set(df_reddit["ticker"])
print("Tickers in stock but not in reddit:", stock_tickers - reddit_tickers)
print("Tickers in reddit but not in stock:", reddit_tickers - stock_tickers)

# NaN counts before trimming the first day.
print(f"NaNs after merge:\n{df_merged[['comment_sentiment_lag1', 'comment_volume_lag1']].isna().sum()}")

# Drop the first day of the window: a t-1 lag has no previous row there.
# NOTE(review): assumes the first date present is 2024-02-02 — confirm.
start, end = "2024-02-03", "2025-04-08"
masker = df_merged["date"].between(start, end)
df_merged = df_merged.loc[masker].reset_index(drop=True)

# NaN counts after trimming the first day.
print(f"NaNs after merge:\n{df_merged[['comment_sentiment_lag1', 'comment_volume_lag1']].isna().sum()}")


In [None]:
# Per-ticker coverage: total rows versus rows with no lagged sentiment.
lacks_sentiment = df_merged["comment_sentiment_lag1"].isna()
missing = df_merged[lacks_sentiment]
total_counts = df_merged["ticker"].value_counts()
missing_counts = missing["ticker"].value_counts()

coverage = pd.DataFrame({"total": total_counts, "missing": missing_counts})
# Tickers absent from missing_counts have no missing rows, so count them as 0.
coverage = coverage.fillna(0)
coverage["missing_pct"] = coverage["missing"].div(coverage["total"])

# Keep tickers with at most 70% of their rows missing lagged sentiment.
within_threshold = coverage["missing_pct"].le(0.7)
good_tickers = coverage.loc[within_threshold].index
df_filtered = df_merged.loc[df_merged["ticker"].isin(good_tickers)].copy()

print(f"Kept {len(good_tickers)} tickers out of {len(coverage)}")
print(f"Filtered dataset shape: {df_filtered.shape}")



In [None]:
# Preview the filtered frame and list the tickers that survived filtering.
print(df_filtered.head())
print(df_filtered['ticker'].unique())

sentiment_lag = df_filtered["comment_sentiment_lag1"]
volume_lag = df_filtered["comment_volume_lag1"]

# Flag rows with no lagged sentiment (1 = missing, 0 = present) so the
# zero fill below can still be told apart from a real zero.
df_filtered["reddit_sentiment_missing"] = sentiment_lag.isna().astype(int)

# Zero-filled copies of the lagged features, stored under reddit_* names.
df_filtered["reddit_sentiment_lag1"] = sentiment_lag.fillna(0.0)
df_filtered["reddit_volume_lag1"] = volume_lag.fillna(0.0)



In [None]:
# Final look at the data, then write the selected columns to CSV.
print(df_filtered.head())
print(df_filtered.shape)

export_columns = [
    'date',
    'ticker',
    'RealizedVol_3d',
    'Target',
    'reddit_sentiment_lag1',
    'reddit_volume_lag1',
    'reddit_sentiment_missing',
]
# index=False keeps the row index out of the output file.
df_filtered.loc[:, export_columns].to_csv('yahooredditcombined.csv', index=False)