In [None]:
import sys
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Parent of the current working directory; used both as the base for data /
# report paths and as the sys.path entry that lets the `src` imports resolve.
project_root = Path.cwd().parent
# Guard the append so re-running this cell does not stack duplicate entries
# on sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))
from src.utils.data_loader import load_and_merge_data
from src.models.config import CONFIG


## Statistics of news per day

In [40]:
# Build the merged dataframe, restricted to the date window from CONFIG.

data_dir = project_root / "data"
date_window = {
    "start_date": CONFIG["start_date"],
    "end_date": CONFIG["end_date"],
}
df = load_and_merge_data(data_dir, **date_window)

print(df.shape)

Reading data from: /Users/brianramesh/Documents/ MAT_implementation/Modality-aware-transformer/data
Loading datasets...

--- Merging Data ---
Merged Market: (3254401, 12)
Merged Ratios: (3254401, 17)
Merged Macro: (3254401, 24)
Merged Text: (3254401, 31)
Filling NaN values...
Keeping records between 2010-01-01 and 2023-12-15...
Done! Final Data Shape: (2500247, 31)
(2500247, 31)


In [41]:
# Preview the merged frame; the rendered table below is truncated to the
# first and last rows (and a subset of columns).
df

Unnamed: 0,date,permno,target,mkt_log_ret,mkt_cap_rank,mkt_mom_1m,mkt_mom_3m,mkt_volatility,mkt_drawdown,mkt_turnover,...,macro_yield_curve,macro_risk_free,macro_vix,emb_mean,sent_score_mean,sent_pos_mean,sent_neg_mean,log_n_news,sent_score_std,has_news
756,2010-01-04,10078.0,-0.343378,0.001066,0.088685,0.127156,0.045808,-4.517852,0.000000,0.009364,...,0.0271,0.0385,3.121484,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.000000,0.000000,0.000000,0.000000,0.000000,0.0
757,2010-01-05,10078.0,-0.696787,0.001065,0.085835,0.131860,0.040200,-4.530426,0.000000,0.005449,...,0.0276,0.0385,3.046425,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.000000,0.000000,0.000000,0.000000,0.000000,0.0
758,2010-01-06,10078.0,-0.194579,-0.003200,0.080172,0.103464,0.050393,-4.848275,-0.003200,0.004388,...,0.0276,0.0377,3.013081,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.000000,0.000000,0.000000,0.000000,0.000000,0.0
759,2010-01-07,10078.0,-0.738244,0.004265,0.079155,0.105361,0.049055,-4.912637,0.000000,0.006612,...,0.0284,0.0385,3.003700,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.000000,0.000000,0.000000,0.000000,0.000000,0.0
760,2010-01-08,10078.0,0.231038,-0.002130,0.074919,0.099691,0.039135,-4.950801,-0.002130,0.003971,...,0.0282,0.0385,2.998728,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.000000,0.000000,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3254136,2023-12-11,93436.0,-0.467862,-0.016957,2.472342,0.132544,-0.109490,-3.734203,-0.201777,0.030164,...,-0.0048,0.0423,2.591516,"[0.01965, 0.00074, 0.010315, 0.002565, 0.02441...",0.443372,0.634558,0.191186,3.583519,0.562789,1.0
3254137,2023-12-12,93436.0,-0.579503,-0.011452,2.458230,0.099095,-0.135123,-3.759665,-0.213230,0.029437,...,-0.0048,0.0423,2.612273,"[0.01692, -3.54e-05, 0.02898, 0.0009627, 0.027...",0.310590,0.584206,0.273616,3.044523,0.632807,1.0
3254138,2023-12-13,93436.0,1.342665,0.009574,2.461548,0.067327,-0.142869,-3.780947,-0.203656,0.044855,...,-0.0053,0.0420,2.570319,"[0.01587, -0.00207, 0.01813, 0.0006576, 0.0209...",0.196215,0.504547,0.308332,3.135494,0.595684,1.0
3254139,2023-12-14,93436.0,0.741246,0.047976,2.502583,0.055865,-0.088898,-3.752039,-0.155680,0.048814,...,-0.0042,0.0404,2.579459,"[0.02551, 0.00441, 0.02205, 0.003414, 0.02922,...",0.159521,0.521112,0.361591,2.944439,0.663220,1.0


In [None]:
# Work on a fresh copy of the frame with `date` parsed as datetime.
df = df.assign(date=pd.to_datetime(df["date"]))

# Per date: distinct permno count and the sum of the has_news flag.
daily = df.groupby("date").agg(
    n_stocks=("permno", "nunique"),
    n_with_news=("has_news", "sum"),
)
daily["pct_with_news"] = 100 * daily["n_with_news"] / daily["n_stocks"]
daily = daily.reset_index()

# 20-row rolling mean; min_periods=1 means the earliest rows still get a value.
daily["pct_with_news_ma20"] = (
    daily["pct_with_news"].rolling(window=20, min_periods=1).mean()
)

# Destination for the figure (directories are created on demand).
output_dir = project_root / "reports" / "predictions" / "figures"
output_dir.mkdir(parents=True, exist_ok=True)
output_path = output_dir / "daily_share_stocks_with_news.png"

# Raw daily series drawn faintly underneath the smoothed series.
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(
    daily["date"],
    daily["pct_with_news"],
    alpha=0.4,
    linewidth=0.8,
    label="Daily % of stocks with news",
)
ax.plot(
    daily["date"],
    daily["pct_with_news_ma20"],
    linewidth=1.5,
    label="20-day moving average",
)
ax.set(
    xlabel="Date",
    ylabel="% stocks with news",
    title="Daily share of stocks with news",
)
ax.legend()
fig.tight_layout()

# The figure is written to disk and then closed, so nothing renders inline.
fig.savefig(output_path, dpi=300)
plt.close(fig)

In [None]:
# Work on a fresh copy of the frame with `date` parsed as datetime.
df = df.assign(date=pd.to_datetime(df["date"]))

# Per date: distinct permno count and the sum of the has_news flag.
daily = df.groupby("date").agg(
    n_stocks=("permno", "nunique"),
    n_with_news=("has_news", "sum"),
)
daily["pct_with_news"] = 100 * daily["n_with_news"] / daily["n_stocks"]
daily = daily.reset_index()

# Yearly summary statistics of the daily percentage, one output column each.
annual_stats = {
    "avg_pct_with_news": ("pct_with_news", "mean"),
    "median_pct_with_news": ("pct_with_news", "median"),
    "min_pct_with_news": ("pct_with_news", "min"),
    "max_pct_with_news": ("pct_with_news", "max"),
    "n_days": ("pct_with_news", "count"),
}
daily_with_year = daily.assign(year=daily["date"].dt.year)
annual = daily_with_year.groupby("year").agg(**annual_stats).reset_index()

# Destination for the table (directories are created on demand).
output_dir = project_root / "reports" / "predictions" / "tables"
output_dir.mkdir(parents=True, exist_ok=True)
output_path = output_dir / "annual_share_stocks_with_news.csv"

annual.to_csv(output_path, index=False)