In [1]:
import os
import pandas as pd
def merge_news_parts(out_dir="fmp_news_backfill", out_file="news_all.parquet"):
    """Concatenate every ``*.parquet`` part under ``<out_dir>/parts`` into one file.

    Parts are read in sorted path order, stacked with a fresh 0..n-1 index,
    written to ``<out_dir>/<out_file>`` (index not stored), and returned.

    Args:
        out_dir: Directory that contains the ``parts`` sub-directory; the
            merged file is written directly inside it.
        out_file: File name of the merged parquet output.

    Returns:
        The merged DataFrame.

    Raises:
        FileNotFoundError: If ``<out_dir>/parts`` does not exist (raised by
            ``os.listdir``).
        ValueError: If the parts directory holds no ``.parquet`` files.
    """
    parts_dir = os.path.join(out_dir, "parts")
    part_paths = sorted(
        os.path.join(parts_dir, name)
        for name in os.listdir(parts_dir)
        if name.endswith(".parquet")
    )
    if not part_paths:
        # pd.concat would fail here with an opaque "No objects to concatenate";
        # keep the same exception type but say which directory was empty.
        raise ValueError(f"No .parquet part files found in {parts_dir!r}")

    merged = pd.concat([pd.read_parquet(path) for path in part_paths], ignore_index=True)
    merged.to_parquet(os.path.join(out_dir, out_file), index=False)
    return merged

In [2]:
# Merge the news part files, spelling out the locations the function defaults to.
all_the_news = merge_news_parts(
    out_dir="fmp_news_backfill",
    out_file="news_all.parquet",
)

In [3]:
# Load the symbol / security-name universe table and preview its first rows.
universe = pd.read_parquet(path="data/universe.pqt")
universe.head(n=5)

Unnamed: 0,symbol,Security Name
0,AACB,Artius II Acquisition Inc. - Class A Ordinary ...
1,AACBU,Artius II Acquisition Inc. - Units
2,AACG,ATA Creativity Global - American Depositary Sh...
3,AAL,"American Airlines Group, Inc. - Common Stock"
4,AAME,Atlantic American Corporation - Common Stock


In [4]:
# Preview the first five rows of the merged news table.
all_the_news.head(n=5)

Unnamed: 0,symbol,publishedDate,publisher,title,image,site,text,url,window_from,window_to,page
0,AACG,2020-12-18 07:00:00,GlobeNewsWire,ACG Announces Results of Annual General Meetin...,https://images.financialmodelingprep.com/news/...,globenewswire.com,"BEIJING, Dec. 18, 2020 (GLOBE NEWSWIRE) -- ATA...",https://www.globenewswire.com/news-release/202...,2020-12-18,2021-01-13,0
1,AACG,2021-02-04 23:28:05,Benzinga,Why ATA Creativity Skyrocketed 951% Today,https://images.financialmodelingprep.com/news/...,benzinga.com,ATA Creativity Global (NASDAQ: AACG) stock clo...,https://www.benzinga.com/news/21/02/19504724/w...,2021-01-13,2021-02-12,0
2,AACG,2021-02-04 14:17:34,Pulse2,AACG Stock Price Increases Over 400%: Why It H...,https://images.financialmodelingprep.com/news/...,pulse2.com,The stock price of ATA Creativity Global (NASD...,https://pulse2.com/aacg-stock-price-increases-...,2021-01-13,2021-02-12,0
3,AACG,2021-02-04 11:23:20,InvestorPlace,AACG Stock: 7 Things to Know About ATA Creativ...,https://images.financialmodelingprep.com/news/...,investorplace.com,ATA Creativity (AACG) stock is rocketing highe...,https://investorplace.com/2021/02/aacg-stock-7...,2021-01-13,2021-02-12,0
4,AACG,2021-04-13 16:05:00,GlobeNewsWire,ATA Creativity Global Announces Filing of Annu...,https://images.financialmodelingprep.com/news/...,globenewswire.com,"BEIJING, China, April 13, 2021 (GLOBE NEWSWIRE...",https://www.globenewswire.com/news-release/202...,2021-03-14,2021-04-13,0


In [5]:
from anonymize_news import build_name_map, anonymize_text, AnonymizeConfig

In [6]:
# Small fixed sample for trying things out. Seeded so Restart & Run All
# yields the same rows; capped at the frame length so a small dataset does
# not raise ("Cannot take a larger sample than population").
test_df = all_the_news.sample(n=min(100, len(all_the_news)), random_state=42).copy()

In [7]:
# Build the name map from the universe table; it is handed to
# AnonymizeConfig(name_map=...) for every row inside apply_anonymization.
# NOTE(review): annotated downstream as dict[str, list[str]] — confirm
# against build_name_map's actual return type.
name_map = build_name_map(universe)

In [8]:
def apply_anonymization(news_df: pd.DataFrame, name_map: dict[str, list[str]]) -> pd.DataFrame:
    """Return a copy of ``news_df`` with anonymized title/text columns added.

    For each of the columns ``title`` and ``text`` that is present, a sibling
    column ``<col>_anon`` is created by running ``anonymize_text`` on every
    value with an ``AnonymizeConfig`` targeting that row's ``symbol``.
    Columns that are absent are skipped; the input frame is not modified.

    Args:
        news_df: News table; must have a ``symbol`` column whenever ``title``
            or ``text`` is present.
        name_map: Mapping passed unchanged to ``AnonymizeConfig(name_map=...)``.

    Returns:
        A new DataFrame with the extra ``*_anon`` columns.
    """
    result = news_df.copy()

    for source_col in ("title", "text"):
        if source_col not in result.columns:
            continue

        anonymized = []
        row_pairs = zip(result[source_col].tolist(), result["symbol"].tolist())
        for raw_value, ticker in row_pairs:
            # A fresh config per row: the target symbol changes row to row.
            config = AnonymizeConfig(target_symbol=ticker, name_map=name_map)
            anonymized.append(anonymize_text(raw_value, config))

        result[f"{source_col}_anon"] = anonymized

    return result

In [None]:
# Anonymize the title/text columns of the full merged news table.
anon_df = apply_anonymization(news_df=all_the_news, name_map=name_map)

In [None]:
# Persist the anonymized news table.
anon_df.to_parquet(path="data/all_the_news.pqt")