In [10]:
import os
import pandas as pd
def merge_news_parts(out_dir="fmp_news_backfill", out_file="news_all.parquet"):
    """Concatenate every parquet part under ``<out_dir>/parts`` and save the result.

    Part files are read in sorted path order, stacked with a fresh 0..n-1
    index, and the combined frame is written to ``<out_dir>/<out_file>``.

    Parameters
    ----------
    out_dir : str
        Directory that contains the ``parts`` sub-directory and receives the
        merged output file.
    out_file : str
        File name of the merged parquet written inside ``out_dir``.

    Returns
    -------
    pandas.DataFrame
        The merged frame (the same data that was written to disk).

    Raises
    ------
    FileNotFoundError
        If ``<out_dir>/parts`` does not exist (raised by ``os.listdir``).
    ValueError
        If the parts directory contains no ``.parquet`` files.
    """
    parts_dir = os.path.join(out_dir, "parts")
    files = sorted(
        os.path.join(parts_dir, name)
        for name in os.listdir(parts_dir)
        if name.endswith(".parquet")
    )
    if not files:
        # pd.concat would fail with an opaque "No objects to concatenate";
        # name the directory so the problem is obvious.
        raise ValueError(f"No .parquet part files found in {parts_dir!r}")
    all_df = pd.concat((pd.read_parquet(f) for f in files), ignore_index=True)
    all_df.to_parquet(os.path.join(out_dir, out_file), index=False)
    return all_df

In [11]:
# Merge all downloaded parts into one frame (this also writes the combined parquet).
all_the_news = merge_news_parts()
n_before = len(all_the_news)

# Rows sharing both url and symbol are treated as duplicates; the first one is kept.
dedupe_keys = ["url", "symbol"]
all_the_news = all_the_news.drop_duplicates(subset=dedupe_keys, keep="first")

n_after = len(all_the_news)
n_removed = n_before - n_after
print(f"Loaded {n_before:,} articles, {n_removed:,} duplicates removed, {n_after:,} remaining")

Loaded 2,046,884 articles, 65,309 duplicates removed, 1,981,575 remaining


In [12]:
# Load the ticker universe from parquet and preview its first rows.
universe = pd.read_parquet('data/universe.pqt')
universe.head()

Unnamed: 0,symbol,Security Name
0,AACB,Artius II Acquisition Inc. - Class A Ordinary ...
1,AACBU,Artius II Acquisition Inc. - Units
2,AACG,ATA Creativity Global - American Depositary Sh...
3,AAL,"American Airlines Group, Inc. - Common Stock"
4,AAME,Atlantic American Corporation - Common Stock


In [13]:
# Preview the first rows of the news frame.
all_the_news.head()

Unnamed: 0,symbol,publishedDate,publisher,title,image,site,text,url,window_from,window_to,page
0,AACG,2020-12-18 07:00:00,GlobeNewsWire,ACG Announces Results of Annual General Meetin...,https://images.financialmodelingprep.com/news/...,globenewswire.com,"BEIJING, Dec. 18, 2020 (GLOBE NEWSWIRE) -- ATA...",https://www.globenewswire.com/news-release/202...,2020-12-18,2021-01-13,0
1,AACG,2021-02-04 23:28:05,Benzinga,Why ATA Creativity Skyrocketed 951% Today,https://images.financialmodelingprep.com/news/...,benzinga.com,ATA Creativity Global (NASDAQ: AACG) stock clo...,https://www.benzinga.com/news/21/02/19504724/w...,2021-01-13,2021-02-12,0
2,AACG,2021-02-04 14:17:34,Pulse2,AACG Stock Price Increases Over 400%: Why It H...,https://images.financialmodelingprep.com/news/...,pulse2.com,The stock price of ATA Creativity Global (NASD...,https://pulse2.com/aacg-stock-price-increases-...,2021-01-13,2021-02-12,0
3,AACG,2021-02-04 11:23:20,InvestorPlace,AACG Stock: 7 Things to Know About ATA Creativ...,https://images.financialmodelingprep.com/news/...,investorplace.com,ATA Creativity (AACG) stock is rocketing highe...,https://investorplace.com/2021/02/aacg-stock-7...,2021-01-13,2021-02-12,0
4,AACG,2021-04-13 16:05:00,GlobeNewsWire,ATA Creativity Global Announces Filing of Annu...,https://images.financialmodelingprep.com/news/...,globenewswire.com,"BEIJING, China, April 13, 2021 (GLOBE NEWSWIRE...",https://www.globenewswire.com/news-release/202...,2021-03-14,2021-04-13,0


In [14]:
from anonymize_news import build_name_map, Anonymizer

In [15]:
# Build the name map that is later handed to apply_anonymization / Anonymizer.
# Presumably maps each symbol to its name variants (it is annotated as
# dict[str, list[str]] where it is consumed) — confirm in anonymize_news.
name_map = build_name_map(universe)

In [16]:
from tqdm.auto import tqdm
from multiprocessing import Pool, cpu_count

# Per-process Anonymizer slot. It starts as None and is filled in by _init_worker,
# which is used as the Pool initializer; _anon_row then calls it for every row.
_anon = None

def _init_worker(name_map: dict[str, list[str]]) -> None:
    """Pool initializer: build an Anonymizer and store it in the module-level ``_anon``.

    Args:
        name_map: Passed straight through to ``Anonymizer``.
    """
    global _anon
    _anon = Anonymizer(name_map)

def _anon_row(args: tuple[str, str]) -> str:
    """Anonymize one ``(text, symbol)`` pair with the process-wide ``_anon`` callable.

    Args:
        args: Two-item tuple; the first item is the text to anonymize and the
            second is the symbol passed alongside it.

    Returns:
        Whatever ``_anon(text, symbol)`` returns (annotated as ``str``).
    """
    raw_text, ticker = args
    return _anon(raw_text, ticker)

def apply_anonymization(news_df: pd.DataFrame, name_map: dict[str, list[str]], n_workers: int | None = None) -> pd.DataFrame:
    df = news_df.copy()
    n_workers = n_workers or cpu_count()

    for col in ["title", "text"]:
        if col not in df.columns:
            continue
        pairs = list(zip(df[col].tolist(), df["symbol"].tolist()))
        
        with Pool(n_workers, initializer=_init_worker, initargs=(name_map,)) as pool:
            results = list(tqdm(
                pool.imap(_anon_row, pairs, chunksize=1000),
                total=len(pairs),
                desc=f"Anonymizing {col}"
            ))
        df[f"{col}_anon"] = results
    return df

In [17]:
ANON_OUTPUT_PATH = "data/all_the_news_anon.pqt"

# Load any previously anonymized articles so that only new ones get processed.
try:
    existing_anon = pd.read_parquet(ANON_OUTPUT_PATH)
    already_done = set(zip(existing_anon["url"], existing_anon["symbol"]))
    print(f"Found {len(already_done):,} existing anonymized articles")
except FileNotFoundError:
    existing_anon = None
    already_done = set()
    print("No existing anonymized data found")

# Filter to only new articles. The (url, symbol) keys live in a standalone Series
# aligned to the frame's index, so all_the_news itself is never modified: no
# temporary column is added and dropped, and re-running the cell is safe.
news_keys = pd.Series(
    list(zip(all_the_news["url"], all_the_news["symbol"])),
    index=all_the_news.index,
    dtype=object,
)
to_anon = all_the_news[~news_keys.isin(already_done)].copy()
print(f"{len(to_anon):,} articles to anonymize")

Found 1,748,717 existing anonymized articles
232,858 articles to anonymize


In [None]:
# Run anonymization on new articles only
if len(to_anon) == 0:
    print("Nothing new to anonymize")
    # May be None when there is neither cached nor new data.
    anon_df = existing_anon
else:
    new_anon_df = apply_anonymization(to_anon, name_map)

    # Append to existing or create new
    if existing_anon is not None:
        anon_df = pd.concat([existing_anon, new_anon_df], ignore_index=True)
    else:
        anon_df = new_anon_df

    # Write to a sibling temp file and swap it in afterwards, so an interrupted
    # write cannot truncate the existing cache that the incremental check reads.
    tmp_output_path = f"{ANON_OUTPUT_PATH}.tmp"
    anon_df.to_parquet(tmp_output_path, index=False)
    os.replace(tmp_output_path, ANON_OUTPUT_PATH)
    print(f"Saved {len(anon_df):,} total anonymized articles")

Anonymizing title:   0%|          | 0/232858 [00:00<?, ?it/s]

Anonymizing text:   0%|          | 0/232858 [00:00<?, ?it/s]

In [13]:
# Show URLs that appear on more than one row
url_counts = all_the_news["url"].value_counts()
dup_urls = url_counts[url_counts > 1]
print(f"{len(dup_urls):,} URLs appear more than once")
# Use a real newline (not an escaped backslash) so the counts start on their own line.
print(f"Top duplicates:\n{dup_urls.head(10)}")

# Show example articles with same URL (the most frequent one; value_counts sorts descending)
example_url = dup_urls.index[0]
all_the_news[all_the_news["url"] == example_url][["symbol", "publishedDate", "title", "url"]]

116,275 URLs appear more than once
Top duplicates:\nurl
https://www.gurufocus.com/news/3240667/first-look-fed-cut-looms-spacex-ipo-jj-myeloma-data-google-probe                        29
https://www.investors.com/news/technology/biotech-stocks-the-top-5-to-watch-amid-a-blazing-hot-run/                            26
https://www.gurufocus.com/news/3215323/market-today-tech-slide-meta-win-ai-deals-wbd-bids                                      26
https://www.investors.com/etfs-and-funds/sectors/sp500-millennials-top-stock-is-up-195-percent-and-youve-never-heard-of-it/    24
https://seekingalpha.com/article/4574414-restaurant-stocks-here-are-3-picks-for-2023                                           24
https://www.gurufocus.com/news/3229645/first-look-versace-deal-crypto-slump-wbd-bids-and-airbus-woes                           24
https://www.cnbc.com/2023/10/30/here-are-16-stocks-jim-cramer-is-watching-including-microsoft-chevron-southwest.html           24
https://seekingalpha.com/article/4

Unnamed: 0,symbol,publishedDate,title,url
47531,ACI,2025-12-10 07:37:00,First Look: Fed Cut Looms; SpaceX IPO; J&J Mye...,https://www.gurufocus.com/news/3240667/first-l...
56361,ADBE,2025-12-10 07:37:00,First Look: Fed Cut Looms; SpaceX IPO; J&J Mye...,https://www.gurufocus.com/news/3240667/first-l...
98674,ALK,2025-12-10 07:37:00,First Look: Fed Cut Looms; SpaceX IPO; J&J Mye...,https://www.gurufocus.com/news/3240667/first-l...
184397,AVGO,2025-12-10 07:37:00,First Look: Fed Cut Looms; SpaceX IPO; J&J Mye...,https://www.gurufocus.com/news/3240667/first-l...
206568,BITF,2025-12-10 07:37:00,First Look: Fed Cut Looms; SpaceX IPO; J&J Mye...,https://www.gurufocus.com/news/3240667/first-l...
232672,CART,2025-12-10 07:37:00,First Look: Fed Cut Looms; SpaceX IPO; J&J Mye...,https://www.gurufocus.com/news/3240667/first-l...
259724,CLSK,2025-12-10 07:37:00,First Look: Fed Cut Looms; SpaceX IPO; J&J Mye...,https://www.gurufocus.com/news/3240667/first-l...
276467,CORZ,2025-12-10 07:37:00,First Look: Fed Cut Looms; SpaceX IPO; J&J Mye...,https://www.gurufocus.com/news/3240667/first-l...
280715,COST,2025-12-10 07:37:00,First Look: Fed Cut Looms; SpaceX IPO; J&J Mye...,https://www.gurufocus.com/news/3240667/first-l...
525535,IREN,2025-12-10 07:37:00,First Look: Fed Cut Looms; SpaceX IPO; J&J Mye...,https://www.gurufocus.com/news/3240667/first-l...
