#### 01. News Ingestion (Import & Setup)

This pipeline pulls biotech news from **Google News RSS** and the **Finnhub API** over the period **2024‑01‑01 → 2025‑07‑30** for our biotech small‑cap universe.

- **Google RSS**: fetches titles + summaries, filtered by keywords (biotech, FDA, clinical trial).  
- **Finnhub**: company‑news endpoint with additional regex filtering for approvals, clinical phases, partnerships, etc.  
- Finally we’ll merge the two sets into ~6,791 unique `(ticker, date, title)` rows.

In [3]:
import pandas as pd
import numpy as np
from pathlib import Path
import os
from dotenv import load_dotenv, find_dotenv
# Load variables from the nearest .env BEFORE reading any of them: DATA_DIR
# below is resolved through os.getenv, so a value defined only in .env would
# otherwise be ignored. override=False leaves already-set variables untouched.
load_dotenv(find_dotenv(usecwd=True), override=False)

# Project root: the folder containing this file when run as a script, else the
# current working directory (e.g. in a notebook, where __file__ is undefined).
ROOT = Path(__file__).resolve().parent if "__file__" in globals() else Path.cwd()
DATA_DIR = Path(os.getenv("DATA_DIR", ROOT / "data"))


def p(file):
    """Return the path of *file* inside DATA_DIR."""
    return DATA_DIR / file

True

In [4]:
import feedparser
import time
from urllib.parse import quote
from datetime import datetime
from time import mktime
import re
import time, re, gc, math, random
from datetime import datetime,timedelta
import requests
from requests.adapters import HTTPAdapter, Retry
# Load the ML dataset from DATA_DIR and collect the distinct values of its
# "ticker" column; these drive the per-ticker news queries.
ml      = pd.read_parquet(p("ML.parquet"))
tickers = ml["ticker"].unique().tolist()

In [None]:



# Publication-date window for the Google News pull. Both bounds are naive
# datetimes and are compared against naive UTC publication times below.
start_dt = datetime(2024,1,1)
end_dt   = datetime(2025,7,30)


BASE_RSS = "https://news.google.com/rss/search?q="
records  = []

for tk in tickers:
    # One search feed per ticker; the query is URL-encoded because it contains
    # quotes and spaces.
    q   = quote(f'"{tk}" biotech OR FDA OR "clinical trial"')
    url = f"{BASE_RSS}{q}&hl=en-US&gl=US&ceid=US:en"

    feed = feedparser.parse(url)
    for e in feed.entries:
        parsed = e.get("published_parsed")
        if parsed:
            # feedparser exposes the parsed date as a UTC time tuple. Build the
            # naive-UTC datetime straight from its fields: going through
            # mktime()/fromtimestamp() would reinterpret it as local time and
            # can shift the timestamp (and its calendar day) around DST.
            dt = datetime(*parsed[:6])
        else:
            # Fall back to the raw date string. Parse it as UTC and drop the
            # timezone so it can be compared with the naive window bounds
            # (comparing tz-aware and naive datetimes raises TypeError).
            dt = pd.to_datetime(e.get("published", None), errors="coerce", utc=True)
            if not pd.isna(dt):
                dt = dt.tz_localize(None)

        # Skip entries with no usable date or outside the requested window.
        if pd.isna(dt) or not (start_dt <= dt <= end_dt):
            continue

        records.append({
            "ticker":      tk,
            "title":       e.get("title"),
            "summary":     e.get("summary"),
            "link":        e.get("link"),
            "publishedAt": dt
        })

    # Pause 2 s between tickers to keep the request rate low.
    time.sleep(2)


df_news = pd.DataFrame(records)
print("Articoli totali nel periodo:", len(df_news))
print(df_news.head())

# Persisted relative to the current working directory; the merge step reads
# the file back under this exact name.
df_news.to_parquet("biotech_google_news_2024_20250627.parquet", index=False)

**Output:**

Total: 4121
  ticker                                              title  \
0   CDXS  What analysts say about CDXS stock - Remarkabl...   
1   CDXS  Codexis (NASDAQ:CDXS investor three-year losse...   
2   CDXS  Opaleye Management Inc. Increases Stake in Cod...   
3   CDXS  Codexis, Inc. (NASDAQ:CDXS) Just Released Its ...   
4   CDXS  Casdin Capital, LLC Increases Stake in Codexis...   

                                             summary  \
0  <a href="https://news.google.com/rss/articles/...   
1  <a href="https://news.google.com/rss/articles/...   
2  <a href="https://news.google.com/rss/articles/...   
3  <a href="https://news.google.com/rss/articles/...   
4  <a href="https://news.google.com/rss/articles/...   

                                                link         publishedAt  
0  https://news.google.com/rss/articles/CBMigwFBV... 2025-07-25 20:04:52  
1  https://news.google.com/rss/articles/CBMi1wFBV... 2025-03-01 08:00:00  
2  https://news.google.com/rss/articles/CBMijAFBV... 2024-09-20 08:00:00  
3  https://news.google.com/rss/articles/CBMi2AFBV... 2025-05-17 08:00:00  
4  https://news.google.com/rss/articles/CBMihwFBV... 2024-09-26 08:00:00  


#  2. Google News RSS

For each ticker:
- build the query `"TICKER" biotech OR FDA OR "clinical trial"`,  
- download the RSS feed,  
- extract title, summary, link, and publication date,  
- respect a 2 s delay to avoid overloading Google,  
- cache results on disk for faster iterations during development.

In [None]:
tickers_list = ml["ticker"].unique().tolist()

# Endpoint queried by fetch_company_news with symbol/from/to/token parameters
# and parsed as JSON. The default must therefore be the Finnhub company-news
# endpoint (the Google News RSS URL does not accept these parameters and does
# not return JSON). Can be overridden through the BASE_URL environment variable.
BASE_URL = os.getenv("BASE_URL", "https://finnhub.io/api/v1/company-news")
# API token sent as the `token` query parameter; None when API_KEY is unset.
API_KEY  = os.getenv("API_KEY")
FROM_DATE = datetime(2024,1,1)
TO_DATE   = datetime(2025,7,30)


# Client-side throttle: each request cycle in fetch_company_news is padded to
# at least PAUSE_SEC seconds, i.e. at most MAX_REQ_PER_MIN requests per minute.
# NOTE(review): 55 is presumably chosen to stay under the provider's
# per-minute quota -- confirm against the plan in use.
MAX_REQ_PER_MIN = 55
PAUSE_SEC       = 60 / MAX_REQ_PER_MIN

# scrape_all sleeps CHUNK_PAUSE seconds after every CHUNK_SIZE tickers.
CHUNK_SIZE  = 25
CHUNK_PAUSE = 30


# Alternatives matched (case-insensitively, as whole words) in headlines and
# summaries to keep only event-like news: approvals, trial phases, deals, etc.
_KW_ALTERNATIVES = (
    r"\bapproval\b",
    r"\bphase\s*(?:I|II|III)\b",
    r"\btrial\b",
    r"\borphan\b",
    r"\bbreakthrough\b",
    r"\blicense\b",
    r"\bdeal\b",
    r"\bacqui(?:sition|re)\b",
    r"\bpartner(?:ship)?\b",
    r"\bFDA\b",
)

# Joined with "| "; the literal space after each bar is ignored because the
# pattern is compiled in VERBOSE mode.
kw_pattern = "| ".join(_KW_ALTERNATIVES)
kw_re = re.compile(kw_pattern, flags=re.IGNORECASE | re.VERBOSE)


def daterange(start_date, end_date, step_days=90):
    """Yield consecutive, non-overlapping ``(start, end)`` windows.

    The windows cover ``start_date`` through ``end_date`` inclusive. Each
    window spans at most ``step_days`` calendar days; the last one is
    truncated at ``end_date``. Nothing is yielded when
    ``start_date > end_date``.

    Args:
        start_date: First day of the range (date or datetime).
        end_date: Last day of the range, inclusive.
        step_days: Maximum window length in days; must be at least 1.

    Raises:
        ValueError: If ``step_days`` is smaller than 1. A non-positive step
            would never advance and the generator would yield forever. As
            with any generator, the error surfaces on first iteration.
    """
    if step_days < 1:
        raise ValueError(f"step_days must be >= 1, got {step_days}")

    span = timedelta(days=step_days - 1)
    one_day = timedelta(days=1)

    cur = start_date
    while cur <= end_date:
        nxt = min(cur + span, end_date)
        yield cur, nxt
        # The next window starts the day after this one ends.
        cur = nxt + one_day


def _build_session() -> requests.Session:
    """Create the shared HTTP session used for all API calls.

    The session sends a fixed User-Agent and, for ``https://`` URLs, retries
    GET requests up to 5 times with a 0.5 back-off factor on 429/5xx
    responses. With ``raise_on_status=False`` the last response is returned
    once retries are exhausted instead of raising.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=0.5,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET"],
        raise_on_status=False,
    )
    adapter = HTTPAdapter(max_retries=retry_policy)

    session = requests.Session()
    session.headers["User-Agent"] = "biotech-news-screener/0.2"
    session.mount("https://", adapter)
    return session


# Single module-level session reused by every request.
SESSION = _build_session()


def fetch_company_news(ticker: str) -> pd.DataFrame:
    """Download and keyword-filter company news for one ticker.

    The FROM_DATE..TO_DATE range is queried in 90-day windows. For each window
    the JSON payload is loaded into a frame, restricted to the window's dates
    and kept only where the headline or the summary matches ``kw_re``. Every
    request cycle is padded to at least PAUSE_SEC seconds.

    Args:
        ticker: Symbol passed as the ``symbol`` query parameter.

    Returns:
        A frame with columns ``date``, ``title``, ``source``, ``url`` and
        ``ticker``, de-duplicated on (ticker, date, title); a column-less
        empty frame when nothing matched.

    Raises:
        RuntimeError: If any window's response status is not 200.
    """
    collected = []

    for window_start, window_end in daterange(FROM_DATE, TO_DATE, step_days=90):
        first_day = window_start.date()
        last_day = window_end.date()
        query = {
            "symbol": ticker,
            "from":   first_day.isoformat(),
            "to":     last_day.isoformat(),
            "token":  API_KEY,
        }

        started = time.time()
        resp = SESSION.get(BASE_URL, params=query, timeout=(3, 30))
        if resp.status_code != 200:
            raise RuntimeError(f"{ticker} HTTP {resp.status_code}")

        payload = resp.json() or []
        if payload:
            news = pd.DataFrame(payload)
            # `datetime` holds epoch seconds; keep only the calendar day.
            news["date"] = pd.to_datetime(news["datetime"], unit="s").dt.date
            in_window = (news["date"] >= first_day) & (news["date"] <= last_day)
            news = news[in_window]

            if not news.empty:
                headline_hit = news["headline"].str.contains(kw_re, na=False)
                summary_hit = news["summary"].str.contains(kw_re, na=False)
                news = news[headline_hit | summary_hit]

                if not news.empty:
                    selected = news.loc[:, ["date", "headline", "source", "url"]]
                    selected = selected.rename(columns={"headline": "title"})
                    collected.append(selected.assign(ticker=ticker))

        # Throttle: sleep for whatever is left of the per-request time budget.
        elapsed = time.time() - started
        time.sleep(max(0, PAUSE_SEC - elapsed))

    if not collected:
        return pd.DataFrame()

    merged = pd.concat(collected, ignore_index=True)
    return merged.drop_duplicates(subset=["ticker", "date", "title"])


def scrape_all(tickers: list[str]) -> pd.DataFrame:
    """Fetch filtered company news for every ticker and combine the results.

    Tickers are processed sequentially. A failure on one ticker is reported
    on stdout and skipped so the remaining tickers are still scraped. After
    every CHUNK_SIZE tickers the loop sleeps CHUNK_PAUSE seconds.

    Args:
        tickers: Symbols to scrape.

    Returns:
        All events de-duplicated on (ticker, date, title), sorted by ticker
        and date, with a fresh index. When no events were collected (empty
        input, every ticker failed, or nothing matched) an empty frame with
        the columns ``date``, ``title``, ``source``, ``url``, ``ticker`` is
        returned instead of raising.
    """
    all_events = []
    for i, tk in enumerate(tickers, 1):
        try:
            events = fetch_company_news(tk)
        except Exception as e:
            # Deliberate best-effort: report the failure and move on.
            print(f"[{i}/{len(tickers)}] {tk}  ERRORE: {e}")
        else:
            # Skip empty results: they carry no columns, and a list made up
            # only of such frames would break drop_duplicates(subset=...).
            if not events.empty:
                all_events.append(events)

        if i % CHUNK_SIZE == 0:
            time.sleep(CHUNK_PAUSE)

    if not all_events:
        # pd.concat([]) raises ValueError; return a typed-empty result so
        # callers can still write or merge it.
        return pd.DataFrame(columns=["date", "title", "source", "url", "ticker"])

    df_all = (
        pd.concat(all_events, ignore_index=True)
          .drop_duplicates(subset=["ticker", "date", "title"])
          .sort_values(["ticker", "date"])
          .reset_index(drop=True)
    )
    return df_all


if __name__ == "__main__":
    # Run the Finnhub scrape for every distinct ticker of the ML dataset,
    # report how many events were collected, and persist them as parquet.
    # The file is written relative to the current working directory.
    tickers_list = ml["ticker"].unique().tolist()    
    df_all = scrape_all(tickers_list)
    print("Totale eventi raccolti:", len(df_all))
    df_all.to_parquet("biotech_finnhub_events.parquet", index=False)


#  3. Finnhub Company News

To enrich the dataset with **headlines** filtered by biotech keywords:
- split the date range into 90‑day blocks  
- hit the **company-news** endpoint  
- apply a regex filter for approval, clinical phase, partnership keywords  
- implement a back‑off if we hit a `429` rate limit  
- save the combined, de‑duplicated results for all tickers to a single parquet file on disk  



In [6]:
# Load the two news sources produced by the scraping steps.
df_fin = pd.read_parquet("biotech_finnhub_events.parquet")
df_news = pd.read_parquet("biotech_google_news_2024_20250627.parquet")

# Finnhub side: keep date / title / ticker. Columns are renamed by NAME rather
# than by position, so a different stored column order cannot silently
# mislabel the data.
df_fin = (
    df_fin.drop(columns=['source', 'url'])
          .rename(columns={'title': 'finnhub_title'})
)
df_fin['ticker'] = df_fin['ticker'].str.upper()
df_fin['date'] = pd.to_datetime(df_fin['date']).dt.normalize()

# Google RSS side: same idea; `date` is the publication timestamp truncated
# to midnight so both sources join on the calendar day.
df_news = df_news.rename(columns={'title': 'rss_titles', 'summary': 'rss_summaries'})
df_news['ticker'] = df_news['ticker'].str.upper()
df_news['date'] = pd.to_datetime(df_news['publishedAt']).dt.normalize()


# Outer join keeps days that appear in only one of the two sources.
df_all = pd.merge(
    df_fin,
    df_news,
    on=['ticker', 'date'],
    how='outer',
    sort=True
)

def _normalize(txt: str) -> str:
    txt = str(txt).lower()
    txt = re.sub(r'[^a-z0-9]+', ' ', txt)        
    return re.sub(r'\s+', ' ', txt).strip()

# Canonical title used only for de-duplication: the Finnhub headline when
# present, otherwise the RSS headline, passed through _normalize.
# NOTE(review): the merge pairs every Finnhub row with every RSS row of the
# same (ticker, date), and the Finnhub headline wins here, so rows that share
# a Finnhub headline collapse to the first one and the other RSS headlines of
# that day are dropped -- confirm this is intended.
df_all['canonical_title'] = (
      df_all['finnhub_title'].str.strip().replace('', pd.NA)
        .fillna(df_all['rss_titles'].str.strip())
        .fillna('')
        .apply(_normalize)
)


dup_mask = df_all.duplicated(
    subset=['ticker', 'date', 'canonical_title'],
    keep='first'
)

n_total = len(df_all)
n_dups  = dup_mask.sum()
# Guard the ratio: an empty merge result would otherwise divide by zero.
dup_share = n_dups / n_total if n_total else 0.0
print(f"Duplicated {n_dups}/{n_total}  ({dup_share:.1%})")


df_all = df_all.loc[~dup_mask].reset_index(drop=True)


# Rows coming from a single source have NaN on the other side; use empty
# strings so the concatenation below never produces NaN.
df_all['finnhub_title'] = df_all['finnhub_title'].fillna('')
df_all['rss_titles']    = df_all['rss_titles']   .fillna('')
df_all['rss_summaries'] = df_all['rss_summaries'].fillna('')

# Strip so single-source rows do not start or end with the joining space.
df_all['all_titles'] = (
    df_all['finnhub_title'] + ' ' + df_all['rss_titles']
).str.strip()


# The helper column is not part of the exported dataset.
df_all = df_all.drop(columns=['canonical_title'])


df_all.to_csv('news_merged_dedup.csv', index=False)
df_all.tail()

⚠️  Duplicated 1709/8501  (20.1%)


Unnamed: 0,date,finnhub_title,ticker,rss_titles,rss_summaries,link,publishedAt,all_titles
6787,2025-07-17,,ZYME,"Zymeworks Sets Q2 Earnings Date, Announces Maj...","<a href=""https://news.google.com/rss/articles/...",https://news.google.com/rss/articles/CBMivgFBV...,2025-07-17 11:00:00,"Zymeworks Sets Q2 Earnings Date, Announces Ma..."
6788,2025-07-27,,ZYME,Zymeworks (NASDAQ:ZYME) Stock Rating Upgraded ...,"<a href=""https://news.google.com/rss/articles/...",https://news.google.com/rss/articles/CBMiqgFBV...,2025-07-27 07:28:34,Zymeworks (NASDAQ:ZYME) Stock Rating Upgraded...
6789,2025-07-28,Zymeworks Announces FDA Clearance of Investiga...,ZYME,FDA Green Lights Revolutionary First-in-Class ...,"<a href=""https://news.google.com/rss/articles/...",https://news.google.com/rss/articles/CBMitwFBV...,2025-07-28 11:00:00,Zymeworks Announces FDA Clearance of Investiga...
6790,2025-07-29,,ZYME,Zymeworks (ZYME) Is Down 6.0% After FDA Clears...,"<a href=""https://news.google.com/rss/articles/...",https://news.google.com/rss/articles/CBMi2wFBV...,2025-07-29 12:04:57,Zymeworks (ZYME) Is Down 6.0% After FDA Clear...
6791,2025-07-29,,ZYME,Zymeworks' IND for liver cancer antibody-drug ...,"<a href=""https://news.google.com/rss/articles/...",https://news.google.com/rss/articles/CBMiwwFBV...,2025-07-29 18:25:41,Zymeworks' IND for liver cancer antibody-drug...


#  4. Merge RSS + Finnhub

We outer‑join the two datasets on `(ticker, date)`, drop rows that duplicate an existing `(ticker, date, title)`, and create an `all_titles = finnhub_title + " " + rss_titles` field for downstream sentiment analysis.