In [1]:
import pandas as pd, re
from pathlib import Path
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import pyarrow as pa
import pyarrow.parquet as pq

In [None]:
## clean the raw abstracts
# Source CSV and destination Parquet file; both are relative paths, so they
# resolve against the kernel's current working directory.
inp = Path("../data/raw/physics_condmat_2005_2025.csv")
out = Path("../data/processed/physics_clean.parquet")

# Load the whole raw export into memory. Later statements in this cell read
# its `published` and `abstract` columns.
df = pd.read_csv(filepath_or_buffer=inp)

# Compiled once at definition time rather than looked up on every call.
_URL_RE = re.compile(r"http\S+|www\.\S+")
_WORD_RE = re.compile(r"[a-z]+")


def clean(t: str) -> str:
    """Normalise one abstract into a space-separated string of lowercase tokens.

    Steps, in order:
      1. lowercase the text;
      2. replace anything starting with ``http`` or ``www.`` (up to the next
         whitespace) with a space;
      3. extract maximal runs of ASCII letters ``a-z`` -- digits, punctuation
         and non-ASCII letters therefore act as token separators;
      4. drop tokens of one or two characters and tokens found in
         ``ENGLISH_STOP_WORDS``.

    Parameters
    ----------
    t : str
        Raw abstract text. A missing scalar (``None``, ``NaN``, ``pd.NA``,
        ``NaT``) yields ``""``; any other non-string value is coerced with
        ``str()`` first.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; ``""`` when none survive.
    """
    if not isinstance(t, str):
        # Without this guard str(NaN) == "nan" would pass every filter below
        # and a missing abstract would become the one-word document "nan".
        if pd.api.types.is_scalar(t) and pd.isna(t):
            return ""
        t = str(t)

    t = _URL_RE.sub(" ", t.lower())  ## remove links
    toks = [
        w
        for w in _WORD_RE.findall(t)
        if len(w) > 2 and w not in ENGLISH_STOP_WORDS  ## remove stop words
    ]
    return " ".join(toks)

# Build the cleaned frame in one chain. Every step returns a new DataFrame, so
# no column is ever assigned into a filtered slice (the pattern that triggers
# pandas' SettingWithCopyWarning).
df = (
    df
    # Publication year; unparseable dates become NaT, whose year is NaN.
    .assign(year=pd.to_datetime(df["published"], errors="coerce").dt.year)
    # Keep 2005-2025 inclusive. NaN years compare False and are dropped here.
    .loc[lambda d: d["year"].between(2005, 2025)]
    # Drop rows without an abstract *before* cleaning, so no work is spent on
    # rows that would be discarded anyway.
    .dropna(subset=["abstract"])
    .assign(abstract_clean=lambda d: d["abstract"].map(clean))
)
# NOTE(review): `clean` always returns a str, so `abstract_clean` is never
# null and a dropna on it would be a no-op. Rows whose cleaned text is the
# empty string are kept -- confirm whether they should be filtered out.

out.parent.mkdir(parents=True, exist_ok=True)

# Write the cleaned frame to Parquet without the pandas index.
table = pa.Table.from_pandas(df, preserve_index=False, safe=False)
pq.write_table(table, out)
print("Saved:", out, "rows:", len(df))

Saved: ../data/processed/physics_clean.parquet rows: 480862
