    PIPELINE NOTEBOOK

In [None]:
# Jupyter Notebook (Python 3, with your venv activated)
# Sanity Check: Legal Text Processing Pipeline (Tiny Sample)

import os, sys
from pathlib import Path
import pandas as pd

# Put the (cwd-relative) src/ directory on this kernel's module search path.
sys.path.append(Path("src").as_posix())

# --- Parameters ---
# Every location is relative to the directory the notebook is launched from.
# The names below are interpolated into the shell commands of later cells.
BATCHES_DIR = Path("data", "batches")
EMB_DIR = Path("data", "processed", "embeddings_sanity")
LDA_OUT = Path("models", "lda_sanity")
BERTOPIC_OUT = Path("models", "bertopic_sanity")
TIMECLS_DIR = Path("data", "processed", "timecls_sanity")
DOMAINCLS_DIR = Path("data", "processed", "domaincls_sanity")

# Document cap passed as --max-docs to keep the sanity run quick.
MAX_DOCS = 200


    1. Generate embeddings (tiny subset)

In [None]:
!python -m src.ltm.embeddings \
  --batches-dir $BATCHES_DIR \
  --out-dir $EMB_DIR \
  --model sentence-transformers/all-MiniLM-L6-v2 \
  --window 128 --stride 100 --batch-size 8 --max-length 128 \
  --max-docs $MAX_DOCS --device auto


    2. Topic Modeling: LDA

In [None]:
!python -m src.ltm.topic_lda \
  --batches-dir $BATCHES_DIR \
  --out-dir $LDA_OUT \
  --n-topics 10 --vocab-size 5000 --min-freq 2 \
  --n-top-words 8 --max-docs $MAX_DOCS


Gloss (top words + timelines):

In [None]:
!python -m src.ltm.lda_gloss \
  --topics-csv $LDA_OUT/lda_topics_k10.csv \
  --timelines-csv $LDA_OUT/lda_timelines_k10.csv \
  --out-csv $LDA_OUT/lda_gloss_timelines_k10.csv \
  --top-n 8
pd.read_csv(f"{LDA_OUT}/lda_gloss_timelines_k10.csv").head()


    3. Topic Modeling: BERTopic

In [None]:
!python -m src.ltm.topic_bertopic \
  --emb-dir $EMB_DIR \
  --batches-dir $BATCHES_DIR \
  --out-dir $BERTOPIC_OUT \
  --min-cluster-size 10 --umap-n-neighbors 5 --umap-n-components 3 \
  --max-docs $MAX_DOCS


Gloss:

In [None]:
!python -m src.ltm.bertopic_gloss \
  --topics-csv $BERTOPIC_OUT/bertopic_topics.csv \
  --timelines-csv $BERTOPIC_OUT/bertopic_timelines.csv \
  --out-csv $BERTOPIC_OUT/bertopic_gloss_timelines.csv \
  --top-n 8
pd.read_csv(f"{BERTOPIC_OUT}/bertopic_gloss_timelines.csv").head()


    4. Prepare classification datasets (time & domain)
Time (decade bins):

In [None]:
!python -m src.ltm.prepare_timecls \
  --batches-dir $BATCHES_DIR \
  --out-dir $TIMECLS_DIR \
  --max-docs $MAX_DOCS --min-docs-per-class 5 --val-ratio 0.3
pd.read_csv(f"{TIMECLS_DIR}/train.csv").head()


Domain (requires domain field in batches OR external labels CSV):

In [None]:
# Example if domains live inside batches
!python -m src.ltm.prepare_domaincls \
  --batches-dir $BATCHES_DIR \
  --out-dir $DOMAINCLS_DIR \
  --max-docs $MAX_DOCS --min-docs-per-class 5 --val-ratio 0.3
pd.read_csv(f"{DOMAINCLS_DIR}/train.csv").head()


    5. Run baselines (TF-IDF → LR/MLP)
Time:

In [None]:
!python -m src.ltm.classify_baselines \
  --train-csv $TIMECLS_DIR/train.csv \
  --valid-csv $TIMECLS_DIR/valid.csv \
  --out-dir models/timecls_sanity


Domain:

In [None]:
!python -m src.ltm.classify_baselines \
  --train-csv $DOMAINCLS_DIR/train.csv \
  --valid-csv $DOMAINCLS_DIR/valid.csv \
  --out-dir models/domaincls_sanity
