# FOIA Accelerate — Demo Notebook
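This notebook demonstrates the four prototype modules — request routing, PII redaction suggestions, near-duplicate clustering, and extractive summarization — on synthetic data.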
> Note: This is **not** a production system and should only be used on non-sensitive data.


In [1]:
# --- Imports: standard library, then third-party; project-local follow below ---
import os
import sys
from pathlib import Path

import pandas as pd

# The notebook is expected to be launched from notebooks/, so the repo root is
# one level above the working directory. Compute it once and reuse it for both
# the import path and the data path.
REPO_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Guard the append so that re-running this cell does not keep growing sys.path
# with duplicate entries.
if REPO_ROOT not in sys.path:
    sys.path.append(REPO_ROOT)

# Project-local modules; these resolve only after REPO_ROOT is on sys.path.
from src.routing_classifier import RoutingClassifier
from src.pii_redaction import detect_pii, redact_text
from src.deduplication import cluster_near_duplicates
from src.summarizer import summarize
from src.utils import set_seed

# Single place to change the seed handed to the project's seeding helper.
SEED = 42
set_seed(SEED)

# Load the sample requests shipped with the repo and preview the first rows.
df = pd.read_csv(Path(REPO_ROOT, 'data', 'sample_requests.csv'))
df.head()

os.getcwd()  C:\learning\hoover\policy_proposal\code\github_repo\notebooks


Unnamed: 0,request_id,text,label
0,REQ-001,All communications about immigration policy be...,DHS
1,REQ-002,Budget documents for fiscal year 2022 regardin...,OMB
2,REQ-003,Emails mentioning cybersecurity threats in 2023.,CISA
3,REQ-004,Records related to water quality monitoring ne...,EPA
4,REQ-005,Copies of contracts awarded for airport screen...,TSA


## 1) Train routing classifier (baseline)

In [2]:
# Train only on rows whose 'label' is present; rows with a missing label are
# left out of the fit.
labeled = df.dropna(subset=['label'])

rc = RoutingClassifier()
rc.fit(labeled['text'], labeled['label'])

# Probe the fitted classifier with one short ad-hoc request.
rc.predict(['Budget docs for FY22'])

['OMB']

## 2) Redaction suggestions

In [3]:
# Synthetic sentence containing an e-mail address, a date and an SSN-shaped number.
sample = 'Contact me at jane.doe@example.com on 03/14/2024. SSN 123-45-6789.'

# Detect candidate PII first, then mask the detected findings in the same text.
findings = detect_pii(sample)
redacted = redact_text(sample, findings)

# Display the input, the findings and the redacted text together.
(sample, findings, redacted)

('Contact me at jane.doe@example.com on 03/14/2024. SSN 123-45-6789.',
 [{'start': 14, 'end': 34, 'label': 'EMAIL', 'value': 'jane.doe@example.com'},
  {'start': 54, 'end': 65, 'label': 'SSN', 'value': '123-45-6789'},
  {'start': 38, 'end': 48, 'label': 'DATE', 'value': '03/14/2024'}],
 'Contact me at ████████████████████ on ██████████. SSN ███████████.')

## 3) Deduplication clustering

In [4]:
# Hand the request texts to the clustering helper as a plain Python list.
texts = df['text'].to_list()

# threshold=0.35 is this demo's setting; its exact meaning is defined by
# cluster_near_duplicates in src.deduplication.
clusters = cluster_near_duplicates(
    texts,
    threshold=0.35,
)
clusters

[[0], [1], [2], [3], [4], [5], [6], [7]]

## 4) Extractive summary

In [5]:
# Four-sentence synthetic paragraph; the pieces are concatenated without a
# separator because each one already carries its own trailing space.
sentences = (
    'The agency received a significant number of requests this year. ',
    'Processing times increased due to staffing shortages. ',
    'However, new tooling improved triage. ',
    'Future investments could reduce the backlog.',
)
long_text = ''.join(sentences)

# NOTE(review): the recorded run of this cell ended with
# "AttributeError: module 'networkx' has no attribute 'pagerank_numpy'".
# Presumably src.summarizer calls networkx.pagerank_numpy, which newer networkx
# releases no longer provide - confirm, then switch the summarizer to
# networkx.pagerank or pin networkx in the project requirements.
summarize(long_text, max_sentences=2)

AttributeError: module 'networkx' has no attribute 'pagerank_numpy'