# NOTEBOOK 01: Will the Bill Make It Through Capitol Hill?
Section 1–4: Data Loading, Cleaning, Feature Engineering, EDA

# SECTION 1: DATA LOADING

## 1. Imports & Global Settings

In [1]:
import os
import re
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import TimeSeriesSplit

pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 100)

## 2. Load Raw Dataset

In [2]:
PATH = r"C:/Users/saram/Desktop/Erdos_Institute/project-2025/"
FILE = "bill_id_law_text.csv"

bills = pd.read_csv(PATH + FILE, dtype=str)

print(bills.shape)
bills.head()

(13812, 4)


Unnamed: 0,id,date,law,full_text
0,118.HR.9124,2024-07-24 00:00:00,True,<html><body><pre>\n[118th Congress Public Law ...
1,118.HR.8667,2024-06-07 00:00:00,True,<html><body><pre>\n[118th Congress Public Law ...
2,119.HJRES.9,2025-01-03 00:00:00,False,<html><body><pre>\n[Congressional Bills 119th ...
3,119.HJRES.8,2025-01-03 00:00:00,False,<html><body><pre>\n[Congressional Bills 119th ...
4,119.HJRES.2,2025-01-03 00:00:00,False,<html><body><pre>\n[Congressional Bills 119th ...


In [49]:
bills.iloc[0]['full_text'][:10000]  # Display the first 10000 characters of the law text

"<html><body><pre>\n[118th Congress Public Law 259]\n[From the U.S. Government Publishing Office]\n\n\n\n[[Page 138 STAT. 2973]]\n\nPublic Law 118-259\n118th Congress\n\n                                 An Act\n\n\n \n To name the Department of Veterans Affairs community-based outpatient \n       clinic in Auburn, California, as the ``Louis A. Conter VA \n            Clinic''. &lt;&lt;NOTE: Jan. 4, 2025 -  [H.R. 9124]&gt;&gt; \n\n    Be it enacted by the Senate and House of Representatives of the \nUnited States of America in Congress assembled,\nSECTION 1. FINDINGS.\n\n    Congress finds the following:\n            (1) Louis ``Lou'' Anthony Conter was born on September 13, \n        1921, in Ojibwa, Wisconsin.\n            (2) Lt. Commander Lou Conter, the last remaining survivor of \n        the attack on the USS Arizona at Pearl Harbor, was an American \n        hero.\n            (3) On that fearful day, Petty Officer Conter helped \n        evacuate shipmates who were blinded, wou

In [50]:
bills["law"].value_counts()  # Check the distribution of the target variable

law
False    13767
True        45
Name: count, dtype: int64

In [51]:
bills.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13812 entries, 0 to 13811
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         13812 non-null  object
 1   date       13812 non-null  object
 2   law        13812 non-null  object
 3   full_text  13812 non-null  object
dtypes: object(4)
memory usage: 431.8+ KB


## 3. Parse Bill IDs into Structure

In [None]:
# Parse bill ID into components
def parse_bill_id(bill_id):
    m = re.match(r"(\d+)\.([A-Z]+)\.(\d+)", bill_id)

    if not m:
        return pd.Series([None, None, None, None])

    congress = int(m.group(1))
    bill_type = m.group(2)
    bill_number = int(m.group(3))

    if bill_type.startswith("H"):
        chamber = "House"
    elif bill_type.startswith("S"):
        chamber = "Senate"
    else:
        chamber = "Other"

    return pd.Series([congress, bill_type, bill_number, chamber])


cols = ["congress","bill_type","bill_number","chamber"]

bills[cols] = bills["id"].apply(parse_bill_id)


In [5]:
bills.head()

Unnamed: 0,id,date,law,full_text,congress,bill_type,bill_number,chamber
0,118.HR.9124,2024-07-24 00:00:00,True,<html><body><pre>\n[118th Congress Public Law ...,118,HR,9124,House
1,118.HR.8667,2024-06-07 00:00:00,True,<html><body><pre>\n[118th Congress Public Law ...,118,HR,8667,House
2,119.HJRES.9,2025-01-03 00:00:00,False,<html><body><pre>\n[Congressional Bills 119th ...,119,HJRES,9,House
3,119.HJRES.8,2025-01-03 00:00:00,False,<html><body><pre>\n[Congressional Bills 119th ...,119,HJRES,8,House
4,119.HJRES.2,2025-01-03 00:00:00,False,<html><body><pre>\n[Congressional Bills 119th ...,119,HJRES,2,House


## 4. Construct Proper Label

In [None]:
# Create binary label column
bills["label"] = (
    bills["law"]
    .astype(str)
    .map({"True":1,"False":0})
    .astype(int)
)

In [7]:
# Count of bills labeled as law vs not law
bills["label"].value_counts()

label
0    13767
1       45
Name: count, dtype: int64

In [8]:
# Proportion of bills labeled as law vs not law
bills["label"].value_counts(normalize=True)

label
0    0.996742
1    0.003258
Name: proportion, dtype: float64

**Class balance:**

Positives: 45 / 13,812 = 0.33%

Any model that predicts “all fail” gets ~99.7% accuracy.

Precision–Recall (PR-AUC) matters far more than ROC-AUC here.

## 5. Time Features

In [None]:
bills["date"] = pd.to_datetime(bills["date"])

bills["year"] = bills["date"].dt.year
bills["congress_year"] = bills["congress"] * 2 + 1787

# SECTION 2: CLEAN TEXT WITHOUT LEAKAGE

## 6. Leakage-Safe Text Cleaning

### 6.1 Strip HTML

In [None]:
def strip_html(text):
    if not isinstance(text, str):
        return ""
    return BeautifulSoup(text, "html.parser").get_text(" ")

### 6.2 Remove outcome leakage phrases

In [None]:
LEAKAGE_PATTERNS = [
    r"public law\s+\d+-\d+",
    r"approved\s+[a-z]+\s+\d{1,2},\s+\d{4}",
    r"considered and passed.*",
    r"became (a )?law",
    r"was enacted",
    r"enrolled bill",
    r"stat\.?\s*\d+"
]

def remove_leakage(text):
    for p in LEAKAGE_PATTERNS:
        text = re.sub(p, " ", text, flags=re.IGNORECASE)
    return text

### 6.3 Cut all post-vote sections

In [13]:
CUT_MARKERS = [
    "legislative history",
    "passed the house",
    "passed the senate",
    "approved",
    "became law",
    "stat."
]

def cut_postvote(text):
    t = text.lower()
    for m in CUT_MARKERS:
        i = t.find(m)
        if i > 0:
            return text[:i]
    return text


### 6.4 Remove bracket junk, digits-tokens, normalize whitespace

In [14]:
def normalize_text(text):
    
    # remove markup debris
    text = re.sub(r"\[\[.*?\]\]", " ", text)
    text = re.sub(r"<<.*?>>", " ", text)

    # remove tokens containing digits (law codes, dates, votes)
    text = re.sub(r"\b\S*\d+\S*\b", " ", text)
    
    # allow only words + standard punctuation
    text = re.sub(r"[^A-Za-z\s\.\,\;\:\-]", " ", text)

    text = re.sub(r"\s+", " ", text).strip()

    return text


### 6.5 Full cleaning pipeline

In [15]:
bills["clean_text"] = (
    bills["full_text"]
    .astype(str)
    .apply(strip_html)
    .apply(cut_postvote)
    .apply(remove_leakage)
    .apply(normalize_text)
)


In [16]:
# Check 
bills["clean_text"].str[:500].sample(5)

6497     Congressional Bills Congress From the U.S. Gov...
768      Congressional Bills Congress From the U.S. Gov...
12410    Congressional Bills Congress From the U.S. Gov...
3838     Congressional Bills Congress From the U.S. Gov...
11902    Congressional Bills Congress From the U.S. Gov...
Name: clean_text, dtype: object

In [17]:
bills.to_csv(PATH + "bill_id_law_text_cleaned.csv", index=False)

## 6.6 Create a sample dataset for GitHub

In [32]:
# original counts
counts = bills["law"].value_counts()

total = counts.sum()
law_ratio = counts["True"] / total   # ~0.0033

n_total = 1000

# target sizes preserving true imbalance
n_law = max(1, int(law_ratio * n_total))
n_nolaw = n_total - n_law

# split classes
law = bills[bills["law"] == "True"]
nolaw = bills[bills["law"] == "False"]

# sample
law_s = law.sample(n=n_law, random_state=42, replace=False)
nolaw_s = nolaw.sample(n=n_nolaw, random_state=42, replace=False)

# merge + shuffle
sample = (
    pd.concat([law_s, nolaw_s])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print(sample.shape)

(1000, 15)


In [33]:
print(sample["law"].value_counts())

law
False    997
True       3
Name: count, dtype: int64


In [34]:
print(sample["law"].value_counts(normalize=True))

law
False    0.997
True     0.003
Name: proportion, dtype: float64


In [36]:
sample.to_csv(PATH + r"bill_id_law_text_cleaned_sample_1000.csv", index=False)

# SECTION 3: FEATURE ENGINEERING

## 7. Document Structural Features

In [18]:
bills["len_chars"]     = bills["clean_text"].str.len()
bills["len_words"]    = bills["clean_text"].str.split().apply(len)
bills["section_count"] = bills["clean_text"].str.count(r"\b(SEC\.|Section)\b")

In [19]:
NUM_FEATURES = ["congress","len_words","section_count"]
CAT_FEATURES = ["bill_type","chamber"]

# SECTION 4: EDA & SPLITTING

## 8. Basic EDA

In [21]:
# Pass rate by congress
bills.groupby("congress")["label"].mean()

congress
113    0.028037
114    0.010687
115    0.016923
116    0.000418
117    0.003599
118    0.001244
119    0.000000
Name: label, dtype: float64

In [22]:
# Pass rate by bill type
bills.groupby("bill_type")["label"].mean()


bill_type
HJRES    0.006192
HR       0.001428
S        0.008101
SJRES    0.000000
Name: label, dtype: float64

In [23]:
# Text length distribution
bills["len_words"].describe()


count     13812.000000
mean        846.301984
std        1663.203089
min          22.000000
25%         276.000000
50%         487.000000
75%         931.250000
max      133768.000000
Name: len_words, dtype: float64

## 9. Time-Based Split Construction

NOTE: The temporal distribution of passing bills is extremely skewed toward early congresses. Later congresses (116+) barely pass anything per our dataset.

In [30]:
# Find a way to split data into train/val/test based on congress sessions
train_mask = bills["congress"].isin([113,114])
val_mask   = bills["congress"].isin([115,116])
test_mask  = bills["congress"] >= 117


In [None]:
# Check
def print_split(mask, name):
    s = bills[mask]["label"]
    print(name)
    print("Rows:", len(s))
    print("Positives:", s.sum())

print_split(train_mask,"TRAIN")
print_split(val_mask,  "VAL")
print_split(test_mask, "TEST")

TRAIN
Rows: 1083
Positives: 19
VAL
Rows: 5433
Positives: 13
TEST
Rows: 7296
Positives: 13


* TRAIN (19): Minimum viable training signal
* VAL (13): Too small for fine threshold tuning
* TEST (13): Enough for evaluation

## 10. Save Cleaned Dataset

In [None]:
KEEP_COLS = [
    "id","date","congress","bill_type","bill_number","chamber",
    "label","clean_text",
    "len_words","section_count"
]

bills[KEEP_COLS].to_csv(PATH + "bills_clean_phase1.csv", index=False)