# Preprocessing

## Setup & Imports

In [5]:
!pip install -U datasets
!pip install spacy
!pip install huggingface_hub[hf_xet]

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency re

Collecting hf-xet<2.0.0,>=1.1.1 (from huggingface_hub[hf_xet])
  Downloading hf_xet-1.1.2-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (879 bytes)
Downloading hf_xet-1.1.2-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m41.8 MB/s[0m eta [36m0:00:00[0m
[?25hTraceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/cli/base_command.py", line 179, in exc_logging_wrapper
^C


In [6]:
import pandas as pd
from dateutil import parser
import spacy
from typing import List, Tuple , Any
import re
import math
from tqdm.auto import tqdm
from google.colab import files
import io
from nltk.tokenize import word_tokenize
import nltk
import gc

In [2]:
nlp = spacy.load("en_core_web_sm", disable=["parser"])
nlp.add_pipe("sentencizer")
nltk.download('punkt_tab')
nlp.max_length = 12_000_000

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [4]:
from datasets import load_dataset

multi_lexsum = load_dataset("allenai/multi_lexsum", name="v20230518")


In [7]:
small_train = multi_lexsum["train"].train_test_split(test_size=0.10, seed=42)["test"]
df_train = pd.DataFrame(small_train)

In [None]:
print(len(df_train))
df_train.head()

318


Unnamed: 0,id,sources,sources_metadata,summary/long,summary/short,summary/tiny,case_metadata
0,EE-AR-0002,[FILED\nU.S. DISTRRIClTeiCAORUKRATNSAS\nIN THE...,"{'doc_id': ['EE-AR-0002-0001', 'EE-AR-0002-000...","In September 2000, the Memphis District Office...",,,{'case_name': 'EEOC v. AFFILIATED FOODS SOUTHW...
1,JI-TX-0002,[IN THE UNITED STATES DISTRICT COURT FOR THE S...,"{'doc_id': ['JI-TX-0002-0002', 'JI-TX-0002-000...","In September 2006, the Civil Rights Division o...",,,"{'case_name': 'U.S.A. v. State of Texas', 'cas..."
2,IM-CA-0160,"[ACCO,TRO,194,STAYED\nUNITED STATES DISTRICT C...","{'doc_id': ['IM-CA-0160-9000', 'IM-CA-0160-000...",COVID-19 Summary: Two immigrants detained pend...,Two immigrants detained pending removal procee...,Two immigrants detained pending removal procee...,"{'case_name': 'Castillo v. Barr', 'case_type':..."
3,FH-VA-0005,[Case 1:13-cv-01214-AJT-JFA Document 7 Filed 1...,"{'doc_id': ['FH-VA-0005-0001', 'FH-VA-0005-000...","On September 30, 2013, the United States Depa...",Following an investigation completed by the Of...,,{'case_name': 'United States v. Chevy Chase Ba...
4,PR-ME-0001,[1/18/2021\nQuery Reports\nCreate an Alert for...,"{'doc_id': ['PR-ME-0001-9000', 'PR-ME-0001-000...",COVID-19 Summary: This is a suit brought by se...,"On May 25, 2020, several rural campgrounds and...",Campgrounds and individuals wishing to travel ...,"{'case_name': 'Bayley's Campground, Inc v. Mil..."


In [8]:
def mean_words_in_sources(df):

    total_words = 0
    num_documents = len(df)

    for sources in df["sources"]:
        for paragraph in sources:

            total_words += len(paragraph.split())

    if num_documents > 0:
        return total_words / num_documents
    else:
        return 0

mean_words = mean_words_in_sources(df_train)
print(f"The mean number of words in the 'sources' column is: {mean_words}")

The mean number of words in the 'sources' column is: 57088.86163522013


## Pre-processing

In [None]:
# Normalization of Characters
def normalize_chars_detail(text: str):
    original = text
    removed = []

    count = text.count("“") + text.count("”")
    if count:
        removed.append(('“/”', '"', count))
    text = text.replace("“", '"').replace("”", '"')

    count = text.count("‘") + text.count("’")
    if count:
        removed.append(("‘/’", "'", count))
    text = text.replace("‘", "'").replace("’", "'")

    count = text.count("—") + text.count("–")
    if count:
        removed.append(("—/–", "-", count))
    text = text.replace("—", "-").replace("–", "-")

    multi_punct = re.findall(r"([.!?;])\1+", text)
    if multi_punct:
        punct_counts = {}
        for char in multi_punct:
            punct_counts[char] = punct_counts.get(char, 0) + 1
        for char, count in punct_counts.items():
            removed.append((char * 2 + "+", char, count))
        text = re.sub(r"([.!?;])\1+", r"\1", text)

    return text, removed


In [None]:
df_train["sources_joined"] = df_train["sources"].apply(lambda x: " ".join(x) if isinstance(x, list) else x)

normalized_data = df_train["sources_joined"].apply(normalize_chars_detail)

df_train["normalized_sources"] = normalized_data.apply(lambda x: x[0])
df_train["normalization_changes"] = normalized_data.apply(lambda x: x[1])

df_train["normalization_changes"].iloc[1]

[('“/”', '"', 746), ('‘/’', "'", 362), ('—/–', '-', 46), ('..+', '.', 3)]

In [None]:
# 1) Light cleaning  removing ascii , pagination and citations/refrences
def light_clean_detail(text: str):
    removed = []
    patterns = [
    (r'[\f\x0c]', "form-feed"),
    (r'Page\s+\d+\s+of\s+\d+', "page-count"),
    (r'\b\d+\s+of\s+\d+\b', "ordinal-range"),  # "3 of 3"
    (r'\[\d+\]', "square-bracket-citation"),  # [1], [12]
    (r'\(\s*\d+\s*\)', "numeric-paren-citation"),  # (3), ( 2 )
    (r'\(\s*[a-zA-Z]\s*\)', "alpha-paren-citation"),  # (f), ( A )
    ]
    for pat, label in patterns:
        matches = re.findall(pat, text, flags=re.IGNORECASE)
        removed.extend([(label, m) for m in matches])
        text = re.sub(pat, '', text, flags=re.IGNORECASE)

    return text, removed

In [None]:
light_clean_data = df_train["normalized_sources"].apply(light_clean_detail)

df_train["light_cleaned_sources"] = light_clean_data.apply(lambda x: x[0])
df_train["light_cleaned_changes"] = light_clean_data.apply(lambda x: x[1])

df_train["light_cleaned_changes"].iloc[0]

[('form-feed', '\x0c'),
 ('form-feed', '\x0c'),
 ('form-feed', '\x0c'),
 ('form-feed', '\x0c'),
 ('form-feed', '\x0c'),
 ('form-feed', '\x0c'),
 ('form-feed', '\x0c'),
 ('form-feed', '\x0c'),
 ('form-feed', '\x0c'),
 ('form-feed', '\x0c'),
 ('form-feed', '\x0c'),
 ('form-feed', '\x0c'),
 ('form-feed', '\x0c'),
 ('form-feed', '\x0c'),
 ('form-feed', '\x0c'),
 ('form-feed', '\x0c'),
 ('form-feed', '\x0c'),
 ('form-feed', '\x0c'),
 ('form-feed', '\x0c'),
 ('form-feed', '\x0c'),
 ('form-feed', '\x0c'),
 ('form-feed', '\x0c'),
 ('form-feed', '\x0c'),
 ('ordinal-range', '1 of 3'),
 ('ordinal-range', '2 of 3'),
 ('ordinal-range', '3 of 3'),
 ('numeric-paren-citation', '(1)'),
 ('numeric-paren-citation', '(3)'),
 ('numeric-paren-citation', '(1)'),
 ('numeric-paren-citation', '(3)'),
 ('numeric-paren-citation', '(1)'),
 ('numeric-paren-citation', '(3)'),
 ('numeric-paren-citation', '(1)'),
 ('numeric-paren-citation', '(3)'),
 ('numeric-paren-citation', '(901)'),
 ('numeric-paren-citation', '(3)

In [None]:
# Removing Headers and Footers
def remove_docket_lines(text: str) -> Tuple[str, List[str]]:
    lines = text.splitlines()
    kept_lines = []
    removed_lines = []

    # Patterns to remove
    docket_patterns = [
        r'^THIS DOCUMENT', r'^FILED\b', r'^U\.?S\.? (DISTRICT|COURT)', r'^IN THE UNITED STATES DISTRICT COURT',
        r'UNITED STATES DISTRICT COURT', r'CM/ECF', r'^Docket (Text|Report)', r'^CASE NO\.', r'^Re: \d+[:-]\d+[:-]cv[:-]\d+',
        r'Case \d{1,2}[:-]\d{2}[:-]\d{5}-[A-Z]{2,4}-[A-Z]{1,3} Document \d+ Filed \d{2}/\d{2}/\d{2} Page \d+ of \d+ PageID: \d+',
        r'# Docket Text',
        r'\d{2}/\d{2}/\d{4}',
        r'\(Entered: \d{2}/\d{2}/\d{4}\)',
        r'^\d+\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}\s+\d+:\d+\s+(AM|PM)',
        r'^\d+\s+(COMPLAINT|ANSWER|ORDER|MOTION|SCHEDULING|CONSENT|MEMORANDUM)',
        r'^\d+\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}',
        r'^True and correct copies', r'^Date Filed', r'^Assigned to:', r'^Demand:', r'^Cause:', r'^Jury Demand:',
        r'^Nature of Suit:', r'^Jurisdiction:', r'^Plaintiff$', r'^Defendant$', r'^Intervenor$', r'^represented by',
        r'ATTORNEY TO BE NOTICED', r'LEAD ATTORNEY', r'se ANOIOR', r'^CERTIFICATE OF SERVICE', r'^I hereby certify',
        r'^Date Terminated:', r'^EEOC v\.', r'^\(Entered:', r'^V\. Defendant', r'^\\NOTICE of Hearing:',
        r'^\s*\d+\s*$', r'^\s*Page \d+ of \d+\s*$'
    ]

    # Exceptions: keep lines that match these even if they also match a docket pattern
    keep_patterns = [
        r'CONSENT DECREE', r'resolves all issues and claims', r'shall be binding upon the parties',
    ]

    docket_re = re.compile('|'.join(docket_patterns), re.IGNORECASE)
    keep_re = re.compile('|'.join(keep_patterns), re.IGNORECASE)

    for line in lines:
        stripped = line.strip()
        if keep_re.search(stripped):
            kept_lines.append(line)
        elif docket_re.search(stripped):
            removed_lines.append(line)
        else:
            kept_lines.append(line)

    # Clean up excess blank lines
    result = []
    prev_blank = False
    for line in kept_lines:
        if not line.strip():
            if not prev_blank:
                result.append(line)
            prev_blank = True
        else:
            result.append(line)
            prev_blank = False

    while result and not result[-1].strip():
        result.pop()
    while result and not result[0].strip():
        result.pop(0)

    return '\n'.join(result), removed_lines


In [None]:
docket_data = df_train["light_cleaned_sources"].apply(remove_docket_lines)

df_train["docket_cleaned_sources"] = docket_data.apply(lambda x: x[0])
df_train["docket_cleaned_changes"] = docket_data.apply(lambda x: x[1])

print(df_train["docket_cleaned_changes"].iloc[1])

['IN THE UNITED STATES DISTRICT COURT FOR THE DISTRICT OF COLUMBIA', 'Case No. 1:14-cv-1319 Jury Trial Demanded', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', 'UNITED STATES DISTRICT COURT FOR THE DISTRICT OF COLUMBIA', '2', ' UNITED STATES DISTRICT COURT FOR THE DISTRICT OF COLUMBIA', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', ' Case: 1:14-cv-1319 As of: 04/01/2018 10:31 PM EDT ', 'U.S. District Court District of Columbia (Washington, DC) CIVIL DOCKET FOR CASE #: 1:14−cv−01319−TSC', 'Plaintiff', 'Date Filed: 08/04/2014 Date Terminated: 04/25/2017 Jury Demand: Plaintiff Nature of Suit: 440 Civil Rights: Other Jurisdiction: Federal Question', 'represented by Alec George Karakatsanis Civil Rights Corp

In [None]:
print(df_train["docket_cleaned_changes"].iloc[0])

['FILED', 'IN THE UNITED STATES DISTRICT COURT EASTERN OlSi', 'JURISDICTION AND VENUE 1. Jurisdiction of this Court is invoked pursuant to 28 U.S.C.§§ 451,1331,1337,1343 and 1345. This action is authorized and instituted pursuant to Section 706   and  of Title VII of the Civil Rights Act of 1964, as amended, 42 U.S.c. §2000e-5 and ("Title VII") and Section 102 of the Civil Rights Act of 1991,42 U.S.c. §1981a. 2. The employment practices alleged to be unlawful were committed within the jurisdiction of the United States District Court for the Eastern District of Arkansas, Pine Bluff Division.', '2', '3', '4', '5', '6', 'PLAINTIFF', 'CASE NO. 5:00-CV-00356', 'DEFENDANT', 'INTERVENOR', 'CERTIFICATE OF SERVICE', 'IN THE UNITED STATES DISTRICT COuRT FEB 26 2001', 'PLAINTIFF', 'DEFENDANT', 'INTERVENOR', 'I. JURISDICTION The United States District Court for the Eastern District of Arkansas, Pine Bluff Division, has jurisdiction over the parties and subject matter of this litigation.', 'THIS DO

In [None]:
# 3) Date normalization with detail
def normalize_dates_detail(text: str):
    removed = []
    pat1 = r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},\s*\d{4}\b'
    pat2 = r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b'
    matches1 = re.findall(pat1, text)
    matches2 = re.findall(pat2, text)
    removed.extend([("date", m) for m in matches1 + matches2])
    def replace_dt(match):
        try:
            return parser.parse(match.group()).strftime("%-d %B %Y")
        except:
            return match.group()
    text = re.sub(pat1, replace_dt, text)
    text = re.sub(pat2, replace_dt, text)
    return text, removed

In [None]:
date_normalized_data = df_train["docket_cleaned_sources"].apply( normalize_dates_detail)

df_train["date_normalized_sources"] = date_normalized_data.apply(lambda x: x[0])
df_train["date_normalized_changes"] = date_normalized_data.apply(lambda x: x[1])

df_train["date_normalized_changes"].iloc[5]

[('date', 'March 26, 2013'),
 ('date', 'March 23, 2013'),
 ('date', 'March 26, 2013'),
 ('date', 'August 4, 2014'),
 ('date', 'March 31, 2016'),
 ('date', 'March 26, 2013'),
 ('date', 'March 23, 2013'),
 ('date', 'March 26, 2013'),
 ('date', 'March 26, 2013'),
 ('date', 'March 31, 2016'),
 ('date', 'March 31, 2016'),
 ('date', '08/04/14'),
 ('date', '08/04/14'),
 ('date', '08/04/14'),
 ('date', '08/04/14'),
 ('date', '08/04/14'),
 ('date', '08/04/14'),
 ('date', '08/04/14'),
 ('date', '08/04/14'),
 ('date', '08/04/14'),
 ('date', '08/04/14'),
 ('date', '08/04/14'),
 ('date', '08/04/14'),
 ('date', '08/04/14'),
 ('date', '08/04/14'),
 ('date', '08/04/14'),
 ('date', '08/04/14'),
 ('date', '08/04/14'),
 ('date', '08/04/14'),
 ('date', '08/04/14'),
 ('date', '08/04/14'),
 ('date', '03/31/16'),
 ('date', '03/31/16'),
 ('date', '8/5/2014'),
 ('date', '8/26/2014'),
 ('date', '8/5/2014'),
 ('date', '4/14/16'),
 ('date', '4/22/16'),
 ('date', '4/21/16')]

In [None]:
# Case number , contact information removal
def cleanup_regex_detail(text: str) -> Tuple[str, List[Tuple[str, str]]]:

    removed = []

    for m in re.findall(r'[_~]{2,}', text):
        removed.append(("underscore_tilde", m))
    text = re.sub(r'[_~]{2,}', '', text)

    for m in re.findall(r'\b[^A-Za-z0-9\s]{3,}[A-Za-z0-9]*\b', text):
        removed.append(("heavy_word", m))
    text = re.sub(r'\b[^A-Za-z0-9\s]{3,}[A-Za-z0-9]*\b', '', text)

    for m in re.findall(r'Case[:\s]*\d+:\d+-[\w-]+', text, flags=re.IGNORECASE):
        removed.append(("case_number", m))
    text = re.sub(r'Case[:\s]*\d+:\d+-[\w-]+', '', text, flags=re.IGNORECASE)

    for m in re.findall(r'\b\S+@\S+\.\w+\b', text):
        removed.append(("email", m))
    text = re.sub(r'\b\S+@\S+\.\w+\b', '', text)

    for m in re.findall(r'https?[:\.]\S+', text):
        removed.append(("url", m))
    text = re.sub(r'https?[:\.]\S+', '', text)

    doc_filed_pattern = (
        r'\bDocument\s+[\d\-]+\s+Filed\s+\d{1,2}\s+'
        r'(January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}\b'
    )
    for match in re.finditer(doc_filed_pattern, text, flags=re.IGNORECASE):
        removed.append(("doc_filed_date", match.group(0)))
    text = re.sub(doc_filed_pattern, '', text, flags=re.IGNORECASE)

    text = re.sub(r'\n\s*\n+', '\n\n', text)
    text = re.sub(r'[ \t]{2,}', ' ', text)

    return text, removed


In [None]:
rg_data = df_train["date_normalized_sources"].apply(cleanup_regex_detail)

df_train["regex_cleaned_sources"] = rg_data.apply(lambda x: x[0])
df_train["regex_cleaned_changes"] = rg_data.apply(lambda x: x[1])

df_train["regex_cleaned_changes"].iloc[5]

[('underscore_tilde', '___________________________________'),
 ('underscore_tilde', '__'),
 ('underscore_tilde', '______________'),
 ('case_number', 'Case 1:14-cv-01319'),
 ('case_number', 'Case 1:14-cv-01319'),
 ('case_number', 'Case 1:14-cv-01319'),
 ('case_number', 'Case 1:14-cv-01319'),
 ('case_number', 'Case 1:14-cv-01319'),
 ('case_number', 'Case 1:14-cv-01319'),
 ('case_number', 'Case 1:14-cv-01319'),
 ('case_number', 'Case 1:14-cv-01319'),
 ('case_number', 'Case 1:14-cv-01319'),
 ('case_number', 'Case 1:14-cv-01319'),
 ('case_number', 'Case 1:14-cv-01319'),
 ('case_number', 'Case 1:14-cv-01319'),
 ('case_number', 'Case 1:14-cv-01319'),
 ('case_number', 'Case 1:14-cv-01319'),
 ('case_number', 'Case 1:14-cv-01319'),
 ('case_number', 'Case 1:14-cv-01319'),
 ('case_number', 'Case 1:14-cv-01319'),
 ('case_number', 'Case 1:14-cv-01319'),
 ('case_number', 'Case 1:14-cv-01319'),
 ('case_number', 'Case 1:14-cv-01319'),
 ('case_number', 'Case 1:14-cv-01319-TSC'),
 ('case_number', 'Case 1

In [None]:
# 7) Whitespace normalization with detail
def normalize_whitespace_detail(text: str):
    removed = []
    # multiple newlines
    for m in re.findall(r'\n{3,}', text):
        removed.append(("multi_newline", m))
    text = re.sub(r'\n{3,}', '\n\n', text)
    # excess spaces/tabs
    for m in re.findall(r'[ \t\f\v]{2,}', text):
        removed.append(("multi_space", m))
    text = re.sub(r'[ \t\f\v]+', ' ', text)
    # space padding around newlines
    text, cnt1 = re.subn(r' *\n *', '\n', text)
    if cnt1:
        removed.append(("newline_padding", cnt1))
    return text, removed

In [None]:
whitespace_data = df_train["regex_cleaned_sources"].apply(normalize_whitespace_detail)

df_train["whitespace_sources"] = whitespace_data.apply(lambda x: x[0])
df_train["whitespace_sources_changes"] = whitespace_data.apply(lambda x: x[1])

df_train["whitespace_sources_changes"].iloc[5]

In [None]:
# load abbreviations.txt (the one containing lines like “Abbrev.”, “Abd.”, etc.)
with open("abbreviations.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()

raw_patterns = [line.strip() for line in lines if line.strip() and not line.strip().startswith("#")]
patterns = [re.escape(p).replace(r"\.", r"\.") for p in raw_patterns]
abbrev_pat = re.compile(r"(?:{})$".format("|".join(patterns)), flags=re.IGNORECASE)

# Legal patterns
header_pattern = re.compile(r"^(FACTS|COUNT\s+[A-Z]+|ISSUE|ARGUMENT|BACKGROUND|CONCLUSION|PRAYER|REMEDIES)[\s\d]*\.?$", re.IGNORECASE)
uppercase_line = re.compile(r"^[A-Z\s,\d]+[\.:]?$")
legal_citation = re.compile(r"\b\w+ v\. \w+, \d+ F\. \d+.*?\(\d{4}\)")

# sentance noise filter
def is_noise_sentence(text: str) -> bool:
    text = text.strip()
    if not text:
        return True
    if len(text) < 2:
        return True
    if text.lower() in ["n/a", "none"]:
        return True
    if (
        header_pattern.match(text)
        or uppercase_line.match(text) and len(text.split()) > 3
        or legal_citation.search(text)
    ):
        return True
    return False if len(text.split()) >= 3 else True
# segment sentances using batchs
def batch_segment(texts, batch_size=8):
    results = []
    for doc in nlp.pipe(texts, batch_size=batch_size):
        sents, buf = [], ""
        for sent in doc.sents:
            chunk = sent.text.strip()
            if buf:
                chunk = buf + " " + chunk
                buf = ""
            if abbrev_pat.search(chunk) and len(chunk) < 15:
                buf = chunk
                continue
            if not is_noise_sentence(chunk):
                sents.append(chunk)
        if buf and not is_noise_sentence(buf):
            sents.append(buf)
        results.append(sents)
    return results

In [None]:
from typing import List, Tuple
from tqdm.auto import tqdm

def process_and_segment(
    docs: List[str]
) -> Tuple[List[str], List[List[Tuple[str,str,int]]], List[List[str]]]:

    cleaned_texts:     List[str]   = []
    removed_per_doc:   List[List[Tuple[str,str,int]]] = []
    sentences_per_doc: List[List[str]] = []

    #  Cleaning loop
    for doc in tqdm(docs, desc="Cleaning docs"):
        text = doc
        doc_removed: List[Tuple[str,str,int]] = []

        text, changes = normalize_chars_detail(text)
        doc_removed.extend(changes)

        text, changes = light_clean_detail(text)
        doc_removed.extend(changes)

        text, removed_lines = remove_docket_lines(text)
        doc_removed.extend([(line, "", 1) for line in removed_lines])

        text, changes = cleanup_regex_detail(text)
        doc_removed.extend([(c[0], c[1], 1) for c in changes])


        text, changes = normalize_dates_detail(text)
        doc_removed.extend(changes)

        text, changes = normalize_whitespace_detail(text)
        doc_removed.extend(changes)

        cleaned_texts.append(text)
        removed_per_doc.append(doc_removed)

    #  Sentence segmentation
    for text in tqdm(cleaned_texts, desc="Segmenting sentences"):
        sents = batch_segment([text])[0]
        sentences_per_doc.append(sents)

    return cleaned_texts, removed_per_doc, sentences_per_doc


In [None]:
def process_dataframe(df, column_name="sources", batch_size=1):

    result_df = df.copy()

    if batch_size and len(df) > batch_size:
        # Process in batches
        processed_dfs = []
        for i in range(0, len(df), batch_size):
            batch_df = df.iloc[i:i+batch_size].copy()

            # Process batch
            raw_docs = batch_df[column_name].tolist()
            raw_docs = [" ".join(chunks) if isinstance(chunks, list) else chunks for chunks in raw_docs]
            cleaned, removed_list, sentences_list = process_and_segment(raw_docs)

            # Add results to batch dataframe
            batch_df["cleaned_text"] = cleaned
            batch_df["removed_per_doc"] = removed_list
            batch_df["sentences"] = sentences_list

            processed_dfs.append(batch_df)

        # Combine all processed batches
        result_df = pd.concat(processed_dfs).reset_index(drop=True)
    else:
        # Process entire dataframe at once
        raw_docs = df[column_name].tolist()
        raw_docs = [" ".join(chunks) if isinstance(chunks, list) else chunks for chunks in raw_docs]
        cleaned, removed_list, sentences_list = process_and_segment(raw_docs)

        # Add results to dataframe
        result_df["cleaned_text"] = cleaned
        result_df["removed_per_doc"] = removed_list
        result_df["sentences"] = sentences_list

    return result_df

In [None]:
batch_size = len(df_train) // 3
processed_df = process_dataframe(df_train, column_name="sources", batch_size=batch_size)

Cleaning docs:   0%|          | 0/106 [00:00<?, ?it/s]

Segmenting sentences:   0%|          | 0/106 [00:00<?, ?it/s]

Cleaning docs:   0%|          | 0/106 [00:00<?, ?it/s]

Segmenting sentences:   0%|          | 0/106 [00:00<?, ?it/s]

Cleaning docs:   0%|          | 0/106 [00:00<?, ?it/s]

Segmenting sentences:   0%|          | 0/106 [00:00<?, ?it/s]

In [None]:
processed_df.head()

Unnamed: 0,id,sources,sources_metadata,summary/long,summary/short,summary/tiny,case_metadata,cleaned_text,removed_per_doc,sentences
0,EE-AR-0002,[FILED\nU.S. DISTRRIClTeiCAORUKRATNSAS\nIN THE...,"{'doc_id': ['EE-AR-0002-0001', 'EE-AR-0002-000...","In September 2000, the Memphis District Office...",,,{'case_name': 'EEOC v. AFFILIATED FOODS SOUTHW...,U.S. DISTRRIClTeiCAORUKRATNSAS\nFOR THE EASTER...,"[(..+, ., 5), (form-feed, ), (form-feed, ), ...",[U.S. DISTRRIClTeiCAORUKRATNSAS\nFOR THE EASTE...
1,JI-TX-0002,[IN THE UNITED STATES DISTRICT COURT FOR THE S...,"{'doc_id': ['JI-TX-0002-0002', 'JI-TX-0002-000...","In September 2006, the Civil Rights Division o...",,,"{'case_name': 'U.S.A. v. State of Texas', 'cas...","UNITED STATES OF AMERICA, 950 Pennsylvania Ave...","[(“/”, "", 69), (‘/’, ', 67), (—/–, -, 5), (..+...","[UNITED STATES OF AMERICA, 950 Pennsylvania Av..."
2,IM-CA-0160,"[ACCO,TRO,194,STAYED\nUNITED STATES DISTRICT C...","{'doc_id': ['IM-CA-0160-9000', 'IM-CA-0160-000...",COVID-19 Summary: Two immigrants detained pend...,Two immigrants detained pending removal procee...,Two immigrants detained pending removal procee...,"{'case_name': 'Castillo v. Barr', 'case_type':...","ACCO,TRO,194,STAYED\nCIVIL DOCKET FOR CASE #: ...","[(“/”, "", 334), (‘/’, ', 295), (—/–, -, 96), (...","[ACCO,TRO,194,STAYED\nCIVIL DOCKET FOR CASE #:..."
3,FH-VA-0005,[Case 1:13-cv-01214-AJT-JFA Document 7 Filed 1...,"{'doc_id': ['FH-VA-0005-0001', 'FH-VA-0005-000...","On September 30, 2013, the United States Depa...",Following an investigation completed by the Of...,,{'case_name': 'United States v. Chevy Chase Ba...,Document 7 Filed 2 October 2013 PageID# 51\n\...,"[(—/–, -, 2), (..+, ., 1), (form-feed, ), (fo...",[Document 7 Filed 2 October 2013 PageID# 51\n\...
4,PR-ME-0001,[1/18/2021\nQuery Reports\nCreate an Alert for...,"{'doc_id': ['PR-ME-0001-9000', 'PR-ME-0001-000...",COVID-19 Summary: This is a suit brought by se...,"On May 25, 2020, several rural campgrounds and...",Campgrounds and individuals wishing to travel ...,"{'case_name': 'Bayley's Campground, Inc v. Mil...",18 January 2021\nQuery Reports\nCreate an Aler...,"[(“/”, "", 590), (‘/’, ', 259), (—/–, -, 46), (...",[18 January 2021\nQuery Reports\nCreate an Ale...


In [None]:
from google.colab import drive
drive.mount('/content/drive')
processed_df.to_pickle("/content/drive/MyDrive/processed_df_checkpoint.pkl")

Mounted at /content/drive


In [None]:
legal_stopwords = pd.read_pickle("stopwords.pickle")
stopwords = nlp.Defaults.stop_words | set(legal_stopwords)


def process_sentence(sent: str) -> Tuple[str, List[Tuple[str, Any]]]:
    changes = []

    # First lemmatize
    doc = nlp(sent.lower())
    lemmatized_tokens = []

    for token in doc:
        if token.pos_ in {'VERB', 'ADJ', 'ADV', 'DET', 'PRON', 'AUX'}:
            if token.lemma_ != token.text:
                changes.append(("lemmatized", f"{token.text}→{token.lemma_}"))
            lemmatized_tokens.append(token.lemma_)
        else:
            lemmatized_tokens.append(token.text)

    # Then remove stopwords
    filtered_tokens = []
    for token in lemmatized_tokens:
        if token.lower() in stopwords:
            changes.append(("stopword", token))
        else:
            filtered_tokens.append(token)

    processed_sent = " ".join(filtered_tokens)
    return processed_sent, changes

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
def process_document_sentences(sentences: List[str]) -> Tuple[List[str], List[List[Tuple[str, Any]]]]:
    processed_sentences = []
    all_changes = []

    for sent in sentences:
        processed_sent, changes = process_sentence(sent)
        processed_sentences.append(processed_sent)
        all_changes.append(changes)

    return processed_sentences, all_changes

In [None]:
def process_all_documents(df: pd.DataFrame, sentences_column: str = "sentences") -> pd.DataFrame:
    result_df = df.copy()

    # Initialize new columns
    result_df["sentences_tf_idf"] = None
    result_df["sentence_changes"] = None

    # Process each document's sentences with document-level tqdm
    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Documents"):
        sentences = row[sentences_column]
        processed_sents, changes = process_document_sentences(sentences)

        # Store results
        result_df.at[idx, "sentences_tf_idf"] = processed_sents
        result_df.at[idx, "sentence_changes"] = changes

    return result_df

In [None]:
processed_df_with_lemma = process_all_documents(processed_df)


Documents:   0%|          | 0/318 [00:00<?, ?it/s]

In [None]:
processed_df_with_lemma.to_pickle("/content/drive/MyDrive/processed_df_with_lemma_checkpoint.pkl")

In [None]:
# After running the processing
for idx, row in processed_df_with_lemma.iterrows():
    assert len(row["sentences"]) == len(row["sentences_tf_idf"]), f"Mismatch at index {idx}"

print("All sentence lists have the same length!")

All sentence lists have the same length!
