In [99]:
from datasets import load_dataset
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import wordcloud
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import Dense, SimpleRNN, LSTM, Input, Embedding, Dropout, BatchNormalization, Activation
import re
from transformers import pipeline
import torch

In [None]:
#!pip install datasets

## Downloading the Dataset here

In [7]:
multi_lexsum = load_dataset("allenai/multi_lexsum", name="v20230518", trust_remote_code=True)

In [8]:
train_df = multi_lexsum['train'].to_pandas()
val_df = multi_lexsum['validation'].to_pandas()
test_df = multi_lexsum['test'].to_pandas()

In [9]:
val_df

Unnamed: 0,id,sources,sources_metadata,summary/long,summary/short,summary/tiny,case_metadata
0,EE-MO-0105,[Page 1\nLEXSEE 2003 U.S. DIST. CT. PLEADINGS ...,"{'doc_id': ['EE-MO-0105-0001', 'EE-MO-0105-000...","On January 23, 2004, Plaintiff filed an amende...",This case was brought in 2004 by a female form...,,"{'case_name': 'Stocking v. AT&T Corporation', ..."
1,EE-MI-0138,[UNITED STATES DISTRICT COURT WESTERN DISTRICT...,"{'doc_id': ['EE-MI-0138-0001', 'EE-MI-0138-000...","In September 2003, the Detroit office of the E...",,,{'case_name': 'EEOC v. ROBERT BOSCH CORPORATIO...
2,EE-FL-0124,[Page 1 of 4\nIN THE UNITED STATES DISTRICT CO...,"{'doc_id': ['EE-FL-0124-0002', 'EE-FL-0124-000...","On May 10, 2002, the United States filed a law...",,,{'case_name': 'Jones v. City of Fort Lauderdal...
3,DR-TX-0003,[Case: 3:16-cv-1113 As of: 05/18/2020 01:43 PM...,"{'doc_id': ['DR-TX-0003-9000', 'DR-TX-0003-000...","On April 25, 2016, three individuals that were...","In 2016, three individuals who were blind law ...",,"{'case_name': 'Stanley v. Barbri', 'case_type'..."
4,EE-GA-0095,[ORIGINAL\n\nIN THE UNITED STATES DISTRICT COU...,"{'doc_id': ['EE-GA-0095-0001', 'EE-GA-0095-000...","In March 2001, the EEOC district office in Atl...",,,"{'case_name': 'EEOC v. WREN CHEVROLET INC', 'c..."
...,...,...,...,...,...,...,...
449,IM-WA-0004,[~TO~’ & MKI.ER O3URT REPORTER5 601 W. RIVERSI...,"{'doc_id': ['IM-WA-0004-0001', 'IM-WA-0004-000...","On March 28, 2000 several former employees of ...",,,"{'case_name': 'Mandoza v. Zirkle Fruit', 'case..."
450,EE-IL-0028,[IN THE UNITED STATES DISTRICT COURT FOR THE N...,"{'doc_id': ['EE-IL-0028-0001', 'EE-IL-0028-000...",The EEOC's Chicago District Office sued Bice ...,,,"{'case_name': 'EEOC v. BICE OF CHICAGO INC', '..."
451,FA-TX-0001,[Case 5:11-cv-00422-FB Document 136 Filed 02/0...,"{'doc_id': ['FA-TX-0001-0001', 'FA-TX-0001-000...",Summary posted to the Clearinghouse on August ...,"On May 26, 2011, an agnostic graduating senior...","In 2012, this Texas High School agreed to end ...",{'case_name': 'Schultz v. Medina Valley Indepe...
452,FA-CO-0009,[Case: 1:20-cv-1480 As of: 06/21/2020 03:10 PM...,"{'doc_id': ['FA-CO-0009-9000', 'FA-CO-0009-000...",COVID-19 Summary: This is a suit brought by a ...,"On May 25, 2020, a small church and its pastor...",Court denies TRO and preliminary injunction in...,{'case_name': 'High Plains Harvest Church v. P...


## Build a dataframe that collects the full case, summaries, and metadata (include the 'class_action_sought' and 'case_type' data as it will be used later).

In [13]:
def extract_case_info(dataset_split):
    # Convert to pandas
    df = dataset_split.to_pandas()
    
    # Normalize the nested metadata
    meta_df = pd.json_normalize(df["case_metadata"])
    
    # Concatenate relevant columns
    combined_df = pd.concat([
        df["sources"],
        df["summary/long"],
        df["summary/short"],
        df["summary/tiny"],
        meta_df[["class_action_sought", "case_type"]]
    ], axis=1)
    
    return combined_df.rename(columns={
        "sources": "full_case",
        "summary/long": "summary_long",
        "summary/short": "summary_short",
        "summary/tiny": "summary_tiny"
    })

# Apply to train/val/test
train_cases_df = extract_case_info(multi_lexsum['train'])
val_cases_df = extract_case_info(multi_lexsum['validation'])
test_cases_df = extract_case_info(multi_lexsum['test'])


## Text Cleaning: The light clean is used for the text summarization, and heavy clean is for the Model


In [101]:

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Set of legal boilerplate terms to drop for modeling
HEAVY_DROP = {
    'plaintiff', 'defendant', 'claimant', 'respondent', 'attorney', 'esq', 'bar', 'justice',
    'judge', 'clerk', 'motion', 'order', 'filed', 'signed', 'entered', 'docket', 'exhibit',
    'complaint', 'appeal', 'united', 'states', 'america', 'federal', 'district', 'southern',
    'northern', 'eastern', 'western', 'division', 'supreme', 'magistrate', 'florida', 'fort',
    'lauderdale', 'v', 'vs', 'versus', 'agreement', 'settlement', 'section', 'chapter', 'appendix',
    'amended', 'et', 'seq'
}

# Base cleaning: URLs, emails, phone/fax, page markers
def base_clean(text):
    text = text.lower()
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}', '', text)
    text = re.sub(r'page \d+ of \d+', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Remove common legal boilerplate
def heavy_clean(text):
    text = base_clean(text)
    # Drop long header lines
    text = re.sub(r'in the united states district court.*', '', text)
    text = re.sub(r'united states district.*?fort lauderdale.*', '', text)
    # Remove date footers
    text = re.sub(r'\b8/\d{1,2}/\d{4}\b', '', text)
    # Remove enumeration artifacts like '1 .' or 'FIRST COUNT'
    text = re.sub(r'\b\d+\s*[.)]', '', text)
    text = re.sub(r'first count|second count', '', text)
    # Normalize stray punctuation
    text = re.sub(r'\s*[,;:]\s*', ', ', text)
    text = re.sub(r'\s{2,}', ' ', text)
    # Strip settlement/training sections for summarization later
    return text.strip()

# Additional trimming for summarization: stop at settlement or training appendix
def trim_for_summary(text):
    # Keep only complaint portion, drop settlement and training appendices
    parts = re.split(r'settlement agreement|anti-?discrimination training', text)
    return parts[0]

# For summarization: keep punctuation, sentence structure, and core complaint only
def summarization_task_clean(df):
    df_clean = df.copy()
    cleaned_cases = []
    for paras in tqdm(df_clean['full_case'], desc="Summarization clean"):
        all_sents = []
        for para in paras:
            cleaned = heavy_clean(para)
            trimmed = trim_for_summary(cleaned)
            # Split into sentences, filter out empty or boilerplate
            sents = sent_tokenize(trimmed)
            for sent in sents:
                # remove very short or boilerplate-only sentences
                if len(sent.split()) < 5:
                    continue
                all_sents.append(sent)
        # Rejoin into a multi-sentence paragraph
        cleaned_cases.append(' '.join(all_sents))
    df_clean['full_case'] = cleaned_cases
    return df_clean

# For modeling: remove punctuation, stopwords, heavy boilerplate
def modeling_task_clean(df):
    df_clean = df.copy()
    cleaned_cases = []
    for paras in tqdm(df_clean['full_case'], desc="Modeling clean"):
        all_tokens = []
        for para in paras:
            cleaned = heavy_clean(para)
            # strip punctuation completely
            cleaned = re.sub(r'[^\w\s]', '', cleaned)
            tokens = word_tokenize(cleaned)
            filtered = [lemmatizer.lemmatize(w) for w in tokens 
                        if w.isalpha() and w not in stop_words and w not in HEAVY_DROP]
            all_tokens.extend(filtered)
        cleaned_cases.append(' '.join(all_tokens))
    df_clean['full_case'] = cleaned_cases
    return df_clean

val_summarize_cleaned = summarization_task_clean(val_cases_df)
#val_model_cleaned = modeling_task_clean(val_cases_df)
#train_light_cleaned = clean_full_light_case(train_cases_df)
#train_heavy_cleaned = clean_full_heavy_case(train_cases_df)
#test_light_cleaned = clean_full_light_case(test_cases_df)
#testl_heavy_cleaned = clean_full_heavy_case(test_cases_df)

Summarization clean: 100%|███████████████████████████████████████████████████████████| 454/454 [00:37<00:00, 12.20it/s]


In [107]:
val_summarize_cleaned.full_case[1]

'united states district court western district of michigan southern division " r-, ’, p ", 9 equal employment opportunity commission plaintiff, vo -4 honorable swene~io.dr, e]ul .as..ndiilest~rict, judge robert bosch corporation complaint and jury demand defendant. / nature of tile action this is an action under title vii of the civil rights act of 1964 and title i of the civil rights act of 1991 to correct unlawful employment practices on the bases of religion, and to provide appropriate relief to jeff carter who was adversely affected by such practices. the united states equal employment opportunity commission (hereinafter "eeoc") alleges that robert bosch corporation (hereinafter "defendant") failed to provide a reasonable accommodation to the known religious practices of carter, who is a member of the international old path church of god inc. the defendant unlawfully terminated carter because his religious practices conflicted with an employment requirement. jurisdiction and venue 

## Summarization: 

In [73]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from transformers import LEDTokenizer, LEDForConditionalGeneration

tokenizer = LEDTokenizer.from_pretrained("allenai/led-base-16384")
model = LEDForConditionalGeneration.from_pretrained("allenai/led-base-16384")

#tokenizer = AutoTokenizer.from_pretrained("VincentMuriuki/legal-summarizer")
#model = AutoModelForSeq2SeqLM.from_pretrained("VincentMuriuki/legal-summarizer")



## Long T5

In [91]:
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("pszemraj/long-t5-tglobal-base-16384-book-summary")
model = AutoModelForSeq2SeqLM.from_pretrained("pszemraj/long-t5-tglobal-base-16384-book-summary").to("cuda")

# Prefix the task!
input_text = "summarize: " + val_summarize_cleaned.full_case[0]

# Tokenize with truncation up to 16k tokens
inputs = tokenizer(
    input_text,
    return_tensors="pt",
    max_length=16384,
    truncation=True
).to("cuda")

# Generate summary
summary_ids = model.generate(
    inputs["input_ids"],
    max_length=1024,
    min_length=100,
    num_beams=4,
    do_sample=False
)

# Decode
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print(summary)

and us  us us  be we-- runs run with riches  .    at unter   plus thingwe growing livingmakingrunnerarma gar running " stream gather   floating  Windsor   frominduENGworth- --  kitchen : gear by pour   shop  the  money  over engine art man stuff things topryreending cooking all Everything first First Things Conbau-bine store-- industry---mp-- market-- grocery-- We- way-- factory- highest crap riches-- supplies-- secret-- stash-- Underground-- mess-- trade-- un More weird- total-- ran- party-- mystery-- last-- shop word-- York Gas Word money money- more-- ever least second- let probably spendshop- The manufacturer-- ways-- world-- earn Way Worldest marketplaces sales show two -- overnight- just-- Secret--world-- Mass- Way Conf Shop Money--)- money money money awesome--_-- fame-- James-- Earn-- Merchant-- Amazing third money money cheat sell-- Most Way Way Market Way Way Way Part Way Way Cost Type World World World Min most money money so which that very- So Which That- Over earned money

In [85]:
val_cases_df.full_case[2]

array(['Page 1 of 4\nIN THE UNITED STATES DISTRICT COURT FOR THE SOUTHERN DISTRICT OF FLORIDA\nFORT LAUDERDALE DIVISION UNITED STATES OF AMERICA, ) ) Plaintiff, ) ) v. ) CASE NO. ) CITY OF FORT LAUDERDALE, ) FLORIDA, ) ) Defendant. ) ____________________________________)\nCOMPLAINT AND DEMAND FOR JURY TRIAL Plaintiff, United States of America, alleges: 1. This action is brought by the United States to enforce the provisions of Title VII of the Civil Rights Act of 1964, as amended, 42 U.S.C. § 2000e, et seq. ("Title VII"). 2. This Court has jurisdiction over the action under 42 U.S.C. § 2000e-5(f) and 28 U.S.C. §§ 1331, 1345. 3. Defendant, the City of Fort Lauderdale ("Defendant"), is a political subdivision of the State of Florida, created pursuant to Florida law. 4. Defendant is a person within the meaning of 42 U.S.C.§ 2000e(a), and an employer within the meaning of 42 U.S.C. § 2000e(b). 5. The Equal Employment Opportunity Commission ("EEOC") received a timely charge filed by Elgin O

## Legal text summarization. Split chunk and summarize

In [182]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
model = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-cnn-12-6")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
print("Using device:", device)

def chunk_text(text, max_words=500):
    """Split text into chunks by words, keeping sentences coherent if possible."""
    words = text.split()
    return [' '.join(words[i:i+max_words]) for i in range(0, len(words), max_words)]

def summarize_chunk(chunk, max_len=300, min_len=100):
    """Summarize a chunk with repetition control and no repeat n-grams."""
    input_ids = tokenizer.encode(chunk, return_tensors="pt", truncation=True, max_length=1024).to(device)
    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_length=max_len,
            min_length=min_len,
            num_beams=4,
            early_stopping=True,
            no_repeat_ngram_size=4,
            repetition_penalty=2.0
        )
    return tokenizer.decode(output[0], skip_special_tokens=True)

def summarize_long_text(text, chunk_size=500, chunk_summary_maxlen=300, chunk_summary_minlen=100, final_summary_maxlen=300, final_summary_minlen=100):
    """Pipeline to chunk, summarize, and compress long legal input."""
    chunks = chunk_text(text, max_words=chunk_size)
    print(f"[INFO] Chunking input text into {len(chunks)} segments...")

    chunk_summaries = []
    for i, chunk in enumerate(chunks):
        print(f"[INFO] Summarizing chunk {i+1}/{len(chunks)}")
        summary = summarize_chunk(chunk, max_len=chunk_summary_maxlen, min_len=chunk_summary_minlen)
        chunk_summaries.append(summary)

    # Join chunk-level summaries
    joined_summary = " ".join(chunk_summaries)
    print(f"[INFO] Intermediate summary length: {len(joined_summary.split())} words")

    # Final summarization of combined result
    final_summary = summarize_chunk(
        joined_summary,
        max_len=final_summary_maxlen,
        min_len=final_summary_minlen
    )

    return final_summary


Using device: cuda


In [184]:
# Replace with your legal document text
long_text_input = val_summarize_cleaned.full_case[0]  # e.g., from val_summarize_cleaned.full_case[2]

# Generate summary
final_summary = summarize_long_text(
    long_text_input,
    chunk_size=1000,
    chunk_summary_maxlen=256,
    chunk_summary_minlen=100,
    final_summary_maxlen=1024, 
    final_summary_minlen=500
)

print("\n--- Final Summary ---\n")
print(final_summary)


[INFO] Chunking input text into 18 segments...
[INFO] Summarizing chunk 1/18
[INFO] Summarizing chunk 2/18
[INFO] Summarizing chunk 3/18
[INFO] Summarizing chunk 4/18
[INFO] Summarizing chunk 5/18
[INFO] Summarizing chunk 6/18
[INFO] Summarizing chunk 7/18
[INFO] Summarizing chunk 8/18
[INFO] Summarizing chunk 9/18
[INFO] Summarizing chunk 10/18
[INFO] Summarizing chunk 11/18
[INFO] Summarizing chunk 12/18
[INFO] Summarizing chunk 13/18
[INFO] Summarizing chunk 14/18
[INFO] Summarizing chunk 15/18
[INFO] Summarizing chunk 16/18
[INFO] Summarizing chunk 17/18
[INFO] Summarizing chunk 18/18
[INFO] Intermediate summary length: 1582 words

--- Final Summary ---

 U.S. district court for the western district of missouri, western division 2003 u.s. dist. ct. pleadings 3030 . Class action is being discriminated against in violation of title vii of the civil rights act of 1964 and the pregnancy discrimination act (pda) at&t corp. ("at&t") for offering and/or providing health insurance that doe

In [166]:
val_summarize_cleaned.summary_long[0]

"On January 23, 2004, Plaintiff filed an amended complaint under Title VII of the Civil Rights Act of 1964 and the Pregnancy Discrimination Act, 42 U.S.C. § 2000e et seq. and 2000e(k), against AT&T Corporation in the United States District Court for the Western District of Missouri.  The plaintiff, represented by private counsel, was a former AT&T employee and asked the Court for declaratory and injunctive relief, as well as damages, alleging that AT&T's health insurance policy discriminated against women.  Specifically, the plaintiff contended that the defendant's health insurance plan, which did not provide prescription contraceptives (birth control) before 2002 and only through the mail after 2002, violated female employees' civil rights.\n\nThe action originally started in the U.S. District Court for the District of Kansas, but was transferred to Missouri on May 12, 2003.  This is the date on which the docket begins.  Originally, there were two plaintiffs on the case, but one dropp

## T5 Large

In [158]:

tokenizer = AutoTokenizer.from_pretrained("t5-large")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-large")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

print("Using device:", device)

def chunk_text(text, max_words=500):
    """Split text into chunks of max_words (T5 handles fewer tokens than BART)."""
    words = text.split()
    return [' '.join(words[i:i+max_words]) for i in range(0, len(words), max_words)]

def summarize_chunk(chunk, max_len=200, min_len=80):
    """Summarize a single chunk using T5 with summarization prompt."""
    input_text = "summarize: " + chunk
    input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True).to(device)
    
    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_length=max_len,
            min_length=min_len,
            num_beams=4,
            early_stopping=True,
            no_repeat_ngram_size=4,
            repetition_penalty=1.8
        )
    return tokenizer.decode(output[0], skip_special_tokens=True)

def summarize_long_text(text, chunk_size=300, chunk_summary_maxlen=200, chunk_summary_minlen=80, final_summary_maxlen=256, final_summary_minlen=128):
    """Pipeline for summarizing long legal text using t5-large."""
    chunks = chunk_text(text, max_words=chunk_size)
    print(f"[INFO] Chunking input text into {len(chunks)} segments...")

    chunk_summaries = []
    for i, chunk in enumerate(chunks):
        print(f"[INFO] Summarizing chunk {i+1}/{len(chunks)}")
        summary = summarize_chunk(chunk, max_len=chunk_summary_maxlen, min_len=chunk_summary_minlen)
        chunk_summaries.append(summary)

    joined_summary = " ".join(chunk_summaries)
    print(f"[INFO] Intermediate summary length: {len(joined_summary.split())} words")

    # Final summarization step
    final_summary = summarize_chunk(
        joined_summary,
        max_len=final_summary_maxlen,
        min_len=final_summary_minlen
    )
    return final_summary

# Example usage
long_text_input = val_summarize_cleaned.full_case[1]  # Replace as needed
final_summary = summarize_long_text(
    long_text_input,
    chunk_size=512,
    chunk_summary_maxlen=200,
    chunk_summary_minlen=100,
    final_summary_maxlen=200,
    final_summary_minlen=50
)

print("\n--- Final Summary ---\n")
print(final_summary)


Using device: cuda
[INFO] Chunking input text into 37 segments...
[INFO] Summarizing chunk 1/37
[INFO] Summarizing chunk 2/37
[INFO] Summarizing chunk 3/37
[INFO] Summarizing chunk 4/37
[INFO] Summarizing chunk 5/37
[INFO] Summarizing chunk 6/37
[INFO] Summarizing chunk 7/37
[INFO] Summarizing chunk 8/37
[INFO] Summarizing chunk 9/37
[INFO] Summarizing chunk 10/37
[INFO] Summarizing chunk 11/37
[INFO] Summarizing chunk 12/37
[INFO] Summarizing chunk 13/37
[INFO] Summarizing chunk 14/37
[INFO] Summarizing chunk 15/37
[INFO] Summarizing chunk 16/37
[INFO] Summarizing chunk 17/37
[INFO] Summarizing chunk 18/37
[INFO] Summarizing chunk 19/37
[INFO] Summarizing chunk 20/37
[INFO] Summarizing chunk 21/37
[INFO] Summarizing chunk 22/37
[INFO] Summarizing chunk 23/37
[INFO] Summarizing chunk 24/37
[INFO] Summarizing chunk 25/37
[INFO] Summarizing chunk 26/37
[INFO] Summarizing chunk 27/37
[INFO] Summarizing chunk 28/37
[INFO] Summarizing chunk 29/37
[INFO] Summarizing chunk 30/37
[INFO] Summar