# 📜 Legal Document Preprocessing Pipeline (CUDA-Optimized with InLegalBERT)


This notebook preprocesses legal documents while **preserving context** and prepares them for embedding.  
Key steps:  
✔ Cleaning text  
✔ Removing stopwords  
✔ Sentence tokenization  
✔ Splitting long text into chunks  
✔ Extracting legal clauses  
✔ Tokenizing using InLegalBERT  
✔ Batch processing for large datasets  
✔ CUDA acceleration (if available)  


In [1]:
import os
import re
import nltk
import pandas as pd
import torch
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from transformers import AutoTokenizer

# Resolve the NLTK data directory portably instead of hard-coding one user's
# profile path. On Windows this is %APPDATA%/nltk_data (the same folder the
# notebook used originally); elsewhere it falls back to ~/nltk_data.
nltk_path = os.path.join(
    os.environ.get("APPDATA") or os.path.expanduser("~"), "nltk_data"
)
# exist_ok avoids the check-then-create race of a separate os.path.exists() test
os.makedirs(nltk_path, exist_ok=True)

# Re-running the cell must not keep appending the same search path
if nltk_path not in nltk.data.path:
    nltk.data.path.append(nltk_path)

# Download required NLTK datasets if missing
nltk.download("stopwords", download_dir=nltk_path)
nltk.download("punkt", download_dir=nltk_path)

# Load stopwords
stop_words = set(stopwords.words("english"))

# ✅ Load InLegalBERT Tokenizer
tokenizer = AutoTokenizer.from_pretrained("law-ai/InLegalBERT")

# Check if CUDA is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     C:/Users/darsh/AppData/Roaming/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:/Users/darsh/AppData/Roaming/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Using device: cuda


In [2]:
import nltk

nltk.download('punkt')  # Punkt sentence-tokenizer models (used by sent_tokenize); goes to NLTK's default data dir
nltk.download('punkt_tab')  # Presumably needed by newer NLTK releases of sent_tokenize — confirm for the installed version


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\darsh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\darsh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
def clean_text(text):
    """Normalise whitespace in ``text``.

    Leading and trailing whitespace is removed and every run of spaces, tabs
    or newlines is collapsed into a single space. Letter case is preserved:
    the clause extractor later in this notebook matches the capitalised word
    "Section", so lowercasing here would break it.

    Args:
        text: Raw document text. Any non-string value (e.g. ``None`` or NaN
            coming out of a DataFrame column) is treated as missing.

    Returns:
        The whitespace-normalised string, or ``""`` for non-string or
        whitespace-only input.
    """
    if not isinstance(text, str):
        return ""
    # str.split() with no separator splits on any whitespace run and yields
    # an empty list for blank input, so the join already returns "" there.
    return " ".join(text.split())


In [4]:
def remove_stopwords(text):
    """Drop English stopwords from whitespace-separated ``text``.

    Each whitespace-delimited token is lowercased and looked up in the
    module-level ``stop_words`` set; tokens found there are discarded and the
    rest are re-joined with single spaces in their original form. The lookup
    uses the whole token, so a stopword with punctuation attached (such as
    ``"the,"``) is kept.

    Args:
        text: Text to filter. Non-string values are treated as missing.

    Returns:
        The filtered text, or ``""`` for non-string or blank input.
    """
    if not isinstance(text, str) or not text.strip():
        return ""

    kept = []
    for token in text.split():
        if token.lower() in stop_words:
            continue
        kept.append(token)

    return " ".join(kept)


In [5]:
from nltk.tokenize import sent_tokenize

def tokenize_sentences(text):
    """Split ``text`` into sentences using NLTK's ``sent_tokenize``.

    Args:
        text: Text to segment. Non-string values are treated as missing.

    Returns:
        A list of sentence strings; an empty list when ``text`` is not a
        string or contains only whitespace.
    """
    has_content = isinstance(text, str) and text.strip() != ""
    return sent_tokenize(text) if has_content else []


In [6]:
def split_long_text(text, max_tokens=512):
    """Split ``text`` into consecutive, non-overlapping chunks of words.

    Note that despite the parameter name, the unit is whitespace-separated
    *words*, not model (word-piece) tokens, so a chunk of ``max_tokens`` words
    can still exceed a transformer's token limit.

    Args:
        text: Text to split. Non-string values (``None``, NaN, ...) are
            treated as missing, matching the other helpers in this notebook.
        max_tokens: Maximum number of words per chunk; must be positive.

    Returns:
        A list of chunk strings in document order. Empty for missing or
        blank input.

    Raises:
        ValueError: If ``max_tokens`` is not positive.
    """
    if max_tokens <= 0:
        # A zero step would raise an obscure range() error and a negative one
        # would silently return no chunks; fail loudly instead.
        raise ValueError("max_tokens must be a positive integer")
    if not isinstance(text, str):
        return []

    words = text.split()
    return [
        " ".join(words[start:start + max_tokens])
        for start in range(0, len(words), max_tokens)
    ]


In [14]:
import re

def extract_legal_clauses(text):
    """Extract "Section <number>" references together with the text after them.

    A heading is the literal, case-sensitive word ``Section`` followed by
    whitespace, digits and an optional run of letters (e.g. ``Section 482`` or
    ``Section 269UD``). A parenthesised sub-clause such as ``(a)`` is not part
    of the heading; it ends up at the start of the clause text. Each clause
    runs up to the next heading or the end of the input.

    Args:
        text: Document text. Non-string values are treated as missing.

    Returns:
        A list of ``"<heading> <clause text>"`` strings in document order.
        Headings with no text after them are skipped. When nothing is found
        (or the input is not a string) the sentinel list
        ``["No Legal Clause Found"]`` is returned.
    """
    if not isinstance(text, str):
        # re.findall would raise TypeError on None/NaN cells
        return ["No Legal Clause Found"]

    heading = r"Section\s+\d+[A-Za-z]*"
    # Lazy body match, terminated by a lookahead for the next heading or end
    # of input. The lookahead is non-capturing so findall yields 2-tuples.
    pattern = rf"({heading})\s*([\s\S]*?)(?={heading}|$)"

    extracted_clauses = []
    for section, body in re.findall(pattern, text):
        body = body.strip()
        if body:
            extracted_clauses.append(f"{section} {body}")

    return extracted_clauses if extracted_clauses else ["No Legal Clause Found"]


In [8]:
def tokenize_with_inlegalbert(text, max_length=512):
    """Encode ``text`` with the module-level InLegalBERT ``tokenizer``.

    The text is truncated and padded to exactly ``max_length`` positions.
    Only the input IDs are returned; the attention mask produced by the
    tokenizer is discarded.

    Args:
        text: Text to encode. Non-string values are treated as missing.
        max_length: Fixed sequence length to truncate/pad to.

    Returns:
        A nested list with one inner list of ``max_length`` input IDs (the
        batch dimension is kept), or ``None`` for non-string or blank input.
    """
    if not isinstance(text, str) or text.strip() == "":
        return None

    encoded = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="pt",
    )

    # The result is a plain Python list either way, so copying the tensor to
    # the GPU first (as the previous version did when CUDA was available)
    # only added a host->device->host round trip per document.
    return encoded.input_ids.tolist()


In [15]:
def preprocess_parquet(input_file, batch_size=1000, output_file="preprocessed_data.parquet"):
    """Run the text-preprocessing steps over a Parquet file, batch by batch.

    The input must contain a ``Text`` column. The following columns are added:

    * ``cleaned_text``      - ``clean_text(Text)``
    * ``no_stopwords_text`` - ``remove_stopwords(cleaned_text)``
    * ``sentences``         - ``tokenize_sentences(no_stopwords_text)``
    * ``text_chunks``       - ``split_long_text(no_stopwords_text)``
    * ``legal_clauses``     - ``extract_legal_clauses(cleaned_text)``
    * ``tokenized_text``    - ``tokenize_with_inlegalbert(cleaned_text)``

    The whole input file is read into memory first; batching only bounds the
    size of each intermediate copy and drives the progress messages.

    Args:
        input_file: Path of the Parquet file to read.
        batch_size: Number of rows per batch; must be positive.
        output_file: Path of the Parquet file to write.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    print(f"Processing {input_file}...")

    df = pd.read_parquet(input_file)

    # Ceiling division: the old `len(df) // batch_size + 1` reported one batch
    # too many whenever the row count was an exact multiple of batch_size.
    # Always run at least one (possibly empty) batch so an empty input still
    # yields an output file with the derived columns instead of crashing in
    # pd.concat([]).
    total_batches = max(1, (len(df) + batch_size - 1) // batch_size)

    all_batches = []
    for batch_no in range(total_batches):
        start = batch_no * batch_size
        chunk = df.iloc[start:start + batch_size].copy()
        print(f"Processing batch {batch_no + 1} of {total_batches}...")

        # Apply preprocessing
        chunk["cleaned_text"] = chunk["Text"].apply(clean_text)
        chunk["no_stopwords_text"] = chunk["cleaned_text"].apply(remove_stopwords)
        chunk["sentences"] = chunk["no_stopwords_text"].apply(tokenize_sentences)
        chunk["text_chunks"] = chunk["no_stopwords_text"].apply(split_long_text)
        # Clause extraction and BERT tokenization use the cleaned text with
        # stopwords still in place.
        chunk["legal_clauses"] = chunk["cleaned_text"].apply(extract_legal_clauses)
        chunk["tokenized_text"] = chunk["cleaned_text"].apply(tokenize_with_inlegalbert)

        all_batches.append(chunk)

    # Merge all batches & save to new Parquet file
    processed_df = pd.concat(all_batches, ignore_index=True)
    processed_df.to_parquet(output_file)
    print(f"Preprocessing completed. Processed data saved to {output_file}")


In [16]:
# Run the preprocessing pipeline on merged_legal.parquet in batches of 1000 rows
# and write the enriched frame to preprocessed_data.parquet.
preprocess_parquet("merged_legal.parquet", batch_size=1000, output_file="preprocessed_data.parquet")


Processing merged_legal.parquet...
Processing batch 1 of 12...
Processing batch 2 of 12...
Processing batch 3 of 12...
Processing batch 4 of 12...
Processing batch 5 of 12...
Processing batch 6 of 12...
Processing batch 7 of 12...
Processing batch 8 of 12...
Processing batch 9 of 12...
Processing batch 10 of 12...
Processing batch 11 of 12...
Processing batch 12 of 12...
Preprocessing completed. Processed data saved to preprocessed_data.parquet


In [17]:
import pandas as pd

# Load the preprocessed data written by preprocess_parquet
df = pd.read_parquet("preprocessed_data.parquet")

# display() gives the rich table rendering even when the frame is not the
# last expression of the cell
from IPython.display import display

# Show first 5 rows
display(df.head())

# Show column names
print("Columns:", df.columns.tolist())

# Show dataset shape (rows, columns)
print("Shape of DataFrame:", df.shape)

# Show a random sample of 5 rows (no random_state is passed, so the rows
# differ on every run)
display(df.sample(5))


Unnamed: 0,Titles,Court_Name,Cites,Cited_by,Doc_url,Text,Doc_size,Case_Type,Court_Type,Court_Name_Normalized,cleaned_text,no_stopwords_text,sentences,text_chunks,legal_clauses,tokenized_text
0,Priti Bhojnagarwala vs State Of Gujarat And An...,Gujarat High Court,72,2,https://indiankanoon.org/doc/1943657,"JUDGMENT H.H. Mehta, J. \n 1. This is a group...",75823,Criminal,High_Court,Gujarat High Court,"JUDGMENT H.H. Mehta, J. 1. This is a group of ...","JUDGMENT H.H. Mehta, J. 1. group 30 Criminal M...","[JUDGMENT H.H., Mehta, J., 1. group 30 Crimina...","[JUDGMENT H.H. Mehta, J. 1. group 30 Criminal ...",[Section 482 of the Code of Criminal Procedure...,"[[101, 371, 152, 117, 152, 117, 19904, 190, 17..."
1,Soni Natverlal Prabhudas And Anr. vs State Of ...,Gujarat High Court,34,2,https://indiankanoon.org/doc/672946,"ORDER D.H. Shukla, J. \n\n1. The petitioner No...",47086,Criminal,High_Court,Gujarat High Court,"ORDER D.H. Shukla, J. 1. The petitioner No. 1....","ORDER D.H. Shukla, J. 1. petitioner No. 1. Son...","[ORDER D.H. Shukla, J., 1. petitioner No., 1.,...","[ORDER D.H. Shukla, J. 1. petitioner No. 1. So...",[Section 50 (1) of the Criminal Procedure Code...,"[[101, 308, 148, 117, 152, 117, 13164, 12857, ..."
2,State Of Gujarat vs Shahnawaz Abdulgafur Bhatt...,Gujarat High Court,33,2,https://indiankanoon.org/doc/952858,"JUDGMENT A.M. Kapadia, J. \n\n1. Criminal Conf...",90696,Criminal,High_Court,Gujarat High Court,"JUDGMENT A.M. Kapadia, J. 1. Criminal Confirma...","JUDGMENT A.M. Kapadia, J. 1. Criminal Confirma...","[JUDGMENT A.M. Kapadia, J., 1., Criminal Confi...","[JUDGMENT A.M. Kapadia, J. 1. Criminal Confirm...",[Section 366 (1) of the Code of Criminal Proce...,"[[101, 371, 145, 117, 157, 117, 10757, 16577, ..."
3,"State vs Sonu on 12 January, 2010",Gujarat High Court,12,2,https://indiankanoon.org/doc/187011,Gujarat High Court Case Information System \n\...,21070,Criminal,High_Court,Gujarat High Court,Gujarat High Court Case Information System Pri...,Gujarat High Court Case Information System Pri...,[Gujarat High Court Case Information System Pr...,[Gujarat High Court Case Information System Pr...,[Section 304 Part I IPC; R.I. for six months a...,"[[101, 6170, 13867, 2272, 1026, 240, 256, 286,..."
4,Rabari Sagarbhai Ganeshbhai vs State Of Gujara...,Gujarat High Court,16,2,https://indiankanoon.org/doc/98745554,RABARI SAGARBHAI GANESHBHAIV/SSTATE OF GUJARAT...,21749,Criminal,High_Court,Gujarat High Court,RABARI SAGARBHAI GANESHBHAIV/SSTATE OF GUJARAT...,RABARI SAGARBHAI GANESHBHAIV/SSTATE GUJARAT R/...,[RABARI SAGARBHAI GANESHBHAIV/SSTATE GUJARAT R...,[RABARI SAGARBHAI GANESHBHAIV/SSTATE GUJARAT R...,[Section 482 . No precise and inflexible guide...,"[[101, 3679, 6148, 179, 20782, 188, 172, 5428,..."


Columns: ['Titles', 'Court_Name', 'Cites', 'Cited_by', 'Doc_url', 'Text', 'Doc_size', 'Case_Type', 'Court_Type', 'Court_Name_Normalized', 'cleaned_text', 'no_stopwords_text', 'sentences', 'text_chunks', 'legal_clauses', 'tokenized_text']
Shape of DataFrame: (11970, 16)


Unnamed: 0,Titles,Court_Name,Cites,Cited_by,Doc_url,Text,Doc_size,Case_Type,Court_Type,Court_Name_Normalized,cleaned_text,no_stopwords_text,sentences,text_chunks,legal_clauses,tokenized_text
5138,"State vs . Sandeep @ Amit on 21 February, 2014",Delhi District Court,11,0,https://indiankanoon.org/doc/87134991/,1\n\n IN THE COURT OF MS. ILLA RAWAT...,57146,Constitution,District_And_Tribunals,Delhi District Court,1 IN THE COURT OF MS. ILLA RAWAT : ADDL. SESSI...,1 COURT MS. ILLA RAWAT : ADDL. SESSIONS JUDGE ...,"[1 COURT MS. ILLA RAWAT : ADDL., SESSIONS JUDG...",[1 COURT MS. ILLA RAWAT : ADDL. SESSIONS JUDGE...,[No Legal Clause Found],"[[101, 198, 213, 207, 240, 210, 1620, 117, 177..."
8221,"Deepa Ram vs State Of Rajasthan on 11 March, 1987",Rajasthan High Court,10,0,https://indiankanoon.org/doc/1606869,"JUDGMENT Shyam Sunder Byas, J. \n\n1. Since th...",29259,Criminal,High_Court,Rajasthan High Court,"JUDGMENT Shyam Sunder Byas, J. 1. Since these ...","JUDGMENT Shyam Sunder Byas, J. 1. Since two ap...","[JUDGMENT Shyam Sunder Byas, J., 1., Since two...","[JUDGMENT Shyam Sunder Byas, J. 1. Since two a...","[Section 302 , IPC is fully justified and call...","[[101, 371, 4936, 195, 4107, 15682, 577, 218, ..."
7482,Food Corporation Of India vs Joginderpal Mohin...,Supreme Court of India,8,142,https://indiankanoon.org/doc/1337022/,PETITIONER:\nFOOD CORPORATION OF INDIA\n\n\tVs...,24407,Tax,Supreme_Court,Supreme Court of India,PETITIONER: FOOD CORPORATION OF INDIA Vs. RESP...,PETITIONER: FOOD CORPORATION INDIA Vs. RESPOND...,[PETITIONER: FOOD CORPORATION INDIA Vs. RESPON...,[PETITIONER: FOOD CORPORATION INDIA Vs. RESPON...,[No Legal Clause Found],"[[101, 1818, 119, 1360, 469, 210, 3975, 166, 1..."
1726,"Shaji vs Ramachandran on 24 February, 2003",Kerala High Court,6,12,https://indiankanoon.org/doc/1858937,"JUDGMENT Jawahar Lal Gupta, C.J. \n\n1. Does a...",28657,Motorvehicles,High_Court,Kerala High Court,"JUDGMENT Jawahar Lal Gupta, C.J. 1. Does a tea...","JUDGMENT Jawahar Lal Gupta, C.J. 1. teacher in...","[JUDGMENT Jawahar Lal Gupta, C.J., 1. teacher ...","[JUDGMENT Jawahar Lal Gupta, C.J. 1. teacher i...","[Section 185 of the Motor Vehicles Act, 1988 a...","[[101, 371, 17845, 171, 9243, 1058, 182, 25564..."
2209,Chief Conservator Of Forests And ... vs Jagann...,Supreme Court of India,13,119,https://indiankanoon.org/doc/112813232/,"ORDER B.L. Hansaria, J. \n\n1. Two questions i...",26676,Constitution,Supreme_Court,Supreme Court of India,"ORDER B.L. Hansaria, J. 1. Two questions in th...","ORDER B.L. Hansaria, J. 1. Two questions main ...","[ORDER B.L., Hansaria, J., 1., Two questions m...","[ORDER B.L. Hansaria, J. 1. Two questions main...","[Section 2 (j) of the Industrial Disputes Act,...","[[101, 308, 146, 117, 156, 117, 12713, 6289, 1..."


In [18]:
import pandas as pd

# Load the preprocessed Parquet file
df = pd.read_parquet("preprocessed_data.parquet")

# Pick one random row for a manual check (unseeded: a different document
# is shown on every run)
sample_row = df.sample(1).iloc[0]

# Print the raw text followed by every derived field, untruncated, so each
# preprocessing stage can be compared against its input
print("Original Text:\n", sample_row["Text"], "\n")
print("Cleaned Text:\n", sample_row["cleaned_text"], "\n")
print("Without Stopwords:\n", sample_row["no_stopwords_text"], "\n")
print("Tokenized Sentences:\n", sample_row["sentences"], "\n")
print("Text Chunks:\n", sample_row["text_chunks"], "\n")
print("Extracted Legal Clauses:\n", sample_row["legal_clauses"], "\n")
print("tokenized_text:\n", sample_row["tokenized_text"], "\n")


Original Text:
 JUDGMENT Dr. B.P. Saraf, J. 
1. By this writ petition the petitioner has challenged the order of the appropriate authority dated February 26, 1993, under sub-section (1) of section 269UD of the Income-tax Act, 1961 ("the Act"), directing the purchase of the property in question by the Central Government. The facts of the case briefly stated are as follows :  
The petitioner is the owner of a flat being flat No. 403, Parishram, situated at 320, Pali Hill, Bandra, Bombay. The petitioner entered into an agreement dated February 5, 1990, for the sale of the above flat with Shri Amarnath, Smt. Kamala Amarnath and Shri Rakesh Kapoor ("the purchasers"). By the said agreement of sale, the petitioner agreed to sell and the purchasers agreed to purchase the said flat along with five shares of Rs. 50 each held by the petitioner in the co-operative housing society for a consideration of Rs. 29.50 lakhs. Of the said consideration, an amount of Rs. 1 lakh was payable as earnest money

In [None]:
import glob

import pandas as pd
import spacy
import torch
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# ✅ Increase spaCy's max length limit
nlp = spacy.load("en_core_web_sm")
nlp.max_length = 3000000  # Raised from spaCy's 1,000,000-character default to 3,000,000

# ✅ Load Sentence-BERT on the GPU when one is available, otherwise on the CPU
# (a hard-coded "cuda" fails outright on machines without a CUDA device)
sbert_device = "cuda" if torch.cuda.is_available() else "cpu"
sbert_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device=sbert_device)

# ✅ Load dataset
file_path = "preprocessed_data.parquet"
df = pd.read_parquet(file_path)

# ✅ Get total batches (ceiling of len(df) / batch_size)
batch_size = 1000
num_batches = len(df) // batch_size + (1 if len(df) % batch_size else 0)

# ✅ Find already processed batches by parsing N out of "processed_batch_N.parquet"
processed_files = set(glob.glob("processed_batch_*.parquet"))
processed_batch_indices = {int(f.split("_")[-1].split(".")[0]) for f in processed_files}

# ✅ Process remaining batches
for i in range(num_batches):  # Every batch is visited; finished ones are skipped below
    if i in processed_batch_indices:
        print(f"✅ Batch {i} already processed. Skipping...")
        continue  # Skip if already processed

    print(f"🚀 Processing batch {i}...")

    start, end = i * batch_size, (i + 1) * batch_size
    batch_df = df.iloc[start:end].copy()

    pos_tags, named_entities, semantic_embeddings = [], [], []

    for text in tqdm(batch_df["cleaned_text"], desc=f"Batch {i} Processing"):
        doc = nlp(text)

        # ✅ POS tagging
        pos_tags.append([(token.text, token.pos_) for token in doc])

        # ✅ Named Entity Recognition (NER)
        named_entities.append([(ent.text, ent.label_) for ent in doc.ents])

        # ✅ Semantic text embeddings
        # NOTE(review): the model presumably truncates input beyond its maximum
        # sequence length, so for long judgments the vector likely reflects
        # only the beginning of the document — confirm.
        embedding = sbert_model.encode(text, convert_to_numpy=True, device=sbert_device)
        semantic_embeddings.append(embedding)

    # ✅ Append new features
    batch_df["pos_tags"] = pos_tags
    batch_df["named_entities"] = named_entities
    batch_df["semantic_embeddings"] = semantic_embeddings

    # ✅ Save processed batch (one file per batch makes the run resumable)
    temp_file = f"processed_batch_{i}.parquet"
    batch_df.to_parquet(temp_file, index=False)
    print(f"✅ Saved batch {i} to {temp_file}")

print("🎉 All remaining batches processed!")


  from .autonotebook import tqdm as notebook_tqdm


✅ Batch 7 already processed. Skipping...
✅ Batch 8 already processed. Skipping...
🚀 Processing batch 9...


Batch 9 Processing: 100%|██████████| 1000/1000 [16:40<00:00,  1.00s/it]


✅ Saved batch 9 to processed_batch_9.parquet
🚀 Processing batch 10...


Batch 10 Processing: 100%|██████████| 1000/1000 [17:24<00:00,  1.04s/it]


✅ Saved batch 10 to processed_batch_10.parquet
🚀 Processing batch 11...


Batch 11 Processing: 100%|██████████| 970/970 [14:26<00:00,  1.12it/s]


✅ Saved batch 11 to processed_batch_11.parquet
🎉 All remaining batches processed!


In [4]:
import pandas as pd
import glob

# Collect the per-batch files and order them by their numeric batch index.
# A plain sorted() is lexicographic and would place "processed_batch_10" and
# "processed_batch_11" before "processed_batch_2", scrambling the row order
# of the merged file.
batch_files = sorted(
    glob.glob("processed_batch_*.parquet"),
    key=lambda path: int(path.split("_")[-1].split(".")[0]),
)
if not batch_files:
    # pd.concat([]) would otherwise fail with an unhelpful "No objects to concatenate"
    raise FileNotFoundError("No 'processed_batch_*.parquet' files found to merge")

# Read all batches and concatenate
df_list = [pd.read_parquet(batch) for batch in batch_files]
final_df = pd.concat(df_list, ignore_index=True)

# Save the final merged file
final_df.to_parquet("final_preprocessed_data.parquet", index=False)
print("✅ All processed batches merged into 'final_preprocessed_data.parquet'")


✅ All processed batches merged into 'final_preprocessed_data.parquet'


In [2]:
import pandas as pd

# Load the Parquet file
def inspect_parquet(file_path):
    """Load a Parquet file and print a short structural overview of it.

    Printed, in order: the (rows, columns) shape, the column names, the
    per-column dtypes, and the first rows as returned by ``DataFrame.head()``.

    Args:
        file_path: Path of the Parquet file to read.

    Returns:
        The loaded ``pandas.DataFrame``.
    """
    frame = pd.read_parquet(file_path)

    # (label, value) pairs in the order they are reported
    overview = (
        ("Dataset Shape:", frame.shape),
        ("Columns:", frame.columns.tolist()),
        ("Data Types:\n", frame.dtypes),
        ("First few rows:\n", frame.head()),
    )
    for label, value in overview:
        print(label, value)

    return frame

# Example usage: inspect one saved batch file and keep the frame in `df`
# for the cells that follow
file_path = "processed_batch_11.parquet"  # Replace with your actual file path
df = inspect_parquet(file_path)


Dataset Shape: (970, 19)
Columns: ['Titles', 'Court_Name', 'Cites', 'Cited_by', 'Doc_url', 'Text', 'Doc_size', 'Case_Type', 'Court_Type', 'Court_Name_Normalized', 'cleaned_text', 'no_stopwords_text', 'sentences', 'text_chunks', 'legal_clauses', 'tokenized_text', 'pos_tags', 'named_entities', 'semantic_embeddings']
Data Types:
 Titles                   object
Court_Name               object
Cites                     int64
Cited_by                  int64
Doc_url                  object
Text                     object
Doc_size                  int64
Case_Type                object
Court_Type               object
Court_Name_Normalized    object
cleaned_text             object
no_stopwords_text        object
sentences                object
text_chunks              object
legal_clauses            object
tokenized_text           object
pos_tags                 object
named_entities           object
semantic_embeddings      object
dtype: object
First few rows:
                                 

In [7]:
df["Titles"].iloc[0]  # Show the title of the first row in the loaded frame

'Sri Ram Ram Narain Medhi vs The State Of Bombay(And Connected ... on 18 November, 1958'

In [5]:
# Display an entire column without truncation
column_name = "tokenized_text"  # Replace with the column you want to view
# NOTE: set_option changes pandas display settings globally, so they stay in
# effect for every later cell run in this kernel
pd.set_option("display.max_rows", None)  # Show all rows
pd.set_option("display.max_colwidth", None)  # Show full content of each cell

# Prints every row of the column; output can be very large
print(df[column_name])


0                             [[101, 1818, 119, 8088, 7979, 7979, 11292, 171, 890, 3911, 3920, 166, 189, 117, 1992, 119, 207, 264, 210, 12019, 171, 195, 111, 212, 2935, 1497, 112, 254, 210, 371, 119, 422, 118, 282, 118, 5561, 7396, 119, 146, 3646, 177, 6082, 190, 179, 115, 6779, 16688, 3463, 182, 152, 117, 7396, 119, 146, 3646, 177, 6082, 190, 179, 115, 6779, 16688, 3463, 182, 152, 117, 2602, 189, 115, 5958, 3920, 6764, 9958, 111, 147, 180, 112, 163, 890, 3646, 115, 146, 6416, 192, 6024, 23065, 188, 160, 117, 1063, 6148, 171, 185, 115, 155, 117, 13737, 7823, 185, 115, ...]]
1                                  [[101, 1818, 119, 10945, 178, 4689, 166, 189, 117, 1992, 119, 264, 210, 165, 117, 160, 117, 254, 210, 371, 119, 1117, 118, 834, 118, 3024, 7396, 119, 2019, 171, 115, 153, 117, 148, 117, 7396, 119, 2019, 171, 115, 153, 117, 148, 117, 4735, 115, 145, 117, 158, 117, 5377, 119, 3024, 1180, 18874, 3145, 11567, 111, 198, 112, 8332, 3024, 17551, 111, 200, 112, 3434, 5637, 9635, 12416, 119

In [1]:
import pandas as pd

# Read the first processed batch file (the whole file is loaded, not just a few rows)
df = pd.read_parquet('processed_batch_0.parquet', engine='pyarrow')  # or engine='fastparquet'

# Preview the first 5 rows
print(df.head())

# Get basic info about the dataframe.
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()

# Get summary statistics
print(df.describe())

                                              Titles          Court_Name  \
0  Priti Bhojnagarwala vs State Of Gujarat And An...  Gujarat High Court   
1  Soni Natverlal Prabhudas And Anr. vs State Of ...  Gujarat High Court   
2  State Of Gujarat vs Shahnawaz Abdulgafur Bhatt...  Gujarat High Court   
3                  State vs Sonu on 12 January, 2010  Gujarat High Court   
4  Rabari Sagarbhai Ganeshbhai vs State Of Gujara...  Gujarat High Court   

   Cites  Cited_by                                Doc_url  \
0     72         2   https://indiankanoon.org/doc/1943657   
1     34         2    https://indiankanoon.org/doc/672946   
2     33         2    https://indiankanoon.org/doc/952858   
3     12         2    https://indiankanoon.org/doc/187011   
4     16         2  https://indiankanoon.org/doc/98745554   

                                                Text  Doc_size Case_Type  \
0  JUDGMENT H.H. Mehta, J.  \n 1. This is a group...     75823  Criminal   
1  ORDER D.H. Shukla, J.