In [1]:
# ============================================================================
# LEGAL CASE SUMMARIZATION USING LEXRANK + BART
# ============================================================================

# ============================================================================
# IMPORTS
# ============================================================================

import pandas as pd
import numpy as np
import re
import torch
from transformers import BartForConditionalGeneration, BartTokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
import nltk
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Fetch the Punkt tokenizer data only when it is not already present locally,
# so re-running this cell does not trigger a download every time.
punkt_resource = 'tokenizers/punkt'
try:
    nltk.data.find(punkt_resource)
except LookupError:
    # nltk.data.find raises LookupError when the resource cannot be located.
    nltk.download('punkt')

print("‚úÖ All libraries imported successfully!")

‚úÖ All libraries imported successfully!


In [2]:
# ============================================================================
# STEP 2: LOAD YOUR DATA
# ============================================================================

# Path is relative to the notebook's working directory.
cases_csv_path = '710cases.csv'
df = pd.read_csv(cases_csv_path)

# Preview the first rows (rendered as the cell output).
df.head()

Unnamed: 0,Case Title,Link,Case Content
0,The State Of Tamil Nadu vs The Governor Of Tam...,https://indiankanoon.org/docfragment/82729634/...,Take notes as you read a judgment using ourVir...
1,Independent Sugar Corporation Limited vs Giris...,https://indiankanoon.org/docfragment/117249167...,Take notes as you read a judgment using ourVir...
2,Piramal Capital And Housing Finance ... vs 63 ...,https://indiankanoon.org/docfragment/190999006...,Take notes as you read a judgment using ourVir...
3,In Re Recruitment Of Visually Impaired ... vs ...,https://indiankanoon.org/docfragment/158218833...,Take notes as you read a judgment using ourVir...
4,Union Of India vs Future Gaming Solutions P.Lt...,https://indiankanoon.org/docfragment/117744026...,Take notes as you read a judgment using ourVir...


In [3]:
# Whitespace-delimited word count per case; str() keeps non-string cells
# (e.g. missing values) from raising.
case_word_counts = df['Case Content'].map(lambda content: len(str(content).split()))

print(f"üìä Dataset loaded: {len(df)} cases")
print(f"üìè Average case length: {case_word_counts.mean():.0f} words")

üìä Dataset loaded: 710 cases
üìè Average case length: 7186 words


In [None]:
# ============================================================================
# STEP 3: TEXT PREPROCESSING FOR LEGAL DOCUMENTS
# ============================================================================

def preprocess_legal_text(text):
    """
    Clean and preprocess legal case text.

    This function must stay defined: extract_key_sentences() calls it, so
    leaving it commented out breaks the notebook on a fresh kernel.

    Args:
        text: Raw case content. Anything that is not a ``str`` (for example a
            missing value read from the CSV) is treated as empty.

    Returns:
        The cleaned text with surrounding whitespace stripped, or "" when
        ``text`` is not a string. Removing citations can leave doubled spaces
        behind, because whitespace is collapsed before the removals run.
    """
    if not isinstance(text, str):
        return ""

    # Collapse runs of blank lines to a single blank line, and runs of
    # spaces/tabs to a single space.
    text = re.sub(r'\n\s*\n', '\n\n', text)
    text = re.sub(r'[ \t]+', ' ', text)

    # Drop any (...) or [...] group that contains four consecutive digits,
    # e.g. "(2020)" or "[AIR 2001 SC 5]".
    text = re.sub(r'\([^)]*\d{4}[^)]*\)', '', text)
    text = re.sub(r'\[[^\]]*\d{4}[^\]]*\]', '', text)

    # Drop bare reporter references such as "AIR 1999", "SCC 12", "CrLJ 345".
    text = re.sub(r'AIR\s+\d{4}', '', text)
    text = re.sub(r'SCC\s+\d+', '', text)
    text = re.sub(r'CrLJ\s+\d+', '', text)

    # Normalise the party separator: a standalone "v" or "vs" (any case)
    # becomes " vs ". Dotted forms ("v.", "vs.") are not matched.
    text = re.sub(r'\s+vs?\s+', ' vs ', text, flags=re.IGNORECASE)

    return text.strip()

print("‚úÖ Preprocessing functions ready!")

‚úÖ Preprocessing functions ready!


In [None]:
# ============================================================================
# STEP 4: EXTRACTIVE SUMMARIZATION USING LEXRANK
# ============================================================================

def _lexrank_sentence_budget(word_count):
    """Return how many sentences LexRank should keep for a document of ``word_count`` words."""
    if word_count > 50000:
        return 200
    if word_count > 20000:
        return 150
    if word_count > 10000:
        return 100
    if word_count > 5000:
        # Was 700, which broke the otherwise increasing schedule and kept
        # essentially every sentence of a 5k-10k word case.
        return 70
    if word_count > 2000:
        return 50
    return 30


def extract_key_sentences(text, sentence_count=15):
    """
    Select the most central sentences of a legal case with LexRank.

    Args:
        text: Raw case content; it is cleaned with preprocess_legal_text()
            before ranking.
        sentence_count: Retained so existing call sites stay valid. The value
            is superseded by a budget derived from the cleaned document's
            word count (30 to 200 sentences).

    Returns:
        The selected sentences joined by single spaces, in document order.
        Documents shorter than 100 words are returned cleaned but otherwise
        unchanged. If preprocessing or ranking raises, the error is printed
        and the first 2000 words of the best text available are returned.
    """
    # Bind a usable value before entering the try block so the fallback below
    # can never hit an unbound name, even when preprocessing itself fails.
    clean_text = text if isinstance(text, str) else ""

    try:
        clean_text = preprocess_legal_text(text)

        # LexRank needs a reasonable amount of content to rank anything.
        word_count = len(clean_text.split())
        if word_count < 100:
            return clean_text

        sentence_count = _lexrank_sentence_budget(word_count)

        parser = PlaintextParser.from_string(clean_text, Tokenizer("english"))

        summarizer = LexRankSummarizer(Stemmer("english"))
        summarizer.stop_words = get_stop_words("english")

        # sumy returns the chosen sentences already ordered as they appear in
        # the document, so no re-sorting is needed. Re-locating them with
        # str.find() would push unmatched sentences (-1) to the front and give
        # repeated sentences the same position.
        summary_sentences = summarizer(parser.document, sentence_count)

        return " ".join(str(sentence) for sentence in summary_sentences)

    except Exception as e:
        print(f"‚ö†Ô∏è LexRank extraction error: {e}")
        # Fallback: return first 2000 words
        return " ".join(clean_text.split()[:2000])

print("‚úÖ LexRank extractive summarizer ready!")

‚úÖ LexRank extractive summarizer ready!


In [6]:
# ============================================================================
# STEP 5: ABSTRACTIVE SUMMARIZATION USING BART
# ============================================================================

# Load the BART checkpoint (the first run downloads ~1.6GB of weights).
print("\nüì• Loading BART model...")

# Prefer the GPU when one is visible to torch; otherwise stay on the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
print(f"üñ•Ô∏è Using device: {device}")

bart_tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
bart_model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
bart_model = bart_model.to(device)

print("‚úÖ BART model loaded successfully!")

def generate_abstractive_summary(text, max_length=700, min_length=200):
    """
    Summarise ``text`` with the module-level BART model and tokenizer.

    Args:
        text: Text to summarise. The tokenised input is truncated to 1024
            tokens before generation.
        max_length: Maximum length passed to ``generate``.
        min_length: Minimum length passed to ``generate``.

    Returns:
        The decoded summary with special tokens removed. Returns "" when
        ``text`` is an empty or whitespace-only string, and the literal
        "Summary generation failed." if tokenisation or generation raises
        (the error is printed).
    """
    # Nothing to summarise: without this guard, min_length would still force
    # the model to emit text for an empty input.
    if isinstance(text, str) and not text.strip():
        return ""

    try:
        inputs = bart_tokenizer(
            text,
            max_length=1024,
            truncation=True,
            return_tensors="pt"
        ).to(device)

        with torch.no_grad():
            summary_ids = bart_model.generate(
                inputs.input_ids,
                # Forward the mask the tokenizer produced rather than letting
                # generate() infer one.
                attention_mask=inputs.attention_mask,
                max_length=max_length,
                min_length=min_length,
                length_penalty=2.0,
                num_beams=4,
                early_stopping=True,
                no_repeat_ngram_size=3
            )

        return bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    except Exception as e:
        print(f"‚ö†Ô∏è BART generation error: {e}")
        return "Summary generation failed."

print("‚úÖ BART abstractive summarizer ready!")


üì• Loading BART model...
üñ•Ô∏è Using device: cpu
‚úÖ BART model loaded successfully!
‚úÖ BART abstractive summarizer ready!


In [7]:
# ============================================================================
# STEP 6: COMPLETE PIPELINE - HYBRID SUMMARIZATION
# ============================================================================

def summarize_legal_case_hybrid(case_text, max_summary_length=500):
    """
    Complete hybrid summarization pipeline:
    1. Extract key sentences using LexRank (extractive)
    2. Generate clean summary using BART (abstractive)

    Args:
        case_text: Full legal case content
        max_summary_length: Upper bound on the generated summary, forwarded
            to BART as ``max_length``. It was previously accepted but
            ignored in favour of a hard-coded 700.

    Returns:
        Final summary
    """

    # Step 1: Extract important content using LexRank
    print("  üîç Extracting key sentences with LexRank...")
    extracted_content = extract_key_sentences(case_text)

    # Keep the existing 300 floor whenever it fits under the requested cap;
    # for smaller caps use half the cap so min_length never exceeds max_length.
    if max_summary_length > 300:
        min_summary_length = 300
    else:
        min_summary_length = max(1, max_summary_length // 2)

    # Step 2: Generate clean summary using BART
    print("  ‚úçÔ∏è Generating summary with BART...")
    final_summary = generate_abstractive_summary(
        extracted_content,
        max_length=max_summary_length,
        min_length=min_summary_length
    )

    return final_summary

print("‚úÖ Complete hybrid pipeline ready!")

‚úÖ Complete hybrid pipeline ready!


In [8]:
# ============================================================================
# STEP 7: TEST ON SAMPLE CASES
# ============================================================================

print("\nüß™ TESTING ON SAMPLE CASES\n")

# Fixed random_state so the same five cases are drawn on every run.
sample_df = df.sample(n=5, random_state=42).reset_index(drop=True)

summaries = []

# Walk the title and content columns in lockstep; idx is the 0-based row number.
title_content_pairs = zip(sample_df['Case Title'], sample_df['Case Content'])
for idx, (case_title, case_text) in enumerate(title_content_pairs):
    print(f"\nüìÑ Case {idx+1}: {case_title[:60]}...")
    print(f"   Original length: {len(str(case_text).split())} words")

    summary = summarize_legal_case_hybrid(case_text, max_summary_length=400)
    summaries.append(summary)

    print(f"   Summary length: {len(summary.split())} words")
    print(f"   ‚úÖ Summary: {summary[:200]}...")

# Attach the generated summaries as a new column, one per sampled case.
sample_df['Summary'] = summaries

# Display results
print("üìä SAMPLE RESULTS")
print(sample_df[['Case Title', 'Summary']].head())

# Save sample results
sample_df.to_csv('sample_cases_summarized_hybrid.csv', index=False)
print("\nüíæ Sample results saved to 'sample_cases_summarized_hybrid.csv'")

print("‚úÖ TESTING COMPLETE!")


üß™ TESTING ON SAMPLE CASES


üìÑ Case 1: Zahoor Ahmad Rather vs Sheikh Imtiyaz Ahmad on 5 December, 2...
   Original length: 2917 words
  üîç Extracting key sentences with LexRank...
  ‚úçÔ∏è Generating summary with BART...
   Summary length: 244 words
   ‚úÖ Summary: State Service Selection Board (SSSB) for filling up the posts of Technician- allowed the writ petitions on the ground that it was not open to the SSSB to noted that a candidate possessing a Diploma ‚Äì ...

üìÑ Case 2: The State Of Bihar vs Devendra Sharma on 17 October, 2019...
   Original length: 4757 words
  üîç Extracting key sentences with LexRank...
  ‚úçÔ∏è Generating summary with BART...
   Summary length: 245 words
   ‚úÖ Summary: Case, the employees were put in following three categories: 5 for short, ‚ÄòState Committee‚Äô7October 6, 2009 whereby, the report submitted by three. The issue of any procedural irregularity for a findin...

üìÑ Case 3: Kamal Singh vs State Of Haryana on 29 July, 2010...
   Ori