<a href="https://colab.research.google.com/github/2303A51553/Natural-language-process/blob/main/legal_document_summarisation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [30]:
!pip install tensorflow nltk numpy pandas sacrebleu





In [31]:
import re, string
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = str(text).lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [w for w in tokens if w not in stop_words]
    return " ".join(tokens)

df = pd.read_csv("/content/Legal_Summarisation_100_Final (1).csv")
df = df.dropna(subset=['document','summary']).reset_index(drop=True)
df['cleaned_document'] = df['document'].apply(preprocess_text)
df['cleaned_summary']  = df['summary'].apply(preprocess_text)
print("Examples after cleaning:")
print(df[['cleaned_document','cleaned_summary']].head(2).to_string(index=False))


Examples after cleaning:
                                                                                                                                                                                                                                            cleaned_document                                          cleaned_summary
                        matter concerns constitutional validity law petitioner argues violation rights articles 14 21 respondent defends validity action public interest high court decision challenged supreme court hearing sides court delivered judgment      supreme court rules constitutional validity statute
matter concerns public interest litigation environmental protection petitioner argues violation rights articles 14 21 respondent defends validity action public interest high court decision challenged supreme court hearing sides court delivered judgment court directs stronger measures environmental protection


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [32]:
max_vocab_size=6000
max_enc_len=256
max_dec_len=64

vectorizer = layers.TextVectorization(
    max_tokens=max_vocab_size,
    output_mode='int',
    output_sequence_length=max_enc_len,
    standardize=None
)

vectorizer.adapt(np.concatenate([df['cleaned_document'].values, df['cleaned_summary'].values]))

vocab = vectorizer.get_vocabulary()
vocab_size = len(vocab)
print("Vocab size:", vocab_size)


Vocab size: 90


In [33]:
pad_id = 0
def encode_input(texts):
    return vectorizer(tf.constant(texts)).numpy()





In [34]:
def prepare_decoder_sequences(texts):
    token_ids = vectorizer(tf.constant(texts)).numpy()
    return token_ids

In [35]:
class PositionalEmbedding(layers.Layer):
    def __init__(self, vocab_size, embed_dim, maxlen):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim, mask_zero=True)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)
        self.maxlen = maxlen
        self.embed_dim = embed_dim

    def call(self, x):
        # x: (batch, seq)
        positions = tf.range(start=0, limit=tf.shape(x)[1], delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions  # (batch, seq, embed_dim)

def transformer_encoder_layer(embed_dim, ff_dim, num_heads, dropout_rate=0.1):
    inputs = layers.Input(shape=(None, embed_dim))
    # Self-attention
    attn = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim//num_heads)(inputs, inputs)
    attn = layers.Dropout(dropout_rate)(attn)
    out1 = layers.LayerNormalization(epsilon=1e-6)(inputs + attn)
    # Feed-forward
    ff = layers.Dense(ff_dim, activation='relu')(out1)
    ff = layers.Dense(embed_dim)(ff)
    ff = layers.Dropout(dropout_rate)(ff)
    out2 = layers.LayerNormalization(epsilon=1e-6)(out1 + ff)
    return keras.Model(inputs=inputs, outputs=out2) # Removed name argument

def transformer_decoder_layer(embed_dim, ff_dim, num_heads, dropout_rate=0.1):
    dec_inputs = layers.Input(shape=(None, embed_dim))      # decoder embeddings
    enc_outputs = layers.Input(shape=(None, embed_dim))     # encoder outputs
    # Self-attention with causal mask handled in call
    attn1 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim//num_heads)(dec_inputs, dec_inputs, use_causal_mask=True)
    attn1 = layers.Dropout(dropout_rate)(attn1)
    out1 = layers.LayerNormalization(epsilon=1e-6)(dec_inputs + attn1)
    # Cross-attention
    attn2 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim//num_heads)(out1, enc_outputs)
    attn2 = layers.Dropout(dropout_rate)(attn2)
    out2 = layers.LayerNormalization(epsilon=1e-6)(out1 + attn2)
    # Feed-forward
    ff = layers.Dense(ff_dim, activation='relu')(out2)
    ff = layers.Dense(embed_dim)(ff)
    ff = layers.Dropout(dropout_rate)(ff)
    out3 = layers.LayerNormalization(epsilon=1e-6)(out2 + ff)
    return keras.Model([dec_inputs, enc_outputs], out3) # Removed name argument

In [36]:
embed_dim = 256
ff_dim = 512
num_heads = 8
num_layers = 4

# Encoder
enc_inputs = keras.Input(shape=(None,), dtype="int64")
enc_emb = PositionalEmbedding(vocab_size, embed_dim, max_enc_len)(enc_inputs)
encoder_output = enc_emb
for _ in range(num_layers):
    encoder_output = transformer_encoder_layer(embed_dim, ff_dim, num_heads, dropout_rate)(encoder_output)

# Decoder
dec_inputs = keras.Input(shape=(max_dec_len,), dtype="int64") # Fixed: Explicitly set max_dec_len
dec_emb = PositionalEmbedding(vocab_size, embed_dim, max_dec_len)(dec_inputs)
decoder_output = dec_emb
for _ in range(num_layers):
    decoder_output = transformer_decoder_layer(embed_dim, ff_dim, num_heads, dropout_rate)([decoder_output, encoder_output])

logits = layers.Dense(vocab_size)(decoder_output)  # (batch, seq_dec, vocab_size)

model = keras.Model([enc_inputs, dec_inputs], logits, name="bert_like_seq2seq")
model.summary()

# ---------------------------
# Loss, optimizer, compile
# ---------------------------
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def masked_loss(y_true, y_pred):
    # y_true: (batch, seq)
    loss = loss_object(y_true, y_pred)
    mask = tf.cast(tf.not_equal(y_true, pad_id), dtype=loss.dtype)
    loss = loss * mask
    return tf.reduce_sum(loss) / (tf.reduce_sum(mask) + 1e-6)

metrics = [tf.keras.metrics.SparseCategoricalAccuracy()]

model.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-4), loss=masked_loss, metrics=metrics)

In [37]:
epochs = 6
model.fit(train_ds, validation_data=val_ds, epochs=epochs)

Epoch 1/6
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 2s/step - loss: 4.4007 - sparse_categorical_accuracy: 0.0100 - val_loss: 2.7807 - val_sparse_categorical_accuracy: 0.0302
Epoch 2/6
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 2s/step - loss: 1.9209 - sparse_categorical_accuracy: 0.0530 - val_loss: 1.4551 - val_sparse_categorical_accuracy: 0.0667
Epoch 3/6
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2s/step - loss: 0.9775 - sparse_categorical_accuracy: 0.0756 - val_loss: 0.7952 - val_sparse_categorical_accuracy: 0.0760
Epoch 4/6
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 2s/step - loss: 0.6268 - sparse_categorical_accuracy: 0.0794 - val_loss: 0.5763 - val_sparse_categorical_accuracy: 0.0750
Epoch 5/6
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 2s/step - loss: 0.5470 - sparse_categorical_accuracy: 0.0760 - val_loss: 0.5297 - val_sparse_categorical_accuracy: 0.0760
Epoch 6/6
[1m1

<keras.src.callbacks.history.History at 0x7beb426815e0>

In [38]:
id_to_word = {i:w for i,w in enumerate(vocab)}
word_to_id = {w:i for i,w in enumerate(vocab)}

def greedy_decode(input_text, max_dec_steps=60):
    enc_seq = encode_input([input_text])  # shape (1, max_enc_len)
    # start token is pad_id (0) as used earlier
    dec_seq = np.full((1, max_dec_len), pad_id, dtype=np.int32)
    # initial decoder input: start token at position 0
    dec_seq[0,0] = pad_id
    for i in range(1, max_dec_steps):
        # prepare truncated input for speed: feed upto i positions
        dec_input_slice = dec_seq[:, :i]
        preds = model.predict([enc_seq, dec_input_slice], verbose=0)  # (1, i, vocab)
        next_token_logits = preds[0, -1, :]  # last timestep logits
        next_id = int(np.argmax(next_token_logits))
        dec_seq[0, i] = next_id
        # stop if we predict padding (used as pseudo-SEP) or EOS equivalent; no explicit SEP here
        if next_id == pad_id:
            break
    # convert dec_seq tokens back to words, skip pads at beginning
    tokens = []
    for id in dec_seq[0]:
        if id == pad_id:
            continue
        tokens.append(id_to_word.get(int(id), ""))
    return " ".join([t for t in tokens if t])

In [39]:
refs=[]
preds=[]
num_eval=min(100, len(val_df))
for idx in range(num_eval):
    src=val_df['cleaned_document'].iloc[idx]
    tgt=val_df['cleaned_summary'].iloc[idx]
    pred_text=greedy_decode(src, max_dec_steps=60)

    refs.append([tgt.split()])
    preds.append(pred_text.split())

smooth=SmoothingFunction().method1
bleu_score=corpus_bleu(refs, preds,smoothing_function=smooth)
print("BLEU (corpus) on validation subset:",bleu_score)

BLEU (corpus) on validation subset: 0.017894467682407687
