<a href="https://colab.research.google.com/github/2303A51553/Natural-language-process/blob/main/legal_document_summarisation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the third-party packages this notebook uses (shell command run via IPython's `!` escape).
!pip install tensorflow nltk numpy pandas sacrebleu





In [None]:
import re
import string
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Fetch the NLTK resources used below (tokenizer data for word_tokenize and the stop-word lists).
for _nltk_resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(_nltk_resource)

# English stop words, held in a set so the membership test in preprocess_text is cheap.
stop_words = set(stopwords.words('english'))

# Raw dataset; later cells read its "document" and "summary" columns.
df = pd.read_csv("/content/Legal_Summarisation_100_Final.csv")

def preprocess_text(text):
    """Normalise one raw text cell into a space-separated string of tokens.

    Processing steps: lowercase, delete every character that is not an ASCII
    letter, digit or whitespace, tokenise with ``word_tokenize`` and drop the
    tokens found in ``stop_words``.

    Args:
        text: Value taken from a DataFrame cell. Non-string values are
            converted with ``str``; ``None`` and NaN are treated as empty text.

    Returns:
        The remaining tokens joined by single spaces (``""`` when nothing is left).
    """
    # Missing cells arrive as None/NaN; str() would turn them into the bogus
    # tokens "none"/"nan". NaN is the only float that is unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    # A single regex pass is enough: it removes every character in
    # string.punctuation as well as any other non-alphanumeric symbol.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    tokens = [w for w in word_tokenize(text) if w not in stop_words]
    return " ".join(tokens)

# Cleaned document text (encoder side).
df["clean_doc"] = df["document"].map(preprocess_text)

# Cleaned summaries (decoder side), wrapped in literal "bos"/"eos" marker words
# so the sequence start and end are ordinary vocabulary tokens.
df["clean_sum"] = [f"bos {cleaned} eos" for cleaned in df["summary"].map(preprocess_text)]



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
# Vectorisation settings.
max_vocab = 6000    # max_tokens for both vectorizers
max_enc_len = 256   # fixed output length of the encoder vectorizer
max_dec_len = 64    # fixed output length of the decoder vectorizer


def _build_vectorizer(sequence_length):
    """Return an un-adapted integer TextVectorization layer with the given output length.

    standardize=None: the text was already cleaned by preprocess_text, so the
    layer only splits on whitespace and maps tokens to ids.
    """
    return layers.TextVectorization(
        max_tokens=max_vocab,
        output_mode="int",
        output_sequence_length=sequence_length,
        standardize=None,
        split="whitespace",
    )


encoder_vectorizer = _build_vectorizer(max_enc_len)
decoder_vectorizer = _build_vectorizer(max_dec_len)

# Both layers are adapted on documents + summaries, so they are fitted on the same corpus.
combined_text = df["clean_doc"].tolist() + df["clean_sum"].tolist()
for _vectorizer in (encoder_vectorizer, decoder_vectorizer):
    _vectorizer.adapt(combined_text)

# Vocabulary lookup tables, taken from the encoder vectorizer.
vocab = encoder_vectorizer.get_vocabulary()
vocab_size = len(vocab)
word_to_id = dict(zip(vocab, range(vocab_size)))

print("bos id:", word_to_id.get("bos"))
print("eos id:", word_to_id.get("eos"))
print("First 50 vocab items:", vocab[:50])

bos id: 22
eos id: 16
First 50 vocab items: ['', '[UNK]', np.str_('court'), np.str_('public'), np.str_('interest'), np.str_('validity'), np.str_('supreme'), np.str_('violation'), np.str_('sides'), np.str_('rights'), np.str_('respondent'), np.str_('petitioner'), np.str_('matter'), np.str_('judgment'), np.str_('high'), np.str_('hearing'), np.str_('eos'), np.str_('delivered'), np.str_('defends'), np.str_('decision'), np.str_('concerns'), np.str_('challenged'), np.str_('bos'), np.str_('articles'), np.str_('argues'), np.str_('action'), np.str_('21'), np.str_('14'), np.str_('case'), np.str_('protection'), np.str_('environmental'), np.str_('appeal'), np.str_('upheld'), np.str_('dispute'), np.str_('narcotics'), np.str_('bail'), np.str_('constitutional'), np.str_('act'), np.str_('conviction'), np.str_('land'), np.str_('compensation'), np.str_('acquisition'), np.str_('preventive'), np.str_('detention'), np.str_('custody'), np.str_('child'), np.str_('cheque'), np.str_('bounce'), np.str_('stronger

In [None]:
# Integer id matrices for the encoder (documents) and decoder (summaries).
enc = encoder_vectorizer(df["clean_doc"].tolist()).numpy()
dec = decoder_vectorizer(df["clean_sum"].tolist()).numpy()

bos_id = word_to_id["bos"]

# Teacher forcing: the decoder input is the target shifted right by one step.
# Every clean_sum string already starts with "bos", so the shift is a pair of
# slices. Prepending a second bos would make the model learn "bos -> bos" and
# waste one decoder position.
dec_in = dec[:, :-1]   # bos w1 w2 ... (last position dropped)
dec_out = dec[:, 1:]   # w1 w2 ... eos (first position dropped)


print(dec_in[0][:20])
print(dec_out[0][:20])

# Scratch inputs for a manual single-example decode: the first document's ids
# and a decoder buffer holding only the bos token.
text = df["clean_doc"].iloc[0]
e = encoder_vectorizer([text]).numpy()

d = np.zeros((1, max_dec_len), dtype=int)
d[0, 0] = bos_id



[22 22  6  2 55 36  5 84 16  0  0  0  0  0  0  0  0  0  0  0]
[22  6  2 55 36  5 84 16  0  0  0  0  0  0  0  0  0  0  0  0]


In [None]:
class PositionalEmbedding(layers.Layer):
    """Sum of a token embedding and a learned position embedding.

    Args:
        vocab_size: Number of rows in the token embedding table.
        embed_dim: Width of both embedding tables.
        max_len: Number of rows in the position table; inputs must not be
            longer than this, because positions 0..seq_len-1 are looked up.
        **kwargs: Forwarded to ``layers.Layer`` (e.g. ``name``).
    """

    def __init__(self, vocab_size, embed_dim, max_len, **kwargs):
        super().__init__(**kwargs)
        # Kept so get_config() can describe how to rebuild the layer.
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.max_len = max_len
        self.token_emb = layers.Embedding(vocab_size, embed_dim, mask_zero=True)
        self.pos_emb = layers.Embedding(max_len, embed_dim)
        # This layer emits a mask of its own (see compute_mask).
        self.supports_masking = True

    def call(self, x):
        """Embed integer token ids ``x`` (batch, seq_len) and add position embeddings."""
        seq_len = tf.shape(x)[1]
        positions = tf.range(start=0, limit=seq_len)
        pos_embeddings = self.pos_emb(positions)
        tok_embeddings = self.token_emb(x)
        return tok_embeddings + pos_embeddings

    def compute_mask(self, inputs, mask=None):
        """Expose the token embedding's padding mask for this layer's output.

        The mask that ``mask_zero=True`` attaches to the token embeddings is
        dropped by the ``+`` in ``call``; without this override downstream
        layers never see which positions are padding (id 0).
        """
        return self.token_emb.compute_mask(inputs, mask)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialised."""
        config = super().get_config()
        config.update(
            {
                "vocab_size": self.vocab_size,
                "embed_dim": self.embed_dim,
                "max_len": self.max_len,
            }
        )
        return config


In [None]:
def bert_encoder_layer(embed_dim, num_heads, ff_dim):
    """Build one encoder block as a standalone Keras model.

    The block applies multi-head self-attention and a two-layer feed-forward
    network; each sub-layer is followed by a residual add and layer
    normalisation.

    Args:
        embed_dim: Feature width of the input and output sequences (also used
            as ``key_dim`` of the attention layer).
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.

    Returns:
        A ``keras.Model`` mapping a (batch, seq, embed_dim) tensor to a tensor
        of the same shape.
    """
    sequence = layers.Input(shape=(None, embed_dim))

    # Self-attention sub-layer: the sequence attends to itself.
    self_attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
    attended = self_attention(sequence, sequence)
    attended_norm = layers.LayerNormalization()(sequence + attended)

    # Position-wise feed-forward sub-layer: expand to ff_dim, project back.
    hidden = layers.Dense(ff_dim, activation="relu")(attended_norm)
    projected = layers.Dense(embed_dim)(hidden)
    block_output = layers.LayerNormalization()(attended_norm + projected)

    return keras.Model(inputs=sequence, outputs=block_output)


In [None]:
def bert_decoder_layer(embed_dim, num_heads, ff_dim):
    """Build one decoder block as a standalone Keras model.

    Sub-layers, each followed by a residual add and layer normalisation:
    causal self-attention over the target sequence, attention over the encoder
    output, and a two-layer feed-forward network.

    Args:
        embed_dim: Feature width of both inputs and of the output (also used
            as ``key_dim`` of the attention layers).
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.

    Returns:
        A ``keras.Model`` taking ``[target_sequence, encoder_output]`` and
        returning a tensor shaped like ``target_sequence``.
    """
    target_seq = layers.Input(shape=(None, embed_dim))
    memory = layers.Input(shape=(None, embed_dim))

    # Causal self-attention: a position may only attend to itself and earlier positions.
    causal_attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
    self_attended = causal_attention(target_seq, target_seq, use_causal_mask=True)
    self_norm = layers.LayerNormalization()(target_seq + self_attended)

    # Cross-attention: target positions query the encoder output.
    cross_attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
    cross_attended = cross_attention(self_norm, memory)
    cross_norm = layers.LayerNormalization()(self_norm + cross_attended)

    # Position-wise feed-forward sub-layer.
    hidden = layers.Dense(ff_dim, activation="relu")(cross_norm)
    projected = layers.Dense(embed_dim)(hidden)
    block_output = layers.LayerNormalization()(cross_norm + projected)

    return keras.Model(inputs=[target_seq, memory], outputs=block_output)


In [None]:
# Model hyper-parameters.
embed_dim, ff_dim = 256, 512
num_heads, num_layers = 8, 4

# Encoder: token ids -> embeddings -> stack of num_layers encoder blocks.
enc_input = keras.Input(shape=(None,), dtype="int32")
enc_emb = PositionalEmbedding(vocab_size, embed_dim, max_enc_len)(enc_input)
encoder = enc_emb
for _block_index in range(num_layers):
    encoder_block = bert_encoder_layer(embed_dim, num_heads, ff_dim)
    encoder = encoder_block(encoder)

# Decoder: token ids -> embeddings -> stack of num_layers decoder blocks,
# each of which also receives the final encoder output.
dec_input = keras.Input(shape=(None,), dtype="int32")
dec_emb = PositionalEmbedding(vocab_size, embed_dim, max_dec_len)(dec_input)
decoder = dec_emb
for _block_index in range(num_layers):
    decoder_block = bert_decoder_layer(embed_dim, num_heads, ff_dim)
    decoder = decoder_block([decoder, encoder])

# Per-position scores over the vocabulary (no activation, i.e. logits).
final_output = layers.Dense(vocab_size)(decoder)

model = keras.Model(inputs=[enc_input, dec_input], outputs=final_output)
model.summary()


In [None]:
# Integer id matrices used for training.
enc = encoder_vectorizer(df["clean_doc"].tolist()).numpy()
dec = decoder_vectorizer(df["clean_sum"].tolist()).numpy()[:, :max_dec_len]

# Teacher forcing: the decoder input is the target shifted right by one step.
# Each clean_sum string already starts with "bos", so slicing is all that is
# needed. Prepending another bos would train the model to predict "bos" after
# "bos" and waste a decoder position.
dec_in = dec[:, :-1]   # bos w1 w2 ... (last position dropped)
dec_out = dec[:, 1:]   # w1 w2 ... eos (first position dropped)

In [None]:
# The model outputs raw logits, hence from_logits=True.
# ignore_class=0 excludes padding targets (id 0 is the '' padding entry of the
# vocabulary) from the loss. Otherwise the mostly-padded 64-step targets
# dominate the average and the model is rewarded mainly for predicting padding.
model.compile(
    optimizer="adam",
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True, ignore_class=0),
)

# Increased epochs from 5 to 20 for better training
model.fit([enc, dec_in], dec_out, epochs=20, batch_size=4)

Epoch 1/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m128s[0m 4s/step - loss: 0.8724
Epoch 2/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 4s/step - loss: 0.8131
Epoch 3/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 4s/step - loss: 0.8084
Epoch 4/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 4s/step - loss: 0.8056
Epoch 5/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 4s/step - loss: 0.8073
Epoch 6/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 4s/step - loss: 0.8206
Epoch 7/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 4s/step - loss: 0.8195
Epoch 8/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 4s/step - loss: 0.8150
Epoch 9/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 4s/step - loss: 0.8194
Epoch 10/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 4s/step - loss: 0.8211
Epo

# Generating summary

In [None]:
# Reverse lookup: token id -> vocabulary word.
id_to_word = {i: w for i, w in enumerate(vocab)}

def generate(text):
    """Greedily decode a summary for one already-preprocessed document.

    Args:
        text: Document string cleaned the same way as the training data (the
            vectorizer applies no standardisation of its own).

    Returns:
        The generated words joined by single spaces, with padding, "bos",
        "eos" and "[UNK]" removed.
    """
    e = encoder_vectorizer([text]).numpy().astype("int32")
    d = np.zeros((1, max_dec_len), dtype="int32")

    # Decoding starts from the bos token.
    d[0, 0] = word_to_id["bos"]

    for i in range(1, max_dec_len):
        # Call the model directly: model.predict() is meant for batch
        # processing and adds per-call overhead inside a token-by-token loop.
        logits = np.asarray(model([e, d[:, :i]], training=False))
        # Greedy choice: highest-scoring token at the last filled position.
        next_id = int(np.argmax(logits[0, i - 1, :]))
        d[0, i] = next_id
        # Stop as soon as the end marker is produced.
        if id_to_word.get(next_id) == "eos":
            break

    # Map ids back to words, skipping special tokens and stopping at "eos".
    words = []
    for token_id in d[0]:
        word = id_to_word.get(int(token_id), "")
        if word == "eos":
            break
        if word not in ("", "bos", "[UNK]"):
            words.append(word)
    return " ".join(words)

# Generate a summary for the first (already cleaned) document.
text = df["clean_doc"].iloc[0]
summary = generate(text)
# Print the model output, not the input document.
print("Generated Summary:\n", summary)

Generated Summary:
 matter concerns constitutional validity law petitioner argues violation rights articles 14 21 respondent defends validity action public interest high court decision challenged supreme court hearing sides court delivered judgment


In [None]:
!pip install evaluate
import evaluate
bleu = evaluate.load("bleu")

preds = [generate(x) for x in df["document"].head(20)]
refs = [[x] for x in df["summary"].head(20)]

bleu_score = bleu.compute(predictions=preds, references=refs)
bleu_score

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

In [None]:
import sacrebleu

# Evaluate on at most the first 30 rows (fewer if the frame is smaller).
n_eval = min(30, len(df))

preds = []
refs  = []

for i in range(n_eval):
    p = generate(df["clean_doc"].iloc[i])
    preds.append(p)
    # clean_sum is "bos <tokens> eos"; generate() strips those markers from its
    # output, so drop the first and last token here to compare like with like.
    refs.append(" ".join(df["clean_sum"].iloc[i].split()[1:-1]))

bleu = sacrebleu.corpus_bleu(preds, [refs])
print("BLEU:", bleu.score)