<a href="https://colab.research.google.com/github/2303A51553/Natural-language-process/blob/main/2303A51553_Legal_document_Summarisation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install tensorflow nltk pandas numpy sacrebleu


Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.2.0 sacrebleu-2.5.1


In [2]:
import re
import string
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = str(text).lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [w for w in tokens if w not in stop_words]
    return " ".join(tokens)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
nltk.download('punkt_tab')
df = pd.read_csv("/content/Legal_Summarisation_100_Final.csv")

df["clean_doc"] = df["document"].apply(preprocess_text)
df["clean_sum"] = df["summary"].apply(preprocess_text)


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [5]:
BOS = "[BOS]"
EOS = "[EOS]"

df["clean_sum"] = df["clean_sum"].apply(lambda x: f"{BOS} {x} {EOS}")


In [6]:
max_vocab = 25000
max_enc_len = 256
max_dec_len = 64

vectorizer = layers.TextVectorization(
    max_tokens=max_vocab,
    output_mode="int",
    output_sequence_length=max_dec_len,
    standardize=None
)

# Adapt vocabulary
vectorizer.adapt(df["clean_doc"].tolist() + df["clean_sum"].tolist())

# Get vocab
vocab = vectorizer.get_vocabulary()
vocab_size = len(vocab)

# Token IDs
BOS_ID = vocab.index(BOS)
EOS_ID = vocab.index(EOS)
PAD_ID = 0


In [7]:
enc = vectorizer(df["clean_doc"].tolist()).numpy()
dec = vectorizer(df["clean_sum"].tolist()).numpy()

# Decoder input starts with BOS
dec_in = np.concatenate([np.full((dec.shape[0],1), BOS_ID), dec[:,:-1]], axis=1)

# Decoder target is full summary
dec_out = dec


In [8]:
class PositionalEmbedding(layers.Layer):
    def __init__(self, vocab_size, embed_dim, max_len):
        super().__init__()
        self.token_emb=layers.Embedding(vocab_size, embed_dim, mask_zero=True)
        self.pos_emb=layers.Embedding(max_len, embed_dim)

    def call(self, x):
        positions=tf.range(start=0, limit=tf.shape(x)[1])
        pos_embeddings=self.pos_emb(positions)
        tok_embeddings=self.token_emb(x)
        return tok_embeddings+pos_embeddings


In [9]:
def bert_encoder_layer(embed_dim, num_heads, ff_dim):
    inp=layers.Input(shape=(None, embed_dim))
    attn=layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)(inp, inp)
    out1=layers.LayerNormalization()(inp + attn)

    ffn=layers.Dense(ff_dim, activation="relu")(out1)
    ffn=layers.Dense(embed_dim)(ffn)
    out2=layers.LayerNormalization()(out1 + ffn)

    return keras.Model(inp, out2)


In [10]:
def bert_decoder_layer(embed_dim, num_heads, ff_dim):
    dec_in=layers.Input(shape=(None, embed_dim))
    enc_out=layers.Input(shape=(None, embed_dim))

    # Masked self-attention
    attn1=layers.MultiHeadAttention(
        num_heads=num_heads, key_dim=embed_dim, use_causal_mask=True
    )(dec_in, dec_in)
    out1=layers.LayerNormalization()(dec_in + attn1)

    # Cross-attention
    attn2=layers.MultiHeadAttention(
        num_heads=num_heads, key_dim=embed_dim
    )(out1, enc_out)
    out2=layers.LayerNormalization()(out1 + attn2)

    # Feed-forward
    ffn=layers.Dense(ff_dim, activation="relu")(out2)
    ffn=layers.Dense(embed_dim)(ffn)
    out3=layers.LayerNormalization()(out2 + ffn)

    return keras.Model([dec_in, enc_out],out3)


In [21]:
embed_dim=256
ff_dim=512
num_heads=8
num_layers=4

def bert_decoder_layer(embed_dim, num_heads, ff_dim):
    dec_in=layers.Input(shape=(None, embed_dim))
    enc_out=layers.Input(shape=(None, embed_dim))

    # Instantiate MultiHeadAttention layers
    mha_self_attn = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
    mha_cross_attn = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)

    # Masked self-attention: pass use_causal_mask=True to the call method
    attn1=mha_self_attn(query=dec_in, value=dec_in, use_causal_mask=True)
    out1=layers.LayerNormalization()(dec_in + attn1)

    # Cross-attention
    attn2=mha_cross_attn(query=out1, value=enc_out, key=enc_out)
    out2=layers.LayerNormalization()(out1 + attn2)

    # Feed-forward
    ffn=layers.Dense(ff_dim, activation="relu")(out2)
    ffn=layers.Dense(embed_dim)(ffn)
    out3=layers.LayerNormalization()(out2 + ffn)

    return keras.Model([dec_in, enc_out],out3)

# Encoder
enc_input = keras.Input(shape=(None,), dtype="int32")
enc_emb = PositionalEmbedding(vocab_size, embed_dim, max_enc_len)(enc_input)
encoder = enc_emb
for _ in range(num_layers):
    encoder = bert_encoder_layer(embed_dim, num_heads, ff_dim)(encoder)

# Decoder
dec_input=keras.Input(shape=(None,),dtype="int32")
dec_emb=PositionalEmbedding(vocab_size,embed_dim, max_dec_len)(dec_input)
decoder=dec_emb
for _ in range(num_layers):
    decoder=bert_decoder_layer(embed_dim, num_heads,ff_dim)([decoder,encoder])

# Output
final_output=layers.Dense(vocab_size)(decoder)

model=keras.Model([enc_input,dec_input],final_output)
model.summary()

In [17]:
loss_fn=keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model.compile(optimizer="adam",loss=loss_fn)

model.fit([enc, dec_in],dec_out,batch_size=4,epochs=6)


Epoch 1/6
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 2s/step - loss: 2.6583
Epoch 2/6
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 2s/step - loss: 0.8239
Epoch 3/6
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 3s/step - loss: 0.8138
Epoch 4/6
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 3s/step - loss: 0.8160
Epoch 5/6
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 3s/step - loss: 0.8068
Epoch 6/6
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 3s/step - loss: 0.8049


<keras.src.callbacks.history.History at 0x79b48fe70ec0>

In [18]:
id_to_word={i:w for i,w in enumerate(vocab)}

def generate(text, max_len=60):
    e=vectorizer([text]).numpy()
    d=np.zeros((1, max_len), dtype=int)
    d[0,0]=BOS_ID

    for i in range(1, max_len):
        preds=model.predict([e, d[:,:i]],verbose=0)
        next_id=np.argmax(preds[0, i-1])

        if next_id==EOS_ID:
            break

        d[0,i]=next_id

    tokens=[id_to_word[t] for t in d[0] if t not in [PAD_ID, BOS_ID, EOS_ID]]
    return " ".join(tokens)


In [22]:
import sacrebleu

preds=[]
refs=[]

for i in range(30):
    p=generate(df["clean_doc"].iloc[i])
    preds.append(p)
    refs.append(df["clean_sum"].iloc[i])

bleu=sacrebleu.corpus_bleu(preds, [refs])
print("BLEU Score:", bleu.score)


BLEU Score: 0.046290446176410774
