<a href="https://colab.research.google.com/github/2303A51553/Natural-language-process/blob/main/legal_document_summarisation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tensorflow nltk numpy pandas sacrebleu



In [None]:
import re
import string
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Fetch the NLTK resources used for tokenisation and stop-word filtering.
for resource in ("punkt", "stopwords", "punkt_tab"):
    nltk.download(resource)

# English stop words, kept as a set for the membership test in preprocess_text.
stop_words = set(stopwords.words("english"))

# Dataset with one "document" and one "summary" column per row.
df = pd.read_csv("/content/Legal_Summarisation_100_Final.csv")

def preprocess_text(text):
    """Normalise raw text into a space-separated string of content tokens.

    The input is lower-cased, every character that is not an ASCII letter,
    digit or whitespace is removed, the result is tokenised with NLTK and
    English stop words are dropped.

    Args:
        text: Raw cell value. Non-string values are converted with ``str``;
            ``None`` and float NaN are treated as empty text.

    Returns:
        The surviving tokens joined by single spaces ("" for missing input).
    """
    # Missing cells arrive as None / NaN; str() would otherwise turn them
    # into literal words such as "nan" that end up in the vocabulary.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    # This single pass also removes everything in string.punctuation, so no
    # separate translate() step is needed.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    tokens = word_tokenize(text)
    return " ".join(w for w in tokens if w not in stop_words)

# Cleaned source documents (encoder side).
df["clean_doc"] = df["document"].apply(preprocess_text)

# Cleaned summaries wrapped in "bos"/"eos" marker tokens (decoder side).
df["clean_sum"] = "bos " + df["summary"].apply(preprocess_text) + " eos"

In [None]:
# Vocabulary cap and sequence-length budgets for encoder / decoder.
max_vocab = 6000
max_enc_len = 256
max_dec_len = 64

# Maps text to fixed-length integer id sequences. standardize=None means the
# layer applies no text standardisation of its own.
vectorizer = layers.TextVectorization(
    standardize=None,
    max_tokens=max_vocab,
    output_mode="int",
    output_sequence_length=max_enc_len,
)

# Learn a single vocabulary shared by documents and summaries.
vectorizer.adapt([*df["clean_doc"].tolist(), *df["clean_sum"].tolist()])

vocab = vectorizer.get_vocabulary()
vocab_size = len(vocab)


In [None]:
enc = vectorizer(df["clean_doc"].tolist()).numpy()

# The shared vectorizer pads/truncates everything to max_enc_len, but the
# decoder is built with max_dec_len positions, so cut the summaries to that length.
dec = vectorizer(df["clean_sum"].tolist()).numpy()[:, :max_dec_len]

# Teacher-forcing shift: the decoder input is the target moved one step to the
# right behind a leading 0. The zeros column uses dec's integer dtype so the
# token ids are not promoted to float by the concatenation.
start_col = np.zeros((dec.shape[0], 1), dtype=dec.dtype)
dec_in = np.concatenate([start_col, dec[:, :-1]], axis=1)
dec_out = dec


In [None]:
class PositionalEmbedding(layers.Layer):
    """Sum of a token embedding and a learned position embedding.

    Args:
        vocab_size: Number of rows in the token embedding table.
        embed_dim: Width of both embedding tables.
        max_len: Number of rows in the position embedding table.
    """

    def __init__(self, vocab_size, embed_dim, max_len):
        super().__init__()
        # The token table is created with mask_zero=True (id 0 is the padding id).
        self.token_emb = layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim, mask_zero=True
        )
        self.pos_emb = layers.Embedding(input_dim=max_len, output_dim=embed_dim)

    def call(self, x):
        """Embed the token ids in ``x`` and add the embedding of each position.

        Assumes ``x`` is shaped (batch, seq_len) -- axis 1 is read as the
        sequence length.
        """
        seq_len = tf.shape(x)[1]
        # Position ids 0 .. seq_len-1, looked up once and added to every row.
        position_ids = tf.range(seq_len)
        position_vectors = self.pos_emb(position_ids)
        token_vectors = self.token_emb(x)
        return token_vectors + position_vectors


In [None]:
def bert_encoder_layer(embed_dim, num_heads, ff_dim):
    """Build one encoder block as a standalone Keras model.

    The block applies multi-head self-attention and then a two-layer
    feed-forward network; each sub-layer is followed by a residual addition
    and layer normalisation.

    Args:
        embed_dim: Size of the last axis of the input and output; also used
            as the attention ``key_dim``.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden feed-forward layer.

    Returns:
        A ``keras.Model`` taking a (batch, seq, embed_dim) tensor and
        returning a tensor with the same last dimension.
    """
    block_in = layers.Input(shape=(None, embed_dim))

    # Self-attention: the block input serves as both query and value.
    self_attn = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)(
        block_in, block_in
    )
    attended = layers.LayerNormalization()(block_in + self_attn)

    # Position-wise feed-forward network projecting back to embed_dim.
    hidden = layers.Dense(ff_dim, activation="relu")(attended)
    projected = layers.Dense(embed_dim)(hidden)
    block_out = layers.LayerNormalization()(attended + projected)

    return keras.Model(block_in, block_out)


In [None]:
def bert_decoder_layer(embed_dim, num_heads, ff_dim):
    """Build one decoder block as a standalone Keras model.

    The block applies causally masked self-attention, attention over the
    encoder output, and a two-layer feed-forward network; each sub-layer is
    followed by a residual addition and layer normalisation.

    Args:
        embed_dim: Size of the last axis of both inputs and of the output;
            also used as the attention ``key_dim``.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden feed-forward layer.

    Returns:
        A ``keras.Model`` taking ``[decoder_states, encoder_states]`` and
        returning the updated decoder states.
    """
    target_states = layers.Input(shape=(None, embed_dim))
    memory_states = layers.Input(shape=(None, embed_dim))

    # Self-attention with a causal mask: a position cannot attend to later ones.
    self_attn = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)(
        target_states, target_states, use_causal_mask=True
    )
    after_self = layers.LayerNormalization()(target_states + self_attn)

    # Cross-attention: decoder states query the encoder output.
    cross_attn = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)(
        after_self, memory_states
    )
    after_cross = layers.LayerNormalization()(after_self + cross_attn)

    # Position-wise feed-forward network projecting back to embed_dim.
    hidden = layers.Dense(ff_dim, activation="relu")(after_cross)
    projected = layers.Dense(embed_dim)(hidden)
    block_out = layers.LayerNormalization()(after_cross + projected)

    return keras.Model([target_states, memory_states], block_out)


In [None]:
# Model hyper-parameters.
embed_dim, ff_dim = 256, 512
num_heads, num_layers = 8, 4

# Encoder: token ids -> positional embedding -> stack of encoder blocks.
enc_input = keras.Input(shape=(None,), dtype="int32")
enc_emb = PositionalEmbedding(vocab_size, embed_dim, max_enc_len)(enc_input)
encoder = enc_emb
for _ in range(num_layers):
    encoder = bert_encoder_layer(
        embed_dim=embed_dim, num_heads=num_heads, ff_dim=ff_dim
    )(encoder)

# Decoder: token ids -> positional embedding -> stack of decoder blocks, each
# one also receiving the final encoder output.
dec_input = keras.Input(shape=(None,), dtype="int32")
dec_emb = PositionalEmbedding(vocab_size, embed_dim, max_dec_len)(dec_input)
decoder = dec_emb
for _ in range(num_layers):
    decoder = bert_decoder_layer(
        embed_dim=embed_dim, num_heads=num_heads, ff_dim=ff_dim
    )([decoder, encoder])

# Project every decoder position onto the vocabulary (raw logits, no softmax).
final_output = layers.Dense(vocab_size)(decoder)

model = keras.Model(inputs=[enc_input, dec_input], outputs=final_output)
model.summary()


In [None]:
enc = vectorizer(df["clean_doc"].tolist()).numpy()
# Summaries are cut to the decoder's maximum length.
dec = vectorizer(df["clean_sum"].tolist()).numpy()[:, :max_dec_len]

# Teacher-forcing shift: the decoder input is the target moved one step to the
# right behind a leading 0. The zeros column uses dec's integer dtype so the
# token ids are not promoted to float by the concatenation.
start_col = np.zeros((dec.shape[0], 1), dtype=dec.dtype)
dec_in = np.concatenate([start_col, dec[:, :-1]], axis=1)
dec_out = dec

In [None]:
# The model emits raw logits, hence from_logits=True.
# NOTE(review): no sample weights or ignore_class are passed, so padded target
# positions (id 0) count towards the loss like any other token.
model.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer="adam",
)

model.fit(x=[enc, dec_in], y=dec_out, batch_size=4, epochs=5)

Epoch 1/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m172s[0m 5s/step - loss: 2.8572
Epoch 2/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 4s/step - loss: 0.8334
Epoch 3/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 4s/step - loss: 0.8199
Epoch 4/5
[1m 3/25[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m1:17[0m 4s/step - loss: 0.8187

# Generating summary

In [None]:
# Reverse lookup: token id -> vocabulary word.
id_to_word = dict(enumerate(vocab))

def generate(text):
    """Greedily decode a summary for one pre-processed document.

    Decoding starts from an all-zero decoder buffer and appends the arg-max
    token one position at a time. It stops at the first padding id (0), at
    the "eos" marker, or after ``max_dec_len - 1`` steps.

    Args:
        text: Document string, already cleaned the same way as the training
            documents.

    Returns:
        The generated words joined by single spaces, without the "bos" /
        "eos" marker tokens.
    """
    e = vectorizer([text]).numpy()
    d = np.zeros((1, max_dec_len), dtype=int)

    words = []
    for i in range(1, max_dec_len):
        # Call the model directly: predict() is meant for large batched
        # inputs and is slow when invoked once per token with a growing
        # input shape.
        logits = model([e, d[:, :i]], training=False)
        next_id = int(np.argmax(np.asarray(logits[0, i - 1])))
        if next_id == 0:
            break
        d[0, i] = next_id

        word = id_to_word.get(next_id, "")
        # "eos" ends the summary; neither marker belongs in the returned text.
        if word == "eos":
            break
        if word != "bos":
            words.append(word)

    return " ".join(words)

# generate() takes a single document string, so pick one row (not a slice).
text = df["clean_doc"].iloc[0]
summary = generate(text)
# Show the generated summary, not the input document.
print("Generated Summary:\n", summary)

In [None]:
import sacrebleu

N_EVAL = 30  # number of examples to score


def _strip_markers(text):
    """Remove the "bos"/"eos" marker tokens and anything after the first "eos"."""
    tokens = text.split()
    if "eos" in tokens:
        tokens = tokens[:tokens.index("eos")]
    return " ".join(tok for tok in tokens if tok != "bos")


preds = []
refs = []

# NOTE(review): these rows come from the same frame the model was fitted on,
# so the score is not a held-out estimate.
# min() keeps the loop from indexing past the frame if it has fewer than N_EVAL rows.
for i in range(min(N_EVAL, len(df))):
    pred = generate(df["clean_doc"].iloc[i])
    # Strip the markers from both sides so hypotheses and references are comparable.
    preds.append(_strip_markers(pred))
    refs.append(_strip_markers(df["clean_sum"].iloc[i]))

# corpus_bleu takes a list of reference streams, hence [refs].
bleu = sacrebleu.corpus_bleu(preds, [refs], tokenize="flores200")

print("BLEU Score:", "{:.10f}".format(bleu.score))

