<a href="https://colab.research.google.com/github/2303A51553/Natural-language-process/blob/main/project_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
!pip install pandas numpy nltk gensim scikit-learn tensorflow sentence-transformers rouge-score




In [26]:
import pandas as pd
import numpy as np
import re, string, nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')

# Load CSV
X = pd.read_csv('/content/Legal_Summarisation_100_Final (1).csv')
X.head()



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,id,document,summary
0,1,The matter concerns constitutional validity of...,Supreme Court rules on constitutional validity...
1,2,The matter concerns public interest litigation...,Court directs stronger measures for environmen...
2,3,The matter concerns tax dispute regarding newl...,Tax provisions upheld as valid and constitutio...
3,4,The matter concerns dispute over custody of a ...,Custody of child decided considering welfare p...
4,5,The matter concerns public interest litigation...,Court directs stronger measures for environmen...


In [27]:

stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = str(text).lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [w for w in tokens if w not in stop_words]
    return " ".join(tokens)

X['cleaned_document'] = X['document'].apply(preprocess_text)
X['cleaned_summary']  = X['summary'].apply(preprocess_text)



In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_doc = TfidfVectorizer(max_features=5000)
tfidf_sum = TfidfVectorizer(max_features=5000)

doc_tfidf = tfidf_doc.fit_transform(X['cleaned_document'])
sum_tfidf = tfidf_sum.fit_transform(X['cleaned_summary'])

print("Document TF-IDF shape:", doc_tfidf.shape)
print("Summary TF-IDF shape:", sum_tfidf.shape)


Document TF-IDF shape: (100, 63)
Summary TF-IDF shape: (100, 47)


In [29]:
from gensim.models import Word2Vec

def tokenize_words(text):
    return word_tokenize(text)

X['tokens_doc'] = X['cleaned_document'].apply(tokenize_words)
X['tokens_sum'] = X['cleaned_summary'].apply(tokenize_words)

w2v_doc = Word2Vec(sentences=X['tokens_doc'], vector_size=100, window=5, min_count=1, workers=4)
w2v_sum = Word2Vec(sentences=X['tokens_sum'], vector_size=100, window=5, min_count=1, workers=4)

def get_avg_vector(tokens, model):
    vecs = [model.wv[w] for w in tokens if w in model.wv]
    return np.mean(vecs, axis=0) if len(vecs) > 0 else np.zeros(model.vector_size)

X['doc_w2v'] = X['tokens_doc'].apply(lambda x: get_avg_vector(x, w2v_doc))
X['sum_w2v'] = X['tokens_sum'].apply(lambda x: get_avg_vector(x, w2v_sum))


In [30]:
from sentence_transformers import SentenceTransformer

bert_model = SentenceTransformer('all-MiniLM-L6-v2')

X['doc_bert'] = X['cleaned_document'].apply(lambda x: bert_model.encode(x))
X['sum_bert'] = X['cleaned_summary'].apply(lambda x: bert_model.encode(x))

print("Example BERT vector length:", len(X['doc_bert'].iloc[0]))


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Example BERT vector length: 384


In [32]:
# Assign BERT embeddings to doc_embeddings and sum_embeddings
doc_embeddings = np.vstack(X['doc_bert'].values)
sum_embeddings = np.vstack(X['sum_bert'].values)

# Reshape for LSTM input: (samples, timesteps, features)
# Here timesteps=1, features=384 (BERT vector)
encoder_input_data = doc_embeddings.reshape(doc_embeddings.shape[0], 1, doc_embeddings.shape[1])
decoder_input_data = sum_embeddings.reshape(sum_embeddings.shape[0], 1, sum_embeddings.shape[1])

# Decoder target same as input (shifted version for training)
decoder_target_data = decoder_input_data.copy()

print("Encoder input shape:", encoder_input_data.shape)
print("Decoder input shape:", decoder_input_data.shape)
print("Decoder target shape:", decoder_target_data.shape)

Encoder input shape: (100, 1, 384)
Decoder input shape: (100, 1, 384)
Decoder target shape: (100, 1, 384)


In [33]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense

latent_dim = 256  # size of LSTM hidden states

# Encoder
encoder_inputs = Input(shape=(1, 384))
encoder_lstm = LSTM(latent_dim, return_state=True)
_, state_h, state_c = encoder_lstm(encoder_inputs)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(1, 384))
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = Dense(384, activation='linear')  # output same dimension as BERT
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='mse')
model.summary()


In [34]:
model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=16,
    epochs=5,
    validation_split=0.2
)



Epoch 1/5
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 167ms/step - loss: 0.0025 - val_loss: 0.0020
Epoch 2/5
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step - loss: 0.0018 - val_loss: 0.0015
Epoch 3/5
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - loss: 0.0014 - val_loss: 0.0013
Epoch 4/5
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - loss: 0.0011 - val_loss: 0.0010
Epoch 5/5
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - loss: 8.9984e-04 - val_loss: 8.2340e-04


<keras.src.callbacks.history.History at 0x7cea44117950>

In [35]:
# Encoder model
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder setup
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs
)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states
)



In [36]:

def generate_summary_vector(doc_vec):
    # Encode the document
    states_value = encoder_model.predict(doc_vec.reshape(1, 1, 384))

    # Prepare target sequence
    target_seq = np.zeros((1, 1, 384))

    # Predict one timestep (summary embedding)
    output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
    return output_tokens.reshape(384,)

# Example
summary_vec = generate_summary_vector(doc_embeddings[0])
print("Generated summary vector shape:", summary_vec.shape)



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 388ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 704ms/step
Generated summary vector shape: (384,)


In [38]:
from sklearn.metrics.pairwise import cosine_similarity
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer

def evaluate_summary(doc_idx):
    gen_vec = generate_summary_vector(doc_embeddings[doc_idx])
    real_vec = sum_embeddings[doc_idx]

    cos_sim = cosine_similarity(gen_vec.reshape(1, -1), real_vec.reshape(1, -1))[0][0]

    # Use the correct column names for cleaned text
    ref = X['cleaned_summary'][doc_idx].split()
    cand = X['cleaned_document'][doc_idx].split()[:len(ref)]

    bleu = sentence_bleu([ref], cand)
    rouge = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    scores = rouge.score(" ".join(ref), " ".join(cand))

    print(f"Cosine Similarity: {cos_sim:.3f}")
    print(f"BLEU: {bleu:.3f}")
    print(f"ROUGE-1: {scores['rouge1'].fmeasure:.3f}, ROUGE-L: {scores['rougeL'].fmeasure:.3f}")

# Example evaluation
evaluate_summary(0)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 98ms/step
Cosine Similarity: 0.582
BLEU: 0.000
ROUGE-1: 0.333, ROUGE-L: 0.333


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
