In [None]:
!pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q
!pip install --upgrade accelerate
!pip uninstall -y transformers accelerate
!pip install transformers accelerate

In [None]:
!pip install evaluate

In [None]:
from transformers import pipeline, set_seed
from datasets import load_dataset, load_from_disk
import matplotlib.pyplot as plt
from datasets import load_dataset
import pandas as pd
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import nltk
from nltk.tokenize import sent_tokenize
from tqdm import tqdm
import torch
# Download NLTK's "punkt" resource when this cell runs.
# NOTE(review): `sent_tokenize` is imported above but is not called anywhere in
# the visible notebook — confirm this download is still needed.
nltk.download("punkt")
from transformers import DataCollatorForSeq2Seq
from transformers import TrainingArguments, Trainer
import os

In [None]:
import json
import pandas as pd

# Read the training judgments: every line of the file is parsed as one JSON object.
data_judg = []
with open("train_judg.jsonl", 'r', encoding="utf-8") as judg_file:
    data_judg.extend(json.loads(raw_line) for raw_line in judg_file)

# Tabular view of the parsed records; the last expression displays it.
df_judg = pd.DataFrame(data_judg)
df_judg

In [None]:
# Read the reference summaries: every line of the file is parsed as one JSON object.
data_sum = []
with open("train_ref_summ.jsonl", 'r', encoding="utf-8") as summ_file:
    data_sum.extend(json.loads(raw_line) for raw_line in summ_file)

# Tabular view of the parsed records; the last expression displays it.
df_sum = pd.DataFrame(data_sum)
df_sum

In [None]:
# Tokenizer and seq2seq model are both loaded from the same pretrained checkpoint.
checkpoint = "akhilm97/pegasus_indian_legal"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [None]:
# Pick the judgment stored under index label 5 and count its
# whitespace-separated words.
sample = df_judg.loc[5, 'Judgment']
len(sample.split())

In [None]:
# Tokenize the sample into PyTorch tensors (no length limit is passed here)
# and print the encoding.
tokenized_sent = tokenizer(sample, return_tensors="pt")
print(tokenized_sent)

In [None]:
# Attach the reference summary to each judgment by ID (the left join keeps
# every judgment). A judgment with no matching summary comes out of the join
# with a missing Summary, and missing text cannot be tokenized later, so rows
# lacking either text field are dropped. The index is rebuilt afterwards so the
# frame keeps a plain 0..n-1 index.
data = (
    pd.merge(df_judg, df_sum[['ID', 'Summary']], on='ID', how="left")
    .dropna(subset=['Judgment', 'Summary'])
    .reset_index(drop=True)
)

In [None]:
# Display the merged judgment/summary frame.
data

In [None]:
from datasets import Dataset

# Wrap the two text columns of the merged frame in a Hugging Face Dataset.
text_columns = ['Judgment', 'Summary']
dataset = Dataset.from_pandas(data[text_columns])

In [None]:
def convert_examples_to_features(example_batch, max_input_length=1024, max_target_length=768):
    """Tokenize a batch of judgment/summary pairs for seq2seq training.

    Args:
        example_batch: Mapping with a ``'Judgment'`` and a ``'Summary'`` entry,
            each holding the texts of one batch (this function is used with
            ``Dataset.map(..., batched=True)``).
        max_input_length: Token limit for each judgment; longer inputs are
            truncated.
        max_target_length: Token limit for each summary; longer summaries are
            truncated.

    Returns:
        dict with ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
        Sequences are returned unpadded.
    """
    # Sequences are deliberately left unpadded. Labels padded to a fixed length
    # carry pad-token ids, which the loss would score as real targets. The
    # notebook's DataCollatorForSeq2Seq pads each batch instead and fills label
    # padding with -100, which the loss ignores.
    input_encodings = tokenizer(
        example_batch['Judgment'],
        max_length=max_input_length,
        truncation=True,
    )

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager for tokenizing the summaries as targets.
    target_encodings = tokenizer(
        text_target=example_batch['Summary'],
        max_length=max_target_length,
        truncation=True,
    )

    return {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids']
    }

In [None]:
# Run the feature conversion over the whole dataset, one batch of rows per call.
dataset = dataset.map(
    convert_examples_to_features,
    batched=True,
)

In [None]:
# Display the dataset summary after the map step.
dataset

In [None]:
from transformers import DataCollatorForSeq2Seq
# Batch collator built from the notebook's tokenizer and model.
seq2seq_data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
)

In [None]:
import evaluate
# Load the "rouge" metric through the evaluate library; compute_metrics calls
# its compute() method.
rouge = evaluate.load("rouge")

In [None]:
import numpy as np
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

In [None]:
def compute_metrics(eval_pred):
    """Score generated summaries against references with ROUGE.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may also arrive as a tuple whose first element
            holds the generated ids.

    Returns:
        dict with the scores returned by ``rouge.compute`` plus ``gen_len``
        and ``gen_len_std`` (mean and standard deviation of the number of
        non-padding tokens per prediction), all rounded to 4 decimals.
    """
    predictions, labels = eval_pred

    # Keep only the generated ids when extra outputs come along in a tuple.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id

    # -100 is a filler value, not a vocabulary id. It has to be mapped back to
    # the pad token in BOTH arrays before decoding: a negative id cannot be
    # decoded, and leaving it in the predictions would inflate the length
    # statistics below.
    predictions = np.where(np.asarray(predictions) != -100, predictions, pad_id)
    labels = np.where(np.asarray(labels) != -100, labels, pad_id)

    # Decode to text and trim surrounding whitespace.
    decoded_preds = [
        text.strip()
        for text in tokenizer.batch_decode(predictions, skip_special_tokens=True)
    ]
    decoded_labels = [
        text.strip()
        for text in tokenizer.batch_decode(labels, skip_special_tokens=True)
    ]

    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True
    )

    # Generation length statistics over non-padding tokens only.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    result["gen_len_std"] = np.std(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Set WANDB_DISABLED in the process environment before the trainer is built.
# NOTE(review): presumably meant to stop the Trainer from logging to
# Weights & Biases — confirm it is still needed alongside `report_to`.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Training configuration. The whole tokenized dataset is used for training and
# there is no validation split, so evaluation is switched off.
training_args = Seq2SeqTrainingArguments(
    output_dir="text_summarization_model",
    eval_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    # The documented way to turn off every logging integration is the string
    # "none"; Python None only selects the library default.
    report_to="none",
    # Mixed precision needs a CUDA device; fall back to full precision without one.
    fp16=torch.cuda.is_available(),
    push_to_hub=False,
    logging_steps=100,
    save_steps=500,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=None,      # no validation set
    # NOTE(review): newer transformers releases name this argument
    # `processing_class` — confirm against the installed version.
    tokenizer=tokenizer,
    data_collator=seq2seq_data_collator,
    compute_metrics=None,   # nothing to score without an eval set
)

In [None]:
# Run fine-tuning with the trainer configured above. Checkpoints go to the
# trainer's output_dir ("text_summarization_model") every `save_steps` steps.
trainer.train()

In [None]:
# Load the validation judgments (one JSON record per line) and display them.
df_val = pd.read_json(
    "val_judg.jsonl",
    lines=True,
    encoding="utf-8",
)
df_val

In [None]:
# Show the first rows of the validation frame.
df_val.head()

In [None]:
# Show the first 200 characters of the first validation judgment.
df_val['Judgment'].iloc[0][:200]

In [None]:
# Drop rows with no judgment text and force the column to str. This is one
# chain that builds a new frame rather than assigning a column into the
# filtered result, which can raise SettingWithCopyWarning on some pandas
# versions. The index is rebuilt so the frame keeps a plain 0..n-1 index after
# rows are removed.
df_val = (
    df_val
    .dropna(subset=['Judgment'])
    .assign(Judgment=lambda frame: frame['Judgment'].astype(str))
    .reset_index(drop=True)
)

In [None]:
# Wrap the judgment text column of the validation frame in a Hugging Face Dataset.
val_dataset = Dataset.from_pandas(df_val[['Judgment']])

In [None]:
def tokenize_validation(example_batch, max_length=1024):
    """Tokenize a batch of validation judgments for generation.

    Args:
        example_batch: Mapping with a ``'Judgment'`` entry holding the texts
            of one batch (this function is used with
            ``Dataset.map(..., batched=True)``).
        max_length: Token limit per judgment; inputs are truncated and padded
            to this length. The default matches the 1024-token input limit
            used when tokenizing the training data, so the model sees as much
            of each judgment at inference time as it did during training.

    Returns:
        dict with ``'input_ids'`` and ``'attention_mask'``.
    """
    input_encodings = tokenizer(
        example_batch['Judgment'],
        max_length=max_length,
        truncation=True,
        padding='max_length',
    )
    return {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask']
    }

In [None]:
# Tokenize the validation set in batches; the raw text column is removed from
# the result.
tokenized_val = val_dataset.map(
    tokenize_validation,
    batched=True,
    remove_columns=['Judgment'],
)

In [None]:
# Return the two model-input columns as torch tensors when rows are read.
tensor_columns = ['input_ids', 'attention_mask']
tokenized_val.set_format(type='torch', columns=tensor_columns)

In [None]:
from torch.utils.data import DataLoader

In [None]:
# Batches of 6 validation examples, assembled by the seq2seq collator.
val_dataloader = DataLoader(
    tokenized_val,
    batch_size=6,
    collate_fn=seq2seq_data_collator,
)

In [None]:
# Switch the model to evaluation mode before generating summaries.
model.eval()

In [None]:
# Accumulator for the decoded summaries; the generation loop extends it batch by batch.
gen_summaries = []

In [None]:
# Generate one summary per validation judgment with beam search.
print("Generating summaries...")

# Start from an empty list every time this cell runs. Appending to a list
# created in another cell would duplicate entries on a re-run and leave more
# summaries than validation rows.
gen_summaries = []

# Make sure the model is in evaluation mode, whichever cells ran before this one.
model.eval()

with torch.no_grad():
    for i, batch in enumerate(val_dataloader):
        # Keep only the tensors passed to generate() and move them to the
        # model's device.
        batch = {k: v.to(model.device) for k, v in batch.items() if k in ['input_ids', 'attention_mask']}

        generated_ids = model.generate(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
            max_length=768,
            min_length=640,
            num_beams=4,
            length_penalty=1.5,
            early_stopping=True
        )

        summaries = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        gen_summaries.extend(summaries)

        # Progress report every 10 batches.
        if i % 10 == 0:
            print(f"Processed {len(gen_summaries)} samples...")

In [None]:
import json

In [None]:
# Attach the generated summaries to the validation frame, then export them.
df_val['Summary'] = gen_summaries

# One JSON object per line: the record ID plus its generated summary.
output_file = 'answer.jsonl'
with open(output_file, 'w') as out_handle:
    id_values = df_val['ID'].tolist()
    summary_values = df_val['Summary'].tolist()
    for record_id, summary_text in zip(id_values, summary_values):
        out_handle.write(json.dumps({'ID': record_id, 'summary': summary_text}) + '\n')

print(f"\nSummaries saved to {output_file}")