In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import pandas as pd
import json
from datasets import Dataset
from torch.utils.data import DataLoader

Model Loading

In [None]:
# Directory the tokenizer and the seq2seq model are both loaded from
# (presumably the fine-tuned BART checkpoint, going by the folder name).
model_path = './bart_finetuned'

# The tuple is evaluated left to right: tokenizer first, then the model.
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_path),
    AutoModelForSeq2SeqLM.from_pretrained(model_path),
)

In [None]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs on a
# machine without CUDA instead of failing when the model is moved.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Inference only: switch the module to evaluation mode. Kept as the last
# expression so the notebook still displays the model summary.
model.eval()

In [None]:
# Load the test set: one JSON record per line (JSON Lines), decoded as UTF-8.
test_judg = pd.read_json(
    "test_judg.jsonl",
    encoding="utf-8",
    lines=True,
)
# Bare name as the last expression so the notebook renders the frame.
test_judg

In [None]:
# Keep only rows that actually have a judgment and force that column to str.
# Done as a single chain that returns a new frame: assigning a column onto the
# result of dropna() can raise pandas' SettingWithCopyWarning, and re-running
# this cell stays harmless. The index is renumbered so that rows removed by
# dropna() do not leave gaps in it.
test_judg = (
    test_judg
    .dropna(subset=['judgment'])
    .assign(judgment=lambda frame: frame['judgment'].astype(str))
    .reset_index(drop=True)
)

In [None]:
# Wrap just the text column in a Hugging Face Dataset for tokenisation.
# preserve_index=False keeps the pandas index out of the dataset; without it a
# non-range index (e.g. after rows were dropped) is stored as an extra
# `__index_level_0__` column.
test_dataset = Dataset.from_pandas(test_judg[['judgment']], preserve_index=False)

In [None]:
def tokenize_test(examples):
    """Tokenise a batch of judgments with the notebook-level ``tokenizer``.

    Args:
        examples: Mapping with a ``'judgment'`` entry holding the texts to
            encode (the batch format passed by ``Dataset.map(batched=True)``).

    Returns:
        Whatever ``tokenizer`` returns for those texts when asked to truncate
        and pad every sequence to exactly 1024 tokens.
    """
    judgment_texts = examples['judgment']
    # Fixed-length encoding: longer texts are cut, shorter ones are padded.
    encode_options = dict(max_length=1024, truncation=True, padding='max_length')
    return tokenizer(judgment_texts, **encode_options)

In [None]:
# Tokenise the test set in chunks of 100 rows. The raw text column is dropped
# afterwards so only the tokenizer's outputs remain in the dataset.
map_options = {
    'batched': True,
    'batch_size': 100,
    'remove_columns': ['judgment'],
    'desc': "Tokenizing test data",
}
tokenized_test = test_dataset.map(tokenize_test, **map_options)

In [None]:
# Have the dataset hand back torch tensors, restricted to the two columns
# listed here.
tensor_columns = ['input_ids', 'attention_mask']
tokenized_test.set_format(type='torch', columns=tensor_columns)

In [None]:
# Batches of 8 tokenised examples. Shuffling is off, so batches follow the
# dataset's row order.
test_dataloader = DataLoader(tokenized_test, shuffle=False, batch_size=8)

In [None]:
generated_summaries = []

print("Summary generation for test")

# Decoding settings shared by every batch: beam search with 4 beams, output
# length constrained to 670-700 tokens, no repeated trigrams.
generation_kwargs = dict(
    bos_token_id=model.config.bos_token_id,
    eos_token_id=model.config.eos_token_id,
    max_length=700,
    min_length=670,
    num_beams=4,
    length_penalty=2.0,
    no_repeat_ngram_size=3,
    early_stopping=True,
)

# Generate from the already-tokenised batches in `test_dataloader` instead of
# re-encoding every document and running the model one sample at a time.
# The dataloader does not shuffle, so summaries come out in input order.
# If GPU memory is tight, lower `batch_size` where the DataLoader is built.
with torch.no_grad():
    for batch_number, batch in enumerate(test_dataloader, start=1):
        generated_ids = model.generate(
            input_ids=batch['input_ids'].to(device),
            # Inputs are padded to a fixed length, so the mask is required
            # for the encoder to ignore the padding positions.
            attention_mask=batch['attention_mask'].to(device),
            **generation_kwargs,
        )

        # Decode generated IDs to text (one string per example in the batch)
        summaries = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        generated_summaries.extend(summaries)

        # Progress update every 10 batches
        if batch_number % 10 == 0:
            print(f"Processed {len(generated_summaries)}/{len(test_judg)} samples...")

# Guard against stale notebook state: the tokenised dataset must have been
# built from the current `test_judg`, otherwise summaries and rows misalign.
assert len(generated_summaries) == len(test_judg), (
    "Number of generated summaries does not match the number of test rows; "
    "re-run the tokenisation cells before generating."
)

print(f"\n✓ Generated {len(generated_summaries)} summaries!")

In [None]:
# Attach the generated summaries to the test frame, then export only the
# document id and its summary to a separate JSON Lines file.
test_judg['Summary'] = generated_summaries
summary_output_file = 'test_summaries.jsonl'

# Pair every id with its summary as plain Python values, in row order.
id_summary_pairs = zip(test_judg['id'].tolist(), test_judg['Summary'].tolist())

with open(summary_output_file, 'w') as out_file:
    # One JSON object per line; the id is written under the key 'ID'.
    out_file.writelines(
        json.dumps({'ID': doc_id, 'Summary': summary_text}) + '\n'
        for doc_id, summary_text in id_summary_pairs
    )

print(f"✓ Summaries only saved to: {summary_output_file}")

In [None]:
import pandas as pd
import json

In [None]:
# Read the exported summaries back in (JSON Lines, UTF-8) for inspection.
test_sum = pd.read_json(
    "test_summaries.jsonl",
    encoding="utf-8",
    lines=True,
)
# Bare name as the last expression so the notebook renders the frame.
test_sum

In [None]:
def _count_words(summary_text):
    """Return the number of whitespace-separated tokens in ``summary_text``."""
    return len(summary_text.split())


# Length of every summary in whitespace-separated words.
test_sum['word_count'] = test_sum['Summary'].apply(_count_words)

In [None]:
# Display the frame to inspect it, including the word_count column.
test_sum

In [None]:
# Normalise the id column name to 'ID'. rename() skips labels that are not
# present, so this is a no-op when the file already uses 'ID' (as the export
# cell in this notebook writes it).
column_renames = {'id': 'ID'}
test_sum = test_sum.rename(columns=column_renames)

In [None]:
# Display the frame again to check the column names after the rename.
test_sum

In [None]:
# Write every column currently in test_sum as JSON Lines (one record per line).
# DataFrame.to_json returns None when it is given a path, so the result is
# deliberately NOT assigned back: doing so would replace test_sum with None
# and break any cell that is run (or re-run) afterwards.
test_sum.to_json("bart_summs.jsonl", orient="records", lines=True)