In [None]:
# Colab-only: mount Google Drive at /content/gdrive so the paths under
# /content/gdrive/MyDrive used later in this notebook resolve.
from google.colab import drive
drive.mount('/content/gdrive')

import torch
import pandas as pd
from datasets import Dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
import json









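# Optional: also save the generated summaries to Google Drive.
# Left disabled here because `df_test` (with its 'summary' column) only exists after the later cells have run.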

# df_test.to_json("/content/gdrive/MyDrive/test_with_summaries.jsonl", orient='records', lines=True, force_ascii=False)
# print("Summaries saved to Google Drive!")

In [None]:
import os

# Inspect the fine-tuning output directory on Drive.
base_path = "/content/gdrive/MyDrive/pegasus_indlegal"
# Read the directory once and reuse the listing for both the printout and the filter.
base_entries = os.listdir(base_path)
print("Contents of pegasus_indlegal:")
print(base_entries)


def _checkpoint_step(name):
    """Return the trailing integer of a ``checkpoint-<step>`` name, or -1 if there is none."""
    suffix = name.rpartition('-')[2]
    return int(suffix) if suffix.isdecimal() else -1


# Keep only checkpoint *directories*, ordered by training step.
# os.listdir() returns entries in arbitrary order, and a plain string sort would
# place checkpoint-1000 before checkpoint-200, so sort on the numeric suffix.
checkpoints = sorted(
    (
        d for d in base_entries
        if d.startswith('checkpoint') and os.path.isdir(os.path.join(base_path, d))
    ),
    key=lambda d: (_checkpoint_step(d), d),
)
print("\nCheckpoint folders:", checkpoints)

# Show what is stored inside the first (lowest-step) checkpoint, if any exist.
if checkpoints:
    checkpoint_path = os.path.join(base_path, checkpoints[0])
    print(f"\nContents of {checkpoints[0]}:")
    print(os.listdir(checkpoint_path))

In [None]:
# Restore the fine-tuned model and its tokenizer.
# Both are read from the same checkpoint directory (not from the base model).
model_path = "/content/gdrive/MyDrive/pegasus_indlegal/checkpoint-1800"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

# Run on the GPU when one is available, otherwise stay on the CPU.
target_device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(target_device)

In [None]:

# Data collator
# Built from the tokenizer and model loaded above; it is later handed to the
# DataLoader as its collate_fn to assemble each batch.
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
# Load test dataset
# The file is JSON Lines: every non-empty line is one JSON record.
test_records = []
with open("test_judg.jsonl", 'r', encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. trailing newlines) instead of
            # letting json.loads fail on an empty string.
            continue
        test_records.append(json.loads(line))

# Build the frame once from the parsed records, drop rows that have no
# judgment text, and make sure the remaining judgments are plain strings.
df_test = (
    pd.DataFrame(test_records)
    .dropna(subset=['judgment'])
    .astype({'judgment': str})
)


In [None]:

# Wrap only the judgment text in a Dataset for tokenization.
# preserve_index=False: rows may have been dropped from df_test, leaving a
# non-contiguous index that would otherwise be carried along as an extra
# `__index_level_0__` column.
test_dataset = Dataset.from_pandas(df_test[['judgment']], preserve_index=False)

def tokenize_test(example_batch, max_length=768, padding='max_length'):
    """Tokenize a batch of judgments into model inputs for inference.

    Relies on the module-level ``tokenizer``.

    Args:
        example_batch: Mapping with a 'judgment' entry holding the text(s) to
            encode (a list of strings when used with ``map(..., batched=True)``).
        max_length: Maximum number of tokens kept per judgment; truncation is
            always enabled, so longer inputs are cut to this length.
        padding: Padding strategy forwarded to the tokenizer. The default
            'max_length' reproduces the original fixed-length behaviour.

    Returns:
        dict with the 'input_ids' and 'attention_mask' entries of the
        tokenizer output; any other entries it produces are discarded.
    """
    input_encodings = tokenizer(
        example_batch['judgment'],
        max_length=max_length,
        truncation=True,
        padding=padding,
    )
    return {key: input_encodings[key] for key in ('input_ids', 'attention_mask')}

In [None]:

# Tokenize every judgment in batches; the raw text column is dropped from the result.
tokenized_test = test_dataset.map(
    tokenize_test,
    batched=True,
    remove_columns=['judgment'],
)

# Expose only the two model-input columns, formatted as torch tensors.
model_input_columns = ['input_ids', 'attention_mask']
tokenized_test.set_format(type='torch', columns=model_input_columns)

# Batches of 6 examples, assembled by the seq2seq collator (no shuffling is requested).
test_dataloader = DataLoader(
    tokenized_test,
    batch_size=6,
    collate_fn=seq2seq_data_collator,
)

In [None]:
# Inference only: put the model in evaluation mode before generating.
model.eval()
gen_summaries_test = []

# Decoding settings applied to every batch.
generation_settings = dict(
    max_length=768,
    min_length=640,
    num_beams=4,
    length_penalty=1.5,
    early_stopping=True,
)

print("Generating summaries for test set...")
with torch.no_grad():
    for batch_idx, batch in enumerate(test_dataloader):
        # Only the encoder inputs are needed; move them to the model's device.
        input_ids = batch['input_ids'].to(model.device)
        attention_mask = batch['attention_mask'].to(model.device)

        generated_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            **generation_settings,
        )

        # Decode token ids back to text and append them in batch order.
        gen_summaries_test += tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

        # Progress report on every 10th batch (including the first).
        if batch_idx % 10 == 0:
            print(f"Processed {len(gen_summaries_test)} samples...")

print(f"Total test summaries generated: {len(gen_summaries_test)}")

In [None]:
# Attach the generated summaries to the test frame as a new 'summary' column.
# gen_summaries_test was filled batch by batch from test_dataloader, so it is
# expected to hold exactly one entry per row of df_test.
df_test['summary'] = gen_summaries_test

In [None]:
# Save to JSONL: one {"ID": ..., "Summary": ...} object per line.
output_file = 'test_summaries.jsonl'
# Open with an explicit UTF-8 encoding, matching how the input JSONL was read,
# rather than depending on the platform's default encoding.
with open(output_file, 'w', encoding='utf-8') as f:
    # Walk the two needed columns directly instead of building a Series per row
    # with iterrows(); tolist() yields plain Python values that json can serialise.
    for record_id, summary in zip(df_test['id'].tolist(), df_test['summary'].tolist()):
        f.write(json.dumps({'ID': record_id, 'Summary': summary}) + '\n')

print(f"\nSummaries saved to {output_file}")