In [None]:
import torch
import numpy as np
import nltk
import transformers
import pandas as pd
from datasets import Dataset
from torch.utils.data import DataLoader
import json

In [None]:
# Use the GPU when PyTorch can see one; otherwise run everything on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Tokenizer that belongs to the pretrained BART checkpoint fine-tuned below.
checkpoint_name = 'ainize/bart-base-cnn'
tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint_name)

In [None]:
# Pretrained seq2seq checkpoint used as the starting point for fine-tuning.
# The batching collator is intentionally not built here: it is constructed
# once, right before the trainer is created, so there is a single definition.
model = transformers.AutoModelForSeq2SeqLM.from_pretrained('ainize/bart-base-cnn')

In [None]:
# Place the model's weights on the device selected above before training.
model = model.to(device=device)

In [None]:
# Training judgments, stored as JSON Lines (one JSON object per line).
train_judg = pd.read_json(
    "train_judg.jsonl",
    encoding="utf-8",
    lines=True,
)
train_judg

In [None]:
# Reference summaries for the training judgments, stored as JSON Lines.
train_summ = pd.read_json(
    "train_ref_summ.jsonl",
    encoding="utf-8",
    lines=True,
)
train_summ

In [None]:
# Validation judgments (summaries for these are generated later), JSON Lines.
val_judg = pd.read_json(
    "Validation.jsonl",
    encoding="utf-8",
    lines=True,
)
val_judg

In [None]:
# Pair every judgment with its reference summary through the shared 'ID' column.
train_df = train_judg.merge(train_summ, on='ID')
print(f"Merged training samples: {len(train_df)}")
print(f"Columns: {train_df.columns.tolist()}")

#Clean data
# Drop rows with a missing text field, then cast both fields to str.
# `assign` builds a new frame instead of writing into the dropna result,
# which avoids pandas' SettingWithCopyWarning.
train_df = train_df.dropna(subset=['Judgment', 'Summary'])
train_df = train_df.assign(
    Judgment=train_df['Judgment'].astype(str),
    Summary=train_df['Summary'].astype(str),
)

# Also discard rows whose judgment or summary is blank, matching the
# blank-text filter applied to the validation judgments further down.
has_text = (
    (train_df['Judgment'].str.strip() != '')
    & (train_df['Summary'].str.strip() != '')
)
train_df = train_df[has_text].copy()

In [None]:
# Keep only the two text columns. preserve_index=False stops the pandas index
# (no longer a plain range once rows were dropped during cleaning) from being
# stored as an extra '__index_level_0__' column in the dataset.
train_dataset = Dataset.from_pandas(
    train_df[['Judgment', 'Summary']],
    preserve_index=False,
)

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of judgments (inputs) and summaries (labels).

    Args:
        examples: Batch mapping with 'Judgment' and 'Summary' entries, as
            passed by `Dataset.map(..., batched=True)`.

    Returns:
        The tokenizer output for the judgments (truncated to 1024 tokens)
        with an added 'labels' entry holding the summary token ids
        (truncated to 768 tokens).
    """
    # No padding here: DataCollatorForSeq2Seq pads each batch dynamically and
    # fills label padding with -100 so the loss ignores it. Padding labels to
    # max_length at this stage would leave pad-token ids in the labels and
    # the model would be trained to predict padding.
    model_inputs = tokenizer(
        examples['Judgment'],
        max_length=1024,
        truncation=True,
    )

    # `text_target` tokenizes the summaries as targets; it replaces the
    # deprecated `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples['Summary'],
        max_length=768,
        truncation=True,
    )

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

In [None]:
# Tokenize the training pairs 100 rows at a time and drop the raw columns so
# only the tokenizer outputs (plus 'labels') remain.
raw_train_columns = train_dataset.column_names
tokenized_train_dataset = train_dataset.map(
    preprocess_function,
    desc="Tokenizing training data",
    remove_columns=raw_train_columns,
    batch_size=100,
    batched=True,
)

In [None]:
# Batching function handed to the trainer; padding=True pads every batch to
# the length of its longest sequence.
data_collator = transformers.DataCollatorForSeq2Seq(padding=True, model=model, tokenizer=tokenizer)

In [None]:
# Fine-tuning hyperparameters. Checkpoints are written to ./bart_finetuned.
# NOTE(review): warmup_steps=500 assumes the run has well over 500 optimizer
# steps; with a small training set the peak learning rate may never be
# reached - confirm against the dataset size.
training_args = transformers.Seq2SeqTrainingArguments(
    output_dir='./bart_finetuned',
    eval_strategy='no',  # the trainer is created without an eval dataset
    learning_rate=2e-5,
    per_device_train_batch_size=10,
    gradient_accumulation_steps=4,  # 10 * 4 = 40 samples per optimizer step per device
    gradient_checkpointing=True,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=5,
    warmup_steps=500,
    # Mixed precision needs a GPU; enabling it unconditionally makes the
    # arguments fail validation on the CPU fallback chosen for `device`.
    fp16=torch.cuda.is_available(),
    predict_with_generate=True,
    logging_steps=100,
    save_steps=500,
    report_to='none',
    push_to_hub=False,
)

In [None]:
# Wire model, data, and batching together. Evaluation and metrics are
# deliberately left unset: this run only trains.
trainer_options = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=None,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=None,
)
trainer = transformers.Seq2SeqTrainer(**trainer_options)

In [None]:
import gc

# Collect unreachable Python objects first so any tensors they still hold are
# freed, then hand the now-unused cached blocks back to the CUDA driver.
# Emptying the cache before collecting would leave that memory cached.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Run fine-tuning and show the value returned by the trainer as cell output.
train_output = trainer.train()
train_output

In [None]:
# Persist the fine-tuned weights and the tokenizer files side by side so the
# directory can be reloaded with from_pretrained().
finetuned_dir = './bart_finetuned'
trainer.save_model(finetuned_dir)
tokenizer.save_pretrained(finetuned_dir)

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [None]:
# Reload the fine-tuned artifacts from disk for inference.
model_path = './bart_finetuned'

tokenizer = AutoTokenizer.from_pretrained(model_path)
# from_pretrained() loads the weights on the CPU. The generation loop sends
# its inputs to `device`, so the reloaded model must be moved there as well,
# otherwise generate() fails with a device mismatch on GPU machines.
model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device)

## Summary Generation

In [None]:
# Put the reloaded model into evaluation mode (disables dropout) before
# generating summaries.
model.eval()

In [None]:
# Keep only rows with a usable judgment: drop missing values, cast to str,
# then drop blank strings. `assign` plus the final `.copy()` yields an
# independent frame, so adding the 'Summary' column to it later does not
# trigger pandas' SettingWithCopyWarning.
val_judg = val_judg.dropna(subset=['Judgment'])
val_judg = val_judg.assign(Judgment=val_judg['Judgment'].astype(str))
val_judg = val_judg[val_judg['Judgment'].str.strip() != ''].copy()

In [None]:
# Only the judgment text is needed for generation. preserve_index=False keeps
# the pandas index (non-contiguous once rows were filtered out) from being
# stored as an extra '__index_level_0__' column.
val_dataset = Dataset.from_pandas(
    val_judg[['Judgment']],
    preserve_index=False,
)

In [None]:
def tokenize_validation(examples):
    """Tokenize a batch of validation judgments.

    Every judgment is truncated and padded to exactly 1024 tokens, so all
    rows share one length.

    Args:
        examples: Batch mapping with a 'Judgment' entry.

    Returns:
        The tokenizer output for the batch.
    """
    judgments = examples['Judgment']
    encoded = tokenizer(
        judgments,
        padding='max_length',
        truncation=True,
        max_length=1024,
    )
    return encoded

In [None]:
# Tokenize the validation judgments 100 rows at a time and drop the raw text
# column afterwards.
raw_val_columns = ['Judgment']
tokenized_val_dataset = val_dataset.map(
    tokenize_validation,
    desc="Tokenizing validation data",
    remove_columns=raw_val_columns,
    batch_size=100,
    batched=True,
)

In [None]:
# Return torch tensors, and only the two fields listed here, when the
# dataset is indexed.
tensor_columns = ['input_ids', 'attention_mask']
tokenized_val_dataset.set_format(type='torch', columns=tensor_columns)

In [None]:
# Batches of 8 tokenized judgments, kept in their original order.
val_dataloader = DataLoader(tokenized_val_dataset, shuffle=False, batch_size=8)

In [None]:
# Accumulator for the decoded summaries, one entry per validation judgment.
generated_summaries = list()

In [None]:

# Start from an empty list so that re-running this cell never appends to the
# results of an earlier run (which would break the later column assignment
# that expects exactly one summary per validation row).
generated_summaries = []
judgment_texts = val_judg['Judgment'].tolist()
total_samples = len(judgment_texts)

# Gradients are not needed for inference.
with torch.no_grad():
    for i, text_example in enumerate(judgment_texts):
        # Encode input, truncated to 1024 tokens, and place it on whatever
        # device the model currently lives on so the two can never mismatch.
        input_ids = tokenizer.encode(
            text_example,
            return_tensors="pt",
            max_length=1024,
            truncation=True,
        ).to(model.device)

        # Generate summary with 4-beam search.
        # NOTE(review): min_length=670 / max_length=700 forces every summary
        # into a 670-700 token window regardless of input length - confirm
        # this is what the task requires.
        summary_text_ids = model.generate(
            input_ids=input_ids,
            bos_token_id=model.config.bos_token_id,
            eos_token_id=model.config.eos_token_id,
            max_length=700,
            min_length=670,
            num_beams=4,
            length_penalty=2.0,
            no_repeat_ngram_size=3,
            early_stopping=True
        )

        # Decode the generated summary (first returned sequence only)
        decoded_text = tokenizer.decode(summary_text_ids[0], skip_special_tokens=True)
        generated_summaries.append(decoded_text)

        # Progress update every 10 samples
        if (i + 1) % 10 == 0:
            print(f"Processed {i + 1}/{total_samples} samples...")

print("="*80)
print(f"✓ Generated {len(generated_summaries)} summaries!")

In [None]:
# Attach the generated summaries to the validation frame, then export only
# the ID/summary pairs as JSON Lines (one JSON object per line).
val_judg['Summary'] = generated_summaries
summary_output_file = 'validation_summaries.jsonl'

summary_records = val_judg[['ID', 'Summary']].to_dict(orient='records')
with open(summary_output_file, 'w') as f:
    f.writelines(
        json.dumps({'ID': record['ID'], 'Summary': record['Summary']}) + '\n'
        for record in summary_records
    )

print(f"✓ Summaries only saved to: {summary_output_file}")