# NOTE:
**Output cells are intentionally omitted in this repository** to prevent display corruption or issues.
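Run this notebook in Google Colab to see live outputs. It imports `google.colab` (for clearing outputs and reading secrets), so running it on Kaggle requires replacing those Colab-specific cells first.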

In [None]:
# Clears all outputs to prevent corruption (see the note at the top of the
# notebook about output cells being omitted from the repository).
# NOTE(review): this relies on the `google.colab` package, so the cell is
# expected to fail with ImportError outside Colab — confirm before running elsewhere.
from google.colab import output
output.clear()

In [None]:
!pip -q install transformers torch sentencepiece pandas evaluate accelerate rouge_score wandb bert_score

In [None]:
import torch
import evaluate
import os
import wandb

from google.colab import userdata
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from peft import LoraConfig, get_peft_model

In [None]:
# Load the single CSV file of the Indonesian news dataset from the Hugging Face
# Hub as one "train" split; the train/validation/test partition is made later.
dataset = load_dataset(
  "iqballx/indonesian_news_datasets",
  split="train",
  data_files="data.csv",
)

In [None]:
# Show the loaded dataset object using the notebook's rich display.
display(dataset)

In [None]:
# Print the dataset's `features` attribute to inspect the available columns
# before tokenization (which reads "content" and "summary").
print(dataset.features)

In [None]:
# 70/10/20 split with a fixed seed:
#   1) hold out 20% of all rows as the test set;
#   2) take 12.5% of the remaining 80% (= 10% of all rows) as validation.
train_test = dataset.train_test_split(test_size=0.2, seed=42)
train_val = train_test["train"].train_test_split(test_size=0.125, seed=42)

train_dataset, val_dataset, test_dataset = (
    train_val["train"],   # training rows
    train_val["test"],    # validation rows
    train_test["test"],   # held-out test rows
)

# Report the size of each split.
for split_label, split_rows in (
    ("Train", train_dataset),
    ("Validation", val_dataset),
    ("Test", test_dataset),
):
    print(f"{split_label} size: {len(split_rows)}")

In [None]:
# Base checkpoint: a T5 model already trained for Indonesian summarization.
model_name = "cahya/t5-base-indonesian-summarization-cased"

# Load the model first, then its matching tokenizer (order is independent).
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Place the model on the GPU when one is present; otherwise it stays on the CPU.
use_gpu = torch.cuda.is_available()
if use_gpu:
  model.to("cuda")

In [None]:
def tokenize_function(examples):
  """Tokenize a batch of articles and summaries for seq2seq training.

  Args:
    examples: batched mapping with "content" and "summary" columns, each a
      list of values. `None` entries become empty strings and every other
      value is converted with `str()`.

  Returns:
    The tokenizer output for the articles (each prefixed with "summarize: ",
    truncated to 1024 tokens, unpadded) with an added "labels" entry holding
    the token ids of the summaries (truncated to 512 tokens, unpadded).
  """
  def _as_text(value):
    # Missing values become empty strings; everything else is stringified.
    return "" if value is None else str(value)

  # T5 task prefix in front of every article; summaries are left unprefixed.
  sources = ["summarize: " + _as_text(item) for item in examples["content"]]
  references = [_as_text(item) for item in examples["summary"]]

  # No padding here: the data collator pads each batch dynamically later.
  model_inputs = tokenizer(sources, max_length=1024, truncation=True, padding=False)
  encoded_targets = tokenizer(references, max_length=512, truncation=True, padding=False)

  model_inputs["labels"] = encoded_targets["input_ids"]
  return model_inputs

In [None]:
# Apply the batched tokenizer to the train, validation and test splits, in
# that order, rebinding each name to its tokenized version.
train_dataset, val_dataset, test_dataset = (
    raw_split.map(tokenize_function, batched=True)
    for raw_split in (train_dataset, val_dataset, test_dataset)
)

In [None]:
# Sub-modules that receive LoRA adapters: the q/k/v/o projections and the
# DenseReluDense wi/wo layers (the names are matched against the model's module names).
lora_target_modules = ["q", "v", "k", "o", "DenseReluDense.wi", "DenseReluDense.wo"]

lora_config = LoraConfig(
  task_type="SEQ_2_SEQ_LM",            # sequence-to-sequence LM (T5 summarization)
  r=8,                                 # low adapter rank for efficiency
  lora_alpha=32,                       # scaling numerator paired with r=8
  lora_dropout=0.1,                    # moderate regularization on the adapter path
  bias="none",                         # keep the original bias terms untouched
  target_modules=lora_target_modules,
)

# Wrap the base model so the LoRA adapters are attached to it.
model = get_peft_model(model, lora_config)

In [None]:
# Report the trainable-parameter summary of the LoRA-wrapped model.
model.print_trainable_parameters()

In [None]:
# Read the Weights & Biases API key from the Colab secrets store and log in
# with it; if the value is empty, only report that no key was found.
wandb_api_key = userdata.get('WANDB_API_KEY')

if wandb_api_key:
  wandb.login(key=wandb_api_key)
else:
  print('Wandb API key not found in environment variables!')

In [None]:
# Resolve the W&B entity (user or team name): prefer the Colab secret and fall
# back to asking interactively.
try:
  wandb_entity = userdata.get('WANDB_ENTITY')
except userdata.SecretNotFoundError:
  # The secret is not defined in this Colab account; ask for it below instead.
  wandb_entity = None

if not wandb_entity:
  wandb_entity = input("Enter wandb entity name: ").strip()

# Always start the run. Previously wandb.init() sat in the `else` branch, so an
# entity typed at the prompt was never used and the run was not initialised
# with this project/config.
wandb.init(
  project="trustify",
  entity=wandb_entity or None,  # empty answer -> let wandb use the default entity
  config={
    "epochs": 3,
    # Effective batch size: per_device_train_batch_size (4) x
    # gradient_accumulation_steps (2) from the TrainingArguments cell.
    "batch_size": 8,
    "lr": 3e-4
  }
)

In [None]:
# Hyper-parameters and bookkeeping for the Trainer.
training_args = TrainingArguments(
  output_dir="./t5_results",
  num_train_epochs=3,
  per_device_train_batch_size=4,
  per_device_eval_batch_size=4,
  warmup_steps=500,
  weight_decay=0.01,
  logging_dir="./t5_logs",
  logging_steps=10,
  learning_rate=3e-4,
  report_to=["wandb"],
  save_steps=500,  # Save every 500 steps
  eval_steps=500,  # Evaluate every 500 steps (matching save_steps)
  save_total_limit=2,
  # Restore the checkpoint with the lowest validation loss when training ends
  load_best_model_at_end=True,
  metric_for_best_model="eval_loss",
  greater_is_better=False,
  # T5-specific additions
  gradient_accumulation_steps=2,  # effective batch of 8 per device (4 x 2)
  # Mixed precision needs a CUDA device; enable it only when one is available
  # so the notebook still runs on CPU (the other cells guard on
  # torch.cuda.is_available() in the same way).
  fp16=torch.cuda.is_available(),
  dataloader_num_workers=2,
  # Align evaluation and save strategy (required by load_best_model_at_end)
  eval_strategy="steps",  # Changed from default "no"
  save_strategy="steps",        # Explicitly set to match evaluation
  # Label smoothing for T5's generation tasks
  label_smoothing_factor=0.1,
  # Gradient clipping for stability
  max_grad_norm=1.0,
  # Learning rate scheduler
  lr_scheduler_type="linear",
)

In [None]:
# Seq2seq batch builder: pads inputs and labels per batch rather than to a
# global maximum length.
collator_options = {
  "model": model,
  "padding": True,            # dynamic padding to the longest sequence in the batch
  "pad_to_multiple_of": 8,    # round padded lengths up to a multiple of 8
  "return_tensors": "pt",     # produce PyTorch tensors
}
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, **collator_options)

In [None]:
# Assemble the Trainer: LoRA-wrapped model, training arguments, tokenized
# train/validation splits, and the seq2seq collator that builds padded batches.
trainer_options = dict(
    args=training_args,
    model=model,
    data_collator=data_collator,   # needed because examples were tokenized without padding
    train_dataset=train_dataset,
    eval_dataset=val_dataset,      # validation split drives the periodic evaluation
)
trainer = Trainer(**trainer_options)

In [None]:
# Run fine-tuning with the Trainer configured in the previous cell. Evaluation
# and checkpointing happen every 500 steps as set in `training_args`.
trainer.train()

In [None]:
import torch
import evaluate

# Load both metrics
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

# Limit evaluation to at most the first 300 test samples for efficiency.
# Clamp to the split size so a smaller test set does not make select() fail.
eval_limit = 300
test_subset = test_dataset.select(range(min(eval_limit, len(test_dataset))))

# generate() does not change the train/eval mode of the model, so switch to
# eval mode explicitly: otherwise dropout (including the LoRA dropout) can stay
# active after training and make the generated summaries noisy.
model.eval()

# Generate one summary per sample; inputs are already tokenized (and carry the
# "summarize: " prefix) from the preprocessing step.
generated_summaries = []
for example in test_subset:
    input_ids = torch.tensor(example["input_ids"]).unsqueeze(0)  # add batch dimension

    # Move input_ids to GPU if available
    if torch.cuda.is_available():
        input_ids = input_ids.to('cuda')

    # Generate summary without tracking gradients
    with torch.no_grad():
        generated_ids = model.generate(input_ids=input_ids, max_length=128)

    # Decode the generated summary
    generated_summaries.append(
        tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    )

# Prepare reference summaries by decoding the tokenized labels.
reference_summaries = []
for example in test_subset:
    # Replace -100 (the loss-ignore index) with pad_token_id so decoding works
    # even if the labels were padded.
    labels = [
        token_id if token_id != -100 else tokenizer.pad_token_id
        for token_id in example["labels"]
    ]
    reference_summaries.append(tokenizer.decode(labels, skip_special_tokens=True))

# Calculate both metrics
rouge_scores = rouge.compute(predictions=generated_summaries, references=reference_summaries)
bertscore_scores = bertscore.compute(
    predictions=generated_summaries,
    references=reference_summaries,
    lang="id"
)

In [None]:
# Report the ROUGE scores, then the mean of the per-sample BERTScore F1 values.
for metric_label, metric_key in (
    ("ROUGE-1", "rouge1"),
    ("ROUGE-2", "rouge2"),
    ("ROUGE-L", "rougeL"),
):
    print(f"{metric_label}: {rouge_scores[metric_key]:.4f}")

bertscore_f1_values = bertscore_scores['f1']
print(f"BERTScore F1: {sum(bertscore_f1_values) / len(bertscore_f1_values):.4f}")

## Interpretation

**ROUGE Scores Analysis:**
- **ROUGE-1 (0.5161):** Moderate unigram overlap between generated and reference summaries (unigram F1 of 51.6%)
- **ROUGE-2 (0.3154):** Lower bigram overlap (bigram F1 of 31.5%), indicating some phrase disruption
- **ROUGE-L (0.4207):** Longest-common-subsequence F1 of 42.1%, showing reasonable structural alignment

**BERTScore Analysis:**
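- **BERTScore F1 (0.7955):** Fairly high semantic similarity between generated and reference summaries, indicating generated summaries capture meaning well despite surface-level differences. This is the raw score (computed without baseline rescaling), which tends to fall in a narrow, high range, so it is best compared against other systems rather than read as a percentage.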

**Interpretation:**
- **Good semantic quality:** BERTScore suggests summaries preserve meaning effectively
- **Moderate lexical precision:** ROUGE scores indicate reasonable word choice but potential paraphrasing
- **Phrase-level challenges:** Lower ROUGE-2 suggests some disruption in phrase structure
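- **Overall positive:** BERTScore is high while ROUGE is moderate, which is consistent with good meaning preservation despite different wording (the two metrics are on different scales, so their values are not directly comparable)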

**Quality Assessment:**
The model produces summaries that capture the core meaning (high BERTScore) but often uses different wording than references (lower ROUGE). This suggests good abstractive capabilities rather than simple copying, which is desirable for summarization. The 0.7955 BERTScore indicates strong semantic alignment with reference summaries, suggesting quality output for Indonesian summarization.

In [None]:
# Send the final test metrics to the active wandb run as one log record.
final_metrics = {
    f"eval_{rouge_key}": rouge_scores[rouge_key]
    for rouge_key in ("rouge1", "rouge2", "rougeL")
}
# BERTScore returns one F1 per sample; log their mean.
final_metrics["eval_bertscore_f1"] = sum(bertscore_scores['f1']) / len(bertscore_scores['f1'])

wandb.log(final_metrics)

In [None]:
# Save PEFT adapters after training
trainer.save_model("./t5_lora_adapters")

# Find the most recent log record that carries "train_loss". Indexing
# log_history[-1] directly raises KeyError whenever the last record is an
# evaluation or step log rather than the training summary.
final_train_loss = next(
    (
        record["train_loss"]
        for record in reversed(trainer.state.log_history)
        if "train_loss" in record
    ),
    None,
)

# Mean of the per-sample BERTScore F1 values
bertscore_f1_mean = sum(bertscore_scores['f1']) / len(bertscore_scores['f1'])

# Create wandb artifact for PEFT adapters, tagged with the run id and annotated
# with the training setup and the test metrics computed earlier.
lora_artifact = wandb.Artifact(
    name=f"t5-indonesian-lora-{wandb.run.id}",
    type="model",
    description="LoRA adapter weights for Indonesian summarization model",
    metadata={
        "architecture": "t5",
        "peft_method": "lora",
        "training_epochs": 3,
        "batch_size": 4,  # per-device batch size (before gradient accumulation)
        "learning_rate": 3e-4,
        "final_train_loss": final_train_loss,
        "rouge1_f1": rouge_scores["rouge1"],
        "rouge2_f1": rouge_scores["rouge2"],
        "rougeL_f1": rouge_scores["rougeL"],
        "bertscore_f1": bertscore_f1_mean
    }
)

# Attach the saved adapter directory and upload the artifact to the run
lora_artifact.add_dir("./t5_lora_adapters")
wandb.log_artifact(lora_artifact)

print(f"LoRA adapters saved as artifact: {lora_artifact.name}")

In [None]:
def test_summarization(input_text, max_input_length=512, max_output_length=128):
    """
    Generate summary for a given input text using the trained model.
    """
    # Tokenize input text
    inputs = tokenizer(
        input_text,
        max_length=max_input_length,
        truncation=True,
        padding=False,
        return_tensors="pt"
    )

    # Move to GPU if available
    if torch.cuda.is_available():
        inputs = {k: v.to('cuda') for k, v in inputs.items()}

    # Generate summary
    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_output_length,
            num_beams=4,  # Use beam search for better quality
            length_penalty=2.0,  # Penalize longer sequences
            early_stopping=True,
            no_repeat_ngram_size=2  # Prevent repetitive n-grams
        )

    # Decode generated summary
    generated_summary = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    return generated_summary

def interactive_summarization_loop():
    """
    Interactive loop for manual summarization testing with user input.

    Repeatedly prompts for text, prints the summary produced by
    `test_summarization`, and stops when the user types 'quit', presses
    Ctrl-C, or the input stream is closed (EOF). Errors raised while
    summarizing are reported and the loop continues with the next prompt.
    Errors raised by the prompt itself (other than EOF / Ctrl-C) are not
    swallowed, so an unavailable stdin ends the loop instead of spinning
    forever.
    """
    print("Interactive Summarization Testing")
    print("Enter text to summarize (type 'quit' to exit):")

    while True:
        # Read input outside the generic error handler: if input() keeps
        # failing (closed or unavailable stdin), catching the error and
        # continuing would loop endlessly.
        try:
            user_input = input("\nEnter text: ").strip()
        except KeyboardInterrupt:
            print("\nInterrupted by user. Exiting.")
            break
        except EOFError:
            print("\nInput stream closed. Exiting.")
            break

        if user_input.lower() == 'quit':
            print("Exiting summarization loop.")
            break

        if not user_input:
            print("Please enter valid text.")
            continue

        # Best effort: a failure on one text must not end the session
        try:
            summary = test_summarization(user_input)
        except KeyboardInterrupt:
            print("\nInterrupted by user. Exiting.")
            break
        except Exception as e:
            print(f"Error during summarization: {e}")
            continue

        print(f"\nGenerated Summary: {summary}")
        print("-" * 80)

# Run interactive testing. This call blocks waiting for typed input, so
# comment it out for unattended "Run All" executions.
interactive_summarization_loop()

Input:

> REPUBLIKA.CO.ID, WASHINGTON -- Arab Saudi dan Amerika Serikat (AS) menandatangani serangkaian perjanjian penting di Washington pada Selasa (18/11/2025). Perjanjian tersebut menggarisbawahi kemitraan strategis dalam kecerdasan buatan (AI) dan pernyataan bersama yang menandai selesainya negosiasi mengenai kerja sama nuklir sipil.  Perjanjian tersebut mencerminkan peningkatan signifikan dalam kerja sama teknologi dan ekonomi antara kedua negara, khususnya di bidang industri maju, keamanan energi, dan teknologi baru. Bersamaan dengan kerangka kerja AI dan nuklir, kedua belah pihak mendukung pengaturan baru untuk memperkuat ketahanan rantai pasokan untuk uranium, mineral penting, dan magnet permanen, serta inisiatif untuk mempercepat investasi Saudi di Amerika Serikat dan memperluas kerja sama keuangan, ekonomi, pendidikan, dan regulasi.  Selama pertemuan puncak tersebut, Putra Mahkota dan Perdana Menteri Saudi, Mohammed bin Salman (MBS) dan Presiden AS Donald Trump meninjau hubungan bilateral dan membahas cara-cara untuk meningkatkan kemitraan strategis Saudi–AS di seluruh sektor prioritas.  Seperti dilaporkan Saudi Gazette, mereka juga bertukar pandangan mengenai perkembangan regional dan internasional dengan penekanan pada penguatan keamanan, stabilitas, dan pertumbuhan ekonomi.  Presiden Trump secara resmi menyambut Putra Mahkota di Gedung Putih dengan upacara lengkap yang meliputi pengawalan kavaleri, penghormatan 19 senjata, band militer yang membawakan lagu kebangsaan Saudi dan Amerika, serta pertunjukan pesawat tempur untuk menghormati kunjungan tersebut.  Kedua pemimpin kemudian mengunjungi beberapa bagian Gedung Putih sebelum memulai pembicaraan resmi mereka.  
Delegasi Arab Saudi terdiri dari Menteri Energi dan Ketua Komite Kemitraan Ekonomi Strategis Saudi–AS Pangeran Abdulaziz bin Salman, Duta Besar Saudi untuk Amerika Serikat Putri Reema binti Bandar, Menteri Luar Negeri Pangeran Faisal bin Farhan, Penasihat Keamanan Nasional Musaed Al-Aiban, Menteri Perdagangan Majid Al-Qasabi, Menteri Keuangan Mohammed Al-Jadaan, dan Gubernur Dana Investasi Publik Yasir Al-Rumayyan.  Dari pihak AS, peserta meliputi Wakil Presiden JD Vance, Menteri Luar Negeri Marco Rubio, Menteri Perang Pete Hegseth, Menteri Keuangan Scott Bessent, Menteri Energi Chris Wright, Kepala Staf Gedung Putih Susie Wiles, dan Utusan Khusus untuk Timur Tengah Steve Witkoff.

Generated Summary:

> Arab Saudi dan AS menandatangani serangkaian perjanjian penting di Washington pada Selasa( 18/11/2025). Perjanjian tersebut menggarisbawahi kemitraan strategis dalam kecerdasan buatan( AI) dan pernyataan bersama yang menandai selesainya negosiasi mengenai kerja sama nuklir sipil. Kedua belah pihak mendukung pengaturan baru untuk memperkuat ketahanan rantai pasokan untuk uranium, mineral penting, dan magnet permanen, serta inisiatif untuk mempercepat investasi Saudi di Amerika Serikat. Pertemuan puncak tersebut, Putra Mahkota dan Perdana Menteri Saudi, Mohammed bin Salman( MBS) serta Presiden AS Donald Trump meninjau hubungan bilateral dan membahas cara- cara untuk meningkatkan kemitraan strategik Saudi– AS di seluruh sektor prioritas.
