In [None]:
!pip install evaluate
!pip install rouge_score

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m632.7 kB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.5
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=6cc767806e703d4c6c79f2a4a767116ef7166be050bbf0c42d5c7d69f289030e
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
import os
import pandas as pd
from datasets import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments

In [None]:
# Disable Weights & Biases (wandb) by setting WANDB_MODE before any training code runs
os.environ["WANDB_MODE"] = "disabled"

In [None]:
# 1. Load the Excel corpus; it must provide the 'teks' and 'summary' columns
DATA_PATH = "/content/drive/MyDrive/NLP/Dataset/korpus1000.xlsx"
REQUIRED_COLUMNS = ("teks", "summary")

df = pd.read_excel(DATA_PATH)

# Fail fast with a readable message instead of an opaque KeyError later on.
missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing_columns:
    raise KeyError(f"{DATA_PATH} is missing required column(s): {missing_columns}")

print(df.head())

                                                teks  \
0  Kadatangan padagang-padagang Arab jeung Pérsia...   
1  adat jeung budaya Minangkabau ngaguratkeun ka ...   
2  ti pangaruh agama Hindu jeung Buddha, sababara...   
3  di wewengkon kulon Pulo Jawa, dina abad ka-4 n...   
4  dina abad ka-7 aya Karajaan Malayu nu puseurna...   

                                             summary  
0  Islam asup ka Nusantara ti padagang Arab jeung...  
1           Budaya Minang ngagem sistem matrilineal.  
2  Pangaruh Hindu-Buddha nyababkeun karajaan-kara...  
3  Karajaan Tarumanagara ngadeg di kulon Jawa ant...  
4  Karajaan Malayu muncul dina abad ke-7 di Jambi...  


In [None]:
# Display the (rows, columns) shape of the loaded frame
df.shape

(1000, 2)

In [None]:
# 2. Prepare the model columns: prepend the "summarize: " task prefix to each
#    source text and rename the reference summary column to 'target_text'
df = (
    df
    .assign(input_text='summarize: ' + df['teks'])
    .rename(columns={'summary': 'target_text'})
)

In [None]:
# 3. Build a Hugging Face Dataset from the two model columns only
dataset = Dataset.from_pandas(df[['input_text', 'target_text']])

In [None]:
# 4. Tokenizer and model, both loaded from the same pretrained checkpoint name
model_name = "panggi/t5-small-indonesian-summarization-cased"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/793k [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/903 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

In [None]:
# 5. Tokenization function
def preprocess_function(examples):
    """Tokenize a batch of examples into fixed-length model inputs and labels.

    Args:
        examples: Batch mapping with 'input_text' and 'target_text' entries,
            each a list of strings.

    Returns:
        The tokenizer output for the inputs (truncated/padded to 512 tokens)
        with an added "labels" entry: the target token ids truncated/padded to
        128 positions, where every padding position is replaced by -100.
    """
    model_inputs = tokenizer(
        examples['input_text'], max_length=512, truncation=True, padding='max_length'
    )

    # `text_target=` is the supported replacement for the deprecated
    # `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples['target_text'], max_length=128, truncation=True, padding='max_length'
    )

    # -100 is the index the cross-entropy loss ignores. Without this, the loss
    # is dominated by the trivially predictable padding positions.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

# Tokenize the whole dataset; batched=True hands preprocess_function lists of examples
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]



In [None]:
# 6. Train/eval split 80:20 (fixed seed so the split is the same on every run)
split = tokenized_dataset.train_test_split(seed=42, test_size=0.2)
train_dataset, eval_dataset = split['train'], split['test']

print(f"Train size: {len(train_dataset)}, Eval size: {len(eval_dataset)}")

Train size: 800, Eval size: 200


In [None]:
import evaluate
import numpy as np
import torch
# Load the ROUGE metric through the `evaluate` library
rouge = evaluate.load("rouge")

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
def compute_metrics(eval_pred):
  """Compute ROUGE-1/2/L and the mean prediction length for an evaluation pass.

  Args:
    eval_pred: Object exposing `predictions` (whose first element holds the
      predicted token ids) and `label_ids` (the reference token ids, where
      -100 marks positions to ignore).

  Returns:
    Dict with 'rouge1', 'rouge2', 'rougeL' and 'gen_len', each rounded to
    4 decimals.
  """
  predictions = np.asarray(eval_pred.predictions[0])
  labels = np.asarray(eval_pred.label_ids)
  pad_id = tokenizer.pad_token_id

  # Predictions at ignored label positions carry no meaning; blank them out so
  # they are neither decoded nor counted in gen_len.
  if predictions.shape == labels.shape:
    predictions = np.where(labels != -100, predictions, pad_id)
  # Negative ids (such as a -100 fill value) cannot be decoded by the tokenizer.
  predictions = np.where(predictions >= 0, predictions, pad_id)
  # Replace -100 in the labels as we can't decode them
  labels = np.where(labels != -100, labels, pad_id)

  decode_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
  decode_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

  # NOTE(review): use_stemmer enables the Porter stemmer of rouge_score, which
  # targets English — confirm it is wanted for this corpus.
  result = rouge.compute(
      predictions=decode_preds,
      references=decode_labels,
      use_stemmer=True,
      rouge_types=['rouge1', 'rouge2', 'rougeL'],
  )

  # Mean number of non-padding tokens per prediction.
  result["gen_len"] = np.mean(np.count_nonzero(predictions != pad_id, axis=1))

  return {k: round(v, 4) for k, v in result.items()}

In [None]:
def preprocess_logits_for_metrics(logits, labels):
  """Reduce raw model outputs to predicted token ids.

  Args:
    logits: Either the logits tensor itself, or a tuple/list whose first
      element is the logits tensor (vocabulary scores on the last axis).
    labels: Label tensor; passed through unchanged.

  Returns:
    Tuple `(pred_ids, labels)` where `pred_ids` is the argmax of the logits
    over the last axis.
  """
  # Only unwrap containers: indexing a bare tensor with [0] would silently
  # select the first batch element instead of the logits.
  if isinstance(logits, (tuple, list)):
    logits = logits[0]
  pred_ids = torch.argmax(logits, dim=-1)
  return pred_ids, labels

In [None]:
# 7. Set up TrainingArguments
training_args = TrainingArguments(
    output_dir="./t5-finetuned",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",  # local logs
    logging_steps=100,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="rougeL",  # metric used to pick the best checkpoint
    report_to=[],                    # no external experiment trackers
    greater_is_better=True           # True because a higher ROUGE-L is better
)


# 8. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # `processing_class` is the supported replacement for the deprecated
    # `tokenizer=` argument of Trainer.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    # NOTE: `predict_with_generate` is an option of Seq2SeqTrainingArguments
    # (used with Seq2SeqTrainer), not an argument of Trainer. Here the metrics
    # are computed from the argmax produced by preprocess_logits_for_metrics.
)

# 9. Train
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Gen Len
1,0.6647,0.57936,0.3389,0.1422,0.3323,23.705
2,0.6005,0.546401,0.3342,0.1407,0.3304,23.715


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Gen Len
1,0.6647,0.57936,0.3389,0.1422,0.3323,23.705
2,0.6005,0.546401,0.3342,0.1407,0.3304,23.715
3,0.5492,0.535186,0.3374,0.1442,0.3339,23.72
4,0.5293,0.52909,0.339,0.1478,0.3353,23.72
5,0.5316,0.526029,0.3396,0.1478,0.3361,23.72


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=1000, training_loss=0.5866929702758789, metrics={'train_runtime': 10780.5446, 'train_samples_per_second': 0.371, 'train_steps_per_second': 0.093, 'total_flos': 541367205888000.0, 'train_loss': 0.5866929702758789, 'epoch': 5.0})

In [None]:
# 10. Save the fine-tuned model & tokenizer to the same local directory
model.save_pretrained("./t5-finetuned-model")
tokenizer.save_pretrained("./t5-finetuned-model")

('./t5-finetuned-model/tokenizer_config.json',
 './t5-finetuned-model/special_tokens_map.json',
 './t5-finetuned-model/spiece.model',
 './t5-finetuned-model/added_tokens.json')

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the fine-tuned model & tokenizer from the local save directory
model_path = "./t5-finetuned-model"
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained(model_path)

In [None]:
def summarize_input_text(text, model, tokenizer, max_input_length=512, max_output_length=100):
    """Generate a summary of `text` with beam search.

    Args:
        text: Raw text to summarize; the "summarize: " prefix is added here.
        model: Model exposing `generate` and `device`.
        tokenizer: Tokenizer used both to encode the input and decode the output.
        max_input_length: Inputs longer than this many tokens are truncated.
        max_output_length: `max_length` passed to `generate`.

    Returns:
        The decoded summary string with special tokens removed.
    """
    # A single sequence needs no padding; truncation alone bounds its length.
    inputs = tokenizer(
        "summarize: " + text,
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True,
    )
    # Keep the input tensors on the same device as the model weights.
    inputs = inputs.to(model.device)

    summary_ids = model.generate(
        input_ids=inputs["input_ids"],
        # Pass the mask explicitly instead of relying on generate() to infer it.
        attention_mask=inputs["attention_mask"],
        max_length=max_output_length,
        num_beams=4,
        early_stopping=True,
    )

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [None]:
# Interactive cell: read a paragraph from the user, summarize it with
# summarize_input_text, and print the result
text = input("Masukkan paragraf yang ingin diringkas:\n")
summary_t5 = summarize_input_text(text, model, tokenizer)
print("\nRingkasan:\n", summary_t5)

Masukkan paragraf yang ingin diringkas:
Kasubbag Humas Polres Sibolga, Iptu R Sormin, nyebutkeun yén taneuh urug ieu disangka lumangsung alatan hujan gede anu terus-terusan ngaguyur wilayah éta. Warga di sabudeureun lokasi ayeuna dihimbau pikeun waspada kana potensi taneuh longsor susulan. Pihak berwenang ogé geus ngirimkeun tim evakuasi pikeun ngabantosan warga anu masih kénéh aya di lokasi bahaya. Kajadian ieu nyésakeun duka anu jero pikeun kulawarga korban.

Ringkasan:
 Kasubbag Humas Polres Sibolga, Iptu R Sormin, nyebutkeun yn taneuh urug ieu.


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import re

def split_sentences(text):
    """Split text into sentences.

    A boundary is a run of whitespace that follows '.', '?' or '!' and is
    followed by an uppercase letter. Uppercase is tested with `str.isupper`,
    so non-ASCII capitals such as 'É' also start a new sentence.

    Args:
        text: Text to split; leading/trailing whitespace is ignored.

    Returns:
        List of non-blank sentence strings in their original order.
    """
    text = text.strip()
    sentences = []
    start = 0
    for gap in re.finditer(r'(?<=[.?!])\s+', text):
        following = text[gap.end():gap.end() + 1]
        if following.isupper():
            sentences.append(text[start:gap.start()])
            start = gap.end()
    sentences.append(text[start:])
    return [s for s in sentences if len(s.strip()) > 0]

def summarize_tfidf(text, num_sentences=3):
    """Extractive summary made of the highest TF-IDF-scoring sentences.

    Each sentence is scored by the sum of its TF-IDF weights, with the
    vectorizer fitted on the sentences of `text` itself.

    Args:
        text: Text to summarize.
        num_sentences: Number of sentences to keep.

    Returns:
        An empty string when `num_sentences` is not positive; `text` unchanged
        when it has at most `num_sentences` sentences; otherwise the selected
        sentences joined by single spaces in their original order.
    """
    # Without this guard, a slice of [-0:] below would select every sentence.
    if num_sentences <= 0:
        return ""

    sentences = split_sentences(text)
    if len(sentences) <= num_sentences:
        return text

    tfidf_matrix = TfidfVectorizer().fit_transform(sentences)
    # Row sums give one score per sentence; flatten the result to 1-D.
    sentence_scores = np.asarray(tfidf_matrix.sum(axis=1)).ravel()

    top_indices = sentence_scores.argsort()[-num_sentences:]
    # Emit the chosen sentences in document order, not score order.
    return " ".join(sentences[i] for i in sorted(top_indices))


In [None]:
# Interactive cell: read a text from the user, summarize it with
# summarize_tfidf (default number of sentences), and print the result
text_input = input("Masukkan teks yang ingin diringkas:\n")
summary_tfidf = summarize_tfidf(text_input)
print("\n📄 Ringkasan TF-IDF:\n", summary_tfidf)

Masukkan teks yang ingin diringkas:
Kasubbag Humas Polres Sibolga, Iptu R Sormin, nyebutkeun yén taneuh urug ieu disangka lumangsung alatan hujan gede anu terus-terusan ngaguyur wilayah éta. Warga di sabudeureun lokasi ayeuna dihimbau pikeun waspada kana potensi taneuh longsor susulan. Pihak berwenang ogé geus ngirimkeun tim evakuasi pikeun ngabantosan warga anu masih kénéh aya di lokasi bahaya. Kajadian ieu nyésakeun duka anu jero pikeun kulawarga korban.

📄 Ringkasan TF-IDF:
 Kasubbag Humas Polres Sibolga, Iptu R Sormin, nyebutkeun yén taneuh urug ieu disangka lumangsung alatan hujan gede anu terus-terusan ngaguyur wilayah éta. Warga di sabudeureun lokasi ayeuna dihimbau pikeun waspada kana potensi taneuh longsor susulan. Pihak berwenang ogé geus ngirimkeun tim evakuasi pikeun ngabantosan warga anu masih kénéh aya di lokasi bahaya.


In [None]:
import evaluate
import numpy as np

# Load the ROUGE metric; `rouge` is the module-level name hitung_rouge reads
rouge = evaluate.load("rouge")

def hitung_rouge(prediksi, referensi):
    """Score a single predicted text against a single reference with ROUGE.

    Args:
        prediksi: Predicted (generated) text.
        referensi: Reference text to compare against.

    Returns:
        Dict keyed by 'rouge1', 'rouge2' and 'rougeL' with each score rounded
        to 4 decimals.
    """
    scores = rouge.compute(
        predictions=[prediksi],
        references=[referensi],
        rouge_types=['rouge1', 'rouge2', 'rougeL'],
        use_stemmer=True,
    )

    rounded = {}
    for metric_name, value in scores.items():
        rounded[metric_name] = round(value, 4)
    return rounded


In [None]:
# Score both summaries with ROUGE and print the results.
# NOTE(review): the reference passed here is the text the user typed in
# (`text` / `text_input`), i.e. the source being summarized, not a separate
# reference summary. The scores therefore measure overlap with the source and
# favour summaries that copy it verbatim — TODO confirm whether a gold
# reference summary should be used instead.
score_t5 = hitung_rouge(summary_t5, text)
score_tfidf = hitung_rouge(summary_tfidf, text_input)

print("ROUGE T5:", score_t5)
print("ROUGE TF-IDF:", score_tfidf)

ROUGE T5: {'rouge1': np.float64(0.2821), 'rouge2': np.float64(0.2368), 'rougeL': np.float64(0.2821)}
ROUGE TF-IDF: {'rouge1': np.float64(0.918), 'rouge2': np.float64(0.9167), 'rougeL': np.float64(0.918)}
