# **Paraphrase Generation**

# **Datasets**

## Quora Question Pairs

Dataset Quora Question Pairs2 adalah kumpulan pasangan pertanyaan dari situs web tanya jawab komunitas Quora. Tugasnya adalah menentukan apakah suatu pasangan pertanyaan setara secara semantik.

    'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
    'question1': Text(shape=(), dtype=tf.string),
    'question2': Text(shape=(), dtype=tf.string),


## Fine Tuner
















In [None]:
!pip install pytorch-lightning
!pip install transformers

Collecting pytorch-lightning
  Downloading pytorch_lightning-2.6.0-py3-none-any.whl.metadata (21 kB)
Collecting torchmetrics>0.7.0 (from pytorch-lightning)
  Downloading torchmetrics-1.8.2-py3-none-any.whl.metadata (22 kB)
Collecting lightning-utilities>=0.10.0 (from pytorch-lightning)
  Downloading lightning_utilities-0.15.2-py3-none-any.whl.metadata (5.7 kB)
Downloading pytorch_lightning-2.6.0-py3-none-any.whl (849 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m849.5/849.5 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lightning_utilities-0.15.2-py3-none-any.whl (29 kB)
Downloading torchmetrics-1.8.2-py3-none-any.whl (983 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m983.2/983.2 kB[0m [31m43.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lightning-utilities, torchmetrics, pytorch-lightning
Successfully installed lightning-utilities-0.15.2 pytorch-lightning-2.6.0 torchmetrics-1.8.2


In [None]:
import argparse
import csv
import glob
import json
import logging
import os
import random
import re
import time
from itertools import chain
from string import punctuation

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl


from transformers import T5ForConditionalGeneration, T5Tokenizer, get_linear_schedule_with_warmup
from torch.optim import AdamW

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; later cells save checkpoints and the
# exported model under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def set_seed(seed):
  """Seed the Python, NumPy and PyTorch random number generators.

  Args:
    seed: Integer seed applied to ``random``, ``numpy.random`` and ``torch``
      (and to every CUDA device when CUDA is available).
  """
  for seeder in (random.seed, np.random.seed, torch.manual_seed):
    seeder(seed)
  if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

set_seed(42)

In [None]:
class T5FineTuner(pl.LightningModule):
    """LightningModule that fine-tunes a T5 model on source/target text pairs.

    ``hparams`` is a namespace-like object. The attributes read by this class are
    ``model_name_or_path``, ``tokenizer_name_or_path``, ``weight_decay``,
    ``learning_rate``, ``adam_epsilon``, ``warmup_steps``, ``data_dir``,
    ``train_file_name``, ``eval_file_name``, ``max_seq_length``,
    ``train_batch_size`` and ``eval_batch_size``.
    """

    def __init__(self, hparams):
        """Store the hyper-parameters and load the pretrained model and tokenizer."""
        super().__init__()
        self.params = hparams
        self.save_hyperparameters(hparams)

        self.model = T5ForConditionalGeneration.from_pretrained(
            hparams.model_name_or_path
        )
        self.tokenizer = T5Tokenizer.from_pretrained(
            hparams.tokenizer_name_or_path
        )

    def forward(
        self,
        input_ids,
        attention_mask=None,
        decoder_input_ids=None,
        decoder_attention_mask=None,
        labels=None,
    ):
        """Forward every argument unchanged to the wrapped T5 model and return its output."""
        return self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            labels=labels,
        )

    def is_logger(self):
        """Return True when this process has global rank 0 (or lower).

        Used by ``LoggingCallback`` to decide whether metrics should be logged.
        """
        return self.trainer.global_rank <= 0

    def _step(self, batch):
        """Run one forward pass on ``batch`` and return the loss.

        ``batch`` must provide ``source_ids``, ``source_mask``, ``target_ids``
        and ``target_mask``.
        """
        target_ids = batch["target_ids"]
        # Replace pad positions with -100 so they are ignored by the loss.
        # masked_fill returns a new tensor, so the caller's batch is not modified.
        labels = target_ids.masked_fill(
            target_ids == self.tokenizer.pad_token_id, -100
        )

        outputs = self(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            labels=labels,
            decoder_attention_mask=batch["target_mask"],
        )

        # With labels supplied, the first element of the model output is the loss.
        loss = outputs[0]
        return loss

    def training_step(self, batch, batch_idx):
        """Compute the training loss and log it per step and per epoch."""
        loss = self._step(batch)
        # Logged at both step and epoch level; Lightning does the aggregation.
        self.log(
            "train_loss",
            loss,
            prog_bar=True,
            on_step=True,
            on_epoch=True,
        )
        return loss  # recent Lightning versions only need the loss tensor

    def validation_step(self, batch, batch_idx):
        """Compute the validation loss and log it at epoch level only."""
        loss = self._step(batch)
        self.log(
            "val_loss",
            loss,
            prog_bar=True,
            on_step=False,
            on_epoch=True,
        )
        return loss

    def configure_optimizers(self):
        """Prepare optimizer and schedule (linear warmup and decay).

        Returns:
            A dict with the AdamW optimizer and a per-step linear scheduler.
        """
        model = self.model
        # Parameters whose name contains one of these fragments get no weight decay.
        # T5 names its layer-norm parameters "...layer_norm.weight", which the
        # BERT-style "LayerNorm.weight" pattern does not match, so it is listed too.
        no_decay = ["bias", "LayerNorm.weight", "layer_norm.weight"]

        optimizer_grouped_parameters = [
            {
                "params": [
                    p
                    for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay": self.params.weight_decay,
            },
            {
                "params": [
                    p
                    for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.0,
            },
        ]

        optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=self.params.learning_rate,
            eps=self.params.adam_epsilon,
        )

        # The trainer's estimate of optimizer steps is used as the schedule length.
        num_training_steps = self.trainer.estimated_stepping_batches

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.params.warmup_steps,
            num_training_steps=num_training_steps,
        )

        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": scheduler,
                "interval": "step",
                "frequency": 1,
            },
        }

    def get_tqdm_dict(self):
        """Return an empty dict; kept only for compatibility with older code."""
        return {}

    def train_dataloader(self):
        """Build the shuffled training DataLoader from the configured training file."""
        train_dataset = CustomDataset(
            tokenizer=self.tokenizer,
            type_path=self.params.train_file_name,
            data_dir=self.params.data_dir,
            max_len=self.params.max_seq_length,
        )
        dataloader = DataLoader(
            train_dataset,
            batch_size=self.params.train_batch_size,
            drop_last=True,
            shuffle=True,
            num_workers=4,
        )
        return dataloader

    def val_dataloader(self):
        """Build the validation DataLoader from the configured evaluation file."""
        val_dataset = CustomDataset(
            tokenizer=self.tokenizer,
            type_path=self.params.eval_file_name,
            data_dir=self.params.data_dir,
            max_len=self.params.max_seq_length,
        )
        return DataLoader(
            val_dataset,
            batch_size=self.params.eval_batch_size,
            num_workers=4,
        )


In [None]:
logger = logging.getLogger(__name__)

class LoggingCallback(pl.Callback):
  """Lightning callback that reports the trainer's callback metrics via ``logger``."""

  def on_validation_end(self, trainer, pl_module):
    """Log every callback metric when validation ends."""
    logger.info("***** Validation results *****")
    if not pl_module.is_logger():
      return
    metrics = trainer.callback_metrics
    for key in sorted(metrics):
      if key in ("log", "progress_bar"):
        continue
      logger.info("{} = {}\n".format(key, str(metrics[key])))

  def on_test_end(self, trainer, pl_module):
    """Log every callback metric and write it to ``test_results.txt`` in the output dir."""
    logger.info("***** Test results *****")
    if not pl_module.is_logger():
      return
    metrics = trainer.callback_metrics

    output_test_results_file = os.path.join(pl_module.hparams.output_dir, "test_results.txt")
    with open(output_test_results_file, "w") as writer:
      for key in sorted(metrics):
        if key in ("log", "progress_bar"):
          continue
        line = "{} = {}\n".format(key, str(metrics[key]))
        logger.info(line)
        writer.write(line)


In [None]:
# Default hyper-parameters. A later cell overrides some entries and wraps the
# dict in an argparse.Namespace.
args_dict = {
    "data_dir": "",  # path for data files
    "output_dir": "",  # path to save the checkpoints
    "model_name_or_path": "t5-base",
    "tokenizer_name_or_path": "t5-base",
    "max_seq_length": 512,
    "learning_rate": 3e-4,
    "weight_decay": 0.1,
    "adam_epsilon": 1e-8,
    "warmup_steps": 0,
    "train_batch_size": 6,
    "eval_batch_size": 6,
    "num_train_epochs": 2,
    "gradient_accumulation_steps": 16,
    "n_gpu": 1,
    # "early_stop_callback": False,
    "fp_16": False,  # True selects 16-bit precision in the Trainer config, False 32-bit
    "opt_level": "O2",  # apex AMP optimisation level, see https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
    "max_grad_norm": 1.0,  # passed to the Trainer as the gradient clipping value
    "seed": 42,
}

In [None]:
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer

# Module-level tokenizer; the inference cell further down uses this `tokenizer`
# name directly for encoding and decoding.
tokenizer = T5Tokenizer.from_pretrained("t5-base")


class CustomDataset(Dataset):
    """Paraphrase pairs read from ``<data_dir>/<type_path>.csv`` and tokenised on access.

    Column 0 of each CSV row is the source sentence and column 1 the target
    sentence. A leading ``question1,question2`` header row is skipped.

    Args:
        tokenizer: Callable tokenizer invoked as
            ``tokenizer(text, max_length=..., padding="max_length",
            truncation=True, return_tensors="pt")``; it must return a mapping
            with ``input_ids`` and ``attention_mask``.
        data_dir: Directory containing the CSV file. An empty string (or None)
            means the current working directory.
        type_path: CSV file name without the ``.csv`` extension.
        max_len: Length every sequence is padded / truncated to.
    """

    def __init__(self, tokenizer, data_dir, type_path, max_len=256):
        # Names of the header columns; used to recognise and drop the header row.
        self.source_column = "question1"
        self.target_column = "question2"

        csv_path = os.path.join(data_dir or "", type_path + ".csv")

        # newline="" lets the csv module handle line breaks inside quoted fields.
        with open(csv_path, "r", encoding="utf-8", newline="") as csv_file:
            rows = [
                row
                for row in csv.reader(csv_file, delimiter=",")
                if len(row) >= 2  # a usable row needs both sentences
            ]

        # The header row is not a training example; drop it when present.
        if rows and rows[0][:2] == [self.source_column, self.target_column]:
            rows = rows[1:]

        self.data = rows
        self.max_len = max_len
        self.tokenizer = tokenizer

        # Filled only by the optional _build() pre-tokenisation pass.
        self.inputs = []
        self.targets = []

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.data)

    def _format_pair(self, row):
        """Return the (source, target) strings for ``row`` in the T5 paraphrase format."""
        return "paraphrase: " + row[0] + " </s>", row[1] + " </s>"

    def _encode(self, text):
        """Tokenise ``text`` to ``max_len`` tokens and return the tokenizer output."""
        return self.tokenizer(
            text,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

    def __getitem__(self, index):
        """Tokenise pair ``index`` and return its ids and attention masks.

        Returns:
            Dict with ``source_ids``, ``source_mask``, ``target_ids`` and
            ``target_mask``, each with the leading batch dimension removed.
        """
        source_text, target_text = self._format_pair(self.data[index])

        tokenized_inputs = self._encode(source_text)
        tokenized_targets = self._encode(target_text)

        # Only the batch dimension added by return_tensors="pt" is removed.
        return {
            "source_ids": tokenized_inputs["input_ids"].squeeze(0),
            "source_mask": tokenized_inputs["attention_mask"].squeeze(0),
            "target_ids": tokenized_targets["input_ids"].squeeze(0),
            "target_mask": tokenized_targets["attention_mask"].squeeze(0),
        }

    def _build(self):
        """Pre-tokenise every pair into ``self.inputs`` / ``self.targets``.

        Not called by ``__getitem__``, which tokenises on demand. Each call
        rebuilds both lists from scratch.
        """
        self.inputs, self.targets = [], []
        for row in self.data:
            source_text, target_text = self._format_pair(row)
            self.inputs.append(self._encode(source_text))
            self.targets.append(self._encode(target_text))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
# def get_dataset(tokenizer, type_path, args):
#   return CustomDataset(tokenizer=tokenizer, data_dir=args.data_dir, type_path=type_path,  max_len=args.max_seq_length)

## Quora Question Pairs

In [None]:
import tensorflow_datasets as tfds


## Load Dataset QQP (GLUE)

In [None]:
# Load the train / validation / test splits of GLUE QQP through TensorFlow
# Datasets; with_info=True also returns the dataset metadata as ds_info.
(ds_train, ds_validation, ds_test), ds_info = tfds.load(
    "glue/qqp",
    split=["train", "validation", "test"],
    with_info=True
)



Downloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to /root/tensorflow_datasets/glue/qqp/2.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/glue/qqp/incomplete.HBRPOI_2.0.0/glue-train.tfrecord*...:   0%|          |…

Generating validation examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/glue/qqp/incomplete.HBRPOI_2.0.0/glue-validation.tfrecord*...:   0%|      …

Generating test examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/glue/qqp/incomplete.HBRPOI_2.0.0/glue-test.tfrecord*...:   0%|          | …

Dataset glue downloaded and prepared to /root/tensorflow_datasets/glue/qqp/2.0.0. Subsequent calls will reuse this data.


## Cek Informasi Dataset

In [None]:
print(ds_info)


tfds.core.DatasetInfo(
    name='glue',
    full_name='glue/qqp/2.0.0',
    description="""
    GLUE, the General Language Understanding Evaluation benchmark
    (https://gluebenchmark.com/) is a collection of resources for training,
    evaluating, and analyzing natural language understanding systems.
    """,
    config_description="""
    The Quora Question Pairs2 dataset is a collection of question pairs from the
    community question-answering website Quora. The task is to determine whether a
    pair of questions are semantically equivalent.
    """,
    homepage='https://data.quora.com/First-Quora-Dataset-Release-Question-Pairs',
    data_dir='/root/tensorflow_datasets/glue/qqp/2.0.0',
    file_format=tfrecord,
    download_size=39.76 MiB,
    dataset_size=150.37 MiB,
    features=FeaturesDict({
        'idx': int32,
        'label': ClassLabel(shape=(), dtype=int64, num_classes=2),
        'question1': Text(shape=(), dtype=string),
        'question2': Text(shape=(), dtype=str

## Inisialisasi Penampung Data

In [None]:
# Containers for the (question1, question2) string pairs of each split.
train_examples = []
test_examples = []


## Filter Data TRAIN (Label = 1 / Duplikat)

In [None]:
# Keep only the pairs labelled 1 (duplicates) and decode both questions to str.
# The list is rebuilt rather than appended to, so re-running this cell does not
# duplicate the training data.
train_examples = [
    (
        example["question1"].numpy().decode(),
        example["question2"].numpy().decode(),
    )
    for example in ds_train
    if example["label"] == 1
]


## Filter Data VALIDATION (Sebagai Test Set)

In [None]:
# Keep only the validation pairs labelled 1 (duplicates); they act as the test set.
# The list is rebuilt rather than appended to, so re-running this cell does not
# duplicate the test data.
test_examples = [
    (
        example["question1"].numpy().decode(),
        example["question2"].numpy().decode(),
    )
    for example in ds_validation
    if example["label"] == 1
]


## Sampel Data Training

In [None]:
# Show the first five training pairs.
print("=== Contoh Sampel Data TRAIN ===\n")

for sample_no in range(1, 6):
    q1, q2 = train_examples[sample_no - 1]
    print(f"Sample {sample_no}")
    print(f"Question 1: {q1}")
    print(f"Question 2: {q2}")
    print(50 * "-")


=== Contoh Sampel Data TRAIN ===

Sample 1
Question 1: What is the best sex youve ever had?
Question 2: What's the best sexual experience you have had?
--------------------------------------------------
Sample 2
Question 1: What are some TV shows/ movies with a scene where a female falls, specifically, face-first into something like a cake, a mud puddle, pie, etc.?
Question 2: What are some TV shows/movies with a scene where a female falls face-first into something like a cake, a mud puddle, pie, etc.?
--------------------------------------------------
Sample 3
Question 1: How can I become rich in short time?
Question 2: How can I become rich man?
--------------------------------------------------
Sample 4
Question 1: What is the concept behind phase angles in ac circuits?
Question 2: What is phase angle in AC circuit?
--------------------------------------------------
Sample 5
Question 1: What are the best way to sustain your motivation?
Question 2: What are the ways to sustain your m

In [None]:
# Show the first five test pairs (taken from the validation split).
print("\n=== Contoh Sampel Data TEST (Validation) ===\n")

for sample_no in range(1, 6):
    q1, q2 = test_examples[sample_no - 1]
    print(f"Sample {sample_no}")
    print(f"Question 1: {q1}")
    print(f"Question 2: {q2}")
    print(50 * "-")



=== Contoh Sampel Data TEST (Validation) ===

Sample 1
Question 1: Who is going to be a better president - Hillary Clinton or Donald Trump?
Question 2: In what aspects is Hillary Clinton better than Trump?
--------------------------------------------------
Sample 2
Question 1: What is the best language to learn programming?
Question 2: What is the best programming language for beginners to learn?
--------------------------------------------------
Sample 3
Question 1: How can I control my emotions and actions?
Question 2: How can I control my emotions?
--------------------------------------------------
Sample 4
Question 1: What are the features linux better than windows?
Question 2: Why is Linux better than Windows for programmers?
--------------------------------------------------
Sample 5
Question 1: Which is the best book for c programming?
Question 2: Which is the best C programming book for a first time reader?
--------------------------------------------------


In [None]:
# Report how many pairs each split holds.
print(f"Jumlah data train : {len(train_examples)}")
print(f"Jumlah data test  : {len(test_examples)}")

Jumlah data train : 134378
Jumlah data test  : 14885


## Simpan Data TRAIN dan TEST ke CSV

In [None]:
import csv

# Write each split to its own CSV file: a header row followed by one
# (question1, question2) row per pair. Train.csv first, then Test.csv.
for csv_name, pairs in (("Train.csv", train_examples), ("Test.csv", test_examples)):
    with open(csv_name, 'w', newline='', encoding='utf-8') as out:
        csv_out = csv.writer(out)
        csv_out.writerow(['question1', 'question2'])  # header
        csv_out.writerows([q1, q2] for q1, q2 in pairs)


## Sampel Data

In [None]:
# Show the first three pairs of each split.
print("=== Sampel Data TRAIN ===\n")
for sample_no in range(1, 4):
    q1, q2 = train_examples[sample_no - 1]
    print(f"Sample {sample_no}")
    print(f"Q1: {q1}")
    print(f"Q2: {q2}")
    print(40 * "-")
print("\n=== Sampel Data TEST ===\n")
for sample_no in range(1, 4):
    q1, q2 = test_examples[sample_no - 1]
    print(f"Sample {sample_no}")
    print(f"Q1: {q1}")
    print(f"Q2: {q2}")
    print(40 * "-")


=== Sampel Data TRAIN ===

Sample 1
Q1: What is the best sex youve ever had?
Q2: What's the best sexual experience you have had?
----------------------------------------
Sample 2
Q1: What are some TV shows/ movies with a scene where a female falls, specifically, face-first into something like a cake, a mud puddle, pie, etc.?
Q2: What are some TV shows/movies with a scene where a female falls face-first into something like a cake, a mud puddle, pie, etc.?
----------------------------------------
Sample 3
Q1: How can I become rich in short time?
Q2: How can I become rich man?
----------------------------------------

=== Sampel Data TEST ===

Sample 1
Q1: Who is going to be a better president - Hillary Clinton or Donald Trump?
Q2: In what aspects is Hillary Clinton better than Trump?
----------------------------------------
Sample 2
Q1: What is the best language to learn programming?
Q2: What is the best programming language for beginners to learn?
---------------------------------------

## Konfigurasi dan Persiapan Proses Training Model T5

In [None]:
output_dir = "/content/drive/MyDrive/t5_QQP"

# Create the checkpoint folder if it does not exist yet.
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Overrides for this run, applied on top of the default hyper-parameters.
args_dict.update(
    output_dir=output_dir,
    num_train_epochs=1,  # one more epoch is enough
    max_seq_length=256,
    train_file_name="Train",
    eval_file_name="Test",
)

# Attribute-style view of the settings, as expected by T5FineTuner.
args = argparse.Namespace(**args_dict)

print(args_dict)

{'data_dir': '', 'output_dir': '/content/drive/MyDrive/t5_QQP', 'model_name_or_path': 't5-base', 'tokenizer_name_or_path': 't5-base', 'max_seq_length': 256, 'learning_rate': 0.0003, 'weight_decay': 0.1, 'adam_epsilon': 1e-08, 'warmup_steps': 0, 'train_batch_size': 6, 'eval_batch_size': 6, 'num_train_epochs': 1, 'gradient_accumulation_steps': 16, 'n_gpu': 1, 'fp_16': False, 'opt_level': 'O2', 'max_grad_norm': 1.0, 'seed': 42, 'train_file_name': 'Train', 'eval_file_name': 'Test'}


## Konfigurasi Trainer dan Checkpointing (PyTorch Lightning)

In [None]:
# Checkpoints are written to the output directory and ranked by lowest val_loss.
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    monitor="val_loss",
    mode="min",
    save_top_k=5,
    dirpath=args.output_dir,
    filename="checkpoint",
)

# Keyword arguments for pl.Trainer, derived from the run configuration.
train_params = {
    "accelerator": "gpu",
    "devices": args.n_gpu,
    "max_epochs": args.num_train_epochs,
    "accumulate_grad_batches": args.gradient_accumulation_steps,
    "precision": 16 if args.fp_16 else 32,
    "gradient_clip_val": args.max_grad_norm,
    "callbacks": [checkpoint_callback, LoggingCallback()],
}


## Inisialisasi Model T5 untuk Fine-Tuning

In [None]:
# Instantiate the LightningModule; its constructor loads the pretrained model
# and tokenizer named in args.
model = T5FineTuner(args)

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# ====================================================
# 8. Train the model and save the result
# ====================================================
trainer = pl.Trainer(**train_params)

print("Training model")
trainer.fit(model)
print("Training finished")

print("Saving model to Google Drive...")
# Model weights and tokenizer files go into the same folder.
for artifact in (model.model, model.tokenizer):
    artifact.save_pretrained("/content/drive/MyDrive/t5_paraphrase")
print("Model saved to Google Drive ")


INFO:pytorch_lightning.utilities.rank_zero:Trainer already configured with model summary callbacks: [<class 'pytorch_lightning.callbacks.rich_model_summary.RichModelSummary'>]. Skipping setting a default `ModelSummary` callback.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Training model


INFO:pytorch_lightning.utilities.rank_zero:Loading `train_dataloader` to estimate number of stepping batches.


Output()

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=1` reached.


Training finished
Saving model to Google Drive...
Model saved to Google Drive 


In [None]:
import torch
import re

device = "cuda" if torch.cuda.is_available() else "cpu"

# Put the fine-tuned model on the inference device and switch to eval mode.
model.model.to(device)
model.model.eval()

sentence = "In order to make something we have to work hard."
# Do not append </s> manually here.
text = "paraphrase: " + sentence

encoding = tokenizer(
    text,
    return_tensors="pt",
    truncation=True,
    max_length=128,
    padding="longest"
)

# Define the tensors passed to generate() explicitly and move them to the
# model's device, so the cell runs on a fresh kernel.
input_ids = encoding["input_ids"].to(device)
attention_mask = encoding["attention_mask"].to(device)

# Beam search (no sampling) returning four candidate paraphrases.
with torch.no_grad():
  outputs = model.model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_length=48,
    num_beams=4,
    num_return_sequences=4,
    no_repeat_ngram_size=3,
    repetition_penalty=1.2,
    length_penalty=0.8,
    do_sample=False
)




def clean_sentence(s):
    """Normalise a generated sentence for display.

    Removes surrounding whitespace and quote characters (straight, curly,
    backtick, acute accent), collapses runs of whitespace into single spaces
    and upper-cases the first character.

    Args:
        s: Raw decoded sentence.

    Returns:
        The cleaned sentence; an empty string if nothing is left.
    """
    quote_chars = "\"'“”`´"
    # Alternate whitespace / quote stripping until the string stops changing,
    # so nested quotes and whitespace hidden inside quotes are removed too.
    previous = None
    while s != previous:
        previous = s
        s = s.strip().strip(quote_chars)
    s = re.sub(r'\s+', ' ', s)
    if s:
        s = s[0].upper() + s[1:]
    return s

def canonical_key(s):
    """Return ``s`` lower-cased with every character outside 0-9 / a-z removed.

    The result is used as a comparison key for spotting duplicate sentences.
    """
    lowered = s.lower()
    return re.sub(r'[^0-9a-z]+', '', lowered)

# Key of the input sentence; candidates matching it are not real paraphrases.
orig_norm_key = canonical_key(sentence)

cleaned = []
seen = set()

for out in outputs:
    sent = tokenizer.decode(out, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    # Drop the task prefix if the model echoed it back.
    if sent.lower().startswith("paraphrase:"):
        sent = sent[len("paraphrase:"):].strip()
    sent = clean_sentence(sent)
    key = canonical_key(sent)
    # Skip candidates that are too short, equal to the input, or already kept.
    if len(sent) < 3 or key == orig_norm_key or key in seen:
        continue
    seen.add(key)
    cleaned.append(sent)

print("Original Sentence:")
print(sentence)
print("\nParaphrased Results:")
if cleaned:
    for i, s in enumerate(cleaned, 1):
        print(f"{i}. {s}")
else:
    print("  (Tidak ada paraphrase unik selain original.)")


Original Sentence:
In order to make something we have to work hard.

Paraphrased Results:
1. To make something, we have to work hard.


In [None]:
!zip -r /content/drive/MyDrive/t5_paraphrase.zip /content/drive/MyDrive/t5_paraphrase

  adding: content/drive/MyDrive/t5_paraphrase/ (stored 0%)
  adding: content/drive/MyDrive/t5_paraphrase/config.json (deflated 63%)
  adding: content/drive/MyDrive/t5_paraphrase/generation_config.json (deflated 27%)
  adding: content/drive/MyDrive/t5_paraphrase/model.safetensors (deflated 8%)
  adding: content/drive/MyDrive/t5_paraphrase/tokenizer_config.json (deflated 94%)
  adding: content/drive/MyDrive/t5_paraphrase/special_tokens_map.json (deflated 85%)
  adding: content/drive/MyDrive/t5_paraphrase/added_tokens.json (deflated 83%)
  adding: content/drive/MyDrive/t5_paraphrase/spiece.model (deflated 48%)
