In [5]:
!pip install transformers datasets torch



In [6]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer, AdamW
import pandas as pd


In [7]:
model_name = "pszemraj/flan-t5-large-grammar-synthesis"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/892 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.56k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [20]:
dataset_path=("/kaggle/input/clean-dataset/cleaned_data.csv")
import csv

with open('/kaggle/input/clean-dataset/cleaned_data.csv', 'r', newline='', encoding='utf-8') as file_input, open('output.csv', 'w', newline='', encoding='utf-8') as file_output:
    reader = csv.DictReader(file_input)
    writer = csv.writer(file_output)

    # Tulis header baru
    writer.writerow(['incorrect', 'correct'])

    # Proses setiap baris
    for row in reader:
        writer.writerow([row['incorrect'], row['correct']])


In [27]:
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, input_max_len, target_max_len):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.input_max_len = input_max_len
        self.target_max_len = target_max_len

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, index):
        source_text = self.dataframe.loc[index, 'incorrect']
        target_text = self.dataframe.loc[index, 'correct']
        
        source = self.tokenizer(source_text, max_length=self.input_max_len, padding='max_length', truncation=True, return_tensors="pt")
        target = self.tokenizer(target_text, max_length=self.target_max_len, padding='max_length', truncation=True, return_tensors="pt")
        
        source_ids = source['input_ids'].squeeze()
        target_ids = target['input_ids'].squeeze()
        
        return {'input_ids': source_ids, 'attention_mask': source['attention_mask'].squeeze(), 'labels': target_ids}

# Contoh penggunaan:
df = pd.read_csv("/kaggle/input/berish/Data_bersih.csv", nrows=1000)
df.columns = ["incorrect", "correct"]
dataset = CustomDataset(df, tokenizer, input_max_len=128, target_max_len=128)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)


In [28]:
optimizer = AdamW(model.parameters(), lr=5e-5)



In [29]:
def train(model, dataloader, optimizer, device, epochs=3):
    model.train()
    model.to(device)
    for epoch in range(epochs):
        for batch_idx, batch in enumerate(dataloader):
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            
            if batch_idx % 10 == 0:  # Print every 10 batches
                print(f"Epoch: {epoch+1}, Batch: {batch_idx}, Loss: {loss.item()}")

device = torch.device("cpu")
train(model, dataloader, optimizer, device)

Epoch: 1, Batch: 0, Loss: 0.08688411116600037
Epoch: 1, Batch: 10, Loss: 0.04007495567202568
Epoch: 1, Batch: 20, Loss: 0.23775027692317963
Epoch: 1, Batch: 30, Loss: 0.09954997152090073
Epoch: 1, Batch: 40, Loss: 0.08651918172836304
Epoch: 1, Batch: 50, Loss: 0.07697045803070068
Epoch: 1, Batch: 60, Loss: 0.057699255645275116
Epoch: 1, Batch: 70, Loss: 0.11596230417490005
Epoch: 1, Batch: 80, Loss: 0.0970078855752945
Epoch: 1, Batch: 90, Loss: 0.09645769000053406
Epoch: 1, Batch: 100, Loss: 0.04329666867852211
Epoch: 1, Batch: 110, Loss: 0.03500140830874443
Epoch: 1, Batch: 120, Loss: 0.1166069358587265
Epoch: 1, Batch: 130, Loss: 0.022157881408929825
Epoch: 1, Batch: 140, Loss: 0.1796889454126358
Epoch: 1, Batch: 150, Loss: 0.051213495433330536
Epoch: 1, Batch: 160, Loss: 0.18746241927146912
Epoch: 1, Batch: 170, Loss: 0.09404228627681732
Epoch: 1, Batch: 180, Loss: 0.09798043966293335
Epoch: 1, Batch: 190, Loss: 0.15387089550495148
Epoch: 1, Batch: 200, Loss: 0.03999416530132294
Epo

In [32]:
model.save_pretrained('model')
tokenizer.save_pretrained('tokenizer')

Non-default generation parameters: {'max_length': 512, 'min_length': 8, 'num_beams': 2, 'no_repeat_ngram_size': 4}


('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/spiece.model',
 'tokenizer/added_tokens.json')