In [11]:
# Cell 1: Import và Setup
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from transformers import DataCollatorForSeq2Seq
import numpy as np
from sklearn.metrics import accuracy_score
import evaluate
from datasets import Dataset as HFDataset

class GECDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        source = row['misspelled']
        target = row['original']
        
        source_encoding = self.tokenizer(
            source, 
            max_length=self.max_length, 
            padding='max_length', 
            truncation=True, 
            return_tensors='pt'
        )
        
        target_encoding = self.tokenizer(
            target, 
            max_length=self.max_length, 
            padding='max_length', 
            truncation=True, 
            return_tensors='pt'
        )
        
        return {
            'input_ids': source_encoding['input_ids'].flatten(),
            'attention_mask': source_encoding['attention_mask'].flatten(),
            'labels': target_encoding['input_ids'].flatten()
        }

model_name = "VietAI/vit5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


In [17]:
# Cell 2: Load và Preprocess Data
df = pd.read_csv('/kaggle/input/cccxsass/vi_misspellings_diverse.csv')
df_filtered = df[df['error_type'] != -1].reset_index(drop=True)
df_filtered = df_filtered.sample(n=50000, random_state=42).reset_index(drop=True)

train_size = int(0.8 * len(df_filtered))
val_size = int(0.1 * len(df_filtered))

train_data = df_filtered[:train_size]
val_data = df_filtered[train_size:train_size + val_size]
test_data = df_filtered[train_size + val_size:]

train_dataset = GECDataset(train_data, tokenizer)
val_dataset = GECDataset(val_data, tokenizer)

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

print(f"Training: {len(train_data)}, Validation: {len(val_data)}, Test: {len(test_data)}")

Training: 40000, Validation: 5000, Test: 5000


In [None]:
# Cell 3: Training
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    eval_strategy="steps",
    eval_steps=500,
    save_steps=1000,
    save_total_limit=2,
    load_best_model_at_end=True,
    dataloader_num_workers=4,
    gradient_accumulation_steps=2,
    fp16=True,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

trainer.train()

model.save_pretrained('./fine_tuned_vit5_gec')
tokenizer.save_pretrained('./fine_tuned_vit5_gec')


  trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avo

Step,Training Loss,Validation Loss
500,0.0358,0.031404
1000,0.0309,0.02433
1500,0.0194,0.024125
2000,0.0187,0.022667
2500,0.0171,0.021854
3000,0.0108,0.022298
3500,0.0104,0.022875
4000,0.0068,0.024304
4500,0.0068,0.023965
5000,0.0069,0.02275


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [None]:
# Cell 4: Evaluation
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
model.eval()
predictions = []
references = []

for _, row in test_data.iterrows():
    input_text = row['misspelled']
    target_text = row['original']
    
    inputs = tokenizer(input_text, return_tensors='pt', max_length=128, truncation=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_length=128,
            num_beams=4,
            early_stopping=True
        )
    
    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
    predictions.append(prediction)
    references.append(target_text)

bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")

bleu_score = bleu.compute(predictions=predictions, references=[[ref] for ref in references])
rouge_score = rouge.compute(predictions=predictions, references=references)

exact_match = sum([pred.strip() == ref.strip() for pred, ref in zip(predictions, references)]) / len(predictions)

for pred, ref in zip(predictions, references):
    if len(pred) < len(ref):
        pred, ref = ref, pred
    
    if len(ref) == 0:
        continue
    
    previous_row = list(range(len(ref) + 1))
    for i, c1 in enumerate(pred):
        current_row = [i + 1]
        for j, c2 in enumerate(ref):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row
    

print("Kết quả đánh giá:")
print(f"BLEU Score: {bleu_score['bleu']:.4f}")
print(f"ROUGE-1: {rouge_score['rouge1']:.4f}")
print(f"ROUGE-2: {rouge_score['rouge2']:.4f}")
print(f"ROUGE-L: {rouge_score['rougeL']:.4f}")
print(f"Exact Match: {exact_match:.4f}")


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Kết quả đánh giá:
BLEU Score: 0.9440
ROUGE-1: 0.9794
ROUGE-2: 0.9633
ROUGE-L: 0.9764
Exact Match: 0.865