In [7]:
import torch
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration
from tqdm import tqdm
import os

# Translate the TEXT_COLUMN of INPUT_CSV with the checkpoint at MODEL_PATH and
# write the generated sentences to OUTPUT_CSV (one output row per input row).

# --- Configuration -----------------------------------------------------------
MODEL_PATH = os.path.abspath("./vit5_finetune_en_to_vi")  # passed to from_pretrained()
INPUT_CSV = "./private test/en.csv"
OUTPUT_CSV = "translated.csv"
TEXT_COLUMN = "English"    # column of INPUT_CSV that holds the source sentences
TASK_PREFIX = "en-vi: "    # prepended to every sentence before tokenization
BATCH_SIZE = 16            # sentences per generate() call
MAX_LENGTH = 128           # token cap for the encoded input and for the generated output

# --- Model -------------------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

tokenizer = T5Tokenizer.from_pretrained(MODEL_PATH)
model = T5ForConditionalGeneration.from_pretrained(MODEL_PATH).to(device)
model.eval()

# --- Input -------------------------------------------------------------------
df = pd.read_csv(INPUT_CSV)
if TEXT_COLUMN not in df.columns:
    raise ValueError(f"Không tìm thấy cột '{TEXT_COLUMN}' trong file {INPUT_CSV}")

# Empty CSV cells are read as NaN; without fillna() they would be stringified
# to the literal text "nan" and translated as if it were a real sentence.
sentences = df[TEXT_COLUMN].fillna("").astype(str).tolist()
print(f"Loaded {len(sentences)} sentences from {INPUT_CSV}")

# --- Translation -------------------------------------------------------------
# Pre-fill with empty strings so blank source rows keep an (empty) output row
# and the output stays aligned with the input, then translate only the rows
# that actually contain text.
translations = [""] * len(sentences)
pending = [idx for idx, sentence in enumerate(sentences) if sentence.strip()]

for start in tqdm(range(0, len(pending), BATCH_SIZE), desc="Translating"):
    batch_indices = pending[start:start + BATCH_SIZE]
    batch_inputs = [f"{TASK_PREFIX}{sentences[idx]}" for idx in batch_indices]

    inputs = tokenizer(
        batch_inputs,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=MAX_LENGTH
    ).to(device)

    # Inference only: no gradients are needed for generation.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=MAX_LENGTH,
            num_beams=4,
            early_stopping=True
        )

    batch_translations = [tokenizer.decode(g, skip_special_tokens=True) for g in outputs]
    # Write each result back to the row it came from.
    for idx, translated in zip(batch_indices, batch_translations):
        translations[idx] = translated

# --- Output ------------------------------------------------------------------
out_df = pd.DataFrame({"Vietnamese": translations})
out_df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8-sig")


Using device: cuda
Loaded 1000 sentences from ./private test/en.csv


Translating:  10%|▉         | 6/63 [00:54<08:41,  9.15s/it]


KeyboardInterrupt: 

In [None]:
# Translate the TEXT_COLUMN of INPUT_CSV with the checkpoint at MODEL_PATH and
# write the generated sentences to OUTPUT_CSV (one output row per input row).

# --- Configuration -----------------------------------------------------------
MODEL_PATH = os.path.abspath("./vit5_finetune_vi_to_en")  # passed to from_pretrained()
INPUT_CSV = "./private test/vi.csv"
OUTPUT_CSV = "translated2.csv"
TEXT_COLUMN = "Vietnamese"  # column of INPUT_CSV that holds the source sentences
# NOTE(review): this prefix is the same "en-vi: " string used with the
# en_to_vi checkpoint, although this cell feeds Vietnamese text to a checkpoint
# named vi_to_en. It looks like a copy-paste leftover. Confirm it matches the
# prefix this checkpoint was fine-tuned with and change it here if it does not.
TASK_PREFIX = "en-vi: "     # prepended to every sentence before tokenization
BATCH_SIZE = 16             # sentences per generate() call
MAX_LENGTH = 128            # token cap for the encoded input and for the generated output

# --- Model -------------------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

tokenizer = T5Tokenizer.from_pretrained(MODEL_PATH)
model = T5ForConditionalGeneration.from_pretrained(MODEL_PATH).to(device)
model.eval()

# --- Input -------------------------------------------------------------------
df = pd.read_csv(INPUT_CSV)
if TEXT_COLUMN not in df.columns:
    raise ValueError(f"Không tìm thấy cột '{TEXT_COLUMN}' trong file {INPUT_CSV}")

# Empty CSV cells are read as NaN; without fillna() they would be stringified
# to the literal text "nan" and translated as if it were a real sentence.
sentences = df[TEXT_COLUMN].fillna("").astype(str).tolist()
print(f"Loaded {len(sentences)} sentences from {INPUT_CSV}")

# --- Translation -------------------------------------------------------------
# Pre-fill with empty strings so blank source rows keep an (empty) output row
# and the output stays aligned with the input, then translate only the rows
# that actually contain text.
translations = [""] * len(sentences)
pending = [idx for idx, sentence in enumerate(sentences) if sentence.strip()]

for start in tqdm(range(0, len(pending), BATCH_SIZE), desc="Translating"):
    batch_indices = pending[start:start + BATCH_SIZE]
    batch_inputs = [f"{TASK_PREFIX}{sentences[idx]}" for idx in batch_indices]

    inputs = tokenizer(
        batch_inputs,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=MAX_LENGTH
    ).to(device)

    # Inference only: no gradients are needed for generation.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=MAX_LENGTH,
            num_beams=4,
            early_stopping=True
        )

    batch_translations = [tokenizer.decode(g, skip_special_tokens=True) for g in outputs]
    # Write each result back to the row it came from.
    for idx, translated in zip(batch_indices, batch_translations):
        translations[idx] = translated

# --- Output ------------------------------------------------------------------
out_df = pd.DataFrame({"English": translations})
out_df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8-sig")

Using device: cuda
Loaded 1000 sentences from ./private test/vi.csv


Translating:  27%|██▋       | 17/63 [00:15<00:41,  1.12it/s]


KeyboardInterrupt: 