# Path to data

In [1]:
# Root folder of the Kaggle Flickr30k dataset.
data_dir = '../input/flickr-image-dataset/flickr30k_images'
# Fixed typo: the nested image folder was spelled 'flick30k_images' (missing "r"),
# which would make every image path built from image_dir point at a missing folder.
# NOTE(review): confirm the nested folder name against the mounted dataset.
image_dir = f'{data_dir}/flickr30k_images'
# Pipe-separated captions file that sits next to the image folder.
csv_file = f'{data_dir}/results.csv'

# Import the required libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Look through and preprocess the data

In [3]:
# The captions file is pipe-separated, so the separator has to be given explicitly.
df = pd.read_csv(csv_file, sep="|")
df.head()

Unnamed: 0,image_name,comment_number,comment
0,1000092795.jpg,0,Two young guys with shaggy hair look at their...
1,1000092795.jpg,1,"Two young , White males are outside near many..."
2,1000092795.jpg,2,Two men in green shirts are standing in a yard .
3,1000092795.jpg,3,A man in a blue shirt standing in a garden .
4,1000092795.jpg,4,Two friends enjoy time spent together .


## Check null values in the dataset

In [4]:
df.isnull().sum()

image_name         0
 comment_number    0
 comment           1
dtype: int64

In [5]:
# Select the rows whose caption text is missing. The column name still carries
# its leading space at this point (it is renamed in a later cell).
missing_caption = df[' comment'].isna()
null_comments = df.loc[missing_caption]

print(null_comments)

           image_name                      comment_number  comment
19999  2199200615.jpg   4   A dog runs across the grass .      NaN


### Rename columns in the dataset

In [6]:
df.columns

Index(['image_name', ' comment_number', ' comment'], dtype='object')

In [7]:
df = df.rename(columns={' comment_number':'comment_number', ' comment':'comment'})

In [8]:
df['comment'][0]

' Two young guys with shaggy hair look at their hands while hanging out in the yard .'

In [9]:
df.shape

(158915, 3)

In [10]:
df["image_name"].nunique()

31783

In [11]:
list_comment_number = df["comment_number"].values.tolist()

### Check the error row in the dataset

In [12]:
# Labels a caption index may take once surrounding whitespace is stripped.
valid_number = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10']
# enumerate() yields each row's own position. list.index(num) would report only
# the FIRST occurrence of a repeated bad value and rescans the list on every hit.
for row_idx, num in enumerate(list_comment_number):
    # str() keeps the scan from crashing on non-string cells (e.g. NaN).
    if str(num).strip() not in valid_number:
        print('Invalid number: ',num)
        print('Index of it: ', row_idx)

Invalid number:   4   A dog runs across the grass .
Index of it:  19999


In [13]:
# Row 19999 was parsed with the caption text fused into `comment_number`
# (leaving `comment` empty), so both fields are restored by hand.
df.loc[19999, 'comment_number'] = ' 4'
df.loc[19999, 'comment'] = 'A dog runs across the grass .'

# Turn bare file names into paths under image_dir. Only rows that are not yet
# prefixed are touched, so re-running this cell does not prepend the folder twice.
needs_prefix = ~df['image_name'].str.startswith(image_dir + '/')
df.loc[needs_prefix, 'image_name'] = image_dir + '/' + df.loc[needs_prefix, 'image_name']
df.head()

Unnamed: 0,image_name,comment_number,comment
0,../input/flickr-image-dataset/flickr30k_images...,0,Two young guys with shaggy hair look at their...
1,../input/flickr-image-dataset/flickr30k_images...,1,"Two young , White males are outside near many..."
2,../input/flickr-image-dataset/flickr30k_images...,2,Two men in green shirts are standing in a yard .
3,../input/flickr-image-dataset/flickr30k_images...,3,A man in a blue shirt standing in a garden .
4,../input/flickr-image-dataset/flickr30k_images...,4,Two friends enjoy time spent together .


### Each image has 5 captions, so we need to transform each caption into its own column

In [14]:
df['comment_number'].nunique()

5

In [15]:
print(df['comment_number'].unique())

[' 0' ' 1' ' 2' ' 3' ' 4']


In [16]:
# Reshape from one row per (image, caption) to one row per image with five
# caption columns. The label of caption k is read from row k of
# `comment_number`, which relies on the first five rows holding the labels
# 0-4 in order (as df.head() shows).
caption_labels = [df['comment_number'][k] for k in range(5)]

# One image name per image: take it from the rows of the first caption label.
image_name = {
    'image_name': df.loc[df['comment_number'] == caption_labels[0], 'image_name'].values,
}
# comment_0 .. comment_4: the caption texts belonging to each label, in file order.
comments = {
    f'comment_{k}': df.loc[df['comment_number'] == label, 'comment'].values
    for k, label in enumerate(caption_labels)
}

image_name_df = pd.DataFrame(image_name)
comments_df = pd.DataFrame(comments)

# Place the image names and the five caption columns side by side.
df = pd.concat([image_name_df, comments_df], axis=1)
df.head(5)

Unnamed: 0,image_name,comment_0,comment_1,comment_2,comment_3,comment_4
0,../input/flickr-image-dataset/flickr30k_images...,Two young guys with shaggy hair look at their...,"Two young , White males are outside near many...",Two men in green shirts are standing in a yard .,A man in a blue shirt standing in a garden .,Two friends enjoy time spent together .
1,../input/flickr-image-dataset/flickr30k_images...,Several men in hard hats are operating a gian...,Workers look down from up above on a piece of...,Two men working on a machine wearing hard hats .,Four men on top of a tall structure .,Three men on a large rig .
2,../input/flickr-image-dataset/flickr30k_images...,A child in a pink dress is climbing up a set ...,A little girl in a pink dress going into a wo...,A little girl climbing the stairs to her play...,A little girl climbing into a wooden playhouse,A girl going into a wooden building .
3,../input/flickr-image-dataset/flickr30k_images...,Someone in a blue shirt and hat is standing o...,A man in a blue shirt is standing on a ladder...,A man on a ladder cleans the window of a tall...,man in blue shirt and jeans on ladder cleanin...,a man on a ladder cleans a window
4,../input/flickr-image-dataset/flickr30k_images...,"Two men , one in a gray shirt , one in a blac...",Two guy cooking and joking around with the ca...,Two men in a kitchen cooking food on a stove .,Two men are at the stove preparing food .,Two men are cooking a meal .


In [17]:
# Save the edited dataset to a CSV file
# df.to_csv("flickr30k.csv", index=False)
# print("Exported to csv file!")

# Using CTranslate2 with the opus-mt-en-vi model to translate from English to Vietnamese

In [None]:
pip install ctranslate2 transformers sentencepiece

In [None]:
from ctranslate2 import converters

# Define the model name from Hugging Face
model_name = "Helsinki-NLP/opus-mt-en-vi"

# Define the path where the converted model will be saved
output_path = "en_vi_ctranslate2"

# Convert the model.
# force=True lets the conversion overwrite an output directory left over from a
# previous run; without it, re-running this cell fails because the directory
# already exists.
converter = converters.TransformersConverter(model_name)
converter.convert(output_path, force=True)

print(f"Model converted successfully and saved to: {output_path}")

# Steps to translate
1. Tokenizing the source text.

2. Translating the tokens with CTranslate2.

3. Decoding the resulting tokens back into text.

In [None]:
#Check number of gpu
import torch
print(torch.cuda.device_count())

In [None]:
import transformers
import ctranslate2

In [None]:
!pip install sacremoses

In [None]:
# Load the model and tokenizer
import torch  # local import so this cell does not depend on an earlier cell having run

model_path = "en_vi_ctranslate2"

# Use every visible GPU instead of the hard-coded indices [0, 1], which fail on
# machines with fewer than two GPUs; fall back to the CPU when there is none.
gpu_count = torch.cuda.device_count()
device = "cuda" if gpu_count > 0 else "cpu"
device_index = list(range(gpu_count)) if gpu_count > 0 else 0

# Load the CTranslate2 translator
translator = ctranslate2.Translator(model_path, device=device, device_index=device_index)

# Load the tokenizer
tokenizer = transformers.AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-vi")

def translate_with_ct2(text):
    """Translate one English sentence with the CTranslate2 model.

    Uses the module-level `tokenizer` and `translator`. The text is encoded to
    token strings, translated as a batch of one, and the best hypothesis is
    decoded back to text.

    Returns the translated string. Any exception raised along the way is
    caught and returned as the string "Error: <message>" instead of being
    propagated.
    """
    try:
        # Text -> ids -> token strings, the input form translate_batch is given.
        pieces = tokenizer.convert_ids_to_tokens(tokenizer.encode(text))

        # Single-sentence batch; keep the first hypothesis of the only result.
        best_hypothesis = translator.translate_batch([pieces])[0].hypotheses[0]

        # Token strings -> ids -> text.
        return tokenizer.decode(tokenizer.convert_tokens_to_ids(best_hypothesis))
    except Exception as e:
        return f"Error: {e}"

# Small demo on the first 10 rows (image name + first caption).
# .copy() makes df_new an independent frame, so adding a column below does not
# raise SettingWithCopyWarning or depend on whether the slice is a view of df.
df_new = df.iloc[:10, :2].copy()
df_new['vi_comment_0'] = df_new['comment_0'].apply(translate_with_ct2)

df_new

In [None]:
df_new.iloc[0, 1]

In [None]:
df_new.iloc[0, 2]

In [None]:
df_new.iloc[1, 1]

In [None]:
df_new.iloc[1, 2]

In [None]:
df_new.iloc[2, 1]

In [None]:
df_new.iloc[2, 2]

In [None]:
df_new.iloc[3, 1]

In [None]:
df_new.iloc[3, 2]

In [None]:
df_new.iloc[4, 1]

In [None]:
df_new.iloc[4, 2]

In [None]:
import pandas as pd
from tqdm import tqdm

# Load the model and tokenizer
import torch  # local import so this cell does not depend on an earlier cell having run

model_path = "en_vi_ctranslate2"

# Use every visible GPU instead of the hard-coded indices [0, 1], which fail on
# machines with fewer than two GPUs; fall back to the CPU when there is none.
gpu_count = torch.cuda.device_count()
device = "cuda" if gpu_count > 0 else "cpu"
device_index = list(range(gpu_count)) if gpu_count > 0 else 0

# Load the CTranslate2 translator
translator = ctranslate2.Translator(model_path, device=device, device_index=device_index)

# Load the tokenizer
tokenizer = transformers.AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-vi")

def batch_translate(texts, batch_size=64, desc="Translating"):
    """Translate a list of English sentences with CTranslate2, batch by batch.

    Uses the module-level `tokenizer` and `translator`.

    Args:
        texts: list of source sentences.
        batch_size: number of sentences sent to the translator per call.
        desc: label shown on the tqdm progress bar.

    Returns:
        A list with one translated string per input sentence, in input order.
    """
    translations = []
    for i in tqdm(range(0, len(texts), batch_size), desc=desc):
        batch_texts = texts[i:i+batch_size]

        # Tokenize each sentence into the token strings translate_batch is given
        batch_tokens = [
            tokenizer.convert_ids_to_tokens(tokenizer.encode(t))
            for t in batch_texts
        ]

        # Translate with CT2
        results = translator.translate_batch(batch_tokens)

        # Decode the best hypothesis of each result.
        # Fixed: the loop previously iterated over the undefined name `resultsn`,
        # which raised NameError on the first batch.
        batch_translations = [
            tokenizer.decode(tokenizer.convert_tokens_to_ids(r.hypotheses[0]))
            for r in results
        ]

        translations.extend(batch_translations)
    return translations

# Translate the listed caption column(s) in batches of 64 and store each
# result next to the original as vi_<column>.
columns_to_translate = ['comment_0']
for column in columns_to_translate:
    df[f"vi_{column}"] = batch_translate(
        df[column].tolist(),
        batch_size=64,
        desc=f"Translating {column}",
    )

df.head()

In [None]:
# Translate the listed caption column(s) in batches of 64 and store each
# result next to the original as vi_<column>.
columns_to_translate = ['comment_1']
for column in columns_to_translate:
    df[f"vi_{column}"] = batch_translate(
        df[column].tolist(),
        batch_size=64,
        desc=f"Translating {column}",
    )

df.head()

In [None]:
# Translate the listed caption column(s) in batches of 64 and store each
# result next to the original as vi_<column>.
columns_to_translate = ['comment_2']
for column in columns_to_translate:
    df[f"vi_{column}"] = batch_translate(
        df[column].tolist(),
        batch_size=64,
        desc=f"Translating {column}",
    )

df.head()

In [None]:
# Translate the listed caption column(s) in batches of 64 and store each
# result next to the original as vi_<column>.
columns_to_translate = ['comment_3']
for column in columns_to_translate:
    df[f"vi_{column}"] = batch_translate(
        df[column].tolist(),
        batch_size=64,
        desc=f"Translating {column}",
    )

df.head()

In [None]:
# Translate the listed caption column(s) in batches of 64 and store each
# result next to the original as vi_<column>.
columns_to_translate = ['comment_4']
for column in columns_to_translate:
    df[f"vi_{column}"] = batch_translate(
        df[column].tolist(),
        batch_size=64,
        desc=f"Translating {column}",
    )

df.head()

In [None]:
# Save the edited dataset to a CSV file
df.to_csv("flickr30k_translated.csv", index=False)
print("Exported to csv file!")

In [None]:
df.iloc[0, 6]

In [None]:
df.iloc[1, 6]

In [None]:
df.iloc[2, 6]

# Use Vinai-translate-en2vi-v2 to translate from English to Vietnamese

In [18]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer_en2vi = AutoTokenizer.from_pretrained("vinai/vinai-translate-en2vi-v2", src_lang="en_XX")
model_en2vi = AutoModelForSeq2SeqLM.from_pretrained("vinai/vinai-translate-en2vi-v2")
device_en2vi = torch.device("cuda")
model_en2vi.to(device_en2vi)


def translate_en2vi(en_texts: "list[str]") -> "list[str]":
    """Translate a batch of English sentences to Vietnamese.

    Uses the module-level `tokenizer_en2vi`, `model_en2vi` and `device_en2vi`.
    The annotations previously read `str -> str`, but the function is called
    with a list of sentences and returns the list produced by `batch_decode`.

    Args:
        en_texts: English sentences; they are padded to a common length and
            translated together as one batch.

    Returns:
        The decoded Vietnamese texts, special tokens removed.
    """
    # Tokenize the whole batch at once and move the tensors to the model's device.
    input_ids = tokenizer_en2vi(en_texts, padding=True, return_tensors="pt").to(device_en2vi)
    output_ids = model_en2vi.generate(
        **input_ids,
        # Start decoding with the Vietnamese language code.
        decoder_start_token_id=tokenizer_en2vi.lang_code_to_id["vi_VN"],
        num_return_sequences=1,
        num_beams=5,
        early_stopping=True
    )
    vi_texts = tokenizer_en2vi.batch_decode(output_ids, skip_special_tokens=True)
    return vi_texts

en_texts = ["I haven't been to a public gym before.",
            "When I exercise in a private space, I feel more comfortable.",
            "i haven't been to a public gym before when i exercise in a private space i feel more comfortable"]
print(translate_en2vi(en_texts))

config.json: 0.00B [00:00, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/1.41M [00:00<?, ?B/s]

2025-10-01 13:15:11.634768: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759324511.973363      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759324512.078480      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/1.69G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.69G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

['Tôi chưa từng đến phòng tập công cộng trước đây.', 'Khi tôi tập thể dục trong không gian riêng tư, tôi cảm thấy thoải mái hơn.', 'Tôi chưa từng đến phòng tập thể dục công cộng trước đây. Khi tôi tập thể dục trong một không gian riêng tư, tôi cảm thấy thoải mái hơn']


In [19]:
import pandas as pd
from tqdm import tqdm

# (Re)load the VinAI English->Vietnamese tokenizer and model.
VINAI_CHECKPOINT = "vinai/vinai-translate-en2vi-v2"
tokenizer_en2vi = AutoTokenizer.from_pretrained(VINAI_CHECKPOINT, src_lang="en_XX")
model_en2vi = AutoModelForSeq2SeqLM.from_pretrained(VINAI_CHECKPOINT)

# Use both GPUs: with more than one visible GPU the model is wrapped in
# DataParallel, so the underlying model is then reached through `.module`.
device = torch.device("cuda")
gpu_total = torch.cuda.device_count()
if gpu_total > 1:
    print("Using", gpu_total, "GPUs!")
    model_en2vi = torch.nn.DataParallel(model_en2vi)

model_en2vi = model_en2vi.to(device)

def batch_translate_vinai(texts, batch_size=16, desc="Translating"):
    """Translate a list of English sentences to Vietnamese, batch by batch.

    Uses the module-level `tokenizer_en2vi` and `model_en2vi`.

    Args:
        texts: list of source sentences.
        batch_size: number of sentences tokenized and generated per step.
        desc: label shown on the tqdm progress bar.

    Returns:
        A list with one translated string per input sentence, in input order.
    """
    # The model is wrapped in DataParallel only when more than one GPU is
    # present; unwrap conditionally so a single-GPU run does not fail on a
    # missing `.module` attribute.
    # NOTE(review): calling generate() on the unwrapped module bypasses the
    # DataParallel wrapper, so generation presumably runs on one GPU only.
    if isinstance(model_en2vi, torch.nn.DataParallel):
        generator = model_en2vi.module
    else:
        generator = model_en2vi

    # Send inputs to wherever the model's weights live, rather than relying on
    # `device_en2vi` having been defined by an earlier cell.
    target_device = next(generator.parameters()).device

    # Loop-invariant: the decoder start token for Vietnamese output.
    vi_lang_id = tokenizer_en2vi.lang_code_to_id["vi_VN"]

    translations = []
    for i in tqdm(range(0, len(texts), batch_size), desc=desc):
        batch_texts = texts[i:i+batch_size]

        # Tokenize the batch, padded to a common length
        input_ids = tokenizer_en2vi(
            batch_texts, padding=True, truncation=True, return_tensors="pt"
        ).to(target_device)

        # Translate with vinai-en2vi-v2
        output_ids = generator.generate(
            **input_ids,
            decoder_start_token_id=vi_lang_id,
            num_return_sequences=1,
            num_beams=5,
            early_stopping=True
        )

        # Decode result
        batch_translations = tokenizer_en2vi.batch_decode(output_ids, skip_special_tokens=True)

        translations.extend(batch_translations)
    return translations

# # Split all dataset into batchn
# columns_to_translate = ['comment_1']
# for column in columns_to_translate:
#     all_texts = df[column].astype(str).tolist()
#     translated_texts = batch_translate_vinai(all_texts, batch_size=16, desc=f"Translating {column}")
#     new_col = f"vinai_{column}"
#     df[new_col] = translated_texts

# df.head()

Using 2 GPUs!


In [None]:
# Split all dataset into batchn
# columns_to_translate = ['comment_1']
# for column in columns_to_translate:
#     all_texts = df[column].astype(str).tolist()
#     translated_texts = batch_translate_vinai(all_texts, batch_size=16, desc=f"Translating {column}")
#     new_col = f"vinai_{column}"
#     df[new_col] = translated_texts

# df.head()

In [None]:
# Translate the listed caption column(s) with the VinAI model in batches of 16
# and store each result next to the original as vinai_<column>.
columns_to_translate = ['comment_2']
for column in columns_to_translate:
    df[f"vinai_{column}"] = batch_translate_vinai(
        df[column].astype(str).tolist(),
        batch_size=16,
        desc=f"Translating {column}",
    )

df.head()

In [None]:
df.to_csv("flickr30k_vinai_translated_02.csv", index=False)
print("Exported to csv file!")

In [None]:
# Translate the listed caption column(s) with the VinAI model in batches of 16
# and store each result next to the original as vinai_<column>.
columns_to_translate = ['comment_3']
for column in columns_to_translate:
    df[f"vinai_{column}"] = batch_translate_vinai(
        df[column].astype(str).tolist(),
        batch_size=16,
        desc=f"Translating {column}",
    )

df.head()

In [None]:
df.to_csv("flickr30k_vinai_translated_03.csv", index=False)
print("Exported to csv file!")

In [20]:
# Translate the listed caption column(s) with the VinAI model in batches of 16
# and store each result next to the original as vinai_<column>.
columns_to_translate = ['comment_4']
for column in columns_to_translate:
    df[f"vinai_{column}"] = batch_translate_vinai(
        df[column].astype(str).tolist(),
        batch_size=16,
        desc=f"Translating {column}",
    )

df.head()

Translating comment_4:   0%|          | 0/1987 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Translating comment_4: 100%|██████████| 1987/1987 [29:55<00:00,  1.11it/s]


Unnamed: 0,image_name,comment_0,comment_1,comment_2,comment_3,comment_4,vinai_comment_4
0,../input/flickr-image-dataset/flickr30k_images...,Two young guys with shaggy hair look at their...,"Two young , White males are outside near many...",Two men in green shirts are standing in a yard .,A man in a blue shirt standing in a garden .,Two friends enjoy time spent together .,Hai người bạn tận hưởng thời gian dành cho nhau.
1,../input/flickr-image-dataset/flickr30k_images...,Several men in hard hats are operating a gian...,Workers look down from up above on a piece of...,Two men working on a machine wearing hard hats .,Four men on top of a tall structure .,Three men on a large rig .,Ba người trên một giàn khoan lớn.
2,../input/flickr-image-dataset/flickr30k_images...,A child in a pink dress is climbing up a set ...,A little girl in a pink dress going into a wo...,A little girl climbing the stairs to her play...,A little girl climbing into a wooden playhouse,A girl going into a wooden building .,Một cô gái đi vào một tòa nhà bằng gỗ .
3,../input/flickr-image-dataset/flickr30k_images...,Someone in a blue shirt and hat is standing o...,A man in a blue shirt is standing on a ladder...,A man on a ladder cleans the window of a tall...,man in blue shirt and jeans on ladder cleanin...,a man on a ladder cleans a window,Một người đàn ông trên thang lau cửa sổ.
4,../input/flickr-image-dataset/flickr30k_images...,"Two men , one in a gray shirt , one in a blac...",Two guy cooking and joking around with the ca...,Two men in a kitchen cooking food on a stove .,Two men are at the stove preparing food .,Two men are cooking a meal .,Hai người đàn ông đang nấu ăn.


In [21]:
df.to_csv("flickr30k_vinai_translated_04.csv", index=False)
print("Exported to csv file!")

Exported to csv file!
