# Install Environment

In [None]:
# Install the notebook's dependencies: PEFT (from its GitHub repository),
# transformers, bitsandbytes and datasets, plus tqdm for progress bars.
# NOTE(review): no versions are pinned, so a fresh run may install different
# releases than the ones that produced the saved outputs.

!pip install -q git+https://github.com/huggingface/peft.git transformers bitsandbytes datasets
!pip install tqdm

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m388.9/388.9 kB[0m [31m33.8 

# Load Datasets Ipu24

In [None]:
# Download the thai-language-image-captioning archive and extract it.
# NOTE(review): the unzip path assumes wget saved the archive under /content
# (the Colab working directory) — confirm when running elsewhere.

!wget https://storage.googleapis.com/ss4-exp-datasource/imageCaption/thai-language-image-captioning.zip
!unzip /content/thai-language-image-captioning.zip

## Read Datasets

In [None]:
# Training annotations: two travel spreadsheets and one food spreadsheet are
# read in order and stacked into one frame. Each sheet keeps its own row
# index, so index labels repeat in the combined frame.

import pandas as pd

train_travel_1, train_travel_2, train_food = map(
    pd.read_excel,
    (
        '/content/EXP_Train_Travel_1.xlsx',
        '/content/EXP_Train_Travel_2.xlsx',
        '/content/EXP_Train_Food.xlsx',
    ),
)
train_df = pd.concat(objs=[train_travel_1, train_travel_2, train_food])
train_df

Unnamed: 0,Image ID,Description
0,train/travel/00000,A golden Naga statue with lotus flowers on th...
1,train/travel/00000,A dragon statue is inside a Chinese shrine an...
2,train/travel/00000,"A dragon statue with golden whiskers, white f..."
3,train/travel/00001,An ancient site with a black-brown pagoda wit...
4,train/travel/00001,Build a large historical site in an area fille...
...,...,...
35704,train/food/28002,Pineapple snacks filled with minced pork are ...
35705,train/food/28002,"On a white plate, pineapple was cut into small..."
35706,train/food/28003,Several skewers of grilled catfish sit in a t...
35707,train/food/28003,A large quantity of grilled catfish placed in...


In [None]:
# Validation Datasets
# Read the validation annotation spreadsheet and display it.
# Relies on `pd` having been imported by the training-data cell.

val_df = pd.read_excel('/content/EXP_Val.xlsx')
val_df

Unnamed: 0,Image ID,Description
0,val/travel/00000,It was a white church with a closed red door ...
1,val/travel/00000,Two green Naga statues are installed on the w...
2,val/travel/00000,White cement temple gable roof Red door with ...
3,val/travel/00001,A small gray pagoda sits next to a tree in fr...
4,val/travel/00001,six-story cement pagoda with a hexagonal roof...
...,...,...
12281,val/food/04034,"Many skewered Isaan sausages, some of which a..."
12282,val/food/04034,The food on several skewers placed on a fire-g...
12283,val/food/04035,"Brown steel rack with yellow, red, green, and..."
12284,val/food/04035,"Bread in a cup with orange, green, red, and b..."


## Transform path to image

In [None]:
import datasets
from PIL import Image
from tqdm.auto import tqdm

def transform_path_to_image(path, df):
    """Open the image for every row of ``df`` and split the rows by category.

    Args:
        path: Prefix prepended to each image ID to build the file path
            (for example ``"/content/train/"``).
        df: DataFrame with an ``"Image ID"`` column and a ``" Description"``
            column (the caption column name starts with a space).

    Returns:
        ``((imgs_travel, texts_travel), (imgs_food, texts_food))`` where each
        ``imgs_*`` list holds the opened PIL images and each ``texts_*`` list
        holds the matching captions, in row order.

    Raises:
        Whatever ``Image.open`` raises when a referenced file cannot be opened.
    """
    imgs_travel, texts_travel = [], []
    imgs_food, texts_food = [], []
    # Walk the two columns directly: df.iloc[i] builds a whole row Series on
    # every access, which is needlessly slow for tens of thousands of rows.
    rows = zip(df["Image ID"], df[" Description"])
    for image_id, description in tqdm(rows, total=len(df)):
        # Image IDs may contain stray spaces; strip them before building the path.
        img_path = path + image_id.replace(" ", "") + ".jpg"
        img = Image.open(img_path)
        if "travel" in img_path:
            imgs_travel.append(img)
            texts_travel.append(description)
        elif "food" in img_path:
            imgs_food.append(img)
            texts_food.append(description)
        else:
            # The row belongs to neither category and is dropped, so release
            # the file handle instead of leaking it.
            img.close()
    return (imgs_travel, texts_travel), (imgs_food, texts_food)

In [None]:
# Open the training images, split them into travel/food, and wrap each
# (images, captions) pair in a Hugging Face Dataset with "image" and "text" columns.
# Note: train_travel / train_food are first bound to (images, captions) tuples
# and then rebound to Dataset objects; train_food also replaces the DataFrame
# of the same name read from EXP_Train_Food.xlsx.
train_travel, train_food = transform_path_to_image("/content/train/", train_df)
train_travel = datasets.Dataset.from_dict({"image": train_travel[0], "text": train_travel[1]})
train_food = datasets.Dataset.from_dict({"image": train_food[0], "text": train_food[1]})
print(train_travel)
print(train_food)

  0%|          | 0/85241 [00:00<?, ?it/s]

Dataset({
    features: ['image', 'text'],
    num_rows: 49532
})
Dataset({
    features: ['image', 'text'],
    num_rows: 35709
})


In [None]:
# Same as the training cell, for the validation split: open the images, split
# them into travel/food, and build one Hugging Face Dataset per category.
# val_travel / val_food are rebound from (images, captions) tuples to Datasets.
val_travel, val_food = transform_path_to_image("/content/val/", val_df)
val_travel = datasets.Dataset.from_dict({"image": val_travel[0], "text": val_travel[1]})
val_food = datasets.Dataset.from_dict({"image": val_food[0], "text": val_food[1]})
print(val_travel)
print(val_food)

  0%|          | 0/12286 [00:00<?, ?it/s]

Dataset({
    features: ['image', 'text'],
    num_rows: 7195
})
Dataset({
    features: ['image', 'text'],
    num_rows: 5091
})


# Modeling Blip2

In [None]:
from transformers import AutoProcessor, BitsAndBytesConfig, Blip2ForConditionalGeneration

# 8-bit quantization settings. Passing `load_in_8bit=True` directly to
# from_pretrained is deprecated; transformers asks for a BitsAndBytesConfig
# via `quantization_config` instead.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

# Processor (image preprocessing + tokenizer), shared by both models
processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b-coco")

# Model for fine-tuning Travel Datasets (placed on GPU 0)
model_travel = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b-coco",
    quantization_config=quantization_config,
    device_map=0,
)

# Model for fine-tuning Food Datasets (a second, independent copy on GPU 0)
model_food = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b-coco",
    quantization_config=quantization_config,
    device_map=0,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/904 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/6.96k [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/127k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/5.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Display the module tree of the travel model (the linear layers show up as Linear8bitLt).
model_travel

Blip2ForConditionalGeneration(
  (vision_model): Blip2VisionModel(
    (embeddings): Blip2VisionEmbeddings(
      (patch_embedding): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
    )
    (encoder): Blip2Encoder(
      (layers): ModuleList(
        (0-38): 39 x Blip2EncoderLayer(
          (self_attn): Blip2Attention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear8bitLt(in_features=1408, out_features=4224, bias=True)
            (projection): Linear8bitLt(in_features=1408, out_features=1408, bias=True)
          )
          (layer_norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
          (mlp): Blip2MLP(
            (activation_fn): GELUActivation()
            (fc1): Linear8bitLt(in_features=1408, out_features=6144, bias=True)
            (fc2): Linear8bitLt(in_features=6144, out_features=1408, bias=True)
          )
          (layer_norm2): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
        )
      )
    )
    (post

## Config Lora

In [None]:
# Choose which layers receive LoRA adapters and wrap both models.

from peft import LoraConfig, get_peft_model

# Adapter hyper-parameters; only modules named q_proj / k_proj are targeted.
lora_kwargs = dict(
    target_modules=["q_proj", "k_proj"],
    bias="none",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
)
config = LoraConfig(**lora_kwargs)

# Travel model: attach adapters, then report the trainable parameter count.
model_travel = get_peft_model(model_travel, config)
model_travel.print_trainable_parameters()

# Food model: same adapter configuration.
model_food = get_peft_model(model_food, config)
model_food.print_trainable_parameters()

trainable params: 5,242,880 || all params: 3,750,514,176 || trainable%: 0.1398
trainable params: 5,242,880 || all params: 3,750,514,176 || trainable%: 0.1398


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class ImageCaptioningDataset(Dataset):
    """Torch dataset yielding processor-encoded images together with raw captions.

    Each item is a dict holding every entry returned by the processor (with
    size-1 dimensions squeezed away) plus the untouched caption under "text".
    """

    def __init__(self, dataset, processor):
        # `dataset` must support len() and integer indexing into records that
        # expose "image" and "text"; `processor` is called on one image at a time.
        self.processor = processor
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        sample = self.dataset[idx]
        features = self.processor(
            images=sample["image"],
            padding="max_length",
            return_tensors="pt",
        )
        example = {}
        for name, tensor in features.items():
            # squeeze() removes all size-1 dims; presumably this strips the
            # processor's leading batch axis — TODO confirm for other processors.
            example[name] = tensor.squeeze()
        # The caption stays a plain string here; it is tokenized at collate time.
        example["text"] = sample["text"]
        return example

def collate_fn(batch):
    """Merge a list of dataset items into one batch dict.

    Every key except "text" is stacked with ``torch.stack``. The "text"
    entries are tokenized together (padded to the longest caption in the
    batch) with the module-level ``processor`` and stored as "input_ids" and
    "attention_mask"; the raw strings are not kept.
    """
    collated = {}
    for field in batch[0].keys():
        if field == "text":
            captions = [sample["text"] for sample in batch]
            tokens = processor.tokenizer(captions, padding=True, return_tensors="pt")
            collated["input_ids"] = tokens["input_ids"]
            collated["attention_mask"] = tokens["attention_mask"]
            continue
        collated[field] = torch.stack([sample[field] for sample in batch])
    return collated

def _as_loader(hf_dataset):
    """Wrap a Hugging Face dataset in ImageCaptioningDataset and a shuffled DataLoader (batch size 4)."""
    return DataLoader(
        ImageCaptioningDataset(hf_dataset, processor),
        shuffle=True,
        batch_size=4,
        collate_fn=collate_fn,
    )

# Training loaders (each name is rebound from a Hugging Face Dataset to a DataLoader)
train_travel = _as_loader(train_travel)
train_food = _as_loader(train_food)

# Validation loaders
val_travel = _as_loader(val_travel)
val_food = _as_loader(val_food)

## Connect to Hugging Face

In [None]:
from huggingface_hub import notebook_login

# Show the Hugging Face login widget. Presumably a token with write access is
# needed, since the training cells call push_to_hub — TODO confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Train

### Travel

In [None]:
# Fine-tune the travel model for 3 epochs and push it to the Hub after each epoch.
optimizer = torch.optim.Adam(model_travel.parameters(), lr=5e-5)
device = "cuda" if torch.cuda.is_available() else "cpu"
model_travel.train()

for epoch in range(3):
    print("Epoch:", epoch + 1)
    total_train_loss = 0.0
    total_batches = 0
    for batch in tqdm(train_travel, total=len(train_travel)):
        input_ids = batch["input_ids"].to(device)
        pixel_values = batch["pixel_values"].to(device, torch.float16)
        attention_mask = batch["attention_mask"].to(device)
        # Captions are padded to the longest one in the batch. Set the labels
        # of padded positions to -100 (the index cross-entropy ignores) so the
        # loss is not computed on padding tokens.
        labels = input_ids.masked_fill(attention_mask == 0, -100)
        outputs = model_travel(input_ids=input_ids,
                        pixel_values=pixel_values,
                        labels=labels,
                        attention_mask=attention_mask)
        loss = outputs.loss
        total_train_loss += loss.item()
        total_batches += 1
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    # max(..., 1) avoids a ZeroDivisionError if the loader yields no batches.
    average_train_loss = total_train_loss / max(total_batches, 1)
    print("Train Loss for Epoch {}: {:.4f}".format(epoch + 1, average_train_loss))
    # Upload the current model state after every epoch.
    model_travel.push_to_hub("Expss4/EXP-Blip2-ip24-travel")

Epoch: 1


  0%|          | 0/12383 [00:00<?, ?it/s]

### Food

In [None]:
# Fine-tune the food model for 3 epochs and push it to the Hub after each epoch.
optimizer = torch.optim.Adam(model_food.parameters(), lr=5e-5)
device = "cuda" if torch.cuda.is_available() else "cpu"
model_food.train()

for epoch in range(3):
    print("Epoch:", epoch + 1)
    total_train_loss = 0.0
    total_batches = 0
    for batch in tqdm(train_food, total=len(train_food)):
        input_ids = batch["input_ids"].to(device)
        pixel_values = batch["pixel_values"].to(device, torch.float16)
        attention_mask = batch["attention_mask"].to(device)
        # Captions are padded to the longest one in the batch. Set the labels
        # of padded positions to -100 (the index cross-entropy ignores) so the
        # loss is not computed on padding tokens.
        labels = input_ids.masked_fill(attention_mask == 0, -100)
        outputs = model_food(input_ids=input_ids,
                        pixel_values=pixel_values,
                        labels=labels,
                        attention_mask=attention_mask)
        loss = outputs.loss
        total_train_loss += loss.item()
        total_batches += 1
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    # max(..., 1) avoids a ZeroDivisionError if the loader yields no batches.
    average_train_loss = total_train_loss / max(total_batches, 1)
    print("Train Loss for Epoch {}: {:.4f}".format(epoch + 1, average_train_loss))
    # Upload the current model state after every epoch.
    model_food.push_to_hub("Expss4/EXP-Blip2-ip24-food")

Epoch: 1


  0%|          | 0/17855 [00:00<?, ?it/s]

# Submission

## Load Test Datasets COCO from Kaggle

In [None]:
from google.colab import files

# Prompt for a file upload in Colab; the shell lines below expect the uploaded
# file to be named kaggle.json.
uploaded = files.upload()

# Move the token into ~/.kaggle and restrict its permissions to the owner.
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


In [None]:
!pip install kaggle
!kaggle datasets download -d rbewoor/coco-test-2017-images
!unzip /content/coco-test-2017-images.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: test2017/000000510779.jpg  
  inflating: test2017/000000510817.jpg  
  inflating: test2017/000000510825.jpg  
  inflating: test2017/000000510843.jpg  
  inflating: test2017/000000510862.jpg  
  inflating: test2017/000000510867.jpg  
  inflating: test2017/000000510868.jpg  
  inflating: test2017/000000510878.jpg  
  inflating: test2017/000000510887.jpg  
  inflating: test2017/000000510889.jpg  
  inflating: test2017/000000510901.jpg  
  inflating: test2017/000000510927.jpg  
  inflating: test2017/000000510935.jpg  
  inflating: test2017/000000510941.jpg  
  inflating: test2017/000000510978.jpg  
  inflating: test2017/000000510995.jpg  
  inflating: test2017/000000511010.jpg  
  inflating: test2017/000000511012.jpg  
  inflating: test2017/000000511018.jpg  
  inflating: test2017/000000511034.jpg  
  inflating: test2017/000000511035.jpg  
  inflating: test2017/000000511071.jpg  
  inflating: test2017/00000051108

## Load Model

In [None]:
from huggingface_hub import notebook_login

# Show the Hugging Face login widget again before the adapters are loaded from the Hub.
# NOTE(review): repeats the earlier login cell; presumably kept so the
# Submission section can run in a fresh session — confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoProcessor, BitsAndBytesConfig, Blip2ForConditionalGeneration

# Hub repositories holding the fine-tuned LoRA adapters.
peft_travel = "Expss4/EXP-Blip2-ip24-travel"
peft_food = "Expss4/EXP-Blip2-ip24-food"

# Each adapter config records which base checkpoint it was trained on.
config_travel = PeftConfig.from_pretrained(peft_travel)
config_food = PeftConfig.from_pretrained(peft_food)

# 8-bit quantization settings. Passing `load_in_8bit=True` directly to
# from_pretrained is deprecated in favour of `quantization_config`.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

# Travel: base model in 8-bit on GPU 0, then attach the travel adapter.
model_travel = Blip2ForConditionalGeneration.from_pretrained(
    config_travel.base_model_name_or_path,
    quantization_config=quantization_config,
    device_map=0,
)
model_travel = PeftModel.from_pretrained(model_travel, peft_travel)

# Food: base model in 8-bit on GPU 0, then attach the food adapter.
model_food = Blip2ForConditionalGeneration.from_pretrained(
    config_food.base_model_name_or_path,
    quantization_config=quantization_config,
    device_map=0,
)
model_food = PeftModel.from_pretrained(model_food, peft_food)

# Plain COCO checkpoint (no adapter) plus the shared processor.
model_coco = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b-coco",
    quantization_config=quantization_config,
    device_map=0,
)
processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b-coco")

## PyThaiNLP Translate

In [None]:
# Install/upgrade PyThaiNLP together with sacremoses, sentencepiece and fairseq.
# Presumably the extra packages are needed by the en->th translation model — TODO confirm.
!pip install --upgrade pythainlp
!pip install sacremoses sentencepiece
!pip install fairseq

In [None]:
from pythainlp.translate import Translate

# English -> Thai translator; `en2th` is used later by predicts().
en2th = Translate('en', 'th')
# Smoke test: translate one sentence and display the result.
out = en2th.translate("I want fried chicken.")
out

Corpus: scb_1m_en-th_moses
- Downloading: scb_1m_en-th_moses 1.0


  0%|          | 0/1174648148 [00:00<?, ?it/s]

  x = torch._nested_tensor_from_mask(


'ไก่ทอดค่ะ'

## Prediction
- Note: our team ran the predictions in parallel, so generating the captions did not take long.

In [None]:
# Read Submission
# Load the sample submission; its "image_id" column drives the prediction loop
# and its "caption" column is filled in with the generated captions.
# Relies on `pd` having been imported by an earlier cell.

test_df = pd.read_csv('sample_submission.csv')
test_df

Unnamed: 0,image_id,caption
0,test2017/000000160477,เด็กผู้หญิงชาวต่างชาติ กำลังถือขนมเพือกิน โดยท...
1,test2017/000000386306,เป็นย่านขายของริมถนนซึ่งมีป้ายภาษาจีนติดยื่นออ...
2,test2017/000000502273,มีแจกันสองใบ ใบทางขวามีลายกราฟิกสีเขียวส้มดำแล...
3,test2017/000000480896,
4,test2017/000000228698,
...,...,...
48668,test/food/07998,
48669,test/food/07999,
48670,test/food/08000,
48671,test/travel/08001,


In [None]:
from tqdm.auto import tqdm
from PIL import Image

def predicts(image_ls, model):
    """Caption the given image(s) with ``model`` and return the caption in Thai.

    The images are preprocessed with the module-level ``processor`` and moved
    to "cuda". A caption is generated with a maximum length of 128, decoded
    without special tokens, and only the first decoded caption is translated
    with ``en2th`` and returned.
    """
    batch = processor(images=image_ls, return_tensors="pt").to("cuda")
    token_ids = model.generate(pixel_values=batch.pixel_values, max_length=128)
    captions = processor.batch_decode(token_ids, skip_special_tokens=True)
    # Only the first caption is used, even if several images were passed in.
    return en2th.translate(captions[0])

# Caption every row of the submission frame, picking the model by category.
for index in tqdm(range(len(test_df["image_id"]))):
    image_id = test_df["image_id"][index]
    if 'food' in image_id:
        # Food images live under the extra 'test/' directory.
        image_url = 'test/' + image_id + '.jpg'
        model = model_food
    elif 'travel' in image_id:
        # Travel images live under the extra 'test/' directory as well.
        image_url = 'test/' + image_id + '.jpg'
        model = model_travel
    else:
        # Everything else is looked up by its image_id as-is and captioned
        # with the plain COCO model.
        image_url = image_id + '.jpg'
        model = model_coco
    # convert() returns a new image, so the source file can be closed right away.
    with Image.open(image_url) as source_image:
        raw_image = source_image.convert('RGB')
    generated = predicts(raw_image, model)
    # Write through .loc: `test_df.iloc[index]["caption"] = ...` is chained
    # assignment and may update a temporary copy instead of the frame.
    test_df.loc[index, "caption"] = generated

  0%|          | 0/48673 [00:00<?, ?it/s]

In [None]:
# Display the submission frame after the prediction loop has filled in captions.
test_df

Unnamed: 0,image_id,caption
0,test2017/000000160477,เด็กผู้หญิงชาวต่างชาติ กำลังถือขนมเพือกิน โดยท...
1,test2017/000000386306,เป็นย่านขายของริมถนนซึ่งมีป้ายภาษาจีนติดยื่นออ...
2,test2017/000000502273,มีแจกันสองใบ ใบทางขวามีลายกราฟิกสีเขียวส้มดำแล...
3,test2017/000000480896,จานสีเหลืองที่มีขนมปังและผลไม้
4,test2017/000000228698,ผู้หญิงยืนอยู่ข้างๆคอมพิวเตอร์
...,...,...
48668,test/food/07998,จานสีฟ้าที่มีอาหารหลายชิ้น
48669,test/food/07999,จานอาหารที่มีผักและผลไม้
48670,test/food/08000,หม้อที่มีเนื้อสัตว์และผัก
48671,test/travel/08001,อาคารที่มีนาฬิกาขนาดใหญ่บนหน้า


In [None]:
# Save the raw predictions without the index column; the post-processing step reads this file back.
test_df.to_csv('EXP_Number1.csv', index=False)

## Post-process

In [None]:
import pandas as pd

In [None]:
# Load the raw predictions written by the prediction step.
df = pd.read_csv('EXP_Number1.csv')

# Substrings removed from every caption, applied in this order.
# The "[ unused0 ]" / "[ ไม่ได้ใช้0 ]" placeholders contain spaces, so they must
# be removed BEFORE the final step that strips all spaces; otherwise they can
# never match.
_CAPTION_ARTIFACTS = [
    "[ unused0 ]",
    "[ ไม่ได้ใช้0 ]",
    '"',
    ":",
    "'",
    " ",
]

cleaned_caption = df['caption']
for artifact in _CAPTION_ARTIFACTS:
    cleaned_caption = cleaned_caption.str.replace(artifact, "", regex=False)

# Write every cleaning step back to 'caption'. Previously the quote removal
# went to a stray 'text_data' column, so quotes stayed in the captions and the
# extra column ended up in the submission file.
df['caption'] = cleaned_caption

df.to_csv('EXP_NO1.csv', index = False)