In [1]:
import os
import datasets
from transformers import VisionEncoderDecoderModel, AutoFeatureExtractor, AutoTokenizer, ViTFeatureExtractor
# Ask the Trainer not to use Weights & Biases logging.
# NOTE(review): the training-arguments cell output reports this environment
# variable as deprecated in favour of `report_to`.
os.environ["WANDB_DISABLED"] = "true"

import nltk
# Download the "punkt" tokenizer data only when it is not already available
# locally; nltk.sent_tokenize is used later when post-processing text for ROUGE.
try:
    nltk.data.find("tokenizers/punkt")
except (LookupError, OSError):
    nltk.download("punkt", quiet=True)
    

# Checkpoints used for the two halves of the captioning model.
image_encoder_model = "google/vit-base-patch16-224-in21k"
text_decode_model = "gpt2"

# Combine the pretrained image encoder and text decoder into one model.
# The load warning below lists the decoder's cross-attention weights as newly
# initialized, i.e. they carry no pretrained values before fine-tuning.
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(image_encoder_model, text_decode_model)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.9.crossattention.bias', 'h.10.crossattention.c_proj.bias', 'h.0.crossattention.c_proj.weight', 'h.5.crossattention.c_proj.weight', 'h.9.crossattention.c_attn.weight', 'h.9.crossattention.q_attn.weight', 'h.2.crossattention.c_proj.bias', 'h.1.crossattention.c_proj.weight', 'h.8.crossattention.masked_bias', 'h.5.crossattention.c_attn.weight', 'h.2.ln_cross_attn.weight', 'h.8.crossattention.bias', 'h.3.crossattention.c_proj.bias', 'h.5.crossattention.masked_bias', 'h.3.crossattention.bias', 'h.4.crossattention.c_proj.bias', 'h.2.crossattention.bias', 'h.1.crossattention.q_attn.weight', 'h.7.crossattention.q_attn.weight', 'h.4.crossattention.q_attn.weight', 'h.5.crossattention.c_proj.bias', 'h.6.crossattention.bias', 'h.1.crossattention.masked_bias', 'h.7.crossattention.masked_bias', 'h.4.crossattention.masked_bias', 'h.6.cr

In [2]:
# image feature extractor (loaded from the same checkpoint as the encoder)
feature_extractor = AutoFeatureExtractor.from_pretrained(image_encoder_model)
# text tokenizer (loaded from the same checkpoint as the decoder)
tokenizer = AutoTokenizer.from_pretrained(text_decode_model)

# GPT2 only has bos/eos tokens but not decoder_start/pad tokens.
# Reusing eos as the pad token means padded positions and end-of-sequence
# share one token id.
tokenizer.pad_token = tokenizer.eos_token

# update the model config so it uses the tokenizer's special-token ids
model.config.eos_token_id = tokenizer.eos_token_id
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.pad_token_id = tokenizer.pad_token_id

# Save the not-yet-fine-tuned model together with both processors.
# Note that this directory differs from the Trainer's output directory
# ("./image-captioning-output") used later in the notebook.
output_dir = "vit-gpt-model"
model.save_pretrained(output_dir)
feature_extractor.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('vit-gpt-model/tokenizer_config.json',
 'vit-gpt-model/special_tokens_map.json',
 'vit-gpt-model/vocab.json',
 'vit-gpt-model/merges.txt',
 'vit-gpt-model/added_tokens.json',
 'vit-gpt-model/tokenizer.json')

In [3]:
import os

# List the image directory; os.listdir returns entries in arbitrary order.
all_files = os.listdir('./data/image')

# extract all idx
# Only files named "<ascii digits>.jpg" are kept: the numeric stem is used
# later both as the key into the caption dictionary and to rebuild the image
# path, so any other name (e.g. "notes.jpg", "12.3.jpg") cannot be mapped back
# to a file and is skipped instead of crashing int().
img_idx = []

for file_name in all_files:
    stem, extension = os.path.splitext(file_name)
    if extension == '.jpg' and stem.isascii() and stem.isdigit():
        img_idx.append(int(stem))

# Sort so that the order (and therefore the later positional train/test
# split) is identical on every run and every file system.
img_idx.sort()

In [4]:
import datasets
import numpy as np
# train_data = datasets.load_dataset("cnn_dailymail", "3.0.0", split="train")
# Load the caption lookup that is indexed by the ids in img_idx.
# `.item()` unwraps the 0-d object array that np.save produces for a dict.
# NOTE(review): allow_pickle=True un-pickles arbitrary Python objects; only
# load this file from a trusted source.
# presumably keys are integer image ids and values are caption strings -
# verify against the script that wrote the file.
idx_intent = np.load('./data/total_idx_intent.npy', allow_pickle=True).item()

# Build two parallel lists: caption text and the path of the matching image.
intents = []
img_urls = []
for idx in img_idx:
    intents.append(idx_intent[idx])
    url_str = 'data/image/' + str(idx) + '.jpg'
    img_urls.append(url_str)

# split train, test set = 8:2
test_num = int(len(intents)*0.2)
# Slice with an explicit boundary instead of negative indices: when test_num
# is 0 (fewer than 5 samples), `[:-0]` would be empty and `[-0:]` would be the
# whole list, silently swapping the roles of the two splits.
split_at = len(intents) - test_num

# NOTE: the *_titles names hold image paths; the names are kept because later
# cells reference them.
train_intent = intents[:split_at]
train_titles = img_urls[:split_at]
test_intent = intents[split_at:]
test_titles = img_urls[split_at:]

In [5]:
from collections import defaultdict
from datasets import Dataset

# Column-oriented containers: 'image' holds image paths, 'caption' holds the
# matching caption for the same row.
image_caption = defaultdict(list)
for image_path, caption_text in zip(train_titles, train_intent):
    image_caption['image'].append(image_path)
    image_caption['caption'].append(caption_text)

image_caption_test = defaultdict(list)
for image_path, caption_text in zip(test_titles, test_intent):
    image_caption_test['image'].append(image_path)
    image_caption_test['caption'].append(caption_text)

# Wrap both splits as datasets.Dataset objects for the preprocessing step.
dataset = Dataset.from_dict(image_caption)
dataset_vali = Dataset.from_dict(image_caption_test)

In [6]:
from PIL import Image

# text preprocessing step
def tokenization_fn(captions, max_target_length):
    """Run tokenization on captions.

    Args:
        captions: the caption text(s) to tokenize (the caller passes a list
            of strings).
        max_target_length: number of token ids every caption is padded or
            truncated to.

    Returns:
        The tokenizer's ``input_ids`` for the captions, used as labels.
    """
    # truncation=True is required for a fixed length: padding="max_length"
    # alone only pads short captions and leaves longer ones untouched, which
    # would produce rows of different lengths that cannot be batched.
    labels = tokenizer(captions,
                       padding="max_length",
                       truncation=True,
                       max_length=max_target_length).input_ids

    return labels

# image preprocessing step
def _load_rgb_images(image_paths, check_image=True):
    """Open every path as an RGB image and report which paths were usable.

    Returns:
        (images, to_keep): `images` holds one RGB PIL image per readable
        path; `to_keep` holds one boolean per input path, True where the
        image was loaded. With `check_image=False` the first failure is
        re-raised instead of being recorded.
    """
    images = []
    to_keep = []
    for image_file in image_paths:
        try:
            # The context manager releases the file handle; convert() returns
            # a separate image, so it stays usable after the file is closed.
            # Converting to RGB mirrors what predict_step does at inference.
            with Image.open(image_file) as img:
                images.append(img.convert("RGB"))
        except Exception:
            if not check_image:
                raise
            to_keep.append(False)
        else:
            to_keep.append(True)
    return images, to_keep


def _extract_pixel_values(images):
    """Run the feature extractor on already-loaded images (numpy output)."""
    if not images:
        raise ValueError("No image could be loaded for feature extraction.")
    encoder_inputs = feature_extractor(images=images, return_tensors="np")
    return encoder_inputs.pixel_values


def feature_extraction_fn(image_paths, check_image=True):
    """
    Run feature extraction on images
    If `check_image` is `True`, the examples that fails during `Image.open()` will be caught and discarded.
    Otherwise, an exception will be thrown.

    Returns the `pixel_values` of the images that could be loaded. Because
    unreadable images are dropped, the result can be shorter than
    `image_paths`; use `preprocess_fn` when captions must stay aligned.

    Raises:
        ValueError: if no image at all could be loaded.
    """
    images, _ = _load_rgb_images(image_paths, check_image=check_image)
    return _extract_pixel_values(images)


def preprocess_fn(examples, max_target_length, check_image = True):
    """Run tokenization + image feature extraction

    Args:
        examples: batch with an 'image' column (image paths) and a 'caption'
            column (caption text), both lists of equal length.
        max_target_length: forwarded to `tokenization_fn`.
        check_image: if True, rows whose image cannot be opened are dropped
            from BOTH outputs; if False, the failure is raised.

    Returns:
        dict with 'labels' and 'pixel_values', one entry per kept row.
    """
    image_paths = examples['image']
    captions = examples['caption']

    images, to_keep = _load_rgb_images(image_paths, check_image=check_image)
    # Drop the captions of unreadable images so labels and pixel values stay
    # row-aligned (previously the keep mask was computed but never applied).
    kept_captions = [caption for caption, keep in zip(captions, to_keep) if keep]

    model_inputs = {}
    model_inputs['pixel_values'] = _extract_pixel_values(images)
    model_inputs['labels'] = tokenization_fn(kept_captions, max_target_length)

    return model_inputs

In [7]:
def _apply_preprocessing(split):
    """Map `preprocess_fn` over one split in batches, keeping only its outputs."""
    return split.map(
        function=preprocess_fn,
        batched=True,
        fn_kwargs={"max_target_length": 128},
        # Drop the raw path/caption columns; only the model inputs remain.
        remove_columns=split.column_names,
    )


# Training split first, then the held-out split.
processed_dataset = _apply_preprocessing(dataset)

processed_dataset_test = _apply_preprocessing(dataset_vali)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:27<00:00,  5.53s/ba]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:06<00:00,  3.28s/ba]


In [8]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Training configuration for the captioning model.
training_args = Seq2SeqTrainingArguments(
    # Evaluate with generated sequences so compute_metrics can decode them.
    predict_with_generate=True,
    evaluation_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    output_dir="./image-captioning-output",
    # Turn logging integrations off explicitly instead of relying only on the
    # deprecated WANDB_DISABLED environment variable (see the warning printed
    # by this cell). NOTE: "none" disables every integration, not only W&B.
    report_to="none",
    # learning_rate=7e-5,
)

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [9]:
# Load the ROUGE metric from a local script next to the notebook.
# NOTE(review): `datasets.load_metric` may be unavailable in newer `datasets`
# releases - confirm against the installed version.
metric = datasets.load_metric("./rouge.py")

# When True, compute_metrics replaces -100 entries in the labels with the pad
# token id before decoding.
ignore_pad_token_for_loss = True


def postprocess_text(preds, labels):
    """Normalise decoded predictions and references for ROUGE scoring.

    Each text is stripped of surrounding whitespace and re-joined with one
    sentence per line (rougeLSum expects newline after each sentence).

    Returns:
        (preds, labels): two lists of the reformatted strings.
    """
    def _one_sentence_per_line(texts):
        stripped = [text.strip() for text in texts]
        return ["\n".join(nltk.sent_tokenize(text)) for text in stripped]

    cleaned_preds = _one_sentence_per_line(preds)
    cleaned_labels = _one_sentence_per_line(labels)
    return cleaned_preds, cleaned_labels


def compute_metrics(eval_preds):
    """Compute ROUGE-2 precision/recall/F-measure for one evaluation pass.

    Args:
        eval_preds: pair `(preds, labels)` of token-id arrays; `preds` may be
            a tuple, in which case its first element is used.

    Returns:
        dict with `rouge2_precision`, `rouge2_recall` and `rouge2_fmeasure`
        (the `mid` aggregate of the metric), each rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # Predictions can also contain -100 (used as padding when batches of
    # different lengths are gathered in some transformers versions); such ids
    # cannot be decoded, so map them to the pad token as is done for labels.
    preds = np.where(preds != -100, preds, pad_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    if ignore_pad_token_for_loss:
        # Replace -100 in the labels as we can't decode them.
        labels = np.where(labels != -100, labels, pad_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds,
                                                     decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, rouge_types=["rouge2"])["rouge2"].mid
    return {
        "rouge2_precision": round(result.precision, 4),
        "rouge2_recall": round(result.recall, 4),
        "rouge2_fmeasure": round(result.fmeasure, 4),
    }

In [10]:
from transformers import default_data_collator

# instantiate trainer
trainer = Seq2SeqTrainer(
    model=model,
    # The image feature extractor is passed in the `tokenizer` slot; the
    # training log shows it being written next to every checkpoint
    # (preprocessor_config.json). The text tokenizer is saved separately in a
    # later cell.
    tokenizer=feature_extractor,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=processed_dataset,
    eval_dataset=processed_dataset_test,
    # Rows were already padded to a fixed length during preprocessing, so the
    # default collator is used as-is.
    data_collator=default_data_collator,
)

# Run fine-tuning; evaluation runs once per epoch (see training_args).
trainer.train()

***** Running training *****
  Num examples = 4278
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1605


Epoch,Training Loss,Validation Loss,Rouge2 Precision,Rouge2 Recall,Rouge2 Fmeasure
1,0.198,0.150629,0.0272,0.0245,0.0244
2,0.1283,0.148411,0.0483,0.0457,0.0462
3,0.1132,0.149555,0.0493,0.0493,0.0479


Saving model checkpoint to ./image-captioning-output/checkpoint-500
Configuration saved in ./image-captioning-output/checkpoint-500/config.json
Model weights saved in ./image-captioning-output/checkpoint-500/pytorch_model.bin
Feature extractor saved in ./image-captioning-output/checkpoint-500/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 1069
  Batch size = 8
Saving model checkpoint to ./image-captioning-output/checkpoint-1000
Configuration saved in ./image-captioning-output/checkpoint-1000/config.json
Model weights saved in ./image-captioning-output/checkpoint-1000/pytorch_model.bin
Feature extractor saved in ./image-captioning-output/checkpoint-1000/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 1069
  Batch size = 8
Saving model checkpoint to ./image-captioning-output/checkpoint-1500
Configuration saved in ./image-captioning-output/checkpoint-1500/config.json
Model weights saved in ./image-captioning-output/checkpoint-1500/pytorch

TrainOutput(global_step=1605, training_loss=0.14426316412809853, metrics={'train_runtime': 1644.6248, 'train_samples_per_second': 7.804, 'train_steps_per_second': 0.976, 'total_flos': 2.316073578635723e+18, 'train_loss': 0.14426316412809853, 'epoch': 3.0})

In [11]:
# Write the fine-tuned model and both processors into one directory so that
# everything needed for inference can be reloaded from a single path.
final_model_dir = "./image-captioning-output"

trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)
feature_extractor.save_pretrained(final_model_dir)

Saving model checkpoint to ./image-captioning-output
Configuration saved in ./image-captioning-output/config.json
Model weights saved in ./image-captioning-output/pytorch_model.bin
Feature extractor saved in ./image-captioning-output/preprocessor_config.json
tokenizer config file saved in ./image-captioning-output/tokenizer_config.json
Special tokens file saved in ./image-captioning-output/special_tokens_map.json
Feature extractor saved in ./image-captioning-output/preprocessor_config.json


['./image-captioning-output/preprocessor_config.json']

In [4]:
from transformers import ViTFeatureExtractor
from transformers import VisionEncoderDecoderModel, AutoFeatureExtractor, AutoTokenizer, ViTFeatureExtractor
from PIL import Image

# Reload the fine-tuned model and its two processors from the directory the
# training section saved them to.
finetuned_dir = "./image-captioning-output"

model = VisionEncoderDecoderModel.from_pretrained(finetuned_dir)
feature_extractor = ViTFeatureExtractor.from_pretrained(finetuned_dir)
tokenizer = AutoTokenizer.from_pretrained(finetuned_dir)

In [14]:
import torch
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Only the model is moved to the device; inputs are moved inside predict_step.
model = model.to(device)

# Generation settings shared by every predict_step call.
max_length, num_beams = 16, 4
gen_kwargs = dict(max_length=max_length, num_beams=num_beams)
def predict_step(image_paths):
    """Generate one caption per image path.

    Args:
        image_paths: iterable of paths to image files.

    Returns:
        list of caption strings (whitespace-stripped), in input order.
        An empty input yields an empty list without touching the model.
    """
    image_paths = list(image_paths)
    if not image_paths:
        # Nothing to caption; avoid handing an empty batch to the feature
        # extractor and to generate().
        return []

    images = []
    for image_path in image_paths:
        # The context manager closes the underlying file; convert() returns a
        # separate RGB image that remains valid afterwards (and normalises
        # grayscale/RGBA/palette inputs to three channels).
        with Image.open(image_path) as i_image:
            images.append(i_image.convert(mode="RGB"))

    pixel_values = feature_extractor(images=images, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)

    output_ids = model.generate(pixel_values, **gen_kwargs)

    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    preds = [pred.strip() for pred in preds]
    return preds

In [15]:
# Display which device was selected for inference.
device

device(type='cuda')

In [26]:
# Smoke test: caption a single image.
predict_step(['./data/image/4923.jpg'])

['different style of earings']

In [19]:
# Hand-picked image ids per category, used for qualitative checks of the captions.
# NOTE(review): 4976 appears twice in elec_idx; 5203 is in both cloth_idx and
# elec_idx; 5120 is in both elec_idx and food_idx - confirm these are intended.
cloth_idx = [5122, 5133, 5143, 5212, 5278,5302,5127,5128, 5177,5230,5321,5210,5342,5379,5203,5296,5333,5405,5373,5434,5155,5144,5367]
elec_idx = [5169,5214,5306,5310,5345,5120, 5131, 5292,5390,5117,4976,5203,5416,5089,4925,5293,5349,5354,5370,4976,5346,5208,5149,4923]
food_idx = [5198,5239,5323,5341,5120,5192,5241,5110,5161,5294,5362,5366,5386,5411,5180,5233,5364,5176,5125,5398,5351,5392]

In [27]:
import numpy as np

# Image paths for the food category, in the same order as food_idx.
predictions_url = [f'./data/image/{str(idx)}.jpg' for idx in food_idx]

In [29]:
# Caption every image in predictions_url in one call; results keep the list order.
predict_step(predictions_url)

['Different food of snack',
 'Different food for cooking',
 'Different food for cooking',
 'Different food for drinking',
 'Different types of plugs',
 'tablet and accessories',
 'snack food',
 'Different food of snack',
 'Different food of snack',
 'Different food for cooking',
 'Different food for cooking',
 'different brands of coffee',
 'Different food of snack',
 'Different food for cooking',
 'Different food of snack',
 'Different food of snack',
 'Different food of snack',
 'Different food of snack',
 'different style of candy',
 'Different food of snack',
 'Coffee',
 'Different food for cooking']

In [22]:
# NOTE(review): the recorded output (23) does not match food_idx (22 entries)
# but does match cloth_idx (23 entries); this cell was presumably executed
# (In [22]) before predictions_url was rebuilt for food_idx (In [27]).
len(predictions_url)

23