In [1]:
%load_ext autoreload
%autoreload 2

import torch
import torch.nn as nn
import pandas as pd
import os
os.environ["TOKENIZERS_PARALLELISM"] = "1"

from config import Config
from fliker_comment_tokenizer import FlikerCommentTokenizer
from img_comment_dataset import ImgCommentDataset
from pathlib import Path
from transformers import AutoTokenizer
from tqdm import tqdm

from common_util import get_logger
logger = get_logger(__name__)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
config = Config()

fliker_comment_toeknizer = FlikerCommentTokenizer.train_tokenizer(config=config)
fliker_comment_toeknizer

Loaded 158915 fliker image-caption data items.
Loaded 591753 coco image-caption data items.
Loaded 768536 visual-genome image-caption data items.


100%|██████████| 1520/1520 [00:02<00:00, 660.77it/s]





Saved new fliker comment tokenizer at: /Users/chengbai/ml/cheng_git/notebooks/paligemma-3b-mix-224-tokenizer


GemmaTokenizerFast(name_or_path='google/paligemma-3b-mix-224', vocab_size=10000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<bos>', 'eos_token': '<eos>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<image>']}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<eos>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("<bos>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("<image>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [2]:
config = Config()
new_fliker_comment_toeknizer = FlikerCommentTokenizer.get_tokenizer(config=config)

# test_data = "hello, 會 員"
test_data = "<image><bos>cheng bai<pad><eos>"
test_data_encoded = new_fliker_comment_toeknizer.encode(test_data)
logger.info(f"encoded: {test_data_encoded}")
test_data_decoded = new_fliker_comment_toeknizer.decode(test_data_encoded)
logger.info(f"decode: {test_data_decoded}")

2024-09-29 12:32:07,378 - fliker_comment_tokenizer - INFO - tokens: 10000
2024-09-29 12:32:07,379 - fliker_comment_tokenizer - INFO - tokenizer.is_fast: True
2024-09-29 12:32:07,380 - __main__ - INFO - encoded: [2, 4, 2, 143, 152, 109, 197, 77, 0, 1]
2024-09-29 12:32:07,380 - __main__ - INFO - decode: <bos><image><bos>cheng bai<pad><eos>


In [5]:
logger.info(f"<image>: {new_fliker_comment_toeknizer.encode('<image>')}")
logger.info(f"<bos>: {new_fliker_comment_toeknizer.encode('<bos>')}")
logger.info(f"<eos>: {new_fliker_comment_toeknizer.encode('<eos>')}")
logger.info(f"<pad>: {new_fliker_comment_toeknizer.encode('<pad>')}")
logger.info(f"cheng bai: {new_fliker_comment_toeknizer.encode('cheng bai')}")

<image>: [2, 4]
<bos>: [2, 2]
<eos>: [2, 1]
<pad>: [2, 0]
cheng bai: [2, 135, 148, 176, 149, 77]


In [None]:
new_fliker_comment_toeknizer.vocab_size, len(new_fliker_comment_toeknizer.get_vocab()),
new_fliker_comment_toeknizer.get_added_vocab()

for k, v in new_fliker_comment_toeknizer.vocab.items():
    if v == 0:
        logger.info(k)
        break

2024-09-29 12:29:37,937 - __main__ - INFO - <pad>


# Verify encode and decode logic

In [4]:
train_dataset = ImgCommentDataset(config, split="train")
eval_dataset = ImgCommentDataset(config, split="eval")
test_dataset = ImgCommentDataset(config, split="test")
logger.info(
    f"train_dataset: {len(train_dataset)}, eval_dataset: {len(eval_dataset)}, test_dataset: {len(test_dataset)}"
)

2024-09-29 12:32:19,652 - img_comment_dataset - INFO - Loaded 158915 fliker image-caption data items.
2024-09-29 12:32:21,573 - img_comment_dataset - INFO - Loaded 591753 coco image-caption data items.
2024-09-29 12:32:27,659 - img_comment_dataset - INFO - Loaded 768536 visual-genome image-caption data items.
2024-09-29 12:32:36,037 - img_comment_dataset - INFO - Loaded 158915 fliker image-caption data items.
2024-09-29 12:32:38,301 - img_comment_dataset - INFO - Loaded 591753 coco image-caption data items.
2024-09-29 12:32:44,255 - img_comment_dataset - INFO - Loaded 768536 visual-genome image-caption data items.
2024-09-29 12:32:52,839 - img_comment_dataset - INFO - Loaded 158915 fliker image-caption data items.
2024-09-29 12:32:54,678 - img_comment_dataset - INFO - Loaded 591753 coco image-caption data items.
2024-09-29 12:33:01,195 - img_comment_dataset - INFO - Loaded 768536 visual-genome image-caption data items.
2024-09-29 12:33:08,525 - __main__ - INFO - train_dataset: 1093826,

In [15]:
for dataset_name, dataset in [
    # ("train", train_dataset),
    # ("eval", eval_dataset),
    ("test", test_dataset),
]:
    logger.info(f"dataset_name: {dataset_name}")
    for source, comment in tqdm(
        zip(dataset.img_comments_df["source"], dataset.img_comments_df["comment"]),
        total=len(eval_dataset.img_comments_df),
    ):
        comment_encoded = new_fliker_comment_toeknizer.encode(comment)
        comment_decoded = new_fliker_comment_toeknizer.decode(comment_encoded)
    if f"<bos>{comment}" != comment_decoded:
        logger.info(
            f"comment should be same as commend_decoded. source: {source}, comment: `{comment}`, comment_decoded: `{comment_decoded}`"
        )
        # break

2024-09-29 12:43:14,939 - __main__ - INFO - dataset_name: test


 56%|█████▌    | 151921/273457 [00:06<00:05, 22053.18it/s]
