In [6]:
# %%

from math import pi
import sentencepiece as spm
import os
from icecream import ic

In [7]:
# %%

TOKENIZER_PATH = "./paligemma_tokenizer.model"
if not os.path.exists(TOKENIZER_PATH):
    print("Downloading the model tokenizer...")
    !gsutil cp gs://big_vision/paligemma_tokenizer.model {TOKENIZER_PATH}
    print(f"Tokenizer path: {TOKENIZER_PATH}")
else:
    print(f"Tokenizer file: {TOKENIZER_PATH} is already downloaded")

Tokenizer file: ./paligemma_tokenizer.model is already downloaded


In [8]:
# %%

sp = spm.SentencePieceProcessor(TOKENIZER_PATH)

# encode: text => id
print(sp.EncodeAsPieces("This is a test"))
print(sp.EncodeAsPieces("This is a test_case"))
print(sp.EncodeAsIds("This is a test"))
print(sp.EncodeAsIds("Hello World"))

# decode: id => text
print(sp.DecodePieces(["This", "▁is", "▁a", "▁t", "est"]))

# print(sp.decode_ids([209, 31, 9, 375, 586]))
print(sp.DecodeIds([1596, 603, 476, 2121]))

['This', '▁is', '▁a', '▁test']
['This', '▁is', '▁a', '▁test', '_', 'case']
[1596, 603, 476, 2121]
[4521, 3855]
This is a test
This is a test


In [9]:
# %%

sp.GetPieceSize()

257152

In [10]:
# %%

reserved_size = 256

last_id = sp.GetPieceSize() - 1
ic(last_id)
ids = [i for i in range(last_id - reserved_size, last_id + 1)]
ic(ids)
# print(sp.DecodeIds(ids))

ic| last_id: 257151
ic| ids: [256895,
          256896,
          256897,
          256898,
          256899,
          256900,
          256901,
          256902,
          256903,
          256904,
          256905,
          256906,
          256907,
          256908,
          256909,
          256910,
          256911,
          256912,
          256913,
          256914,
          256915,
          256916,
          256917,
          256918,
          256919,
          256920,
          256921,
          256922,
          256923,
          256924,
          256925,
          256926,
          256927,
          256928,
          256929,
          256930,
          256931,
          256932,
          256933,
          256934,
          256935,
          256936,
          256937,
          256938,
          256939,
          256940,
          256941,
          256942,
          256943,
          256944,
          256945,
          256946,
          256947,
          256948,
        

[256895,
 256896,
 256897,
 256898,
 256899,
 256900,
 256901,
 256902,
 256903,
 256904,
 256905,
 256906,
 256907,
 256908,
 256909,
 256910,
 256911,
 256912,
 256913,
 256914,
 256915,
 256916,
 256917,
 256918,
 256919,
 256920,
 256921,
 256922,
 256923,
 256924,
 256925,
 256926,
 256927,
 256928,
 256929,
 256930,
 256931,
 256932,
 256933,
 256934,
 256935,
 256936,
 256937,
 256938,
 256939,
 256940,
 256941,
 256942,
 256943,
 256944,
 256945,
 256946,
 256947,
 256948,
 256949,
 256950,
 256951,
 256952,
 256953,
 256954,
 256955,
 256956,
 256957,
 256958,
 256959,
 256960,
 256961,
 256962,
 256963,
 256964,
 256965,
 256966,
 256967,
 256968,
 256969,
 256970,
 256971,
 256972,
 256973,
 256974,
 256975,
 256976,
 256977,
 256978,
 256979,
 256980,
 256981,
 256982,
 256983,
 256984,
 256985,
 256986,
 256987,
 256988,
 256989,
 256990,
 256991,
 256992,
 256993,
 256994,
 256995,
 256996,
 256997,
 256998,
 256999,
 257000,
 257001,
 257002,
 257003,
 257004,
 257005,
 

In [11]:
for i in range(last_id - reserved_size, last_id):
    piece = sp.IdToPiece(i)
    print(f"id:{i} -->piece:{piece}")

id:256895 -->piece:<loc0895>
id:256896 -->piece:<loc0896>
id:256897 -->piece:<loc0897>
id:256898 -->piece:<loc0898>
id:256899 -->piece:<loc0899>
id:256900 -->piece:<loc0900>
id:256901 -->piece:<loc0901>
id:256902 -->piece:<loc0902>
id:256903 -->piece:<loc0903>
id:256904 -->piece:<loc0904>
id:256905 -->piece:<loc0905>
id:256906 -->piece:<loc0906>
id:256907 -->piece:<loc0907>
id:256908 -->piece:<loc0908>
id:256909 -->piece:<loc0909>
id:256910 -->piece:<loc0910>
id:256911 -->piece:<loc0911>
id:256912 -->piece:<loc0912>
id:256913 -->piece:<loc0913>
id:256914 -->piece:<loc0914>
id:256915 -->piece:<loc0915>
id:256916 -->piece:<loc0916>
id:256917 -->piece:<loc0917>
id:256918 -->piece:<loc0918>
id:256919 -->piece:<loc0919>
id:256920 -->piece:<loc0920>
id:256921 -->piece:<loc0921>
id:256922 -->piece:<loc0922>
id:256923 -->piece:<loc0923>
id:256924 -->piece:<loc0924>
id:256925 -->piece:<loc0925>
id:256926 -->piece:<loc0926>
id:256927 -->piece:<loc0927>
id:256928 -->piece:<loc0928>
id:256929 -->p

In [12]:
# %%

special_ids = [sp.bos_id(), sp.eos_id(), sp.pad_id(), sp.unk_id()]
ic(sp.IdToPiece([i for i in special_ids]))

ic| sp.IdToPiece([i for i in special_ids]): ['<bos>', '<eos>', '<pad>', '<unk>']


['<bos>', '<eos>', '<pad>', '<unk>']

In [13]:
# %%

real_last_id = sp.GetPieceSize() - 1
ic(real_last_id)

# 257_151

ic| real_last_id: 257151


257151

In [16]:
# %%

ic(real_last_id)

reserved_size = 255
start_id = 255_700
last_id = start_id + reserved_size

num_to_id = {}
id_to_num = {}
pieces = []

for n, i in enumerate(range(last_id - reserved_size, last_id + 1)):
    piece = sp.IdToPiece(i)
    print(f"id({n}):{i} -->piece:{piece} bytes:{piece.encode('utf-8')}")
    pieces.append(piece)

ic| real_last_id: 257151


id(0):255700 -->piece:⢺ bytes:b'\xe2\xa2\xba'
id(1):255701 -->piece:⤙ bytes:b'\xe2\xa4\x99'
id(2):255702 -->piece:⾞ bytes:b'\xe2\xbe\x9e'
id(3):255703 -->piece:ㅚ bytes:b'\xe3\x85\x9a'
id(4):255704 -->piece:㠀 bytes:b'\xe3\xa0\x80'
id(5):255705 -->piece:䊐 bytes:b'\xe4\x8a\x90'
id(6):255706 -->piece:喈 bytes:b'\xe5\x96\x88'
id(7):255707 -->piece:噐 bytes:b'\xe5\x99\x90'
id(8):255708 -->piece:嚭 bytes:b'\xe5\x9a\xad'
id(9):255709 -->piece:囷 bytes:b'\xe5\x9b\xb7'
id(10):255710 -->piece:堟 bytes:b'\xe5\xa0\x9f'
id(11):255711 -->piece:塤 bytes:b'\xe5\xa1\xa4'
id(12):255712 -->piece:夊 bytes:b'\xe5\xa4\x8a'
id(13):255713 -->piece:岽 bytes:b'\xe5\xb2\xbd'
id(14):255714 -->piece:巎 bytes:b'\xe5\xb7\x8e'
id(15):255715 -->piece:廔 bytes:b'\xe5\xbb\x94'
id(16):255716 -->piece:彞 bytes:b'\xe5\xbd\x9e'
id(17):255717 -->piece:慥 bytes:b'\xe6\x85\xa5'
id(18):255718 -->piece:扞 bytes:b'\xe6\x89\x9e'
id(19):255719 -->piece:扺 bytes:b'\xe6\x89\xba'
id(20):255720 -->piece:攆 bytes:b'\xe6\x94\x86'
id(21):255721 -->piece:

In [23]:
# %%
pieces_str = " ".join(pieces)
ic(pieces_str)
ic(sp.Encode(pieces_str)[:5])
ic(None)

ic| pieces_str: ('⢺ ⤙ ⾞ ㅚ 㠀 䊐 喈 噐 嚭 囷 堟 塤 夊 岽 巎 廔 彞 慥 扞 扺 攆 敉 榇 槩 殂 沺 炱 烀 琑 痦 盦 睃 籴 籺 粁 糹 紕 缐 '
                 '缑 翆 臜 蒗 虯 蟶 袿 訫 謌 賔 踅 輅 轵 遄 釔 鋱 鍖 鏉 鏐 鏵 雱 驺 鴴 鷙 麃 鼷 龱 갛 뚠 몭 뻥 뿅 썽 옅 읭 졍 좡 켭 '
                 '펍 횽 \ue02b \ue039 \ue059 \ue082 \ue08a \ue0f1 \ue21a \ue2e8 \ue2f2 \ue314 '
                 '\ue368 \ue5d2 \ue734 \ue73e \ue762 \ue777 \ue840 \uec4c \uf102 \uf17d 兩 流 吏 '
                 'ﭽ ︕ ﹛ ﺝ 𐌱 𑄠 𝆣 𝑿 𝕲 𝘍 𝘒 𝘜 𝝈 𞤑 🪕 🪤 𨭆 \x1c'
                 ' ǈ ǋ Ȟ ˕ ϛ Ϡ ѻ Ӂ ӷ ֠ ܴ ߺ ऴ ଃ ႐ ჻ ᄊ ᆹ ኾ ዡ ᒍ ᠁ ḕ ṝ ṽ Ἤ ↆ ⇎ ∭ ⊐ ⏔ ┥ ⛛ ⠅ ⠆ ⠖ ⡁ ⡌ '
                 '⡑ ⢊ ⢎ ⢐ ⢫ ⣜ ⵌ ⽌ ⾄ ⾳ ㅙ 䴀 俁 俛 倞 冋 凩 劒 勦 喼 嗗 垆 垪 埝 夲 奷 姸 媖 寕 屨 巂 怍 悒 慤 搋 擽 昻 檠 '
                 '欵 殽 汌 沇 涷 淥 滏 濉 珖 瓛 甡 皝 眛 笅 簣 縒 臵 虓 蚨 蛧 袮 裥 谖 谞 赑 趼 醱 鉏 鉺 韡 頎 飈 鯢 鲆 鶿 鹇 麁 齙 '
                 '龥 ꘋ 걔 먀 몄 삵 쌩 읊 튿 횃 \ue08b \ue0b9 \ue29a \ue2a2 \ue2ca \ue2d6 \ue2e6 \ue2f0 '
                 '\ue405 \ue604 \ue65a \ue684 \ue6ec')
ic| sp.Encode(pieces_str)[:5]: [255700, 235248, 255701, 235248, 255702]
ic| None


In [None]:
# print(pieces_str)

ic(len(pieces_str))
ic(pieces_str.encode())

# Action Value → Token 

```text
Q: What should the robot do to <action>?
A:“terminate pos pos pos rot rot rot gripper_extension”  
```
## Steps
- **step 1**: convert floating action values to discrete values(0~255) 
- **step 2**: convert discrete values to tokens

In [None]:
# %%

# Test full string conversion
# real value (meter dimension or degree for rotation)
# action value -> id -> token

# scale for translation  1/128 (meter)
# scale for rotation 180./128 (degree)
# action value range(-scale, scale) -> action value (-128, 128)