In [19]:
!pip install -U pip transformers



In [20]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

In [21]:
checkpoint = 'facebook/nllb-200-distilled-600M'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [22]:
print(f"{len(tokenizer.vocab)}\n")

tokenizer.vocab

256204



{'▁siúl': 138687,
 'västi': 98992,
 '▁ဇ': 53278,
 '▁izb': 67434,
 'ត្ថ': 63469,
 'פֿ': 23816,
 '▁Наде': 224283,
 '▁פאָ': 193977,
 '▁овар': 52593,
 '▁klè': 191791,
 'cey': 148277,
 'ଦ୍ୱ': 141527,
 '迷惑': 188570,
 'өнө': 60870,
 '▁Musambu': 219534,
 '▁رێک': 131116,
 'ិ': 248994,
 '甚': 251625,
 '▁nuest': 63463,
 '▁ñâ': 147846,
 '▁중': 3251,
 'ခြေ': 43352,
 'halten': 46241,
 '▁പക്ഷേ': 56891,
 '▁eleccions': 170064,
 '▁veiligheid': 122703,
 '▁얼마': 16236,
 '▁मजा': 84861,
 'шак': 94652,
 '▁दूसर': 44191,
 '▁κόσμο': 109299,
 'ʾ': 255682,
 '▁ही': 4197,
 '▁speċjali': 91755,
 '▁niaj': 131120,
 '浜': 254272,
 '▁Eras': 97499,
 'ኝም': 134191,
 '▁lipua': 169953,
 '▁ganas': 184983,
 'ဦး': 19685,
 '▁omulimu': 99502,
 '▁එයාට': 129542,
 'łeś': 18002,
 'ysis': 202724,
 '▁2.': 3776,
 '▁dzird': 182928,
 'ituri': 70573,
 '▁mén': 60499,
 'ወስ': 49556,
 'რაფ': 123490,
 '▁bizony': 45110,
 '▁która': 92134,
 'დზე': 214635,
 '▁әһәми': 175377,
 'olisi': 32922,
 'ക്കൊ': 88206,
 'riny': 49186,
 '▁ulter': 214709,
 '▁finants'

In [23]:
thai_char_min = 0x0E00
thai_char_max = 0x0E7F

thai_tokens = [
    token for token in tokenizer.vocab.keys()
    if any(thai_char_min <= ord(char) <= thai_char_max for char in token)
]

thai_token_count = len(thai_tokens)
sample_size = 20
thai_tokens_sample = thai_tokens[:sample_size]


print(f"{thai_token_count}\n")
for token in thai_tokens_sample:
  print(token)

1712

กี่
▁แล้วคุณ
▁ย
เปลี่ยนแปลง
าร
▁ลูก
ฐ
ศ
กับ
▁พระองค์
สอง
เป็นการ
ข
รา
มั้ย
ทธิ
มนุษย์
ชน์
ใส่
ที่ผม


In [24]:
import tensorflow as tf
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import numpy as np
import math

In [25]:
sentence = 'Work hard, play harder'

In [26]:
cleaned_sentence = sentence.replace(',', '')
cleaned_sentence

'Work hard play harder'

In [27]:
words = cleaned_sentence.split()
words

['Work', 'hard', 'play', 'harder']

In [28]:
sorted_words = sorted(words)
sorted_words

['Work', 'hard', 'harder', 'play']

In [29]:
dc = {word: index for index, word in enumerate(sorted_words)}
dc

{'Work': 0, 'hard': 1, 'harder': 2, 'play': 3}

In [30]:
sentence_int = tf.constant(
    [dc[s] for s in sentence.replace(',', '').split()],
    dtype=tf.int32
)

In [31]:
print(sentence)
print(sentence_int)

Work hard, play harder
tf.Tensor([0 1 3 2], shape=(4,), dtype=int32)


In [32]:
# สร้าง embedding layer
tf.random.set_seed(123)
vocab_size = 50_000
embedding_dim = 2

embed = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)

In [33]:
embedded_sentence = embed(sentence_int)

In [34]:
embedded_sentence

<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[-0.02145181,  0.00347415],
       [-0.01575999,  0.01206651],
       [-0.01735415, -0.03061302],
       [-0.0208887 ,  0.04576671]], dtype=float32)>

In [35]:
tf.random.set_seed(123)
vocab_size = 50_000
embedding_dim = 2

dummy_input = tf.constant([0, 1, 2], dtype=tf.int32)

# Case 1 Default initializer (RandomUniform(-0.05, 0.05))
embed_default = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)
_ = embed_default(dummy_input) # เรียกใช้งาน layer เพื่อสร้าง weights
weights_default = embed_default.get_weights()[0].flatten()
weights_default.shape

(100000,)

In [36]:
# Case 2 GlorotUniform initializer
tf.random.set_seed(123)
embed_glorot = tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    embeddings_initializer=tf.keras.initializers.GlorotUniform()
)
_ = embed_glorot(dummy_input) # เรียกใช้งาน layer เพื่อสร้าง weights
weights_glorot = embed_glorot.get_weights()[0].flatten()
weights_glorot.shape

(100000,)

In [37]:
fig = make_subplots(rows=1, cols=1)

fig.add_trace(go.Histogram(x=weights_default, nbinsx=50, name="Default Uniform [-0.05, 0.05]", opacity=0.6))
fig.add_trace(go.Histogram(x=weights_glorot, nbinsx=50, name="Glorot Uniform", opacity=0.6))

fig.update_layout(
    title_text='Embedding Layer Initialization Comparison',
    xaxis_title_text='Weight values',
    yaxis_title_text='Frequency',
    barmode='overlay',
    legend_orientation="h",
    legend_yanchor="bottom",
    legend_y=1.02,
    legend_xanchor="right",
    legend_x=1
)

fig.show()

print("Default initializer range ", weights_default.min(), weights_default.max())
print("Glorot initializer range ", weights_glorot.min(), weights_glorot.max())

Default initializer range  -0.049999774 0.049998906
Glorot initializer range  -0.010954062 0.01095353


In [38]:
def glorot_uniform_limits(fan_in, fan_out):
    limit = math.sqrt(6.0 / (fan_in + fan_out))
    a, b = -limit, limit
    return a, b

# ตัวอย่าง Embedding layer (vocab_size=50000, embedding_dim=2)
fan_in = 50000
fan_out = 2

a, b = glorot_uniform_limits(fan_in, fan_out)
print("Glorot Uniform a =", a)
print("Glorot Uniform b =", b)

Glorot Uniform a = -0.010954232067652772
Glorot Uniform b = 0.010954232067652772


In [39]:
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [40]:
token_embedding_layer = model.model.encoder.embed_tokens
token_embedding_layer.weight.shape

torch.Size([256206, 1024])

In [41]:
long_sentence = "In the vast realm of natural language processing, understanding the nuances of how models handle sequential data is crucial. Positional encoding plays a vital role in providing this essential information to the model, allowing it to differentiate between words at different positions in a sentence, which is fundamental for tasks like translation, summarization, and text generation."


In [42]:
tokens = tokenizer(long_sentence, return_tensors="pt")

print(tokens['input_ids'][0])

tensor([256047,    717,    349,  14430,  12284, 248070,    452,  25307,  65445,
        157278, 248079, 133930,    349,    713,  75831,    452,  11657, 141057,
         47274, 116914, 124785,   6067,    248, 182071, 248075,  12013,  58409,
         12025, 246156,   3054,    705,      9, 104781,  76065,    108, 174693,
          3423, 140515,  18781,    202,    349,  14916, 248079,  82935,     87,
           796,    202,  53054,    502,  25914,  51744,    230,  30158, 199073,
           108,      9, 109267, 248079,   9089,    248,  75529,    351, 226047,
          6399, 200356, 248079,   2493, 109207, 181953, 248079,    540,  35883,
        120531, 248075,      2])


In [43]:
len(tokens['input_ids'][0])

75

In [44]:
token_embedding_layer(tokens['input_ids'][0][0]).shape


torch.Size([1024])

In [45]:
token_embeddings = token_embedding_layer(tokens['input_ids'][0])

print("Token Embedding Matrix shape", token_embeddings.shape)
token_embeddings

Token Embedding Matrix shape torch.Size([75, 1024])


tensor([[-5.0000e+00, -1.2725e+00, -9.3604e-01,  ..., -1.8297e+01,
         -9.1328e+00, -1.0672e+01],
        [ 2.6416e-01,  2.6831e-01,  2.0117e-01,  ...,  3.2715e+00,
         -3.2402e+00,  3.1738e+00],
        [ 4.3579e-01, -2.3352e-01,  2.6825e-02,  ...,  5.4648e+00,
          2.7129e+00,  5.5430e+00],
        ...,
        [ 8.5859e+00, -4.5391e+00, -4.7314e-01,  ..., -7.9529e-02,
          7.4844e+00, -7.5156e+00],
        [-2.4863e+00, -2.7515e-01,  5.6114e-03,  ...,  1.0180e+01,
         -7.2422e+00, -4.8047e+00],
        [-7.8320e-01, -9.0527e-01, -9.4482e-01,  ...,  3.1078e+01,
         -8.1494e-01, -8.7354e-01]], grad_fn=<MulBackward0>)

In [46]:
import plotly.express as px

token_embeddings_np = token_embeddings.detach().numpy()

fig = px.imshow(
    token_embeddings_np,
    color_continuous_scale="RdBu",
    labels=dict(x="Embedding Dimension", y="Token Index", color="Value"),
    title="Token Embedding Heatmap"
)

fig.update_xaxes(side="top")
fig.update_layout(height=500, width=900)
fig.show()

In [47]:
d = embedded_sentence.shape[-1]
d


2

In [48]:
d_q, d_k, d_v = 2, 2, 4

d_q, d_k, d_v

(2, 2, 4)

In [49]:
tf.random.set_seed(123)
W_query = tf.Variable(tf.random.uniform((d, d_q)), trainable=True)
W_key   = tf.Variable(tf.random.uniform((d, d_k)), trainable=True)
W_value = tf.Variable(tf.random.uniform((d, d_v)), trainable=True)

In [50]:
print(W_query.shape, W_key.shape, W_value.shape)

(2, 2) (2, 2) (2, 4)


In [51]:
W_query

<tf.Variable 'Variable:0' shape=(2, 2) dtype=float32, numpy=
array([[0.12615311, 0.5727513 ],
       [0.2993133 , 0.5461836 ]], dtype=float32)>

In [52]:
W_key

<tf.Variable 'Variable:0' shape=(2, 2) dtype=float32, numpy=
array([[0.88968754, 0.12354946],
       [0.7718717 , 0.6850728 ]], dtype=float32)>

In [None]:
W_value

In [53]:
embedded_sentence

<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[-0.02145181,  0.00347415],
       [-0.01575999,  0.01206651],
       [-0.01735415, -0.03061302],
       [-0.0208887 ,  0.04576671]], dtype=float32)>

In [54]:
queries = tf.matmul(embedded_sentence, W_query)
keys    = tf.matmul(embedded_sentence, W_key)
values  = tf.matmul(embedded_sentence, W_value)

In [55]:
print("Queries shape", queries.shape)
queries

Queries shape (4, 2)


<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[-0.00166635, -0.01038902],
       [ 0.00162349, -0.00243603],
       [-0.01135216, -0.02665994],
       [ 0.01106341,  0.013033  ]], dtype=float32)>

In [56]:
print("Keys shape", keys.shape)
keys

Keys shape (4, 2)


<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[-0.01640381, -0.00027031],
       [-0.00470768,  0.0063193 ],
       [-0.0390691 , -0.02311624],
       [ 0.01674161,  0.02877274]], dtype=float32)>

In [57]:
print("Values shape", values.shape)
values

Values shape (4, 4)


<tf.Tensor: shape=(4, 4), dtype=float32, numpy=
array([[-0.00735056, -0.01126149, -0.00542851, -0.01317014],
       [ 0.00323394, -0.00470018,  0.00255983, -0.00726063],
       [-0.03627863, -0.02166347, -0.0273949 , -0.01913868],
       [ 0.03130601,  0.00495247,  0.02388407, -0.00206572]],
      dtype=float32)>

In [59]:
omega = tf.matmul(queries, keys, transpose_b=True)

print("Omega shape", omega.shape)
print("Omega (Unnormalized attention weights)")
print(omega)

Omega shape (4, 4)
Omega (Unnormalized attention weights)
tf.Tensor(
[[ 3.0142777e-05 -5.7806679e-05  3.0525803e-04 -3.2681812e-04]
 [-2.5972984e-05 -2.3036868e-05 -7.1165923e-06 -4.2911328e-05]
 [ 1.9342516e-04 -1.1502979e-04  1.0597964e-03 -9.5713319e-04]
 [-1.8500503e-04  3.0276424e-05 -7.3351152e-04  5.6021451e-04]], shape=(4, 4), dtype=float32)


In [60]:
d_k = tf.cast(d_k, tf.float32)

scaled_omega = omega / tf.sqrt(d_k)

attention_weights = tf.nn.softmax(scaled_omega, axis=-1)

print("Attention Weights")
print(attention_weights)

Attention Weights
tf.Tensor(
[[0.25000748 0.24999195 0.25005612 0.2499444 ]
 [0.24999978 0.25000033 0.25000313 0.24999678]
 [0.25002617 0.24997163 0.25017935 0.24982281]
 [0.24998176 0.25001985 0.24988484 0.25011355]], shape=(4, 4), dtype=float32)


In [61]:
row_sums = tf.reduce_sum(attention_weights, axis=-1)

print("Sum of each row in attention_weights")
row_sums

Sum of each row in attention_weights


<tf.Tensor: shape=(4,), dtype=float32, numpy=array([1.        , 1.        , 0.99999994, 1.        ], dtype=float32)>

In [62]:
context_vector = tf.matmul(attention_weights, values)

print("Context Vector shape", context_vector.shape)
print(context_vector)

Context Vector shape (4, 4)
tf.Tensor(
[[-0.00227617 -0.00816971 -0.0015978  -0.01040979]
 [-0.00227252 -0.00816825 -0.00159504 -0.01040884]
 [-0.00228465 -0.00817309 -0.00160424 -0.010412  ]
 [-0.00226438 -0.008165   -0.00158886 -0.01040673]], shape=(4, 4), dtype=float32)


In [63]:
class SelfAttention(tf.keras.layers.Layer):
    def __init__(self, d_in, d_out_kq, d_out_v):
        super().__init__()
        self.d_out_kq = d_out_kq

        self.W_query = tf.Variable(
            tf.random.uniform((d_in, d_out_kq)), trainable=True
        )
        self.W_key = tf.Variable(
            tf.random.uniform((d_in, d_out_kq)), trainable=True
        )
        self.W_value = tf.Variable(
            tf.random.uniform((d_in, d_out_v)), trainable=True
        )

    def call(self, x):
        keys = tf.matmul(x, self.W_key)      # [T, d_out_kq]
        queries = tf.matmul(x, self.W_query) # [T, d_out_kq]
        values = tf.matmul(x, self.W_value)  # [T, d_out_v]

        # Attention scores: QKᵀ
        attn_scores = tf.matmul(queries, keys, transpose_b=True)  # [T, T]

        # Softmax (scaled by sqrt(d_k))
        attn_weights = tf.nn.softmax(
            attn_scores / tf.math.sqrt(tf.cast(self.d_out_kq, tf.float32)), axis=-1
        )  # [T, T]

        # Weighted sum
        context_vec = tf.matmul(attn_weights, values)  # [T, d_out_v]
        return context_vec

In [64]:
tf.random.set_seed(123)

d_in, d_out_kq, d_out_v = 2, 2, 4

sa = SelfAttention(d_in, d_out_kq, d_out_v)

out = sa(embedded_sentence)

print(out.shape)  # (T, d_out_v)
print(out.numpy())

(4, 4)
[[-0.00227617 -0.00816971 -0.0015978  -0.01040979]
 [-0.00227252 -0.00816825 -0.00159504 -0.01040884]
 [-0.00228465 -0.00817309 -0.00160424 -0.010412  ]
 [-0.00226438 -0.008165   -0.00158886 -0.01040673]]


In [65]:
class MultiHeadAttentionWrapper(tf.keras.layers.Layer):
    def __init__(self, d_in, d_out_kq, d_out_v, num_heads):
        super().__init__()
        self.heads = [
            SelfAttention(d_in, d_out_kq, d_out_v)
            for _ in range(num_heads)
        ]

    def call(self, x):
        # รันทุก head แล้ว concat ตามแกนสุดท้าย
        head_outputs = [head(x) for head in self.heads]   # list of [T, d_out_v]
        return tf.concat(head_outputs, axis=-1)           # [T, num_heads * d_out_v]

In [66]:
tf.random.set_seed(123)

d_in, d_out_kq, d_out_v = 2, 2, 1

sa = SelfAttention(d_in, d_out_kq, d_out_v)

# ถ้า embedded_sentence.shape = [T, d_in] เช่น [6, 3]
out = sa(embedded_sentence)

print(out.shape)   # (T, d_out_v) -> (6, 1)
print(out.numpy())

(4, 1)
[[-0.00474349]
 [-0.00474116]
 [-0.00474891]
 [-0.00473596]]


In [67]:
tf.random.set_seed(123)

block_size = embedded_sentence.shape[0]   # [T, d_in] → T = sequence length

mha = MultiHeadAttentionWrapper(
    d_in, d_out_kq, d_out_v, num_heads=3
)

# run MHA
context_vecs = mha(embedded_sentence)   # [T, num_heads * d_out_v]

print(context_vecs)
print("context_vecs.shape:", context_vecs.shape)

tf.Tensor(
[[-0.00474349 -0.0033779  -0.00684457]
 [-0.00474116 -0.00337739 -0.00684223]
 [-0.00474891 -0.0033788  -0.00684563]
 [-0.00473596 -0.00337655 -0.00684152]], shape=(4, 3), dtype=float32)
context_vecs.shape: (4, 3)
