In [48]:
!pip install -U pip transformers



In [49]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

In [50]:
checkpoint = 'facebook/nllb-200-distilled-600M'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [51]:
print(f"{len(tokenizer.vocab)}\n")

tokenizer.vocab

256204



{'▁अजून': 72912,
 '▁Timotius': 201656,
 '▁නැත': 64105,
 '▁tumani': 210703,
 '廊': 253998,
 '▁વી': 66514,
 '▁elbette': 230855,
 '▁بچووک': 233202,
 'egia': 115057,
 'okoto': 105427,
 '▁Fini': 240128,
 '憾': 255031,
 'hãla': 165045,
 '▁μυ': 103418,
 '▁rialacha': 147898,
 '入手': 127989,
 'otros': 32150,
 '▁aminareo': 146975,
 '▁esdeveniments': 240560,
 '▁पासून': 148864,
 '▁Sabata': 184716,
 '▁šport': 105617,
 '▁поза': 95372,
 '▁Тау': 247078,
 '▁arvas': 191930,
 '▁ضرور': 53471,
 '▁죽음': 47904,
 '▁ಪ್ರತಿಕ್ರಿಯ': 146032,
 'ξεων': 215766,
 '▁November': 36381,
 '従業': 74557,
 'osola': 129985,
 '▁peduli': 95099,
 '▁ambu': 174538,
 '▁minsi': 110676,
 'नां': 66529,
 '▁세션': 239600,
 '▁komissio': 143044,
 'ಚ್': 9230,
 '▁សួរ': 145617,
 'ċjalment': 193613,
 '▁мек': 54835,
 '▁компјутер': 190636,
 '인과': 97325,
 '▁ڵک': 135722,
 '▁чув': 28726,
 '▁gbanwee': 205024,
 '▁очень': 43997,
 '▁Yérusal': 127431,
 '▁8:': 23388,
 '▁bizonyít': 128507,
 '▁తెలిసిన': 169104,
 '▁Tadbir': 235067,
 'ཁས་': 120717,
 '▁šta': 19633,
 

In [52]:
thai_char_min = 0x0E00
thai_char_max = 0x0E7F

thai_tokens = [
    token for token in tokenizer.vocab.keys()
    if any(thai_char_min <= ord(char) <= thai_char_max for char in token)
]

thai_token_count = len(thai_tokens)
sample_size = 20
thai_tokens_sample = thai_tokens[:sample_size]


print(f"{thai_token_count}\n")
for token in thai_tokens_sample:
  print(token)


1712

ูง
งว
อื่น
ชน
▁มา
แม้
ระหว่าง
▁ดี
ท้าย
ล์
▁แล้วก็
▁ล
▁น
วาง
ื่น
สาร
ปฏิ
ียบ
ทาน
▁แม่


In [53]:
import tensorflow as tf
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import numpy as np
import math

In [54]:
sentence = 'Work hard, play harder'

In [55]:
cleaned_sentence = sentence.replace(',', '')
cleaned_sentence

'Work hard play harder'

In [56]:
words = cleaned_sentence.split()
words

['Work', 'hard', 'play', 'harder']

In [57]:
sorted_words = sorted(words)
sorted_words

['Work', 'hard', 'harder', 'play']

In [58]:
dc = {word: index for index, word in enumerate(sorted_words)}
dc

{'Work': 0, 'hard': 1, 'harder': 2, 'play': 3}

In [59]:
sentence_int = tf.constant(
    [dc[s] for s in sentence.replace(',', '').split()],
    dtype=tf.int32
)

In [60]:
print(sentence)
print(sentence_int)

Work hard, play harder
tf.Tensor([0 1 3 2], shape=(4,), dtype=int32)


In [61]:
# สร้าง embedding layer
tf.random.set_seed(123)
vocab_size = 50_000
embedding_dim = 2

embed = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)

In [62]:
embedded_sentence = embed(sentence_int)

In [63]:
embedded_sentence

<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[ 0.04147549, -0.03842679],
       [-0.04023755,  0.04676259],
       [-0.00783229,  0.03582766],
       [ 0.01605607, -0.01558995]], dtype=float32)>

In [64]:
tf.random.set_seed(123)
vocab_size = 50_000
embedding_dim = 2

dummy_input = tf.constant([0, 1, 2], dtype=tf.int32)

# Case 1 Default initializer (RandomUniform(-0.05, 0.05))
embed_default = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)
_ = embed_default(dummy_input) # เรียกใช้งาน layer เพื่อสร้าง weights
weights_default = embed_default.get_weights()[0].flatten()
weights_default.shape

(100000,)

In [65]:
# Case 2 GlorotUniform initializer
tf.random.set_seed(123)
embed_glorot = tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    embeddings_initializer=tf.keras.initializers.GlorotUniform()
)
_ = embed_glorot(dummy_input) # เรียกใช้งาน layer เพื่อสร้าง weights
weights_glorot = embed_glorot.get_weights()[0].flatten()
weights_glorot.shape

(100000,)

In [66]:
fig = make_subplots(rows=1, cols=1)

fig.add_trace(go.Histogram(x=weights_default, nbinsx=50, name="Default Uniform [-0.05, 0.05]", opacity=0.6))
fig.add_trace(go.Histogram(x=weights_glorot, nbinsx=50, name="Glorot Uniform", opacity=0.6))

fig.update_layout(
    title_text='Embedding Layer Initialization Comparison',
    xaxis_title_text='Weight values',
    yaxis_title_text='Frequency',
    barmode='overlay',
    legend_orientation="h",
    legend_yanchor="bottom",
    legend_y=1.02,
    legend_xanchor="right",
    legend_x=1
)

fig.show()

print("Default initializer range ", weights_default.min(), weights_default.max())
print("Glorot initializer range ", weights_glorot.min(), weights_glorot.max())

Default initializer range  -0.049999036 0.049999844
Glorot initializer range  -0.010954028 0.010954154


In [67]:
def glorot_uniform_limits(fan_in, fan_out):
    limit = math.sqrt(6.0 / (fan_in + fan_out))
    a, b = -limit, limit
    return a, b

# ตัวอย่าง Embedding layer (vocab_size=50000, embedding_dim=2)
fan_in = 50000
fan_out = 2

a, b = glorot_uniform_limits(fan_in, fan_out)
print("Glorot Uniform a =", a)
print("Glorot Uniform b =", b)

Glorot Uniform a = -0.010954232067652772
Glorot Uniform b = 0.010954232067652772


In [68]:
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [69]:
token_embedding_layer = model.model.encoder.embed_tokens
token_embedding_layer.weight.shape

torch.Size([256206, 1024])

In [70]:
long_sentence = "In the vast realm of natural language processing, understanding the nuances of how models handle sequential data is crucial. Positional encoding plays a vital role in providing this essential information to the model, allowing it to differentiate between words at different positions in a sentence, which is fundamental for tasks like translation, summarization, and text generation."

In [71]:
tokens = tokenizer(long_sentence, return_tensors="pt")

print(tokens['input_ids'][0])

tensor([256047,    717,    349,  14430,  12284, 248070,    452,  25307,  65445,
        157278, 248079, 133930,    349,    713,  75831,    452,  11657, 141057,
         47274, 116914, 124785,   6067,    248, 182071, 248075,  12013,  58409,
         12025, 246156,   3054,    705,      9, 104781,  76065,    108, 174693,
          3423, 140515,  18781,    202,    349,  14916, 248079,  82935,     87,
           796,    202,  53054,    502,  25914,  51744,    230,  30158, 199073,
           108,      9, 109267, 248079,   9089,    248,  75529,    351, 226047,
          6399, 200356, 248079,   2493, 109207, 181953, 248079,    540,  35883,
        120531, 248075,      2])


In [72]:
len(tokens['input_ids'][0])

75

In [73]:
token_embedding_layer(tokens['input_ids'][0][0]).shape

torch.Size([1024])

In [74]:
token_embeddings = token_embedding_layer(tokens['input_ids'][0])

print("Token Embedding Matrix shape", token_embeddings.shape)
token_embeddings

Token Embedding Matrix shape torch.Size([75, 1024])


tensor([[-5.0000e+00, -1.2725e+00, -9.3604e-01,  ..., -1.8297e+01,
         -9.1328e+00, -1.0672e+01],
        [ 2.6416e-01,  2.6831e-01,  2.0117e-01,  ...,  3.2715e+00,
         -3.2402e+00,  3.1738e+00],
        [ 4.3579e-01, -2.3352e-01,  2.6825e-02,  ...,  5.4648e+00,
          2.7129e+00,  5.5430e+00],
        ...,
        [ 8.5859e+00, -4.5391e+00, -4.7314e-01,  ..., -7.9529e-02,
          7.4844e+00, -7.5156e+00],
        [-2.4863e+00, -2.7515e-01,  5.6114e-03,  ...,  1.0180e+01,
         -7.2422e+00, -4.8047e+00],
        [-7.8320e-01, -9.0527e-01, -9.4482e-01,  ...,  3.1078e+01,
         -8.1494e-01, -8.7354e-01]], grad_fn=<MulBackward0>)

In [75]:
import plotly.express as px

token_embeddings_np = token_embeddings.detach().numpy()

fig = px.imshow(
    token_embeddings_np,
    color_continuous_scale="RdBu",
    labels=dict(x="Embedding Dimension", y="Token Index", color="Value"),
    title="Token Embedding Heatmap"
)

fig.update_xaxes(side="top")
fig.update_layout(height=500, width=900)
fig.show()

In [76]:
embedded_sentence

<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[ 0.04147549, -0.03842679],
       [-0.04023755,  0.04676259],
       [-0.00783229,  0.03582766],
       [ 0.01605607, -0.01558995]], dtype=float32)>

In [77]:
d = embedded_sentence.shape[-1]
d

2

In [78]:
d_q, d_k, d_v = 2, 2, 4

d_q, d_k, d_v

(2, 2, 4)

In [79]:
tf.random.set_seed(123)
W_query = tf.Variable(tf.random.uniform((d, d_q)), trainable=True)
W_key   = tf.Variable(tf.random.uniform((d, d_k)), trainable=True)
W_value = tf.Variable(tf.random.uniform((d, d_v)), trainable=True)

In [80]:
print(W_query.shape, W_key.shape, W_value.shape)

(2, 2) (2, 2) (2, 4)


In [81]:
W_query

<tf.Variable 'Variable:0' shape=(2, 2) dtype=float32, numpy=
array([[0.12615311, 0.5727513 ],
       [0.2993133 , 0.5461836 ]], dtype=float32)>

In [82]:
W_key

<tf.Variable 'Variable:0' shape=(2, 2) dtype=float32, numpy=
array([[0.88968754, 0.12354946],
       [0.7718717 , 0.6850728 ]], dtype=float32)>

In [83]:
W_value

<tf.Variable 'Variable:0' shape=(2, 4) dtype=float32, numpy=
array([[0.48962688, 0.5857923 , 0.36451697, 0.6550509 ],
       [0.9075084 , 0.37557673, 0.6882372 , 0.25384045]], dtype=float32)>

In [84]:
embedded_sentence

<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[ 0.04147549, -0.03842679],
       [-0.04023755,  0.04676259],
       [-0.00783229,  0.03582766],
       [ 0.01605607, -0.01558995]], dtype=float32)>

In [85]:
queries = tf.matmul(embedded_sentence, W_query)
keys    = tf.matmul(embedded_sentence, W_key)
values  = tf.matmul(embedded_sentence, W_value)

In [86]:
print("Queries shape", queries.shape)
queries

Queries shape (4, 2)


<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[-0.00626939,  0.00276705],
       [ 0.00892057,  0.00249485],
       [ 0.00973563,  0.01508253],
       [-0.00264076,  0.00068116]], dtype=float32)>

In [87]:
print("Keys shape", keys.shape)
keys

Keys shape (4, 2)


<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[ 0.00723967, -0.02120088],
       [ 0.00029587,  0.02706445],
       [ 0.02068607,  0.02357688],
       [ 0.00225144, -0.00869653]], dtype=float32)>

In [88]:
print("Values shape", values.shape)
values

Values shape (4, 4)


<tf.Tensor: shape=(4, 4), dtype=float32, numpy=
array([[-0.01456512,  0.00986381, -0.01132823,  0.01741428],
       [ 0.02273606, -0.0060079 ,  0.01751648, -0.0144874 ],
       [ 0.02867901,  0.00886794,  0.02180293,  0.00396396],
       [-0.00628653,  0.0035503 , -0.00487687,  0.00656018]],
      dtype=float32)>

In [89]:
omega = tf.matmul(queries, keys, transpose_b=True)

print("Omega shape", omega.shape)
print("Omega (Unnormalized attention weights)")
print(omega)

Omega shape (4, 4)
Omega (Unnormalized attention weights)
tf.Tensor(
[[-1.04052277e-04  7.30338506e-05 -6.44505199e-05 -3.81789614e-05]
 [ 1.16889169e-05  7.01611789e-05  2.43352450e-04 -1.61239586e-06]
 [-2.49280070e-04  4.11080779e-04  5.56990854e-04 -1.09246466e-04]
 [-3.35593904e-05  1.76538888e-05 -3.85672465e-05 -1.18692469e-05]], shape=(4, 4), dtype=float32)


In [91]:
d_k = tf.cast(d_k, tf.float32)

scaled_omega = omega / tf.sqrt(d_k)

attention_weights = tf.nn.softmax(scaled_omega, axis=-1)

print("Attention Weights")
print(attention_weights)

Attention Weights
tf.Tensor(
[[0.2499875  0.2500188  0.2499945  0.24999915]
 [0.24998777 0.24999811 0.25002873 0.24998541]
 [0.249929   0.25004572 0.25007153 0.24995373]
 [0.24999699 0.25000605 0.24999613 0.25000083]], shape=(4, 4), dtype=float32)


In [92]:
row_sums = tf.reduce_sum(attention_weights, axis=-1)

print("Sum of each row in attention_weights")
row_sums

Sum of each row in attention_weights


<tf.Tensor: shape=(4,), dtype=float32, numpy=array([1., 1., 1., 1.], dtype=float32)>

In [93]:
context_vector = tf.matmul(attention_weights, values)

print("Context Vector shape", context_vector.shape)
print(context_vector)

Context Vector shape (4, 4)
tf.Tensor(
[[0.00764131 0.00406825 0.00577893 0.00336224]
 [0.0076419  0.00406863 0.00577938 0.00336259]
 [0.00764527 0.00406803 0.00578197 0.00336084]
 [0.00764092 0.00406844 0.00577863 0.00336261]], shape=(4, 4), dtype=float32)


In [94]:
class SelfAttention(tf.keras.layers.Layer):
    def __init__(self, d_in, d_out_kq, d_out_v):
        super().__init__()
        self.d_out_kq = d_out_kq

        self.W_query = tf.Variable(
            tf.random.uniform((d_in, d_out_kq)), trainable=True
        )
        self.W_key = tf.Variable(
            tf.random.uniform((d_in, d_out_kq)), trainable=True
        )
        self.W_value = tf.Variable(
            tf.random.uniform((d_in, d_out_v)), trainable=True
        )

    def call(self, x):
        keys = tf.matmul(x, self.W_key)      # [T, d_out_kq]
        queries = tf.matmul(x, self.W_query) # [T, d_out_kq]
        values = tf.matmul(x, self.W_value)  # [T, d_out_v]

        # Attention scores: QKᵀ
        attn_scores = tf.matmul(queries, keys, transpose_b=True)  # [T, T]

        # Softmax (scaled by sqrt(d_k))
        attn_weights = tf.nn.softmax(
            attn_scores / tf.math.sqrt(tf.cast(self.d_out_kq, tf.float32)), axis=-1
        )  # [T, T]

        # Weighted sum
        context_vec = tf.matmul(attn_weights, values)  # [T, d_out_v]
        return context_vec

In [95]:
tf.random.set_seed(123)

d_in, d_out_kq, d_out_v = 2, 2, 4

sa = SelfAttention(d_in, d_out_kq, d_out_v)

out = sa(embedded_sentence)

print(out.shape)  # (T, d_out_v)
print(out.numpy())

(4, 4)
[[0.00764131 0.00406825 0.00577893 0.00336224]
 [0.0076419  0.00406863 0.00577938 0.00336259]
 [0.00764527 0.00406803 0.00578197 0.00336084]
 [0.00764092 0.00406844 0.00577863 0.00336261]]


In [96]:
class MultiHeadAttentionWrapper(tf.keras.layers.Layer):
    def __init__(self, d_in, d_out_kq, d_out_v, num_heads):
        super().__init__()
        self.heads = [
            SelfAttention(d_in, d_out_kq, d_out_v)
            for _ in range(num_heads)
        ]

    def call(self, x):
        # รันทุก head แล้ว concat ตามแกนสุดท้าย
        head_outputs = [head(x) for head in self.heads]   # list of [T, d_out_v]
        return tf.concat(head_outputs, axis=-1)           # [T, num_heads * d_out_v]

In [97]:
tf.random.set_seed(123)

d_in, d_out_kq, d_out_v = 2, 2, 1

sa = SelfAttention(d_in, d_out_kq, d_out_v)

# ถ้า embedded_sentence.shape = [T, d_in] เช่น [6, 3]
out = sa(embedded_sentence)

print(out.shape)   # (T, d_out_v) -> (6, 1)
print(out.numpy())

(4, 1)
[[0.00534279]
 [0.00534324]
 [0.00534451]
 [0.0053427 ]]


In [98]:
tf.random.set_seed(123)

block_size = embedded_sentence.shape[0]   # [T, d_in] → T = sequence length

mha = MultiHeadAttentionWrapper(
    d_in, d_out_kq, d_out_v, num_heads=3
)

# run MHA
context_vecs = mha(embedded_sentence)   # [T, num_heads * d_out_v]

print(context_vecs)
print("context_vecs.shape:", context_vecs.shape)

tf.Tensor(
[[0.00534279 0.00690817 0.00831081]
 [0.00534324 0.00690947 0.00830703]
 [0.00534451 0.00690578 0.00830899]
 [0.0053427  0.00690908 0.00830956]], shape=(4, 3), dtype=float32)
context_vecs.shape: (4, 3)
