In [1]:
import tensorflow as tf
import numpy as np
import os
import time

In [2]:
text = open('khayyam.txt', 'rb').read().decode(encoding='utf-8')

In [3]:
text[:20]

'|برخیز بتا بیا ز بهر'

In [4]:
vocabolaries = sorted(set(text))

In [5]:
vocabolaries

['\n',
 '\r',
 ' ',
 '!',
 ':',
 '|',
 '\xa0',
 '«',
 '»',
 '،',
 '؟',
 'آ',
 'أ',
 'ئ',
 'ا',
 'ب',
 'ت',
 'ث',
 'ج',
 'ح',
 'خ',
 'د',
 'ذ',
 'ر',
 'ز',
 'س',
 'ش',
 'ص',
 'ض',
 'ط',
 'ظ',
 'ع',
 'غ',
 'ف',
 'ق',
 'ل',
 'م',
 'ن',
 'ه',
 'و',
 'َ',
 'ُ',
 'ِ',
 'ّ',
 'ْ',
 'ٔ',
 'پ',
 'چ',
 'ژ',
 'ک',
 'گ',
 'ۀ',
 'ی']

In [6]:
len(vocabolaries)

53

In [7]:
char2index = {u:i for i, u in enumerate(vocabolaries)}
index2char = np.array(vocabolaries)

In [8]:
char2index

{'\n': 0,
 '\r': 1,
 ' ': 2,
 '!': 3,
 ':': 4,
 '|': 5,
 '\xa0': 6,
 '«': 7,
 '»': 8,
 '،': 9,
 '؟': 10,
 'آ': 11,
 'أ': 12,
 'ئ': 13,
 'ا': 14,
 'ب': 15,
 'ت': 16,
 'ث': 17,
 'ج': 18,
 'ح': 19,
 'خ': 20,
 'د': 21,
 'ذ': 22,
 'ر': 23,
 'ز': 24,
 'س': 25,
 'ش': 26,
 'ص': 27,
 'ض': 28,
 'ط': 29,
 'ظ': 30,
 'ع': 31,
 'غ': 32,
 'ف': 33,
 'ق': 34,
 'ل': 35,
 'م': 36,
 'ن': 37,
 'ه': 38,
 'و': 39,
 'َ': 40,
 'ُ': 41,
 'ِ': 42,
 'ّ': 43,
 'ْ': 44,
 'ٔ': 45,
 'پ': 46,
 'چ': 47,
 'ژ': 48,
 'ک': 49,
 'گ': 50,
 'ۀ': 51,
 'ی': 52}

In [9]:
index2char[1]

np.str_('\r')

In [10]:
text_as_integer = np.array([char2index[c] for c in text])

In [11]:
text_as_integer

array([ 5, 15, 23, ..., 52,  1,  0], shape=(22883,))

In [12]:
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_integer)

In [13]:
char_dataset

<_TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.int64, name=None)>

In [14]:
for i in char_dataset.take(10):
    print(index2char[i.numpy()])

|
ب
ر
خ
ی
ز
 
ب
ت
ا


In [15]:
sequences = char_dataset.batch(30, drop_remainder=True)
sequences

<_BatchDataset element_spec=TensorSpec(shape=(30,), dtype=tf.int64, name=None)>

In [16]:
for i in sequences.take(3):
    print('--->', ''.join(index2char[i.numpy()]))

---> |برخیز بتا بیا ز بهر دل ما
|ح
---> ل کن به جمال خویشتن مشکل ما
|
---> یک کوزه شراب تا به هم نوش کنیم


In [17]:
def sit(batch):
    input_text = batch[:-1]
    target_text = batch[1:]
    return input_text, target_text
dataset = sequences.map(sit)

In [18]:
dataset

<_MapDataset element_spec=(TensorSpec(shape=(29,), dtype=tf.int64, name=None), TensorSpec(shape=(29,), dtype=tf.int64, name=None))>

In [19]:
for i in dataset.take(1):
    print(''.join(index2char[i[0].numpy()]))
    print(''.join(index2char[i[1].numpy()]))

|برخیز بتا بیا ز بهر دل ما
|
برخیز بتا بیا ز بهر دل ما
|ح


In [20]:
dataset = dataset.batch(64, drop_remainder=True)
dataset

<_BatchDataset element_spec=(TensorSpec(shape=(64, 29), dtype=tf.int64, name=None), TensorSpec(shape=(64, 29), dtype=tf.int64, name=None))>

In [21]:
len(vocabolaries)

53

In [22]:
vocabolary_size = len(vocabolaries)
embedding_dim = 25
rnn_units = 1024

In [50]:
# فعلا با این مدل کاری نداریم
model1 = tf.keras.Sequential([
    tf.keras.layers.Input(batch_shape=(64, None)), 
    tf.keras.layers.Embedding(vocabolary_size, 25),
    tf.keras.layers.GRU(rnn_units, return_sequences=True, stateful=True),
    tf.keras.layers.Dense(vocabolary_size)
])

In [25]:
model2 = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocabolary_size, 25),
    tf.keras.layers.GRU(rnn_units, return_sequences=True),
    tf.keras.layers.Dense(vocabolary_size)
])

In [28]:
for input_text, target_text in dataset.take(1):
    output = model2.predict(input_text)
    print(output[0])
    print(output[0].shape)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[[ 2.8062135e-04 -3.8516999e-05  2.9221431e-03 ...  9.4969483e-04
   2.9849396e-03  1.5292317e-06]
 [-1.9104539e-03 -2.7292622e-03  2.4986970e-03 ... -1.5328135e-03
   2.8546795e-04  2.6840349e-03]
 [ 2.3662642e-04  1.8655522e-03 -2.6838938e-03 ...  1.7799898e-03
  -4.8739486e-04  4.3711863e-03]
 ...
 [-6.0758824e-03 -4.6053075e-04 -5.9380592e-04 ...  9.5223659e-04
   1.3166092e-03 -5.0447802e-03]
 [-1.7852952e-03  1.9796193e-03  6.3887337e-04 ...  3.0509317e-03
   2.5295052e-03 -1.7006861e-03]
 [-8.3511998e-04  1.2036510e-03  3.5324618e-03 ...  1.6748861e-03
   3.8890217e-03 -9.6449349e-04]]
(29, 53)


این تابع کتگوریکال میره ایندکسی که عددش بیشتر از همه شده رو برمیداره و به ما نشون میده. یعنی الان تو لیست زیر 36 اوله یعنی چیزی که مدل پردیکت کرده حرف اولش همون حرف ایندکس 36 عه

البته دقیقا بیشترین رو برنمیداره و بصورت وزن دار هرکدوم بزرگتر باشه احتمالش بیشتر میشه پس اون انتخاب میشه


In [38]:
si = tf.random.categorical(output[0], num_samples=1)
si

<tf.Tensor: shape=(29, 1), dtype=int64, numpy=
array([[47],
       [ 3],
       [35],
       [16],
       [ 8],
       [32],
       [ 2],
       [39],
       [ 1],
       [18],
       [39],
       [12],
       [39],
       [40],
       [29],
       [36],
       [33],
       [34],
       [36],
       [21],
       [12],
       [24],
       [21],
       [47],
       [26],
       [48],
       [17],
       [ 0],
       [10]])>

In [39]:
tf.squeeze(si, axis=-1).numpy()

array([47,  3, 35, 16,  8, 32,  2, 39,  1, 18, 39, 12, 39, 40, 29, 36, 33,
       34, 36, 21, 12, 24, 21, 47, 26, 48, 17,  0, 10])

In [40]:
''.join(index2char[tf.squeeze(si, axis=-1).numpy()])

'چ!لت»غ و\rجوأوَطمفقمدأزدچشژث\n؟'

الان مثلا اینجا بدون ترین کردن این شعر رو ساخت واسمون

In [41]:
output[0][0]

array([ 2.8062135e-04, -3.8516999e-05,  2.9221431e-03,  3.7522933e-03,
        4.5965326e-06,  1.2474887e-03, -2.7713291e-03, -4.0145074e-03,
       -1.8589636e-03, -3.3685507e-03,  4.4088801e-03,  3.3254165e-03,
        2.4893650e-04, -1.4063760e-03, -1.8916274e-03,  2.4111473e-03,
        1.1201967e-03, -1.7856690e-04,  3.1218415e-03, -8.3121599e-04,
       -1.0095761e-03,  1.4251700e-03,  5.2645477e-04, -1.1315853e-03,
        2.8671848e-04,  1.9407944e-03, -1.2726336e-03, -4.9512438e-03,
        2.2784404e-03,  4.3497360e-03,  2.5752787e-03, -5.5168341e-03,
       -3.0708883e-04, -4.2738421e-03,  3.8371254e-03,  1.6942397e-03,
       -4.4659711e-04, -1.0994872e-03,  2.1553431e-03,  5.6873532e-03,
        6.6371416e-03,  2.5217584e-03,  5.1847491e-03, -2.1484178e-03,
        5.8505620e-04, -1.2878092e-03, -2.7638098e-04,  1.3867597e-03,
        2.0649042e-03,  9.3906501e-04,  9.4969483e-04,  2.9849396e-03,
        1.5292317e-06], dtype=float32)

In [43]:
model2.summary()

In [44]:
def loss_f(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

model2.compile(optimizer='adam', loss=loss_f)

In [46]:
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath='khayyam/checkpoints/my_model.weights.h5',
    save_weights_only=True,
    save_best_only=True
)


In [47]:
history = model2.fit(dataset, epochs=10, callbacks=[checkpoint])

Epoch 1/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 127ms/step - loss: 4.1916
Epoch 2/10
[1m 1/11[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m1s[0m 133ms/step - loss: 3.8661

  if self._should_save_model(epoch, batch, logs, filepath):


[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 140ms/step - loss: 3.8711
Epoch 3/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 139ms/step - loss: 3.7626
Epoch 4/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 142ms/step - loss: 3.2769
Epoch 5/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 146ms/step - loss: 3.1200
Epoch 6/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 146ms/step - loss: 3.0534
Epoch 7/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 151ms/step - loss: 2.9624
Epoch 8/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 152ms/step - loss: 2.8186
Epoch 9/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 158ms/step - loss: 2.6650
Epoch 10/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 154ms/step - loss: 2.5692


In [56]:
model3 = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocabolary_size, 25),
    tf.keras.layers.GRU(1024, return_sequences=True),
    tf.keras.layers.Dense(vocabolary_size)
])

In [58]:
model3.build(tf.TensorShape([1, None]))

In [59]:
model3.load_weights("khayyam/checkpoints/my_model.weights.h5")


In [60]:
model3.summary()

In [61]:
num_generate = 1000
first_string = 'به نام خداوند جان و خرد'
input_eval = [char2index[s] for s in first_string]
input_eval = tf.expand_dims(input_eval, 0)
input_eval

<tf.Tensor: shape=(1, 23), dtype=int32, numpy=
array([[15, 38,  2, 37, 14, 36,  2, 20, 21, 14, 39, 37, 21,  2, 18, 14,
        37,  2, 39,  2, 20, 23, 21]], dtype=int32)>

In [62]:
model3.reset_states()

AttributeError: 'Sequential' object has no attribute 'reset_states'

In [63]:
text_generated = []
for i in range(10):
    predictions = model3.predict(input_eval)
    predictions = tf.squeeze(predictions, 0)
    predicted_ids = tf.random.categorical(predictions, num_samples=1).numpy()
    input_eval = tf.expand_dims(tf.squeeze(predicted_ids, axis=-1).numpy(), 0).numpy()
    text_generated.append(index2char[tf.squeeze(predicted_ids, axis=-1).numpy()])


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 128ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 125ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step


In [65]:
for i in text_generated:
    print(''.join(i))
    print('\n')

نانۀ رکییکههوگ 


یکنأ د
م  بخل  د


گری دچن

|بتم


سدسحخکن
رخهزن
ر
|ار ن


ف مت ی   ح ی |
|||دم زی


نوسرمیبلومزرنسهتآکهتییی


ا  و  خی    م


ب  سز
 بفینزهسپیزشتکرام


رسبزن
 ب دی  دت  


َ ،کو
  تقشووآد
آهمیز


