<a href="https://colab.research.google.com/github/AliTavakoli2001/Deep-Learning-Project/blob/main/Persian_text_autoencoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# sample data

In [2]:
# Toy corpus: four short Persian lines, one training sample per string.
texts = list((
    "بسی رنج بردم در این سال سی",
    "عجم زنده کردم بدین پارسی",
    "جهان را بپرسید از این داستان",
    "که نامش بماند همیشه به سان",
))

# Converting text to padded integer sequences

In [3]:
# Build a character-level vocabulary from the corpus and encode every text
# as a list of integer character ids.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(texts)
char_ids = tokenizer.texts_to_sequences(texts)

# Pad every sample on the right up to the length of the longest one so the
# whole corpus fits in a single rectangular array.
max_len = max(map(len, char_ids))
sequences = pad_sequences(char_ids, maxlen=max_len, padding='post')

# One extra slot on top of the learned characters: index 0 is not assigned to
# any character by the tokenizer and is the value used for padding.
vocab_size = 1 + len(tokenizer.word_index)

# Creating model

**Encoder model**

In [4]:
# Encoder: embeds the padded character ids and runs them through an LSTM whose
# final hidden/cell states are later passed to the decoder as its initial state.
latent_dim = 128  # increased compression capacity (size of the LSTM state)
inputs = Input(shape=(max_len,))
# `input_length` is intentionally not passed: the sequence length is already
# fixed by the Input shape above, and the argument is deprecated in Keras 3.
embedding = Embedding(vocab_size, 256)(inputs)
encoded = LSTM(latent_dim, return_state=True, dropout=0.3, recurrent_dropout=0.3)
# return_state=True makes the layer return its output plus the two final states.
encoder_outputs, state_h, state_c = encoded(embedding)




**Decoder model**

In [5]:
# Decoder: an LSTM seeded with the encoder's final states that emits, for every
# time step, a softmax distribution over the character vocabulary.
decoder_inputs = Input(shape=(max_len,))
# `input_length` is intentionally not passed: the sequence length is already
# fixed by the Input shape above, and the argument is deprecated in Keras 3.
decoder_embedding = Embedding(vocab_size, 256)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, dropout=0.3, recurrent_dropout=0.3)
# Only the per-step outputs are needed here; the decoder's own final states are discarded.
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[state_h, state_c])
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

**Seq2Seq model and compiling**

In [6]:
# Tie encoder and decoder into one trainable graph: two integer-sequence
# inputs in, one per-step character distribution out.
model = Model(inputs=[inputs, decoder_inputs], outputs=decoder_outputs)

# Targets are plain integer ids (not one-hot), hence the sparse loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# prepare data

In [7]:
# Autoencoder setup: the training target is a copy of the padded input ids.
target_sequences = np.array(sequences, copy=True)

# train model

In [8]:
# Train on the whole corpus; the same padded ids feed both the encoder and the
# decoder input, and 20% of the samples are held out for validation.
# NOTE(review): the decoder input here is identical to the target (no one-step
# shift), so the decoder can reproduce the target from its own input -- confirm
# this is intended.
model.fit(
    x=[sequences, sequences],
    y=target_sequences,
    batch_size=16,
    epochs=100,
    validation_split=0.2,
)

Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 12s/step - accuracy: 0.1190 - loss: 2.9399 - val_accuracy: 0.2500 - val_loss: 2.9344
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 274ms/step - accuracy: 0.2738 - loss: 2.9227 - val_accuracy: 0.2500 - val_loss: 2.9228
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 290ms/step - accuracy: 0.3333 - loss: 2.8992 - val_accuracy: 0.2500 - val_loss: 2.9108
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 259ms/step - accuracy: 0.3214 - loss: 2.8742 - val_accuracy: 0.2143 - val_loss: 2.8979
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 252ms/step - accuracy: 0.3333 - loss: 2.8483 - val_accuracy: 0.2143 - val_loss: 2.8836
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 298ms/step - accuracy: 0.2619 - loss: 2.8177 - val_accuracy: 0.1786 - val_loss: 2.8675
Epoch 7/100
[1m1/1[0m [32m━━━━━━

<keras.src.callbacks.history.History at 0x7ff415093250>

# Reconstructing text

In [9]:
def generate_text(seed_text, tokenizer, model, max_len):
    """Reconstruct ``seed_text`` by passing it through the trained model.

    Args:
        seed_text: The string to encode and reconstruct.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``index_word``.
        model: Model whose ``predict`` takes ``[encoder_input, decoder_input]``
            and returns per-step scores over the vocabulary.
        max_len: Length the encoded seed is padded to (on the right).

    Returns:
        The string formed by the highest-scoring token at every time step;
        ids that have no entry in ``tokenizer.index_word`` contribute nothing.
    """
    encoded_seed = tokenizer.texts_to_sequences([seed_text])
    padded_seed = pad_sequences(encoded_seed, maxlen=max_len, padding='post')

    # The same padded ids are used for both model inputs.
    scores = model.predict([padded_seed, padded_seed])

    # Greedy decoding: take the best-scoring id at each step of the first
    # (and only) sample in the batch.
    best_ids = np.argmax(scores[0], axis=-1)

    pieces = []
    for token_id in best_ids:
        pieces.append(tokenizer.index_word.get(token_id, ''))
    return ''.join(pieces)



# testing model

In [15]:
# Round-trip a single sentence through the trained model and show the result.
test_text = "عجم زنده کردم بدین پارسی"
output_text = generate_text(
    seed_text=test_text,
    tokenizer=tokenizer,
    model=model,
    max_len=max_len,
)
print("constructed text: ", output_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
constructed text:  عجم زنده کردم بدین پارسی
