In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Dense, Input, MultiHeadAttention, LayerNormalization, Dropout, GlobalAveragePooling1D


In [None]:
# 1. Sample corpus (Hebrew). Sentences, if any, are separated by '.'.
corpus = "לא לפנות אליי לא לדבר אליי אל תבוא אליי אל תתקשר אליי איזה כיף איתי ואוו חוויה ממש ממליצה לך"

# 2. Tokenization: learn the vocabulary from the corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([corpus])
word_index = tokenizer.word_index
# +1 because word ids start at 1; id 0 is left for the padding value.
total_words = 1 + len(word_index)

# 3. N-gram prefixes: for every sentence emit each prefix of length >= 2,
#    so the last token of a prefix is the word to predict from the preceding ones.
input_sequences = [
    token_ids[:end]
    for token_ids in tokenizer.texts_to_sequences(corpus.split('.'))
    for end in range(2, len(token_ids) + 1)
]

# 4. Left-pad every prefix to the length of the longest one.
max_sequence_len = max(map(len, input_sequences))
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

# 5. Split into features (all tokens but the last) and one-hot targets (the last token).
X_train, y_train = input_sequences[:, :-1], input_sequences[:, -1]
y_train = tf.keras.utils.to_categorical(y_train, num_classes=total_words)


In [None]:
# 6. TransformerBlock (`training` is optional, so the layer can be called with inputs only)
class TransformerBlock(tf.keras.layers.Layer):
    """Single Transformer encoder block.

    Applies multi-head self-attention followed by a two-layer feed-forward
    network. Each sub-layer output goes through dropout, is added back to its
    input (residual connection) and is layer-normalized.

    Args:
        embed_dim: Size of the vector representing each token. Also used as
            ``key_dim`` of every attention head and as the output width of
            the feed-forward network.
        num_heads: Number of attention heads.
        ff_dim: Number of hidden units in the feed-forward network.
        rate: Dropout rate applied after attention and after the feed-forward
            network.
        **kwargs: Forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``, ``dtype``).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        # Self-attention sub-layer.
        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        # Position-wise feed-forward sub-layer; projects back to embed_dim so
        # the residual addition in call() is shape-compatible.
        self.ffn = tf.keras.Sequential([
            Dense(ff_dim, activation="relu"),
            Dense(embed_dim)
        ])
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training=None):
        """Run the block on ``inputs``.

        Args:
            inputs: Tensor whose last dimension is ``embed_dim``.
            training: Optional flag forwarded to the attention and dropout
                sub-layers. ``None`` (the default) lets Keras decide from the
                current call context, so ``block(x)`` keeps working.

        Returns:
            Tensor with the same shape as ``inputs``.
        """
        attn_output = self.att(inputs, inputs, training=training)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1, training=training)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created from config."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "rate": self.rate,
        })
        return config

# 7. Model definition:
#    token + position embeddings -> TransformerBlock -> GlobalAveragePooling1D -> softmax
embed_dim = 64
num_heads = 2
ff_dim = 64


class TokenAndPositionEmbedding(tf.keras.layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    A plain ``Embedding`` only encodes *which* word is present. Self-attention
    followed by average pooling is insensitive to the order of its inputs, so
    without a positional signal the model would give the same prediction for
    any permutation of the same words.

    Args:
        maxlen: Fixed length of the input sequences.
        vocab_size: Number of token ids (including the padding id 0).
        embed_dim: Size of every embedding vector.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.token_emb = Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        # Positions 0..maxlen-1; the (maxlen, embed_dim) result broadcasts
        # over the batch dimension of the token embeddings.
        positions = tf.range(start=0, limit=self.maxlen, delta=1)
        return self.token_emb(x) + self.pos_emb(positions)

    def get_config(self):
        config = super().get_config()
        config.update({
            "maxlen": self.maxlen,
            "vocab_size": self.vocab_size,
            "embed_dim": self.embed_dim,
        })
        return config


# The model sees every token of a padded sequence except the last (the target).
inputs = Input(shape=(max_sequence_len - 1,))
embedding_layer = TokenAndPositionEmbedding(max_sequence_len - 1, total_words, embed_dim)(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)(embedding_layer)
# Average over the sequence axis to get one vector per example.
pooled_output = GlobalAveragePooling1D()(transformer_block)
# Probability distribution over the vocabulary for the next word.
output_layer = Dense(total_words, activation="softmax")(pooled_output)

model = Model(inputs=inputs, outputs=output_layer)


In [None]:
# 8. Compile and train.
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
model.fit(X_train, y_train, epochs=100, verbose=1)

# 9. Greedy generation: repeatedly append the most probable next word.
seed_text = "לא ממליצה"
next_words = 5
context_len = max_sequence_len - 1  # input length the model was built with

for _ in range(next_words):
    encoded = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([encoded], maxlen=context_len, padding="pre")
    predicted = model.predict(token_list, verbose=0)
    predicted_index = predicted[0].argmax()
    if predicted_index == 0:
        # Index 0 is the padding value and has no word attached to it.
        continue
    seed_text = seed_text + " " + tokenizer.index_word[predicted_index]

print("Generated text:", seed_text)


In [31]:
# Greedy generation from a one-word seed: at every step take the arg-max word.
seed_text = "לא"
next_words = 5
context_len = max_sequence_len - 1  # input length the model was built with

for _ in range(next_words):
    encoded = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([encoded], maxlen=context_len, padding="pre")
    predicted = model.predict(token_list, verbose=0)
    predicted_index = predicted[0].argmax()
    if predicted_index == 0:
        # Index 0 is the padding value and has no word attached to it.
        continue
    seed_text = seed_text + " " + tokenizer.index_word[predicted_index]

print("Generated text:", seed_text)

Generated text: לא לפנות אליי אליי לדבר אליי


In [34]:
# Stochastic generation: sample every next word from the model's predicted
# distribution instead of always taking the most probable one.
seed_text = "לא "
next_words = 5

# Work on a list of words and join with single spaces at the end, so stray
# whitespace in the seed does not leak into the generated text.
generated_words = seed_text.split()

for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([" ".join(generated_words)])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding="pre")

    # float64 copy: np.random.choice validates that p sums to 1, and the
    # float32 softmax output can miss that by rounding error.
    probabilities = model.predict(token_list, verbose=0)[0].astype(np.float64)

    # Index 0 is the padding value and has no word; exclude it so every
    # iteration really produces a word, then renormalise.
    probabilities[0] = 0.0
    total = probabilities.sum()
    if total <= 0.0:
        # Degenerate distribution (all mass on padding): nothing to sample.
        break
    probabilities /= total

    predicted_index = int(np.random.choice(len(probabilities), p=probabilities))
    generated_words.append(tokenizer.index_word[predicted_index])

seed_text = " ".join(generated_words)
print("Generated text:", seed_text)


Generated text: לא  אליי אליי לפנות אליי אל
