# Step One

In [5]:
import tensorflow as tf
import keras

# Sample sentences
sentences = [
    'i love machine learning',
    'i love deep learning',
    'i enjoy deep learning',
    'i enjoy programming',
    'i love programming'
]

# Create a TextVectorization layer
# max_tokens: The maximum size of the vocabulary.
# output_sequence_length: The length of the output sequences.
vectorize_layer = keras.layers.TextVectorization(
    max_tokens=10, 
    output_sequence_length=4
)

# Adapt the layer to your data
# This is where the vocabulary is built
vectorize_layer.adapt(sentences)

# Convert sentences to integer sequences
vectorized_data = vectorize_layer(tf.constant(sentences))

print("Vocabulary:")
print(vectorize_layer.get_vocabulary())
print("\nVectorized Data (Sequences):")
print(vectorized_data)

Vocabulary:
['', '[UNK]', 'i', 'love', 'learning', 'programming', 'enjoy', 'deep', 'machine']

Vectorized Data (Sequences):
tf.Tensor(
[[2 3 8 4]
 [2 3 7 4]
 [2 6 7 4]
 [2 6 5 0]
 [2 3 5 0]], shape=(5, 4), dtype=int64)


In [6]:
from keras.layers import Embedding
    
# Get the vocabulary size and embedding dimension
vocab_size = vectorize_layer.vocabulary_size()
embedding_dim = 64  # This is a hyperparameter you can tune

# Create a simple model
model = keras.Sequential([
    vectorize_layer,  # The TextVectorization layer we created earlier
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
    )
])

# Print a summary of the model
model.summary()

In [3]:
# Create a dummy input to get the output of the embedding layer
sample_input = tf.constant(sentences)
# Pass the input through the model to get the embedding vectors
word_embeddings = model(sample_input)

# Print the shape and first few embedding vectors
print("Shape of word embeddings:", word_embeddings.shape)
print("Embedding vectors:\n", word_embeddings)

Shape of word embeddings: (5, 4, 64)
Embedding vectors:
 tf.Tensor(
[[[-0.02905157 -0.02192582 -0.02985046 ... -0.00763953  0.02562919
    0.01993114]
  [-0.02469324  0.0156658   0.01142956 ... -0.02961923  0.00948169
   -0.03707187]
  [-0.04378277 -0.03771529 -0.03158895 ... -0.04505865  0.04770208
    0.02161039]
  [ 0.00507027 -0.01211816  0.04651357 ...  0.016757    0.04183363
    0.00303914]]

 [[-0.02905157 -0.02192582 -0.02985046 ... -0.00763953  0.02562919
    0.01993114]
  [-0.02469324  0.0156658   0.01142956 ... -0.02961923  0.00948169
   -0.03707187]
  [ 0.04145371  0.03068919 -0.00516153 ...  0.0254828   0.03504975
   -0.00499316]
  [ 0.00507027 -0.01211816  0.04651357 ...  0.016757    0.04183363
    0.00303914]]

 [[-0.02905157 -0.02192582 -0.02985046 ... -0.00763953  0.02562919
    0.01993114]
  [ 0.02555398  0.0413499  -0.01618407 ... -0.02444042  0.0494576
   -0.00528054]
  [ 0.04145371  0.03068919 -0.00516153 ...  0.0254828   0.03504975
   -0.00499316]
  [ 0.00507027 -

In [4]:
# We need labels for classification
labels = tf.constant([0, 0, 1, 1, 0]) # 0 for "learning", 1 for "programming"

# A simple classification model
classification_model = keras.Sequential([
    vectorize_layer,
    keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim
    ),
    keras.layers.GlobalAveragePooling1D(), # A layer to reduce the dimensions
    keras.layers.Dense(1, activation='sigmoid') # A dense layer for binary classification
])

classification_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

classification_model.summary()

callbacks = [
    keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True, monitor="val_loss"),
    keras.callbacks.ReduceLROnPlateau(patience=5, factor=0.2, monitor="val_loss")
]

# Train the model
history = classification_model.fit(
    tf.constant(sentences),
    labels,
    epochs=100,
    callbacks=callbacks,
    validation_split=0.2
)

Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 655ms/step - accuracy: 0.2500 - loss: 0.6980 - val_accuracy: 1.0000 - val_loss: 0.6802 - learning_rate: 0.0010
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - accuracy: 0.5000 - loss: 0.6951 - val_accuracy: 1.0000 - val_loss: 0.6813 - learning_rate: 0.0010
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step - accuracy: 0.7500 - loss: 0.6922 - val_accuracy: 1.0000 - val_loss: 0.6824 - learning_rate: 0.0010
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step - accuracy: 1.0000 - loss: 0.6893 - val_accuracy: 1.0000 - val_loss: 0.6835 - learning_rate: 0.0010
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step - accuracy: 1.0000 - loss: 0.6864 - val_accuracy: 1.0000 - val_loss: 0.6846 - learning_rate: 0.0010
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/

# Step Two /  Semantic Search

In [22]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np

# Load a small sample of the IMDb movie reviews
# We'll use the 'train' split to get more sentences for embedding
ds = tfds.load('imdb_reviews', split='train[:1000]', as_supervised=True)
# Convert Bytes (in TensorFlow Dataset Format) to Strings
sentences = [text.decode('utf-8') for text, _ in tfds.as_numpy(ds)]
labels = [label for _, label in tfds.as_numpy(ds)]

# We'll take the first 100 sentences for our project
sentences_subset = np.array(sentences[:100])
labels_subset = np.array(labels[:100])

print(f"Number of sentences loaded: {len(sentences)}")
print("\nFirst sentence:")
print(sentences_subset[0])
print("\nFirst label:")
print(labels_subset[0])

Number of sentences loaded: 1000

First sentence:
This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.

First label:
0


In [58]:
import keras
from keras.layers import Dense, Embedding, Input, LSTM, TextVectorization, GlobalAveragePooling1D
from keras import Sequential

vectorize_layer = TextVectorization(max_tokens=10000, output_mode='int', output_sequence_length=256)
# Adapt the layer to our dataset. This is where the layer learns the vocabulary.
vectorize_layer.adapt(tf.constant(sentences_subset))

# Convert the sentences to vectorized data (sequences of integers)
vectorized_data = vectorize_layer(tf.constant(sentences_subset))

print(f"Number of words in vocabulary: {len(vectorize_layer.get_vocabulary())}")
print("\nFirst 3 vectorized sentences:")
print(vectorized_data[:2])

Number of words in vocabulary: 4759

First 3 vectorized sentences:
tf.Tensor(
[[  10   17   31  292  398   21   79   27 3146    8   36  997 1084   44
   419 3339  236   23  101   93   20   10  230  184   27   57  240  200
     8  654   74   57  101  162   92   29 2558   10   62  810 1169   10
    21    7   31  368 2942  163 1271  414    2   68  833  104   76  128
    48    2 4226 2576   76  504   57 1639   15 2492 3117 4191 4638  382
  2778    3   35 2648 4674   14 1084   17  114   20    5  833  538 2739
     8    5   21   12   17 3994    4   85  130 3091    9  260  541   12
    52   23   62   45   10 2453   93   45  997 1864   49  306    9   92
  1026  602  153   11    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0  

In [59]:
embedding_dim = 128
# we should get the vocabulary size because may its size < max_tokens_size
vocab_size = vectorize_layer.vocabulary_size()

model = Sequential(
    [
        Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
            mask_zero=True,  # The `mask_zero=True` argument helps the model ignore the padding zeros.
            name="embedding_layer"
        ),
        GlobalAveragePooling1D(),
        Dense(1, activation="sigmoid"),  # A simple dense layer for a dummy task
    ]
)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [61]:
vectorized_sentences = vectorize_layer(tf.constant(sentences_subset))
# print("Shape of vectorized sentences:", vectorized_sentences.shape)
history = model.fit(
    x=vectorized_sentences,
    y=labels_subset,
    epochs=100,
    validation_split=0.2
)

Epoch 1/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 1.0000 - loss: 0.0460 - val_accuracy: 0.6000 - val_loss: 0.6442
Epoch 2/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 1.0000 - loss: 0.0449 - val_accuracy: 0.6000 - val_loss: 0.6438
Epoch 3/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 1.0000 - loss: 0.0438 - val_accuracy: 0.6000 - val_loss: 0.6435
Epoch 4/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 1.0000 - loss: 0.0429 - val_accuracy: 0.6000 - val_loss: 0.6431
Epoch 5/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 1.0000 - loss: 0.0419 - val_accuracy: 0.6000 - val_loss: 0.6427
Epoch 6/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - accuracy: 1.0000 - loss: 0.0410 - val_accuracy: 0.6000 - val_loss: 0.6423
Epoch 7/100
[1m3/3[0m [32m━━━━━━━━━━━

## Semantic Search Section

In [62]:
# Create a new, simpler model for just embedding
embedding_model = tf.keras.Sequential([
    vectorize_layer,  # Use the trained layer from the previous model
    model.get_layer('embedding_layer') # Use the trained embedding layer
])

# Use the predict method to get the embeddings for all sentences
sentence_embeddings = embedding_model.predict(tf.constant(sentences_subset))

print("Shape of sentence embeddings:", sentence_embeddings.shape)
print("\nFirst sentence's embedding vector:\n", sentence_embeddings[0])

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
Shape of sentence embeddings: (100, 256, 128)

First sentence's embedding vector:
 [[ 0.22146513 -0.2033522  -0.19967037 ... -0.17324048  0.24664827
   0.1637163 ]
 [ 0.17540483 -0.20566894 -0.1328209  ... -0.13410693  0.22773775
   0.1539578 ]
 [ 0.09272396 -0.05507018 -0.08153713 ... -0.02820148  0.10794813
   0.07732717]
 ...
 [-0.0143142  -0.03834929  0.02532173 ...  0.04780747 -0.04444572
  -0.01145894]
 [-0.0143142  -0.03834929  0.02532173 ...  0.04780747 -0.04444572
  -0.01145894]
 [-0.0143142  -0.03834929  0.02532173 ...  0.04780747 -0.04444572
  -0.01145894]]


In [64]:
# To get a single vector per sentence, we average the word vectors.
# We need to handle the padded zeros, so we'll use a mask.

# Get the mask from the embedding layer
mask = embedding_model.get_layer('embedding_layer').compute_mask(vectorize_layer(tf.constant(sentences_subset)))

# Apply the mask to the sentence embeddings to ignore padded zeros
masked_embeddings = sentence_embeddings * np.expand_dims(mask, axis=-1)

# Average the word embeddings to get one vector per sentence
sentence_vectors = np.sum(masked_embeddings, axis=1) / np.sum(mask, axis=1, keepdims=True)

print("\nShape of final sentence vectors:", sentence_vectors.shape)
print("\nFirst sentence's final vector:\n", sentence_vectors[0])


Shape of final sentence vectors: (100, 128)

First sentence's final vector:
 [ 0.10998511 -0.10286884 -0.09105635 -0.10639376  0.08675896 -0.08225816
  0.09703208 -0.09779937  0.0885028   0.09076387 -0.09070267  0.07294329
 -0.08416977 -0.10827542 -0.0849989  -0.09295622  0.07941282 -0.08442829
 -0.08182498 -0.09000168  0.0872865   0.077944    0.08419003  0.08106726
 -0.09847007 -0.09068771  0.09511352 -0.09476045  0.08152567  0.07642266
  0.11498298 -0.08341417 -0.08682119 -0.08253176 -0.07601818 -0.10044832
  0.09009997  0.10524737  0.10818671  0.09349973  0.11421166 -0.091021
 -0.08197806 -0.08877258 -0.08346443 -0.0856621  -0.10604051 -0.09587539
  0.07972436  0.08886651  0.0839703   0.08055855  0.09821053 -0.09011839
  0.07652532  0.07617205 -0.07606422 -0.07571373 -0.0847056   0.10338533
  0.10668675  0.07981015  0.08427466 -0.083869   -0.1001945  -0.08383922
  0.0940271  -0.080902    0.07483279 -0.09594051 -0.07697812  0.08428583
 -0.08554296  0.11250136 -0.08669924  0.09558922

In [88]:
from numpy.linalg import norm


def find_similar_sentences(query_sentence, k=3):
    # Step 1: Convert the query sentence to a single vector
    query_vector_unshaped = embedding_model.predict(
        tf.constant(np.array([query_sentence]))
    )
    query_vector = np.mean(query_vector_unshaped, axis=1)
    query_vector = np.squeeze(query_vector)  # اضافه کردن این خط برای یک‌بعدی کردن

    # Step 2: Calculate the Cosine Similarity between the query and all sentences
    cosine_similarities = np.dot(sentence_vectors, query_vector) / (
        norm(sentence_vectors, axis=1) * norm(query_vector)
    )

    # Step 3: Get the indices of the top k most similar sentences
    top_k_indices = np.argsort(cosine_similarities)[::-1][:k]
    print(top_k_indices.shape)  # باید (3,) باشد

    # Step 4: Retrieve and return the sentences and their similarity scores
    results = []
    for index in top_k_indices:
        results.append((cosine_similarities[index].round(4), sentences_subset[index]))

    return results


# --- Test the function ---
# Example 1: a sentence with a positive sentiment
query_sentence_1 = "This movie was absolutely amazing and beautiful."
similar_sentences_1 = find_similar_sentences(query_sentence_1)
print(f"Query: '{query_sentence_1}'")
print("\nTop 3 similar sentences:")
for score, sentence in similar_sentences_1:
    print(f"Score: {score:.4f} | Sentence: {str(sentence)[:50]}...")

# Example 2: a sentence with a negative sentiment
query_sentence_2 = "The acting was terrible and I hated the story."
similar_sentences_2 = find_similar_sentences(query_sentence_2)
print(f"\nQuery: '{query_sentence_2}'")
print("\nTop 3 similar sentences:")
for score, sentence in similar_sentences_2:
    print(f"Score: {score:.4f} | Sentence: {sentence}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
(3,)
Query: 'This movie was absolutely amazing and beautiful.'

Top 3 similar sentences:
Score: 0.0606 | Sentence: This movie was so badly written, directed and acte...
Score: 0.0526 | Sentence: Are you familiar with concept of children's artwor...
Score: 0.0525 | Sentence: An awful film! It must have been up against some r...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step(3,)
Query: 'This movie was absolutely amazing and beautiful.'

Top 3 similar sentences:
Score: 0.0606 | Sentence: This movie was so badly written, directed and acte...
Score: 0.0526 | Sentence: Are you familiar with concept of children's artwor...
Score: 0.0525 | Sentence: An awful film! It must have been up against some r...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

## Emotion Classifier System

In [99]:
def predict_sentiment(sentence):
    # The model expects a batch of sentences, so we wrap the single sentence in a list
    vectorized_phrase = vectorize_layer(tf.constant([sentence]))
    prediction_score = model.predict(vectorized_phrase)
    print(prediction_score)

    if prediction_score >= 0.5:
        sentiment = "Positive"
    else:
        sentiment = "Negative"

    return sentiment, prediction_score


predict_sentiment("everything is well. i liked it!")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[[0.9509964]]
[[0.9509964]]
[[0.9509964]]


('Positive', array([[0.9509964]], dtype=float32))