<a href="https://colab.research.google.com/github/Carapet07/Data-Science-projects/blob/main/Movie_Reviews_Generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

--2025-03-01 17:08:17--  https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2025-03-01 17:08:24 (11.7 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [None]:
# label_mode=None (the Python None, not a string) makes the loader yield only
# the raw review text, without class labels, in batches of 256 strings.
#
# IMDB reviews often contain "<br />" line-break tags. They are harmless for
# classification, but a generative model would learn to emit them, so each one
# is replaced with a single space.
dataset = keras.utils.text_dataset_from_directory(
    directory='aclImdb',
    label_mode=None,
    batch_size=256,
).map(
    lambda reviews: tf.strings.regex_replace(reviews, "<br />", " ")
)

Found 100006 files.


In [None]:
# Keep at most this many distinct tokens in the vocabulary.
vocabulary_size = 15000
# Every review is truncated or padded to exactly this many token ids.
sequence_length = 100

# Maps raw strings to fixed-length sequences of integer token ids.
text_vectorization = layers.TextVectorization(
    max_tokens=vocabulary_size,
    output_mode='int',
    output_sequence_length=sequence_length,
)

# Build the vocabulary from the review text.
text_vectorization.adapt(dataset)

In [None]:
def prepare_lm_dataset(text_batch):
  """Convert a batch of raw strings into (inputs, targets) for next-token prediction.

  The batch is vectorized into integer token ids; the inputs are every token
  except the last one, and the targets are the same sequence shifted left by
  one position, so position i of the targets holds the token that follows
  position i of the inputs.
  """
  token_ids = text_vectorization(text_batch)
  return token_ids[:, :-1], token_ids[:, 1:]

lm_dataset = dataset.map(prepare_lm_dataset, num_parallel_calls=4)

In [None]:

class TransformerDecoder(layers.Layer):
  """Decoder-only Transformer block: causal self-attention + feed-forward.

  Args:
    embed_dim: Size of the last axis of the input and output tensors; also used
      as the per-head `key_dim` of the attention layer.
    num_heads: Number of attention heads.
    dense_dim: Hidden width of the position-wise feed-forward sublayer.
    **kwargs: Forwarded to `layers.Layer`.
  """

  def __init__(self, embed_dim, num_heads, dense_dim, **kwargs):
    super().__init__(**kwargs)
    self.embed_dim = embed_dim
    self.num_heads = num_heads
    self.dense_dim = dense_dim

    # Position-wise feed-forward sublayer; projects back to embed_dim so the
    # residual connection in call() lines up.
    self.dense = keras.Sequential([
        layers.Dense(dense_dim, activation='relu'),
        layers.Dense(embed_dim)
    ])

    self.attention = layers.MultiHeadAttention(
        num_heads=num_heads, key_dim=embed_dim
    )

    self.layernorm1 = layers.LayerNormalization()
    self.layernorm2 = layers.LayerNormalization()

    # Let an incoming padding mask flow through this layer to whatever follows
    # instead of being dropped here.
    self.supports_masking = True

  def call(self, inputs, mask=None):
    """Apply causal self-attention and the feed-forward sublayer.

    Args:
      inputs: Sequence tensor whose last axis has size `embed_dim`.
      mask: Optional padding mask indexed as (batch, sequence); true where a
        position holds a real token. Normally supplied by Keras from the
        previous layer rather than passed by hand.

    Returns:
      A tensor with the same shape as `inputs`.
    """
    padding_mask = None
    if mask is not None:
      # Add a query axis so the per-key padding mask broadcasts over every
      # query position: (batch, seq) -> (batch, 1, seq).
      padding_mask = mask[:, tf.newaxis, :]

    # use_causal_mask=True stops each position from attending to later
    # positions. Without it the block can read the very tokens it is trained
    # to predict, which makes next-token training meaningless.
    attention_output = self.attention(
        inputs, inputs,
        attention_mask=padding_mask,
        use_causal_mask=True
    )

    # Residual connection + normalization around each sublayer.
    dense_input = self.layernorm1(inputs + attention_output)
    dense_output = self.dense(dense_input)
    return self.layernorm2(dense_input + dense_output)

  def get_config(self):
    """Return the constructor arguments so the layer can be re-created."""
    config = super().get_config()
    config.update({
        'embed_dim': self.embed_dim,
        'num_heads': self.num_heads,
        'dense_dim': self.dense_dim,
    })
    return config


class PositionalEmbedding(layers.Layer):
  """Sum of a learned token embedding and a learned position embedding.

  Args:
    sequence_length: Number of distinct positions the position table holds.
    input_dim: Number of distinct token ids the token table holds.
    output_dim: Size of each embedding vector.
    **kwargs: Forwarded to `layers.Layer`.
  """

  def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
    super().__init__(**kwargs)
    self.sequence_length = sequence_length
    self.input_dim = input_dim
    self.output_dim = output_dim

    # Embedding that maps token indices to vectors
    self.token_embedding = layers.Embedding(
        input_dim=input_dim, output_dim=output_dim,
    )
    # Embedding that maps token positions to vectors
    self.positional_embedding = layers.Embedding(
        input_dim=sequence_length, output_dim=output_dim
    )

    # Required (at least in Keras 3) for compute_mask() to be consulted.
    self.supports_masking = True

  def call(self, inputs):
    """Embed token ids and add the embedding of each token's position."""
    # inputs is a tensor of shape (batch_size, sequence_length)
    # so [-1] returns sequence_length
    length = tf.shape(inputs)[-1]
    positions = tf.range(start=0, limit=length, delta=1)
    embedded_positions = self.positional_embedding(positions)
    embedded_tokens = self.token_embedding(inputs)
    # The (length, output_dim) position vectors broadcast over the batch axis.
    return embedded_positions + embedded_tokens

  def compute_mask(self, inputs, mask=None):
    """Return a boolean padding mask: True wherever the token id is not 0.

    Keras looks for this exact method name when propagating masks; under any
    other name the hook is never called and no mask reaches later layers.
    """
    # Keras 3 can call this hook with symbolic tensors that only keras.ops
    # accepts; older tf.keras has no `ops` module, so fall back to the
    # TensorFlow op there.
    ops = getattr(keras, 'ops', None)
    if ops is not None:
      return ops.not_equal(inputs, 0)
    return tf.math.not_equal(inputs, 0)

  def mask(self, inputs, mask=None):
    """Alias kept for callers that used the original method name."""
    return self.compute_mask(inputs, mask)

  def get_config(self):
    """Return the constructor arguments so the layer can be re-created."""
    config = super().get_config()
    config.update({
        'sequence_length': self.sequence_length,
        'input_dim': self.input_dim,
        'output_dim': self.output_dim,
    })
    return config

In [None]:
embed_dim = 256   # size of each token/position embedding vector
dense_dim = 2048  # hidden width of the decoder's feed-forward sublayer
num_heads = 2     # attention heads in the decoder block

# Variable-length sequences of integer token ids.
inputs = keras.Input(shape=(None,), dtype='int64')
x = PositionalEmbedding(sequence_length, vocabulary_size, embed_dim)(inputs)
# Call the decoder with the sequence only: its second positional parameter is
# `mask`, so passing `x` twice would feed the float embeddings in as an
# attention mask.
x = TransformerDecoder(embed_dim, num_heads, dense_dim)(x)
# Per-position probability distribution over the vocabulary.
outputs = layers.Dense(vocabulary_size, activation='softmax')(x)
model = keras.Model(inputs, outputs)

# Targets are integer token ids, hence the sparse variant of the loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer='rmsprop')