<a href="https://colab.research.google.com/github/Arpit1069/Movie-review-Generator/blob/main/Moviereview.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import numpy as np
import os
import re
import string
import random


In [None]:
def causal_attention_mask(batch_size, n_dest, n_src, dtype):
    """
    Build a batched causal mask for the self-attention score matrix.

    Entry (i, j) is truthy when ``i >= j - n_src + n_dest``, i.e. the lower
    triangle anchored at the lower-right corner, so a position never receives
    information from later positions. The single ``[1, n_dest, n_src]`` mask is
    cast to ``dtype`` and tiled ``batch_size`` times along the first axis.
    """
    dest_positions = tf.range(n_dest)[:, None]
    src_positions = tf.range(n_src)
    allowed = dest_positions >= src_positions - n_src + n_dest
    single_mask = tf.reshape(tf.cast(allowed, dtype), [1, n_dest, n_src])
    # Repeat along the batch axis only; the two mask axes keep their size.
    tile_multiples = tf.concat(
        [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)], 0
    )
    return tf.tile(single_mask, tile_multiples)


class TransformerBlock(layers.Layer):
    """
    One transformer block: causally masked multi-head self attention followed
    by a two-layer feed-forward network, each wrapped with dropout, a residual
    connection and layer normalization.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [
                layers.Dense(ff_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs):
        dims = tf.shape(inputs)
        n_batch = dims[0]
        n_tokens = dims[1]
        # Query and key/value are the same sequence, so the mask is square.
        mask = causal_attention_mask(n_batch, n_tokens, n_tokens, tf.bool)
        attended = self.dropout1(self.att(inputs, inputs, attention_mask=mask))
        residual = self.layernorm1(inputs + attended)
        projected = self.dropout2(self.ffn(residual))
        return self.layernorm2(residual + projected)


In [None]:

class TokenAndPositionEmbedding(layers.Layer):
    """Adds a learned position embedding to a learned token embedding."""

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        # Position ids 0 .. L-1, where L is the size of the last axis of `x`.
        seq_len = tf.shape(x)[-1]
        pos_vectors = self.pos_emb(tf.range(start=0, limit=seq_len, delta=1))
        tok_vectors = self.token_emb(x)
        return tok_vectors + pos_vectors


In [None]:
class TokenPositionAndSentimentEmbedding(layers.Layer):
    """
    Sums three learned embeddings: one for the token ids, one for their
    positions and one for the sentiment id passed alongside the tokens.
    """

    def __init__(self, maxlen, vocab_size, sentiment_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.sentiment_emb = layers.Embedding(input_dim=sentiment_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x, s):
        # Position ids 0 .. L-1, where L is the size of the last axis of `x`.
        seq_len = tf.shape(x)[-1]
        pos_vectors = self.pos_emb(tf.range(start=0, limit=seq_len, delta=1))
        tok_vectors = self.token_emb(x)
        sentiment_vectors = self.sentiment_emb(s)
        return tok_vectors + pos_vectors + sentiment_vectors

In [None]:
vocab_size = 20000  # Only consider the top 20k words
sentiment_size = 2  # Two sentiment ids: 0 for negative and 1 for positive reviews
maxlen = 80  # Max sequence size (number of tokens in the model's token input)

embed_dim = 256  # Embedding size for each token
num_heads = 2  # Number of attention heads
feed_forward_dim = 256  # Hidden layer size in feed forward network inside transformer


def create_model():
    """
    Build and compile the sentiment-conditioned language model.

    Inputs are a token-id sequence of length ``maxlen`` and one sentiment id
    per sample. The model returns two outputs: the per-position vocabulary
    logits and the transformer block's output. Only the logits carry a loss.
    """
    token_ids = layers.Input(shape=(maxlen,), dtype=tf.int32)
    sentiment_ids = layers.Input(shape=(1,), dtype=tf.int32)

    embed = TokenPositionAndSentimentEmbedding(
        maxlen, vocab_size, sentiment_size, embed_dim
    )
    embedded = embed(token_ids, sentiment_ids)
    hidden = TransformerBlock(embed_dim, num_heads, feed_forward_dim)(embedded)
    logits = layers.Dense(vocab_size)(hidden)

    model = keras.Model(inputs=[token_ids, sentiment_ids], outputs=[logits, hidden])
    # No loss and optimization based on word embeddings from transformer block
    model.compile(
        "adam",
        loss=[tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), None],
    )
    return model
# Instantiate the compiled model that the training and generation cells use.
my_model=create_model()

In [None]:
# Download the aclImdb_v1 archive and extract it in the working directory;
# the following cells read the review files from the aclImdb/ folder.
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.2M  100 80.2M    0     0  10.2M      0  0:00:07  0:00:07 --:--:-- 17.7M


In [None]:
batch_size = 128

# The dataset contains each review in a separate text file.
# The text files are present in four different folders (train/test x pos/neg).
directories_positive = [
    "aclImdb/train/pos",
    "aclImdb/test/pos",
]

directories_negative = [
    "aclImdb/train/neg",
    "aclImdb/test/neg",
]


def _list_review_files(directories):
    """Return the joined path of every entry found directly inside `directories`."""
    return [
        os.path.join(directory, name)
        for directory in directories
        for name in os.listdir(directory)
    ]


# Create a list of all files, one list per sentiment
filenames_positive = _list_review_files(directories_positive)
filenames_negative = _list_review_files(directories_negative)
print(f"Total number of positive review files: {len(filenames_positive)}")
print(f"Total number of negative review files: {len(filenames_negative)}")

# Pass one flat list of paths (positive files first, then negative). A nested
# [positive, negative] list only converts to a tensor when both lists happen
# to have the same length.
all_text_ds_raw = tf.data.TextLineDataset(filenames_positive + filenames_negative)
all_text_ds_raw = all_text_ds_raw.batch(batch_size)

text_pos_ds_raw = tf.data.TextLineDataset(filenames_positive)
text_neg_ds_raw = tf.data.TextLineDataset(filenames_negative)


Total number of positive review files: 25000
Total number of negative review files: 25000


In [None]:
def custom_standardization(input_string):
    """
    Standardize raw review text before tokenization.

    The text is lowercased, every "<br />" tag is replaced by a space, and a
    space is inserted in front of each punctuation character so that it is
    split off as its own token.
    """
    lowercased = tf.strings.lower(input_string)
    stripped_html = tf.strings.regex_replace(lowercased, "<br />", " ")
    # Escape the punctuation before placing it inside a character class:
    # unescaped, the `\]` pair in string.punctuation is read as an escaped
    # bracket, so the backslash itself would never be matched.
    punctuation_class = f"([{re.escape(string.punctuation)}])"
    return tf.strings.regex_replace(stripped_html, punctuation_class, r" \1")

# Create a vectorization layer and adapt it to the text
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size - 1,
    output_mode="int",
    # One token more than `maxlen`: the input/label preparation drops the last
    # token for the inputs and the first token for the targets, so both end up
    # exactly `maxlen` long, matching the model's Input(shape=(maxlen,)).
    output_sequence_length=maxlen + 1,
)
vectorize_layer.adapt(all_text_ds_raw)
vocab = vectorize_layer.get_vocabulary()  # To get words back from token indices

In [None]:
# Show the vocabulary size and the words behind the first ten token ids.
print("vocab has the ", len(vocab)," entries")
print("vocab has the following first 10 entries")
for token_id in range(10):
  print(token_id, " represents the word: ", vocab[token_id])

# Show the first raw review of the first batch next to its tokenized form.
for raw_batch in all_text_ds_raw.take(1):
  first_review = raw_batch[0]
  print(" Given raw data (text): ", first_review.numpy() )
  print(" Tokenized and Transformed to a vector of integers: ", vectorize_layer(tf.expand_dims(first_review, -1)))

vocab has the  19999  entries
vocab has the following first 10 entries
0  represents the word:  
1  represents the word:  [UNK]
2  represents the word:  the
3  represents the word:  .
4  represents the word:  ,
5  represents the word:  a
6  represents the word:  and
7  represents the word:  of
8  represents the word:  to
9  represents the word:  is
 Given raw data (text):  b'Clint Eastwood has definitely produced better movies than this, but this one does not embarrass him. Dirty Harry catches everyone\'s attention and unless one wants to watch romance, there is no reason why you won\'t like him. He is cool because he is dirty, is great because he kills without much thinking, is perfect because he gets the bullet right through your heart and a hero because he doesn\'t care.<br /><br />From what I have seen in movies in which Eastwood acts, the character of the lead role always captivates the audience. In White Hunter Black heart, he is the crazy director, in "in the Line of Fire" he is

In [None]:
def prepare_pos_lm_inputs_labels(text):
    """
    Turn one positive review into a next-word prediction example.

    The review is tokenized; the inputs are all tokens except the last and the
    targets are all tokens except the first, so the target for position (i) is
    the word at position (i+1). The sentiment input is the constant 1.
    """
    token_ids = vectorize_layer(tf.expand_dims(text, -1))
    inputs = tf.squeeze(token_ids[:, :-1])
    targets = tf.squeeze(token_ids[:, 1:])
    sentiment = 1
    return (inputs, sentiment), targets

In [None]:
# Convert every positive review line into ((tokens, sentiment), targets).
text_pos_ds = text_pos_ds_raw.map(prepare_pos_lm_inputs_labels)

In [None]:
# Inspect one positive example: shapes, raw tensors and the decoded words.
for (tokens_in, sentiment), tokens_out in text_pos_ds.take(1):
  print("X.shape: ",tokens_in.shape,"s.shape: ", sentiment.shape, "y.shape: ", tokens_out.shape)
  print("X: ", tokens_in)
  print("s :",sentiment)
  print("y: ",tokens_out)
  input_words = [vocab[token] for token in tokens_in]
  output_words = [vocab[token] for token in tokens_out]
  print("input1 (in text): " , " ".join(input_words))
  print("input2 : " , sentiment.numpy())
  print("output (in text): " , " ".join(output_words))


X.shape:  (79,) s.shape:  () y.shape:  (79,)
X:  tf.Tensor(
[ 3491  2620    52   417  1105   137   108    82    13     4    21    13
    34   134    28 12009    94     3  1773  1431  4255   307    15   687
     6   977    34   498     8   114   911     4    46     9    65   293
   149    24   395    26    44    94     3    30     9   621    95    30
     9  1773     4     9    90    95    30  1167   217    81   547     4
     9   425    95    30   226     2  4163   214   154   136   503     6
     5   651    95    30   162    26   466], shape=(79,), dtype=int64)
s : tf.Tensor(1, shape=(), dtype=int32)
y:  tf.Tensor(
[ 2620    52   417  1105   137   108    82    13     4    21    13    34
   134    28 12009    94     3  1773  1431  4255   307    15   687     6
   977    34   498     8   114   911     4    46     9    65   293   149
    24   395    26    44    94     3    30     9   621    95    30     9
  1773     4     9    90    95    30  1167   217    81   547     4     9
   425    9

In [None]:
def prepare_neg_lm_inputs_labels(text):
    """
    Turn one negative review into a next-word prediction example.

    The review is tokenized; the inputs are all tokens except the last and the
    targets are all tokens except the first, so the target for position (i) is
    the word at position (i+1). The sentiment input is the constant 0.
    """
    token_ids = vectorize_layer(tf.expand_dims(text, -1))
    inputs = tf.squeeze(token_ids[:, :-1])
    targets = tf.squeeze(token_ids[:, 1:])
    sentiment = 0
    return (inputs, sentiment), targets

In [None]:
# Convert every negative review line into ((tokens, sentiment), targets).
text_neg_ds = text_neg_ds_raw.map(prepare_neg_lm_inputs_labels)

In [None]:
# Inspect one negative example: shapes, raw tensors and the decoded words.
for (tokens_in, sentiment), tokens_out in text_neg_ds.take(1):
  print("X.shape: ",tokens_in.shape,"s.shape: ", sentiment.shape, "y.shape: ", tokens_out.shape)
  print("X: ", tokens_in)
  print("s :",sentiment)
  print("y: ",tokens_out)
  input_words = [vocab[token] for token in tokens_in]
  output_words = [vocab[token] for token in tokens_out]
  print("input1 (in text): " , " ".join(input_words))
  print("input2 : " , sentiment.numpy())
  print("output (in text): " , " ".join(output_words))

X.shape:  (79,) s.shape:  () y.shape:  (79,)
X:  tf.Tensor(
[   17     5    18  3529    20   454  4789 18951     4    12    32     8
    73   177    15     7   108     4     6    28    36    61    57   666
     3    56   108    29    43    89     4    24   395    26    33   760
    14    38    29   624   742    25   401     3    19  7871     4     1
   760    14    10     9   624    25   401    37    36     3    13     9
    70   369   412    89   497    89   457  1927     3     5   551     7
  7811   310     5   473     7  7871     3], shape=(79,), dtype=int64)
s : tf.Tensor(0, shape=(), dtype=int32)
y:  tf.Tensor(
[    5    18  3529    20   454  4789 18951     4    12    32     8    73
   177    15     7   108     4     6    28    36    61    57   666     3
    56   108    29    43    89     4    24   395    26    33   760    14
    38    29   624   742    25   401     3    19  7871     4     1   760
    14    10     9   624    25   401    37    36     3    13     9    70
   369   41

In [None]:
# Cache the tokenized examples *before* shuffling. Caching after
# shuffle()/batch() would store the first epoch's batches, so every later
# epoch would replay the same examples in the same order and grouping.
all_text_ds = (
    text_pos_ds.concatenate(text_neg_ds)
    .cache()
    .shuffle(buffer_size=250000)
    .batch(batch_size=batch_size)
    .prefetch(tf.data.experimental.AUTOTUNE)
)


In [None]:
# Inspect one training batch: shapes, the sentiment ids, and the first two
# samples both as token ids and as decoded words.
for (X,s), y in all_text_ds.take(1):
  print(X.shape, s.shape, y.shape)
  print("All sentiment values in this batch: ", s)
  sample_headers = (
      (0, "\nFirst sample in the batch:"),
      (1, "\nSecond sample in the batch:"),
  )
  for row, header in sample_headers:
    print(header)
    print("\tX is: " ,X[row])
    print("\ts is: ", s[row])
    print("\ty is: ", y[row])
    input1 = " ".join([vocab[token] for token in X[row]])
    input2= s[row].numpy()
    output = " ".join([vocab[token] for token in y[row]])
    print("\tinput1 (in text): " , input1)
    print("\tinput2 : " , input2)
    print("\toutput (in text): " , output)

(128, 79) (128,) (128, 79)
All sentiment values in this batch:  tf.Tensor(
[0 1 1 1 1 0 0 0 0 0 1 0 1 1 0 1 1 0 0 1 0 1 0 0 0 1 0 0 0 0 0 1 1 0 1 1 0
 1 1 1 1 1 1 1 0 0 0 0 0 0 1 0 1 0 1 1 0 0 0 1 1 1 1 1 0 0 0 0 0 1 0 1 0 0
 1 1 1 1 1 0 0 0 0 0 0 0 0 1 0 1 0 1 1 0 1 0 1 1 0 0 0 1 0 0 0 1 0 1 0 1 0
 1 1 1 1 1 0 0 0 1 0 1 1 0 1 0 0 1], shape=(128,), dtype=int32)

First sample in the batch:
	X is:  tf.Tensor(
[4555   35   28    5   89  106    8  383   10    3   12  201   13   16
  224    4   21  102   12 2051   10   16    5 6473    7    2 7665 1048
    7  750 2256    3   12  121  218  750 2256  381   12   16 1203    3
   12  218   13   22   60   12   16  903    3    2   22   15 8708 6685
  215   88   85 1589   11  136  428   35   28    8  747    2   22   15
  776  603   39    2    1    3   13    9    2], shape=(79,), dtype=int64)
	s is:  tf.Tensor(0, shape=(), dtype=int32)
	y is:  tf.Tensor(
[  35   28    5   89  106    8  383   10    3   12  201   13   16  224
    4   21  102   12 2051 

In [None]:
def top_k_sample(logits, k=10):
    """
    Sample one token id from the `k` highest-scoring entries of `logits`.

    The `k` largest logits are turned into probabilities with a softmax and
    one of their indices is drawn at random according to those probabilities.
    """
    top_logits, top_ids = tf.math.top_k(logits, k=k, sorted=True)
    candidate_ids = np.asarray(top_ids).astype("int32")
    # softmax expects a batch axis, so add one and strip it again afterwards.
    probabilities = keras.activations.softmax(tf.expand_dims(top_logits, 0))[0]
    return np.random.choice(
        candidate_ids, p=np.asarray(probabilities).astype("float32")
    )

def TextGenerator(model, max_tokens=40, start_prompt = "this movie is", sentiment= 1, index_to_word=vocab, top_k=10):
    """Generate text from a trained model and print it.

    1. Feed the starting prompt to the model
    2. Predict the scores for the next token
    3. Sample the next token and append it to the input, then repeat

    Arguments:
        model: Model taking `(tokens, sentiment)` and returning the next-token
            scores as the first of two outputs.
        max_tokens: Integer, the number of tokens to generate after the prompt.
        start_prompt: String, whitespace-separated words to start from. Words
            missing from `index_to_word` are replaced by token index 1.
        sentiment: Integer, the sentiment id fed to the model at every step.
        index_to_word: List of strings, obtained from the TextVectorization layer.
        top_k: Integer, sample from the `top_k` token predictions.

    Raises:
        ValueError: If `start_prompt` contains no words.
    """
    word_to_index = {word: index for index, word in enumerate(index_to_word)}

    # Tokenize starting prompt; index 1 stands in for out-of-vocabulary words.
    tokens = [word_to_index.get(word, 1) for word in start_prompt.split()]
    if not tokens:
        raise ValueError("start_prompt must contain at least one word")

    for _ in range(max_tokens):
        # The model sees at most `maxlen` tokens: keep the most recent ones so
        # generation keeps advancing once the text outgrows the window.
        context = tokens[-maxlen:]
        sample_index = len(context) - 1
        padded = context + [0] * (maxlen - len(context))

        x = np.array([padded])
        s = np.array([sentiment])
        y, _hidden = model.predict((x, s), verbose=0)
        # Scores at the last real (non-padding) position predict the next token.
        tokens.append(top_k_sample(y[0][sample_index], k=top_k))

    # `tokens` already holds prompt + generated tokens, each exactly once.
    txt = " ".join(index_to_word[token] for token in tokens)
    print(f"generated text:\n{txt}\n")


In [None]:
# Train for 3 epochs over all_text_ds, printing a progress bar per epoch.
my_model.fit(all_text_ds, verbose=1, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f88485f1c90>

In [None]:
# Continue the prompt while conditioning on sentiment id 0, the id assigned
# to the negative reviews during data preparation.
TextGenerator(my_model, start_prompt = "this movie is amazing", sentiment=0)

generated text:
this movie is amazing ! ! ! a great action movie . the actors don 't get any good action and it looks to make a good movie . the characters are just awful . there 's nothing else on the plot and the main ! ! ! a great action movie . the actors don 't get any good action and it looks to make a good movie . the characters are just awful . there 's nothing else on the plot and the main

