# Sentiment Analysis

In [None]:
import tensorflow_datasets as tfds
import tensorflow as tf

# Load the IMDB reviews as (review, label) pairs. The last 10% of the official
# training split is held out for validation; the official test split is kept
# separate.
raw_train_set, raw_valid_set, raw_test_set = tfds.load(
    name="imdb_reviews",
    split=["train[:90%]", "train[90%:]", "test"],
    as_supervised=True,
)

tf.random.set_seed(42)
# Only the training pipeline is shuffled; every pipeline is batched by 32 and
# prefetches one batch.
train_set = raw_train_set.shuffle(5000, seed=42).batch(32).prefetch(1)
valid_set = raw_valid_set.batch(32).prefetch(1)
# Bind the test pipeline to its own name. Assigning it to `train_set` would
# silently replace the training pipeline, so the vectorizer would be adapted
# and the model fitted on the test reviews.
test_set = raw_test_set.batch(32).prefetch(1)



Downloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.CCOFOG_1.0.0/imdb_reviews-train.tfrecor…

Generating test examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.CCOFOG_1.0.0/imdb_reviews-test.tfrecord…

Generating unsupervised examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.CCOFOG_1.0.0/imdb_reviews-unsupervised.…

Dataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.


In [None]:
# Peek at the first four raw examples: the first 200 characters of each review
# followed by its integer label.
for review, label in raw_train_set.take(4):
    snippet = review.numpy().decode("utf-8")[:200]
    print(snippet, "...")
    print("Label:", label.numpy())

This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting  ...
Label: 0
I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However  ...
Label: 0
Mann photographs the Alberta Rocky Mountains in a superb fashion, and Jimmy Stewart and Walter Brennan give enjoyable performances as they always seem to do. <br /><br />But come on Hollywood - a Moun ...
Label: 0
This is the kind of film for a snowy Sunday afternoon when the rest of the world can go ahead with its own business as you descend into a big arm-chair and mellow for a couple of hours. Wonderful perf ...
Label: 1


In [None]:
vocab_size = 1000

# Word-level vectorizer: lowercases, strips punctuation, splits on whitespace,
# and keeps at most `vocab_size` tokens.
text_vec_layer = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size,
    standardize='lower_and_strip_punctuation',
    split='whitespace',
)

# The vocabulary is built from the review texts only, so the labels are dropped.
reviews_only = train_set.map(lambda text, _label: text)
text_vec_layer.adapt(reviews_only)

In [None]:
# Show the first 50 entries of the adapted vocabulary.
text_vec_layer.get_vocabulary()[:50]

['',
 '[UNK]',
 np.str_('the'),
 np.str_('a'),
 np.str_('and'),
 np.str_('of'),
 np.str_('to'),
 np.str_('is'),
 np.str_('in'),
 np.str_('i'),
 np.str_('it'),
 np.str_('this'),
 np.str_('that'),
 np.str_('br'),
 np.str_('was'),
 np.str_('as'),
 np.str_('with'),
 np.str_('for'),
 np.str_('movie'),
 np.str_('but'),
 np.str_('film'),
 np.str_('on'),
 np.str_('you'),
 np.str_('not'),
 np.str_('are'),
 np.str_('his'),
 np.str_('have'),
 np.str_('be'),
 np.str_('one'),
 np.str_('he'),
 np.str_('its'),
 np.str_('at'),
 np.str_('all'),
 np.str_('by'),
 np.str_('an'),
 np.str_('they'),
 np.str_('from'),
 np.str_('who'),
 np.str_('so'),
 np.str_('like'),
 np.str_('or'),
 np.str_('just'),
 np.str_('if'),
 np.str_('about'),
 np.str_('out'),
 np.str_('has'),
 np.str_('her'),
 np.str_('some'),
 np.str_('there'),
 np.str_('what')]

In [None]:
# Same 50 entries, converted to plain `str` so the repr is free of np.str_ wrappers.
[str(token) for token in text_vec_layer.get_vocabulary()[:50]]

['',
 '[UNK]',
 'the',
 'a',
 'and',
 'of',
 'to',
 'is',
 'in',
 'i',
 'it',
 'this',
 'that',
 'br',
 'was',
 'as',
 'with',
 'for',
 'movie',
 'but',
 'film',
 'on',
 'you',
 'not',
 'are',
 'his',
 'have',
 'be',
 'one',
 'he',
 'its',
 'at',
 'all',
 'by',
 'an',
 'they',
 'from',
 'who',
 'so',
 'like',
 'or',
 'just',
 'if',
 'about',
 'out',
 'has',
 'her',
 'some',
 'there',
 'what']

In [None]:
# Encode a one-sentence batch into token IDs with the adapted layer.
text_vec_layer(['it was a great movie'])

<tf.Tensor: shape=(1, 5), dtype=int64, numpy=array([[10, 14,  3, 78, 18]])>

In [None]:
# Freshly initialised (untrained) embedding: one 16-dimensional vector per token ID.
embed_layer = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=16)

# Vectorize a sample sentence, then look up the embedding of each token.
sample_ids = text_vec_layer(['it was a great movie'])
embed_layer(sample_ids)

<tf.Tensor: shape=(1, 5, 16), dtype=float32, numpy=
array([[[ 0.03239198, -0.02225692,  0.01046693,  0.02789899,
         -0.0302089 , -0.01052729, -0.00917891, -0.03588693,
          0.02345153, -0.01119227,  0.02096894, -0.01724235,
          0.02639008,  0.02921954,  0.02086613,  0.03927116],
        [-0.00816597,  0.04157536, -0.04097768,  0.00123867,
         -0.02867159,  0.02901191,  0.04234804, -0.04732881,
          0.02101601,  0.01568906,  0.01675064,  0.04898008,
          0.03548184, -0.01831273, -0.01432105, -0.04014903],
        [ 0.03300815,  0.03941376,  0.04571335, -0.01983825,
         -0.03085377,  0.0269911 , -0.04263626, -0.00776982,
          0.01199427, -0.03042015,  0.00033708,  0.00973967,
          0.03954954,  0.00866031, -0.00031966,  0.03217032],
        [-0.00632383, -0.04391186,  0.00968566, -0.04575476,
         -0.03660556,  0.02876231,  0.03523138, -0.02769432,
          0.03022942, -0.00218786, -0.02685027, -0.01549949,
          0.02833129,  0.02743

In [None]:
embed_size = 128

# Sentiment classifier, assembled layer by layer:
# raw text -> token IDs -> embeddings -> GRU -> single sigmoid output.
model = tf.keras.Sequential()
model.add(text_vec_layer)
model.add(
    tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embed_size,
        mask_zero=True,
    )
)
model.add(tf.keras.layers.GRU(128))
model.add(tf.keras.layers.Dense(1, activation="sigmoid"))

In [None]:

# Binary classification set-up: cross-entropy loss, Nadam optimizer with its
# default settings, accuracy as the reported metric.
nadam = tf.keras.optimizers.Nadam()
model.compile(optimizer=nadam, loss="binary_crossentropy", metrics=["accuracy"])

# Train for two epochs, validating on valid_set after each one.
model.fit(train_set, validation_data=valid_set, epochs=2)

Epoch 1/2
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 31ms/step - accuracy: 0.6308 - loss: 0.6190 - val_accuracy: 0.8360 - val_loss: 0.3751
Epoch 2/2
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 30ms/step - accuracy: 0.8548 - loss: 0.3434 - val_accuracy: 0.8608 - val_loss: 0.3283


<keras.src.callbacks.history.History at 0x7f44cef81cd0>

In [None]:
# model.layers[0] is the text vectorizer, so index 1 is the Embedding layer;
# the first array in its weight list is the embedding matrix.
embedding_layer = model.layers[1]
embedding_weights = embedding_layer.get_weights()[0]

In [None]:
import numpy as np
# Write the embedding matrix to disk as tab-separated values.
np.savetxt(fname="embeddings.tsv", X=embedding_weights, delimiter="\t")

In [None]:
vocab = text_vec_layer.get_vocabulary()

# One vocabulary entry per line. Entries that are empty or whitespace-only are
# written as "<PAD>" so that no line in the file is blank.
with open("metadata.tsv", "w", encoding="utf-8") as metadata_file:
    for token in vocab:
        label = "<PAD>" if token.strip() == "" else token
        metadata_file.write(f"{label}\n")

# Generating Shakespearean Text Using a Character RNN

In [None]:
import tensorflow as tf

# Download the corpus and read it into memory; `filepath` is the local path
# returned by get_file.
shakespeare_url = "https://homl.info/shakespeare"
filepath = tf.keras.utils.get_file("shakespeare.txt", shakespeare_url)
# Decode explicitly as UTF-8 rather than relying on the platform's default
# encoding, matching the explicit encoding used when writing metadata.tsv.
with open(filepath, encoding="utf-8") as f:
    shakespeare_text = f.read()

Downloading data from https://homl.info/shakespeare
[1m1115394/1115394[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [None]:
# Preview the first 80 characters of the downloaded text.
print(shakespeare_text[:80])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.


In [None]:
# Every distinct character of the lowercased corpus, sorted and joined into one string.
distinct_chars = set(shakespeare_text.lower())
''.join(sorted(distinct_chars))

"\n !$&',-.3:;?abcdefghijklmnopqrstuvwxyz"

In [None]:
# Character-level vectorizer: lowercases the text and emits one token per character.
text_vec_layer = tf.keras.layers.TextVectorization(split='character',
                                                   standardize='lower')
# Treat the whole corpus as a batch containing a single document.
text_vec_layer.adapt([shakespeare_text])
# Encode that batch of one and take its only row. The previous form,
# `text_vec_layer([shakespeare_text][0])`, built the list and immediately
# unwrapped it again, so the indexing was a misplaced no-op.
encoded = text_vec_layer([shakespeare_text])[0]

In [None]:
# Display the encoded tensor produced by text_vec_layer.
encoded

<tf.Tensor: shape=(1115394,), dtype=int64, numpy=array([21,  7, 10, ..., 22, 28, 12])>

In [None]:
# Shift the token IDs down by 2 and shrink the vocabulary size to match.
# The shifted IDs are recomputed from the raw text rather than with
# `encoded -= 2`, which would subtract 2 again every time this cell is re-run.
# This costs one extra vectorization pass but makes the cell idempotent.
encoded = text_vec_layer([shakespeare_text])[0] - 2
vocab_size = text_vec_layer.vocabulary_size() - 2
dataset_size = len(encoded)

In [None]:
# Display vocab_size (the layer's vocabulary size minus 2).
vocab_size

39

In [None]:
# Display dataset_size, i.e. len(encoded).
dataset_size

1115394

In [None]:
def to_dataset(sequence, length, seed=None, shuffle=False, batch_size=32):
    """Turn a 1-D sequence into batches of (input, target) windows.

    Every window holds `length + 1` consecutive elements and consecutive
    windows start one element apart. Within a batch, the inputs are each
    window without its last element and the targets are the same window
    without its first element, i.e. the inputs shifted by one step.

    Args:
        sequence: Sequence to slice element by element.
        length: Number of elements in each input (and target) window.
        seed: Seed forwarded to the shuffle; only used when `shuffle` is true.
        shuffle: Whether to shuffle the windows (buffer of 100,000) before batching.
        batch_size: Number of windows per batch.

    Returns:
        A tf.data dataset of (inputs, targets) batches that prefetches one batch.
    """
    window_len = length + 1
    windows = (
        tf.data.Dataset.from_tensor_slices(sequence)
        .window(window_len, shift=1, drop_remainder=True)
        # Each window is itself a small dataset; batching it by its full
        # length collapses it into a single tensor.
        .flat_map(lambda win: win.batch(window_len))
    )
    if shuffle:
        windows = windows.shuffle(100_000, seed=seed)
    batches = windows.batch(batch_size)
    return batches.map(lambda batch: (batch[:, :-1], batch[:, 1:])).prefetch(1)

In [None]:
length = 100
tf.random.set_seed(42)

# Contiguous split of the encoded text: the first 1,000,000 IDs for training,
# the next 60,000 for validation, the remainder for testing.
train_ids = encoded[:1_000_000]
valid_ids = encoded[1_000_000:1_060_000]
test_ids = encoded[1_060_000:]

# Only the training windows are shuffled.
train_set = to_dataset(train_ids, length=length, shuffle=True, seed=42)
valid_set = to_dataset(valid_ids, length=length)
test_set = to_dataset(test_ids, length=length)

In [None]:
# Display the training dataset object returned by to_dataset.
train_set

<_PrefetchDataset element_spec=(TensorSpec(shape=(None, None), dtype=tf.int64, name=None), TensorSpec(shape=(None, None), dtype=tf.int64, name=None))>

In [None]:
tf.random.set_seed(42)

# Character model: ID embedding -> GRU emitting an output at every time step
# -> softmax over the vocab_size possible next characters.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=16))
model.add(tf.keras.layers.GRU(128, return_sequences=True))
model.add(tf.keras.layers.Dense(vocab_size, activation='softmax'))

model.compile(
    optimizer='nadam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Save the model only when the validation accuracy improves.
model_ckpt = tf.keras.callbacks.ModelCheckpoint(
    'my_shakespeare_model.keras',
    monitor='val_accuracy',
    save_best_only=True,
)

history = model.fit(
    train_set,
    epochs=1,
    validation_data=valid_set,
    callbacks=[model_ckpt],
)

  31247/Unknown [1m411s[0m 12ms/step - accuracy: 0.5454 - loss: 1.5074



[1m31247/31247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m429s[0m 13ms/step - accuracy: 0.5454 - loss: 1.5074 - val_accuracy: 0.5315 - val_loss: 1.6081


In [None]:
# Wrap the trained model so it accepts raw strings: vectorize the text, shift
# the IDs down by 2 (the same shift applied to the training data), then run
# the character model.
shakespeare_model = tf.keras.Sequential()
shakespeare_model.add(text_vec_layer)
shakespeare_model.add(tf.keras.layers.Lambda(lambda X: X - 2))
shakespeare_model.add(model)

In [None]:
# Predict on a batch of one prompt and keep the output of its last time step.
prompt = tf.constant(['To be or not to b'])
y_proba = shakespeare_model.predict(prompt)[0, -1]

# Most likely class; +2 converts it back to an index into the layer's vocabulary.
y_pred = tf.argmax(y_proba)
text_vec_layer.get_vocabulary()[y_pred + 2]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 175ms/step


np.str_('e')

# Generating Fake Shakespearean Text

In [None]:
# tf.random.categorical takes log-probabilities, so take the log of the class
# probabilities first, then draw 8 samples from that single distribution.
class_probas = [[0.5, 0.3, 0.2]]
log_probas = tf.math.log(class_probas)
tf.random.categorical(log_probas, num_samples=8)

<tf.Tensor: shape=(1, 8), dtype=int64, numpy=array([[0, 0, 1, 2, 1, 0, 0, 0]])>

In [None]:
def next_char(text, temperature):
    """Sample the character that follows `text` from the model's prediction.

    Args:
        text: Prompt string; it is wrapped in a batch of one before prediction.
        temperature: Strictly positive divisor applied to the predicted
            log-probabilities before sampling. Larger values flatten the
            distribution, smaller values sharpen it.

    Returns:
        The vocabulary entry of the sampled character.

    Raises:
        ValueError: If `temperature` is not strictly positive, since dividing
            the log-probabilities by it would give infinite/NaN logits or
            invert the distribution.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be > 0, got {temperature!r}")
    batch = tf.constant([text])
    # verbose=0: this runs once per generated character, so the default
    # progress bar would flood the cell output.
    # [0, -1:] keeps the last time step with a leading axis of size 1, the
    # 2-D shape tf.random.categorical expects.
    y_proba = shakespeare_model.predict(batch, verbose=0)[0, -1:]
    rescaled_logits = tf.math.log(y_proba) / temperature
    char_id = tf.random.categorical(rescaled_logits, num_samples=1)[0, 0]
    # +2 undoes the "- 2" ID shift applied inside shakespeare_model.
    return text_vec_layer.get_vocabulary()[char_id + 2]

In [None]:
def extent_text(text, chars=50, temperature=1):
    """Extend `text` by `chars` sampled characters and return the result.

    Each new character is drawn by next_char from the text generated so far,
    so every sampled character feeds into the next prediction.

    Args:
        text: Prompt to extend.
        chars: Number of characters to append.
        temperature: Sampling temperature forwarded to next_char.

    Returns:
        The prompt followed by the generated characters.
    """
    generated = text
    for _step in range(chars):
        generated = generated + next_char(generated, temperature)
    return generated

In [None]:
# Extend the prompt by the default 50 characters at temperature 1.
extent_text('to be or not to be', temperature=1)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46

'to be or not to be done!\nis thought this very part beautor wonst.\n\nd'

In [None]:
# Extend the prompt by 10 characters at temperature 100; next_char divides the
# log-probabilities by this value before sampling.
extent_text('to be or not to be',chars= 10, temperature=100)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step


'to be or not to bekjd3c$;mqm'