# Sentiment Analyzer

In [1]:
import pandas as pd
import numpy as np

In [2]:
import tensorflow as tf

In [3]:
import tensorflow_datasets as tfds

In [4]:
# Load the IMDB reviews dataset through tensorflow_datasets.
# with_info=True makes the call return a second value (bound to `info`);
# as_supervised=True makes each example a 2-tuple, unpacked further down as (sent, labels).
imdb, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True)

Downloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteTEJ9EG/imdb_reviews-train.tfrecord…

Generating test examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteTEJ9EG/imdb_reviews-test.tfrecord*…

Generating unsupervised examples...:   0%|          | 0/50000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteTEJ9EG/imdb_reviews-unsupervised.t…

Dataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.


In [7]:
#info

In [8]:
# Pull the two labelled splits out of the loaded dataset dictionary.
train = imdb['train']
test = imdb['test']

In [9]:
type(train)

tensorflow.python.data.ops.prefetch_op._PrefetchDataset

In [13]:
# Empty containers for the decoded review texts and their labels, one pair per split.
train_sent, test_sent = [], []
y_train, y_test = [], []

In [14]:
# Rebuild both lists inside this cell so that re-running it cannot duplicate rows
# (or fail on .append once y_train has been converted to an ndarray further down).
train_sent, y_train = [], []
for sent, labels in train:
    # Each element is a (text, label) pair of tensors: decode the raw bytes to str
    # and unwrap the label to a NumPy scalar instead of keeping the tensor object.
    train_sent.append(sent.numpy().decode('utf8'))
    y_train.append(labels.numpy())

In [15]:
# Rebuild both lists inside this cell so that re-running it cannot duplicate rows
# (or fail on .append once y_test has been converted to an ndarray further down).
test_sent, y_test = [], []
for sent, labels in test:
    # Each element is a (text, label) pair of tensors: decode the raw bytes to str
    # and unwrap the label to a NumPy scalar instead of keeping the tensor object.
    test_sent.append(sent.numpy().decode('utf8'))
    y_test.append(labels.numpy())

In [16]:
len(train_sent), len(test_sent), len(y_train), len(y_test)

(25000, 25000, 25000, 25000)

In [17]:
# Convert the collected label lists into NumPy arrays before they are passed to fit().
y_train, y_test = np.array(y_train), np.array(y_test)

In [18]:
len(train_sent), len(test_sent), len(y_train), len(y_test)

(25000, 25000, 25000, 25000)

In [21]:
train_sent[3]

'This is the kind of film for a snowy Sunday afternoon when the rest of the world can go ahead with its own business as you descend into a big arm-chair and mellow for a couple of hours. Wonderful performances from Cher and Nicolas Cage (as always) gently row the plot along. There are no rapids to cross, no dangerous waters, just a warm and witty paddle through New York life at its best. A family film in every sense and one that deserves the praise it received.'

In [22]:
y_train[3]

1

In [23]:
np.unique(y_train)

array([0, 1])

In [24]:
# Seed TensorFlow and NumPy with the same fixed value before any model is built.
tf.random.set_seed(12345)
np.random.seed(12345)

In [25]:
# to encode text to int
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [26]:
# Limit the vocabulary to 1000 words.
VOCAB = 1000

# Number of dimensions learnt per word by the embedding layer.
EMBED_DIM = 32

# Maximum length (in tokens) of an encoded sentence.
MAXLEN = 100

In [27]:
# Tokenizer configured to lower-case the text, cap the word ids by VOCAB and
# use the "<UKW>" placeholder for out-of-vocabulary words.
token = Tokenizer(num_words=VOCAB, oov_token="<UKW>", lower=True)

In [28]:
# Fit the tokenizer on the training reviews only; the test reviews are
# encoded later with this same fitted tokenizer.
token.fit_on_texts(train_sent)

In [None]:
#token.word_index

In [29]:
# Encode both splits as lists of integer word ids using the fitted tokenizer.
train_sent_s, test_sent_s = (
    token.texts_to_sequences(texts) for texts in (train_sent, test_sent)
)

In [30]:
# Bring every training sequence to exactly MAXLEN ids: both the padding and
# the truncation are applied at the end ('post') of the sequence.
train_sent_s = pad_sequences(
    train_sent_s, padding='post', truncating='post', maxlen=MAXLEN
)

In [31]:
# Bring every test sequence to exactly MAXLEN ids, with the same 'post'
# padding/truncation settings used for the training sequences.
test_sent_s = pad_sequences(
    test_sent_s, padding='post', truncating='post', maxlen=MAXLEN
)

In [32]:
train_sent[3]

'This is the kind of film for a snowy Sunday afternoon when the rest of the world can go ahead with its own business as you descend into a big arm-chair and mellow for a couple of hours. Wonderful performances from Cher and Nicolas Cage (as always) gently row the plot along. There are no rapids to cross, no dangerous waters, just a warm and witty paddle through New York life at its best. A family film in every sense and one that deserves the praise it received.'

In [33]:
train_sent_s[3]

array([ 12,   7,   2, 241,   5,  20,  16,   4,   1,   1,   1,  52,   2,
       358,   5,   2, 180,  68, 138,   1,  17,  92, 203, 968,  15,  23,
         1,  81,   4, 192,   1,   1,   3,   1,  16,   4, 376,   5, 632,
       387, 352,  37,   1,   3,   1,   1,  15, 208,   1,   1,   2, 112,
       365,  48,  24,  55,   1,   6,   1,  55,   1,   1,  41,   4,   1,
         3,   1,   1, 141, 160, 780, 111,  31,  92, 116,   4, 221,  20,
         9, 173, 279,   3,  29,  13,   1,   2,   1,  10,   1,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0], dtype=int32)

In [34]:
len(train_sent), len(test_sent), len(y_train), len(y_test)

(25000, 25000, 25000, 25000)

# Embeddings

In [35]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, GlobalAveragePooling1D

In [36]:
# Start from an empty Sequential model; its layers are appended one by one with .add().
emb_model = Sequential()

In [37]:
# Embedding layer: one EMBED_DIM-sized vector per word id, for inputs of MAXLEN ids.
emb_model.add(
    Embedding(input_dim=VOCAB, output_dim=EMBED_DIM, input_length=MAXLEN)
)

In [38]:
# Averages over the sequence axis: the summary shows (None, 100, 32) going in and (None, 32) coming out.
emb_model.add(GlobalAveragePooling1D()) # Average Pooling layer

In [39]:
# Hidden fully connected layer: 128 ReLU units on top of the pooled embedding vector.
emb_model.add(Dense(128, activation = 'relu')) # Dense layer - intermediate FC layer

In [40]:
# Single sigmoid unit, matching the binary (0/1) labels and the binary_crossentropy loss.
emb_model.add(Dense(1, activation='sigmoid')) # Output layer

In [41]:
emb_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 32)           32000     
                                                                 
 global_average_pooling1d (  (None, 32)                0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 128)               4224      
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                                 
Total params: 36353 (142.00 KB)
Trainable params: 36353 (142.00 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
(32+1)*128

In [42]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy as the reported metric.
emb_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Number of words = 1000
# Each word is represented by 32 numbers/features
# Total number of representations that need to be learnt by the model = 32*1000 = 32000 -> Number of parameters for the Embedding layer

In [None]:
# word1 = [10, 2]
# word2 = [3,3]
# word3 = [4,7]
# sent1 = word1 word2 word3
# shape of sent1 = [3,2]  (3 words, 2 features per word)
# Global Average Pooling -> [(10+3+4)/3, (2+3+7)/3]

In [None]:
(32+1)*128 # Remember the bias

In [None]:
len(y_test)

In [43]:
# Train for 10 epochs; the test split is passed as the validation data.
result = emb_model.fit(
    train_sent_s,
    y_train,
    epochs=10,
    validation_data=(test_sent_s, y_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [56]:
# A single hand-written review (as a one-element list) used to try the trained model on unseen text.
new_reviews = ["I bought this for my husband who plays the piano.  He is having a wonderful time playing these old hymns.  The music  is at times hard to read because we think the book was published for singing from more than playing from.  Great purchase though!"]

In [57]:
new_reviews

['I bought this for my husband who plays the piano.  He is having a wonderful time playing these old hymns.  The music  is at times hard to read because we think the book was published for singing from more than playing from.  Great purchase though!']

In [58]:
# Encode the raw review text with the tokenizer that was fitted on the training reviews.
# NOTE(review): this rebinds `new_reviews` from a list of strings to a list of id
# sequences, so the cell is not safe to re-run on its own; re-run the cell that
# defines the raw text first.
new_reviews = token.texts_to_sequences(new_reviews)

In [59]:
# Pad/truncate the encoded review to MAXLEN ids with the same 'post' settings
# that were applied to the training and test sequences.
new_reviews = pad_sequences(
    new_reviews, padding='post', truncating='post', maxlen=MAXLEN
)

In [60]:
new_reviews

array([[ 11,   1,  12,  16,  59, 657,  35, 297,   2,   1,  27,   7, 258,
          4, 387,  56, 394, 132, 152,   1,   2, 226,   7,  31, 209, 252,
          6, 330,  86,  73, 102,   2, 272,  14,   1,  16,   1,  37,  51,
         72, 394,  37,  85,   1, 149,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0]], dtype=int32)

In [61]:
# Score the encoded review with the trained model (one sigmoid value per review).
# Presumably values near 1 mean "positive", since the positive sample review above carries label 1 — confirm.
emb_model.predict(new_reviews)



array([[0.9761543]], dtype=float32)

In [62]:
# layers[0] is the Embedding layer (the first layer added to emb_model).
e = emb_model.layers[0]
# First entry of its weight list; the printed shape below is (VOCAB, EMBED_DIM).
weights = e.get_weights()[0]
print(weights.shape)

(1000, 32)


In [63]:
# Mapping taken from the fitted tokenizer; it is indexed by word id below to recover the word.
reverse_word_index = token.index_word

In [None]:
#reverse_word_index

In [64]:
import io

# Export the learned embedding matrix as two aligned TSV files: one row of
# tab-separated vector components per word, and the matching word on the same
# row of the metadata file.
# The `with` block guarantees both files are flushed and closed even if a
# lookup raises part-way through the loop.
with io.open('vectors_emb_13Jan2024.tsv', 'w', encoding='utf-8') as vectors, \
        io.open('metadata_emb_13Jan2024.tsv', 'w', encoding='utf-8') as metadata:
    # Start at 1: id 0 is skipped (it is the filler value in the padded sequences).
    for word_num in range(1, VOCAB):
        word = reverse_word_index[word_num]
        embeddings = weights[word_num]
        metadata.write(word + "\n")
        vectors.write('\t'.join(str(x) for x in embeddings) + "\n")

# Bidirectional LSTM

In [None]:
# They said Teddy bears are on sale
# They said Teddy Roosevelt was a great president

In [None]:
from tensorflow.keras.layers import  LSTM, Bidirectional

In [None]:
# Second model: start from an empty Sequential container; layers are appended with .add().
bidi_model = Sequential()

In [None]:
# Embedding layer with the same VOCAB / EMBED_DIM / MAXLEN settings as the first model.
bidi_model.add(
    Embedding(input_dim=VOCAB, output_dim=EMBED_DIM, input_length=MAXLEN)
)

In [None]:
# 128-unit LSTM wrapped in Bidirectional, replacing the average-pooling step of the first model.
bidi_model.add(Bidirectional(LSTM(128)))

In [None]:
# Hidden fully connected layer: 128 ReLU units with an L2 penalty (default settings) on its kernel.
bidi_model.add(
    Dense(
        units=128,
        activation='relu',
        kernel_regularizer=tf.keras.regularizers.L2(),
    )
)

In [None]:
# Single sigmoid output unit, matching the binary (0/1) labels.
bidi_model.add(Dense(1, activation='sigmoid'))

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy as the reported metric.
bidi_model.compile(optimizer = 'adam',
                   loss = 'binary_crossentropy',
                   metrics = ['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() would add a stray "None" line after the table.
bidi_model.summary()

In [None]:
# Checkpoint callback: write 'MybestModelBIDI.h5' only when val_accuracy improves.
# `save_weights` is not a ModelCheckpoint argument; the documented name is
# `save_weights_only`. It is set to False because the file is restored later with
# tf.keras.models.load_model, which needs the full model and not just the weights.
cp = tf.keras.callbacks.ModelCheckpoint('MybestModelBIDI.h5',
                                        monitor='val_accuracy',
                                        save_weights_only=False,
                                        save_best_only=True,
                                        verbose=1)

In [None]:
# Train for 20 epochs in batches of 500, validating on the test split; the
# checkpoint callback `cp` is invoked during training.
result = bidi_model.fit(
    train_sent_s,
    y_train,
    batch_size=500,
    epochs=20,
    validation_data=(test_sent_s, y_test),
    callbacks=[cp],
)

In [None]:
# Reload the model from the file name that the ModelCheckpoint callback writes to.
new_model = tf.keras.models.load_model('MybestModelBIDI.h5')

In [None]:
# Score the already tokenised and padded review with the reloaded model.
new_model.predict(new_reviews)