# Sentiment Analyzer

In [1]:
import pandas as pd
import numpy as np

In [2]:
import tensorflow as tf

In [3]:
import tensorflow_datasets as tfds

In [4]:
# Load the IMDB reviews dataset through TensorFlow Datasets.
# The result is unpacked into the dataset dict and its metadata object.
imdb, info = tfds.load(
    'imdb_reviews',
    with_info=True,
    as_supervised=True,
)

In [8]:
# Pull the two splits used in this notebook out of the loaded dataset dict.
train = imdb['train']
test = imdb['test']

In [9]:
# Inspect what tfds returned: a tf.data dataset object rather than an in-memory array.
type(train)

tensorflow.python.data.ops.dataset_ops.PrefetchDataset

In [7]:
# Empty accumulators for the decoded review texts and their labels,
# one pair per split; the loops in the following cells fill them.
train_sent, test_sent = [], []
y_train, y_test = [], []

In [10]:
# Materialise the training split as plain Python/NumPy objects.
# Starting from fresh lists keeps this cell idempotent: re-running it no longer
# appends a second copy of every review to the existing lists.
train_sent, y_train = [], []
for sent, labels in train:
    # The text arrives as a tensor holding UTF-8 bytes; decode it to str.
    train_sent.append(sent.numpy().decode('utf8'))
    # Keep the NumPy scalar rather than the EagerTensor wrapper, so the later
    # np.array(y_train) is a cheap conversion of scalars.
    y_train.append(labels.numpy())

In [11]:
# Materialise the test split as plain Python/NumPy objects.
# Starting from fresh lists keeps this cell idempotent: re-running it no longer
# appends a second copy of every review to the existing lists.
test_sent, y_test = [], []
for sent, labels in test:
    # The text arrives as a tensor holding UTF-8 bytes; decode it to str.
    test_sent.append(sent.numpy().decode('utf8'))
    # Keep the NumPy scalar rather than the EagerTensor wrapper, so the later
    # np.array(y_test) is a cheap conversion of scalars.
    y_test.append(labels.numpy())

In [12]:
# Sanity check: every split should have as many labels as texts.
len(train_sent), len(test_sent), len(y_train), len(y_test)

(25000, 25000, 25000, 25000)

In [13]:
# Turn both label lists into NumPy arrays for Keras.
y_train, y_test = (np.array(split_labels) for split_labels in (y_train, y_test))

In [14]:
# Re-check the sizes after converting the labels to NumPy arrays.
len(train_sent), len(test_sent), len(y_train), len(y_test)

(25000, 25000, 25000, 25000)

In [17]:
# Peek at two decoded training reviews.
train_sent[3:5]

['This is the kind of film for a snowy Sunday afternoon when the rest of the world can go ahead with its own business as you descend into a big arm-chair and mellow for a couple of hours. Wonderful performances from Cher and Nicolas Cage (as always) gently row the plot along. There are no rapids to cross, no dangerous waters, just a warm and witty paddle through New York life at its best. A family film in every sense and one that deserves the praise it received.',
 'As others have mentioned, all the women that go nude in this film are mostly absolutely gorgeous. The plot very ably shows the hypocrisy of the female libido. When men are around they want to be pursued, but when no "men" are around, they become the pursuers of a 14 year old boy. And the boy becomes a man really fast (we should all be so lucky at this age!). He then gets up the courage to pursue his true love.']

In [18]:
# Labels belonging to the two reviews shown in the previous cell.
y_train[3:5]

array([1, 1])

In [19]:
# Confirm the task is binary: list the distinct label values.
np.unique(y_train)

array([0, 1])

In [20]:
# Single source of truth for the seed, so NumPy and TensorFlow cannot drift apart.
SEED = 12345
np.random.seed(SEED)       # seeds NumPy's legacy global RNG
tf.random.set_seed(SEED)   # seeds TensorFlow's global RNG

In [21]:
# to encode text to int
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [22]:
# Hyper-parameters shared by the tokenizer, the padding step and both models:
#   VOCAB     - vocabulary size given to Tokenizer(num_words=...) and Embedding
#   EMBED_DIM - number of features learned per word by the Embedding layer
#   MAXLEN    - length every encoded review is padded/truncated to
VOCAB, EMBED_DIM, MAXLEN = 1000, 32, 100

In [23]:
# Word-level tokenizer: lower-cases the text, caps the vocabulary at VOCAB
# and uses "<UKW>" as the out-of-vocabulary token.
token = Tokenizer(num_words=VOCAB, oov_token="<UKW>", lower=True)

In [24]:
# Build the word index from the training reviews only; the test reviews are not passed in.
token.fit_on_texts(train_sent)

In [28]:
#token.word_index

In [29]:
# Encode both splits as lists of integer word ids using the fitted tokenizer.
train_sent_s, test_sent_s = (
    token.texts_to_sequences(texts) for texts in (train_sent, test_sent)
)

In [30]:
# Force every training sequence to exactly MAXLEN ids:
# short reviews get zeros appended, long ones lose their tail.
train_sent_s = pad_sequences(
    train_sent_s,
    truncating='post',
    padding='post',
    maxlen=MAXLEN,
)

In [31]:
# Same padding/truncation rule as for the training split, applied to the test split.
test_sent_s = pad_sequences(
    test_sent_s,
    truncating='post',
    padding='post',
    maxlen=MAXLEN,
)

In [33]:
# Raw text of the first training review, for comparison with its encoded form.
train_sent[0]

"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it."

In [34]:
# The same review after tokenising and padding: MAXLEN integer ids.
# NOTE(review): the frequent 1s are presumably the out-of-vocabulary id - confirm via token.word_index.
train_sent_s[0]

array([ 12,  14,  33, 425, 392,  18,  90,  28,   1,   9,  32,   1,   1,
        40, 486,   1, 197,  24,  85, 154,  19,  12, 213, 329,  28,  66,
       247, 215,   9, 477,  58,  66,  85, 114,  98,  22,   1,  12,   1,
       643, 767,  12,  18,   7,  33, 400,   1, 176,   1, 416,   2,  89,
         1, 137,  69, 146,  52,   2,   1,   1,  69, 229,  66,   1,  16,
         1,   1,   1,   1,   1,   1,   3,  39,   1, 117,   1,  17,   1,
        14, 162,  19,   4,   1, 917,   1,   9,   4,  18,  13,  14,   1,
         5,  99, 145,   1,  11, 242, 683,  13,  48], dtype=int32)

In [32]:
# Sizes of the raw texts and labels, checked once more before modelling.
len(train_sent), len(test_sent), len(y_train), len(y_test)

(25000, 25000, 25000, 25000)

# Embeddings

In [35]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, GlobalAveragePooling1D

In [36]:
# Baseline model: an empty Sequential container; the following cells add its layers one by one.
emb_model = Sequential()

In [37]:
# Embedding layer: maps each of the VOCAB word ids to an EMBED_DIM-long vector
# for sequences of MAXLEN ids.
emb_model.add(
    Embedding(input_dim=VOCAB, output_dim=EMBED_DIM, input_length=MAXLEN)
)

In [38]:
# Average the word vectors over the sequence axis, leaving one EMBED_DIM-long vector per review.
emb_model.add(GlobalAveragePooling1D()) # Average Pooling layer

In [39]:
# Intermediate fully connected layer with 128 ReLU units.
emb_model.add(Dense(units=128, activation='relu'))

In [40]:
# Output layer: a single sigmoid unit, matching the binary 0/1 labels.
emb_model.add(Dense(units=1, activation='sigmoid'))

In [41]:
# Print the layer-by-layer output shapes and parameter counts.
emb_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 32)           32000     
                                                                 
 global_average_pooling1d (G  (None, 32)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 128)               4224      
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                                 
Total params: 36,353
Trainable params: 36,353
Non-trainable params: 0
_________________________________________________________________


In [42]:
# Parameter count of the 128-unit Dense layer: (32 inputs + 1 bias) per unit.
(32+1)*128

4224

In [43]:
# Configure training: binary cross-entropy loss, Adam optimizer, accuracy metric.
emb_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Number of words = 1000
# Each word is represented by 32 numbers/features
# Total number of representations that need to be learnt by the model = 32*1000 = 32000 -> Number of parameters for the Embedding layer

In [None]:
# word1 = [10, 2]
# word2 = [3,3]
# word3 = [4,7]
# sent1 = word1 word2 word3
# sent1 has shape [3, 2]: 3 words, 2 features each
# Global Average Pooling -> [(10+3+4)/3, (2+3+7)/3]

In [None]:
(32+1)*128 # Dense(128) parameters: (32 inputs + 1 bias) * 128 units - remember the bias

In [None]:
# Number of test labels that will be used as validation targets.
len(y_test)

In [44]:
# Train the baseline for 10 epochs.
# The test split is passed as validation data, so it is scored after every epoch.
result = emb_model.fit(
    x=train_sent_s,
    y=y_train,
    epochs=10,
    validation_data=(test_sent_s, y_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [45]:
# Example of an unseen review to score.
# NOTE(review): the next cell rebinds new_reviews, so this text is not the one that gets predicted.
new_reviews = ["I bought this for my husband who plays the piano.  He is having a wonderful time playing these old hymns.  The music  is at times hard to read because we think the book was published for singing from more than playing from.  Great purchase though!"]

In [60]:
# Short unseen review; this is the text the cells below encode and score.
new_reviews = ["This is hell good"]

In [61]:
# Encode the raw review strings with the tokenizer fitted on the training data.
# NOTE: this rebinds new_reviews from strings to integer sequences, so the cell is
# not idempotent - re-run the cell that defines the text before running it again.
new_reviews = token.texts_to_sequences(new_reviews)

In [62]:
# Pad/truncate the encoded review to MAXLEN with the same 'post' settings
# used for the training and test data.
new_reviews = pad_sequences(
    new_reviews,
    truncating='post',
    padding='post',
    maxlen=MAXLEN,
)

In [63]:
# Show the padded id sequence that will be fed to the model.
new_reviews

array([[ 12,   7, 607,  50,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0]], dtype=int32)

In [64]:
# Score the encoded review with the baseline model; the result is the sigmoid output of the last layer.
emb_model.predict(new_reviews)

array([[0.5307499]], dtype=float32)

In [50]:
# Extract the trained embedding matrix from the first layer of the baseline model.
e = emb_model.layers[0]
# get_weights() returns a list; its first entry is the embedding matrix itself.
weights = e.get_weights()[0]
# Expected shape: (VOCAB, EMBED_DIM)
print(weights.shape)

(1000, 32)


In [51]:
# Tokenizer's id -> word lookup, used to label the exported embedding vectors.
reverse_word_index = token.index_word

In [53]:
#reverse_word_index

In [54]:
import io
# Export the learned embeddings as two line-aligned TSV files:
#   vectors_*.tsv  - one tab-separated embedding vector per line
#   metadata_*.tsv - the word belonging to the vector on the same line
# The `with` block guarantees both files are closed even if the loop raises.
with io.open('vectors_emb_29Jul2023.tsv', 'w', encoding='utf-8') as vectors, \
        io.open('metadata_emb_29Jul2023.tsv', 'w', encoding='utf-8') as metadata:
    # Id 0 is skipped (the loop starts at 1). The upper bound is also capped by
    # the number of known words, so a vocabulary smaller than VOCAB cannot
    # raise KeyError.
    last_word_num = min(VOCAB, len(reverse_word_index) + 1)
    for word_num in range(1, last_word_num):
        word = reverse_word_index[word_num]
        embeddings = weights[word_num]
        metadata.write(word + "\n")
        vectors.write('\t'.join(str(x) for x in embeddings) + "\n")

# Bidirectional LSTM

In [None]:
# They said Teddy bears are on sale
# They said Teddy Roosevelt was a great president

In [65]:
from tensorflow.keras.layers import  LSTM, Bidirectional

In [66]:
# Second model: an empty Sequential container for the bidirectional-LSTM variant built below.
bidi_model = Sequential()

In [67]:
# Embedding layer with the same dimensions as in the baseline model.
bidi_model.add(
    Embedding(input_dim=VOCAB, output_dim=EMBED_DIM, input_length=MAXLEN)
)

In [68]:
# A 128-unit LSTM wrapped in Bidirectional, so the sequence is read in both directions.
bidi_model.add(Bidirectional(LSTM(units=128)))

In [69]:
# Fully connected ReLU layer with an L2 penalty (library-default strength) on its kernel.
bidi_model.add(
    Dense(
        units=128,
        kernel_regularizer=tf.keras.regularizers.L2(),
        activation='relu',
    )
)

In [70]:
# Output layer: a single sigmoid unit for the binary label.
bidi_model.add(Dense(units=1, activation='sigmoid'))

In [71]:
# Configure training with the same loss, optimizer and metric as the baseline model.
bidi_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
bidi_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 32)           32000     
                                                                 
 bidirectional (Bidirectiona  (None, 256)              164864    
 l)                                                              
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 1)                 129       
                                                                 
Total params: 229,889
Trainable params: 229,889
Non-trainable params: 0
_________________________________________________________________
None


In [72]:
# Checkpoint callback: after each epoch, write the model to MybestModelBIDI.h5,
# but only when val_accuracy has improved (save_best_only=True).
#
# The original passed `save_weights=True`, which is not a ModelCheckpoint
# option; the real name is `save_weights_only`. It is set to False on purpose:
# the notebook later restores this file with tf.keras.models.load_model, which
# needs the full model and not just its weights.
cp = tf.keras.callbacks.ModelCheckpoint(
    'MybestModelBIDI.h5',
    monitor='val_accuracy',
    save_weights_only=False,
    save_best_only=True,
    verbose=1,
)

In [None]:
# Train the bidirectional model for up to 20 epochs in batches of 500.
# The test split is the validation data; the checkpoint callback `cp` runs
# during training.
result = bidi_model.fit(
    x=train_sent_s,
    y=y_train,
    batch_size=500,
    epochs=20,
    validation_data=(test_sent_s, y_test),
    callbacks=[cp],
)

Epoch 1/20
Epoch 1: val_accuracy improved from -inf to 0.72164, saving model to MybestModelBIDI.h5
Epoch 2/20
Epoch 2: val_accuracy improved from 0.72164 to 0.78156, saving model to MybestModelBIDI.h5
Epoch 3/20

In [None]:
# Restore the model file written by the checkpoint callback during training.
new_model = tf.keras.models.load_model('MybestModelBIDI.h5')

In [None]:
# Score the same encoded review with the restored bidirectional model.
new_model.predict(new_reviews)