# Sentiment Analyzer

In [1]:
import pandas as pd
import numpy as np

In [2]:
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer

In [3]:
import tensorflow_datasets as tfds

In [4]:
# Fetch the IMDB reviews dataset as supervised pairs, plus its metadata object.
imdb, info = tfds.load(name='imdb_reviews', as_supervised=True, with_info=True)

Downloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteZY40RX/imdb_reviews-train.tfrecord…

Generating test examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteZY40RX/imdb_reviews-test.tfrecord*…

Generating unsupervised examples...:   0%|          | 0/50000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteZY40RX/imdb_reviews-unsupervised.t…

Dataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.


In [5]:
# Pull the two labelled splits out of the loaded dataset dictionary.
train = imdb['train']
test = imdb['test']

In [7]:
#info

In [8]:
# Inspect what kind of object the training split is.
type(train)

tensorflow.python.data.ops.prefetch_op._PrefetchDataset

In [9]:
# Empty containers for the decoded review texts and their labels
# (four independent lists, one per split/kind).
train_sent, test_sent = [], []
y_train, y_test = [], []

In [10]:
# Materialise the training split as plain Python lists.
# The lists are (re)created here so that re-running this cell does not append a
# second copy of the data, and does not fail once y_train has been turned into
# an ndarray by a later cell.
train_sent = []
y_train = []
for sent, labels in train:
    # The review text is decoded from UTF-8 bytes into a Python str.
    train_sent.append(sent.numpy().decode('utf8'))
    # Store the label as a NumPy scalar instead of a tensor object, so the
    # list holds plain numbers when it is later converted with np.array.
    y_train.append(labels.numpy())

In [11]:
# Materialise the test split as plain Python lists.
# The lists are (re)created here so that re-running this cell does not append a
# second copy of the data, and does not fail once y_test has been turned into
# an ndarray by a later cell.
test_sent = []
y_test = []
for sent, labels in test:
    # The review text is decoded from UTF-8 bytes into a Python str.
    test_sent.append(sent.numpy().decode('utf8'))
    # Store the label as a NumPy scalar instead of a tensor object, so the
    # list holds plain numbers when it is later converted with np.array.
    y_test.append(labels.numpy())

In [12]:
# Sanity check: texts and labels should have matching lengths within each split.
len(train_sent), len(test_sent), len(y_train), len(y_test)

(25000, 25000, 25000, 25000)

In [13]:
# Convert both label lists into NumPy arrays in one step.
y_train, y_test = map(np.array, (y_train, y_test))

In [15]:
# Peek at the first two decoded training reviews.
train_sent[0:2]

["This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.",
 'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep because the film was rubbish. The plot development wa

In [16]:
# Labels belonging to the first two training reviews.
y_train[0:2]

array([0, 0])

In [19]:
# Distinct label values present in each split (binary task: 0 and 1).
np.unique(y_train), np.unique(y_test)

(array([0, 1]), array([0, 1]))

In [17]:
# One seed value shared by both random number generators.
SEED = 12345
tf.random.set_seed(SEED)
np.random.seed(SEED)

In [18]:
# to encode text to int
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [20]:
# Hyper-parameters for text encoding and the embedding layer:
#   VOCAB     - vocabulary size limit passed to the tokenizer
#   EMBED_DIM - number of dimensions of each word embedding
#   MAXLEN    - length every encoded sentence is padded/truncated to
VOCAB, EMBED_DIM, MAXLEN = 1000, 32, 100

In [21]:
# Lower-casing tokenizer limited to VOCAB words, with an explicit
# out-of-vocabulary placeholder token.
token = Tokenizer(num_words=VOCAB, oov_token="<OOV>", lower=True)

In [22]:
# Build the tokenizer's word index from the training reviews only.
token.fit_on_texts(train_sent)

In [23]:
# Encode both splits as lists of integer word indices using the fitted tokenizer.
train_sent_s, test_sent_s = (
    token.texts_to_sequences(corpus) for corpus in (train_sent, test_sent)
)

In [24]:
# Bring every training sequence to exactly MAXLEN entries,
# padding and truncating at the end of the sequence.
train_pad_options = dict(maxlen=MAXLEN, padding='post', truncating='post')
train_sent_s = pad_sequences(train_sent_s, **train_pad_options)

In [25]:
# Bring every test sequence to exactly MAXLEN entries,
# padding and truncating at the end of the sequence.
test_pad_options = dict(maxlen=MAXLEN, padding='post', truncating='post')
test_sent_s = pad_sequences(test_sent_s, **test_pad_options)

In [26]:
# Re-check that the text and label collections still have matching lengths.
len(train_sent), len(test_sent), len(y_train), len(y_test)

(25000, 25000, 25000, 25000)

# Embeddings

In [27]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, GlobalAveragePooling1D

In [28]:
# Start from an empty Sequential model; the layers are added one per cell below.
emb_model = Sequential()

In [29]:
# Embedding layer: one EMBED_DIM-sized vector per vocabulary index,
# applied to input sequences of length MAXLEN.
embedding_layer = Embedding(input_dim=VOCAB,
                            output_dim=EMBED_DIM,
                            input_length=MAXLEN)
emb_model.add(embedding_layer)

In [30]:
# Average pooling layer: averages the word vectors over the sequence axis.
pooling_layer = GlobalAveragePooling1D()
emb_model.add(pooling_layer)

In [31]:
# Intermediate fully-connected layer with 128 ReLU units.
hidden_layer = Dense(units=128, activation='relu')
emb_model.add(hidden_layer)

In [32]:
# Output layer: a single sigmoid unit for the binary label.
output_layer = Dense(units=1, activation='sigmoid')
emb_model.add(output_layer)

In [33]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
emb_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 32)           32000     
                                                                 
 global_average_pooling1d (  (None, 32)                0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 128)               4224      
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                                 
Total params: 36353 (142.00 KB)
Trainable params: 36353 (142.00 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [36]:
# Parameter count of the output layer: (128 inputs + 1 bias) * 1 unit.
(128+1)*1

129

In [37]:
# Configure training: binary cross-entropy loss, Adam optimizer, accuracy metric.
emb_model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

In [None]:
# Number of words = 1000
# Each word is represented by 32 numbers/features
# Total number of representations that need to be learnt by the model = 32*1000 = 32000 -> Number of parameters for the Embedding layer

In [None]:
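# Worked example of global average pooling with 2-dimensional embeddings:
# word1 = [10, 2]
# word2 = [3, 3]
# word3 = [4, 7]
# sent1 = word1 word2 word3  -> matrix of shape [3, 2] (3 words x 2 features)
# Global Average Pooling -> [(10+3+4)/3, (2+3+7)/3] = [5.67, 4.0]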

In [None]:
# Parameter count of the 128-unit Dense layer: (32 inputs + 1 bias) per unit.
(32+1)*128 # Remember the bias

In [None]:
# Number of labels in the test split.
len(y_test)

In [38]:
# Train for 10 epochs on the padded training sequences; the test split is
# passed as validation data and evaluated at the end of every epoch.
validation_pair = (test_sent_s, y_test)
result = emb_model.fit(x=train_sent_s,
                       y=y_train,
                       epochs=10,
                       validation_data=validation_pair)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [45]:
#new_reviews = ["I bought this for my husband who plays the piano.  He is having a wonderful time playing these old hymns.  The music  is at times hard to read because we think the book was published for singing from more than playing from.  Great purchase though!"]

In [46]:
# Raw review text to score with the trained model (a list with one review).
new_reviews = [
    'This is a horribly bad product. Stopped working after few days. No response from customer service',
]

In [47]:
# Encode the raw review text as integer word indices with the fitted tokenizer.
# NOTE(review): this rebinds `new_reviews` from strings to sequences, so the cell
# is not idempotent; re-run the cell that assigns the raw text before re-running it.
new_reviews = token.texts_to_sequences(new_reviews)

In [48]:
# Pad/truncate the encoded review to MAXLEN, at the end of the sequence,
# using the same settings as the training data.
review_pad_options = dict(maxlen=MAXLEN, padding='post', truncating='post')
new_reviews = pad_sequences(new_reviews, **review_pad_options)

In [49]:
# Show the padded integer encoding that will be fed to the model.
new_reviews

array([[ 12,   7,   4,   1,  76,   1,   1, 778, 101, 169, 502,  55,   1,
         37,   1,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0]], dtype=int32)

In [50]:
# Score the padded review; the model's last layer is a single sigmoid unit.
# NOTE(review): presumably values near 0 correspond to label 0 (negative review) —
# confirm against the dataset's label encoding.
emb_model.predict(new_reviews)



array([[0.25803635]], dtype=float32)

In [51]:
# The first layer of the model is the Embedding layer.
e = emb_model.layers[0]
# Take the first array returned by get_weights() as the embedding matrix.
weights = e.get_weights()[0]
# Expected shape: one row per vocabulary index, one column per embedding dimension.
print(weights.shape)

(1000, 32)


In [52]:
# Index -> word lookup taken from the fitted tokenizer.
reverse_word_index = token.index_word

In [None]:
#reverse_word_index

In [54]:
import io

# Export the learned embeddings as two tab-separated files with matching rows:
#   vectors_emb_22Oct2023.tsv  - one embedding vector per line
#   metadata_emb_22Oct2023.tsv - the word belonging to that vector
# The `with` statement guarantees both files are flushed and closed even if a
# lookup or write fails part-way through the loop.
with io.open('vectors_emb_22Oct2023.tsv', 'w', encoding='utf-8') as vectors, \
     io.open('metadata_emb_22Oct2023.tsv', 'w', encoding='utf-8') as metadata:
    # Start at index 1: 0 is the value used for padding in the encoded
    # sequences, so only indices 1 .. VOCAB-1 are looked up as words.
    for word_num in range(1, VOCAB):
        word = reverse_word_index[word_num]
        embeddings = weights[word_num]
        metadata.write(word + "\n")
        vectors.write('\t'.join([str(x) for x in embeddings]) + "\n")