<a href="https://colab.research.google.com/github/Dansah2/Udacity_Tutorials/blob/main/Udacity_NLP_Embedding_Sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# extract dataset
!wget --no-check-certificate \
    -O /tmp/sentiment.csv https://drive.google.com/uc?id=13ySLC_ue6Umt9RJYSeM2t-V0kCv-4C-P

--2023-06-23 20:35:21--  https://drive.google.com/uc?id=13ySLC_ue6Umt9RJYSeM2t-V0kCv-4C-P
Resolving drive.google.com (drive.google.com)... 142.250.141.139, 142.250.141.102, 142.250.141.113, ...
Connecting to drive.google.com (drive.google.com)|142.250.141.139|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://doc-08-ak-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/j5hvf2aev0b0k363a84eejts9k5tjiss/1687552500000/11118900490791463723/*/13ySLC_ue6Umt9RJYSeM2t-V0kCv-4C-P?uuid=aede4b83-2757-415e-b0af-7d8598cc26ae [following]
--2023-06-23 20:35:22--  https://doc-08-ak-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/j5hvf2aev0b0k363a84eejts9k5tjiss/1687552500000/11118900490791463723/*/13ySLC_ue6Umt9RJYSeM2t-V0kCv-4C-P?uuid=aede4b83-2757-415e-b0af-7d8598cc26ae
Resolving doc-08-ak-docs.googleusercontent.com (doc-08-ak-docs.googleusercontent.com)... 142.251.2.132, 2607:f8b0:4023:c0d::84
Connecting to doc

In [3]:
import numpy as np
import pandas as pd

# load the downloaded CSV into a dataframe
dataset = pd.read_csv('/tmp/sentiment.csv')

# pull the review text and the sentiment labels out as plain Python lists
sentences = dataset['text'].tolist()
labels = dataset['sentiment'].tolist()

# positional 80/20 split: the first 80% of the rows are used for training and
# the remaining rows for testing (the rows are not shuffled first)
training_size = int(len(sentences) * 0.8)

training_sentences, testing_sentences = (
    sentences[:training_size],
    sentences[training_size:],
)
training_labels, testing_labels = (
    labels[:training_size],
    labels[training_size:],
)

# convert BOTH label lists into numpy arrays; these arrays are what the
# training cell passes to model.fit
training_labels_final = np.asarray(training_labels)
testing_labels_final = np.asarray(testing_labels)

In [4]:
# tokenize the data

# hyperparameters shared by the tokenizer, the padding step and the model
vocab_size = 1000
embedding_dim = 16
max_length = 100
truc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'

# instantiate the tokenizer and fit it on the training sentences only
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)

# word -> integer index mapping built by the tokenizer
word_index = tokenizer.word_index

# one set of padding/truncation options, applied identically to both splits
pad_options = dict(maxlen=max_length, padding=padding_type, truncating=truc_type)

# training sentences -> integer sequences -> fixed-length padded sequences
sequences = tokenizer.texts_to_sequences(training_sentences)
padded = pad_sequences(sequences, **pad_options)

# testing sentences go through exactly the same two steps
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, **pad_options)

In [5]:
# sanity check for the previous cell: invert the tokenizer's word -> index
# mapping so that padded integer sequences can be turned back into words
reverse_word_index = {index: word for word, index in word_index.items()}

def decode_review(text):
  """Turn a sequence of token indices back into a space-separated string.

  Each index is looked up in the module-level ``reverse_word_index``;
  indices that have no entry there are rendered as '?'.
  """
  words = []
  for token_id in text:
    words.append(reverse_word_index.get(token_id, '?'))
  return ' '.join(words)

# print one decoded training sequence next to its original sentence; the two
# should read alike, with '?' wherever an index has no word (e.g. padding)
sample_idx = 1
print(decode_review(padded[sample_idx]))
print(training_sentences[sample_idx])

good case excellent value ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?
Good case Excellent value.


In [6]:
# seed Python, NumPy and TensorFlow before any weights are created so that
# weight initialisation and training are repeatable on Restart & Run All
# (tf.keras.utils.set_random_seed requires TensorFlow >= 2.7)
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# build sentiment network:
#   Embedding : (max_length,) token indices -> (max_length, embedding_dim)
#   Flatten   : concatenates the per-token vectors into a single vector
#   Dense(6)  : small hidden layer with ReLU
#   Dense(1)  : sigmoid output, a score between 0 and 1
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# compile the model: binary cross-entropy matches the single sigmoid output
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# look at a summary of the model
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 16)           16000     
                                                                 
 flatten (Flatten)           (None, 1600)              0         
                                                                 
 dense (Dense)               (None, 6)                 9606      
                                                                 
 dense_1 (Dense)             (None, 1)                 7         
                                                                 
Total params: 25,613
Trainable params: 25,613
Non-trainable params: 0
_________________________________________________________________


In [7]:
# number of full passes over the training data
num_epochs = 10

# train on the padded training sequences and evaluate on the held-out
# testing split after every epoch
model.fit(
    x=padded,
    y=training_labels_final,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels_final),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fcf6c8a1c90>

In [8]:
# the embedding layer is the first layer of the model; the first entry of its
# weight list is the embedding matrix
e = model.layers[0]
layer_weights = e.get_weights()
weights = layer_weights[0]

# display the matrix shape as the cell output
weights.shape

(1000, 16)

In [9]:
import io

# write embedding vectors and metadata as two row-aligned TSV files:
# line k of vecs.tsv holds the tab-separated vector of the word on line k of
# meta.tsv

# iteration starts at 1 (as before); the upper bound is clamped to the number
# of known words so that a corpus with fewer than vocab_size - 1 distinct
# words no longer raises KeyError on reverse_word_index[word_num]
last_word_num = min(vocab_size, len(reverse_word_index) + 1)

# `with` guarantees both files are flushed and closed even if a write fails
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
  for word_num in range(1, last_word_num):
    word = reverse_word_index[word_num]
    embeddings = weights[word_num]
    out_m.write(word + '\n')
    out_v.write('\t'.join(str(x) for x in embeddings) + '\n')

In [10]:
# download the files: when the google.colab module can be imported, offer
# both TSV files for download; when the import fails, do nothing
try:
  from google.colab import files
except ImportError:
  pass
else:
  for tsv_name in ('vecs.tsv', 'meta.tsv'):
    files.download(tsv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [20]:
# predict sentiment using the model
fake_reviews = [
    'I love this phone', 'I hate spaghetti',
    'Everything was cold',
    'Everything was hot exactly as I wanted',
    'Everything was green',
    'the hose seated us immediately',
    'they gave us free chocolate cake and did not charge us',
    'not sure about the wilted flowers on the table',
    'only works when I stand on tippy toes',
    'does not work when I stand on my head',
    'I hate the food here'
]
print(fake_reviews)

# create sequences with the fitted tokenizer, then pad AND truncate them with
# the same settings as the training data (padding_type / truc_type from the
# tokenization cell); without an explicit `truncating`, pad_sequences falls
# back to its default ('pre'), which differs from the 'post' used in training
sample_sequences = tokenizer.texts_to_sequences(fake_reviews)
fakes_padded = pad_sequences(sample_sequences, maxlen=max_length,
                             padding=padding_type, truncating=truc_type)

print('\n Check out these reviews, they are completely real')

classes = model.predict(fakes_padded)

# the closer the score is to one, the more positive the predicted sentiment
for review, score in zip(fake_reviews, classes):
  print(review)
  print(score)
  print('\n')

['I love this phone', 'I hate spaghetti', 'Everything was cold', 'Everything was hot exactly as I wanted', 'Everything was green', 'the hose seated us immediately', 'they gave us free chocolate cake and did not charge us', 'not sure about the wilted flowers on the table', 'only works when I stand on tippy toes', 'does not work when I stand on my head', 'I hate the food here']

 Check out these reviews, they are completely real
I love this phone
[0.98650587]


I hate spaghetti
[0.09821965]


Everything was cold
[0.54673505]


Everything was hot exactly as I wanted
[0.5110122]


Everything was green
[0.54160285]


the hose seated us immediately
[0.7933802]


they gave us free chocolate cake and did not charge us
[0.6580141]


not sure about the wilted flowers on the table
[0.05286513]


only works when I stand on tippy toes
[0.91257036]


does not work when I stand on my head
[0.01860403]


I hate the food here
[0.31648389]


