In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
sentences = ['if your love was fake',
             'then abandoning you is a piece of cake',
             'hit it farghaly',
             'she is in the past',
             'and the past is not my concern',
             'now we play the offense',
             'you blew up my pub',
             'in the bleak midwinter']

VOCAB_SIZE = 40

In [4]:
import keras 
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)

word_dict = tokenizer.word_index
print('Words:Token\n', word_dict)

sequences = tokenizer.texts_to_sequences(sentences)
print('\nTokenized Sentences')
print(sequences)

print('\nOOV Sentence')
oov_text=['any of you boys faught in france?']
print(tokenizer.texts_to_sequences(oov_text))

Using TensorFlow backend.


Words:Token
 {'<OOV>': 1, 'the': 2, 'is': 3, 'you': 4, 'in': 5, 'past': 6, 'my': 7, 'if': 8, 'your': 9, 'love': 10, 'was': 11, 'fake': 12, 'then': 13, 'abandoning': 14, 'a': 15, 'piece': 16, 'of': 17, 'cake': 18, 'hit': 19, 'it': 20, 'farghaly': 21, 'she': 22, 'and': 23, 'not': 24, 'concern': 25, 'now': 26, 'we': 27, 'play': 28, 'offense': 29, 'blew': 30, 'up': 31, 'pub': 32, 'bleak': 33, 'midwinter': 34}

Tokenized Sentences
[[8, 9, 10, 11, 12], [13, 14, 4, 3, 15, 16, 17, 18], [19, 20, 21], [22, 3, 5, 2, 6], [23, 2, 6, 3, 24, 7, 25], [26, 27, 28, 2, 29], [4, 30, 31, 7, 32], [5, 2, 33, 34]]

OOV Sentence
[[1, 17, 4, 1, 1, 5, 1]]


In [5]:
from keras.preprocessing.sequence import pad_sequences

padded = pad_sequences(sequences, maxlen=10)
print('Padded Sequence\n', padded)

Padded Sequence
 [[ 0  0  0  0  0  8  9 10 11 12]
 [ 0  0 13 14  4  3 15 16 17 18]
 [ 0  0  0  0  0  0  0 19 20 21]
 [ 0  0  0  0  0 22  3  5  2  6]
 [ 0  0  0 23  2  6  3 24  7 25]
 [ 0  0  0  0  0 26 27 28  2 29]
 [ 0  0  0  0  0  4 30 31  7 32]
 [ 0  0  0  0  0  0  5  2 33 34]]


## Sentiment Real Data (AMAZON REVIEWS)

In [6]:
dataset = pd.read_csv('./Amazon Reviews/combined_data.csv', index_col=0)
display(dataset.head(2))


sentences = dataset['text'].tolist()
labels = dataset['sentiment'].tolist()


training_size = int(len(sentences) * 0.8)

training_sentences = sentences[0:training_size]
testing_sentences = sentences[training_size:]
training_labels = labels[0:training_size]
testing_labels = labels[training_size:]


training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)

Unnamed: 0,text,sentiment
0,So there is no way for me to plug it in here i...,0
1,Good case Excellent value.,1


In [7]:
VOCAB_SIZE = 500
EMBEDDING_DIM = 32
MAX_LENGTH = 50
OOV_TOK = "<OOV>"

tokenizer = Tokenizer(num_words=VOCAB_SIZE ,oov_token=OOV_TOK)
tokenizer.fit_on_texts(training_sentences)
sequences = tokenizer.texts_to_sequences(training_sentences)
padded = pad_sequences(sequences, maxlen=MAX_LENGTH)

test_sequences = tokenizer.texts_to_sequences(testing_sentences)
test_padded = pad_sequences(test_sequences, maxlen=MAX_LENGTH)

In [8]:
def decode_sequence(text, index_word):
    return ' '.join([index_word.get(word, '?') for word in text])

print('decoded:\n',decode_sequence(padded[1], tokenizer.index_word))
print('\noriginal:\n',training_sentences[1])

decoded:
 ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? good case excellent value

original:
 Good case Excellent value.


In [49]:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense, GlobalAveragePooling1D
from keras.layers import Bidirectional, LSTM

model = Sequential(
    [
        Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM,
                 input_shape=([MAX_LENGTH])),
        Bidirectional(LSTM(EMBEDDING_DIM, return_sequences=True)),
        Bidirectional(LSTM(EMBEDDING_DIM, return_sequences=True)),
        #Flatten(),
        GlobalAveragePooling1D(),
        Dense(128, activation='relu'),
        Dense(64, activation='relu'),
        Dense(1, activation='sigmoid')
    ]
)

model.compile(loss=keras.losses.BinaryCrossentropy(),
              optimizer='adam',
              metrics=['accuracy'])
history=model.fit(padded, training_labels_final, epochs=50, 
                  validation_data=(test_padded, testing_labels_final))

Train on 1593 samples, validate on 399 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [50]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 50, 32)            16000     
_________________________________________________________________
bidirectional_7 (Bidirection (None, 50, 64)            16640     
_________________________________________________________________
bidirectional_8 (Bidirection (None, 50, 64)            24832     
_________________________________________________________________
global_average_pooling1d_4 ( (None, 64)                0         
_________________________________________________________________
dense_8 (Dense)              (None, 128)               8320      
_________________________________________________________________
dense_9 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_10 (Dense)             (None, 1)                

### Visualizing Embeddings

In [51]:
embeddings_weights = model.layers[0].get_weights()[0]
print("Embedding Matrix's Shape", embeddings_weights.shape)

Embedding Matrix's Shape (500, 32)


### PCA

In [52]:
from sklearn.decomposition import PCA

pca = PCA(n_components=3)
three_d = pca.fit_transform(embeddings_weights)
print('Data Decomposed Shape', three_d.shape)

Data Decomposed Shape (500, 3)


### Embedded DataFrame

In [53]:
word_emb_df = pd.DataFrame(columns=['word', 'embedding'])
word_emb_df['word'] = np.array(list(tokenizer.word_index.keys()))[:VOCAB_SIZE-1]
word_emb_df.head(1)

Unnamed: 0,word,embedding
0,<OOV>,


In [54]:
embeddings = []
for word_num in range(0, VOCAB_SIZE-1):
    embeddings.append(three_d[word_num])

In [55]:
word_emb_df['embedding'] = embeddings
word_emb_df.tail(2)

Unnamed: 0,word,embedding
497,chicken,"[0.377891, -0.021889165, -0.018992584]"
498,town,"[0.1776778, -0.12323314, -0.023003444]"


### Embedding Dimesions

In [56]:
import plotly.express as px

words = list(tokenizer.word_index.keys())
fig = px.scatter_3d(
    data_frame=word_emb_df,
    x=three_d[:VOCAB_SIZE-1, 0],
    y=three_d[:VOCAB_SIZE-1, 1],
    z=three_d[:VOCAB_SIZE-1, 2],
    hover_data=['word'] 
)

fig.show()

In [57]:
def spherize(emb):
    return np.divide(emb, np.linalg.norm(np.array(emb), ord=2, keepdims=True))

In [58]:
word_emb_df['normalized'] = word_emb_df['embedding'].apply(lambda x:spherize(x))
word_emb_df.head(2)

Unnamed: 0,word,embedding,normalized
0,<OOV>,"[-0.057333782, 0.041911256, -0.12192316]","[-0.40633678, 0.29703403, -0.86409557]"
1,the,"[-0.06834805, 0.18181008, -0.02208801]","[-0.3496339, 0.930048, -0.112991035]"


In [59]:
np.array(word_emb_df['normalized'].tolist())

array([[-0.40633678,  0.29703403, -0.86409557],
       [-0.3496339 ,  0.930048  , -0.11299103],
       [-0.6358594 , -0.48657736, -0.59910375],
       ...,
       [ 0.81361693, -0.57132816, -0.10775646],
       [ 0.9970723 , -0.05775496, -0.05011228],
       [ 0.81709325, -0.5667167 , -0.10578676]], dtype=float32)

In [60]:
spherized_emb = np.array(word_emb_df['normalized'].tolist())

fig = px.scatter_3d(
    data_frame=word_emb_df,
    x=spherized_emb[:VOCAB_SIZE-1, 0],
    y=spherized_emb[:VOCAB_SIZE-1, 1],
    z=spherized_emb[:VOCAB_SIZE-1, 2],
    hover_data=['word'] 
)

fig.show()

In [65]:
# Use the model to predict a review   
fake_reviews = ['I loved the phone', 'I hate ceasar salad', 
                'Everything was cold',
                'burger was juicy', 
                'Everything was horrible', 
                'the host seated us immediately',
                'they gave us free chocolate cake', 
                'not sure about the flowers on the table',
               ]

print(fake_reviews) 

# Create the sequences
sample_sequences = tokenizer.texts_to_sequences(fake_reviews)
fakes_padded = pad_sequences(sample_sequences, maxlen=MAX_LENGTH)           


classes = model.predict(fakes_padded)

# The closer the class is to 1, the more positive the review is deemed to be
for x in range(len(fake_reviews)):
    print(fake_reviews[x])
    print(classes[x])
    print('\n')

['I loved the phone', 'I hate ceasar salad', 'Everything was cold', 'burger was juicy', 'Everything was horrible', 'the host seated us immediately', 'they gave us free chocolate cake', 'not sure about the flowers on the table']
I loved the phone
[0.99841607]


I hate ceasar salad
[0.00080779]


Everything was cold
[0.49635372]


burger was juicy
[0.0122986]


Everything was horrible
[0.001]


the host seated us immediately
[0.99510455]


they gave us free chocolate cake
[0.998548]


not sure about the flowers on the table
[0.00254988]


