In [6]:
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
from scipy.spatial.distance import cdist
import os
import glob


In [7]:
# Things from keras
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense,GRU,Embedding
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences


In [8]:
def load_data(train = True):
    #loading for sentiment ananlysis
    # its like conditional operator
    # it will return a list of reviews as text-strings and a list of corresponding sentimnts

    # part of the path-name for either train or test-set
    train_test_path = "train" if train else "test"

    # base directory where the extarated data s located
    dir_base = os.path.join(data_dir, "aclImdb", train_test_path)

    # Filename-patterns for the data-files
    path_pattern_pos = os.path.join(dir_base, "pos", "*.txt")
    path_pattern_neg = os.path.join(dir_base, "neg", "*.txt")
    
    # Get list of all the file-paths for the data
    path_pos = glob.glob(path_pattern_pos)
    path_neg = glob.glob(path_pattern_neg)

    # Read all the text_files
    data_pos = [_read_text_file(path) for path in path_pos]
    data_neg = [_read_text_file(path) for path in path_neg]

    # Concatenate the positive and negative data
    x = data_pos + data_neg
    
    #creating a list of sentiment for the text-data
    y = [1.0]*len(data_pos) + [0.0] * len(data_neg)

    return x,y
def _read_text_file(path):
    # Read and return alll the content of the text file with the given path

    with open(path, 'rt', encoding = 'utf-8') as file:
        # Read a list of string
        lines = file.readlines()

        # Concatenate to a single string.
        text = " ".join(lines)

    return text


In [9]:
#  saving the data in the location
data_dir = "D:\\Ongoing Work\\Tensorflow\\NLP-Sentiment analysis"

In [10]:
#loading the training and testing dataset
x_train_text,y_train = load_data(train = True)



In [11]:
x_test_text,y_test = load_data(train = False)


In [7]:
print("Train-set size: ",len(x_train_text))


Train-set size:  25000


In [8]:
print("Test-set size: ",len(x_test_text))

Test-set size:  25000


In [12]:
# combine into one dataset for some uses below
data_text = x_train_text + x_test_text
print(x_train_text[2])


Brilliant over-acting by Lesley Ann Warren. Best dramatic hobo lady I have ever seen, and love scenes in clothes warehouse are second to none. The corn on face is a classic, as good as anything in Blazing Saddles. The take on lawyers is also superb. After being accused of being a turncoat, selling out his boss, and being dishonest the lawyer of Pepto Bolt shrugs indifferently "I'm a lawyer" he says. Three funny words. Jeffrey Tambor, a favorite from the later Larry Sanders show, is fantastic here too as a mad millionaire who wants to crush the ghetto. His character is more malevolent than usual. The hospital scene, and the scene where the homeless invade a demolition site, are all-time classics. Look for the legs scene and the two big diggers fighting (one bleeds). This movie gets better each time I see it (which is quite often).


In [12]:
y_train[2]

1.0

In [13]:
# a nn cannot work directly on text-strings dataset so there is a step called tokenizer which
# converts words to integer and is done on the dataset before it is given as input to the nn
num_words = 10000
tokenizer = Tokenizer(num_words=num_words)

In [14]:
#  it then strips it from unwanted characters such as punctuations and converts it to a lower case
# then it bulids a vocabulary of all unique words along with varous datastructures 
# we fit the tokenizer on the entire data-set so it gathers words from both the training and test set

tokenizer.fit_on_texts(data_text)

In [15]:
tokenizer.word_index

{'the': 1,
 'and': 2,
 'a': 3,
 'of': 4,
 'to': 5,
 'is': 6,
 'br': 7,
 'in': 8,
 'it': 9,
 'i': 10,
 'this': 11,
 'that': 12,
 'was': 13,
 'as': 14,
 'for': 15,
 'with': 16,
 'movie': 17,
 'but': 18,
 'film': 19,
 'on': 20,
 'not': 21,
 'you': 22,
 'are': 23,
 'his': 24,
 'have': 25,
 'be': 26,
 'one': 27,
 'he': 28,
 'all': 29,
 'at': 30,
 'by': 31,
 'an': 32,
 'they': 33,
 'so': 34,
 'who': 35,
 'from': 36,
 'like': 37,
 'or': 38,
 'just': 39,
 'her': 40,
 'out': 41,
 'about': 42,
 'if': 43,
 "it's": 44,
 'has': 45,
 'there': 46,
 'some': 47,
 'what': 48,
 'good': 49,
 'when': 50,
 'more': 51,
 'very': 52,
 'up': 53,
 'no': 54,
 'time': 55,
 'my': 56,
 'even': 57,
 'would': 58,
 'she': 59,
 'which': 60,
 'only': 61,
 'really': 62,
 'see': 63,
 'story': 64,
 'their': 65,
 'had': 66,
 'can': 67,
 'me': 68,
 'well': 69,
 'were': 70,
 'than': 71,
 'much': 72,
 'we': 73,
 'bad': 74,
 'been': 75,
 'get': 76,
 'do': 77,
 'great': 78,
 'other': 79,
 'will': 80,
 'also': 81,
 'into': 82,
 'p

In [16]:
len(tokenizer.word_index)

124252

In [17]:
# then use the tokenizer to convert all texts in the training-set to lists of these tokens.
x_train_tokens = tokenizer.texts_to_sequences(x_train_text)

In [18]:
x_train_text[1]

'Homelessness (or Houselessness as George Carlin stated) has been an issue for years but never a plan to help those on the street that were once considered human who did everything from going to school, work, or vote for the matter. Most people think of the homeless as just a lost cause while worrying about things such as racism, the war on Iraq, pressuring kids to succeed, technology, the elections, inflation, or worrying if they\'ll be next to end up on the streets.<br /><br />But what if you were given a bet to live on the streets for a month without the luxuries you once had from a home, the entertainment sets, a bathroom, pictures on the wall, a computer, and everything you once treasure to see what it\'s like to be homeless? That is Goddard Bolt\'s lesson.<br /><br />Mel Brooks (who directs) who stars as Bolt plays a rich man who has everything in the world until deciding to make a bet with a sissy rival (Jeffery Tambor) to see if he can live in the streets for thirty days withou

In [19]:
np.array(x_train_tokens[1])


array([  38,   14,  744, 3506,   45,   75,   32, 1771,   15,  153,   18,
        110,    3, 1344,    5,  343,  143,   20,    1,  920,   12,   70,
        281, 1228,  395,   35,  115,  267,   36,  166,    5,  368,  158,
         38, 2058,   15,    1,  504,   88,   83,  101,    4,    1, 4339,
         14,   39,    3,  432, 1148,  136, 8697,   42,  177,  138,   14,
       2791,    1,  295,   20, 5276,  351,    5, 3029, 2310,    1,   38,
       8697,   43, 3611,   26,  365,    5,  127,   53,   20,    1, 2032,
          7,    7,   18,   48,   43,   22,   70,  358,    3, 2343,    5,
        420,   20,    1, 2032,   15,    3, 3346,  208,    1,   22,  281,
         66,   36,    3,  344,    1,  728,  730,    3, 3864, 1320,   20,
          1, 1543,    3, 1293,    2,  267,   22,  281, 2734,    5,   63,
         48,   44,   37,    5,   26, 4339,   12,    6, 2079,    7,    7,
       3425, 2891,   35, 4446,   35,  405,   14,  297,    3,  986,  128,
         35,   45,  267,    8,    1,  181,  366, 69

In [20]:
# we also need to convert the texts in the test-set to tokens
x_test_tokens = tokenizer.texts_to_sequences(x_test_text)

In [21]:
# The Recurrent Neural Network can take sequences of arbitrary length as input, but in order to use a whole batch of data,it need to have the same length so either we ensure that all are in the entire data-set have the same length, or we write a custom data-generator that ensures that it has the same length within each batch.
#first is simpler but if we use the length of the longest sequence in the data-set, then a lot of memory is wated which is a problem in large dataset.
# So we will use a sequence-length that covers most sequences in the data-set, and we will then truncate longer sequences and pad shorter sequences.
# First we count the number of tokens in all the sequences in the data-s
num_tokens = [len(tokens) for tokens in x_train_tokens + x_test_tokens]
num_tokens = np.array(num_tokens)
print(np.mean(num_tokens))
print(np.max(num_tokens))

221.27716
2209


In [22]:
# The max number of tokens we will allow is set to the average plus 2 standard deviations
max_tokens = np.mean(num_tokens)+ 2 * np.std(num_tokens)

#Converting the value to int
max_tokens = int(max_tokens)
print(max_tokens)

544


In [23]:
np.sum(num_tokens < max_tokens) / len(num_tokens)

0.9453

In [24]:
# now its imp to decide whether to do padding or truncating pre or post
# trucation means part of the sequence thrown away and padding means adding zeros at the front or at the end
# here we are using re bcoz we have set that model will know the text is starting and if we will do post then there is a cjance of forgetting as so many zeros will come

pad = 'pre'
# but when we aree truncating we may loose some important information or features then we have to make compromise

In [25]:
x_train_pad = pad_sequences(x_train_tokens, maxlen=max_tokens,
                            padding=pad, truncating=pad)

In [26]:
x_train_pad

array([[   0,    0,    0, ...,   12,    9,  213],
       [   0,    0,    0, ...,    5,  343,  400],
       [   0,    0,    0, ...,    6,  179,  403],
       ...,
       [   0,    0,    0, ...,   17,   96,   74],
       [   0,    0,    0, ...,  260, 1219,  793],
       [   0,    0,    0, ...,   11,    6, 1377]])

In [27]:

x_test_pad = pad_sequences(x_test_tokens, maxlen=max_tokens,
                           padding=pad, truncating=pad)

In [28]:
# training set into one big matrix of integers with this shape
x_train_pad.shape

(25000, 544)

In [29]:
x_test_pad.shape

(25000, 544)

In [30]:
#  now inverse map
# keras implememntation token doesnot seem to have inverse from integer tokens back to words
idx = tokenizer.word_index
inverse_map = dict(zip(idx.values(), idx.keys()))

In [31]:

# Helper-function for converting a list of tokens back to a string of words.
def tokens_to_string(tokens):
    # Map from tokens back to words.
    words = [inverse_map[token] for token in tokens if token != 0]
    
    # Concatenate all words.
    text = " ".join(words)

    return text

In [32]:
# Lets see how well it converts 
x_train_text[1]

'Homelessness (or Houselessness as George Carlin stated) has been an issue for years but never a plan to help those on the street that were once considered human who did everything from going to school, work, or vote for the matter. Most people think of the homeless as just a lost cause while worrying about things such as racism, the war on Iraq, pressuring kids to succeed, technology, the elections, inflation, or worrying if they\'ll be next to end up on the streets.<br /><br />But what if you were given a bet to live on the streets for a month without the luxuries you once had from a home, the entertainment sets, a bathroom, pictures on the wall, a computer, and everything you once treasure to see what it\'s like to be homeless? That is Goddard Bolt\'s lesson.<br /><br />Mel Brooks (who directs) who stars as Bolt plays a rich man who has everything in the world until deciding to make a bet with a sissy rival (Jeffery Tambor) to see if he can live in the streets for thirty days withou

In [33]:
tokens_to_string(x_train_tokens[1])

"or as george stated has been an issue for years but never a plan to help those on the street that were once considered human who did everything from going to school work or vote for the matter most people think of the homeless as just a lost cause while worrying about things such as racism the war on iraq kids to succeed technology the or worrying if they'll be next to end up on the streets br br but what if you were given a bet to live on the streets for a month without the you once had from a home the entertainment sets a bathroom pictures on the wall a computer and everything you once treasure to see what it's like to be homeless that is lesson br br mel brooks who directs who stars as plays a rich man who has everything in the world until deciding to make a bet with a sissy rival to see if he can live in the streets for thirty days without the if succeeds he can do what he wants with a future project of making more buildings the on where is thrown on the street with a on his leg t

In [34]:
# now we will create the rnn
model = Sequential()
# The first layer in the RNN is a so-called Embedding-layer which converts each integer-token into a vector of values

In [35]:
# each integer token will be converted to a vector of length 8
embedding_size = 8

In [36]:
# embedding layer also need the number of words inthe vocabulary and the length of the padded token sequence
model.add(Embedding(input_dim=num_words,
                    output_dim=embedding_size,
                    input_length=max_tokens,
                    name='layer_embedded'))

In [37]:
# Adding the first recurrent layer: 
# Gated Recurrent Unit 
# Here we want an output dimensionality of 16
model.add(GRU(16, return_sequences=True))

In [38]:
# This adds the second GRU with 8 output units. This will be followed by another GRU so it must also return sequences.

model.add(GRU(units=8, return_sequences=True))

In [39]:
# This adds the third and final GRU with 4 output units. This will be followed by a dense-layer, so it should only give the final output of the GRU and not a whole sequence of outputs.
model.add(GRU(units=4))

In [40]:
# Add a fully-connected / dense layer which computes a value between 0.0 and 1.0 that will be used as the classification output.

model.add(Dense(1, activation='sigmoid'))

In [41]:
optimizer = Adam(lr=1e-3)

In [42]:

model.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])

In [43]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
layer_embedded (Embedding)   (None, 544, 8)            80000     
_________________________________________________________________
gru (GRU)                    (None, 544, 16)           1200      
_________________________________________________________________
gru_1 (GRU)                  (None, 544, 8)            600       
_________________________________________________________________
gru_2 (GRU)                  (None, 4)                 156       
_________________________________________________________________
dense (Dense)                (None, 1)                 5         
Total params: 81,961
Trainable params: 81,961
Non-trainable params: 0
_________________________________________________________________


In [44]:
# we are using the data-set with the padded sequences. We use 5% of the training-set as a small validation-set, so we have a rough idea whether the model is generalizing well or if it is perhaps overfitting to the training dataset

model.fit(x_train_pad, y_train,
          validation_split=0.05, epochs=3, batch_size=64)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 23750 samples, validate on 1250 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x1d9f93f3fd0>

In [49]:
# calculating its classification accuracy on the test set
# %%time
result = model.evaluate(x_test_pad, y_test)



In [50]:
print("Accuracy: {0:.2%}".format(result[1]))

Accuracy: 85.94%


In [51]:
# to show thw misclassified text we will first calculate the predicted sentiment in the first 1000 texts in the test text
# %%time
y_pred = model.predict(x=x_test_pad[0:1000])
y_pred = y_pred.T[0]
print(y_pred)

[0.95582455 0.4011923  0.81218326 0.9506733  0.9579177  0.31032786
 0.95552397 0.8799609  0.9170641  0.96540195 0.21240501 0.9555901
 0.9537462  0.9236258  0.18779394 0.96319073 0.9672125  0.68618256
 0.9645098  0.9687168  0.82553697 0.9580447  0.9637562  0.9606963
 0.81565493 0.9686631  0.84626734 0.07441531 0.7838693  0.9638453
 0.95521224 0.96644306 0.95134133 0.9555901  0.9617785  0.72868216
 0.93510616 0.95884895 0.92825264 0.9023125  0.96157026 0.9580709
 0.03907356 0.9198529  0.79178154 0.9468232  0.9645281  0.7424391
 0.95718867 0.9239586  0.9574195  0.58071107 0.96441287 0.9634717
 0.95826524 0.95744187 0.9658411  0.9600708  0.95875955 0.9644537
 0.93277055 0.9660344  0.95630336 0.96499515 0.94352305 0.8836047
 0.8961366  0.96077037 0.96561915 0.05895564 0.9670474  0.9663078
 0.9656525  0.05715375 0.69606876 0.9638231  0.8865112  0.91913843
 0.96264106 0.95291173 0.9669572  0.04157428 0.88417774 0.964602
 0.96145946 0.95691633 0.9227511  0.940911   0.805407   0.9663057
 0.4681

In [52]:
cls_pred= np.array([1.0 if p>0.5 else 0.0 for p in y_pred])

In [53]:
cls_true = np.array(y_test[0:1000])

In [54]:
# -We can then get indices for all the texts that were incorrectly classified by comparing all the "clases" of these two arrays.
incorrect = np.where(cls_pred != cls_true)
print(incorrect)

incorrect = incorrect[0]
print(incorrect)

(array([  1,   5,  10,  14,  27,  42,  69,  73,  81,  90,  99, 101, 115,
       118, 143, 144, 145, 154, 157, 166, 168, 172, 178, 179, 182, 188,
       199, 206, 208, 211, 215, 221, 222, 223, 232, 233, 236, 237, 238,
       239, 242, 243, 247, 272, 280, 287, 296, 304, 306, 321, 341, 354,
       357, 359, 360, 370, 371, 379, 392, 393, 394, 395, 396, 397, 400,
       402, 406, 417, 426, 432, 434, 436, 437, 438, 439, 442, 443, 445,
       448, 451, 454, 456, 461, 471, 483, 497, 504, 508, 552, 553, 556,
       557, 560, 561, 563, 566, 567, 570, 576, 592, 600, 601, 602, 603,
       604, 607, 611, 613, 645, 664, 667, 669, 671, 675, 677, 688, 709,
       714, 717, 744, 745, 749, 751, 758, 764, 767, 768, 769, 771, 772,
       773, 774, 778, 779, 787, 794, 811, 819, 834, 839, 842, 870, 871,
       872, 873, 882, 886, 891, 915, 925, 930, 932, 946, 954, 961, 963,
       964, 965, 971, 979, 996, 997], dtype=int64),)
[  1   5  10  14  27  42  69  73  81  90  99 101 115 118 143 144 145 154
 157 166 

In [55]:
#out of 1000 how mqny are mis classified
len(incorrect)

162

In [56]:
# first misclassified text
idx = incorrect[0]

idx

1

In [57]:
# first mis classified text is
text = x_test_text[idx]
text

'Actor turned director Bill Paxton follows up his promising debut, the Gothic-horror "Frailty", with this family friendly sports drama about the 1913 U.S. Open where a young American caddy rises from his humble background to play against his Bristish idol in what was dubbed as "The Greatest Game Ever Played." I\'m no fan of golf, and these scrappy underdog sports flicks are a dime a dozen (most recently done to grand effect with "Miracle" and "Cinderella Man"), but some how this film was enthralling all the same.<br /><br />The film starts with some creative opening credits (imagine a Disneyfied version of the animated opening credits of HBO\'s "Carnivale" and "Rome"), but lumbers along slowly for its first by-the-numbers hour. Once the action moves to the U.S. Open things pick up very well. Paxton does a nice job and shows a knack for effective directorial flourishes (I loved the rain-soaked montage of the action on day two of the open) that propel the plot further or add some unexpec

In [58]:
y_pred[idx]

0.4011923

In [59]:
cls_true[idx] 

1.0

In [60]:
# new data
text1 = "This movie is fantastic! I really like it because it is so good!"
text2 = "Good movie!"
text3 = "Maybe I like this movie."
text4 = "Meh ..."
text5 = "If I were a drunk teenager then this movie might be good."
text6 = "Bad movie!"
text7 = "Not a good movie!"
text8 = "This movie really sucks! Can I get my money back please?"
texts = [text1, text2, text3, text4, text5, text6, text7, text8]

In [61]:
# We first convert these texts to arrays of integer-tokens
tokens = tokenizer.texts_to_sequences(texts)

In [62]:
# To input texts with different lengths into the model, we also need to pad and truncate them.
tokens_pad = pad_sequences(tokens, maxlen=max_tokens,
                           padding=pad, truncating=pad)
tokens_pad.shape

(8, 544)

In [63]:
# use the trained model to predict the sentiment for these texts
model.predict(tokens_pad)

array([[0.9469527 ],
       [0.9029197 ],
       [0.8120991 ],
       [0.87895447],
       [0.7920635 ],
       [0.5935842 ],
       [0.87700737],
       [0.30296597]], dtype=float32)

In [65]:
# First we need to get the embedding-layer from the model:
layer_embedding = model.get_layer('layer_embedded')

In [67]:
#  then get the weights used for the mapping done by the embedding-layer.
weights_embedding = layer_embedding.get_weights()[0]

In [68]:
weights_embedding.shape

(10000, 8)

In [69]:
# get the integer-token for the word 'good', which is just an index into the vocabulary.
token_good = tokenizer.word_index['good']
token_good

49

In [71]:
token_great = tokenizer.word_index['great']
token_great

78

In [72]:
weights_embedding[token_good]

array([-0.00690045,  0.03755159, -0.00129658, -0.05404105,  0.04052211,
        0.03088633,  0.01241427, -0.06749809], dtype=float32)

In [73]:
weights_embedding[token_great]

array([-0.15195422,  0.07776609, -0.07033638, -0.13806769,  0.10075531,
        0.1582286 ,  0.09626482, -0.06160773], dtype=float32)

In [74]:
# embedding-vectors can be measured by different metrics, e.g. Euclidean distance or cosine distance.

def print_sorted_words(word, metric='cosine'):
    """
    Print the words in the vocabulary sorted according to their
    embedding-distance to the given word.
    Different metrics can be used, e.g. 'cosine' or 'euclidean'.
    """

    # Get the token (i.e. integer ID) for the given word.
    token = tokenizer.word_index[word]

    # Get the embedding for the given word. Note that the
    # embedding-weight-matrix is indexed by the word-tokens
    # which are integer IDs.
    embedding = weights_embedding[token]

    # Calculate the distance between the embeddings for
    # this word and all other words in the vocabulary.
    distances = cdist(weights_embedding, [embedding],
                      metric=metric).T[0]
    
    # Get an index sorted according to the embedding-distances.
    # These are the tokens (integer IDs) for words in the vocabulary.
    sorted_index = np.argsort(distances)
    
    # Sort the embedding-distances.
    sorted_distances = distances[sorted_index]
    
    # Sort all the words in the vocabulary according to their
    # embedding-distance. This is a bit excessive because we
    # will only print the top and bottom words.
    sorted_words = [inverse_map[token] for token in sorted_index
                    if token != 0]

    # Helper-function for printing words and embedding-distances.
    def _print_words(words, distances):
        for word, distance in zip(words, distances):
            print("{0:.3f} - {1}".format(distance, word))

    # Number of words to print from the top and bottom of the list.
    k = 10

    print("Distance from '{0}':".format(word))

    # Print the words with smallest embedding-distance.
    _print_words(sorted_words[0:k], sorted_distances[0:k])

    print("...")

    # Print the words with highest embedding-distance.
    _print_words(sorted_words[-k:], sorted_distances[-k:])

In [75]:
print_sorted_words('great', metric='cosine')

Distance from 'great':
0.000 - great
0.010 - tends
0.011 - lesbians
0.013 - poetry
0.014 - devotion
0.014 - loudly
0.016 - rouge
0.018 - creates
0.018 - captures
0.019 - 1972
...
1.982 - investigating
1.982 - unlikeable
1.983 - lousy
1.983 - indians
1.983 - moore
1.985 - unfunny
1.985 - scantily
1.985 - speaking
1.989 - overlong
1.992 - consists


In [76]:
print_sorted_words('worst', metric='cosine')

Distance from 'worst':
0.000 - worst
0.005 - forgettable
0.005 - supposed
0.006 - mst3k
0.008 - flimsy
0.008 - amateurish
0.008 - hack
0.008 - pepper
0.008 - smarmy
0.008 - obnoxious
...
1.991 - rookie
1.991 - lingering
1.991 - compositions
1.991 - loved
1.993 - perfect
1.993 - twists
1.993 - exceptional
1.995 - impressed
1.995 - greatest
1.998 - 7
