In [None]:
#!pip install tensorflow
#!pip install -q tensorflow-datasets


In [56]:
import numpy as np
import sys
import warnings

import pickle

import re

from bs4 import BeautifulSoup # remove the html tags like <br> <tr> ....

import tensorflow as tf
from tensorflow.keras import datasets, preprocessing, models, layers,regularizers

if not sys.warnoptions:
    warnings.simplefilter("ignore")

print(tf.__version__)

2.4.1


### Load IMDB Dataset : Inclouding reviews and emotions 

In [2]:
import tensorflow_datasets as tfds # Tensorflow datasets 
imdb, info = tfds.load("imdb_reviews", with_info=True, as_supervised=True) # call the IMDB dataset


In [3]:
imdb

{'test': <PrefetchDataset shapes: ((), ()), types: (tf.string, tf.int64)>,
 'train': <PrefetchDataset shapes: ((), ()), types: (tf.string, tf.int64)>,
 'unsupervised': <PrefetchDataset shapes: ((), ()), types: (tf.string, tf.int64)>}


### Function to Remove Html Tags 

In [4]:

def strip_special_chars(st):
    st = BeautifulSoup(st, "lxml").text
    my_pattern = '[A-Za-z0-9.! ]+'
    return ''.join(re.findall(my_pattern, st))

### Build a Test and Train Dataset  

In [5]:
train_data, test_data = imdb['train'], imdb['test']

training_sentences = []
training_labels = []

testing_sentences = []
testing_labels = []

# str(s.tonumpy()) is needed in Python3 instead of just s.numpy()
for s,l in train_data:#get sentences and correspondent labels from the tensor dataset
    training_sentences.append(strip_special_chars(str(s.numpy()).lower()))
    #training_sentences.append(str(s.numpy()).lower())
    training_labels.append(l.numpy())
for s,l in test_data:
    testing_sentences.append(strip_special_chars(str(s.numpy()).lower()))
    #testing_sentences.append(str(s.numpy()).lower())
    testing_labels.append(l.numpy())

training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)

In [6]:
#### test a dataset shape and sentences

Dict={0:"Negative",1:"Positive"}


for i in range (4):
    
    print("the review is : ",Dict[training_labels_final[i]])
    print(training_sentences[i],"\n")



the review is :  Negative
bthis is a big step down after the surprisingly enjoyable original. this sequel isnt nearly as fun as part one and it instead spends too much time on plot development. tim thomerson is still the best thing about this series but his wisecracking is toned down in this entry. the performances are all adequate but this time the script lets us down. the action is merely routine and the plot is only mildly interesting so i need lots of silly laughs in order to stay entertained during a trancers movie. unfortunately the laughs are few and far between and so this film is watchable at best. 

the review is :  Negative
bperhaps because i was so young innocent and brainwashed when i saw it this movie was the cause of many sleepless nights for me. i havent seen it since i was in seventh grade at a presbyterian school so i am not sure what effect it would have on me now. however i will say that it left an impression on me... and most of my friends. it did serve its purpose

# Tokenizer function 
### Create the dictionary of known words as your bag of knowledge

In [7]:
vocab_size = 10000 # number of knowing Words 
embedding_dim = 16 # similarities dimension for word embedding function 
max_length = 200 # sentences length to build the model
trunc_type='post'
oov_tok = "<OOV>" # codding for unknowing words


from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# creat tokenizer 

tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok,filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True, split=' ', char_level=False,document_count=0)


# train the tokenizer #### it's required for the prodoction level as well 
tokenizer.fit_on_texts(training_sentences)

word_index = tokenizer.word_index


#creat the training set : 
sequences = tokenizer.texts_to_sequences(training_sentences)
#padded = pad_sequences(sequences,maxlen=max_length, truncating=trunc_type)
padded = pad_sequences(sequences,maxlen=max_length,padding="post")


testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences,maxlen=max_length,padding="post")



In [47]:
#### Save the Tokenizer and padding Functions 

pickle.dump(tokenizer, open( "tokenizer.pkl", "wb" ) )
#pickle.dump( tokenizer, open( "tokenizer.pkl", "wb" ) )






In [22]:
test="""thisanewword the problem is that when you have been watching this movie for an hour you will see the same fantasiesfunny situations again and again and again"""

test_token=tokenizer.texts_to_sequences([test])
test_paded=pad_sequences(test_token,maxlen=max_length,padding="post")

print(test ,"\n","*************** \n",test_token ,"\n","**************** \n",test_paded)

thisanewword the problem is that when you have been watching this movie for an hour you will see the same fantasiesfunny situations again and again and again 
 *************** 
 [[1, 2, 433, 7, 12, 52, 22, 25, 74, 147, 10, 17, 15, 33, 561, 22, 76, 65, 2, 166, 1, 1153, 170, 3, 170, 3, 170]] 
 **************** 
 [[   1    2  433    7   12   52   22   25   74  147   10   17   15   33
   561   22   76   65    2  166    1 1153  170    3  170    3  170    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    

In [23]:
class myCallback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs={}):
        if(logs.get('accuracy')>0.95):
            print("\nReached 95.% accuracy so cancelling training!")
            self.model.stop_training = True
callbacks = myCallback()

In [55]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    layers.SeparableConv1D(filters=10, kernel_size=5, strides=3, padding='same'),
    layers.BatchNormalization(),
    layers.MaxPooling1D(),
    layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(16, kernel_regularizer=regularizers.l1(0.001), activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(10, kernel_regularizer=regularizers.l1(0.001),activation='relu'),
    tf.keras.layers.Dropout(0.5),
    #tf.keras.layers.Dense(10, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
#model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy',tf.keras.metrics.AUC(),tf.keras.metrics.Recall(),tf.keras.metrics.TruePositives()])

model.summary()

NameError: name 'regularizers' is not defined

In [49]:
num_epochs = 500
model.fit(padded, training_labels_final, epochs=num_epochs, validation_data=(testing_padded, testing_labels_final),callbacks=[callbacks])

Epoch 1/500

Reached 95.% accuracy so cancelling training!


<tensorflow.python.keras.callbacks.History at 0x14db31d4ac8>

In [50]:
model.save("Sequential.h5")

In [None]:
### Test a New Sentence

In [51]:
def text_convertore(text,max_length=200):
    text=strip_special_chars(text.lower())
    test=tokenizer.texts_to_sequences([text])
    
    paded_test=pad_sequences(test,maxlen=max_length,padding="post")
    
    return paded_test
    

In [52]:
text="""
It has some iffy parts but in the end kids are exposed to much worse at school and being able to talk about it in a safe environment is great. Explain to your child that some of the behavior shown on the show is not OK.
"""
test=text_convertore(text,max_length=200)
print(test)

print(model.predict(test))
print(Dict[model.predict_classes(test)[0][0]])


[[   9   43   46    1  516   18    8    2  126  326   23 3899    6   72
   429   31  402    3  108  498    6  731   42    9    8    4 2266 2734
     7   84 1223    6  123  520   12   46    5    2 1979  602   20    2
   119    7   21  711    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0   

In [53]:
Test_1="""
it was one of the best things that ever happened in my life and i just finished it so i wanna thank all the cast and producers for making those such great moments.i love guys.

"""
Test_1=text_convertore(Test_1,max_length=200)
print(Test_1)

print(model.predict(Test_1))
print(Dict[model.predict_classes(Test_1)[0][0]])

[[   9   13   28    5    2  113  179   12  121  559    8   57  117    3
    11   41 1742    9   37   11 3394 1285   30    2  174    3 1139   15
   248  142  136   84  374   11  114  458    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0   

In [30]:
model.predict(test)

array([[0.20093167]], dtype=float32)

In [31]:
model.predict_classes(test)

array([[0]])

In [32]:
model.predict_proba(test)

array([[0.20093167]], dtype=float32)