# !pip install -q  tensorflow-datasets

In [2]:
import tensorflow as tf

In [3]:
import tensorflow_datasets as tfds

In [4]:
imdb , info = tfds.load(name="imdb_reviews",with_info=True,as_supervised=True)

In [5]:
import numpy as np

In [6]:
train_data , test_data = imdb['train'],imdb['test']

In [7]:
train_sentence = []
train_label    = []
test_sentence  = []
test_label     = []

for s,l in train_data:
    train_sentence.append(s.numpy())
    train_label.append(l.numpy())
    
for s,l in test_data:
    test_sentence.append(s.numpy())
    test_label.append(l.numpy())

In [8]:
str(train_sentence[0])[2:-1]

'This is a big step down after the surprisingly enjoyable original. This sequel isn\\\'t nearly as fun as part one, and it instead spends too much time on plot development. Tim Thomerson is still the best thing about this series, but his wisecracking is toned down in this entry. The performances are all adequate, but this time the script lets us down. The action is merely routine and the plot is only mildly interesting, so I need lots of silly laughs in order to stay entertained during a "Trancers" movie. Unfortunately, the laughs are few and far between, and so, this film is watchable at best.'

In [9]:
train_label[0]

0

In [10]:
train_label_final = np.array(train_label)
test_label_final  = np.array(test_label)

In [14]:
for i in range(len(test_sentence)):
    test_sentence[i] = str(test_sentence[i])[2:-1]

In [15]:
for i in range(len(train_sentence)):
    train_sentence[i] = str(train_sentence[i])[2:-1]

In [11]:
vocab_size = 10000
emdedding_dim = 16
max_len = 120
trunc_type = 'post'
oov_tok = '<OOV>'

In [12]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [16]:
tokenizer = Tokenizer(num_words=vocab_size,oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentence)
word_index = tokenizer.word_index
sequence = tokenizer.texts_to_sequences(train_sentence)
padded = pad_sequences(sequence,maxlen=max_len,truncating=trunc_type)

In [17]:
len(sequence[0])

107

In [18]:
test_sequence = tokenizer.texts_to_sequences(test_sentence)
test_padded   = pad_sequences(test_sequence,maxlen=max_len)

In [19]:
model1 = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(vocab_size,emdedding_dim,input_length=max_len),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(6,activation="relu"),
    tf.keras.layers.Dense(1,activation="sigmoid")
])

In [20]:
model1.compile(optimizer='adam',loss=tf.keras.losses.binary_crossentropy,metrics=['acc'])

In [21]:
model1.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 120, 16)           160000    
_________________________________________________________________
flatten (Flatten)            (None, 1920)              0         
_________________________________________________________________
dense (Dense)                (None, 6)                 11526     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 7         
Total params: 171,533
Trainable params: 171,533
Non-trainable params: 0
_________________________________________________________________


In [22]:
history = model1.fit(padded,train_label_final,epochs=10,validation_data=(test_padded,test_label_final))

Train on 25000 samples, validate on 25000 samples
Epoch 1/10
Epoch 2/10


Epoch 3/10


Epoch 4/10


Epoch 5/10


Epoch 6/10


Epoch 7/10


Epoch 8/10


Epoch 9/10


Epoch 10/10




In [23]:
weights = model1.layers[0].get_weights()[0]

In [24]:
weights.shape

(10000, 16)

In [25]:
reverse_word_index = dict([(value,key) for (key,value) in word_index.items()])

In [26]:
import io

In [27]:
out_v = io.open("vecs.tsv","w",encoding='utf-8')
out_m = io.open("meta.tsv","w",encoding='utf-8')

In [28]:
for word_num in range(1,vocab_size):
    word = reverse_word_index[word_num]
    embedding = weights[word_num]
    out_m.write(word+'\n')
    out_v.write('\t'.join([str(x) for x in embedding]) + '\n')
out_m.close()
out_v.close()

In [29]:
weights[0]

array([ 0.02825627,  0.00834996, -0.04371049,  0.02685196,  0.03628504,
        0.00297034,  0.00527533, -0.01787513, -0.00093686, -0.03207723,
       -0.01693283, -0.01183231, -0.0505828 ,  0.07710297, -0.01004159,
       -0.04056597], dtype=float32)

In [32]:
reverse_word_index[5]

'of'

In [33]:
len(word_index)

84459

In [53]:
sen = ["i am ankit","hi there"]
tk = Tokenizer(3)

In [54]:
tk.fit_on_texts(sen)
print(tk.word_index)

{'i': 1, 'am': 2, 'ankit': 3, 'hi': 4, 'there': 5}


In [55]:
tk.texts_to_sequences(sen)

[[1, 2], []]