In [43]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
#!pip install tensorflow keras gensim scikit-learn

import numpy as np
import tensorflow as tf

from keras.datasets import imdb
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from gensim.models import Word2Vec
from gensim.models import FastText

In [0]:
def load_data(vocab_size,max_len):
    """
        Loads the keras imdb dataset

        Args:
            vocab_size = {int} the size of the vocabulary
            max_len = {int} the maximum length of input considered for padding
                      (kept for interface compatibility; not used by this function)

        Returns:
            X_train = tokenized train data
            X_test = tokenized test data
            y_train = train labels
            y_test = test labels

    """
    INDEX_FROM = 3

    # save np.load so it can be restored afterwards
    np_load_old = np.load

    def _np_load_allow_pickle(*args, **kwargs):
        # Force allow_pickle=True while forwarding every other argument
        # unchanged (the previous lambda silently dropped all keyword arguments).
        kwargs["allow_pickle"] = True
        return np_load_old(*args, **kwargs)

    # temporarily patch np.load for the duration of the dataset load
    np.load = _np_load_allow_pickle
    try:
        (X_train,y_train),(X_test,y_test) = imdb.load_data(num_words = vocab_size,index_from = INDEX_FROM)
    finally:
        # restore np.load for future normal usage, even if loading failed
        np.load = np_load_old

    print(len(X_train), len(X_test), len(y_train), len(y_test), "#####################################")

    return X_train,X_test,y_train,y_test


In [0]:
def prepare_data_for_word_vectors_imdb(X_train, index_from=3):
    """
        Converts index-encoded imdb reviews back into lists of word tokens

        Args:
            X_train = tokenized train data (each review is a sequence of word indexes)
            index_from = {int} offset added to every index returned by
                         imdb.get_word_index(); pass the same value that was
                         given to imdb.load_data (defaults to 3)

        Returns:
            sentences = {list} sentences containing words as tokens
            word_to_index = {dict} word and its (shifted) index in whole of imdb corpus,
                            plus the "<START>" (1) and "<UNK>" (2) markers

        Raises:
            KeyError = if a review contains an index with no known word

    """
    # Shift the corpus indexes by the same offset the encoded reviews use.
    word_to_index = {word: rank + index_from for word, rank in imdb.get_word_index().items()}
    word_to_index["<START>"] =1
    word_to_index["<UNK>"]=2

    # Reverse lookup used to decode the reviews.
    index_to_word = {index: word for word, index in word_to_index.items()}

    sentences = [[index_to_word[token_id] for token_id in review] for review in X_train]

    return sentences,word_to_index

In [0]:

def building_word_vector_model(option,sentences,embed_dim,window):
    """
        Builds the word vector model

        Args:
            option = {int} 0 for Word2vec. 1 for gensim FastText.
            sentences = {list} list of tokenized words
            embed_dim = {int} embedding dimension of the word vectors
            window = {int} max distance between current and predicted word

        Returns:
            model = Word2vec / gensim FastText model trained on the training corpus

        Raises:
            ValueError = if option is neither 0 nor 1

    """
    # NOTE: `size` is the gensim 3.x keyword; gensim 4.x renamed it to `vector_size`.
    if option == 0:
        print("Training a word2vec model")
        model = Word2Vec(sentences=sentences, size = embed_dim, window = window)
    elif option == 1:
        print("Training a Gensim FastText model")
        model = FastText(sentences=sentences, size = embed_dim, window = window)
    else:
        # Previously an unknown option fell through to `return model` and
        # failed with an UnboundLocalError; fail with a clear message instead.
        raise ValueError(
            "Unsupported option {!r}: use 0 for Word2vec or 1 for FastText".format(option)
        )

    print("Training complete")
    return model

In [46]:
# Pick the embedding trainer through `option`: 0 -> Word2vec, 1 -> FastText
option = 1

# Hyperparameters shared by the cells below
vocab_size = 1000
max_len = 200
embed_dim = 300
window = 1

# Load the IMDB splits, decode the training reviews back into word tokens,
# then train the selected word-vector model on those tokens.
x_train, x_test, y_train, y_test = load_data(vocab_size=vocab_size, max_len=max_len)
sentences, word_ix = prepare_data_for_word_vectors_imdb(X_train=x_train)
model_wv = building_word_vector_model(
    option=option,
    sentences=sentences,
    embed_dim=embed_dim,
    window=window,
)

25000 25000 25000 25000 #####################################
Training a Gensim FastText model
Training complete


In [47]:
# Inspect the raw training split: array shapes, length of the first review,
# its label, and its full list of token ids.
x_train.shape, y_train.shape, len(x_train[0]), y_train[0], x_train[0]

((25000,),
 (25000,),
 218,
 1,
 [1,
  14,
  22,
  16,
  43,
  530,
  973,
  2,
  2,
  65,
  458,
  2,
  66,
  2,
  4,
  173,
  36,
  256,
  5,
  25,
  100,
  43,
  838,
  112,
  50,
  670,
  2,
  9,
  35,
  480,
  284,
  5,
  150,
  4,
  172,
  112,
  167,
  2,
  336,
  385,
  39,
  4,
  172,
  2,
  2,
  17,
  546,
  38,
  13,
  447,
  4,
  192,
  50,
  16,
  6,
  147,
  2,
  19,
  14,
  22,
  4,
  2,
  2,
  469,
  4,
  22,
  71,
  87,
  12,
  16,
  43,
  530,
  38,
  76,
  15,
  13,
  2,
  4,
  22,
  17,
  515,
  17,
  12,
  16,
  626,
  18,
  2,
  5,
  62,
  386,
  12,
  8,
  316,
  8,
  106,
  5,
  4,
  2,
  2,
  16,
  480,
  66,
  2,
  33,
  4,
  130,
  12,
  16,
  38,
  619,
  5,
  25,
  124,
  51,
  36,
  135,
  48,
  25,
  2,
  33,
  6,
  22,
  12,
  215,
  28,
  77,
  52,
  5,
  14,
  407,
  16,
  82,
  2,
  8,
  4,
  107,
  117,
  2,
  15,
  256,
  4,
  2,
  7,
  2,
  5,
  723,
  36,
  71,
  43,
  530,
  476,
  26,
  400,
  317,
  46,
  7,
  4,
  2,
  2,
  13,
  104,
  88,
  

In [0]:
def padding_input(X_train,X_test,maxlen):
    """
        Brings every review in both splits to the same length

        Args:
            X_train = tokenized train data
            X_test = tokenized test data
            maxlen = {int} length passed to pad_sequences for every sequence

        Returns:
            X_train_pad = padded tokenized train data
            X_test_pad = padded tokenized test data

    """
    print(X_train.shape, X_test.shape, "before padding")

    # Same call for both splits: padding goes at the end ("post"); every other
    # pad_sequences setting is left at the library default.
    train_padded, test_padded = (
        pad_sequences(split, maxlen=maxlen, padding="post")
        for split in (X_train, X_test)
    )

    print(train_padded.shape, test_padded.shape, "after padding")

    return train_padded, test_padded

In [49]:
# Pad both splits to max_len tokens per review.
x_train_pad, x_test_pad = padding_input(X_train=x_train, X_test=x_test, maxlen=max_len)

(25000,) (25000,) before padding
(25000, 200) (25000, 200) after padding


In [50]:
# Row i of the matrix holds the trained vector of the word whose index is i;
# rows for which no vector is available stay all-zero.
embedding_matrix = np.zeros((vocab_size,embed_dim))

# Look vectors up on the model trained above (`model_wv`). The previous code
# referenced an undefined name `w2vmodel`; the bare `except` hid the NameError,
# so on a fresh kernel the matrix was left entirely zero.
word_vectors = model_wv.wv

for word, i in word_ix.items():
    # word_ix covers the whole imdb vocabulary; only indexes below vocab_size
    # have a row in the matrix.
    if i >= vocab_size:
        continue
    try:
        embedding_matrix[i] = word_vectors[word]
    except KeyError:
        # No vector for this word: keep the zero row instead of reusing the
        # vector left over from a previous iteration.
        continue

print(embedding_matrix.shape ,"embedding_matrix")

(1000, 300) embedding_matrix


In [51]:
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import Flatten
from keras.initializers import Constant

print('Training model.')

# define the model: frozen pre-trained embeddings -> flatten -> three tanh
# dense layers -> single sigmoid unit for the binary sentiment label
model = Sequential()
model.add(Embedding(vocab_size,
                            embed_dim,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=max_len,
                            trainable=False))
model.add(Flatten())
model.add(Dense(512, activation='tanh'))
model.add(Dense(256, activation='tanh'))
model.add(Dense(128, activation='tanh'))
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['acc'])

# summarize the model; summary() prints the table itself and returns None,
# so wrapping it in print() only added a stray "None" line
model.summary()

# the test split doubles as validation data here
model.fit(x_train_pad,y_train,
          batch_size=2048,
          epochs=1,
          validation_data=(x_test_pad,y_test))

Training model.
Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 200, 300)          300000    
_________________________________________________________________
flatten_4 (Flatten)          (None, 60000)             0         
_________________________________________________________________
dense_13 (Dense)             (None, 512)               30720512  
_________________________________________________________________
dense_14 (Dense)             (None, 256)               131328    
_________________________________________________________________
dense_15 (Dense)             (None, 128)               32896     
_________________________________________________________________
dense_16 (Dense)             (None, 1)                 129       
Total params: 31,184,865
Trainable params: 30,884,865
Non-trainable params: 300,000
____________________

<keras.callbacks.History at 0x7f8dd0b59e80>

In [0]:
# loss, accuracy = model.evaluate(x_test, y_test, verbose=0)

# print('Accuracy: %f' % (accuracy))
# print('Loss: %f' % (loss))

In [52]:
# Score one test review (index 6); the slice keeps the leading batch axis.
model.predict(x_test_pad[6:7])

array([[0.51251537]], dtype=float32)

In [53]:
# Ground-truth label of test review 6, for comparison with the prediction above.
y_test[6]

1