In [2]:
import pyprind
import pandas as pd
import numpy as np
import os
import sys
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from tensorflow.keras.layers import Dense, Input, Flatten, Conv1D, MaxPooling1D, Embedding, Dropout
from tensorflow.keras.models import Model

# loading data

In [3]:
data_path = './aclImdb'
glove_path = './glove.6B/'
# max number of words in the texts to be vectorized (choose the frequent words)
max_nb_words = 20000
# max number of words in a review (the review is padded or trucated to the number)
num_words_per_review = 1000
# glove embedding dimension
glove_dim = 100
validation_ratio = 0.2
np.random.seed(123)

In [4]:
# load the movie review texts and sentiment labels
labels = {'pos': 1, 'neg': 0}
pbar = pyprind.ProgBar(50000)
df = pd.DataFrame()
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(data_path, s, l)
        for file in os.listdir(path):
            with open(os.path.join(path, file), 'r',encoding="utf8") as infile:
                txt = infile.read()
            df = df.append([[txt, labels[l]]], ignore_index=True)
            pbar.update()
            
df.columns = ['review', 'sentiment']
texts = df['review'].values.tolist()
labels = df['sentiment'].values.tolist()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:08:52


#  load the glove vectors
# the dictionary for mapping a word to a 100-dim vector

In [5]:
glove_embedding = {}
f = open(os.path.join(glove_path, 'glove.6B.100d.txt'),encoding="utf8")
for line in f:
    fields = line.split()
    word = fields[0] # the first element is the word
    word_vector = np.asarray(fields[1:], dtype='float32') 
    glove_embedding[word] = word_vector
f.close()

# Tokenization of words

In [6]:
tokenizer = Tokenizer(num_words = max_nb_words) 
tokenizer.fit_on_texts(texts) 
# convert each review text into a sequence of word-indices
matrix_word_indices = tokenizer.texts_to_sequences(texts)
# the dictionary for mapping a word to an index
dictionary_word_index = tokenizer.word_index

# Padding 

In [7]:
matrix_word_indices_fixed_length = pad_sequences(matrix_word_indices, maxlen = num_words_per_review) 
data = np.array(matrix_word_indices_fixed_length)
labels = np.array(labels)

# Data splitting

In [8]:
# shuffle the data 
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
# percentage of validation data
nb_validation_samples = int(validation_ratio*data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_validation = data[-nb_validation_samples:]
y_validation = labels[-nb_validation_samples:]

# Creating embedding matrix

In [9]:
num_words = min(max_nb_words, len(dictionary_word_index))
# embedding_matrix[0] is a all-zero vector representing no word
embedding_matrix = np.zeros((num_words+1, glove_dim)) 
for word, index in dictionary_word_index.items():
    if index > max_nb_words:
        continue 
    # get the glove vector for the word
    glove_vector = glove_embedding.get(word) 
    if glove_vector is not None: 
        embedding_matrix[index] = glove_vector        

# Define the model

In [10]:
sequence_input = Input(shape=(num_words_per_review,), dtype='int32')

embedding_layer = Embedding(num_words+1, glove_dim, weights=[embedding_matrix], input_length=num_words_per_review, trainable=True)
embedded_output = embedding_layer(sequence_input)

x = Conv1D(filters=32, kernel_size=5, activation='relu')(embedded_output)
x = MaxPooling1D(5)(x)
x = Dropout(0.25)(x)

x = Conv1D(32, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Dropout(0.25)(x)


x = Flatten()(x)
x = Dense(32, activation='relu')(x)
x = Dropout(0.5)(x)
x = Dense(32, activation='relu')(x)
final_output = Dense(1, activation='sigmoid')(x)

model = Model(inputs=sequence_input, outputs=final_output)

In [12]:
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['acc','mae'])

In [13]:
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 1000)]            0         
_________________________________________________________________
embedding (Embedding)        (None, 1000, 100)         2000100   
_________________________________________________________________
conv1d (Conv1D)              (None, 996, 32)           16032     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 199, 32)           0         
_________________________________________________________________
dropout (Dropout)            (None, 199, 32)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 195, 32)           5152      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 39, 32)            0     

# fit the model

In [14]:
pbar = pyprind.ProgBar(50000)
model.fit(x=x_train, y=y_train, validation_data=(x_validation, y_validation), epochs=5, batch_size=128, verbose=2)

Epoch 1/5
313/313 - 94s - loss: 0.6374 - acc: 0.6091 - mae: 0.4498 - val_loss: 0.4776 - val_acc: 0.7780 - val_mae: 0.3292
Epoch 2/5
313/313 - 92s - loss: 0.3965 - acc: 0.8304 - mae: 0.2501 - val_loss: 0.3078 - val_acc: 0.8748 - val_mae: 0.2111
Epoch 3/5
313/313 - 91s - loss: 0.3010 - acc: 0.8783 - mae: 0.1821 - val_loss: 0.2724 - val_acc: 0.8907 - val_mae: 0.1800
Epoch 4/5
313/313 - 91s - loss: 0.2546 - acc: 0.9000 - mae: 0.1506 - val_loss: 0.2554 - val_acc: 0.9014 - val_mae: 0.1714
Epoch 5/5
313/313 - 92s - loss: 0.2258 - acc: 0.9132 - mae: 0.1313 - val_loss: 0.2853 - val_acc: 0.8851 - val_mae: 0.1620


<tensorflow.python.keras.callbacks.History at 0x265647bbe20>

# Evaluate the model 

In [15]:
test_accuracy = model.evaluate(x_validation, y_validation, verbose=1)



In [16]:
pred=model.predict(x_train)

In [74]:
y_train[6]

0

In [75]:
PRED=list(pred)
k=0
for i in pred:
    if i<0.5:
        PRED[k]=0
        k+=1
    else:
        PRED[k]=1
        k+=1

In [76]:
PRED[6]

0

In [21]:
model.save("DLProject.h5")

In [77]:
sequence = tokenizer.texts_to_sequences(["ow. A truly exceptional film that somehow lives up to the immense praise. It manages to be so many things at once: an operatic satire, a classist takedown, a cheeky comedy, a twisty thriller, a chilling horror, a somber rumination. Much of the success is owed to the absolutely phenomenal screenplay which is one of the most intricately constructed in recent memory. Every setup is paid off, every tonal shift deftly balanced, every progression carefully timed. The movie feels like a well-oiled machine firing on all cylinders, thrusting you through the densely layered narrative without so much as a hiccup. And perhaps most importantly, despite its highly allegorical nature, the film never compromises on the entertainment value and emotional catharsis of its literal story and characters. It's not often you see a film realize its ambition this resoundingly. Don't miss it." ,
               "After all, it's already wasted enough of my time, so I'll be brief. First, this is an R-rated movie...about tag? Maybe if this was naked tag...If that was the case, this would be a very different review. Once again, I should have read the rating before buying it. I didn't do so because it had been marked down in price (for obvious reasons) and bought it impulsively. And at what point did dropping f-bombs into every sentence constitute good writing and movie-making? Assuming that was the objective. There was more swearing in this film than I remember hearing in my last view of 'The Rock' which was a much better film, even if Nic Cage was in it. Decent comedy films apparently can't be made anymore unless there is an inordinate amount of crude language and cuss words. But...this film wasn't even remotely close to decent. I have a massive film collection (everyone needs a hobby) and I've only trashed two films after purchasing and watching them. This one is number three. Don't waste your time on this one; don't rent it, and definitely don't buy it"])
test = pad_sequences(sequence, maxlen=num_words_per_review)
x=model.predict(test)
for i in x:
    if i<0.5:
        print('negative')
    else:
        print("positive")

positive
negative


# Custom metrics function

In [23]:
import tensorflow.keras.backend as K
def mae(y_true, y_pred):            
    eval = K.abs(y_pred - y_true)
    eval = K.mean(eval, axis=-1)
    return eval


# building model with custom metrics function

In [24]:
model1 = Model(inputs=sequence_input, outputs=final_output)
model1.compile(loss="binary_crossentropy", optimizer='rmsprop', metrics=['acc',mae])

In [25]:
model1.fit(x=x_train, y=y_train, validation_data=(x_validation, y_validation), epochs=5, batch_size=128, verbose=2)

Epoch 1/5
313/313 - 92s - loss: 0.2028 - acc: 0.9248 - mae: 0.1151 - val_loss: 0.2391 - val_acc: 0.9070 - val_mae: 0.1560
Epoch 2/5
313/313 - 91s - loss: 0.1754 - acc: 0.9350 - mae: 0.0991 - val_loss: 0.2501 - val_acc: 0.9029 - val_mae: 0.1611
Epoch 3/5
313/313 - 85s - loss: 0.1558 - acc: 0.9440 - mae: 0.0875 - val_loss: 0.2399 - val_acc: 0.9065 - val_mae: 0.1453
Epoch 4/5
313/313 - 76s - loss: 0.1409 - acc: 0.9496 - mae: 0.0783 - val_loss: 0.2513 - val_acc: 0.9072 - val_mae: 0.1273
Epoch 5/5
313/313 - 78s - loss: 0.1238 - acc: 0.9564 - mae: 0.0680 - val_loss: 0.2397 - val_acc: 0.9082 - val_mae: 0.1312


<tensorflow.python.keras.callbacks.History at 0x26566204280>

In [26]:
model1.evaluate(x_validation,y_validation,verbose=1)



[0.23970139026641846, 0.9082000255584717, 0.13122884929180145]

In [70]:
from sklearn.metrics import confusion_matrix
conf=confusion_matrix(y_train, PRED)
conf

array([[17112,  2927],
       [  468, 19493]], dtype=int64)