In [1]:
%load_ext autoreload
%autoreload 2

# Models

In [2]:
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint 
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.normalization import BatchNormalization
from keras import optimizers
from keras import initializers
from keras.models import Model, Sequential
from keras.layers import Convolution1D, MaxPooling1D, GlobalAveragePooling1D, BatchNormalization, LSTM, GRU, CuDNNGRU, CuDNNLSTM, concatenate, Input, SimpleRNN
from keras.layers.embeddings import Embedding
from keras.regularizers import l2
from keras.constraints import maxnorm

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
#Input -> embed_size,maxlen,max_features,lr
#Output -> model()

#Multilayer - CNN - RNN model
def get_model_1(embed_size,maxlen,max_features,lr):
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size)(inp)
    x = Dropout(0.25)(x)
    x = Convolution1D(2*embed_size, kernel_size = 3)(x)
    prefilt = Convolution1D(2*embed_size, kernel_size = 3)(x)
    x = prefilt
    for strides in [1, 1, 2]:
        x = Convolution1D(128*2**(strides), strides = strides, kernel_regularizer=l2(4e-6), bias_regularizer=l2(4e-6), kernel_size=3, kernel_constraint=maxnorm(10), bias_constraint=maxnorm(10))(x)
    x_f = CuDNNLSTM(512, kernel_regularizer=l2(4e-6), bias_regularizer=l2(4e-6), kernel_constraint=maxnorm(10), bias_constraint=maxnorm(10))(x)  
    x_b = CuDNNLSTM(512, kernel_regularizer=l2(4e-6), bias_regularizer=l2(4e-6), kernel_constraint=maxnorm(10), bias_constraint=maxnorm(10))(x)
    x = concatenate([x_f, x_b])
    x = Dropout(0.5)(x)
    x = Dense(64, activation="relu")(x)
    x = Dropout(0.1)(x)
    x = Dense(1, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['binary_accuracy'])
    return model

#MLP Model
def get_model_2 (embed_size,maxlen,max_features,lr):
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size)(inp)
    x = Dense(128, activation="relu")(x)    
    x = Flatten()(x)
    x = Dense(1, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    ADAM = optimizers.Adam(lr=lr)
    model.compile(loss = 'binary_crossentropy', optimizer=ADAM, metrics=['binary_accuracy'])
    return model

#CNN Model
def get_model_3 (embed_size,maxlen,max_features,lr):
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size)(inp)
    x = Convolution1D(2 * embed_size, kernel_size = 2, activation = "relu")(x)    
    x = Flatten()(x)
    x = Dense(1, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    ADAM = optimizers.Adam(lr=lr)
    model.compile(loss = 'binary_crossentropy', optimizer=ADAM, metrics=['binary_accuracy'])
    return model

#2 layers with Dropout - CNN Model
def get_model_4 (embed_size,maxlen,max_features,lr):
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size)(inp)
    x = Convolution1D(2 * embed_size, kernel_size = 3, activation = "relu")(x)   
    x = Dropout(0.25)(x)
    x = Convolution1D(2 * embed_size, kernel_size = 2, activation = "relu")(x)   
    x = Dropout(0.25)(x)
    x = Flatten()(x)
    x = Dense(1, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    ADAM = optimizers.Adam(lr=lr)
    model.compile(loss = 'binary_crossentropy', optimizer=ADAM, metrics=['binary_accuracy'])
    return model

#3 layers with Dropout - CNN Model
def get_model_5 (embed_size,maxlen,max_features,lr):
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size)(inp)
    x = Convolution1D(2 * embed_size, kernel_size = 3, activation = "relu")(x)   
    x = Dropout(0.25)(x)
    x = Convolution1D(2 * embed_size, kernel_size = 2, activation = "relu")(x)   
    x = Dropout(0.25)(x)
    x = Convolution1D(embed_size, kernel_size = 3, activation = "relu")(x)   
    x = Dropout(0.25)(x)
    x = Flatten()(x)
    x = Dense(1, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    ADAM = optimizers.Adam(lr=lr)
    model.compile(loss = 'binary_crossentropy', optimizer=ADAM, metrics=['binary_accuracy'])
    return model

#Simple RNN with Dropout
def get_model_6 (embed_size,maxlen,max_features,lr):
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size)(inp)
    x = SimpleRNN(256) (x)
    x = Dropout(0.10)(x)
    x = Dense(1, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    ADAM = optimizers.Adam(lr=lr)
    model.compile(loss = 'binary_crossentropy', optimizer=ADAM, metrics=['binary_accuracy'])
    return model

#Simple LSTM with Dropout
def get_model_7 (embed_size,maxlen,max_features,lr):
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size)(inp)
    x = CuDNNLSTM(256) (x)
    x = Dropout(0.10)(x)
    x = Dense(1, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    ADAM = optimizers.Adam(lr=lr)
    model.compile(loss = 'binary_crossentropy', optimizer=ADAM, metrics=['binary_accuracy'])
    return model

#Simple GRU with Dropout
def get_model_8 (embed_size,maxlen,max_features,lr):
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size)(inp)
    x = CuDNNGRU(256) (x)
    x = Dropout(0.10)(x)
    x = Dense(1, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    ADAM = optimizers.Adam(lr=lr)
    model.compile(loss = 'binary_crossentropy', optimizer=ADAM, metrics=['binary_accuracy'])
    return model

#CNN-RNN model with Dropout
def get_model_9 (embed_size,maxlen,max_features,lr):
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size)(inp)
    x = Convolution1D(2 * embed_size, kernel_size = 3, activation = "relu")(x)   
    x = Dropout(0.25)(x)
    x = Convolution1D(2 * embed_size, kernel_size = 2, activation = "relu")(x)   
    x = Dropout(0.10)(x)
    x = CuDNNLSTM(256) (x)
    x = Dropout(0.10)(x)    
    x = Dense(1, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    ADAM = optimizers.Adam(lr=lr)
    model.compile(loss = 'binary_crossentropy', optimizer=ADAM, metrics=['binary_accuracy'])
    return model

#RNN-CNN model with Dropout
def get_model_10 (embed_size,maxlen,max_features,lr):
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size)(inp)
    x = CuDNNLSTM(256, return_sequences=True) (x)
    x = Dropout(0.10)(x)   
    x = Convolution1D(2 * embed_size, kernel_size = 3, activation = "relu")(x)   
    x = Dropout(0.25)(x)
    x = Convolution1D(2 * embed_size, kernel_size = 2, activation = "relu")(x)   
    x = Dropout(0.10)(x)
    x = Flatten()(x)
    x = Dense(1, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    ADAM = optimizers.Adam(lr=lr)
    model.compile(loss = 'binary_crossentropy', optimizer=ADAM, metrics=['binary_accuracy'])
    return model

#CNN-RNN model with Concatenate and Dropout
def get_model_11 (embed_size,maxlen,max_features,lr):
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size)(inp)
    x = Dropout(0.25)(x)
    #Branch A
    x_a = Convolution1D(2 * embed_size, kernel_size = 3, activation = "relu")(x)
    x_a = Dropout(0.10)(x_a)
    x_a = CuDNNLSTM(512)(x_a)  
    #Branch B
    x_b = Convolution1D(embed_size, kernel_size = 2, activation = "relu")(x)
    x_b = Dropout(0.10)(x_b)
    x_b = CuDNNLSTM(256, return_sequences=True)(x_b)  
    x_b = Dropout(0.10)(x_b)
    x_b = CuDNNLSTM(128)(x_b)  
    #Concatenate Branch A-B
    x = concatenate([x_a, x_b])
    x = Dropout(0.25)(x)
    x = Dense(64, activation="relu")(x)
    x = Dropout(0.25)(x)
    x = Dense(1, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    ADAM = optimizers.Adam(lr=lr)
    model.compile(loss = 'binary_crossentropy', optimizer=ADAM, metrics=['binary_accuracy'])
    return model

#CNN-RNN-MLP model with Concatenate and Dropout
def get_model_12 (embed_size,maxlen,max_features,lr):
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size)(inp)
    x = Dropout(0.25)(x)
    #Branch A - 2 layers CNN + MLP
    x_a = Convolution1D(2 * embed_size, kernel_size = 3, activation = "relu")(x)
    x_a = Dropout(0.10)(x_a)
    x_a = Convolution1D(2 * embed_size, kernel_size = 3, activation = "relu")(x)
    x_a = Dropout(0.10)(x_a)
    x_a = Dense(512, activation="relu")(x_a)
    x_a = Flatten()(x_a)
    #Branch B - RNN + MLP
    x_b = Convolution1D(embed_size, kernel_size = 2, activation = "relu")(x)
    x_b = Dropout(0.10)(x_b)
    x_b = CuDNNLSTM(256, return_sequences=True)(x_b)  
    x_b = Dropout(0.10)(x_b)
    x_b = Dense(512, activation="relu")(x_b)
    x_b = Flatten()(x_b)
    #Concatenate Branch A-B
    x = concatenate([x_a, x_b])
    x = Dropout(0.25)(x)
    x = Dense(64, activation="relu")(x)
    x = Dropout(0.25)(x)
    x = Dense(1, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    ADAM = optimizers.Adam(lr=lr)
    model.compile(loss = 'binary_crossentropy', optimizer=ADAM, metrics=['binary_accuracy'])
    return model

# Pre-processing

In [4]:
import numpy as np 
import pandas as pd 
import bz2
import gc
import chardet
import re
import os
import random

In [5]:
#Checking files in Kaggle
# List data files that are connected to the kernel
os.listdir('../input')

['test.ft.txt.bz2', 'train.ft.txt.bz2']

In [6]:
# Read Train & Test Files

#Kaggle
train_file = bz2.BZ2File('../input/train.ft.txt.bz2')
test_file = bz2.BZ2File('../input/test.ft.txt.bz2')

#Localhost
#train_file = bz2.BZ2File('C:/Users/Lenovo/Documents/GitHub/Datasets/amazonreviews/train.ft.txt.bz2')
#test_file = bz2.BZ2File('C:/Users/Lenovo/Documents/GitHub/Datasets/amazonreviews/test.ft.txt.bz2')

#Localhost - Versión recortada del archivo
#train_file = bz2.BZ2File('C:/Users/Lenovo/Documents/GitHub/Datasets/amazonreviews/Version_Recortada/r_train.ft.txt.bz2')
#test_file = bz2.BZ2File('C:/Users/Lenovo/Documents/GitHub/Datasets/amazonreviews/Version_Recortada/r_test.ft.txt.bz2')

#Create Lists containing Train & Test sentences
train_file_lines = train_file.readlines()
test_file_lines = test_file.readlines()

#Convert from raw binary strings to strings that can be parsed
train_file_lines = [x.decode('utf-8') for x in train_file_lines]
test_file_lines = [x.decode('utf-8') for x in test_file_lines]

In [7]:
#Delete memory reference (?)
del train_file, test_file
#Garbage collector
gc.collect()

0

In [8]:
print("Cantidad de elementos del Training Set: {}".format(len(train_file_lines)))
print("Cantidad de elementos del Testing Set: {}".format(len(test_file_lines)))

Cantidad de elementos del Training Set: 3600000
Cantidad de elementos del Testing Set: 400000


## Clean data

In [9]:
# Change labels: __label__1 -> 0 (Negative) / __label__2 -> 1 (Positive)
train_labels = [0 if x.split(' ')[0] == '__label__1' else 1 for x in train_file_lines]
test_labels = [0 if x.split(' ')[0] == '__label__1' else 1 for x in test_file_lines]

# Make everything Lower Case
train_sentences = [x.split(' ', 1)[1][:-1].lower() for x in train_file_lines]

for i in range(len(train_sentences)):
    train_sentences[i] = re.sub('\d','0',train_sentences[i])
    
test_sentences = [x.split(' ', 1)[1][:-1].lower() for x in test_file_lines]

for i in range(len(test_sentences)):
    test_sentences[i] = re.sub('\d','0',test_sentences[i])

# Modify URLs to <url>
for i in range(len(train_sentences)):
    if 'www.' in train_sentences[i] or 'http:' in train_sentences[i] or 'https:' in train_sentences[i] or '.com' in train_sentences[i]:
        train_sentences[i] = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", train_sentences[i])
        
for i in range(len(test_sentences)):
    if 'www.' in test_sentences[i] or 'http:' in test_sentences[i] or 'https:' in test_sentences[i] or '.com' in test_sentences[i]:
        test_sentences[i] = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", test_sentences[i])

## Checking data before and after cleaning

In [10]:
#Random
r = random.randint(1,len(train_file_lines))

#Before
print("Data before cleaning:\n{}".format(train_file_lines[r-1:r]))

#After
print("\nData after cleaning:\n{}".format((train_sentences[r-1:r])))

#Labels
print("\nLabel:{}".format(train_labels[r-1:r]))

Data before cleaning:
['__label__2 Good shape but a a little worn: My book was in good shape but a little worn. Not quite the "like new" I expected but still worth the price.\n']

Data after cleaning:
['good shape but a a little worn: my book was in good shape but a little worn. not quite the "like new" i expected but still worth the price.']

Label:[1]


### Output
From the above output it can be seen that each sentence begins with it's sentiment (label1 -> Negative, label2 -> Positive), which is then followed by the review and ends with a newline character \n.

So, first I go convert all the labels to O(Negative) and 1(Positive) and store it in lists that only contain the label values. After this, I store the remainder of the sentence excluding the newline character in lowercase in lists. Also, convert all numbers to 0.


In [11]:
#Delete memory reference (?)
del train_file_lines, test_file_lines
#Garbage collector
gc.collect()

0

## Text Pre-processing

In [12]:
from keras.preprocessing import text, sequence

#Base definitions for text preprocessing
max_features = 20000
maxlen = 100

In [13]:
#Tokenizer definition
#Filtro caracteres especiales usando el Tokenizer de keras.
tokenizer = text.Tokenizer(num_words=max_features, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~')

#Fit on text -> Only the train dataset !!!
tokenizer.fit_on_texts(train_sentences)

#Training set
tokenized_train = tokenizer.texts_to_sequences(train_sentences)
X_train = sequence.pad_sequences(tokenized_train, maxlen=maxlen)

#Test set
tokenized_test = tokenizer.texts_to_sequences(test_sentences)
X_test = sequence.pad_sequences(tokenized_test, maxlen=maxlen)

In [14]:
#Print a random matrix
X_train[r]
# summarize what was learned -> Si quiero ver el tokenizer que aprendio usando los 2 parametros (Max_features,max_length)
#print(t.word_counts)
#print(t.document_count)
#print(t.word_index)
#print(t.word_docs)

array([ 1919,     2,   606,     1,  1211,   450,    22,     4,  2308,
        3809, 19247,     2,  2763,   261,   152,    10,    79,   479,
         158,    97,   111,    14,  4993,     5,    26,     4,   578,
       12772, 16596,  9496,   335,   902,    79,   356,     2,  5251,
           6,    17,    42,  3413,  1634,     2,   606,   169,     5,
          50,   735,    12,    96,  5690,   824,     7,     1,  5690,
          38, 14162,   102,   448,  2521,    37,    74,     1,   118,
        1565,  2308, 16606,    61, 14565,  1220,     2,    42,   419,
          38,   125,    37,    79,   921,  5298,     4,   215,   179,
          50,    11,   207,  1347,    10,     1,  2308,  2143,  1018,
           3,   395,    15,   279,     6,     9,    73,    22,     3,
         101], dtype=int32)

### Validation dataset

In [15]:
from sklearn.model_selection import train_test_split
# Create a validation dataset
validation_size = 0.2
X_train, X_valid, train_labels, valid_labes = train_test_split(X_train, train_labels, test_size = validation_size)

In [16]:
#Delete memory reference (?)
del tokenized_test, tokenized_train, tokenizer, train_sentences, test_sentences
#Garbage collector
gc.collect()

0

## Model

In [17]:
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint 
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.normalization import BatchNormalization
from keras import optimizers
from keras import initializers
from keras.models import Model, Sequential
from keras.layers import Convolution1D, MaxPooling1D, GlobalAveragePooling1D, BatchNormalization, LSTM, GRU, CuDNNGRU, CuDNNLSTM, concatenate, Input
from keras.layers.embeddings import Embedding

In [18]:
#Defino los parametros del modelo:
lr = 0.0001 #Learning Rate
batch_size = 1024
epochs = 5
embed_size = 128 #Embedding size

In [33]:
# <<<<Models>>>>
#Input -> get_model_1(embed_size,maxlen,lr)
model = get_model_2(embed_size,maxlen,max_features,lr)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_14 (InputLayer)        (None, 100)               0         
_________________________________________________________________
embedding_14 (Embedding)     (None, 100, 128)          2560000   
_________________________________________________________________
dense_21 (Dense)             (None, 100, 128)          16512     
_________________________________________________________________
flatten_8 (Flatten)          (None, 12800)             0         
_________________________________________________________________
dense_22 (Dense)             (None, 1)                 12801     
Total params: 2,589,313
Trainable params: 2,589,313
Non-trainable params: 0
_________________________________________________________________


In [35]:
## Callback para guardar pesos
checkpointer = ModelCheckpoint(filepath='Sentiment_Analysis_Amazon_Reviews.hdf5', monitor='val_loss'
                                   ,verbose=1, save_best_only=True, mode='min')
callbacks = [checkpointer]

In [38]:
# Fit del modelo -> Usando todo el dataset

model.fit(X_train,train_labels
          ,epochs=epochs
          ,batch_size = batch_size          
          ,shuffle = True
          ,validation_data = (X_valid,valid_labes)
          ,callbacks=callbacks)
"""
# Fit del modelo -> Usando solo un fragmento del datasset
model.fit(X_train[:100000], train_labels[:100000]
          ,epochs=epochs
          ,batch_size = batch_size          
          ,shuffle = True
          ,validation_split=0.20
          ,callbacks=callbacks)
"""

Train on 2880000 samples, validate on 720000 samples
Epoch 1/5

Epoch 00001: val_loss improved from inf to 0.25410, saving model to Sentiment_Analysis_Amazon_Reviews.hdf5
Epoch 2/5

Epoch 00002: val_loss improved from 0.25410 to 0.25144, saving model to Sentiment_Analysis_Amazon_Reviews.hdf5
Epoch 3/5

Epoch 00003: val_loss improved from 0.25144 to 0.24885, saving model to Sentiment_Analysis_Amazon_Reviews.hdf5
Epoch 4/5

Epoch 00004: val_loss improved from 0.24885 to 0.24728, saving model to Sentiment_Analysis_Amazon_Reviews.hdf5
Epoch 5/5

Epoch 00005: val_loss improved from 0.24728 to 0.24664, saving model to Sentiment_Analysis_Amazon_Reviews.hdf5


'\n\n# Fit del modelo -> Usando solo un fragmento del datasset\nmodel.fit(X_train[:100000], train_labels[:100000]\n          ,epochs=epochs\n          ,batch_size = batch_size          \n          ,shuffle = True\n          ,validation_split=0.20\n          ,callbacks=callbacks)\n'

## Test 

In [40]:
#Load the model
model.load_weights('Sentiment_Analysis_Amazon_Reviews.hdf5')
#Test
score, acc = model.evaluate(X_test, test_labels, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 0.24728829448699952
Test accuracy: 0.9023199999618531


## Predict

In [45]:
cant = 3
random_test = random.randint(1,len(X_test))
X_test_cant = X_test[random_test:random_test+cant]
#Prediction
test_prediction = model.predict(X_test_cant)
#test_prediction_labels = test_prediction.argmax(axis = -1)
#Print predictions
print("Dataset labels: {}".format(test_labels[random_test:random_test+cant]))
#print("Predicted labels: {}".format(test_prediction_labels))
print("Predicted probability labels: {}".format(test_prediction))

Dataset labels: [1, 0, 1]
Predicted probability labels: [[0.9810409 ]
 [0.41099635]
 [0.98329055]]
