In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np 
import pandas as pd 
import bz2
import gc
import chardet
import re
import os
import random

# Pre-processing

In [None]:
#Checking files in Kaggle
# List data files that are connected to the kernel
os.listdir('../input')

In [None]:
# Read Train & Test Files

#Kaggle
train_file = bz2.BZ2File('../input/train.ft.txt.bz2')
test_file = bz2.BZ2File('../input/test.ft.txt.bz2')

#Localhost
#train_file = bz2.BZ2File('C:/Users/Lenovo/Documents/GitHub/Datasets/amazonreviews/train.ft.txt.bz2')
#test_file = bz2.BZ2File('C:/Users/Lenovo/Documents/GitHub/Datasets/amazonreviews/test.ft.txt.bz2')

#Localhost - Versión recortada del archivo
#train_file = bz2.BZ2File('C:/Users/Lenovo/Documents/GitHub/Datasets/amazonreviews/Version_Recortada/r_train.ft.txt.bz2')
#test_file = bz2.BZ2File('C:/Users/Lenovo/Documents/GitHub/Datasets/amazonreviews/Version_Recortada/r_test.ft.txt.bz2')

#Create Lists containing Train & Test sentences
train_file_lines = train_file.readlines()
test_file_lines = test_file.readlines()

#Convert from raw binary strings to strings that can be parsed
train_file_lines = [x.decode('utf-8') for x in train_file_lines]
test_file_lines = [x.decode('utf-8') for x in test_file_lines]

In [None]:
#Delete memory reference (?)
del train_file, test_file
#Garbage collector
gc.collect()

In [None]:
print("Cantidad de elementos del Training Set: {}".format(len(train_file_lines)))
print("Cantidad de elementos del Testing Set: {}".format(len(test_file_lines)))

## Clean data

In [None]:
# Change labels: __label__1 -> 0 (Negative) / __label__2 -> 1 (Positive)
train_labels = [0 if x.split(' ')[0] == '__label__1' else 1 for x in train_file_lines]
test_labels = [0 if x.split(' ')[0] == '__label__1' else 1 for x in test_file_lines]

# Make everything Lower Case
train_sentences = [x.split(' ', 1)[1][:-1].lower() for x in train_file_lines]

for i in range(len(train_sentences)):
    train_sentences[i] = re.sub('\d','0',train_sentences[i])
    
test_sentences = [x.split(' ', 1)[1][:-1].lower() for x in test_file_lines]

for i in range(len(test_sentences)):
    test_sentences[i] = re.sub('\d','0',test_sentences[i])

# Modify URLs to <url>
for i in range(len(train_sentences)):
    if 'www.' in train_sentences[i] or 'http:' in train_sentences[i] or 'https:' in train_sentences[i] or '.com' in train_sentences[i]:
        train_sentences[i] = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", train_sentences[i])
        
for i in range(len(test_sentences)):
    if 'www.' in test_sentences[i] or 'http:' in test_sentences[i] or 'https:' in test_sentences[i] or '.com' in test_sentences[i]:
        test_sentences[i] = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", test_sentences[i])

## Checking data before and after cleaning

In [None]:
#Random
r = random.randint(1,len(train_file_lines))

#Before
print("Data before cleaning:\n{}".format(train_file_lines[r-1:r]))

#After
print("\nData after cleaning:\n{}".format((train_sentences[r-1:r])))

#Labels
print("\nLabel:{}".format(train_labels[r-1:r]))

### Output
From the above output it can be seen that each sentence begins with it's sentiment (label1 -> Negative, label2 -> Positive), which is then followed by the review and ends with a newline character \n.

So, first I go convert all the labels to O(Negative) and 1(Positive) and store it in lists that only contain the label values. After this, I store the remainder of the sentence excluding the newline character in lowercase in lists. Also, convert all numbers to 0.


In [None]:
#Delete memory reference (?)
del train_file_lines, test_file_lines
#Garbage collector
gc.collect()

## Text Pre-processing

In [None]:
from keras.preprocessing import text, sequence

#Base definitions for text preprocessing
max_features = 20000
maxlen = 100

In [None]:
#Tokenizer definition
#Filtro caracteres especiales usando el Tokenizer de keras.
tokenizer = text.Tokenizer(num_words=max_features, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~')

#Fit on text -> Only the train dataset !!!
tokenizer.fit_on_texts(train_sentences)

#Training set
tokenized_train = tokenizer.texts_to_sequences(train_sentences)
X_train = sequence.pad_sequences(tokenized_train, maxlen=maxlen)

#Test set
tokenized_test = tokenizer.texts_to_sequences(test_sentences)
X_test = sequence.pad_sequences(tokenized_test, maxlen=maxlen)

In [None]:
#Print a random matrix
X_train[r]
# summarize what was learned -> Si quiero ver el tokenizer que aprendio usando los 2 parametros (Max_features,max_length)
#print(t.word_counts)
#print(t.document_count)
#print(t.word_index)
#print(t.word_docs)

### Validation dataset

In [None]:
from sklearn.model_selection import train_test_split
# Create a validation dataset
validation_size = 0.2
X_train, X_valid, train_labels, valid_labes = train_test_split(X_train, train_labels, test_size = validation_size)

In [None]:
#Delete memory reference (?)
del tokenized_test, tokenized_train, tokenizer, train_sentences, test_sentences
#Garbage collector
gc.collect()

## Model

In [None]:
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint 
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.normalization import BatchNormalization
from keras import optimizers
from keras import initializers
from keras.models import Model, Sequential
from keras.layers import Convolution1D, MaxPooling1D, GlobalAveragePooling1D, BatchNormalization, LSTM, GRU
from keras.layers.embeddings import Embedding

In [58]:
#Defino los parametros del modelo:
p = 0.10 #Dropout
lr = 0.0001 #Learning Rate
batch_size = 2048
epochs = 4 #Bajamos de 10 -> 4.

#Embedding size -> Ver para que sirve, todavia falta entenderlo?
embed_size = 128
#CNN_Filters
CNN_Filters = embed_size * 2
#RNN
RNN_Neurons = 128
time_steps = 0

In [60]:
# Creo el modelo
model=Sequential()
#Embedding
model.add(Embedding(max_features, embed_size, input_length=maxlen))
model.add(Dropout(p))
#CNN
model.add(Convolution1D(filters=CNN_Filters, kernel_size=3, padding="same", name='Conv1'))
model.add(Activation('relu'))
model.add(Convolution1D(filters=CNN_Filters, kernel_size=3, padding="same", name='Conv2'))
model.add(Activation('relu'))
#RNN
model.add(GRU(RNN_Neurons, return_sequences=True))
model.add(Dropout(p * 2))
model.add(GRU(RNN_Neurons * 2, return_sequences=True))
model.add(GRU(RNN_Neurons * 4))
#Dense
model.add(Dropout(p * 2))
model.add(Dense(64))
model.add(Activation('relu'))

model.add(Dropout(p))
model.add(Dense(1, activation='softmax'))

model.summary()

#Optimizers
ADAM = optimizers.Adam(lr=lr)
model.compile(loss = 'binary_crossentropy', optimizer=ADAM, metrics=['binary_accuracy'])
#Accuracy en metrics es más generico, y depende de la LOSS.
#model.compile(loss = 'binary_crossentropy', optimizer=ADAM, metrics=['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_18 (Embedding)     (None, 100, 128)          2560000   
_________________________________________________________________
dropout_34 (Dropout)         (None, 100, 128)          0         
_________________________________________________________________
Conv1 (Conv1D)               (None, 100, 256)          98560     
_________________________________________________________________
activation_27 (Activation)   (None, 100, 256)          0         
_________________________________________________________________
Conv2 (Conv1D)               (None, 100, 256)          196864    
_________________________________________________________________
activation_28 (Activation)   (None, 100, 256)          0         
_________________________________________________________________
gru_4 (GRU)                  (None, 100, 128)          147840    
__________

In [61]:
# Callbacks

## Callback para guardar pesos
checkpointer = ModelCheckpoint(filepath='Sentiment_Analysis_Amazon_Reviews.hdf5', monitor='val_loss'
                                   ,verbose=1, save_best_only=True, mode='min')
callbacks = [checkpointer]

In [63]:
# Fit del modelo
"""
model.fit(X_train,train_labels
          ,epochs=epochs
          ,batch_size = batch_size          
          ,shuffle = True
          ,validation_data = (X_valid,valid_labes)
          ,callbacks=callbacks)
"""

# Fit del modelo -> Usando solo un fragmento del datasset
model.fit(X_train[:100000], train_labels[:100000]
          ,epochs=epochs
          ,batch_size = batch_size          
          ,shuffle = True
          ,validation_split=0.20
          ,callbacks=callbacks)


Train on 80000 samples, validate on 20000 samples
Epoch 1/4

Epoch 00001: val_loss improved from inf to 7.96960, saving model to Sentiment_Analysis_Amazon_Reviews.hdf5
Epoch 2/4

Epoch 00002: val_loss did not improve
Epoch 3/4

KeyboardInterrupt: 

## Test 

In [None]:
score, acc = model.evaluate(X_test, test_labels, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

# Modelo de QRNN

In [64]:
from keras.models import Model, Sequential
from keras.layers import Dense, Embedding, Input, Conv1D, GlobalMaxPool1D, Dropout, concatenate, Layer, InputSpec, CuDNNLSTM
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import backend as K
from keras import activations, initializers, regularizers, constraints
from keras.utils.conv_utils import conv_output_length
from keras.regularizers import l2
from keras.constraints import maxnorm

In [66]:
def cudnnlstm_model(conv_layers = 2, max_dilation_rate = 3):
    embed_size = 128
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size)(inp)
    x = Dropout(0.25)(x)
    x = Conv1D(2*embed_size, kernel_size = 3)(x)
    prefilt = Conv1D(2*embed_size, kernel_size = 3)(x)
    x = prefilt
    for strides in [1, 1, 2]:
        x = Conv1D(128*2**(strides), strides = strides, kernel_regularizer=l2(4e-6), bias_regularizer=l2(4e-6), kernel_size=3, kernel_constraint=maxnorm(10), bias_constraint=maxnorm(10))(x)
    x_f = CuDNNLSTM(512, kernel_regularizer=l2(4e-6), bias_regularizer=l2(4e-6), kernel_constraint=maxnorm(10), bias_constraint=maxnorm(10))(x)  
    x_b = CuDNNLSTM(512, kernel_regularizer=l2(4e-6), bias_regularizer=l2(4e-6), kernel_constraint=maxnorm(10), bias_constraint=maxnorm(10))(x)
    x = concatenate([x_f, x_b])
    x = Dropout(0.5)(x)
    x = Dense(64, activation="relu")(x)
    x = Dropout(0.1)(x)
    x = Dense(1, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['binary_accuracy'])

    return model

cudnnlstm_model = cudnnlstm_model()
cudnnlstm_model.summary()

batch_size = 2048
epochs = 4

weight_path="early_weights.hdf5"
checkpoint = ModelCheckpoint(weight_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
early_stopping = EarlyStopping(monitor="val_loss", mode="min", patience=5)
callbacks = [checkpoint, early_stopping]

Instructions for updating:
Use the retry module or similar alternatives.




__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_20 (Embedding)        (None, 100, 128)     2560000     input_2[0][0]                    
__________________________________________________________________________________________________
dropout_39 (Dropout)            (None, 100, 128)     0           embedding_20[0][0]               
__________________________________________________________________________________________________
conv1d_3 (Conv1D)               (None, 98, 256)      98560       dropout_39[0][0]                 
__________________________________________________________________________________________________
conv1d_4 (

In [68]:
cudnnlstm_model.fit(X_train[:100000]
                    ,train_labels[:100000]
                    ,batch_size=batch_size
                    ,epochs=epochs
                    ,shuffle = True
                    ,validation_split=0.20
                    ,callbacks=callbacks)

Train on 80000 samples, validate on 20000 samples
Epoch 1/4

Epoch 00001: val_loss improved from inf to 0.34887, saving model to early_weights.hdf5
Epoch 2/4

Epoch 00002: val_loss improved from 0.34887 to 0.30955, saving model to early_weights.hdf5
Epoch 3/4

Epoch 00003: val_loss improved from 0.30955 to 0.27859, saving model to early_weights.hdf5
Epoch 4/4

Epoch 00004: val_loss did not improve


<keras.callbacks.History at 0x7f0b3c74b470>

In [69]:
cudnnlstm_model.load_weights(weight_path)
score, acc = cudnnlstm_model.evaluate(X_test, test_labels, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 0.2790753042125702
Test accuracy: 0.8894675
