In [9]:
#use extra dataset to test 
import os
os.environ['KERAS_BACKEND']='theano'
import re
import numpy as np
from bs4 import BeautifulSoup
from nltk import tokenize
import itertools
from keras.models import model_from_json
from keras.optimizers import rmsprop
#load the model
from keras.engine.topology import Layer, InputSpec
from keras import backend as K
from sklearn.metrics import confusion_matrix
import numpy as np
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from keras.utils.np_utils import to_categorical
import nltk
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
import pandas as pd
nltk.download('punkt')
import pickle


SEN_NUM = 15
WORDS_NUM = 100
MAX_NB_WORDS = 18907
CONTEXT_DIM = 100


class AttLayer(Layer):
    def __init__(self, regularizer=None, **kwargs):
        self.regularizer = regularizer
        self.supports_masking = True
        super(AttLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3
        self.W = self.add_weight(name='W', shape=(input_shape[-1], CONTEXT_DIM), initializer='normal', trainable=True,
                                 regularizer=self.regularizer)
        self.b = self.add_weight(name='b', shape=(CONTEXT_DIM,), initializer='normal', trainable=True,
                                 regularizer=self.regularizer)
        self.u = self.add_weight(name='u', shape=(CONTEXT_DIM,), initializer='normal', trainable=True,
                                 regularizer=self.regularizer)
        super(AttLayer, self).build(input_shape)  # be sure you call this somewhere!

    def call(self, x, mask=None):
        eij = K.dot(K.tanh(K.dot(x, self.W) + self.b), self.u)
        ai = K.exp(eij)
        alphas = ai / K.sum(ai, axis=1).dimshuffle(0, 'x')
        if mask is not None:
            # use only the inputs specified by the mask
            alphas *= mask
        weighted_input = x * alphas.dimshuffle(0, 1, 'x')
        return weighted_input.sum(axis=1)

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])

    def get_config(self):
        config = {}
        base_config = super(AttLayer, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

    def compute_mask(self, inputs, mask):
        return None



def clean_text_to_text(review):
    # remove the the',",\
    review = re.sub(r"\\", "", review)
    review = re.sub(r"\'", "", review)
    review = re.sub(r"\"", "", review)
    # return the lower case
    text = review.strip().lower()

    return text


def get_normalized_data(data):
    reviews = []
    review_sentences = []
    review_tokens = []

    # clean the test dataset
    for review in data["Column2"]:
        text = BeautifulSoup(review)
        cleaned_text = text.get_text().encode('ascii', 'ignore')
        cleaned_string = cleaned_text.decode('utf-8')
        cleaned_review = clean_text_to_text(cleaned_string)
        reviews.append(cleaned_review)
        sentence = tokenize.sent_tokenize(cleaned_review)
        # number of the review
        review_sentences.append(sentence)

        for s in sentence:
            if (len(s) > 0):
                tokens = text_to_word_sequence(s)
                # filter out non-alpha
                tokens = [token for token in tokens if token.isalpha()]
                # filter out those short letters
                tokens = [t for t in tokens if len(t) > 1]
                review_tokens.append(tokens)

    return reviews, review_sentences, review_tokens

def get_matrix(reviews, sentences, vocabulary):
    texts_matrix = np.zeros((len(reviews), SEN_NUM, WORDS_NUM), dtype='int32')
    print(texts_matrix.shape)

    for index1, review in enumerate(sentences):
        for index2, sentence in enumerate(review):
            if (index2 < SEN_NUM):
                tokens = text_to_word_sequence(sentence)
                count = 0
                non_exist = 0
                for _, w in enumerate(tokens):
                    if (w not in vocabulary.keys()):
                        print("non_exist")
                        print(w)
                        non_exist += 1
                        continue
                    if (count < WORDS_NUM and vocabulary[w] < MAX_NB_WORDS):
                        texts_matrix[index1, index2, count] = vocabulary[w]
                        count = count + 1

    return texts_matrix

#------add extra dataset-------
extra_train = pd.read_csv("/Users/xiaoyiwen/Desktop/MasterProject/MasterProject/data_Preprocessing/Datasets/kaggle_data/training_set.csv")
extra_test = pd.read_csv("/Users/xiaoyiwen/Desktop/MasterProject/MasterProject/data_Preprocessing/Datasets/kaggle_data/test_set.csv")


reviews1, sentences1, tokens1 = get_normalized_data(extra_train)

reviews2, sentences2, tokens2 = get_normalized_data(extra_test)

#-------create---vocab--------
reviews =reviews1 + reviews2

tokenizer = Tokenizer()
tokenizer.fit_on_texts(reviews)

vocabulary = tokenizer.word_index

print("all the vocabulary(train + test)")
print(vocabulary)
print(len(vocabulary))


#------gettrain_mtrix------
x_test= get_matrix(reviews1, sentences1, vocabulary)

print(x_test.shape)


#-----get_labels----------
labels = []
for sentiment in extra_train["Column1"]:
    labels.append(sentiment)
    
y_test = to_categorical(np.asarray(labels))

#------loaded_the model-------------
json_file = open('/Users/xiaoyiwen/Desktop/MasterProject/MasterProject/data_Preprocessing/HAN_Classifier/HAN_Model/HAN1_rmsprop_L20.005_15.json','r')

loaded_model_json = json_file.read()
json_file.close()

loaded_model = model_from_json(loaded_model_json,custom_objects={'AttLayer':AttLayer()})
#load_weight
loaded_model.load_weights("/Users/xiaoyiwen/Desktop/MasterProject/MasterProject/data_Preprocessing/HAN_Classifier/HAN_Model/HAN1_rmsprop_L20.005_15.h5")

print("loaded model from disk")

optimizer1 = rmsprop(lr=0.001, rho=0.9, epsilon=None, decay=0.0)
loaded_model.compile(loss = 'categorical_crossentropy',optimizer=optimizer1,metrics=['acc'])

score, acc = loaded_model.evaluate(x_test, y_test, batch_size=125, verbose=2)
print('Test score:', score)
print('Test accuracy:', acc)



[nltk_data] Downloading package punkt to /Users/xiaoyiwen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


all the vocabulary(train + test)
18907
(6397, 15, 100)
(6397, 15, 100)


loaded model from disk
Test score: 0.879778649689
Test accuracy: 0.515554164664
