In [3]:
from __future__ import print_function
import os
import numpy as np
np.random.seed(1337)
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
import joblib
import nltk
from nltk.tag.stanford import StanfordPOSTagger as POS_Tag
from nltk import word_tokenize
from spellchecker import SpellChecker
import string
import re
from nltk import sent_tokenize
import talos

def predictInput(user_input):
    # import text and remove those with null category
    input_data_category = pd.read_csv("ABSACOMBINED.csv")
    input_data_category = input_data_category.dropna(subset=['category'])

    input_data_polarity = pd.read_csv("ABSACOMBINED.csv")
    input_data_polarity = input_data_polarity.dropna(subset=['polarity'])

    # define x_train and y_train data
    phrase_category = input_data_category.phrase
    category = input_data_category.category

    phrase_polarity = input_data_polarity.phrase
    polarity = input_data_polarity.polarity

    MAX_SEQ_LENGTH = 10  # most of the phrase is within length of 10
    MAX_NB_WORDS = 400000  # I set this based on the number of words found in the glove.txt

    ############## THIS PART FOR ASPECT CATEGORY ##############
    cat_tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=False)
    cat_tokenizer.fit_on_texts(phrase_category)
    cat_sequences = cat_tokenizer.texts_to_sequences(phrase_category)

    category_list = category.tolist()

    class_list = ['FOOD#QUALITY', 'SERVICE#GENERAL', 'AMBIENCE#GENERAL', 'RESTAURANT#GENERAL']
    filtered_category = []

    for i in range(0, 4150):  # to choose the top 4 largest class
        if category_list[i] in class_list:
            filtered_category.append(category_list[i])

    np_category = np.array(filtered_category)

    cat_label_encoder = LabelEncoder()
    cat_integer_encoded = cat_label_encoder.fit_transform(np_category)

    # binary encode
    cat_onehot_encoder = OneHotEncoder(sparse=False)
    cat_integer_encoded = cat_integer_encoded.reshape(len(cat_integer_encoded), 1)


    ############## THIS PART FOR SENTIMENT POLARITY ##############
    pol_tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=False)
    pol_tokenizer.fit_on_texts(phrase_polarity)
    pol_sequences = pol_tokenizer.texts_to_sequences(phrase_polarity)

    pol_label_encoder = LabelEncoder()
    pol_integer_encoded = pol_label_encoder.fit_transform(polarity)
    # binary encode
    pol_onehot_encoder = OneHotEncoder(sparse=False)
    pol_integer_encoded = pol_integer_encoded.reshape(len(pol_integer_encoded), 1)

    ###### THIS PART IS USER INPUT AND OUTPUT #####

    # Model already saved, so we can just load the model.
    filename_1a = 'finalized_hybrid_lstm_aspect.sav'
    filename_1b = 'finalized_hybrid_svm_aspect.sav'
    loaded_model_aspect_lstm = joblib.load(filename_1a)
    loaded_model_aspect_svm = joblib.load(filename_1b)

    filename_2a = 'finalized_hybrid_lstm_sentiment.sav'
    filename_2b = 'finalized_hybrid_svm_sentiment.sav'
    loaded_model_sentiment_lstm = joblib.load(filename_2a)
    loaded_model_sentiment_svm = joblib.load(filename_2b)
    
    vect = CountVectorizer(max_df=1.0,stop_words='english')

    ## User Input Part ##
    cat_sequences = cat_tokenizer.texts_to_sequences(user_input)
    cat_tokenised_sequence = pad_sequences(cat_sequences, maxlen=MAX_SEQ_LENGTH, padding='post')
    cat_np_phrase = np.array(cat_tokenised_sequence)

    pol_sequences = pol_tokenizer.texts_to_sequences(user_input)
    pol_tokenised_sequence = pad_sequences(pol_sequences, maxlen=MAX_SEQ_LENGTH, padding='post')
    pol_np_phrase = np.array(pol_tokenised_sequence)

    from keras import backend as K
    
    outputs = []
    for i in range(3, 4):
        layer = loaded_model_aspect_lstm.layers[i]
        keras_function = K.function([loaded_model_aspect_lstm.input], [layer.output])
        outputs.append(keras_function([cat_np_phrase, 1]))
    out = np.array(outputs)
    r_out = np.reshape(out, (out.shape[2], 10))
    predict_aspect = []
    for elem in r_out:
        e_out = np.expand_dims(elem, axis=0)
        predict_aspect.append(loaded_model_aspect_svm.predict(e_out)[0])
    sort_list = sorted(class_list)
    
    outputs = []
    for i in range(3, 4):
        layer = loaded_model_sentiment_lstm.layers[i]
        keras_function = K.function([loaded_model_sentiment_lstm.input], [layer.output])
        outputs.append(keras_function([cat_np_phrase, 1]))
    out = np.array(outputs)
    r_out = np.reshape(out, (out.shape[2], 100))
    e_out = np.expand_dims(r_out, axis=0)
    #predict_sentiment.append(loaded_model_sentiment_svm.predict(e_out)[0])
    predict_sentiment = []
    for elem in r_out:
        e_out = np.expand_dims(elem, axis=0)
        predict_sentiment.append(loaded_model_sentiment_svm.predict(e_out)[0])
    
    sort_list = sorted(class_list)
    polarity_list = ['negative', 'neutral', 'positive']
    for i in range(len(user_input)):
        print("\n")
        print("Phrase " + str(i+1) + ":", user_input[i])
        print("Category: ", sort_list[predict_aspect[i]])
        print("Sentiment: ", polarity_list[predict_sentiment[i]])


def getAspectInput(tagged_review):
    aspectlist=[]
    for text_list in tagged_review:
        for word,tag in text_list:
            if tag in ['NN','NNS','NNP','NNPS']:
                aspectlist.append(word)
    return aspectlist

def get_all_phrases_containing_tar_wrd(target_word, tar_passage, left_margin=5, right_margin=5):
    """
        Function to get all the pharses that contain the target word in a text/passage tar_passage.
        Workaround to save the output given by nltk Concordance function

        str target_word --> aspect to be searched for
        str tar_passage  --> sentence extracted from a customer review
        int left_margin int right_margin --> left_margin and right_margin allocate the number of words/punctuation before and after target word
        Left margin will take note of the beginning of the text
    """

    ## Create list of tokens using nltk function
    tokens = nltk.word_tokenize(tar_passage)
    tokens = [x for x in tokens if len(x) > 2]
    ## Create the text of tokens
    text = nltk.Text(tokens)

    ## Collect all the index or offset position of the target word
    c = nltk.ConcordanceIndex(text.tokens, key=lambda s: s.lower())

    ## Collect the range of the words that is within the target word by using text.tokens[start;end].
    ## The map function is use so that when the offset position - the target range < 0, it will be default to zero

    concordance_txt = (
    [text.tokens[list(map(lambda x: x - 5 if (x - left_margin) > 0 else 0, [offset]))[0]:offset + right_margin] for
     offset in c.offsets(target_word)])

    ## join the sentences for each of the target phrase and return it
    return [''.join([x + ' ' for x in con_sub]) for con_sub in concordance_txt]

# java_path = "C:/Program Files/Java/jdk-11.0.2/bin/java.exe" - Wei Ming's Java path
java_path = "C:/Program Files/Java/jdk1.8.0_201/bin/java.exe"
os.environ['JAVA_HOME'] = java_path

#For stanford POS Tagger
home = os.getcwd() + "/stanford-postagger-2018-10-16"
_path_to_model = home + '/models/english-bidirectional-distsim.tagger'
_path_to_jar = home + '/stanford-postagger.jar'
stanford_tag = POS_Tag(model_filename=_path_to_model, path_to_jar=_path_to_jar)

#To tag using stanford pos tagger
def posTag(review):
    tagged_text_list=[]
    for text in review:
        tagged_text_list.append(stanford_tag.tag(word_tokenize(text)))
    return tagged_text_list

#Filter the word with tag- noun,adjective,verb,adverb
def filterTag(tagged_review):
    final_text_list=[]
    for text_list in tagged_review:
        final_text=[]
        for word,tag in text_list:
            if tag in ['NN','NNS','NNP','NNPS','RB','RBR','RBS','JJ','JJR','JJS','VB','VBD','VBG','VBN','VBP','VBZ']:
                final_text.append(word)
        final_text_list.append(' '.join(final_text))
    return final_text_list


# Reduce duplicated letters in a word to be maximum of 2.

def word_lengthening(sentence):
    list_words = sentence
    pattern = re.compile(r"(.)\1{2,}")
    reduced_list = [pattern.sub(r"\1\1", word) for word in list_words] # Perform reduce lenghtening
    return reduced_list

# Perform spell correction
# Downside: Some names/abbreviations are also used for spell correction which could cause some inconsistency.

spell = SpellChecker()

def spell_correction(sentence):
    list_words = sentence
    spell_list = [spell.correction(word) for word in list_words]
    return spell_list

# Remove punctuations from all sentences
def remove_punct(my_sentence):
    trans_table = str.maketrans(string.punctuation, ' '*len(string.punctuation))
    no_punct = my_sentence.translate(trans_table)
    return no_punct

isUserInput = True
while isUserInput:
    print("--------------------------------------------------------------------------------------------------")
    userInput = input("Enter a restaurant review (Type 'Q' to Quit):\n")
    if userInput != "Q":
        # Remove punctuations from sentence
        userInput = userInput.lower()
        userInput = remove_punct(userInput)

        # Tokenize input sentence
        token_input = nltk.word_tokenize(userInput)
        # print("Tokenize words: ", token_input)

        # Perform word correction
        word_correction = word_lengthening(token_input)
        correct_sentence = spell_correction(word_correction)
        joined_words = (" ".join(correct_sentence))

        # Preprocessing and vectorizing
        tagged_user_input = posTag([joined_words])
        # print("Part-of-Speech Tagging: ", tagged_user_input)
        filter_tagged_user_input = filterTag(tagged_user_input)
        # print("Filtered Part-of-Speech Tagging: ", filter_tagged_user_input[0])
        aspect_list = getAspectInput(tagged_user_input)

        phrases=[]

        for sentence in sent_tokenize(filter_tagged_user_input[0]):
            for important_word in aspect_list:
                phrases_in_sentence = get_all_phrases_containing_tar_wrd(important_word, sentence, left_margin = 5, right_margin = 5)
                for phrase in phrases_in_sentence:
                    phrases.append(phrase)

        phrases = list(set(phrases))
        predictInput(phrases)
    else:
        isUserInput = False
        break


--------------------------------------------------------------------------------------------------
Enter a restaurant review (Type 'Q' to Quit):
waiter was rude but pasta was nice


Phrase 1: waiter was rude pasta was 
Category:  FOOD#QUALITY
Sentiment:  negative


Phrase 2: waiter was rude pasta was nice 
Category:  FOOD#QUALITY
Sentiment:  negative
--------------------------------------------------------------------------------------------------
Enter a restaurant review (Type 'Q' to Quit):
waiter was rude


Phrase 1: waiter was rude 
Category:  SERVICE#GENERAL
Sentiment:  negative
--------------------------------------------------------------------------------------------------
Enter a restaurant review (Type 'Q' to Quit):
pasta was tasty


Phrase 1: pasta was tasty 
Category:  FOOD#QUALITY
Sentiment:  positive
--------------------------------------------------------------------------------------------------
Enter a restaurant review (Type 'Q' to Quit):
music was good


Phrase 1: mu

KeyboardInterrupt: Interrupted by user