In [1]:
from __future__ import print_function
import os
import numpy as np
np.random.seed(1337)

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input, Flatten
from keras.layers import Embedding
from keras.models import Model
import pandas as pd
import sys

Using TensorFlow backend.


### Importing texts

In [2]:
# Load the ABSA dataset and discard the rows that carry no category label.
input_data = pd.read_csv("ABSACOMBINED.csv")
print('Before removing NaN value: ', input_data.shape)
input_data = input_data.dropna(subset=['category'])
print('After removing NaN value: ', input_data.shape)

# One row per review: every category label of that review joined into a
# single comma-separated string (used later as the "actual" answer).
grouped_df = input_data.groupby('review_id')
grouped_data = (
    grouped_df['category']
    .agg(lambda labels: ",".join(labels))
    .reset_index(name='category')
)
print(grouped_data)

# Per-phrase columns: `phrase` is the model input, `category` the target,
# and `review_id` links each phrase back to its review.
review_id = input_data['review_id']
phrase = input_data['phrase']
category = input_data['category']
print(category.value_counts())

Before removing NaN value:  (4579, 5)
After removing NaN value:  (4150, 5)
      review_id                                           category
0             0                                 RESTAURANT#GENERAL
1             1                    SERVICE#GENERAL,SERVICE#GENERAL
2             2  SERVICE#GENERAL,SERVICE#GENERAL,SERVICE#GENERA...
3             3                          FOOD#QUALITY,FOOD#QUALITY
4             4              FOOD#STYLE_OPTIONS,FOOD#STYLE_OPTIONS
...         ...                                                ...
2151       3155              FOOD#STYLE_OPTIONS,FOOD#STYLE_OPTIONS
2152       3157                                       FOOD#QUALITY
2153       3158                                     DRINKS#QUALITY
2154       3159                                   AMBIENCE#GENERAL
2155       3160                                  RESTAURANT#PRICES

[2156 rows x 2 columns]
FOOD#QUALITY                1368
SERVICE#GENERAL              747
AMBIENCE#GENERAL             483

In [3]:
MAX_SEQ_LENGTH = 10  # length every phrase is padded to; most phrases are within 10 tokens
MAX_NB_WORDS = 400000  # tokenizer vocabulary cap, set from the number of words in the GloVe file (should have no effect, as only 2744 tokens are found below)
EMBEDDING_DIM = 100  # chosen to match GloVe 100d; NOTE(review): not referenced in the visible cells

#### Vectorize the text samples into a 2D integer tensor and pad the sentences

In [4]:
# Fit a case-preserving tokenizer (lower=False) on the phrases, then turn
# every phrase into its list of word indices.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=False)
tokenizer.fit_on_texts(phrase)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(phrase)

print('Found %s unique tokens.' % len(word_index))
print("Let's have a quick look at the word_index data..")
first_entries = list(word_index.items())[:10]
print(first_entries)

Found 2744 unique tokens.
Let's have a quick look at the word_index data..
[('food', 1), ('great', 2), ('good', 3), ('service', 4), ('place', 5), ('nice', 6), ('best', 7), ('excellent', 8), ('restaurant', 9), ('menu', 10)]


In [5]:
# Bring every index sequence to exactly MAX_SEQ_LENGTH entries; shorter ones
# are zero-padded at the end (padding='post').
tokenised_sequence = pad_sequences(sequences, maxlen=MAX_SEQ_LENGTH, padding='post')
print(tokenised_sequence)
print('Shape of data tensor:', tokenised_sequence.shape)

[[1946 1211 1947 ...    0    0    0]
 [ 486  899 1456 ...   12 1948  199]
 [ 486  899 1456 ...   12    0    0]
 ...
 [   3    1   58 ...    0    0    0]
 [   3    1   58 ...    0    0    0]
 [   3    1   58 ...    0    0    0]]
Shape of data tensor: (4150, 10)


#### defining output data

In [6]:
# Plain Python lists of the per-phrase review ids and category labels, so
# they can be walked positionally alongside tokenised_sequence.
review_id_list = review_id.tolist()
category_list = category.tolist()

In [7]:
# from imblearn.over_sampling import (RandomOverSampler, 
#                                     SMOTE, 
#                                     ADASYN)

from collections import Counter
from sklearn.utils import class_weight

# Only these four categories (the largest classes) are kept for classification.
class_list = ['FOOD#QUALITY', 'SERVICE#GENERAL', 'AMBIENCE#GENERAL', 'RESTAURANT#GENERAL']
filtered_phrase = []
filtered_category = []
filtered_id = []

# Walk the three per-phrase collections in lockstep instead of relying on a
# hard-coded row count, so the cell keeps working if the CSV changes size.
for row_id, row_sequence, row_category in zip(review_id_list, tokenised_sequence, category_list):
    if row_category in class_list:
        filtered_id.append(row_id)
        filtered_phrase.append(row_sequence)
        filtered_category.append(row_category)

# This is without oversampling; the accuracy improved from 0.2 to 0.4, but
# the predicted probabilities were still all the same.
np_phrase = np.array(filtered_phrase)
np_category = np.array(filtered_category)
print(sorted(Counter(np_category).items()))

# smote_x, smote_y = SMOTE().fit_sample(filtered_x, filtered_y)

# Class weights to handle the imbalanced data. `classes` and `y` are passed
# by keyword because newer scikit-learn releases no longer accept them
# positionally; the keyword form is valid on older releases too.
class_weights = class_weight.compute_class_weight('balanced',
                                                 classes=np.unique(np_category),
                                                 y=np_category)
print(class_weights)

[('AMBIENCE#GENERAL', 483), ('FOOD#QUALITY', 1368), ('RESTAURANT#GENERAL', 440), ('SERVICE#GENERAL', 747)]
[1.57246377 0.55519006 1.72613636 1.0167336 ]


**transform output data into categorical index**

In [8]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Map each category string to an integer class index. The fitted encoder is
# reused later to turn predicted indices back into category names.
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(np_category)

# One-hot encode the class indices. The encoder is left at its default
# (sparse) output and densified with .toarray(): the `sparse=False` keyword
# was renamed to `sparse_output` and later removed in scikit-learn, so this
# form avoids depending on either spelling.
onehot_encoder = OneHotEncoder()
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)  # column vector, one sample per row
onehot_encoded = onehot_encoder.fit_transform(integer_encoded).toarray()
print(onehot_encoded)
print(onehot_encoded.shape)

[[0. 0. 1. 0.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 ...
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]]
(3038, 4)


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


**Tidy up all variables**

In [9]:
# 80% as training data
# 20% as testing data
# The rows are split in their existing order (no shuffling). The boundary is
# derived from the number of samples instead of hard-coded indices; rounding
# up reproduces the original 2431 training rows for the 3038 filtered samples.
TRAIN_FRACTION = 0.8
split_index = int(np.ceil(TRAIN_FRACTION * len(np_phrase)))

x_train = np_phrase[:split_index]
x_test = np_phrase[split_index:]
y_train = onehot_encoded[:split_index]
y_test = onehot_encoded[split_index:]
id_test = filtered_id[split_index:]  # review ids aligned with x_test / y_test

Training and testing data 

In [10]:
from sklearn.model_selection import *
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, r2_score

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [11]:
# Collapse the one-hot rows back to class indices for fitting and scoring.
train_labels = np.argmax(y_train, axis=1)
test_labels = np.argmax(y_test, axis=1)

# Standardise the padded index sequences, then fit an SVC on them.
svm = make_pipeline(StandardScaler(), SVC(gamma="auto"))
svm.fit(x_train, train_labels)
res = svm.predict(x_test)

# Accuracy on the held-out phrases.
print(svm.score(x_test, test_labels))

0.5189456342668863


In [12]:
# Turn the predicted class indices back into category names.
pre_res = label_encoder.inverse_transform(res)

In [13]:
# Pair every test phrase's review id with its predicted category, then merge
# the predictions of each review into one comma-separated string.
prediction_columns = {'review_id': id_test, 'predicted_category': pre_res}
df = pd.DataFrame(prediction_columns)
predicted_by_review = df.groupby('review_id')['predicted_category']
predicted_data = predicted_by_review.apply(','.join)
predicted_data  # this is in series format

review_id
2400    RESTAURANT#GENERAL
2403          FOOD#QUALITY
2404          FOOD#QUALITY
2407          FOOD#QUALITY
2409          FOOD#QUALITY
               ...        
3150          FOOD#QUALITY
3152          FOOD#QUALITY
3153          FOOD#QUALITY
3157          FOOD#QUALITY
3159          FOOD#QUALITY
Name: predicted_category, Length: 366, dtype: object

In [14]:
# Persist both sides of the comparison. grouped_data holds the actual
# categories of every review in the dataset; predicted_data only covers the
# reviews that have phrases in the test split.
grouped_data.to_csv("svm_actual_category.csv")
predicted_data.to_csv("svm_predicted_category.csv")

  


In [15]:
# Review-level accuracy: a review counts as correct only when every one of
# its actual categories appears among the categories predicted for it.

# Build the review_id -> "cat1,cat2,..." lookup once, instead of filtering
# the whole grouped_data frame on every loop iteration.
actual_by_review = grouped_data.set_index('review_id')['category']

no_of_sentence = predicted_data.size
no_of_sentence_correct = 0
for index, value in predicted_data.items():
    predicted_value = set(value.split(','))
    actual_value = actual_by_review.loc[index].split(',')
    if all(elem in predicted_value for elem in actual_value):
        no_of_sentence_correct += 1

# Guard against an empty prediction set rather than dividing by zero.
print(no_of_sentence_correct/no_of_sentence if no_of_sentence else 0.0)

# END HERE

0.5519125683060109


### User Input

In [16]:
import nltk
from nltk.tokenize import word_tokenize 
from spellchecker import SpellChecker
import string
import re
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer with English stop words removed.
# NOTE(review): `vect` is not referenced in the visible cells.
vect = CountVectorizer(max_df=1.0,stop_words='english')  

# Location of the Java executable for the Stanford POS tagger.
# NOTE(review): machine-specific absolute path - adjust per environment.
# java_path = "C:/Program Files/Java/jdk-11.0.2/bin/java.exe" - Wei Ming's Java path
java_path = "C:/Program Files/Java/jdk1.8.0_201/bin/java.exe"
# JAVA_HOME is pointed at the executable itself (not a directory) and must
# be set before the tagger is constructed below.
os.environ['JAVA_HOME'] = java_path 

# For the Stanford POS tagger: the model and jar are expected in a
# "stanford-postagger-2018-10-16" folder inside the current working directory.
home = os.getcwd() + "/stanford-postagger-2018-10-16"
from nltk.tag.stanford import StanfordPOSTagger as POS_Tag
from nltk import word_tokenize
_path_to_model = home + '/models/english-bidirectional-distsim.tagger' 
_path_to_jar = home + '/stanford-postagger.jar'
# Shared tagger instance used by posTag.
stanford_tag = POS_Tag(model_filename=_path_to_model, path_to_jar=_path_to_jar)

def posTag(review):
    """Tag every text in `review` with the Stanford POS tagger.

    Args:
        review: iterable of text strings.

    Returns:
        A list with one entry per input text; each entry is whatever
        `stanford_tag.tag` returns for the word-tokenized text.
    """
    return [stanford_tag.tag(word_tokenize(sentence)) for sentence in review]

def filterTag(tagged_review):
    """Keep only nouns, adverbs, adjectives and verbs of each tagged text.

    Args:
        tagged_review: iterable of tagged texts, each an iterable of
            (word, tag) pairs.

    Returns:
        A list with one string per tagged text: the retained words joined
        by single spaces (an empty string when nothing is retained).
    """
    kept_tags = (
        'NN', 'NNS', 'NNP', 'NNPS',                # nouns
        'RB', 'RBR', 'RBS',                        # adverbs
        'JJ', 'JJR', 'JJS',                        # adjectives
        'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',   # verbs
    )
    return [
        ' '.join(word for word, tag in tagged_text if tag in kept_tags)
        for tagged_text in tagged_review
    ]


def word_lengthening(sentence):
    """Reduce duplicated letters in each word to a maximum of 2.

    Any character repeated three or more times in a row is shortened to
    exactly two occurrences (e.g. "soooo" -> "soo").

    Args:
        sentence: iterable of words (tokens), not a single string.

    Returns:
        A list with the shortened form of each word, in the same order.
    """
    repeated_run = re.compile(r"(.)\1{2,}")
    reduced_list = []
    for token in sentence:
        # Replace the whole run with two copies of the repeated character.
        reduced_list.append(repeated_run.sub(r"\1\1", token))
    return reduced_list

# Perform spell correction
# Downside: Some names/abbreviations are also used for spell correction which could cause some inconsistency.

# Module-level spell checker instance, created once and used by spell_correction.
spell = SpellChecker()

def spell_correction(sentence):
    """Spell-correct every word of a tokenized sentence.

    Args:
        sentence: iterable of words (tokens), not a single string.

    Returns:
        A list of strings, one per input word. A word is kept unchanged
        when the spell checker has no suggestion for it.
    """
    corrected_words = []
    for word in sentence:
        suggestion = spell.correction(word)
        # Recent pyspellchecker releases return None when no candidate is
        # found; keep the original word so callers can still " ".join(...).
        corrected_words.append(word if suggestion is None else suggestion)
    return corrected_words

def remove_punct(my_sentence):
    """Replace every punctuation character in `my_sentence` with a space.

    Args:
        my_sentence: the input string.

    Returns:
        A string of the same length in which each character listed in
        `string.punctuation` has been replaced by a single space.
    """
    blanks = ' ' * len(string.punctuation)
    return my_sentence.translate(str.maketrans(string.punctuation, blanks))

In [19]:
# Aspect analysis of the user's input.
user_input = input("Enter a restaurant review:\n\n")

# Remove punctuations from sentence
user_input = remove_punct(user_input)

# Tokenize input sentence
token_input = nltk.word_tokenize(user_input)
print("Tokenize words: ", token_input)

# Perform word correction: shorten stretched words first, then spell-check.
word_correction = word_lengthening(token_input)
correct_sentence = spell_correction(word_correction)
joined_words = " ".join(correct_sentence)
print("Spelling correction: ", joined_words)

# POS-tag the corrected sentence and keep only the content words.
tagged_user_input = posTag([joined_words])
print("Part-of-Speech Tagging: ", tagged_user_input)
filter_tagged_user_input = filterTag(tagged_user_input)
print("Filtered Part-of-Speech Tagging: ", filter_tagged_user_input)

# Vectorize with the tokenizer fitted on the training phrases. Dedicated
# names are used so the training-time variables (sequences,
# tokenised_sequence, np_phrase) are not overwritten by this cell.
user_sequences = tokenizer.texts_to_sequences(filter_tagged_user_input)
user_padded = pad_sequences(user_sequences, maxlen=MAX_SEQ_LENGTH, padding='post')

# Predict the class index and also report the readable category name.
predict_aspect = svm.predict(user_padded)
print("Category: ", predict_aspect)
print("Category name: ", label_encoder.inverse_transform(predict_aspect))

Enter a restaurant review:

I loove this restaurant
Tokenize words:  ['I', 'loove', 'this', 'restaurant']
Spelling correction:  I love this restaurant
Part-of-Speech Tagging:  [[('I', 'PRP'), ('love', 'VBP'), ('this', 'DT'), ('restaurant', 'NN')]]
Filtered Part-of-Speech Tagging:  ['love restaurant']
Category:  [1]
