# Notebook14 Combined Aspect Sentiment Model

# Section 1: Import libraries

In [800]:
import pandas as pd
import numpy as np
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn import preprocessing
from sklearn.utils import class_weight
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping,ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense,Flatten,Embedding,Dropout
import contractions
import pickle
import string
import ast
import os
from spellchecker import SpellChecker
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model


# Section 2: Import data

In [910]:
X_train_accom = pd.read_csv("X_train_accom.csv",index_col = 0, squeeze = True)
X_val_accom = pd.read_csv("X_val_accom.csv",index_col = 0, squeeze = True)
X_test_accom = pd.read_csv("X_test_accom.csv",index_col = 0, squeeze = True)

X_train_food = pd.read_csv("X_train_food.csv",index_col = 0, squeeze = True)
X_val_food = pd.read_csv("X_val_food.csv",index_col = 0, squeeze = True)
X_test_food = pd.read_csv("X_test_food.csv",index_col = 0, squeeze = True)

X_train_attract = pd.read_csv("X_train_attract.csv",index_col = 0, squeeze = True)
X_val_attract = pd.read_csv("X_val_attract.csv",index_col = 0, squeeze = True)
X_test_attract = pd.read_csv("X_test_attract.csv",index_col = 0, squeeze = True)

y_train_accom = pd.read_csv("y_train_accom.csv",index_col = 0, squeeze = True)
y_val_accom = pd.read_csv("y_val_accom.csv",index_col = 0, squeeze = True)
y_test_accom = pd.read_csv("y_test_accom.csv",index_col = 0, squeeze = True)

y_train_food = pd.read_csv("y_train_food.csv",index_col = 0, squeeze = True)
y_val_food = pd.read_csv("y_val_food.csv",index_col = 0, squeeze = True)
y_test_food = pd.read_csv("y_test_food.csv",index_col = 0, squeeze = True)

y_train_attract = pd.read_csv("y_train_attract.csv",index_col = 0, squeeze = True)
y_val_attract = pd.read_csv("y_val_attract.csv",index_col = 0, squeeze = True)
y_test_attract = pd.read_csv("y_test_attract.csv",index_col = 0, squeeze = True)

# Section 3: Functions to encode the data

In [912]:
def lab(y_train, y_val, y_test):
    le = preprocessing.LabelEncoder()
    y_train_l = le.fit_transform(y_train)
    y_val_l = le.fit_transform(y_val)
    y_test_l = le.transform(y_test)
    return y_train_l,y_val_l, y_test_l

In [913]:
def bin(y_train, y_val, y_test):
    binarizer = preprocessing.LabelBinarizer()
    y_train_e = binarizer.fit_transform(y_train)
    y_val_e = binarizer.transform(y_val)
    y_test_e = binarizer.transform(y_test)
    return y_train_e,y_val_e,y_test_e

In [914]:
def tok(X_train, X_val, X_test):
    tokenizer = Tokenizer(num_words = 6000)
    tokenizer.fit_on_texts(X_train)
    X_train_s = tokenizer.texts_to_sequences(X_train)
    X_val_s = tokenizer.texts_to_sequences(X_val)
    X_test_s = tokenizer.texts_to_sequences(X_test)
    X_train_w = pad_sequences(np.array(X_train_s,dtype = "object"), maxlen=15, padding="post", truncating="post", value=0.0)
    X_val_w = pad_sequences(np.array(X_val_s,dtype = "object"), maxlen=15, padding="post", truncating="post", value=0.0)
    X_test_w = pad_sequences(np.array(X_test_s, dtype = "object"), maxlen = 15, padding = "post", truncating = "post", value = 0.0)
    
    return X_train_w, X_val_w, X_test_w

### Apply functions

In [915]:
# Apply function 1 and 2
y_train_accom_l, y_val_accom_l, y_test_accom_l = lab(y_train_accom, y_val_accom, y_test_accom)
y_train_food_l, y_val_food_l, y_test_food_l = lab(y_train_food, y_val_food, y_test_food)
y_train_attract_l, y_val_attract_l, y_test_attract_l = lab(y_train_attract, y_val_attract, y_test_attract)

y_train_accom_e, y_val_accom_e, y_test_accom_e = bin(y_train_accom, y_val_accom, y_test_accom)
y_train_food_e, y_val_food_e, y_test_food_e = bin(y_train_food, y_val_food, y_test_food)
y_train_attract_e, y_val_attract_e, y_test_attract_e = bin(y_train_attract, y_val_attract, y_test_attract)

In [916]:
# Apply function 3
X_train_accom_w, X_val_accom_w, X_test_accom_w = tok(X_train_accom, X_val_accom, X_test_accom)
X_train_food_w, X_val_food_w, X_test_food_w = tok(X_train_food, X_val_food, X_test_food)
X_train_attract_w, X_val_attract_w, X_test_attract_w = tok(X_train_attract, X_val_attract, X_test_attract)

In [917]:
# Get shape of training, validation and test sets
print(X_train_accom_w.shape,y_train_accom_e.shape)
print(X_train_food_w.shape, y_train_food_e.shape)
print(X_train_attract_w.shape, y_train_attract_e.shape)

(1776, 15) (1776, 7)
(1680, 15) (1680, 6)
(1320, 15) (1320, 7)


# Section 4: Build Models

In [918]:
# Apect model 1 for use with accommodation and attractions
modela = Sequential()
modela.add(Embedding(input_dim = 6000, output_dim = 16, input_length = 15))
modela.add(Flatten())
modela.add(Dense(512, activation='relu', input_shape=(1600,)))
modela.add(Dropout(rate=0.5))
modela.add(Dense(256,activation = 'relu'))
modela.add(Dropout(rate=0.5))
modela.add(Dense(128,activation = 'relu'))
modela.add(Dropout(rate=0.5))
modela.add(Dense(7, activation='softmax'))
modela.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
#modela.summary()

In [919]:
# Aspect model 2 for use with food
modelb = Sequential()
modelb.add(Embedding(input_dim = 6000, output_dim = 16, input_length = 15))
modelb.add(Flatten())
modelb.add(Dense(512, activation='relu', input_shape=(1600,)))
modelb.add(Dropout(rate=0.5))
modelb.add(Dense(256,activation = 'relu'))
modelb.add(Dropout(rate=0.5))
modelb.add(Dense(128,activation = 'relu'))
modelb.add(Dropout(rate=0.5))
modelb.add(Dense(5, activation='softmax'))
modelb.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])     
#modelb.summary()

In [920]:
# Sentiment model 1 for use for all categories
modelc = Sequential()
modelc.add(Embedding(input_dim = 6000, output_dim = 16, input_length=20))
modelc.add(Flatten())
modelc.add(Dense(64, activation='relu', input_shape=(1600,)))
modelc.add(Dense(1, activation='sigmoid'))
modelc.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
#modelc.summary()

In [921]:
# Run weights function to get weights
def weight(y_train):
    class_weights = class_weight.compute_class_weight('balanced',np.unique(y_train),y_train)
    return class_weights

In [922]:
# Output directory and callbacks
def out(name):
    output_dir = name
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    return output_dir
        
def call(output_dir,patience):
    modelcheckpoint = ModelCheckpoint(filepath=output_dir+"/weights.{epoch:02d}.hdf5")
    callbacks = [EarlyStopping(monitor = "val_loss", patience = patience),modelcheckpoint]
    return callbacks

In [923]:
def run_mod(model, X_train, X_val, y_train, y_val):
    val_acc = []
    history = model.fit(X_train, y_train,
                       epochs = 50,
                       verbose = 2,
                       callbacks = callbacks,
                       validation_data = (X_val, y_val),
                       batch_size = 10,
                       class_weight = class_weights)
    print("\n\n")
    print(model.evaluate(X_train, y_train))
    print(model.evaluate(X_val, y_val))
    print("\n\n")

### Apply functions

In [924]:
# Class weights
cw_accom = weight(y_train_accom.Aspect)
cw_food = weight(y_train_food.Aspect)
cw_attract = weight(y_train_attract.Aspect)

class_weights_accom = {0:cw_accom[0],1:cw_accom[1],2:cw_accom[2],3:cw_accom[3],4:cw_accom[4],5:cw_accom[5],6:cw_accom[6]}
class_weights_food = {0:cw_food[0],1:cw_food[1],2:cw_food[2],3:cw_food[3],4:cw_food[4],5:cw_food[5]}
class_weights_attract = {0:cw_attract[0],1:cw_attract[1],2:cw_attract[2],3:cw_attract[3],4:cw_attract[4],5:cw_attract[5],6:cw_attract[6]}

AttributeError: 'Series' object has no attribute 'Aspect'

In [925]:
# Run model 1 - Accommodation
output_dir = out('model_output/Accommodation')
callbacks = call(output_dir,5)
class_weights = class_weights_accom

history1 = run_mod(modela, X_train_accom_w, X_val_accom_w, y_train_accom_e, y_val_accom_e)

Epoch 1/50
178/178 - 0s - loss: 1.7560 - accuracy: 0.2584 - val_loss: 1.1431 - val_accuracy: 0.6036
Epoch 2/50
178/178 - 0s - loss: 0.7686 - accuracy: 0.7280 - val_loss: 0.4054 - val_accuracy: 0.9099
Epoch 3/50
178/178 - 0s - loss: 0.2972 - accuracy: 0.9240 - val_loss: 0.3041 - val_accuracy: 0.9234
Epoch 4/50
178/178 - 0s - loss: 0.1464 - accuracy: 0.9657 - val_loss: 0.2826 - val_accuracy: 0.9369
Epoch 5/50
178/178 - 0s - loss: 0.0758 - accuracy: 0.9859 - val_loss: 0.2994 - val_accuracy: 0.9324
Epoch 6/50
178/178 - 0s - loss: 0.0579 - accuracy: 0.9842 - val_loss: 0.2662 - val_accuracy: 0.9324
Epoch 7/50
178/178 - 0s - loss: 0.0438 - accuracy: 0.9910 - val_loss: 0.2938 - val_accuracy: 0.9369
Epoch 8/50
178/178 - 0s - loss: 0.0388 - accuracy: 0.9916 - val_loss: 0.2585 - val_accuracy: 0.9414
Epoch 9/50
178/178 - 0s - loss: 0.0224 - accuracy: 0.9944 - val_loss: 0.3115 - val_accuracy: 0.9369
Epoch 10/50
178/178 - 0s - loss: 0.0348 - accuracy: 0.9927 - val_loss: 0.4563 - val_accuracy: 0.9234

In [None]:
# Run Model 2 - Food
output_dir = out('model_output/Food')
callbacks = call(output_dir,5)
class_weights = class_weights_food

history1 = run_mod(modelb, X_train_food_w, X_val_food_w, y_train_food_e, y_val_food_e)

## Function 1 - preprocessing

In [None]:
sentences = nltk.sent_tokenize(review)
    text_words = []

In [689]:
# Function to clean sentences

def process(text):
    if text != []:
        text = text.replace('\n',' ')
        text = text.strip().lower()
        text = text.replace('xmas','christmas')
        text = text.replace('\£',"") 
        text = text.replace(r'\/'," ") 
        text = text.replace('\d+\-\d+',"") 
        text = text.replace('\d+\w{2}',"") 
        text = text.replace('\.{3,}',"") 
        text = text.replace(' i ',"")
        text = text.replace(' le ',"")
        text = contractions.fix(text)
        text = nltk.word_tokenize(text)
        punc = string.punctuation
        text = [word for word in text if word not in punc]
        text = [n for n in text if not n.isnumeric()]
        text = [e for e in text if e.encode("ascii","ignore")]
        stop = stopwords.words("english")
        stop_remove = ["not","don't","didn't","wasn't","won't","isn't"]
        stop1 = [w for w in stop if w not in stop_remove]
        add_stop = ['etc','read','read less','lot','butlins', 'bognor','regis','b',' i ','..','arundel castle','premier','inn','u',
                    'castle',"year","hilton","time","day","shoreline","oyster","bay","church farm","hotham","hotham park",
                    "hawk walk","hawk","arundel","littlehampton"]
        stop1.extend(add_stop)
        text = [w for w in text if w not in stop1]
        lemmatizer = WordNetLemmatizer()
        text = [lemmatizer.lemmatize(w) for w in text ]
        spell = SpellChecker()
        word_list = []
        for w in text:
            new = spell.correction(w)
            if new != w:
                word_list.append(new)
            else:
                word_list.append(w)
            text_joined = ' '.join(word_list) 
            
                       
    return text_joined

In [691]:
processed_text = process("This was a wonderful place to stay")
processed_text

'wonderful place stay'

## Function 2 - Aspect Extraction

In [707]:
def extract(text):
    text = word_tokenize(text)
    text_pos = nltk.tag.pos_tag(text)
    noun = [i[0] for i in text_pos if i[1].startswith('N')]
    return noun

In [708]:
extract(processed_text)

['place', 'stay']

## Function 3 - Noun Phrase Extraction

In [713]:
def phrase_extract(text):
    blob = TextBlob(text)
    noun_phrases = blob.noun_phrases
    noun_phrases = ' '.join(noun_phrases)
    return noun_phrases 

In [714]:
phrase_extract(processed_text)

'wonderful place stay'

## Function 4 - 

## Function 4 - Encoding text

In [None]:
def encode(text,category):
    if category == "accommodation":
        aspects_accom = tokenizer.fit_on_texts(X_train)
        aspects_s = tokenizer.texts_to_sequences(nouns)
        aspects_w = pad_sequences(np.array(aspects_s, dtype = "object"), maxlen = 15,padding = "post", truncating = "post", value = 0.0)
        model = load_model('accommodation.h5') 
                
    elif category == "food":
        with open('tok_food.pickle',rb) as handle:
            tokenizer = pickle.load(handle)
            aspects_s = tokenizer.texts_to_sequences(nouns)   
            aspects_w = pad_sequences(np.array(aspects_s, dtype = "object"), maxlen = 15, padding = "post", truncating = "post", value = 0.0)
    
    else:
        with open('tok_attract.pickle',rb) as handle:
            tokenizer = pickle.load(handle)
            aspects_s = tokenizer.texts_to_sequences(nouns)  
            aspects_w = pad_sequences(np.array(aspects_s, dtype = "object"), maxlen = 15, padding = "post", truncating = "post", value = 0.0)  
                

In [521]:
# Noun encoding
        
                        
        
            
        
                             
        # Aspect prediction
        
                
        # Sentiment prediction
        with open('tok_sent.pickle', 'rb') as handle:
                tokenizer = pickle.load(handle)
        sent_s = tokenizer.texts_to_sequences(phrases)
        sent_w = pad_sequences(np.array(sent_s,dtype = "object"), maxlen = 15, padding = "post",truncating = "post",value = 0.0)
        model_s = load_model('Sentiment.h5') 
        sent_pred = model_s.predict(sent_w)
        sent_class = (model_s.predict(X_val_w) > 0.5).astype("int32")
        
        return  sent_class

IndentationError: unexpected indent (<ipython-input-521-976a8a52f78f>, line 2)

In [550]:
# Function to pre-process text and run through aspect and sentiment model. Input is expected to be in a text format
# e.g. "I love this place"

def full_model(text):
                   
        # Preprocessing text
        text = text.replace('\n',' ')
        text = text.strip().lower()
        text = text.replace('xmas','christmas')
        text = text.replace('\£',"") 
        text = text.replace(r'\/'," ") 
        text = text.replace('\d+\-\d+',"") 
        text = text.replace('\d+\w{2}',"") 
        text = text.replace('\.{3,}',"") 
        text = text.replace(' i ',"")
        text = text.replace(' le ',"")
        text = contractions.fix(text)
        text = nltk.word_tokenize(text)
        punc = string.punctuation
        text = [word for word in text if word not in punc]
        text = [n for n in text if not n.isnumeric()]
        text = [e for e in text if e.encode("ascii","ignore")]
        stop = stopwords.words("english")
        stop_remove = ["not","don't","didn't","wasn't","won't","isn't"]
        stop1 = [w for w in stop if w not in stop_remove]
        add_stop = ['etc','read','read less','lot','butlins', 'bognor','regis','b',' i ','..','arundel castle','premier','inn','u',
                    'castle',"year","hilton","time","day","shoreline","oyster","bay","church farm","hotham","hotham park",
                    "hawk walk","hawk","arundel","littlehampton"]
        stop1.extend(add_stop)
        text = [w for w in text if w not in stop1]
        lemmatizer = WordNetLemmatizer()
        text = [lemmatizer.lemmatize(w) for w in text ]
        spell = SpellChecker()
        word_list = []
        for w in text:
            new = spell.correction(w)
            if new != w:
                word_list.append(new)
            else:
                word_list.append(w)
        text_joined = ' '.join(word_list)
        
        # Noun extraction
        text_pos = nltk.tag.pos_tag(word_list)
        nouns = [i[0] for i in text_pos if i[1].startswith('N')]
                
        # Noun phrase extraction
        blob = TextBlob(text_joined)
        noun_phrases = blob.noun_phrases
        phrase_list = []
        for item in noun_phrases:
            phrase_list.append(item)
                            
        return nouns, phrase_list

In [551]:
# Function to run reviews - enter review text and category of review

def review_analyser(review):
    
        