<h1>Aspect-based Sentiment Analysis using LSTM and Word Embeddings<h1>

In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import TweetTokenizer
import gensim
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense,Input, Dropout
from tensorflow.keras.layers import LSTM,Bidirectional
from tensorflow.keras.optimizers import Adamax

In [2]:
# Read data from given data (json) file into pandas dataframe
train_data = pd.read_json("data/sentihood-train.json")
test_data = pd.read_json("data/sentihood-test.json")
# Split rows into multiple rows where multiple opinions are provided in this dataset
train_data = train_data.explode('opinions',ignore_index=True)
test_data = test_data.explode('opinions',ignore_index=True)
# Drop rows with NaN values (where no opinion is provided)
train_data.dropna(axis=0,inplace=True)
test_data.dropna(axis=0,inplace=True)
train_data.reset_index(drop=True,inplace=True)
test_data.reset_index(drop=True,inplace=True)
# Convert reviews into lowercased strings
train_data['text'] = train_data['text'].str.lower()
test_data['text'] = test_data['text'].str.lower()
# Remove leading whitespaces from reviews
train_data['text'] = train_data['text'].str.lstrip()
test_data['text'] = test_data['text'].str.lstrip()
# Insert a whitespace before and another one after wherever location1 or location2 appears
train_data['text'] = train_data['text'].str.replace('.location1.',' location1 ',regex=True)
test_data['text'] = test_data['text'].str.replace('.location1.',' location1 ',regex=True)
train_data['text'] = train_data['text'].str.replace('.location2.',' location2 ',regex=True)
test_data['text'] = test_data['text'].str.replace('.location2.',' location2 ',regex=True)

In [3]:
# A look at training data
train_data.head(5)

Unnamed: 0,opinions,id,text
0,"{'sentiment': 'Negative', 'aspect': 'price', '...",1430,location1 is transforming and the prices will ...
1,"{'sentiment': 'Positive', 'aspect': 'shopping'...",2013,along location1 there are lots of electronics ...
2,"{'sentiment': 'Positive', 'aspect': 'transit-l...",1244,and location1 is ten mins direct on the tube t...
3,"{'sentiment': 'Positive', 'aspect': 'nightlife...",209,another option is location1 which is very cent...
4,"{'sentiment': 'Positive', 'aspect': 'transit-l...",209,another option is location1 which is very cent...


In [4]:
# Load GloVe word embeddings into a dictionary (can be downloaded from here: https://nlp.stanford.edu/projects/glove/)
gloveEmbeddings = {}
with open('glove.twitter.27B/glove.twitter.27B.100d.txt','r',encoding='utf8') as f:
    for line in f:
        l = line.split()
        gloveEmbeddings[str(l[0])] = np.array(l[1:],dtype=np.float32)

In [5]:
def text_to_tokens(text_column):
    """
    Function to convert reviews into lists of tokens

    args:
        text_column(dataframe column)
    
    returns:
        texts(a list of lists): each list contains tokens associated with a particular review
        max_len(int): length of list with maximum number of tokens
"""
    texts = []
    max_len = 0
    tknzr = TweetTokenizer()
    for text in text_column:
        output = tknzr.tokenize(text) #tokenize review
        i = 1
        n = len(output)
        #If tokenizer has tokenized 'location1' into 'location' and '1' (or 'location2' like this), concatenate them 
        while i < n:
            if (output[i] == '1' or output[i] == '2') and output[i-1] == 'location':
                output[i-1] = output[i-1]+output[i]
                output.remove(output[i])
                n = len(output)
            i += 1
        if len(output) > max_len:
            max_len = len(output)
        texts.append(output)
    return (texts,max_len)

In [6]:
train_texts,max_len = text_to_tokens(train_data['text'])
test_texts = text_to_tokens(test_data['text'])[0]

In [7]:
# Train custom Word2Vec word embeddings on training text in case some of the tokens are not there in downloaded GloVe vocabulary 
word_embeddings = gensim.models.Word2Vec(train_texts,min_count = 1,size=100,window = 3,iter=50)

In [8]:
def text_to_tensor(texts,gloveEmbeddings,wordEmbeddings,max_len):
    """
    Function to convert list of lists of tokens into an array(padded) of word embeddings
    
    args:
        texts(list of lists of tokens)
        gloveEmbeddings(GloVe word vectors)
        wordEmbeddings(Word2Vec word vectors)
        max_len(int): length of sentence with maximum number of tokens
        
    returns:
        array(float): Shape - number_of_reviews*max_len*length_of_word_embeddings(100)
    """
    array = np.zeros((len(texts),max_len,100),dtype=np.float32)
    for i in range(len(texts)):
        for j in range(len(texts[i])):
            if texts[i][j] in gloveEmbeddings:
                a = gloveEmbeddings[texts[i][j]].reshape(1,-1)
            elif texts[i][j] in wordEmbeddings:
                a = word_embeddings[texts[i][j]].reshape(1,-1)
            else:
                a = np.zeros((1,100))
            array[i][j] = a
    return array

In [9]:
train_input_data = text_to_tensor(train_texts,gloveEmbeddings,word_embeddings,max_len)
test_input_data = text_to_tensor(test_texts,gloveEmbeddings,word_embeddings,max_len)

  elif texts[i][j] in wordEmbeddings:
  a = word_embeddings[texts[i][j]].reshape(1,-1)


In [10]:
# Prepare targets for model training

# For sentiment training
train_sentiment = np.zeros(len(train_data))
test_sentiment = np.zeros((len(test_data)))
aspect_dict = {} # For aspect training purposes (key:aspect,value:index of aspect)
count = 0
for i in range(len(train_data)):
    train_sentiment[i] = 1 if train_data['opinions'][i]['sentiment'] == 'Positive' else 0
    if not train_data['opinions'][i]['aspect'] in aspect_dict:
        aspect_dict[train_data['opinions'][i]['aspect']] = count
        count += 1
for i in range(len(test_data)):
    test_sentiment[i] = 1 if test_data['opinions'][i]['sentiment'] == 'Positive' else 0
#For aspect training
train_aspect = np.zeros((len(train_data),len(aspect_dict)))
test_aspect = np.zeros((len(test_data),len(aspect_dict)))
for i in range(len(train_data)):
    train_aspect[i][aspect_dict[train_data['opinions'][i]['aspect']]] = 1
for i in range(len(test_data)):
    test_aspect[i][aspect_dict[test_data['opinions'][i]['aspect']]] = 1

In [11]:
# Prepare another input to our model i.e. target entity
# train_target_entity contains index of target entity in the list of tokens of that particular review
train_target_entity = np.empty((len(train_data),),dtype=int)
test_target_entity = np.empty((len(test_data),),dtype=int)
for i in range(len(train_data)):
    train_target_entity[i] = train_texts[i].index(train_data['opinions'][i]['target_entity'].lower())
for i in range(len(test_data)):
    test_target_entity[i] = test_texts[i].index(test_data['opinions'][i]['target_entity'].lower())

In [12]:
# Model definiton
input_1 = Input(shape=(max_len,100)) #train_input_data
input_2 = Input(shape=(1,),dtype=tf.int32) #train_target_entity
#Bidirectional LSTM layer applied to train_input_data
#It's output(out) is a sequence of hidden states corresponding to each timestep
out = Bidirectional(LSTM(128,return_sequences=True,return_state=True))(input_1)
#hidden_seq is a sequence of hidden states corresponding to each timestep
hidden_seq = out[0]
#use the hidden state of the timestep corresponding to the position of the target entity in the input sentence
hidden =  tf.gather(hidden_seq,input_2[0],axis=1)
hidden = tf.squeeze(hidden,axis=1)
#feed forward neural layer for further processing
dense = Dense(64,activation='relu')(hidden)
#Dropout for regularization purposes
dense = Dropout(0.4)(dense)
dense = Dense(32,activation='relu')(dense)
dense = Dropout(0.4)(dense)
#output_1 - Sentiment
output_1 = Dense(1,activation='sigmoid',name='output_1')(dense)
#output_2 - Aspect
output_2 = Dense(len(aspect_dict),activation='softmax',name='output_2')(dense)
model = Model(inputs=[input_1,input_2],outputs=[output_1,output_2],name="ABSA")

In [13]:
model.summary()

Model: "ABSA"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 122, 100)]   0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
bidirectional (Bidirectional)   [(None, 122, 256), ( 234496      input_1[0][0]                    
__________________________________________________________________________________________________
tf_op_layer_strided_slice (Tens [(1,)]               0           input_2[0][0]                    
_______________________________________________________________________________________________

In [14]:
#Optimizer : Adamax
opt = Adamax(learning_rate=0.01)
#BinaryCrossentropy for sentiment output and CategoricalCrossentropy for aspect output
model.compile(loss = {'output_1':'BinaryCrossentropy','output_2':'CategoricalCrossentropy'},optimizer=opt,metrics=['accuracy'])

In [15]:
#Train on training data
model.fit([train_input_data,train_target_entity],[train_sentiment,train_aspect],batch_size=64,epochs = 75)

Epoch 1/75
Epoch 2/75
Epoch 3/75
Epoch 4/75
Epoch 5/75
Epoch 6/75
Epoch 7/75
Epoch 8/75
Epoch 9/75
Epoch 10/75
Epoch 11/75
Epoch 12/75
Epoch 13/75
Epoch 14/75
Epoch 15/75
Epoch 16/75
Epoch 17/75
Epoch 18/75
Epoch 19/75
Epoch 20/75
Epoch 21/75
Epoch 22/75
Epoch 23/75
Epoch 24/75
Epoch 25/75
Epoch 26/75
Epoch 27/75
Epoch 28/75
Epoch 29/75
Epoch 30/75
Epoch 31/75
Epoch 32/75
Epoch 33/75
Epoch 34/75
Epoch 35/75
Epoch 36/75
Epoch 37/75
Epoch 38/75
Epoch 39/75
Epoch 40/75
Epoch 41/75
Epoch 42/75
Epoch 43/75
Epoch 44/75
Epoch 45/75
Epoch 46/75
Epoch 47/75
Epoch 48/75
Epoch 49/75
Epoch 50/75
Epoch 51/75
Epoch 52/75
Epoch 53/75
Epoch 54/75
Epoch 55/75
Epoch 56/75
Epoch 57/75
Epoch 58/75
Epoch 59/75
Epoch 60/75
Epoch 61/75
Epoch 62/75
Epoch 63/75
Epoch 64/75
Epoch 65/75
Epoch 66/75
Epoch 67/75
Epoch 68/75
Epoch 69/75
Epoch 70/75
Epoch 71/75
Epoch 72/75
Epoch 73/75
Epoch 74/75
Epoch 75/75


<tensorflow.python.keras.callbacks.History at 0x1efb6b84ac0>

In [16]:
#Evaluate on test data
total_loss,sentiment_loss,aspect_loss,sentiment_acc,aspect_acc = model.evaluate([test_input_data,test_target_entity],[test_sentiment,test_aspect])
print("Accuracy on Sentiment Prediction :",sentiment_acc)
print("Accuracy on Aspect Prediction:",aspect_acc)

Accuracy on Sentiment Prediction : 0.7978532910346985
Accuracy on Aspect Prediction: 0.6070363521575928


In [17]:
# Another dictionary. Reverse the mapping of aspect_dict(helpful for writing preds.jsonl file)
inv_aspect_dict = {value : key for key,value in aspect_dict.items()}

In [18]:
# Predictions on test set
predictions = model.predict([test_input_data,test_target_entity])
sentiment_pred = predictions[0]
aspect_pred = predictions[1]

In [19]:
# a list of dictionaries(keys:sentiment,aspect,target_entity) for storing model predictions
preds_list = []
for i in range(len(test_data)):
    d = {}
    if sentiment_pred[i] < 0.5:
        d['sentiment'] = 'Negative'
    else:
        d['sentiment'] = 'Positive'
    for j in range(len(aspect_pred[1])):
        max_prob = max(aspect_pred[i])
        if aspect_pred[i][j] == max_prob:
            d['aspect'] = inv_aspect_dict[j]
            break
    d['target_entity'] = test_data['opinions'][i]['target_entity']
    preds_list.append(d)

In [20]:
# Add model predictions to the test_data dataframe
test_data['model_pred'] = preds_list

In [21]:
# Bring test_data dataframe into required form
new_test_data = test_data.groupby(test_data['id'],as_index=False,sort = False).aggregate({'opinions':lambda x : x.to_list(),'text':'first','model_pred':lambda x:x.to_list()})
new_test_data = new_test_data[['opinions','id','text','model_pred']]

In [22]:
# Write predictions to preds.jsonl file
new_test_data.to_json("preds.jsonl",orient = "records",lines=True,indent = 4)

In [23]:
#Analyzing results
print("Times model got it right to the ground truth ratio for every aspect")
for asp in aspect_dict:
    asp_count = 0
    times_model_predicted_correct = 0
    for i in range(len(test_data)):
        if test_data['opinions'][i]['aspect'] == asp:
            asp_count += 1
            if test_data['model_pred'][i]['aspect'] == asp:
                times_model_predicted_correct += 1
    print(asp.capitalize()+":",times_model_predicted_correct/asp_count)
    

Times model got it right to the ground truth ratio for every aspect
Price: 0.8924302788844621
Shopping: 0.5641025641025641
Transit-location: 0.5248868778280543
Nightlife: 0.5454545454545454
General: 0.5870307167235495
Live: 0.6
Safety: 0.6835443037974683
Multicultural: 0.6274509803921569
Green-nature: 0.2765957446808511
Touristy: 0.23333333333333334
Quiet: 0.16666666666666666
Dining: 0.5135135135135135


**Point of Failure:**
Model peforms weakly when a single target entity is evaluated on more than one aspect in a single review that may have arisen due to simplicity of this word embedding and LSTM-based model.