# T-ABSA Random Forest Model using word2vec

#### Preprocessing the reviews 

Importing the libraries for preprocessing the reviews

In [1]:
import os
import pandas as pd
import nltk
from gensim.models import Word2Vec, word2vec
import matplotlib.pyplot as plt
import numpy as np
from nltk.corpus import stopwords
import os
import re

Loading the training dataset into python

In [2]:
data_dir = 'D:/Generate_Data/data/5_aspects/'
df_train = pd.read_csv(os.path.join(data_dir, "train_NLI.tsv"),sep="\t")
df_dev = pd.read_csv(os.path.join(data_dir, "train_NLI.tsv"),sep="\t")
df_test = pd.read_csv(os.path.join(data_dir, "test_NLI.tsv"),sep="\t")
df_train.tail(2)

Unnamed: 0,id,sentence1,sentence2,label
20061,1168227041763823616,mtnugwhy did you disconnect my line from network,mtn - General,
20062,1168227041763823616,mtnugwhy did you disconnect my line from network,mtn - Network,


In [3]:
frames = [df_train, df_dev, df_test]
combined_dataframe = pd.concat(frames)
combined_dataframe.iloc[3000]

id                                 1095557109549359104
sentence1    how much data is in airtelug s gb sleepy 
sentence2                     airtel - CustomerService
label                                             None
Name: 3000, dtype: object

In [4]:
combined_dataframe.tail()

Unnamed: 0,id,sentence1,sentence2,label
5664,1168162668429139968,mtn whenever i get to bbunga the network start...,mtn - Calls,
5665,1168162668429139968,mtn whenever i get to bbunga the network start...,mtn - CustomerService,
5666,1168162668429139968,mtn whenever i get to bbunga the network start...,mtn - Data,
5667,1168162668429139968,mtn whenever i get to bbunga the network start...,mtn - General,
5668,1168162668429139968,mtn whenever i get to bbunga the network start...,mtn - Network,Negative


In [None]:
combined_dataframe['concatinated'] = combined_dataframe['sentence1'] + ' ' + combined_dataframe['sentence2']

In [None]:
word2vec_training_dataset = combined_dataframe['concatinated'].values

### Preprocessing the data

Convert each review in the training set to a list of sentences where each sentence is in turn a list of words.
Besides splitting reviews into sentences, non-letters and stop words are removed and all words
coverted to lower case.

In [7]:
def review_to_wordlist(review, remove_stopwords=True):
    """
    Convert a review to a list of words.
    """
    # remove non-letters
    review_text = re.sub("[^a-zA-Z]"," ", review)
    
    # convert to lower case and split at whitespace
    words = review_text.lower().split()
    
    # remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]

    return words

In [8]:
# Load the punkt tokenizer used for splitting reviews into sentences
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [9]:
def review_to_sentences(review, tokenizer, remove_stopwords=True):
    """
    Split review into list of sentences where each sentence is a list of words.
    """
    # use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(review.strip())

    # each sentence is furthermore split into words
    sentences = []    
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            sentences.append(review_to_wordlist(raw_sentence, remove_stopwords))
            
    return sentences

In [10]:
train_sentences = []  # Initialize an empty list of sentences
for review in word2vec_training_dataset:
    train_sentences += review_to_sentences(review, tokenizer)

### Training a word2vec model

In [12]:
model_name = 'train_model'
# Set values for various word2vec parameters
num_features = 300    # Word vector dimensionality                      
min_word_count = 40   # Minimum word count                        
num_workers = 3       # Number of threads to run in parallel
context = 10          # Context window size
downsampling = 1e-3   # Downsample setting for frequent words
if not os.path.exists(model_name): 
    # Initialize and train the model (this will take some time)
    model = word2vec.Word2Vec(train_sentences, workers=num_workers, \
                size=num_features, min_count = min_word_count, \
                window = context, sample = downsampling)

    # If you don't plan to train the model any further, calling 
    # init_sims will make the model much more memory-efficient.
    model.init_sims(replace=True)

    # It can be helpful to create a meaningful model name and 
    # save the model for later use. You can load it later using Word2Vec.load()
    model.save(model_name)
else:
    model = Word2Vec.load(model_name)

In [15]:
model.most_similar("internet")

  """Entry point for launching an IPython kernel.


[('kla', 0.6493709087371826),
 ('extremely', 0.6402525305747986),
 ('cry', 0.6238170862197876),
 ('ug', 0.6131152510643005),
 ('brought', 0.610197901725769),
 ('z', 0.6041684746742249),
 ('edge', 0.5958809852600098),
 ('upload', 0.5940711498260498),
 ('hardly', 0.5938520431518555),
 ('super', 0.5913786888122559)]

### Building a Classifier

In [16]:
# shape of the data

df_train.shape

(20063, 4)

Encoding the labels of the dataset

In [17]:
y_train = df_train['label'].replace(['None','Positive','Negative'],[1,2,0])

In [18]:
x_cols = [x for x in df_train.columns if x != 'label']

# Split the data into two dataframes (one for the labels and the other for the independent variables)
X_data = df_train[x_cols]

In [19]:
X_data.tail()

Unnamed: 0,id,sentence1,sentence2
20058,1168227041763823616,mtnugwhy did you disconnect my line from network,mtn - Calls
20059,1168227041763823616,mtnugwhy did you disconnect my line from network,mtn - CustomerService
20060,1168227041763823616,mtnugwhy did you disconnect my line from network,mtn - Data
20061,1168227041763823616,mtnugwhy did you disconnect my line from network,mtn - General
20062,1168227041763823616,mtnugwhy did you disconnect my line from network,mtn - Network


In [20]:
X_data['concatinated'] = X_data['sentence1'] + ' ' + X_data['sentence2']

X_data['concatinated'][9]

'africellug your internet is so frustrating pout  africell - Network'

In [21]:
X_train = X_data['concatinated'].values

X_train[100]

'its a crazy weekend with africellug gb for ugx day gb for ugx days dial to activate you go kill me woawith these pretty internet bundles  africell - Network'

In [22]:
y_train[100]

1

## 3. Build classifier using word embedding



Each review is mapped to a feature vector by averaging the word embeddings of all words in the review. These features are then fed into a random forest classifier.

In [23]:
def make_feature_vec(words, model, num_features):
    """
    Average the word vectors for a set of words
    """
    feature_vec = np.zeros((num_features,),dtype="float32")  # pre-initialize (for speed)
    nwords = 0
    #index2word_set = set(model.index2word)  # words known to the model
    index2word_set = set(model.wv.index2word)  # words known to the model
    
    

    for word in words:
        if word in index2word_set: 
            nwords = nwords + 1
            feature_vec = np.add(feature_vec,model[word])
    
    feature_vec = np.divide(feature_vec, nwords)
    return feature_vec


def get_avg_feature_vecs(reviews, model, num_features):
    """
    Calculate average feature vectors for all reviews
    """
    counter = 0
    review_feature_vecs = np.zeros((len(reviews),num_features), dtype='float32')  # pre-initialize (for speed)
    
    for review in reviews:
        review_feature_vecs[counter] = make_feature_vec(review, model, num_features)
        counter = counter + 1
    return review_feature_vecs

In [24]:
# calculate average feature vectors for training and test sets
clean_train_reviews = []

for review in X_train:
    clean_train_reviews.append(review_to_wordlist(review, remove_stopwords=True))
    
    
trainDataVecs = get_avg_feature_vecs(clean_train_reviews, model, num_features)

  from ipykernel import kernelapp as app


#### Fit a random forest classifier to the training data

In [25]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest to the training data, using 100 trees
forest = RandomForestClassifier(n_estimators = 100)

print("Fitting a random forest to labeled training data...")
forest = forest.fit(trainDataVecs, y_train)

print('Completed!')

Fitting a random forest to labeled training data...
Completed!


## 4. Prediction

### Test set data preparation

In [26]:
# Split the data into two dataframes (one for the labels and the other for the independent variables)
x_cols = [x for x in df_test.columns if x != 'label']
X_data_test = df_test[x_cols]

# Combining the review with the generated auxilliary sentence
X_data_test['concatinated'] = X_data_test['sentence1'] + ' ' + X_data_test['sentence2']


# X test data
X_test = X_data_test['concatinated'].values

print(X_test[100:108])


# y test data
y_test = df_test['label'].replace(['None','Positive','Negative'],[1,2,0])

y_test[100:108]

['gb for compared to gb for k good bye mtn hello africellug  africell - Calls'
 'gb for compared to gb for k good bye mtn hello africellug  africell - CustomerService'
 'gb for compared to gb for k good bye mtn hello africellug  africell - Data'
 'gb for compared to gb for k good bye mtn hello africellug  africell - General'
 'gb for compared to gb for k good bye mtn hello africellug  africell - Network'
 'africellug some of us dont just get excited most especially when it comes to ott it used to be ugx gb but now mb for ugx mbu plus free ott not among the excited a sure nothing is free of charge in ug apart from death  africell - Calls'
 'africellug some of us dont just get excited most especially when it comes to ott it used to be ugx gb but now mb for ugx mbu plus free ott not among the excited a sure nothing is free of charge in ug apart from death  africell - CustomerService'
 'africellug some of us dont just get excited most especially when it comes to ott it used to be ugx gb bu

100    1
101    1
102    2
103    1
104    1
105    1
106    1
107    0
Name: label, dtype: int64

In [27]:
clean_test_reviews = []
for review in X_test:
    clean_test_reviews.append(review_to_wordlist(review, remove_stopwords=True))
testDataVecs = get_avg_feature_vecs(clean_test_reviews, model, num_features)

  from ipykernel import kernelapp as app


In [28]:
# remove instances in test set that could not be represented as feature vectors
nan_indices = list({x for x,y in np.argwhere(np.isnan(testDataVecs))})
if len(nan_indices) > 0:
    print('Removing {:d} instances from test set.'.format(len(nan_indices)))
    testDataVecs = np.delete(testDataVecs, nan_indices, axis=0)
    test_reviews.drop(test_reviews.iloc[nan_indices, :].index, axis=0, inplace=True)
    assert testDataVecs.shape[0] == len(test_reviews)

In [29]:
print("Predicting labels for test data..")
Y_predicted = forest.predict(testDataVecs)

Predicting labels for test data..


Evaluating the performance of the model

In [30]:
from  sklearn.metrics  import accuracy_score
print(accuracy_score(y_test, Y_predicted))

0.8186629035103192


In [31]:
Y_forest_score = forest.predict_proba(testDataVecs)
Y_forest_score

array([[0.18, 0.8 , 0.02],
       [0.15, 0.83, 0.02],
       [0.13, 0.82, 0.05],
       ...,
       [0.09, 0.9 , 0.01],
       [0.07, 0.89, 0.04],
       [0.1 , 0.88, 0.02]])

In [32]:
import csv

# Open/Create a file to append data
csvFile_pred = open('prediction_score.csv', 'w')

#Use csv Writer
csvWriter_pred = csv.writer(csvFile_pred)

csvWriter_pred.writerow(['predicted','score_none','score_pos','score_neg'])

42

In [33]:
for f in range(len(Y_predicted)):
    csvWriter_pred.writerow([Y_predicted[f],Y_forest_score[f][1], Y_forest_score[f][0], Y_forest_score[f][2]])
csvFile_pred.close()

In [34]:
dataframe = pd.read_csv('prediction_score.csv')
dataframe.tail(10)

Unnamed: 0,predicted,score_none,score_pos,score_neg
5659,1,0.65,0.26,0.09
5660,1,0.61,0.36,0.03
5661,0,0.43,0.51,0.06
5662,1,0.555,0.365,0.08
5663,1,0.61,0.33,0.06
5664,1,0.91,0.07,0.02
5665,1,0.86,0.1,0.04
5666,1,0.9,0.09,0.01
5667,1,0.89,0.07,0.04
5668,1,0.88,0.1,0.02


### Evaluating the model

In [1]:
# -*- coding: utf-8 -*-
"""
Created on Sun Oct 20 11:40:28 2019

@author: David
"""

import collections

import numpy as np
import pandas as pd
from sklearn import metrics



def get_y_true():
#    """ 
#    Read file to obtain y_true.
#        
#    """
    true_data_file = "D:/jupyter/Year2_Research/Generate_Data/data/5_aspects/test_NLI.tsv"

    df = pd.read_csv(true_data_file,sep='\t')
    y_true = []
    for i in range(len(df)):
        label = df['label'][i]
        assert label in ['None', 'Positive', 'Negative'], "error!"
        if label == 'None':
            n = 1
        elif label == 'Positive':
            n = 2
        else:
            n = 0
        y_true.append(n)

    print(len(y_true))   
    return y_true

        
def get_y_pred():
#    """ 
#    Read file to obtain y_pred and scores.
#    """
               
    dataframe = pd.read_csv('prediction_score.csv')
    
    pred=[]
    score=[]



    for f in range(len(dataframe)):
        pred.append(dataframe.predicted[f])
        score.append([float(dataframe.score_pos[f]),float(dataframe.score_none[f]),float(dataframe.score_neg[f])])
                 
    return pred, score





def _strict_acc(y_true, y_pred):
    """
    Calculate "strict Acc" of aspect detection task.
    """
    total_cases=int(len(y_true)/5)
    true_cases=0
    for i in range(total_cases):
        if y_true[i*5]!=y_pred[i*5]:continue
        if y_true[i*5+1]!=y_pred[i*5+1]:continue
        if y_true[i*5+2]!=y_pred[i*5+2]:continue
        if y_true[i*5+3]!=y_pred[i*5+3]:continue
        if y_true[i*5+4]!=y_pred[i*5+4]:continue
        true_cases+=1
    aspect_strict_Acc = true_cases/total_cases

    return aspect_strict_Acc


def _macro_F1(y_true, y_pred):
    """
    Calculate "Macro-F1" of aspect detection task.
    """
    p_all=0
    r_all=0
    count=0
    for i in range(len(y_pred)//5):
        a=set()
        b=set()
        for j in range(5):
            if y_pred[i*5+j]!=1:
                a.add(j)
            if y_true[i*5+j]!=1:
                b.add(j)
        if len(b)==0:continue
        a_b=a.intersection(b)
        if len(a_b)>0:
            p=len(a_b)/len(a)
            r=len(a_b)/len(b)
        else:
            p=0
            r=0
        count+=1
        p_all+=p
        r_all+=r
    Ma_p=p_all/count
    Ma_r=r_all/count
    aspect_Macro_F1 = 2*Ma_p*Ma_r/(Ma_p+Ma_r)

    return aspect_Macro_F1


def _AUC_Acc(y_true, score):
    """
    Calculate "Macro-AUC" of both aspect detection and sentiment classification tasks.
    Calculate "Acc" of sentiment classification task.
    """
    # aspect-Macro-AUC
    aspect_y_true=[]
    aspect_y_score=[]
    aspect_y_trues=[[],[],[],[],[]]
    aspect_y_scores=[[],[],[],[],[]]
    for i in range(len(y_true)):
        if y_true[i]>0:
            aspect_y_true.append(0)
        else:
            aspect_y_true.append(1) # "None": 1
        tmp_score=score[i][0] # probability of "None"
        aspect_y_score.append(tmp_score)
        aspect_y_trues[i%5].append(aspect_y_true[-1])
        aspect_y_scores[i%5].append(aspect_y_score[-1])

    aspect_auc=[]
    for i in range(5):
        aspect_auc.append(metrics.roc_auc_score(aspect_y_trues[i], aspect_y_scores[i]))
        
    print("AUC per aspect:\t Calls, CustomerService, Data, General, Network")
    print(aspect_auc)
    
    aspect_Macro_AUC = np.mean(aspect_auc)
    
    # sentiment-Macro-AUC
    sentiment_y_true=[]
    sentiment_y_pred=[]
    sentiment_y_score=[]
    sentiment_y_trues=[[],[],[],[],[]]
    sentiment_y_scores=[[],[],[],[],[]]
    for i in range(len(y_true)):
        if y_true[i]>0:
            sentiment_y_true.append(y_true[i]-1) # "Postive":0, "Negative":1
            tmp_score=score[i][2]/(score[i][1]+score[i][2])  # probability of "Negative"
            sentiment_y_score.append(tmp_score)
            if tmp_score>0.5:
                sentiment_y_pred.append(1) # "Negative": 1
            else:
                sentiment_y_pred.append(0)
            sentiment_y_trues[i%5].append(sentiment_y_true[-1])
            sentiment_y_scores[i%5].append(sentiment_y_score[-1])

    sentiment_auc=[]
    for i in range(5):
        sentiment_auc.append(metrics.roc_auc_score(sentiment_y_trues[i], sentiment_y_scores[i]))
    sentiment_Macro_AUC = np.mean(sentiment_auc)

    # sentiment Acc
    sentiment_y_true = np.array(sentiment_y_true)
    sentiment_y_pred = np.array(sentiment_y_pred)
    sentiment_Acc = metrics.accuracy_score(sentiment_y_true,sentiment_y_pred)

    return aspect_Macro_AUC, sentiment_Acc, sentiment_Macro_AUC

#####################################################################


y_true = (get_y_true())
y_pred, score = get_y_pred()

result = collections.OrderedDict()

aspect_strict_Acc = _strict_acc(y_true, y_pred)
aspect_Macro_F1 = _macro_F1(y_true, y_pred)
aspect_Macro_AUC, sentiment_Acc, sentiment_Macro_AUC = _AUC_Acc(y_true, score)
result = {'aspect_strict_Acc': aspect_strict_Acc,
                'aspect_Macro_F1': aspect_Macro_F1,
                'aspect_Macro_AUC': aspect_Macro_AUC,
                'sentiment_Acc': sentiment_Acc,
                'sentiment_Macro_AUC': sentiment_Macro_AUC}


print(result)

nameHandle = open('RF_word2vec_evaluation_results.txt', 'w')

nameHandle.write('aspect_strict_Acc:\t'+ str(aspect_strict_Acc))
nameHandle.write('\naspect_Macro_F1:\t' + str(aspect_Macro_F1))
nameHandle.write('\naspect_Macro_AUC:\t' + str(aspect_Macro_AUC))
nameHandle.write('\n\nsentiment_Acc:\t' + str(sentiment_Acc))
nameHandle.write('\nsentiment_Macro_AUC:\t' + str(sentiment_Macro_AUC))
nameHandle.close()

5669
AUC per aspect:	 Calls, CustomerService, Data, General, Network
[0.7006989083939131, 0.6694798642326392, 0.6745171515701973, 0.7540231330148354, 0.6692446245637735]
{'aspect_strict_Acc': 0.23654015887025595, 'aspect_Macro_F1': 0.08959557259358601, 'aspect_Macro_AUC': 0.6935927363550716, 'sentiment_Acc': 0.9651406971860563, 'sentiment_Macro_AUC': 0.8196489813725861}
