# T-ABSA Random Forest Model using TF-IDF 

#### Preprocessing the reviews 

Importing the libraries for preprocessing the reviews

In [1]:
# read in some helpful libraries

import os
import nltk                       # the natural langauage toolkit, open-source NLP
import pandas as pd               # pandas dataframe

Loading the training dataset into python

In [6]:
data_dir = 'D:/Models/Random_Forest/tfidf/data/generated/'


df = pd.read_csv(os.path.join(data_dir, "train_NLI.tsv"),sep="\t")

df.tail(4)

Unnamed: 0,id,sentence1,sentence2,label
16131,1168227041763823616,mtnugwhy did you disconnect my line from network,mtn - Calls,
16132,1168227041763823616,mtnugwhy did you disconnect my line from network,mtn - Data,
16133,1168227041763823616,mtnugwhy did you disconnect my line from network,mtn - General,
16134,1168227041763823616,mtnugwhy did you disconnect my line from network,mtn - Network,


#### Tokenizing the reviews

Top 10 words in the reviews before removing any stop words

Removing the stop words

Top 100 words after removing stop words

Plotting a word cloud of the words in the reviews

### Building a Classifier

In [7]:
# shape of the data
df.shape

(16135, 4)

Encoding the labels of the dataset

In [10]:
y_train = df['label'].replace(['None','Positive','Negative'],[1,2,0])

y_train.head(4)

0    1
1    2
2    1
3    1
Name: label, dtype: int64

In [12]:
y_train.tail(4)

16131    1
16132    1
16133    1
16134    1
Name: label, dtype: int64

Importing the libraries for carrying out TFID on the reviews

In [13]:
x_cols = [x for x in df.columns if x != 'label']

# Split the data into two dataframes (one for the labels and the other for the independent variables)
X_data = df[x_cols]

In [14]:
X_data.tail()

Unnamed: 0,id,sentence1,sentence2
16130,1168185422222155778,shading mtn for having the worst customer serv...,mtn - Network
16131,1168227041763823616,mtnugwhy did you disconnect my line from network,mtn - Calls
16132,1168227041763823616,mtnugwhy did you disconnect my line from network,mtn - Data
16133,1168227041763823616,mtnugwhy did you disconnect my line from network,mtn - General
16134,1168227041763823616,mtnugwhy did you disconnect my line from network,mtn - Network


In [17]:
y_train[100]

1

### Preprocessing the data

In [18]:
# read in some helpful libraries

import re                         # regular expression
from nltk.corpus import stopwords  
from gensim import parsing        # Help in preprocessing the data, very efficiently
import gensim
import numpy as np

In [19]:
def transformText(text):
    
    stops = set(stopwords.words("english"))
    
    # Convert text to lower
    text = text.lower()
    # Removing non ASCII chars    
    text = re.sub(r'[^\x00-\x7f]',r' ',text)
    
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    
    # Removing all the stopwords
    filtered_words = [word for word in text.split() if word not in stops]
    
    # Removing all the tokens with lesser than 3 characters
    filtered_words = gensim.corpora.textcorpus.remove_short(filtered_words, minsize=3)
    
    # Preprocessed text after stop words removal
    text = " ".join(filtered_words)
    
    # Remove the punctuation
    text = gensim.parsing.preprocessing.strip_punctuation2(text)
    
    # Strip all the numerics
    text = gensim.parsing.preprocessing.strip_numeric(text)
    
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    
    # Stemming
    return text

In [20]:
X_data['concatinated'] = X_data['concatinated'].map(transformText)

### TD-IDF Embedding

In [22]:
## Get the word vocabulary out of the data

from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()


X_train_counts = count_vect.fit_transform(X_data['concatinated'])


## Count of 'mtn' in corpus
print ('mtn appears:', count_vect.vocabulary_.get(u'mtn') , 'in the corpus')

mtn appears: 3786 in the corpus


## 3. Training a classifier

In [23]:
## Get the TF-IDF vector representation of the data

from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()

X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print ('Dimension of TF-IDF vector :' , X_train_tfidf.shape)

Dimension of TF-IDF vector : (16135, 6762)


In [24]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest to the training data, using 100 trees
forest = RandomForestClassifier(n_estimators = 100)

print("Fitting a random forest to labeled training data...")
forest = forest.fit(X_train_tfidf, y_train)

print("Fitting Completed")

Fitting a random forest to labeled training data...


## 4. Prediction

### Test Data

In [25]:
data_dir_test = 'D:/Models/Random_Forest/tfidf/data/generated/'

df_test = pd.read_csv(os.path.join(data_dir_test, "test_NLI.tsv"),sep="\t")

df_test['concatinated'] = df_test['sentence1'] + ' ' + df_test['sentence2']


Preprocessing the test data

In [26]:
df_test['concatinated'] = df_test['concatinated'].map(transformText)

In [None]:
X_test = df_test['concatinated'].values

y_test = df_test['label'].replace(['None','Positive','Negative'],[1,2,0]).values

In [28]:
## Prediction part

X_new_counts = count_vect.transform(X_test)

X_new_tfidf = tfidf_transformer.transform(X_new_counts)

Y_predicted = forest.predict(X_new_tfidf)

Evaluating the performance of the model

In [29]:
from  sklearn.metrics  import accuracy_score
print(accuracy_score(y_test, Y_predicted))

0.8718455123985078


In [30]:
Y_forest_score = forest.predict_proba(X_new_tfidf)
Y_forest_score

array([[0.17, 0.82, 0.01],
       [0.32, 0.66, 0.02],
       [0.16, 0.8 , 0.04],
       ...,
       [0.11, 0.85, 0.04],
       [0.08, 0.86, 0.06],
       [0.38, 0.55, 0.07]])

In [31]:
import csv

# Open/Create a file to append data
csvFile_pred = open('prediction_score.csv', 'w')

#Use csv Writer
csvWriter_pred = csv.writer(csvFile_pred)

csvWriter_pred.writerow(['predicted','score_neg','score_none','score_pos'])

42

In [32]:
for f in range(len(Y_predicted)):
    csvWriter_pred.writerow([Y_predicted[f],Y_forest_score[f][0], Y_forest_score[f][1], Y_forest_score[f][2]])
csvFile_pred.close()

In [33]:
# Open/Create a file to append data
csvFile_true = open('y_test_true.csv', 'w')

#Use csv Writer
csvWriter_true = csv.writer(csvFile_true)

csvWriter_true.writerow(['y_test'])

for f in range(len(y_test)):
    csvWriter_true.writerow([y_test[f]])
csvFile_true.close()

Checking on the save result files

In [34]:
df = pd.read_csv('y_test_true.csv')
df.head()

Unnamed: 0,y_test
0,1
1,1
2,1
3,0
4,1


In [35]:
len(df)

4557

In [36]:
dataframe = pd.read_csv('prediction_score.csv')
dataframe.tail(5)

Unnamed: 0,predicted,score_neg,score_none,score_pos
4552,1,0.18,0.82,0.0
4553,1,0.24,0.76,0.0
4554,1,0.11,0.85,0.04
4555,1,0.08,0.86,0.06
4556,1,0.38,0.55,0.07


### Evaluation of the model

In [38]:
import collections

import numpy as np
import pandas as pd
from sklearn import metrics

def get_y_true():
#    """ 
#    Read file to obtain y_true.
#    All of five tasks of sentitel use the test set of task-BERT-pair-NLI-M to get true labels.
#    All of five tasks of SemEval-2014 use the test set of task-BERT-pair-NLI-M to get true labels.
#    """
   
        df = pd.read_csv('y_test_true.csv')
        y_true = []

        for f in range(len(df)):
            y_true.append(df.y_test[f])
        
        return y_true
       
#def get_y_pred(task_name, pred_data_dir):
        
def get_y_pred():
#    """ 
#    Read file to obtain y_pred and scores.
#    """
               
    dataframe = pd.read_csv('prediction_score.csv')
    
    pred=[]
    score=[]



    for f in range(len(dataframe)):
        pred.append(dataframe.predicted[f])
        score.append([float(dataframe.score_pos[f]),float(dataframe.score_none[f]),float(dataframe.score_neg[f])])
                 
    return pred, score





def sentitel_strict_acc(y_true, y_pred):
    """
    Calculate "strict Acc" of aspect detection task of sentitel.
    """
    total_cases=int(len(y_true)/4)
    true_cases=0
    for i in range(total_cases):
        if y_true[i*4]!=y_pred[i*4]:continue
        if y_true[i*4+1]!=y_pred[i*4+1]:continue
        if y_true[i*4+2]!=y_pred[i*4+2]:continue
        if y_true[i*4+3]!=y_pred[i*4+3]:continue
        true_cases+=1
    aspect_strict_Acc = true_cases/total_cases

    return aspect_strict_Acc


def sentitel_macro_F1(y_true, y_pred):
    """
    Calculate "Macro-F1" of aspect detection task of sentitel.
    """
    p_all=0
    r_all=0
    count=0
    for i in range(len(y_pred)//4):
        a=set()
        b=set()
        for j in range(4):
            if y_pred[i*4+j]!=0:
                a.add(j)
            if y_true[i*4+j]!=0:
                b.add(j)
        if len(b)==0:continue
        a_b=a.intersection(b)
        if len(a_b)>0:
            p=len(a_b)/len(a)
            r=len(a_b)/len(b)
        else:
            p=0
            r=0
        count+=1
        p_all+=p
        r_all+=r
    Ma_p=p_all/count
    Ma_r=r_all/count
    aspect_Macro_F1 = 2*Ma_p*Ma_r/(Ma_p+Ma_r)

    return aspect_Macro_F1


def sentitel_AUC_Acc(y_true, score):
    """
    Calculate "Macro-AUC" of both aspect detection and sentiment classification tasks of sentitel.
    Calculate "Acc" of sentiment classification task of sentitel.
    """
    # aspect-Macro-AUC
    aspect_y_true=[]
    aspect_y_score=[]
    aspect_y_trues=[[],[],[],[]]
    aspect_y_scores=[[],[],[],[]]
    for i in range(len(y_true)):
        if y_true[i]>0:
            aspect_y_true.append(0)
        else:
            aspect_y_true.append(1) # "None": 1
        tmp_score=score[i][0] # probability of "None"
        aspect_y_score.append(tmp_score)
        aspect_y_trues[i%4].append(aspect_y_true[-1])
        aspect_y_scores[i%4].append(aspect_y_score[-1])

    aspect_auc=[]
    for i in range(4):
        aspect_auc.append(metrics.roc_auc_score(aspect_y_trues[i], aspect_y_scores[i]))
    aspect_Macro_AUC = np.mean(aspect_auc)
    
    # sentiment-Macro-AUC
    sentiment_y_true=[]
    sentiment_y_pred=[]
    sentiment_y_score=[]
    sentiment_y_trues=[[],[],[],[]]
    sentiment_y_scores=[[],[],[],[]]
    for i in range(len(y_true)):
        if y_true[i]>0:
            sentiment_y_true.append(y_true[i]-1) # "Postive":0, "Negative":1
            tmp_score=score[i][2]/(score[i][1]+score[i][2])  # probability of "Negative"
            sentiment_y_score.append(tmp_score)
            if tmp_score>0.5:
                sentiment_y_pred.append(1) # "Negative": 1
            else:
                sentiment_y_pred.append(0)
            sentiment_y_trues[i%4].append(sentiment_y_true[-1])
            sentiment_y_scores[i%4].append(sentiment_y_score[-1])

    sentiment_auc=[]
    for i in range(4):
        sentiment_auc.append(metrics.roc_auc_score(sentiment_y_trues[i], sentiment_y_scores[i]))
    sentiment_Macro_AUC = np.mean(sentiment_auc)

    # sentiment Acc
    sentiment_y_true = np.array(sentiment_y_true)
    sentiment_y_pred = np.array(sentiment_y_pred)
    sentiment_Acc = metrics.accuracy_score(sentiment_y_true,sentiment_y_pred)

    return aspect_Macro_AUC, sentiment_Acc, sentiment_Macro_AUC

#####################################################################


y_true = (get_y_true())
y_pred, score = get_y_pred()

result = collections.OrderedDict()

aspect_strict_Acc = sentitel_strict_acc(y_true, y_pred)
aspect_Macro_F1 = sentitel_macro_F1(y_true, y_pred)
aspect_Macro_AUC, sentiment_Acc, sentiment_Macro_AUC = sentitel_AUC_Acc(y_true, score)
result = {'aspect_strict_Acc': aspect_strict_Acc,
                'aspect_Macro_F1': aspect_Macro_F1,
                'aspect_Macro_AUC': aspect_Macro_AUC,
                'sentiment_Acc': sentiment_Acc,
                'sentiment_Macro_AUC': sentiment_Macro_AUC}


print(result)

nameHandle = open('evaluation_results.txt', 'w')

nameHandle.write('aspect_strict_Acc:\t'+ str(aspect_strict_Acc))
nameHandle.write('\naspect_Macro_F1:\t' + str(aspect_Macro_F1))
nameHandle.write('\naspect_Macro_AUC:\t' + str(aspect_Macro_AUC))
nameHandle.write('\n\nsentiment_Acc:\t' + str(sentiment_Acc))
nameHandle.write('\nsentiment_Macro_AUC:\t' + str(sentiment_Macro_AUC))
nameHandle.close()

{'aspect_strict_Acc': 0.5399473222124671, 'aspect_Macro_F1': 0.9443483881214678, 'aspect_Macro_AUC': 0.6153760420954906, 'sentiment_Acc': 0.9580853816300129, 'sentiment_Macro_AUC': 0.7377472835679291}
