# T-ABSA Logistic Regression Model using TF-IDF 

#### Preprocessing the reviews 

Importing the libraries for preprocessing the reviews

In [1]:
# read in some helpful libraries

import os
import nltk                       # the natural langauage toolkit, open-source NLP
import pandas as pd               # pandas dataframe

Loading the training dataset into python

In [2]:
data_dir = 'D:/Generate_Data/data/5_aspects/'


df = pd.read_csv(os.path.join(data_dir, "train_NLI.tsv"),sep="\t")

df.tail(4)

Unnamed: 0,id,sentence1,sentence2,label
20059,1168227041763823616,mtnugwhy did you disconnect my line from network,mtn - CustomerService,Negative
20060,1168227041763823616,mtnugwhy did you disconnect my line from network,mtn - Data,
20061,1168227041763823616,mtnugwhy did you disconnect my line from network,mtn - General,
20062,1168227041763823616,mtnugwhy did you disconnect my line from network,mtn - Network,


### Building a Classifier

In [3]:
# shape of the data
df.shape

(20063, 4)

Encoding the labels of the dataset

In [4]:
y_train = df['label'].replace(['None','Positive','Negative'],[1,2,0])

y_train.head(4)

0    1
1    1
2    2
3    1
Name: label, dtype: int64

In [5]:
y_train.tail(4)

20059    0
20060    1
20061    1
20062    1
Name: label, dtype: int64

Importing the libraries for carrying out TFID on the reviews

In [6]:
x_cols = [x for x in df.columns if x != 'label']

# Split the data into two dataframes (one for the labels and the other for the independent variables)
X_data = df[x_cols]

In [7]:
X_data.tail()

Unnamed: 0,id,sentence1,sentence2
20058,1168227041763823616,mtnugwhy did you disconnect my line from network,mtn - Calls
20059,1168227041763823616,mtnugwhy did you disconnect my line from network,mtn - CustomerService
20060,1168227041763823616,mtnugwhy did you disconnect my line from network,mtn - Data
20061,1168227041763823616,mtnugwhy did you disconnect my line from network,mtn - General
20062,1168227041763823616,mtnugwhy did you disconnect my line from network,mtn - Network


In [None]:
X_data['concatinated'] = X_data['sentence1'] + ' ' + X_data['sentence2']

In [None]:
X_train = X_data['concatinated'].values

In [10]:
y_train[100]

1

### Preprocessing the data

In [11]:
import re                         # regular expression
from nltk.corpus import stopwords  
from gensim import parsing        # Help in preprocessing the data, very efficiently
import gensim
import numpy as np

In [12]:
def transformText(text):
    
    stops = set(stopwords.words("english"))
    
    # Convert text to lower
    text = text.lower()
    # Removing non ASCII chars    
    text = re.sub(r'[^\x00-\x7f]',r' ',text)
    
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    
    # Removing all the stopwords
    filtered_words = [word for word in text.split() if word not in stops]
    
    # Removing all the tokens with lesser than 3 characters
    filtered_words = gensim.corpora.textcorpus.remove_short(filtered_words, minsize=3)
    
    # Preprocessed text after stop words removal
    text = " ".join(filtered_words)
    
    # Remove the punctuation
    text = gensim.parsing.preprocessing.strip_punctuation2(text)
    
    # Strip all the numerics
    text = gensim.parsing.preprocessing.strip_numeric(text)
    
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    
    # Stemming
    return text

In [13]:
X_data['concatinated'] = X_data['concatinated'].map(transformText)

### TD-IDF Embedding

In [15]:
## Get the word vocabulary out of the data

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()


X_train_counts = vectorizer.fit(X_data['concatinated'])

num_features = len(vectorizer.get_feature_names())

print("Number of Featurs", num_features)

Number of Featurs 6768


## 3. Training a classifier

In [16]:
X_train_tfidf = vectorizer.transform(X_data['concatinated'])

print(X_train_tfidf.shape)

(20063, 6768)


In [17]:



from sklearn.linear_model import LogisticRegression

print("Fitting a weighted logistic regression to the labeled training data...")


model = LogisticRegression(class_weight='balanced')
model = model.fit(X_train_tfidf, y_train)

print("Fitting Completed")

Fitting a weighted logistic regression to the labeled training data...




Fitting Completed


## 4. Prediction

### Test Data

In [18]:
data_dir_test = 'D:/jupyter/Year2_Research/Generate_Data/data/5_aspects/'

df_test = pd.read_csv(os.path.join(data_dir_test, "test_NLI.tsv"),sep="\t")

df_test['concatinated'] = df_test['sentence1'] + ' ' + df_test['sentence2']


Preprocessing the test data

In [19]:
df_test['concatinated'] = df_test['concatinated'].map(transformText)

In [None]:
X_test = df_test['concatinated'].values
y_test = df_test['label'].replace(['None','Positive','Negative'],[1,2,0]).values


In [21]:
X_test_tfidf  = vectorizer.transform(X_test)
print ('Dimension of TF-IDF vector :' , X_test_tfidf.shape)

Dimension of TF-IDF vector : (5669, 6768)


In [23]:
## Prediction

Y_predicted = model.predict(X_test_tfidf)

Evaluating the performance of the model

In [24]:
from  sklearn.metrics  import accuracy_score
print(accuracy_score(y_test, Y_predicted))

0.7733286293878991


In [25]:
Y_forest_score = model.predict_proba(X_test_tfidf)
Y_forest_score

array([[0.25258025, 0.73169552, 0.01572423],
       [0.37322663, 0.60073771, 0.02603566],
       [0.53269951, 0.40292008, 0.06438041],
       ...,
       [0.45693178, 0.4740096 , 0.06905862],
       [0.21029751, 0.67228958, 0.11741292],
       [0.31837616, 0.64991318, 0.03171066]])

In [26]:
import csv

# Open/Create a file to append data
csvFile_pred = open('prediction_score.csv', 'w')

#Use csv Writer
csvWriter_pred = csv.writer(csvFile_pred)

csvWriter_pred.writerow(['predicted','score_none','score_pos','score_neg'])

42

In [27]:
for f in range(len(Y_predicted)):
    csvWriter_pred.writerow([Y_predicted[f],Y_forest_score[f][1], Y_forest_score[f][0], Y_forest_score[f][2]])
csvFile_pred.close()

Checking on the save result files

In [28]:
len(df)

20063

In [29]:
dataframe = pd.read_csv('prediction_score.csv')
dataframe.tail(5)

Unnamed: 0,predicted,score_none,score_pos,score_neg
5664,1,0.831817,0.1562,0.011982
5665,1,0.704573,0.272903,0.022524
5666,1,0.47401,0.456932,0.069059
5667,1,0.67229,0.210298,0.117413
5668,1,0.649913,0.318376,0.031711


### Evaluation of the model

In [1]:
import collections

import numpy as np
import pandas as pd
from sklearn import metrics



def get_y_true():
#    """ 
#    Read file to obtain y_true.
#        
#    """
    true_data_file = "D:/Generate_Data/data/5_aspects/test_NLI.tsv"

    df = pd.read_csv(true_data_file,sep='\t')
    y_true = []
    for i in range(len(df)):
        label = df['label'][i]
        assert label in ['None', 'Positive', 'Negative'], "error!"
        if label == 'None':
            n = 1
        elif label == 'Positive':
            n = 2
        else:
            n = 0
        y_true.append(n)

    print(len(y_true))   
    return y_true

        
def get_y_pred():
#    """ 
#    Read file to obtain y_pred and scores.
#    """
               
    dataframe = pd.read_csv('prediction_score.csv')
    
    pred=[]
    score=[]



    for f in range(len(dataframe)):
        pred.append(dataframe.predicted[f])
        score.append([float(dataframe.score_pos[f]),float(dataframe.score_none[f]),float(dataframe.score_neg[f])])
                 
    return pred, score





def _strict_acc(y_true, y_pred):
    """
    Calculate "strict Acc" of aspect detection task of dataset.
    """
    total_cases=int(len(y_true)/5)
    true_cases=0
    for i in range(total_cases):
        if y_true[i*5]!=y_pred[i*5]:continue
        if y_true[i*5+1]!=y_pred[i*5+1]:continue
        if y_true[i*5+2]!=y_pred[i*5+2]:continue
        if y_true[i*5+3]!=y_pred[i*5+3]:continue
        if y_true[i*5+4]!=y_pred[i*5+4]:continue
        true_cases+=1
    aspect_strict_Acc = true_cases/total_cases

    return aspect_strict_Acc


def _macro_F1(y_true, y_pred):
    """
    Calculate "Macro-F1" of aspect detection task of .
    """
    p_all=0
    r_all=0
    count=0
    for i in range(len(y_pred)//5):
        a=set()
        b=set()
        for j in range(5):
            if y_pred[i*5+j]!=1:
                a.add(j)
            if y_true[i*5+j]!=1:
                b.add(j)
        if len(b)==0:continue
        a_b=a.intersection(b)
        if len(a_b)>0:
            p=len(a_b)/len(a)
            r=len(a_b)/len(b)
        else:
            p=0
            r=0
        count+=1
        p_all+=p
        r_all+=r
    Ma_p=p_all/count
    Ma_r=r_all/count
    aspect_Macro_F1 = 2*Ma_p*Ma_r/(Ma_p+Ma_r)

    return aspect_Macro_F1


def _AUC_Acc(y_true, score):
    """
    Calculate "Macro-AUC" of both aspect detection and sentiment classification tasks of dataset.
    Calculate "Acc" of sentiment classification task of dataset.
    """
    # aspect-Macro-AUC
    aspect_y_true=[]
    aspect_y_score=[]
    aspect_y_trues=[[],[],[],[],[]]
    aspect_y_scores=[[],[],[],[],[]]
    for i in range(len(y_true)):
        if y_true[i]>0:
            aspect_y_true.append(0)
        else:
            aspect_y_true.append(1) # "None": 1
        tmp_score=score[i][0] # probability of "None"
        aspect_y_score.append(tmp_score)
        aspect_y_trues[i%5].append(aspect_y_true[-1])
        aspect_y_scores[i%5].append(aspect_y_score[-1])

    aspect_auc=[]
    for i in range(5):
        aspect_auc.append(metrics.roc_auc_score(aspect_y_trues[i], aspect_y_scores[i]))
        
    print("AUC per aspect:\t Calls, CustomerService, Data, General, Network")
    print(aspect_auc)
    aspect_Macro_AUC = np.mean(aspect_auc)
    
    # sentiment-Macro-AUC
    sentiment_y_true=[]
    sentiment_y_pred=[]
    sentiment_y_score=[]
    sentiment_y_trues=[[],[],[],[],[]]
    sentiment_y_scores=[[],[],[],[],[]]
    for i in range(len(y_true)):
        if y_true[i]>0:
            sentiment_y_true.append(y_true[i]-1) # "Postive":0, "Negative":1
            tmp_score=score[i][2]/(score[i][1]+score[i][2])  # probability of "Negative"
            sentiment_y_score.append(tmp_score)
            if tmp_score>0.5:
                sentiment_y_pred.append(1) # "Negative": 1
            else:
                sentiment_y_pred.append(0)
            sentiment_y_trues[i%5].append(sentiment_y_true[-1])
            sentiment_y_scores[i%5].append(sentiment_y_score[-1])

    sentiment_auc=[]
    for i in range(5):
        sentiment_auc.append(metrics.roc_auc_score(sentiment_y_trues[i], sentiment_y_scores[i]))
    sentiment_Macro_AUC = np.mean(sentiment_auc)

    # sentiment Acc
    sentiment_y_true = np.array(sentiment_y_true)
    sentiment_y_pred = np.array(sentiment_y_pred)
    sentiment_Acc = metrics.accuracy_score(sentiment_y_true,sentiment_y_pred)

    return aspect_Macro_AUC, sentiment_Acc, sentiment_Macro_AUC

#####################################################################


y_true = (get_y_true())
y_pred, score = get_y_pred()

result = collections.OrderedDict()

aspect_strict_Acc = _strict_acc(y_true, y_pred)
aspect_Macro_F1 = _macro_F1(y_true, y_pred)
aspect_Macro_AUC, sentiment_Acc, sentiment_Macro_AUC = _AUC_Acc(y_true, score)
result = {'aspect_strict_Acc': aspect_strict_Acc,
                'aspect_Macro_F1': aspect_Macro_F1,
                'aspect_Macro_AUC': aspect_Macro_AUC,
                'sentiment_Acc': sentiment_Acc,
                'sentiment_Macro_AUC': sentiment_Macro_AUC}


print(result)

nameHandle = open('LR_tfidf_evaluation_results.txt', 'w')

nameHandle.write('aspect_strict_Acc:\t'+ str(aspect_strict_Acc))
nameHandle.write('\naspect_Macro_F1:\t' + str(aspect_Macro_F1))
nameHandle.write('\naspect_Macro_AUC:\t' + str(aspect_Macro_AUC))
nameHandle.write('\n\nsentiment_Acc:\t' + str(sentiment_Acc))
nameHandle.write('\nsentiment_Macro_AUC:\t' + str(sentiment_Macro_AUC))
nameHandle.close()

5669
AUC per aspect:	 Calls, CustomerService, Data, General, Network
[0.7427876861223441, 0.6680050524651937, 0.6631338983356692, 0.785221272315816, 0.6671451086344703]
{'aspect_strict_Acc': 0.27184466019417475, 'aspect_Macro_F1': 0.3417650825781958, 'aspect_Macro_AUC': 0.7052586035746986, 'sentiment_Acc': 0.9428811423771525, 'sentiment_Macro_AUC': 0.8952173199906298}
