##### Dataset: Tweets from February 2015 in which travelers expressed their feelings about the service of US airlines

In [1]:
#import packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline

#for NLP
import re    
import nltk    
from textblob import TextBlob    
from nltk.corpus import stopwords    
from nltk.stem import PorterStemmer    
from textblob import Word    
from nltk.util import ngrams    
from wordcloud import WordCloud, STOPWORDS    
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy

#for Anchor
from alibi.explainers import AnchorText

In [2]:
# Load the tweets; the cells below use the 'text' and 'airline_sentiment' columns.
df = pd.read_csv(filepath_or_buffer='Tweets.csv')

Text Preprocessing 

In [3]:
def clean_tweet_text(text):
    """Normalise raw tweet text before it is vectorised.

    Steps, in order:
      1. lowercase every token and collapse whitespace to single spaces
      2. drop literal "\\u002c"-style escape sequences and non-ASCII characters
      3. replace URLs with the placeholder token ``URL``
      4. remove @mentions
      5. replace every remaining non-word character with a space
      6. drop tokens made only of digits and lemmatise the rest

    Parameters
    ----------
    text : pandas.Series of str
        Raw tweet bodies.

    Returns
    -------
    pandas.Series of str
        Cleaned tweets, with the same index as ``text``.
    """
    # (pattern, replacement) pairs, applied in order. Raw strings keep the
    # backslashes intact for the regex engine.
    regex_steps = [
        (r'(\\u[0-9A-Fa-f]+)', ''),   # literal "\u002c"-style escapes
        (r'[^\x00-\x7f]', ''),        # non-ASCII characters
        (r'((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))', 'URL'),
        (r'@[^\s]+', ''),             # @mentions are removed, not tagged
        (r'\s+', ' '),                # collapse whitespace left by the removals
        (r'[^\w]', ' '),              # punctuation / symbols -> space
    ]

    cleaned = text.apply(lambda tweet: " ".join(word.lower() for word in tweet.split()))

    for pattern, replacement in regex_steps:
        # regex=True must be explicit: pandas >= 2.0 treats the pattern as a
        # literal string by default, which would turn every step into a no-op.
        cleaned = cleaned.str.replace(pattern, replacement, regex=True)

    # Hashtag markers, emoticons, repeated '!', '?' and '.', and quote
    # characters consist of punctuation only, so the non-word strip above has
    # already removed them. The separate passes that used to follow it could
    # never match (and the emoticon pass replaced matches with the letter "r"),
    # so they are not repeated here.

    # Drop purely numeric tokens, lemmatise what is left, and re-join with
    # single spaces (this also trims leading/trailing whitespace).
    return cleaned.apply(
        lambda tweet: " ".join(
            Word(token).lemmatize() for token in tweet.split() if not token.isdigit()
        )
    )


df['text'] = clean_tweet_text(df['text'])

In [4]:
# Binary target: 0 for 'negative' tweets, 1 for every other sentiment value.
y = (df['airline_sentiment'] != 'negative').astype('int64')

Vectorize

In [5]:
# Hold out 20% of the tweets for testing, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    y,
    test_size=0.2,
    stratify=y,
    random_state=5,
)

In [6]:
# Stand-alone TF-IDF model: learn the vocabulary and IDF weights on the
# training tweets, then encode those same tweets.
vectorizer = TfidfVectorizer()
vectorizer.fit(X_train)
X_train_vect = vectorizer.transform(X_train)

Model Fitting in a pipeline

In [7]:
# Single estimator that takes raw text: TF-IDF features feeding a Bernoulli
# naive Bayes classifier.
clf = Pipeline(
    steps=[
        ('tidf', TfidfVectorizer()),
        ('NB', BernoulliNB()),
    ]
)

In [8]:
# Fit both pipeline steps on the training split; the fitted pipeline is the cell output.
clf.fit(X_train, y=y_train)

Pipeline(memory=None,
         steps=[('tidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('NB',
                 BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None,
                             fit_prior=True))],
         verbose=False)

Define prediction function

In [9]:
def predict_fn(x):
    """Return the fitted pipeline's predicted class labels for the texts in ``x``."""
    return clf.predict(x)

In [10]:
# Predict on both splits with the fitted pipeline.
preds_train = predict_fn(X_train)
preds_test = predict_fn(X_test)

# Accuracy as a percentage for each split.
train_accuracy_pct = accuracy_score(y_train, preds_train)*100
test_accuracy_pct = accuracy_score(y_test, preds_test)*100

print('Train accuracy', train_accuracy_pct)
print('Test accuracy', test_accuracy_pct)

Train accuracy 87.713456284153
Test accuracy 83.40163934426229


#### Constructing Anchors

Load spaCy model

In [11]:
# spaCy language pipeline that is handed to the anchor explainer.
nlp = spacy.load(name='en_core_web_sm')

Initialize anchor text explainer

In [12]:
# Build the anchor explainer from the spaCy pipeline and the model's
# prediction function (raw texts in, class labels out).
explainer = AnchorText(nlp, predict_fn)

Explain a prediction

In [13]:
# Indexed by the model's output: 0 = negative, 1 = any other sentiment.
class_names = [0, 1]

In [14]:
# Position (not index label) of the held-out tweet to explain.
i = 191
text = X_test.iat[i]
print(text)

i m glad you re sorry that i m homeless for the night make me feel secure


In [15]:
# Predict once and reuse the result for both the predicted label and its
# binary complement.
pred_idx = predict_fn([text])[0]
pred = class_names[pred_idx]
alternative = class_names[1 - pred_idx]

# Score only the tweet being explained instead of the whole test set;
# `text` is the tweet at position `i` of X_test, so the value is the same.
proba_class_0 = clf.predict_proba([text])[0][0]

print('True label:', y_test.iloc[i])
print('Naive Bayes model prediction label:', pred)
print('Naive Bayes model prediction probability for 0:', proba_class_0)

True label: 0
Naive Bayes model prediction label: 0
Naive Bayes model prediction probability for 0: 0.5037463665917669


The predicted probability for class 0 is 0.504, which is very close to the decision boundary (0.5).

Let's examine which anchors explain the model's prediction of class 0 (negative sentiment).

##### Explanation 

In [29]:
%%time
# Compute an anchor explanation for the selected tweet.
# use_unk=True: perturbed samples are built by replacing words with UNK
# (visible in the covered examples printed further down).
# threshold=0.95: presumably the minimum precision the anchor must reach --
# TODO confirm against the alibi AnchorText documentation.
explanation = explainer.explain(text, threshold=0.95, use_unk=True)

Wall time: 1.41 s


use_unk=True means we will perturb examples by replacing words with UNKs. 

In [30]:
# Join the anchor's words into one readable rule and report its precision.
anchor_rule = ' AND '.join(explanation.anchor)
anchor_precision = explanation.precision

print('Anchor: %s' % anchor_rule)
print('Precision: %.2f' % anchor_precision)

Anchor: night AND homeless AND sorry AND that AND feel AND for AND me AND re AND make AND the AND secure
Precision: 1.00


In [31]:
# Show only the first three words of the anchor.
leading_anchor_words = explanation.anchor[:3]
print('Partial Anchor: %s' % ' AND '.join(leading_anchor_words))

Partial Anchor: night AND homeless AND sorry


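Let us now take a look at the partial anchor, which keeps only the first three conditions. The words 'night', 'homeless' and 'sorry' point strongly towards a negative prediction; note that the reported precision of 1.00 was measured for the full anchor, not for this three-word subset.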

In [32]:
# Perturbed samples that satisfy the anchor and keep the original prediction.
covered_true = explanation.raw['examples'][-1]['covered_true']

print('\nExamples where anchor applies and model predicts %s:' % pred)
print('\n'.join(covered_true))


Examples where anchor applies and model predicts 0:
i UNK glad you re sorry that UNK UNK homeless for the night make me feel secure
UNK UNK UNK you re sorry that i UNK homeless for the night make me feel secure
UNK UNK glad UNK re sorry that UNK UNK homeless for the night make me feel secure
UNK m UNK you re sorry that UNK m homeless for the night make me feel secure
UNK m glad you re sorry that i m homeless for the night make me feel secure
i m UNK you re sorry that UNK UNK homeless for the night make me feel secure
UNK UNK UNK UNK re sorry that i m homeless for the night make me feel secure
UNK m glad UNK re sorry that UNK UNK homeless for the night make me feel secure
UNK UNK glad you re sorry that UNK m homeless for the night make me feel secure
UNK m glad UNK re sorry that UNK UNK homeless for the night make me feel secure


In [33]:
# Perturbed samples that satisfy the anchor but flip the prediction.
covered_false = explanation.raw['examples'][-1]['covered_false']

print('\nExamples where anchor applies and model predicts %s:' % alternative)
print('\n'.join(covered_false))


Examples where anchor applies and model predicts 1:



There are no perturbed scenarios in which the anchor words apply and the model predicts positive sentiment (1).