In [59]:
import pickle
import numpy as np
import re
import pandas as pd
from nltk.corpus import stopwords
import numpy as np
import sklearn
import nltk
from sklearn.naive_bayes import MultinomialNB
import warnings
warnings.filterwarnings('ignore')
from lime import lime_text
from sklearn.pipeline import make_pipeline
from lime.lime_text import LimeTextExplainer
from nltk.stem import PorterStemmer, WordNetLemmatizer
lemmetizer = WordNetLemmatizer()
stemmer = PorterStemmer()

In [60]:
class_names = np.asarray([u'ARTS', u'ARTS & CULTURE', u'BLACK VOICES', u'BUSINESS', u'COLLEGE',
 u'COMEDY', u'CRIME', u'EDUCATION', u'ENTERTAINMENT', u'FIFTY', u'GOOD NEWS',
 u'GREEN', u'HEALTHY LIVING', u'IMPACT', u'LATINO VOICES', u'MEDIA', u'PARENTS',
 u'POLITICS', u'QUEER VOICES', u'RELIGION', u'SCIENCE', u'SPORTS', u'STYLE',
 u'TASTE', u'TECH', u'THE WORLDPOST', u'TRAVEL', u'WEIRD NEWS', u'WOMEN', u'WORLD NEWS', u'WORLDPOST'])
class_names.shape

(31,)

In [61]:
model_filename = 'interests_classifier.sav'
vect_filename = 'vectorize_interests_classifier.sav'

def load_models(model_filename, vect_filename):
    loaded_model = pickle.load(open(model_filename, 'rb'))
    vectorize = pickle.load(open(vect_filename, 'rb'))
    return loaded_model, vectorize

loaded_model, vectorize = load_models(model_filename, vect_filename)

In [62]:
def get_words(headlines_list):
    headlines = headlines_list[0]   
    author_names = [x for x in headlines_list[1].lower().replace('and',',').replace(' ', '').split(',') if x != '']
    headlines_only_letters = re.sub('[^a-zA-Z]', ' ', headlines)
    words = nltk.word_tokenize(headlines_only_letters.lower())
    stops = set(stopwords.words('english'))
    meaningful_words = [lemmetizer.lemmatize(w) for w in words if w not in stops]
    return ' '.join(meaningful_words + author_names)

In [63]:
input_val = np.asarray([u"Cricket is a beatiful sport. Baseball, hockey. Dhoni is the best!!!",
       u'Jenna Amatulli'])


def clean_input_and_vectorize(input_val):
    cleanHeadlines_list = []
    cleanHeadline = get_words(input_val) #Processing the data and getting words with no special characters, numbers or html tags
    cleanHeadlines_list.append( cleanHeadline )
    tfidwords_input = vectorize.transform(cleanHeadlines_list)
    return tfidwords_input, cleanHeadlines_list

tfidwords_input, cleanHeadlines_list = clean_input_and_vectorize(input_val)

In [67]:
def predict(tfidwords_input):
    return loaded_model.predict(tfidwords_input)[0]

pred_category = predict(tfidwords_input)

In [75]:
def get_explainer(class_names, loaded_model, vectorize, cleanHeadlines_list, tfidwords_input):
    explainer = LimeTextExplainer(class_names=class_names)
    c = make_pipeline(vectorize, loaded_model)
    exp = explainer.explain_instance(cleanHeadlines_list[0], c.predict_proba, labels=range(class_names.shape[0]))
    return exp

def explain_all(exp, class_names):
    all_explainers = []
    print 'Predicted class =', loaded_model.predict(tfidwords_input_test).reshape(1,-1)[0,0]
    for idx in range(class_names.shape[0]):
        try:
            all_explainers.append(exp.as_list(label=i))
        except:
            print("Error for {}".format(i))
    return all_explainers

exp = get_explainer(class_names, loaded_model, vectorize, cleanHeadlines_list, tfidwords_input)
explain_all(exp, class_names)

Predicted class = SPORTS
i: ARTS
(u'jennaamatulli', -0.002960216025427853)
(u'baseball', -0.002946473042780976)
(u'cricket', 0.0016619187030303666)
(u'hockey', -0.0015701647137572189)
(u'sport', -0.0006129401690195315)
(u'best', 0.0004111648761919069)
(u'dhoni', -0.00010530199940080556)
(u'beatiful', -9.634058492145138e-05)
i: ARTS & CULTURE
(u'hockey', -0.005901158599960292)
(u'jennaamatulli', 0.005014042811636092)
(u'sport', -0.0044192021593126)
(u'cricket', 0.0035951170705828795)
(u'baseball', -0.0030178199691306804)
(u'best', -0.001099694527775152)
(u'dhoni', 0.00014325166962876428)
(u'beatiful', 4.6563861125730545e-05)
i: BLACK VOICES
(u'cricket', 0.07047526696300664)
(u'baseball', -0.02724487611939362)
(u'jennaamatulli', 0.02585496663568451)
(u'sport', -0.020125219763880965)
(u'best', -0.013759976897809703)
(u'hockey', -0.01198554023932841)
(u'dhoni', 0.0017133510585329067)
(u'beatiful', 0.0013172472826421633)
i: BUSINESS
(u'baseball', -0.009003953108180807)
(u'hockey', -0.003564

In [69]:
def get_class_name_index(class_names, class_name):
    return list(class_names).index(class_name)

idx = get_class_name_index(class_names, pred_category)

def explain_class(exp, ):
try:
    print("index: {}".format(class_names[idx]))
    print ('\n'.join(map(str, exp.as_list(label=idx))))
except:
    print("Error for {}".format(i))

index: SPORTS
(u'hockey', 0.2987257394812215)
(u'baseball', 0.26124278280594004)
(u'cricket', -0.23269812078681518)
(u'sport', 0.20535291567632133)
(u'jennaamatulli', -0.0870993438654839)
(u'best', -0.016727071182257768)
(u'beatiful', -0.002441689412641712)
(u'dhoni', -0.002165718043374536)


[(u'jennaamatulli', -0.002960216025427853),
 (u'baseball', -0.002946473042780976),
 (u'cricket', 0.0016619187030303666),
 (u'hockey', -0.0015701647137572189),
 (u'sport', -0.0006129401690195315),
 (u'best', 0.0004111648761919069),
 (u'dhoni', -0.00010530199940080556),
 (u'beatiful', -9.634058492145138e-05)]