In [1]:
import re
import pandas as pd
from nltk.corpus import stopwords
import numpy as np
import sklearn
import nltk
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score ,confusion_matrix
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [2]:
news = pd.read_json('../data/interests.json', lines=True)
# remove_columns_list = ['authors', 'date', 'link', 'short_description', 'headline']
news['information'] = news[['headline', 'short_description']].apply(lambda x: ' '.join(x), axis=1)

In [3]:
news = news
news.shape

(124989, 7)

In [4]:
news.drop(news[(news['authors'] == '') & (news['short_description'] == '' )].index, inplace=True)

In [5]:
X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(news[['information', 'authors']], news['category'], test_size=0.33)

In [6]:
# Convert pandas series into numpy array
X_train = np.array(X_train);
X_test = np.array(X_test);
Y_train = np.array(Y_train);
Y_test = np.array(Y_test);
cleanHeadlines_train = [] #To append processed headlines
cleanHeadlines_test = [] #To append processed headlines
number_reviews_train = len(X_train) #Calculating the number of reviews
number_reviews_test = len(X_test) #Calculating the number of reviews

In [7]:
print(np.unique(Y_train))
print(np.unique(Y_train).shape)

[u'ARTS' u'ARTS & CULTURE' u'BLACK VOICES' u'BUSINESS' u'COLLEGE'
 u'COMEDY' u'CRIME' u'EDUCATION' u'ENTERTAINMENT' u'FIFTY' u'GOOD NEWS'
 u'GREEN' u'HEALTHY LIVING' u'IMPACT' u'LATINO VOICES' u'MEDIA' u'PARENTS'
 u'POLITICS' u'QUEER VOICES' u'RELIGION' u'SCIENCE' u'SPORTS' u'STYLE'
 u'TASTE' u'TECH' u'THE WORLDPOST' u'TRAVEL' u'WEIRD NEWS' u'WOMEN'
 u'WORLD NEWS' u'WORLDPOST']
(31,)


In [8]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
lemmetizer = WordNetLemmatizer()
stemmer = PorterStemmer()
def get_words(headlines_list):
    headlines = headlines_list[0]   
    author_names = [x for x in headlines_list[1].lower().replace('and',',').replace(' ', '').split(',') if x != '']
    headlines_only_letters = re.sub('[^a-zA-Z]', ' ', headlines)
    words = nltk.word_tokenize(headlines_only_letters.lower())
    stops = set(stopwords.words('english'))
    meaningful_words = [lemmetizer.lemmatize(w) for w in words if w not in stops]
    return ' '.join(meaningful_words + author_names)

In [11]:
for i in range(0,number_reviews_train):
    cleanHeadline = get_words(X_train[i]) #Processing the data and getting words with no special characters, numbers or html tags
    cleanHeadlines_train.append( cleanHeadline )

In [12]:
for i in range(0,number_reviews_test):
    cleanHeadline = get_words(X_test[i]) #Processing the data and getting words with no special characters, numbers or html tags
    cleanHeadlines_test.append( cleanHeadline )

In [13]:
vectorize = sklearn.feature_extraction.text.TfidfVectorizer(analyzer = "word", max_features=30000)
tfidwords_train = vectorize.fit_transform(cleanHeadlines_train)
X_train = tfidwords_train.toarray()
tfidwords_test = vectorize.transform(cleanHeadlines_test)
X_test = tfidwords_test.toarray()


In [14]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB(alpha=0.1)
model.fit(X_train,Y_train)
Y_predict = model.predict(X_test)
accuracy = accuracy_score(Y_test,Y_predict)*100
print(format(accuracy, '.2f'))

66.77


In [15]:
from lime import lime_text
from sklearn.pipeline import make_pipeline
c = make_pipeline(vectorize, model)

In [16]:
print(c.predict_proba([cleanHeadlines_test[0]]).round(3))

[[0.001 0.003 0.006 0.001 0.001 0.008 0.035 0.    0.033 0.    0.002 0.001
  0.    0.002 0.001 0.011 0.002 0.208 0.059 0.007 0.001 0.003 0.001 0.
  0.001 0.561 0.001 0.004 0.002 0.037 0.01 ]]


In [18]:
from lime.lime_text import LimeTextExplainer
explainer = LimeTextExplainer(class_names=np.unique(Y_train))

In [19]:
idx = 5
exp = explainer.explain_instance(cleanHeadlines_test[idx], c.predict_proba, labels=range(12))
print('Document id: %d' % idx)
print 'Predicted class =', model.predict(X_test[idx:idx+1]).reshape(1,-1)[0,0]
print('True class: %s' % Y_test[idx])

Document id: 5
Predicted class = WORLD NEWS
True class: WORLDPOST


In [20]:
for i in range(31):
    try:
        print ('Explanation for class %s' % Y_train[i])
        print ('\n'.join(map(str, exp.as_list(label=i))))
        print ()
    except:
        print("Error for {}".format(i))
# print ('Explanation for class %s' % Y_train[28])
# print ('\n'.join(map(str, exp.as_list(label=28))))

Explanation for class COMEDY
(u'korea', -0.004978271484924064)
(u'north', -0.0029344702673439545)
(u'photo', 0.0019411912382061235)
(u'elinegordts', -0.0017156245593885218)
(u'epic', 0.0008708329301280892)
(u'road', 0.00026994453103323425)
(u'trip', 0.0001661502221713568)
()
Explanation for class HEALTHY LIVING
(u'korea', -0.001808779903109167)
(u'epic', -0.00094640231898136)
(u'trip', -0.0008101255505702401)
(u'photo', 0.0006767684434909784)
(u'elinegordts', -0.0005587426127365393)
(u'north', -0.0005291523539648586)
(u'road', 8.960136930445545e-05)
()
Explanation for class POLITICS
(u'korea', -0.009455791414556144)
(u'elinegordts', -0.004623117079857041)
(u'epic', 0.00338054391446005)
(u'north', -0.0015316925671250588)
(u'trip', -0.0012865281095497)
(u'photo', 0.0012694029335408071)
(u'road', -0.0011319009380489815)
()
Explanation for class ENTERTAINMENT
(u'epic', -0.004482552022822484)
(u'korea', -0.0034098532937142114)
(u'photo', -0.0015056718848523226)
(u'north', -0.001432297155627

In [21]:
top_labels=2
idx = 5
exp = explainer.explain_instance(cleanHeadlines_test[idx], c.predict_proba, labels=[2,3,4])
print('Document id: %d' % idx)
print 'Predicted class =', model.predict(X_test[idx:idx+1]).reshape(1,-1)[0,0]
print('True class: %s' % Y_test[idx])

Document id: 5
Predicted class = WORLD NEWS
True class: WORLDPOST


In [22]:
for i in range(12):
    try:
        print ('Explanation for class %s' % np.unique(Y_train)[i])
        print ('\n'.join(map(str, exp.as_list(label=i))))
        print ()
    except:
        print("Error for {}".format(i))

Explanation for class ARTS
Error for 0
Explanation for class ARTS & CULTURE
Error for 1
Explanation for class BLACK VOICES
(u'korea', -0.009484669830319273)
(u'elinegordts', -0.004587156507241513)
(u'epic', 0.003397189058609107)
(u'north', -0.0015573514761584124)
(u'trip', -0.0013769590973388656)
(u'photo', 0.0011975854364029274)
(u'road', -0.0010375610344270187)
()
Explanation for class BUSINESS
(u'epic', -0.004412287523810453)
(u'korea', -0.003361781870488386)
(u'photo', -0.0015111316427056877)
(u'north', -0.001396710813581932)
(u'road', 0.0008023998630574458)
(u'elinegordts', -0.0004698121228776304)
(u'trip', 0.0002747928882003466)
()
Explanation for class COLLEGE
(u'korea', -0.0009781660305471641)
(u'trip', -0.00043768097513803474)
(u'photo', -0.0002936087534524023)
(u'north', -0.00018954936504184035)
(u'elinegordts', -0.0001613340537480655)
(u'epic', 7.677713541064159e-05)
(u'road', -6.442482737635111e-05)
()
Explanation for class COMEDY
Error for 5
Explanation for class CRIME
Err

In [23]:
import pickle

In [24]:
filename = 'interests_classifier.sav'
pickle.dump(model, open(filename, 'wb'))

In [25]:
vect_filename = 'vectorize_interests_classifier.sav'
pickle.dump(vectorize, open(vect_filename, 'wb'))

In [190]:
loaded_model = pickle.load(open(filename, 'rb'))
Y_predict = loaded_model.predict(X_test)
accuracy = accuracy_score(Y_test,Y_predict)*100
print(format(accuracy, '.2f'))

48.48


In [193]:
input_test = np.asarray([u"Cricket is a beatiful sport. Baseball, hockey. Dhoni is the best!!!",
       u'Jenna Amatulli'])

In [194]:
cleanHeadlines_test1 = []
cleanHeadline = get_words(input_test) #Processing the data and getting words with no special characters, numbers or html tags
cleanHeadlines_test1.append( cleanHeadline )

In [195]:
tfidwords_input_test = vectorize.transform(cleanHeadlines_test1)
tfidwords_input_test

<1x858 sparse matrix of type '<type 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [196]:
loaded_model.predict(tfidwords_input_test)

array([u'POLITICS'], dtype='<U13')

In [197]:
exp = explainer.explain_instance(cleanHeadlines_test1[0], c.predict_proba, labels=range(12))

In [200]:
print 'Predicted class =', loaded_model.predict(tfidwords_input_test).reshape(1,-1)[0,0]
for i in range(12):
    try:
        print("i: {}".format(np.unique(Y_train)[i]))
        print ('\n'.join(map(str, exp.as_list(label=i))))
#         if Y_test[i] == "SPORTS":
#             print ('Explanation for class %s' % loaded_model.predict(tfidwords_input_test)[0])
            
#         print ()
    except:
        print("Error for {}".format(i))

Predicted class = POLITICS
i: BLACK VOICES
(u'jennaamatulli', 0.032745966099185266)
(u'cricket', 2.9523109376155798e-06)
(u'hockey', 2.8311636531658235e-06)
(u'baseball', 1.8942035940736854e-06)
(u'sport', 1.881192608186352e-06)
(u'best', 1.3817592609477815e-06)
(u'dhoni', 8.68532935870144e-07)
(u'beatiful', -4.4182982851245567e-07)
i: BUSINESS
(u'jennaamatulli', -0.004872904322553878)
(u'cricket', -4.393313266690357e-07)
(u'hockey', -4.2130348396447854e-07)
(u'baseball', -2.8187511259792557e-07)
(u'sport', -2.799389568842294e-07)
(u'best', -2.0561862963529966e-07)
(u'dhoni', -1.2924577899653078e-07)
(u'beatiful', 6.574838789817101e-08)
i: COMEDY
(u'jennaamatulli', -0.004848473702200255)
(u'cricket', -4.3712871070583687e-07)
(u'hockey', -4.191912517543843e-07)
(u'baseball', -2.804619135273674e-07)
(u'sport', -2.7853546485512615e-07)
(u'best', -2.0458774736383447e-07)
(u'dhoni', -1.285977969409541e-07)
(u'beatiful', 6.541875411168215e-08)
i: CRIME
(u'jennaamatulli', -0.01045205857925458