In [176]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import make_pipeline
from sklearn import metrics
import eli5
from nltk import sent_tokenize
import nltk
from IPython.display import display
from IPython.core.display import HTML

In [195]:

categories = ['alt.atheism', 'soc.religion.christian',
              'comp.graphics', 'sci.med']
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
    remove=['headers', 'footers','quotes'],
)
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
    remove=['headers', 'footers','quotes'],
)



In [196]:
def return_first_x_sentences(text,x):
    sent_tokenize_list = sent_tokenize(text)
    first_x=sent_tokenize_list[:x]
    return '\n'.join(first_x)
SENT_NUM=2
twenty_train.data=map(lambda text: return_first_x_sentences(text,SENT_NUM),twenty_train.data)
twenty_test.data=map(lambda text: return_first_x_sentences(text,SENT_NUM),twenty_test.data)

In [197]:
def print_report(pipe):
    y_test = twenty_test.target
    y_pred = pipe.predict(twenty_test.data)
    report = metrics.classification_report(y_test, y_pred,
        target_names=twenty_test.target_names)
    print(report)
    print("accuracy: {:0.3f}".format(metrics.accuracy_score(y_test, y_pred)))
def get_mistake(pipe):
    y_test = twenty_test.target
    y_pred = pipe.predict(twenty_test.data)
    mistakes=[]
    for i in range(len(y_test)):
        if y_test[i]!=y_pred[i]:
            mistakes.append((twenty_test.data[i],twenty_test.target_names[y_pred[i]],twenty_test.target_names[y_test[i]]))
    return mistakes
            
    

In [198]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

vec = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
clf =LogisticRegressionCV()
pipe = make_pipeline(vec, clf)
pipe.fit(twenty_train.data, twenty_train.target)

print_report(pipe)

                        precision    recall  f1-score   support

           alt.atheism       0.66      0.48      0.56       319
         comp.graphics       0.83      0.84      0.84       389
               sci.med       0.72      0.77      0.74       396
soc.religion.christian       0.66      0.75      0.70       398

           avg / total       0.72      0.72      0.72      1502

accuracy: 0.721


In [199]:
mistakes_l=get_mistake(pipe)
for i in range(40):
    display(eli5.show_prediction(clf, mistakes_l[i][0], vec=vec,
                     target_names=twenty_test.target_names,targets=[mistakes_l[i][1],mistakes_l[i][2]]))
                     
    print
    print

Contribution?,Feature
1.832,Highlighted in text (sum)
-1.45,<BIAS>

Contribution?,Feature
-0.951,<BIAS>
-7.005,Highlighted in text (sum)






Contribution?,Feature
-0.579,Highlighted in text (sum)
-1.45,<BIAS>

Contribution?,Feature
-0.96,<BIAS>
-4.326,Highlighted in text (sum)






Contribution?,Feature
1.14,Highlighted in text (sum)
-0.951,<BIAS>

Contribution?,Feature
-1.14,<BIAS>
-3.886,Highlighted in text (sum)






Contribution?,Feature
-0.043,Highlighted in text (sum)
-1.45,<BIAS>

Contribution?,Feature
-0.772,Highlighted in text (sum)
-1.14,<BIAS>






Contribution?,Feature
-0.038,Highlighted in text (sum)
-0.96,<BIAS>

Contribution?,Feature
-0.365,Highlighted in text (sum)
-1.14,<BIAS>






Contribution?,Feature
-1.033,Highlighted in text (sum)
-1.45,<BIAS>

Contribution?,Feature
-0.951,<BIAS>
-2.37,Highlighted in text (sum)






Contribution?,Feature
1.315,Highlighted in text (sum)
-1.14,<BIAS>

Contribution?,Feature
-0.96,<BIAS>
-4.957,Highlighted in text (sum)






Contribution?,Feature
3.739,Highlighted in text (sum)
-1.45,<BIAS>

Contribution?,Feature
3.042,Highlighted in text (sum)
-1.14,<BIAS>






Contribution?,Feature
0.227,Highlighted in text (sum)
-1.14,<BIAS>

Contribution?,Feature
-0.96,<BIAS>
-5.285,Highlighted in text (sum)






Contribution?,Feature
-1.14,<BIAS>
-3.383,Highlighted in text (sum)

Contribution?,Feature
-0.96,<BIAS>
-7.133,Highlighted in text (sum)






Contribution?,Feature
10.757,Highlighted in text (sum)
-1.45,<BIAS>

Contribution?,Feature
-1.14,<BIAS>
-6.116,Highlighted in text (sum)






Contribution?,Feature
-0.776,Highlighted in text (sum)
-1.45,<BIAS>

Contribution?,Feature
-0.951,<BIAS>
-2.395,Highlighted in text (sum)






Contribution?,Feature
-0.62,Highlighted in text (sum)
-1.45,<BIAS>

Contribution?,Feature
-0.951,<BIAS>
-2.974,Highlighted in text (sum)






Contribution?,Feature
6.548,Highlighted in text (sum)
-0.951,<BIAS>

Contribution?,Feature
-0.96,<BIAS>
-8.476,Highlighted in text (sum)






Contribution?,Feature
-1.337,Highlighted in text (sum)
-1.45,<BIAS>

Contribution?,Feature
-0.951,<BIAS>
-3.981,Highlighted in text (sum)






Contribution?,Feature
1.221,Highlighted in text (sum)
-1.14,<BIAS>

Contribution?,Feature
-1.45,<BIAS>
-4.744,Highlighted in text (sum)






Contribution?,Feature
5.226,Highlighted in text (sum)
-1.14,<BIAS>

Contribution?,Feature
-1.45,<BIAS>
-2.269,Highlighted in text (sum)






Contribution?,Feature
2.29,Highlighted in text (sum)
-1.45,<BIAS>

Contribution?,Feature
-1.14,<BIAS>
-4.909,Highlighted in text (sum)






Contribution?,Feature
1.561,Highlighted in text (sum)
-1.14,<BIAS>

Contribution?,Feature
-0.417,Highlighted in text (sum)
-0.951,<BIAS>






Contribution?,Feature
-0.96,<BIAS>
-1.328,Highlighted in text (sum)

Contribution?,Feature
-1.14,<BIAS>
-2.059,Highlighted in text (sum)






Contribution?,Feature
4.365,Highlighted in text (sum)
-1.45,<BIAS>

Contribution?,Feature
-1.14,<BIAS>
-6.211,Highlighted in text (sum)






Contribution?,Feature
6.75,Highlighted in text (sum)
-1.45,<BIAS>

Contribution?,Feature
-1.14,<BIAS>
-4.355,Highlighted in text (sum)






Contribution?,Feature
-0.951,<BIAS>
-1.826,Highlighted in text (sum)

Contribution?,Feature
-1.14,<BIAS>
-1.76,Highlighted in text (sum)






Contribution?,Feature
-0.736,Highlighted in text (sum)
-0.96,<BIAS>

Contribution?,Feature
-1.14,<BIAS>
-4.637,Highlighted in text (sum)






Contribution?,Feature
0.114,Highlighted in text (sum)
-0.951,<BIAS>

Contribution?,Feature
-1.45,<BIAS>
-2.924,Highlighted in text (sum)






Contribution?,Feature
-1.122,Highlighted in text (sum)
-1.14,<BIAS>

Contribution?,Feature
-0.96,<BIAS>
-10.042,Highlighted in text (sum)






Contribution?,Feature
2.647,Highlighted in text (sum)
-1.45,<BIAS>

Contribution?,Feature
-0.96,<BIAS>
-8.28,Highlighted in text (sum)






Contribution?,Feature
-0.96,<BIAS>
-1.057,Highlighted in text (sum)

Contribution?,Feature
-1.14,<BIAS>
-2.552,Highlighted in text (sum)






Contribution?,Feature
2.605,Highlighted in text (sum)
-0.951,<BIAS>

Contribution?,Feature
0.825,Highlighted in text (sum)
-0.96,<BIAS>






Contribution?,Feature
-0.057,Highlighted in text (sum)
-1.45,<BIAS>

Contribution?,Feature
-1.14,<BIAS>
-3.763,Highlighted in text (sum)






Contribution?,Feature
-0.96,<BIAS>
-1.977,Highlighted in text (sum)

Contribution?,Feature
-1.14,<BIAS>
-4.121,Highlighted in text (sum)






Contribution?,Feature
-1.142,Highlighted in text (sum)
-1.45,<BIAS>

Contribution?,Feature
-1.14,<BIAS>
-5.922,Highlighted in text (sum)






Contribution?,Feature
5.822,Highlighted in text (sum)
-1.45,<BIAS>

Contribution?,Feature
-0.951,<BIAS>
-5.412,Highlighted in text (sum)






Contribution?,Feature
-0.382,Highlighted in text (sum)
-0.951,<BIAS>

Contribution?,Feature
-0.96,<BIAS>
-2.285,Highlighted in text (sum)






Contribution?,Feature
0.136,Highlighted in text (sum)
-0.951,<BIAS>

Contribution?,Feature
-1.14,<BIAS>
-3.377,Highlighted in text (sum)






Contribution?,Feature
1.103,Highlighted in text (sum)
-1.45,<BIAS>

Contribution?,Feature
-1.14,<BIAS>
-5.698,Highlighted in text (sum)






Contribution?,Feature
-0.951,<BIAS>

Contribution?,Feature
-0.96,<BIAS>






Contribution?,Feature
7.474,Highlighted in text (sum)
-1.45,<BIAS>

Contribution?,Feature
-1.14,<BIAS>
-8.374,Highlighted in text (sum)






Contribution?,Feature
-0.96,<BIAS>
-0.991,Highlighted in text (sum)

Contribution?,Feature
-1.14,<BIAS>
-6.879,Highlighted in text (sum)






Contribution?,Feature
9.694,Highlighted in text (sum)
-1.45,<BIAS>

Contribution?,Feature
-0.599,Highlighted in text (sum)
-1.14,<BIAS>






In [200]:
eli5.show_prediction(clf, twenty_test.data[0], vec=vec,
                     target_names=twenty_test.target_names,
                     targets=['sci.med'])

Contribution?,Feature
8.218,Highlighted in text (sum)
-0.951,<BIAS>


In [201]:
eli5.show_weights(clf, vec=vec, top=20,
                  target_names=twenty_test.target_names)

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3
+22.081,atheism,,
+14.722,bobby,,
+14.370,islam,,
+12.898,atheists,,
+12.647,religion,,
+12.033,motto,,
+11.385,deletion,,
+10.247,freedom,,
+10.099,objective,,
+9.977,sorry wrong,,

Weight?,Feature
+22.081,atheism
+14.722,bobby
+14.370,islam
+12.898,atheists
+12.647,religion
+12.033,motto
+11.385,deletion
+10.247,freedom
+10.099,objective
+9.977,sorry wrong

Weight?,Feature
+26.296,graphics
+16.491,image
+16.108,files
+15.644,file
+14.675,3d
+14.177,images
+14.146,ftp
+14.029,program
+13.913,software
+13.524,computer

Weight?,Feature
+18.259,msg
+16.838,doctor
+15.478,yes point
+12.998,food
+12.943,blood
+12.529,disease
+12.525,sci
+11.786,pain
+11.498,treatment
… 10943 more positive …,… 10943 more positive …

Weight?,Feature
+20.009,christians
+18.379,jesus
+17.741,church
+17.099,god
+16.980,christian
+13.407,christianity
+13.069,heaven
+12.768,christ
+12.607,sin
+12.488,catholic


In [202]:
sample=twenty_test.data[0]
sentences = nltk.sent_tokenize(sample)
tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)


def extract_entity_names(t):
    entity_names = []
    print(t)
    if hasattr(t, 'node') and t.node:
        if t.node == 'NE':
            entity_names.append(' '.join([child[0] for child in t]))
        else:
            for child in t:
                entity_names.extend(extract_entity_names(child))
                
    return entity_names

entity_names = []
for tree in chunked_sentences:
    # Print results per sentence
    # print extract_entity_names(tree)
    
    entity_names.extend(extract_entity_names(tree))

# Print all entity names
#print entity_names

# Print unique entity names
print set(entity_names)

(S
  As/IN
  I/PRP
  recall/VBP
  from/IN
  my/PRP$
  bout/NN
  with/IN
  kidney/NN
  stones/NNS
  ,/,
  there/EX
  is/VBZ
  n't/RB
  any/DT
  medication/NN
  that/WDT
  can/MD
  do/VB
  anything/NN
  about/IN
  them/PRP
  except/IN
  relieve/VBP
  the/DT
  pain/NN
  ./.)
(S
  Either/DT
  they/PRP
  pass/VBP
  ,/,
  or/CC
  they/PRP
  have/VBP
  to/TO
  be/VB
  broken/VBN
  up/RP
  with/IN
  sound/NN
  ,/,
  or/CC
  they/PRP
  have/VBP
  to/TO
  be/VB
  extracted/VBN
  surgically/RB
  ./.)
set([])
