# Loading the 20 newsgroups dataset

In [1]:
# Newsgroups to load; this list is passed to fetch_20newsgroups below.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

In [2]:
from sklearn.datasets import fetch_20newsgroups

# Load the training split for the selected categories. Shuffling with a fixed
# random_state keeps the document order reproducible between runs.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [3]:
# Category names of the loaded subset; integer targets index into this list.
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [4]:
# Document texts and their source file names are parallel sequences,
# so both lengths are printed for comparison.
for collection in (twenty_train.data, twenty_train.filenames):
    print(len(collection))

2257
2257


In [5]:
# Preview only the first three lines of the first document.
print(*twenty_train.data[0].split("\n")[:3], sep="\n")

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton


In [6]:
# Map the first document's integer target back to its category name.
print(twenty_train.target_names[twenty_train.target[0]])

comp.graphics


In [7]:
# Integer category ids of the first ten documents (indices into target_names).
twenty_train.target[:10]

array([1, 1, 3, 3, 3, 3, 3, 2, 2, 2])

In [8]:
# Translate the first ten integer targets into category names, one per line.
print("\n".join(twenty_train.target_names[idx] for idx in twenty_train.target[:10]))

comp.graphics
comp.graphics
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
sci.med
sci.med
sci.med


# Tokenizing text

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
# Learn the vocabulary and build the document-term count matrix in one call.
X_train_counts = count_vect.fit_transform(twenty_train.data)
# One row per training document; the columns are the extracted token features.
print(X_train_counts.shape)

(2257, 35788)


In [10]:
# Look up the feature index the fitted vectorizer assigned to the token 'algorithm'.
count_vect.vocabulary_.get(u'algorithm')

4690

# From occurrences to frequencies

In [11]:
from sklearn.feature_extraction.text import TfidfTransformer
# use_idf=False turns off inverse-document-frequency weighting, so only term
# frequencies are produced. Fit and transform are kept as two explicit steps here.
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
# The transformed matrix keeps the shape of the count matrix.
print(X_train_tf.shape)

(2257, 35788)


In [12]:
# Default transformer settings this time; fit and transform happen in a single call.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# The transformed matrix keeps the shape of the count matrix.
print(X_train_tfidf.shape)

(2257, 35788)


# Training a classifier

In [13]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial naive Bayes classifier on the tf-idf features and integer targets.
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

In [14]:
# Unseen snippets to classify.
docs_new = [
    'God is love',
    'OpenGL on the GPU is fast',
    'Hello, world!',
    'Act now for instant savings',
    'Penalty for early withdrawl',
    'Algorithms and Data Structures',
]

# New documents go through the already-fitted vectorizer and transformer
# (transform only, no refit) so they are encoded with the training vocabulary.
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

# Show each snippet next to the name of its predicted category.
for text, label in zip(docs_new, predicted):
    print('%r => %s' % (text, twenty_train.target_names[label]))

# Blank line, then the class order followed by the per-class probabilities
# (one row per snippet, columns in target_names order).
print()
print(twenty_train.target_names)
print(clf.predict_proba(X_new_tfidf))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics
'Hello, world!' => comp.graphics
'Act now for instant savings' => soc.religion.christian
'Penalty for early withdrawl' => alt.atheism
'Algorithms and Data Structures' => comp.graphics

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']
[[ 0.16297502  0.03828016  0.03737814  0.76136668]
 [ 0.16387956  0.36874738  0.2364763   0.23089675]
 [ 0.13859729  0.37652451  0.23162016  0.25325805]
 [ 0.28986829  0.17798726  0.21465594  0.31748851]
 [ 0.30710321  0.19244132  0.20961331  0.29084216]
 [ 0.10273114  0.55085759  0.20328355  0.14312772]]


# Building a pipeline

In [15]:
from sklearn.pipeline import Pipeline

# Chain vectorizer -> tf-idf -> classifier so raw text can be fitted and
# predicted through a single object. fit() returns the pipeline itself.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]).fit(twenty_train.data, twenty_train.target)

# Evaluating performance

In [16]:
import numpy as np

# Held-out test split for the same categories.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)

# Fraction of test documents whose predicted label matches the true label.
print((predicted == twenty_test.target).mean())

0.834886817577


In [17]:
from sklearn.linear_model import SGDClassifier

# Same pipeline shape as before, with the classifier swapped for a linear model
# trained by SGD with hinge loss.
# `n_iter` was deprecated in scikit-learn 0.19 and removed in 0.21. Using
# `max_iter=5` with `tol=None` runs five epochs with no early stopping,
# which matches the old `n_iter=5` behaviour.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                          max_iter=5, tol=None, random_state=42)),
])
# Assign the result to `_` so the fitted pipeline's repr is not echoed as cell output.
_ = text_clf.fit(twenty_train.data, twenty_train.target)
predicted = text_clf.predict(docs_test)
# Accuracy on the test split, expressed as a percentage.
print(np.mean(predicted == twenty_test.target) * 100)

91.2782956059


In [18]:
from sklearn import metrics

# Per-class precision, recall and F1 for the most recent predictions.
print(metrics.classification_report(
    twenty_test.target,
    predicted,
    target_names=twenty_test.target_names,
))

# Confusion matrix computed from the same true and predicted labels.
print(metrics.confusion_matrix(twenty_test.target, predicted))

                        precision    recall  f1-score   support

           alt.atheism       0.95      0.81      0.87       319
         comp.graphics       0.88      0.97      0.92       389
               sci.med       0.94      0.90      0.92       396
soc.religion.christian       0.90      0.95      0.93       398

           avg / total       0.92      0.91      0.91      1502

[[258  11  15  35]
 [  4 379   3   3]
 [  5  33 355   3]
 [  5  10   4 379]]


# Parameter tuning using grid search

In [19]:
# `sklearn.grid_search` was deprecated in scikit-learn 0.18 and removed in 0.20;
# GridSearchCV is provided by `sklearn.model_selection`.
from sklearn.model_selection import GridSearchCV

# Candidate values to search over. Keys follow the `<step name>__<parameter>`
# pattern, where the step names are the ones given to the Pipeline.
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-2, 1e-3),
             }

In [20]:
# Run the grid search on the first 400 training documents only.
# n_jobs=-1 asks for all available processors; fit() returns the search object itself.
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1).fit(
    twenty_train.data[:400],
    twenty_train.target[:400],
)

In [21]:
# predict() returns an array even for a single document. Take element 0 so the
# Python list is indexed with a scalar: indexing a list with a one-element
# array is deprecated and raises TypeError on current NumPy.
print(twenty_train.target_names[gs_clf.predict(['God is love'])[0]])

soc.religion.christian


  if __name__ == '__main__':


In [22]:
# `grid_scores_` was removed in scikit-learn 0.20. The fitted search object
# exposes the winning parameter combination and its mean cross-validated score
# directly.
best_parameters = gs_clf.best_params_
score = gs_clf.best_score_

# Print the chosen value of every searched parameter in a stable (sorted) order.
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

# Best mean cross-validated score, expressed as a percentage.
print(score * 100)

clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 1)
90.0
