Getting data:

In [56]:
from sklearn.datasets import fetch_20newsgroups
# None loads all 20 newsgroups. To experiment on a smaller problem, pass a list
# of group names instead, e.g.
# ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space'].
categories = None
# Strip headers, footers and quoted replies from every post, and shuffle with a
# fixed seed so the document order is the same on every run.
twenty_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), categories=categories, shuffle=True, random_state=42)

from pprint import pprint
# Show the names of the newsgroups that were loaded (the classification targets).
pprint(list(twenty_train.target_names))

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']


In [57]:
# Sanity check: one file path per training document, so this is the size of the training set.
twenty_train.filenames.shape

(11314,)

In [58]:
# The label array should have exactly one entry per training document.
twenty_train.target.shape

(11314,)

In [59]:
# Peek at the first ten labels: classes are encoded as integers.
twenty_train.target[:10]

array([ 7,  4,  4,  1, 14, 16, 13,  3,  2,  4])

Converting text to TF-IDF vectors and training a linear SVM:

In [84]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

# Three-stage text classifier: raw token counts, then TF-IDF re-weighting,
# then an L2-regularised linear model fitted by stochastic gradient descent.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(
        penalty='l2',
        alpha=0.0001,
        max_iter=50,
        random_state=42,
    )),
])
# Bind the return value to `_` so the fitted pipeline's repr is not echoed as cell output.
_ = text_clf.fit(twenty_train.data, twenty_train.target)

In [85]:
import numpy as np
# Load the held-out test split with the same preprocessing as the training data.
twenty_test = fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers', 'quotes'),
    categories=categories,
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)
# Overall accuracy: the fraction of test posts whose predicted label equals the true label.
np.mean(predicted == twenty_test.target)

0.69676048858205

In [86]:
from sklearn import metrics
# Per-class precision, recall and F1 on the test split, labelled with the newsgroup names.
print(
    metrics.classification_report(
        twenty_test.target,
        predicted,
        target_names=twenty_test.target_names,
    )
)

                          precision    recall  f1-score   support

             alt.atheism       0.53      0.49      0.51       319
           comp.graphics       0.67      0.72      0.69       389
 comp.os.ms-windows.misc       0.63      0.62      0.62       394
comp.sys.ibm.pc.hardware       0.68      0.66      0.67       392
   comp.sys.mac.hardware       0.76      0.71      0.74       385
          comp.windows.x       0.80      0.71      0.75       395
            misc.forsale       0.77      0.81      0.79       390
               rec.autos       0.78      0.71      0.74       396
         rec.motorcycles       0.80      0.76      0.78       398
      rec.sport.baseball       0.55      0.84      0.67       397
        rec.sport.hockey       0.85      0.89      0.87       399
               sci.crypt       0.83      0.72      0.77       396
         sci.electronics       0.62      0.56      0.59       393
                 sci.med       0.75      0.78      0.77       396
         

In [87]:
# Confusion matrix on the test split: each row is a true class and each column a
# predicted class, so off-diagonal cells show which newsgroups get mixed up.
metrics.confusion_matrix(twenty_test.target, predicted)

array([[155,   2,   1,   1,   0,   2,   3,   4,   3,  12,   3,   3,  10,
         10,  12,  53,   7,  14,   6,  18],
       [  7, 279,  20,   9,   5,  18,   4,   1,   3,   9,   1,   8,   8,
          2,   9,   2,   1,   2,   0,   1],
       [  4,  19, 244,  31,  16,  16,   4,   2,   2,  18,   2,   1,   1,
          9,  10,   1,   3,   3,   6,   2],
       [  1,  12,  35, 257,  23,   6,  12,   1,   2,   8,   1,   2,  26,
          1,   1,   0,   1,   0,   2,   1],
       [  1,   6,   9,  28, 273,   6,   9,   4,   6,  14,   1,   3,  11,
          1,   5,   2,   4,   0,   2,   0],
       [  0,  43,  36,   5,   5, 279,   4,   0,   0,   5,   1,   3,   4,
          1,   7,   1,   0,   1,   0,   0],
       [  0,   3,   2,  10,  11,   0, 316,   6,   4,  11,   2,   1,   9,
          0,   5,   2,   5,   1,   1,   1],
       [  6,   0,   3,   2,   1,   1,  11, 280,  14,  29,   2,   1,  19,
          3,   8,   1,   5,   4,   4,   2],
       [  2,   2,   2,   1,   2,   0,   5,  21, 303,  15,   2,  

Tuning the classifier's hyperparameters with GridSearchCV:

In [80]:
from __future__ import print_function
from time import time
import logging

from sklearn.model_selection import GridSearchCV

# NOTE(review): presumably carried over from a stand-alone script; in a notebook
# this only prints IPython's auto-generated module docstring.
print(__doc__)

# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

# None means "use every newsgroup".
categories = None

print("Loading 20 newsgroups dataset for categories:")
print(categories)

# Training split with headers, footers and quoted replies stripped from each post.
data = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), categories=categories)
print("%d documents" % len(data.filenames))
print("%d categories" % len(data.target_names))
print()

# Same three stages as the hand-tuned `text_clf` pipeline. The classifier is
# seeded so that repeated runs of the search give the same scores; without a
# fixed random_state the SGD fits differ from run to run.
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(random_state=42)),
])

# Grid of candidate settings, keyed as "<step name>__<parameter name>".
# Only the classifier is tuned here; uncomment the entries below to also search
# over the vectorizer / TF-IDF options (each one multiplies the number of fits).
parameters = {
    #'vect__max_df': (0.5, 0.75, 1.0),
    #'vect__max_features': (None, 5000, 10000, 50000),
    #'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    #'tfidf__use_idf': (True, False),
    #'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': (0.001, 0.0001, 0.00001),
    'clf__penalty': ('l2', 'elasticnet'),
    'clf__max_iter': (5, 10, 50),
}

# This guard is true when the cell is executed in the notebook, so the search runs here.
if __name__ == "__main__":
    # n_jobs=-1 runs the cross-validation fits in parallel; verbose=1 reports progress.
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    # Time the whole search, including every cross-validation fit.
    t0 = time()
    grid_search.fit(data.data, data.target)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    # get_params() returns every setting of the winning pipeline; report only
    # the ones that were part of the search grid.
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

Automatically created module for IPython interactive environment
Loading 20 newsgroups dataset for categories:
None
11314 documents
20 categories

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'clf__alpha': (0.001, 0.0001, 1e-05),
 'clf__max_iter': (5, 10, 50),
 'clf__penalty': ('l2', 'elasticnet')}
Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:  7.2min finished


done in 444.105s

Best score: 0.751
Best parameters set:
	clf__alpha: 0.0001
	clf__max_iter: 50
	clf__penalty: 'l2'
