In [17]:
from sklearn.datasets import fetch_20newsgroups
# Load the training split of the 20 newsgroups dataset (no category filter).
newsgroups_train = fetch_20newsgroups(subset='train')

from pprint import pprint
# Show the names of the newsgroup categories contained in the dataset.
pprint(list(newsgroups_train.target_names))

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']


In [18]:
# Size of the array of source file names in the training split.
newsgroups_train.filenames.shape

(11314,)

In [19]:
# Size of the target (label) array; it should match the number of file names.
newsgroups_train.target.shape

(11314,)

In [20]:
# Peek at the first ten integer labels.
newsgroups_train.target[:10]

array([ 7,  4,  4,  1, 14, 16, 13,  3,  2,  4])

Converting text to vectors:

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Restrict this example to four newsgroups.
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
# NOTE: this rebinds newsgroups_train, replacing the full training set loaded
# earlier with the four-category subset.
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
vectorizer = TfidfVectorizer()
# Fit the vectorizer on the posts and build the sparse TF-IDF matrix in one step.
vectors = vectorizer.fit_transform(newsgroups_train.data)
# Shape of the resulting matrix (rows correspond to the input posts).
vectors.shape

(2034, 34118)

In [22]:
# Average number of stored (non-zero) entries per row of the TF-IDF matrix;
# compare with the column count above to see how sparse the matrix is.
vectors.nnz / float(vectors.shape[0])

159.0132743362832

Multinomial Naive Bayes classifier:

In [28]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# Reload the full train/test splits with headers, footers (signature blocks)
# and quoted replies stripped, so the classifier has to rely on the post body
# rather than on metadata.
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

# Use a fresh vectorizer instead of re-fitting the object left over from the
# four-category cell: this cell then no longer depends on that earlier state.
# fit_transform() refits from scratch, so the resulting features are the same.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(newsgroups_train.data)

clf = MultinomialNB(alpha=.01)
clf.fit(vectors, newsgroups_train.target)

# Only transform the test posts: the vocabulary and IDF weights must come
# from the training data alone.
vectors_test = vectorizer.transform(newsgroups_test.data)
pred = clf.predict(vectors_test)

# Macro-averaged F1 over all newsgroups (each class weighted equally).
metrics.f1_score(newsgroups_test.target, pred, average='macro')

0.682861129525057

Most informative features (the ten highest-weighted terms for each category):

In [29]:
import numpy as np
def show_top10(classifier, vectorizer, categories):
    """Print the ten highest-weighted feature names for each category.

    Parameters
    ----------
    classifier : fitted estimator
        Must expose per-class feature weights, indexable by class position,
        as ``feature_log_prob_`` (naive Bayes models) or ``coef_``.
    vectorizer : fitted vectorizer
        Supplies the feature names via ``get_feature_names_out()`` or, on
        older scikit-learn releases, ``get_feature_names()``.
    categories : sequence of str
        Category labels; the i-th label is paired with the i-th weight row.

    Prints one line per category in the form ``"<category>: <terms>"``, with
    the terms in ascending order of weight (strongest term last).
    """
    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); support both so the cell runs on either.
    if hasattr(vectorizer, "get_feature_names_out"):
        names = vectorizer.get_feature_names_out()
    else:
        names = vectorizer.get_feature_names()
    feature_names = np.asarray(names)

    # MultinomialNB.coef_ was removed in scikit-learn 1.1. For multiclass naive
    # Bayes it held the same values as feature_log_prob_, which also keeps one
    # row per class in the two-class case (the old coef_ did not).
    weights = getattr(classifier, "feature_log_prob_", None)
    if weights is None:
        weights = classifier.coef_

    for i, category in enumerate(categories):
        # argsort is ascending, so the last ten indices are the largest weights.
        top10 = np.argsort(weights[i])[-10:]
        print("%s: %s" % (category, " ".join(feature_names[top10])))

# Print the top terms per newsgroup for the classifier and vectorizer fitted above.
show_top10(clf, vectorizer, newsgroups_train.target_names)

alt.atheism: not in and it you is that of to the
comp.graphics: you in graphics it is for of and to the
comp.os.ms-windows.misc: file of you for and is it to windows the
comp.sys.ibm.pc.hardware: with scsi for of drive is it and to the
comp.sys.mac.hardware: that apple for of mac it and is to the
comp.windows.x: for this it in of is and window to the
misc.forsale: or in shipping offer 00 to and sale the for
rec.autos: is that in it of you and to car the
rec.motorcycles: for that in of you it and bike to the
rec.sport.baseball: year was is that of in and to he the
rec.sport.hockey: hockey team that game of he and in to the
sci.crypt: in be it is that key and of to the
sci.electronics: that for in it you is and of to the
sci.med: this you that in it and is to of the
sci.space: for that it is in and space of to the
soc.religion.christian: you it in god and is that to of the
talk.politics.guns: it gun is you in and that of to the
talk.politics.mideast: it is israel that you in and to of th