In [1]:
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']

In [2]:
from sklearn.datasets import fetch_20newsgroups
twenty_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=22)

In [6]:
twenty_train.target

2257

In [11]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(twenty_train)
x_train_counts

<5x5 sparse matrix of type '<class 'numpy.int64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

text_clf = Pipeline([
    ('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())
])

text_clf.fit(twenty_train.data, twenty_train.target)

In [13]:
import numpy as np

twenty_test = fetch_20newsgroups(subset='test', shuffle=True, categories=categories, random_state=22)

predicted = text_clf.predict(twenty_test.data)

In [14]:
np.mean(predicted == twenty_test.target)

0.8348868175765646

#### SVM classifer

In [18]:
from sklearn.linear_model import SGDClassifier

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=22, max_iter=15, tol=None))
])

text_clf.fit(twenty_train.data, twenty_train.target)

In [19]:
predicted = text_clf.predict(twenty_test.data)
np.mean(predicted == twenty_test.target)

0.9101198402130493

In [21]:
from sklearn.model_selection import GridSearchCV

parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}

In [22]:
gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)
gs_clf = gs_clf.fit(twenty_train.data[:400], twenty_train.target[:400])

In [23]:
print(gs_clf.best_score_)

for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

0.8949999999999999
clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 1)
