In [44]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

import numpy as np

In [45]:
categories = [
    'alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med'
]
dataset = fetch_20newsgroups(
    subset='train', categories=categories, shuffle=True, random_state=42
)
# Loads in the 20 newsgroups dataset

In [46]:
clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [47]:
clf.fit(dataset.data, dataset.target)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [48]:
def predictor(clf, dataset, text):
    prediction = clf.predict([text])
    return dataset.target_names[prediction[0]]

In [49]:
predictor(clf, dataset, "My GPU can run OpenGL 4.6")

'comp.graphics'

In [50]:
test_dataset = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42)

In [51]:
prediction = clf.predict(test_dataset.data)
np.mean(prediction == test_dataset.target)
# 83% accurate

0.8348868175765646

In [52]:
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import HashingVectorizer
# Support vector machine, and a more efficient vectorizer

clf = Pipeline([
    ('vect', HashingVectorizer(alternate_sign=True)),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(max_iter=1000, tol=1e-3)),
])

clf.fit(dataset.data, dataset.target)

Pipeline(memory=None,
     steps=[('vect', HashingVectorizer(alternate_sign=True, analyzer='word', binary=False,
         decode_error='strict', dtype=<class 'numpy.float64'>,
         encoding='utf-8', input='content', lowercase=True,
         n_features=1048576, ngram_range=(1, 1), non_negative=False,
         norm='l2', pr...'l2', power_t=0.5, random_state=None,
       shuffle=True, tol=0.001, verbose=0, warm_start=False))])

In [53]:
prediction = clf.predict(test_dataset.data)
np.mean(prediction == test_dataset.target)
# Now it's 92% accurate!

0.9227696404793608

In [54]:
from sklearn import metrics

print(
    metrics.classification_report(
        test_dataset.target, prediction, target_names=test_dataset.target_names
    )
)

# More advanced performance metrics, lots of nice data here!

                        precision    recall  f1-score   support

           alt.atheism       0.95      0.85      0.90       319
         comp.graphics       0.90      0.97      0.93       389
               sci.med       0.95      0.89      0.92       396
soc.religion.christian       0.90      0.96      0.93       398

           avg / total       0.92      0.92      0.92      1502



In [12]:
from sklearn.model_selection import GridSearchCV
# This will search for the best parameters for our classifier

In [13]:
params = {
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3, 1e-4),
}
# These are the parameters we're going to be testing.
# The values in the dictionary are the arguments that will be tested.
# The keys in the dictionary are the parameter names (the weird underscore stuff is related to Pipeline)

In [14]:
gs_clf = GridSearchCV(clf, params, cv=5, n_jobs=-1)
# This is a heavy operation, so it's going to be running in parallel

In [22]:
gs_clf = gs_clf.fit(dataset.data, dataset.target)

In [23]:
gs_clf.best_params_
# Now that we know the best parameters, we can use them to improve our model
# (the best params turned out to be the default, so no change should be made)

{'clf__alpha': 0.0001, 'tfidf__use_idf': True}