In [1]:
from sklearn.datasets import fetch_20newsgroups

In [2]:
# Only these four newsgroups are requested from the corpus.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# Load the training split of the 20 newsgroups dataset, limited to the
# categories above; shuffling uses a fixed random_state.
dataset = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [3]:
# Number of documents in the loaded training subset
len(dataset.data)

2257

In [4]:
dataset.target[:5]
# Each integer in `target` is an index into `dataset.target_names`

array([1, 1, 3, 3, 3], dtype=int64)

In [5]:
# Look up the category name for each of the first five integer labels
[dataset.target_names[label] for label in dataset.target[:5]]

['comp.graphics',
 'comp.graphics',
 'soc.religion.christian',
 'soc.religion.christian',
 'soc.religion.christian']

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit the vectorizer on the training documents and turn them into a matrix of
# token counts -- this is our bag of words.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(dataset.data)

In [7]:
from sklearn.feature_extraction.text import TfidfTransformer

# Fit the transformer on the raw counts and re-express the bag of words as
# tf-idf weights.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [8]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial naive Bayes classifier on the tf-idf features, using the
# integer category labels as targets. Now we have a classifier.
clf = MultinomialNB().fit(X_train_tfidf, dataset.target)

In [9]:
def predictor(vectorizer, tfidf, clf, dataset, text):
    """Classify one raw text string and return its category name.

    The text is wrapped in a single-element list and passed through
    ``vectorizer.transform`` and then ``tfidf.transform`` before being handed
    to ``clf.predict``.

    Args:
        vectorizer: Object whose ``transform`` accepts a list of strings.
        tfidf: Object whose ``transform`` accepts the vectorizer's output.
        clf: Object whose ``predict`` accepts the tf-idf output and returns
            an indexable sequence of label indices.
        dataset: Object exposing ``target_names``, indexed by label index.
        text: The single document to classify.

    Returns:
        The entry of ``dataset.target_names`` selected by the first (and only)
        predicted label.
    """
    counts = vectorizer.transform([text])
    weighted = tfidf.transform(counts)
    label_index = clf.predict(weighted)[0]
    return dataset.target_names[label_index]

# The text that goes in needs to be in the same format as the data we fitted on;
# that is what `predictor` takes care of before calling the classifier.

from functools import partial
# Bind the fitted objects once so that `predict(text)` only needs the string.
predict = partial(predictor, count_vect, tfidf_transformer, clf, dataset)

In [10]:
predict("Medicine helps people be healthy")
# Nice! It correctly predicted that the sentence is about medicine. Let's try another:

'sci.med'

In [11]:
predict("My GPU can run OpenGL 4.6")
# Nice! The graphics-related sentence is assigned to comp.graphics.

'comp.graphics'

In [12]:
# What we just did took quite a bit of code, but it doesn't have to be that way.
# Let's redo everything with a Pipeline:

from sklearn.pipeline import Pipeline

# Chain counting, tf-idf weighting and the classifier into a single estimator
# that is fitted on (and later fed) raw text.
# NOTE: this rebinds `clf`; a `partial` created earlier keeps the object it
# was given at that time.
clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# Kept as the cell's last expression so the fitted pipeline is displayed.
clf.fit(dataset.data, dataset.target)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [15]:
def predictor(clf, dataset, text):
    """Classify one raw text string and return its category name.

    Args:
        clf: Object whose ``predict`` accepts a list of raw strings and
            returns an indexable sequence of label indices.
        dataset: Object exposing ``target_names``, indexed by label index.
        text: The single document to classify.

    Returns:
        The entry of ``dataset.target_names`` selected by the first (and only)
        predicted label.
    """
    label_index = clf.predict([text])[0]
    return dataset.target_names[label_index]

# Bind the classifier and dataset so that `predict(text)` only needs the string.
predict = partial(predictor, clf, dataset)

In [16]:
# Same sentence as in the earlier example, classified through the new `predict`.
predict("My GPU can run OpenGL 4.6")

'comp.graphics'

In [None]:
# Nice! It worked.