In [1]:
from sklearn.datasets import fetch_20newsgroups # data
from sklearn.naive_bayes import MultinomialNB # model

from sklearn.feature_extraction.text import CountVectorizer # data processing
from sklearn.feature_extraction.text import TfidfTransformer # data processing

In [8]:
# The four newsgroup categories the classifier is trained to tell apart
newsgroups_to_download = [
    'comp.graphics',
    'rec.sport.hockey',
    'sci.electronics',
    'sci.space',
]

# Fixed random_state keeps the shuffled document order reproducible between runs
newsgroups = fetch_20newsgroups(
    categories=newsgroups_to_download,
    shuffle=True,
    random_state=265,
)

# Bag of words: map every document to a sparse vector of per-word counts.
# The fitted vectorizer is kept so later text can be encoded with the same vocabulary.
word_vector = CountVectorizer()
word_vector_counts = word_vector.fit_transform(newsgroups.data)

# Re-weight the raw counts so that document length matters less.
# NOTE: with its default settings TfidfTransformer produces tf-idf weights
# (term frequency scaled by inverse document frequency), not plain frequencies.
term_freq_transformer = TfidfTransformer()
term_freq = term_freq_transformer.fit_transform(word_vector_counts)

# Train a multinomial Naive Bayes classifier on the weighted features
model = MultinomialNB()
model.fit(term_freq, newsgroups.target)

In [11]:
# Classify a few made-up sentences with the trained model
fake_docs = [
    'My 1660 TI is a great GPU with lots of VRAM',
    'The player had a wicked slap shot',
    'I spent all day yesterday soldering banks of capacitors',
    'NASA has several rovers on Mars',
]

# Use transform (not fit_transform) so the new text is encoded with the
# vocabulary and weights that were learned from the training documents.
fake_counts = word_vector.transform(fake_docs)
fake_term_freq = term_freq_transformer.transform(fake_counts)

# Most likely newsgroup for each sentence
predicted = model.predict(fake_term_freq)
print('Predictions:')
for position, doc in enumerate(fake_docs):
    group = predicted[position]  # integer label, used as an index into target_names
    print('\t{0} => {1}'.format(doc, newsgroups.target_names[group]))

# Per-class probabilities, printed as a fixed-width table (17 characters per column).
# The header is taken from newsgroups.target_names; this assumes predict_proba's
# column order matches that list -- verify against model.classes_ if in doubt.
probabilities = model.predict_proba(fake_term_freq)
print('Probabilities:')
print(''.join(map('{:17}'.format, newsgroups.target_names)))
for probs in probabilities:
    print(''.join(map('{:<17.8}'.format, probs)))

Predictions:
	My 1660 TI is a great GPU with lots of VRAM => sci.space
	The player had a wicked slap shot => rec.sport.hockey
	I spent all day yesterday soldering banks of capacitors => sci.space
	NASA has several rovers on Mars => sci.space
Probabilities:
comp.graphics    rec.sport.hockey sci.electronics  sci.space        
0.17354896       0.19155755       0.21956498       0.41532851       
0.12948055       0.51155698       0.18248712       0.17647535       
0.18604814       0.24117771       0.27540452       0.29736963       
0.091902227      0.062842265      0.11893481       0.7263207        
