In [2]:
import pandas as pd
import numpy as np

# Newsgroup categories to load; passed to fetch_20newsgroups for both the
# training and the test split.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

In [3]:
from sklearn.datasets import fetch_20newsgroups
# Load the training split restricted to the selected categories. Shuffling
# with a fixed random_state keeps the document order reproducible.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [4]:
# Label names of the loaded dataset. Class indices are looked up in this list
# later in the notebook; note that its order differs from `categories`.
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [5]:
# List the fields exposed by the loaded dataset object.
twenty_train.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [19]:
# Inspect one raw training document (index 2) as a list of lines. The header
# fields (From / Subject / Organization / Lines) are part of the text itself.
twenty_train.data[2].split("\n")

['From: djohnson@cs.ucsd.edu (Darin Johnson)',
 'Subject: Re: harrassed at work, could use some prayers',
 'Organization: =CSE Dept., U.C. San Diego',
 'Lines: 63',
 '',
 "(Well, I'll email also, but this may apply to other people, so",
 "I'll post also.)",
 '',
 ">I've been working at this company for eight years in various",
 ">engineering jobs.  I'm female.  Yesterday I counted and realized that",
 ">on seven different occasions I've been sexually harrassed at this",
 '>company.',
 '',
 '>I dreaded coming back to work today.  What if my boss comes in to ask',
 '>me some kind of question...',
 '',
 'Your boss should be the person bring these problems to.  If he/she',
 'does not seem to take any action, keep going up higher and higher.',
 'Sexual harrassment does not need to be tolerated, and it can be an',
 'enormous emotional support to discuss this with someone and know that',
 'they are trying to do something about it.  If you feel you can not',
 'discuss this with your boss, perh

In [20]:
# Render only the first 20 lines of document 2 as plain text, one per row.
print(*twenty_train.data[2].split("\n")[:20], sep="\n")


From: djohnson@cs.ucsd.edu (Darin Johnson)
Subject: Re: harrassed at work, could use some prayers
Organization: =CSE Dept., U.C. San Diego
Lines: 63

(Well, I'll email also, but this may apply to other people, so
I'll post also.)

>I've been working at this company for eight years in various
>engineering jobs.  I'm female.  Yesterday I counted and realized that
>on seven different occasions I've been sexually harrassed at this
>company.

>I dreaded coming back to work today.  What if my boss comes in to ask
>me some kind of question...

Your boss should be the person bring these problems to.  If he/she
does not seem to take any action, keep going up higher and higher.
Sexual harrassment does not need to be tolerated, and it can be an
enormous emotional support to discuss this with someone and know that


In [8]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary from the training documents and encode them as a
# document-term matrix of token counts (one row per document).
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
# Display the matrix dimensions: (number of documents, vocabulary size).
X_train_counts.shape

(2257, 35788)

In [9]:
# Show the repr of the count matrix; it is stored in a sparse format rather
# than as a dense array.
X_train_counts

<2257x35788 sparse matrix of type '<class 'numpy.int64'>'
	with 365886 stored elements in Compressed Sparse Row format>

In [10]:
from sklearn.feature_extraction.text import TfidfTransformer


In [11]:
# Fit a tf-idf transformer on the count matrix and use it to re-weight the
# counts. The fitted transformer is kept so new documents can be transformed
# with the same weights later.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# The shape is the same as that of the count matrix.
X_train_tfidf.shape

(2257, 35788)

In [12]:
from sklearn.naive_bayes import MultinomialNB
# Train a multinomial naive Bayes classifier on the tf-idf features, using
# `twenty_train.target` as the class labels.
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

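BUILDING A PIPELINE

The vectorizer, tf-idf transformer and classifier that were fitted step by step above are chained here into a single estimator that accepts raw text.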

In [13]:
from sklearn.pipeline import Pipeline
# Chain vectorizer -> tf-idf -> classifier into one estimator. The step names
# ('vect', 'tfidf', 'clf') identify each stage inside the pipeline.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])


In [14]:
# Fit the whole pipeline on the raw training documents and their labels.
text_clf.fit(twenty_train.data, twenty_train.target)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [15]:
import numpy as np
# Evaluate the fitted pipeline on the test split of the same categories.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)
# Accuracy: fraction of test documents whose predicted label equals the target.
(predicted == twenty_test.target).mean()


0.8348868175765646

In [16]:
# Classify two unseen sentences with the separately fitted estimators
# (count_vect -> tfidf_transformer -> clf). Only `transform` is called, so the
# vocabulary and weights learned from the training set are reused as-is.
docs_new = ['God is love', 'Corona is pandemic']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
# Store the result under its own name so the test-set predictions held in
# `predicted` are not overwritten by this two-element array.
predicted_new = clf.predict(X_new_tfidf)
for doc, category in zip(docs_new, predicted_new):
    # `category` is a class index; map it back to the newsgroup name.
    print('%r => %s' % (doc, twenty_train.target_names[category]))


'God is love' => soc.religion.christian
'Corona is pandemic' => sci.med
