In [10]:
import numpy as np
from sklearn.pipeline import FeatureUnion, Pipeline 
from sklearn import datasets
from sklearn import feature_selection
from sklearn.svm import LinearSVC
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction import DictVectorizer
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
# load the dataset

newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))



In [11]:
# 1. import and instantiate CountVectorizer (with the default parameters)
from sklearn.feature_extraction.text import CountVectorizer

# 2. instantiate CountVectorizer (vectorizer)
vect = CountVectorizer()

In [12]:
newsgroups_train.target_names


['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [13]:
n = 102
print('Topic = {0}\n'.format(newsgroups_train.target_names[newsgroups_train.target[n]]))
print(newsgroups_train.data[n])

Topic = rec.motorcycles

/ hpcc01:rec.motorcycles / Stafford@Vax2.Winona.MSUS.Edu (John Stafford) / 11:06 am  Apr  1, 1993 /
 

  It depreciates much faster, too.
   
John Stafford   Minnesota State University @ Winona
                    All standard disclaimers apply.
----------
The '84 GL1200A hit the traps at 13.34 according to Cycle magazine. Yeah,
they depreciate faster than Harleys for the first couple of years then
they bottom out. Got my '86 GL1200I w/ 2275 miles on the odometer for
just under $5K in May of 1990 and would ask for $4500 now with almost
16K miles onnit....that's about 50% of what a new GL1500I would cost.

Think the '86 GL1200I originally sold for $6500 brand new, not sure. 
If that's the case then it depreciated 30.77% over 7 years or a mere
$2000. Big Fat Hairy Deal! Based on what I know, Harleys tend to
depreciate your monies far more than the initial depreciation of
the bike itself when it comes to parts and service. All this about
Harleys holding their valu

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(lowercase=True, stop_words=None, analyzer='word', binary=True)
vectorizer.fit(newsgroups_train.data)

CountVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [15]:
len(vectorizer.vocabulary_)

101631

In [16]:

X_train = vectorizer.fit_transform(newsgroups_train.data)
X_train.shape

(11314, 101631)

In [17]:
count_vect = CountVectorizer(binary=False)
X_train_counts = count_vect.fit_transform(newsgroups_train.data)
X_train_counts.shape

(11314, 101631)

In [18]:
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB().fit(X_train_counts, newsgroups_train.target)

In [19]:
newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

X_test = vectorizer.transform(newsgroups_test.data)

In [20]:
predicts = clf.predict(X_test)

In [21]:
count_vect = CountVectorizer(binary=False)
X_train_counts = count_vect.fit_transform(newsgroups_train.data)

In [22]:
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB().fit(X_train_counts, newsgroups_train.target)

In [23]:
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

count_vect = CountVectorizer(stop_words=ENGLISH_STOP_WORDS, binary=False)
X_train_counts = count_vect.fit_transform(newsgroups_train.data)

In [24]:
clf = MultinomialNB().fit(X_train_counts, newsgroups_train.target)

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS).fit(newsgroups_train.data)

In [26]:
X_train_vectors = vectorizer.transform(newsgroups_train.data)
X_train_vectors[0].data

array([0.09418459, 0.13703598, 0.25808578, 0.16368393, 0.16329311,
       0.11339407, 0.14613089, 0.12706904, 0.12197187, 0.08978258,
       0.1614203 , 0.13037295, 0.10043854, 0.10634736, 0.13520842,
       0.13822597, 0.06961998, 0.11869933, 0.12504221, 0.2245489 ,
       0.20599311, 0.14341273, 0.12667096, 0.17300821, 0.1484788 ,
       0.10526009, 0.46579831, 0.10548299, 0.19644481, 0.24723135,
       0.12937103, 0.14077746, 0.20599311, 0.20797701])

In [27]:
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(11314, 101322)

In [28]:
tf_transformer = TfidfTransformer()
X_train_tf = tf_transformer.fit_transform(X_train_counts)
X_train_tf.shape

(11314, 101322)

In [29]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tf, newsgroups_train.target)

In [30]:
docs_new = ['Jesus', 'The Solar Systemis the gravitationally bound planetary system of the Sun ']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)
for doc, category in zip(docs_new, predicted):
     print('%r => %s' % (doc, newsgroups_train.target_names[category]))

'Jesus' => soc.religion.christian
'The Solar Systemis the gravitationally bound planetary system of the Sun ' => sci.space


In [31]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([
     ('vect', CountVectorizer()),
     ('tfidf', TfidfTransformer()),
     ('clf', MultinomialNB()),
 ])

In [32]:
text_clf.fit(newsgroups_train.data, newsgroups_train.target)  


Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [33]:
newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))
docs_test = newsgroups_test.data
predicted = text_clf.predict(docs_test)
np.mean(predicted == newsgroups_test.target)            


0.6062134891131173

In [34]:
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                           alpha=1e-3, random_state=42,
                           max_iter=5, tol=None)),
 ])
text_clf.fit(newsgroups_train.data, newsgroups_train.target)
predicted = text_clf.predict(docs_test)
np.mean(predicted == newsgroups_test.target)



0.6836165693043016

In [35]:
from sklearn import metrics
print(metrics.classification_report(newsgroups_test.target, predicted,
     target_names=newsgroups_test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.56      0.42      0.48       319
           comp.graphics       0.69      0.67      0.68       389
 comp.os.ms-windows.misc       0.67      0.60      0.63       394
comp.sys.ibm.pc.hardware       0.65      0.65      0.65       392
   comp.sys.mac.hardware       0.76      0.68      0.72       385
          comp.windows.x       0.74      0.71      0.73       395
            misc.forsale       0.48      0.85      0.61       390
               rec.autos       0.79      0.70      0.74       396
         rec.motorcycles       0.73      0.77      0.75       398
      rec.sport.baseball       0.82      0.78      0.80       397
        rec.sport.hockey       0.82      0.91      0.86       399
               sci.crypt       0.71      0.74      0.73       396
         sci.electronics       0.67      0.49      0.57       393
                 sci.med       0.76      0.79      0.78       396
         

In [36]:
metrics.confusion_matrix(newsgroups_test.target, predicted)

array([[135,   0,   2,   2,   0,   2,  13,   4,   9,   5,   4,   4,   4,
          8,  20,  73,   6,  20,   0,   8],
       [  7, 262,  20,  12,   7,  22,  12,   1,   5,   3,   0,  14,   5,
          2,  13,   1,   1,   2,   0,   0],
       [  2,  15, 238,  34,  18,  23,  18,   1,   3,   4,   2,   4,   1,
          8,  11,   1,   4,   1,   4,   2],
       [  0,  14,  26, 256,  21,   9,  25,   2,   1,   1,   2,   9,  21,
          1,   1,   0,   0,   2,   1,   0],
       [  0,   6,   6,  35, 261,   7,  30,   6,   9,   0,   3,   5,   9,
          2,   4,   1,   1,   0,   0,   0],
       [  0,  37,  33,   4,   6, 282,  14,   0,   2,   1,   0,   6,   3,
          1,   5,   0,   0,   1,   0,   0],
       [  0,   3,   0,  12,   9,   0, 330,   7,   6,   2,   2,   1,   5,
          0,   6,   1,   5,   1,   0,   0],
       [  6,   1,   4,   1,   1,   3,  37, 276,  21,   3,   3,   2,  16,
          2,   5,   0,   8,   5,   2,   0],
       [  2,   0,   1,   1,   1,   0,  24,  16, 307,   4,   0,  

In [47]:
x= vectorizer.fit_transform(newsgroups_train.data)
y= newsgroups_train.target

from sklearn.linear_model import LogisticRegressionCV      # using multiNomial Naive Bayes as classifier

clf= LogisticRegressionCV()
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
     ('clf', LogisticRegressionCV (loss='hinge', penalty='l2',
                           alpha=1e-3, random_state=42,
                           max_iter=5, tol=None)),
 ])
clf.fit(X_train_counts, newsgroups_train.target)
pred= clf.predict(newsgroups_test)


TypeError: __init__() got an unexpected keyword argument 'loss'

In [44]:
clf.Cs   

AttributeError: 'MultinomialNB' object has no attribute 'Cs'

In [None]:
from sklearn.metrics import classification_report , accuracy_score
print('accuracy=',accuracy_score( newsgroups_train.target,pred))
print(classification_report( newsgroups_train.target, pred))