In [1]:
from sklearn.datasets import fetch_20newsgroups

In [2]:
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

## train data preview

In [3]:
# Load the training split of the 20 newsgroups dataset.
X_train = fetch_20newsgroups(subset="train", shuffle=True)

# Preview: the category names, the first three lines of the first
# document, and the labels of the first three documents.
print(X_train.target_names)
print("\n".join(X_train.data[0].split("\n", 3)[:3]))
print(X_train.target[:3])

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
[7 4 4]


## Train Data preprocessing

In [4]:
# Fit the vectorizer on the training documents and build their count matrix.
# count_vect is reused later to transform the test documents.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(raw_documents=X_train.data)
X_train_counts.shape

(11314, 130107)

In [5]:

# Fit the tf-idf transformer on the training counts and transform them.
# Tfidf_transformer is reused later on the test counts.
Tfidf_transformer = TfidfTransformer()
X_train_tfidf = Tfidf_transformer.fit_transform(X=X_train_counts)
X_train_tfidf.shape

(11314, 130107)

## Naive Bayes Classifier 

In [6]:
# Multinomial naive Bayes classifier with the library's default settings.
NB_clf = MultinomialNB()

In [7]:
# Train on the tf-idf features of the training documents and their labels.
NB_clf = NB_clf.fit(X=X_train_tfidf, y=X_train.target)

## Test data preprocessing

In [8]:
# Load the test split of the 20 newsgroups dataset.
X_test = fetch_20newsgroups(subset="test", shuffle=True)

# Only transform() is called here: the vectorizer and the tf-idf transformer
# were already fitted on the training data and are reused unchanged.
X_test_tfidf = Tfidf_transformer.transform(
    count_vect.transform(X_test.data)
)

## Predicting on the test data

In [9]:
# Predicted labels for the test documents.
NB_predict = NB_clf.predict(X=X_test_tfidf)

In [10]:
# Accuracy: fraction of test documents whose predicted label equals the true one.
(NB_predict == X_test.target).mean()

0.7738980350504514

## Using a pipeline for training and testing

In [11]:
from sklearn.pipeline import Pipeline

In [15]:
# Vectorizer -> tf-idf -> naive Bayes chained into one estimator.
# This variant uses unigrams + bigrams, idf weighting, alpha = 0.01
# and fit_prior=True.
NB_text_clf = Pipeline(steps=[
    ("vect", CountVectorizer(ngram_range=(1, 2))),
    ("tfidf", TfidfTransformer(use_idf=True)),
    ("clf", MultinomialNB(alpha=1e-2, fit_prior=True)),
])

In [16]:
# Fit the pipeline on the raw training documents, predict the raw test
# documents, and show the resulting test accuracy.
NB_text_clf = NB_text_clf.fit(X=X_train.data, y=X_train.target)
NB_text_predict = NB_text_clf.predict(X=X_test.data)
(NB_text_predict == X_test.target).mean()

0.83005841741901221

In [20]:
# Same pipeline layout (unigrams + bigrams, idf weighting, alpha = 0.01),
# but with fit_prior=False on the classifier.
NB_text_clf = Pipeline(steps=[
    ("vect", CountVectorizer(ngram_range=(1, 2))),
    ("tfidf", TfidfTransformer(use_idf=True)),
    ("clf", MultinomialNB(alpha=1e-2, fit_prior=False)),
])

In [21]:
# Fit the fit_prior=False pipeline on the raw training documents, predict
# the raw test documents, and show the resulting test accuracy.
NB_text_clf = NB_text_clf.fit(X=X_train.data, y=X_train.target)
NB_text_predict = NB_text_clf.predict(X=X_test.data)
(NB_text_predict == X_test.target).mean()

0.83510355815188531

## model selection 

In [15]:
from sklearn.model_selection import GridSearchCV

In [16]:
# Search grid. Keys follow the "<pipeline step>__<parameter>" naming and
# address the 'vect', 'tfidf' and 'clf' steps of NB_text_clf.
parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2)],
    tfidf__use_idf=[True, False],
    clf__alpha=[1e-2, 1e-3],
)

In [17]:
# Grid search over `parameters` for the NB pipeline; n_jobs=-1 requests
# parallel execution.
NB_model_sele = GridSearchCV(
    estimator=NB_text_clf,
    param_grid=parameters,
    n_jobs=-1,
)

In [18]:
# Run the search on the raw training documents only.
NB_model_sele = NB_model_sele.fit(X=X_train.data, y=X_train.target)

In [19]:
# Scratch check of how the literal 1e-3 is displayed
# (presumably the smaller alpha candidate from the grid).
print(0.001)

0.001


I think there is a flaw in this experiment: the GridSearchCV score is a cross-validation score computed on the training data only (it never uses X_test),
yet we compare it with the prediction accuracy measured on X_test.

In [30]:
# Mean cross-validated score of the best parameter combination. It is
# computed from the training data only, so it is NOT directly comparable
# with accuracies measured on X_test.
print(NB_model_sele.best_score_)
print(NB_model_sele.best_params_)

# Held-out accuracy of the best estimator (GridSearchCV refits it on the
# full training set by default and predict() delegates to it). This is the
# number that can be compared with the other test-set accuracies in this
# notebook.
print(np.mean(NB_model_sele.predict(X_test.data) == X_test.target))

0.906752695775
{'clf__alpha': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}


## NLTK

In [21]:
import nltk
#nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [22]:
from nltk.stem.snowball import SnowballStemmer

In [23]:
# Shared English Snowball stemmer, created with ignore_stopwords=True.
# StemmedCountVectorizer reads this module-level object.
stemmer = SnowballStemmer(language="english", ignore_stopwords=True)

## build up stemmed count vectorizer 

In [32]:
class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose features are stemmed with the module-level `stemmer`.

    The regular CountVectorizer analyzer is built first (so preprocessing,
    tokenization, stop-word handling and n-gram generation are inherited
    unchanged), and every feature it yields is then stemmed.
    """

    def build_analyzer(self):
        """Return a callable mapping one raw document to a list of stemmed features."""
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()

        def stem_feature(feature):
            # With ngram_range > 1 the inherited analyzer yields word n-grams
            # as single strings joined by one space ("w1 w2"). Passing such a
            # string to the stemmer as a whole would treat it as one word, so
            # each word is stemmed on its own and the n-gram is re-joined.
            # For a plain unigram this is exactly stemmer.stem(feature).
            return " ".join(stemmer.stem(word) for word in feature.split(" "))

        return lambda doc: [stem_feature(feature) for feature in analyzer(doc)]

In [36]:
# Stemming vectorizer using the built-in 'english' stop-word list.
stemmed_count_vect = StemmedCountVectorizer(stop_words="english")

In [37]:
# Stemmed counts -> tf-idf -> multinomial naive Bayes.
# Note that the classifier step is named 'mnb' here, not 'clf'.
stemmed_text_NB_clf = Pipeline(steps=[
    ("vect", stemmed_count_vect),
    ("tfidf", TfidfTransformer()),
    ("mnb", MultinomialNB()),
])

In [38]:
# Search grid for the stemmed pipeline. Keys follow the
# "<pipeline step>__<parameter>" naming and address its 'vect', 'tfidf'
# and 'mnb' steps.
stemmed_parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2)],
    tfidf__use_idf=[True, False],
    mnb__alpha=[1e-2, 1e-3],
)

In [41]:
# Grid search over `stemmed_parameters` for the stemmed pipeline; n_jobs=-1
# requests parallel execution.
stemmed_NB_model_sel = GridSearchCV(
    estimator=stemmed_text_NB_clf,
    param_grid=stemmed_parameters,
    n_jobs=-1,
)

In [None]:
# Run the search on the raw training documents only.
stemmed_NB_model_sel = stemmed_NB_model_sel.fit(X=X_train.data, y=X_train.target)

In [None]:
# Predict the raw test documents with the fitted grid search and show the
# fraction of predictions that match the true labels.
stemmed_predictions = stemmed_NB_model_sel.predict(X=X_test.data)
(stemmed_predictions == X_test.target).mean()