### Fetch data 

In [3]:
from sklearn.datasets import fetch_20newsgroups

# Restrict the corpus to four newsgroups to keep the example small.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# Training split only, shuffled with a fixed seed.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [13]:
# Number of training documents that were loaded.
n_train_docs = len(twenty_train.data)
n_train_docs

2257

In [14]:
# Category names, in the order used by the integer labels.
class_names = twenty_train.target_names
print('classes are :', class_names)

classes are : ['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']


In [15]:
# Path of the file backing the first training sample.
first_path = twenty_train.filenames[0]
first_path

'/home/admin/scikit_learn_data/20news_home/20news-bydate-train/comp.graphics/38440'

In [16]:
# Map the first sample's integer label to its category name.
first_label_idx = twenty_train.target[0]
print(twenty_train.target_names[first_label_idx])

comp.graphics


### Extracting features from text files

### Bag of words :
- assign a fixed integer id to each word occurring in any document of the training set (for instance by building a dictionary from words to integer indices).
- for each document #i, count the number of occurrences of each word w and store it in X[i, j] as the value of feature #j where j is the index of word w in the dictionary
##### The number of distinct words (features) is typically larger than 100,000. If n_samples == 10000, storing X as a dense numpy array of type float32 would require 10000 x 100000 x 4 bytes = 4GB in RAM, which is barely manageable on today’s computers.

### Fortunately, most values in X will be zeros. 
- For this reason we say that bags of words are typically high-dimensional sparse datasets. 
- scipy.sparse matrices are data structures that exploit this: they store only the non-zero entries, which saves a lot of memory.

### Tokenizing text with scikit-learn

In [18]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer created with all default settings (no arguments passed).
count_vect = CountVectorizer()

In [19]:
# Learn the vocabulary and build the document-term count matrix in one call.
train_docs = twenty_train.data  # input : list of strings
X_train_counts = count_vect.fit_transform(train_docs)
type(X_train_counts)

scipy.sparse.csr.csr_matrix

In [20]:
# Rows are documents, columns are vocabulary entries.
n_docs, n_features = X_train_counts.shape
(n_docs, n_features)

(2257, 35788)

### CountVectorizer supports counts of N-grams of words or consecutive characters. 
- Once fitted, the vectorizer has built a dictionary of feature indices:

In [21]:
# List every attribute and method name exposed by the fitted vectorizer.
vectorizer_attrs = dir(count_vect)
print(vectorizer_attrs)

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_char_ngrams', '_char_wb_ngrams', '_check_vocabulary', '_count_vocab', '_get_param_names', '_limit_features', '_sort_features', '_validate_vocabulary', '_white_spaces', '_word_ngrams', 'analyzer', 'binary', 'build_analyzer', 'build_preprocessor', 'build_tokenizer', 'decode', 'decode_error', 'dtype', 'encoding', 'fit', 'fit_transform', 'fixed_vocabulary_', 'get_feature_names', 'get_params', 'get_stop_words', 'input', 'inverse_transform', 'lowercase', 'max_df', 'max_features', 'min_df', 'ngram_range', 'preprocessor', 'set_params', 'stop_words', 'stop_words_', 'strip_accents', 'token_pattern', 'tokenizer', 'transform'

In [27]:
# Look up the feature index assigned to a word in the fitted vocabulary.
# vocabulary_ maps each term to its column index (not to a count); the
# per-document counts of that term are stored in that column of X_train_counts.
# The earlier call to the private count_vect._count_vocab(...) was removed:
# its return value was discarded, so it only re-tokenized the whole corpus.
count_vect.vocabulary_.get('algorithm')

4690

### The value returned for the word 'algorithm' is its index in the vocabulary, i.e. the column of X_train_counts that holds the per-document counts of that word.

### From occurrences to frequencies

In [38]:
from sklearn.feature_extraction.text import TfidfTransformer

# Fit the transformer on the training counts, then re-weight those same counts.
tfidf_transformer = TfidfTransformer(use_idf=True)
tfidf_transformer.fit(X_train_counts)
print(type(tfidf_transformer))
X_train_tfidf = tfidf_transformer.transform(X_train_counts)

<class 'sklearn.feature_extraction.text.TfidfTransformer'>


### CountVectorizer.fit_transform takes the raw documents; TfidfTransformer.transform takes the resulting count matrix

In [33]:
# Shape of the tf-idf matrix produced by tfidf_transformer.transform(X_train_counts).
# (The previous name, X_train_tf, is not defined anywhere in this notebook and
# raised NameError on a fresh kernel.)
X_train_tfidf.shape

(2257, 35788)

### Training a classifier
- scikit-learn includes several variants of the naive Bayes classifier;
- the one most suitable for word counts is the multinomial variant:

In [39]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial naive Bayes model on the tf-idf features and integer labels.
nb_model = MultinomialNB()
clf = nb_model.fit(X_train_tfidf, twenty_train.target)

### Prediction

In [37]:
# new data
# Two unseen example sentences to classify.
docs_new = [
    'God is love',
    'OpenGL on the GPU is fast',
]

In [40]:
# feature engineering: push the new documents through the already-fitted
# vectorizer and tf-idf transformer. Only transform() is called here (not
# fit_transform()), so the objects fitted on the training data are reused as-is.
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

In [41]:
# Predict one label per new document; the labels are used below as indices
# into twenty_train.target_names.
predicted = clf.predict(X_new_tfidf)

In [43]:
# Show each new document next to the name of its predicted category.
for doc, label_idx in zip(docs_new, predicted):
    label_name = twenty_train.target_names[label_idx]
    print(f'{doc} => {label_name}')

God is love => soc.religion.christian
OpenGL on the GPU is fast => comp.graphics


### Building a pipeline
- make the vectorizer => transformer => classifier easier to work with
- The step names ('vect', 'tf-idf' and 'classifier' below) are arbitrary

In [45]:
from sklearn.pipeline import Pipeline

# Chain vectorizer -> tf-idf re-weighting -> classifier into a single estimator.
pipeline_steps = [
    ('vect', CountVectorizer()),
    ('tf-idf', TfidfTransformer()),
    ('classifier', MultinomialNB()),
]
text_clf = Pipeline(pipeline_steps)

In [46]:
# Fit the whole pipeline on the training documents and their labels.
# Left as the cell's last expression so the fitted pipeline is displayed.
text_clf.fit(twenty_train.data, twenty_train.target)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...f=False, use_idf=True)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

### Evaluation of the performance on the test set

In [47]:
import numpy as np

# Load the test split for the same categories and run the pipeline on it.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)

# Accuracy: fraction of test documents whose predicted label equals the true label.
is_correct = predicted == twenty_test.target
np.mean(is_correct)

0.83488681757656458

In [None]:
# Source: scikit-learn tutorial "Working With Text Data"
# http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html