In [None]:
## Text classification with Naïve Bayes


In [18]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
import re
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer

### The `sklearn` news dataset

In [2]:
# subset="all" requests the whole corpus instead of only the predefined
# "train" or "test" portion; the notebook makes its own split further down.
news = datasets.fetch_20newsgroups(subset="all")

In [4]:
# DESCR is a multi-line text description; print() renders its newlines.
print(news.DESCR)

.. _20newsgroups_dataset:

The 20 newsgroups text dataset
------------------------------

The 20 newsgroups dataset comprises around 18000 newsgroups posts on
20 topics split in two subsets: one for training (or development)
and the other one for testing (or for performance evaluation). The split
between the train and test set is based upon a messages posted before
and after a specific date.

This module contains two loaders. The first one,
:func:`sklearn.datasets.fetch_20newsgroups`,
returns a list of the raw texts that can be fed to text feature
extractors such as :class:`~sklearn.feature_extraction.text.CountVectorizer`
with custom parameters so as to extract feature vectors.
The second one, :func:`sklearn.datasets.fetch_20newsgroups_vectorized`,
returns ready-to-use features, i.e., it is not necessary to use a feature
extractor.

**Data Set Characteristics:**

    Classes                     20
    Samples total            18846
    Dimensionality               1
    Features      

In [7]:
# First raw document: a single string that still contains the message headers.
news.data[0]

"From: Mamatha Devineni Ratnam <mr47+@andrew.cmu.edu>\nSubject: Pens fans reactions\nOrganization: Post Office, Carnegie Mellon, Pittsburgh, PA\nLines: 12\nNNTP-Posting-Host: po4.andrew.cmu.edu\n\n\n\nI am sure some bashers of Pens fans are pretty confused about the lack\nof any kind of posts about the recent Pens massacre of the Devils. Actually,\nI am  bit puzzled too and a bit relieved. However, I am going to put an end\nto non-PIttsburghers' relief with a bit of praise for the Pens. Man, they\nare killing those Devils worse than I thought. Jagr just showed you why\nhe is much better than his regular season stats. He is also a lot\nfo fun to watch in the playoffs. Bowman should let JAgr have a lot of\nfun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final\nregular season game.          PENS RULE!!!\n\n"

In [8]:
# Integer class label for every document, aligned with news.data.
news.target

array([10,  3, 17, ...,  3,  1,  7])

In [9]:
# Newsgroup names; the integer labels in news.target index into this list.
news.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

### Split data into testing and training
* We will use `75%` of the data for training and the rest for testing

In [13]:
# Raw documents are the inputs; the integer newsgroup ids are the labels.
X = news.data
y = news.target

# Both sequences must have the same length (one label per document).
len(X), len(y)

(18846, 18846)

In [14]:
# Hold out 25% of the documents for testing. The fixed random_state makes the
# shuffle, and therefore every score below, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [16]:
# Sizes of each piece of the split, in the order: X_train, X_test, y_test, y_train.
tuple(len(part) for part in (X_train, X_test, y_test, y_train))

(14134, 4712, 4712, 14134)

### Bag of words
* Machine learning algorithms work on numerical data, so we need to convert the text into numeric features.
* `sklearn.feature_extraction.text` provides three classes that transform text into numeric features:
    1. **`CountVectorizer`** - basically creates a dictionary of words from the text corpus. Then, each instance is converted to a vector of numeric features where each element is the number of times a particular word appears in the document.
    2. **`HashingVectorizer`** - instead of constructing and maintaining the dictionary in memory, implements a hashing function that maps tokens into feature indexes, and then computes the count as in `CountVectorizer`.
    3. **`TfidfVectorizer`** - works like the `CountVectorizer`, but with a more advanced calculation called **Term Frequency Inverse Document Frequency (TF-IDF)**. This is a statistic for measuring the importance of a word in a document or corpus. Intuitively, it looks for words that are more frequent in the current document, compared with their frequency in the whole corpus of documents. You can see this as a way to normalize the results and down-weight words that are too frequent across the corpus to be useful for telling documents apart.

#### Training

> We are going to create 3 classifiers with three different vectorizers

#### `CountVectorizer()`

In [19]:
# Baseline: raw term counts (default CountVectorizer settings) fed into
# a multinomial Naive Bayes classifier.
countvectorizer_pipe = Pipeline(
    steps=[
        ("vect", CountVectorizer()),
        ("clf", MultinomialNB()),
    ]
)

#### `TfidfVectorizer`

In [68]:
# Baseline: TF-IDF weighted term features (default settings) fed into
# a multinomial Naive Bayes classifier.
tfidf_pipe = Pipeline(
    steps=[
        ("vect", TfidfVectorizer()),
        ("clf", MultinomialNB()),
    ]
)

#### `HashingVectorizer`

In [67]:
# Baseline: hashed term features fed into a multinomial Naive Bayes classifier.
# alternate_sign=False keeps the hashed feature values non-negative, which
# MultinomialNB needs (it rejects negative inputs).
hashingvectorizer_pipe = Pipeline(
    steps=[
        ("vect", HashingVectorizer(alternate_sign=False)),
        ("clf", MultinomialNB()),
    ]
)

#### Comparing the vectorising methods

#### `countvectorizer_pipe`

In [69]:
# Fit the vectorizer and the classifier together, using the training split only.
countvectorizer_pipe.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer(stop_words='english')),
                ('clf', MultinomialNB())])

In [70]:
# Spot check on one held-out document: (predicted label, true label).
# predict() expects a sequence of documents, hence the one-element list.
countvectorizer_pipe.predict([X_test[10]]), y_test[10]

(array([4]), 4)

#### scoring

In [71]:
# Accuracy on the data the model was fitted on (an optimistic upper bound).
countvectorizer_pipe.score(X_train, y_train)

0.9533040894297439

> The count vectorizer is `95%` accurate on the train dataset

In [72]:
# Accuracy on the held-out split; this is the number to compare across models.
countvectorizer_pipe.score(X_test, y_test)

0.8771222410865874

> The count vectorizer is `88%` accurate on the test dataset.

#### `tfidf_pipe`

In [73]:
# Fit the TF-IDF vectorizer and the classifier together, using the training split only.
tfidf_pipe.fit(X_train, y_train)

Pipeline(steps=[('vect', TfidfVectorizer()), ('clf', MultinomialNB())])

In [35]:
# Spot check on one held-out document: (predicted label, true label).
tfidf_pipe.predict([X_test[10]]), y_test[10]

(array([4]), 4)

#### Scoring

In [74]:
# Accuracy on the data the model was fitted on.
tfidf_pipe.score(X_train, y_train)

0.9231640016980331

> The tfidf pipeline is `92%` accurate on the train dataset

In [52]:
# Accuracy on the held-out split.
tfidf_pipe.score(X_test, y_test)

0.8752122241086587

> The tfidf pipeline is `88%` accurate on the test dataset

#### `hashingvectorizer_pipe`

In [59]:
# Fit the hashing pipeline's classifier on the training split only.
hashingvectorizer_pipe.fit(X_train, y_train)

Pipeline(steps=[('vect', HashingVectorizer(alternate_sign=False)),
                ('clf', MultinomialNB())])

In [60]:
# Spot check on one held-out document: (predicted label, true label).
hashingvectorizer_pipe.predict([X_test[10]]), y_test[10]

(array([4]), 4)

#### Scoring

In [61]:
# Accuracy on the data the model was fitted on.
hashingvectorizer_pipe.score(X_train, y_train)

0.8243950757039762

> The hashingvectorizer yields `82%` accuracy on the train dataset

In [62]:
# Accuracy on the held-out split.
hashingvectorizer_pipe.score(X_test, y_test)

0.7521222410865874

> The hashingvectorizer yields `75%` accuracy on the test dataset

### Improving our models

In [None]:
> We can remove stop words and also count bigrams (`ngram_range=(1, 2)`).

In [78]:
# Tuned count-based pipeline. Note that this rebinds `countvectorizer_pipe`,
# replacing the baseline pipeline defined earlier in the notebook.
#   stop_words="english" -> drop common English function words
#   ngram_range=(1, 2)   -> use single words and two-word sequences
#   binary=True          -> record presence/absence instead of raw counts
countvectorizer_pipe = Pipeline(
    steps=[
        (
            "vect",
            CountVectorizer(stop_words="english", ngram_range=(1, 2), binary=True),
        ),
        ("clf", MultinomialNB()),
    ]
)

# fit() returns the fitted pipeline, so its repr is shown as the cell output.
countvectorizer_pipe.fit(X_train, y_train)

Pipeline(steps=[('vect',
                 CountVectorizer(binary=True, ngram_range=(1, 2),
                                 stop_words='english')),
                ('clf', MultinomialNB())])

In [79]:
# (train accuracy, test accuracy) for the tuned count-based pipeline.
tuple(
    countvectorizer_pipe.score(docs, labels)
    for docs, labels in ((X_train, y_train), (X_test, y_test))
)

(0.9941983868685439, 0.907258064516129)

> The `countvectorizer` has increased to `99%` accuracy on train datasets and `91%` accuracy on test datasets.

In [81]:
# Tuned TF-IDF pipeline. Note that this rebinds `tfidf_pipe`, replacing the
# baseline pipeline defined earlier in the notebook.
#   max_df=0.5           -> ignore terms found in more than half of the documents
#   stop_words="english" -> drop common English function words
#   ngram_range=(1, 2)   -> use single words and two-word sequences
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.5,
    stop_words="english",
    use_idf=True,
    ngram_range=(1, 2),
)
tfidf_pipe = Pipeline(
    steps=[
        ("vect", tfidf_vectorizer),
        ("clf", MultinomialNB()),
    ]
)
tfidf_pipe.fit(X_train, y_train)

# (train accuracy, test accuracy)
tfidf_pipe.score(X_train, y_train), tfidf_pipe.score(X_test, y_test)

(0.9721239564171501, 0.8788200339558574)

> The `tfidf_pipe` has increased to `97%` on the train dataset, but only marginally on the test dataset (0.875 → 0.879, still about `88%`).

In [83]:
# (predicted label, true label) for one document.
# NOTE(review): this sample comes from X_train, i.e. data the model was fitted
# on, so a match here is a sanity check and not evidence of generalization.
# The earlier spot checks used X_test[10].
tfidf_pipe.predict([X_train[19]]), y_train[19]

(array([14]), 14)