# Text Classification
## This notebook outlines the usage of NLP Feature extraction (CountVectorizer, TfidfVectorizer) in classification of text documents

### Import all the necessary libraries

In [1]:
from pprint import pprint
from time import time
import logging
import nltk
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/bng-
[nltk_data]     anuphap_c/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Choose a few categories fro the entire 20 categories

In [2]:
# Load some categories from the training set
categories = [
    'alt.atheism',
    'talk.religion.misc',
]

In [3]:
print("Loading 20 newsgroups dataset for categories:")
print(categories)

Loading 20 newsgroups dataset for categories:
['alt.atheism', 'talk.religion.misc']


### Fetch documents for these 2 categories


In [4]:
data = fetch_20newsgroups(subset='train', categories=categories)
print(f"{len(data.filenames)} documents")
print(f"{len(data.target_names)} categories")
print()

857 documents
2 categories



 ### Make Word2Vec and Doc2Vec work (Credit: Stackoverflow)

In [5]:
import numpy as np
class Word2VecTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, model):
        self.model = model

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return [np.maximum(self.model.wv[word], 0) if word in self.model.wv else np.zeros(self.model.vector_size) for word in X]

class Doc2VecTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, model):
        self.model = model

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return [np.maximum(self.model.infer_vector(word_tokenize(doc)), 0) if len(word_tokenize(doc)) > 0 else np.zeros(self.model.vector_size) for doc in X]

In [6]:
word2vec_model = Word2Vec(sentences=[word_tokenize(text) for text in data.data])

tagged_data = [TaggedDocument(words=word_tokenize(doc), tags=[str(i)]) for i, doc in enumerate(data.data)]
doc2vec_model = Doc2Vec()
doc2vec_model.build_vocab(tagged_data)
doc2vec_model.train(tagged_data, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

### Algo and Extractor

In [7]:
algorithms = [
    ('Multinomial Naïve Bayes', MultinomialNB()),
    ('Logistic Regression', LogisticRegression()),
    ('Support Vector Machines', SVC()),
    ('Decision Trees', DecisionTreeClassifier()),
]

extractors = [
    ('Bag-of-Words', CountVectorizer()),
    ('N-grams', CountVectorizer(ngram_range=(1, 2))),
    ('word2vec', Word2VecTransformer(model=word2vec_model)),
    ('doc2vec', Doc2VecTransformer(model=doc2vec_model))
]

### Combine pipeline, Set parameters = all (avoid error), do gridsearch, and benchmarking (performance, and accuracy).

In [8]:
results = []

for algo_name, algo in algorithms:
    for extractor_name, extractor in extractors:
        pipeline = Pipeline([
            ('vect', extractor),
            ('clf', algo),
        ])

        parameters = {}

        grid_search = GridSearchCV(pipeline, parameters, cv=5,
                                   n_jobs=-1, verbose=1)

        start_time = time()
        grid_search.fit(data.data, data.target)
        end_time = time()

        execution_time = end_time - start_time
        best_score = grid_search.best_score_
        best_parameters = grid_search.best_estimator_.get_params()

        results.append({
            'Algorithm': algo_name,
            'Feature Extractor': extractor_name,
            'Best Score': best_score,
            'Execution Time': execution_time,
            'Best Parameters': best_parameters
        })

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Fitting 5 folds for each of 1 candidates, totalling 5 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits


### Save in tabulate

In [9]:
from tabulate import tabulate

best_configuration = max(results, key=lambda x: x['Best Score'])

print("Best:")
pprint(best_configuration)

with open('results.txt', 'w') as file:

    file.write("Best:\n")
    file.write(tabulate([best_configuration], headers='keys', tablefmt='grid'))

    file.write("\n\nAll Possibles:\n")
    file.write(tabulate(results, headers='keys', tablefmt='grid'))

Best:
{'Algorithm': 'Logistic Regression',
 'Best Parameters': {'clf': LogisticRegression(),
                     'clf__C': 1.0,
                     'clf__class_weight': None,
                     'clf__dual': False,
                     'clf__fit_intercept': True,
                     'clf__intercept_scaling': 1,
                     'clf__l1_ratio': None,
                     'clf__max_iter': 100,
                     'clf__multi_class': 'auto',
                     'clf__n_jobs': None,
                     'clf__penalty': 'l2',
                     'clf__random_state': None,
                     'clf__solver': 'lbfgs',
                     'clf__tol': 0.0001,
                     'clf__verbose': 0,
                     'clf__warm_start': False,
                     'memory': None,
                     'steps': [('vect', CountVectorizer(ngram_range=(1, 2))),
                               ('clf', LogisticRegression())],
                     'vect': CountVectorizer(ngram_range=(1, 2)