# Text Classification. Hash Vectorizer and SVM

We get ~87% classification accuracy on the "ag news" dataset, with a Hashing Vectorizer and an SVM based on gradient descent. No special optimizations were made. The state of the art for this dataset is ~92.5%. See for example
1. R. Johnson, T. Zhang, *Effective Use of Word Order for Text Categorization
with Convolutional Neural Networks*
2. X. Zhan, Y. LeCun, *Text Understanding from Scratch*

In [0]:
!pip uninstall --yes tensorflow
!pip install tensorflow==2.0.0
import tensorflow as tf
print(tf.__version__)

In [0]:
import itertools
import logging
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
import re
from nltk.util import ngrams 
from sklearn.feature_extraction.text import HashingVectorizer, CountVectorizer
from sklearn.linear_model import SGDClassifier

logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
logging.root.level = logging.INFO


In [0]:
def doc_cleaner(doc):
    # doc = doc.decode('utf-8')
    doc = doc.lower()        
    doc = re.sub(r':?\\+', ' ', doc)  # remove double backslash
    # doc = re.sub(r'[0-9]+', ' ', doc)  # remove numbers
    # doc = re.sub(r'#[\w-]+', ' ', doc)  # remove hashtag
    # doc = re.sub(r'^[^-]* - ', ' ', doc)  # remove everything before the first hyphen
    doc = re.sub(r'[^A-Z a-z]+', '', doc)
    doc = re.sub(' +', ' ', doc)  # multiple spaces
    return doc

In [0]:
class Embedding(tf.keras.Model):
    def __init__(self, kernel_size, batch_size, embedding_length):
        super().__init__()
        self.model = tf.keras.Sequential()
        self.model.add(tf.keras.layers.Conv1D(filters=3, kernel_size=kernel_size, input_shape=(embedding_length, 1)))
        self.model.add(tf.keras.layers.GlobalMaxPooling1D())
        # self.model.add(tf.keras.layers.Flatten())
        self.model.add(tf.keras.layers.Dense(units=4, activation='softmax'))        
        
        # optimizer and loss funciton
        self.optimizer = tf.keras.optimizers.Adam()
        self.loss_fun = tf.keras.losses.CategoricalCrossentropy(from_logits=False)

        self.model.compile(optimizer=self.optimizer, loss=self.loss_fun, metrics=['categorical_accuracy'])  

    def fit_data(self, docvecs, target, batch_size, epochs):
        """  supports both raw data and generators """
        history = self.model.fit(docvecs,
                                target,
                                batch_size=batch_size,
                                epochs=epochs)
        print("training history: {}".format(history))

In [0]:
class BatchedCorpus():
    def __init__(self, file_path:str):
        self._file_path = file_path
        
    def get_batch(self, start=0, batch_size=100, preprocess=None):
        """
            Args:
                start (int) = row index where the batch starts
                batch_size (int) = number of samples in the batch
                preprocess (function) = function to preprocess text, i.e., remove special symbols etc.
            Returns:
                tuple (document class (int), document title (str), text (str))
        """
        batch = []
        labels = []
        titles = []
        texts = []
        file = open(self._file_path, 'r') 
        for i, row in enumerate(itertools.islice(file, start, start + batch_size)):
            line = file.readline().split('","')
            labels.append(int(re.sub(r'[^0-9]+', '', line[0])))
            if preprocess:
                texts.append(preprocess(line[2]))
            else:
                texts.append(line[2])
        return labels, texts
    
    def iter_batch(self, batch_size=100, preprocess=None):
      """ generator based on get_batch """
        start = 0
        i = 0
        labels, texts = self.get_batch(start, batch_size, preprocess)
        while len(texts):            
            yield labels, texts
            labels, texts = self.get_batch(i * batch_size, batch_size, preprocess)
            i = i + 1

# Main

In [8]:
BATCH_SIZE = 1000
vectorizer = HashingVectorizer(decode_error='ignore', n_features=2 ** 18,
                                    alternate_sign=False)   
embedding = Embedding(1000, BATCH_SIZE, 2 ** 18)

# prepare the test set, it's a small dataset so we just import it
test = pd.read_csv(r'/content/drive/My Drive/test.csv'.replace("\\", "/"), names=["label", "title", "text"])
test.text.apply(doc_cleaner)
X_test = vectorizer.transform(test.text)

train = BatchedCorpus(r'/content/drive/My Drive/train.csv'.replace("\\", "/"))

clf = SGDClassifier()
# only one epoch
for i, (labels, texts) in enumerate(train.iter_batch(preprocess=doc_cleaner, batch_size=BATCH_SIZE)):
    # train the SVM
    X = vectorizer.transform(texts)
    clf.partial_fit(X, labels, np.unique(labels))
    if i % 10 == 0:
      print("batch = {}, SVM test accuracy = {}".format(i, clf.score(X_test, test.label.values)))

batch = 0, SVM test accuracy = 0.6267105263157895
batch = 10, SVM test accuracy = 0.8119736842105263
batch = 20, SVM test accuracy = 0.8306578947368422
batch = 30, SVM test accuracy = 0.848421052631579
batch = 40, SVM test accuracy = 0.8581578947368421
batch = 50, SVM test accuracy = 0.8589473684210527
batch = 60, SVM test accuracy = 0.861578947368421
batch = 70, SVM test accuracy = 0.8653947368421052
batch = 80, SVM test accuracy = 0.8621052631578947
batch = 90, SVM test accuracy = 0.8693421052631579
batch = 100, SVM test accuracy = 0.8736842105263158
batch = 110, SVM test accuracy = 0.8759210526315789
batch = 120, SVM test accuracy = 0.8736842105263158
