In [1]:
%matplotlib inline

## [Feature Hashing](https://scikit-learn.org/stable/modules/feature_extraction.html#feature-hashing)
- Instead of building a hash table of the features seen in training (as vectorizers do), apply a hash function to each feature to *directly* determine its column index. This gives better speed and lower memory usage, but the input feature names cannot be recovered afterwards (there is no `inverse_transform` method).
- Hashing can create collisions between unrelated features, so this function uses a signed hash so that colliding values tend to cancel out rather than accumulate. Use `alternate_sign=True` (the default) to enable this. It is particularly useful for small hash tables (n_features < 10K).
- Accepts dicts, feature:value pairs, or strings. Single strings have implied value of 1.
- Output is a scipy.sparse matrix in CSR format.
- Implemented using [MurmurHash3](https://github.com/aappleby/smhasher), signed 32-bit version.

In [2]:
from sklearn.feature_extraction import FeatureHasher
# Hash every feature name directly into one of 10 output columns.
h = FeatureHasher(n_features=10)

# Two samples, each a mapping of feature name -> value.
D = [
    {'dog': 1, 'cat': 2, 'elephant': 4},
    {'dog': 2, 'run': 5},
]

# No fit step is performed: transform() is called directly on the samples.
f = h.transform(D)

# Convert the result to a dense array so the cell displays every entry.
f.toarray()

array([[ 0.,  0., -4., -1.,  0.,  0.,  0.,  0.,  0.,  2.],
       [ 0.,  0.,  0., -2., -5.,  0.,  0.,  0.,  0.,  0.]])

## Example: [Feature Hashing vs Dict Vectorization](https://scikit-learn.org/stable/auto_examples/text/plot_hashing_vs_dict_vectorizer.html#sphx-glr-auto-examples-text-plot-hashing-vs-dict-vectorizer-py)

In [8]:
from collections import defaultdict
import re
import sys
from time import time

import numpy as np

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction import DictVectorizer, FeatureHasher


def n_nonzero_columns(X):
    """Count the columns of a CSR matrix X that hold at least one non-zero entry."""
    # nonzero() yields (row indices, column indices); only the columns matter.
    occupied_cols = X.nonzero()[1]
    return np.unique(occupied_cols).size


def tokens(doc):
    """Return a generator over the lower-cased word tokens found in doc."""
    # Every maximal run of word characters counts as one token.
    words = re.findall(r"\w+", doc)
    return (word.lower() for word in words)


def token_freqs(doc):
    """Count how often each token occurs in doc.

    Returns a defaultdict(int) mapping token -> number of occurrences.
    """
    counts = defaultdict(int)
    for word in tokens(doc):
        counts[word] = counts[word] + 1
    return counts


# Subset of the 20 newsgroups categories used for the comparison.
categories = [
    'alt.atheism', 'comp.graphics', 'comp.sys.ibm.pc.hardware',
    'misc.forsale', 'rec.autos', 'sci.space', 'talk.religion.misc',
]

# Width of the hashed feature space. The script this cell was adapted from
# parsed the value from sys.argv[1]; that parsing is disabled here, so the
# default is always used.
n_features = 2 ** 18

print("Usage: %s [n_features_for_hashing]" % sys.argv[0])
print("    The default number of features is 2**18.")


print("Loading 20 newsgroups training data")
# return_X_y=True makes the loader return a (data, target) pair; only the
# documents are used in this comparison, so the targets are discarded.
raw_data, _ = fetch_20newsgroups(subset='train', categories=categories,
                                 return_X_y=True)
# Corpus size in megabytes (10**6 bytes) of UTF-8 encoded text; the timing
# sections below divide it by the elapsed time to report throughput in MB/s.
data_size_mb = sum(len(s.encode('utf-8')) for s in raw_data) / 1e6
print("%d documents - %0.3fMB" % (len(raw_data), data_size_mb))

print("DictVectorizer")
t0 = time()
vectorizer = DictVectorizer()
# Only the fitted vocabulary is reported below, so the transformed matrix
# returned by fit_transform() is deliberately not kept.
vectorizer.fit_transform(token_freqs(d) for d in raw_data)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_size_mb / duration))
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# The fitted feature_names_ attribute holds the same list of names and is
# available on both old and new releases.
print("Found %d unique terms" % len(vectorizer.feature_names_))

print("FeatureHasher on frequency dicts")
t0 = time()
hasher = FeatureHasher(n_features=n_features)
# No fit step: transform() is applied directly to the per-document
# {token: count} dicts.
X = hasher.transform(token_freqs(d) for d in raw_data)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_size_mb / duration))
# This counts occupied columns, not distinct tokens: tokens that hash to the
# same column are counted once, so the figure can be lower than the
# DictVectorizer count.
print("Found %d unique terms" % n_nonzero_columns(X))

print("FeatureHasher on raw tokens")
t0 = time()
# input_type="string" feeds the hasher bare token strings instead of
# {token: count} dicts; each string carries an implied value of 1.
hasher = FeatureHasher(n_features=n_features, input_type="string")
X = hasher.transform(tokens(d) for d in raw_data)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_size_mb / duration))
# As with the dict input, this is the number of occupied columns, so hash
# collisions can make it smaller than the true number of distinct tokens.
print("Found %d unique terms" % n_nonzero_columns(X))

Usage: /home/bjpcjp/.local/lib/python3.6/site-packages/ipykernel_launcher.py [n_features_for_hashing]
    The default number of features is 2**18.
Loading 20 newsgroups training data
3803 documents - 6.245MB
DictVectorizer
done in 1.023960s at 6.099MB/s
Found 47928 unique terms
FeatureHasher on frequency dicts
done in 0.650507s at 9.600MB/s
Found 43873 unique terms
FeatureHasher on raw tokens
done in 0.592099s at 10.547MB/s
Found 43873 unique terms
