In [1]:
%matplotlib inline

In [2]:
from collections import defaultdict
import re
import sys
from time import time
import numpy as np

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction import DictVectorizer , FeatureHasher

In [3]:
def n_nonzero_columns(X):
    """
    X.nonzero() : 由于X是二维的,所以返回一个元组，元组有两个元素，每个元素是个列表，每个元素的意义代表索引
    np.unique(X) :返回去除重复元素后的数组
    """
    return len(np.unique(X.nonzero()[1]))

def tokens(doc):
    return (tok.lower() for tok in re.findall(r"\w",doc))

def token_freqs(doc):
    freq = defaultdict(int) #每个键的值默认是个int类型的数
    for tok in tokens(doc):
        freq[tok] += 1
    return freq

In [4]:
categories_list = [
    'alt.atheism',
    'comp.graphics',
    'comp.sys.ibm.pc.hardware',
    'misc.forsale',
    'rec.autos',
    'sci.space',
    'talk.religion.misc',
]

In [5]:
print(__doc__)
print("Usage: %s [n_features_for_hashing]" % sys.argv[0])
print("    The default number of features is 2**18.")

Automatically created module for IPython interactive environment
Usage: /home/penggh/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py [n_features_for_hashing]
    The default number of features is 2**18.


In [None]:
# try:
#     n_features = int(sys.argv[1])
# except IndexError:
#     n_features = 2 ** 18
# except ValueError:
#     print("not a valid number of features: %r" % sys.argv[1])
#     sys.exit(1)

In [6]:
print("Loading 20 newsgroups training data")
raw_data = fetch_20newsgroups(subset='train',categories=categories_list).data
data_size_mb = sum(len(s.encode('utf-8')) for s in raw_data) / 1e6
print("%d documents - %0.3fMB" % (len(raw_data), data_size_mb))

Loading 20 newsgroups training data
3803 documents - 6.245MB


In [8]:
print("DictVectorizer")
t0 = time()
vectorizer = DictVectorizer()
X = vectorizer.fit_transform(token_freqs(d) for d in raw_data)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_size_mb / duration))
print("Found %d unique terms" % len(vectorizer.get_feature_names()))

DictVectorizer
done in 1.563622s at 3.994MB/s
Found 46 unique terms


In [10]:
vectorizer.get_feature_names()

['0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '_',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 'ª',
 '¹',
 'º',
 'é',
 'ñ',
 'ú',
 'ý',
 'þ',
 'ÿ']

In [11]:
print("FeatureHasher on frequency dicts")
t0 = time()
n_features = 64
hasher = FeatureHasher(n_features=n_features)
X = hasher.transform(token_freqs(d) for d in raw_data)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_size_mb / duration))
print("Found %d unique terms" % n_nonzero_columns(X))
print()

FeatureHasher on frequency dicts
done in 1.494447s at 4.179MB/s
Found 36 unique terms



In [12]:
print("FeatureHasher on raw tokens")
t0 = time()
hasher = FeatureHasher(n_features=n_features, input_type="string")
X = hasher.transform(tokens(d) for d in raw_data)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_size_mb / duration))
print("Found %d unique terms" % n_nonzero_columns(X))

FeatureHasher on raw tokens
done in 2.168380s at 2.880MB/s
Found 36 unique terms
