In [1]:
import app.pipeliner.pipedev as piper
import app.pipeliner.registry as rg
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
import simplejson as json

In [2]:
tp = piper.TextPipliner("pickle0003")

In [3]:
tp.set_pipeline(Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
]))
tp.set_categories([
    'alt.atheism',
    'talk.religion.misc',
])

tp.set_search_params({
    "verbose": 1,
    "n_jobs": -1,
    "param_grid": {
    #'vect__max_df': (0.5, 0.75, 1.0),
    'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    #'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': (0.00001, 0.000001),
    #'clf__penalty': ('l2', 'elasticnet'),
    #'clf__n_iter': (10, 50, 80),
}})
tp.run(persist_dir="./app/pipeliner/store/")

Fitting 3 folds for each of 32 candidates, totalling 96 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   10.1s
[Parallel(n_jobs=-1)]: Done  96 out of  96 | elapsed:   21.9s finished
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(GridSearchCV(cv=None, error_score='raise',
        estimator=Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=1.0, max_features=50000, min_df=1,
         ngram_range=(1, 1), preprocessor=None, stop_words=None,
         stri...   penalty='l2', power_t=0.5, random_state=None, shuffle=True,
        verbose=0, warm_start=False))]),
        fit_params={}, iid=True, n_jobs=-1,
        param_grid={'vect__max_features': (None, 5000, 10000, 50000), 'vect__ngram_range': ((1, 1), (1, 2)), 'tfidf__norm': ('l1', 'l2'), 'clf__alpha': (1e-05, 1e-06)},
        pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
        scoring=None, verbose=1), './app/pipeliner/store/pickle0003')

In [4]:
# Write a new PickleRegister at "./app/pipeliner/register.json"
# The pickles are being persisted to "./app/pipeliner/store/"
reg = rg.PickleRegister("./app/pipeliner/register.json", 
                  "./app/pipeliner/store/", new_register=True)

In [5]:
# The TextPipeliner as_registry() method takes a filepath to persist the metadata
# and a description of the pickle. It returns a valid argument for the PickleRegister
# new_entry() method.
item = tp.as_registry("./app/pipeliner/store/pickle0003.json", "A"
                      " pipeline tuned on a broad parameter space, "
                      "trained on two categories in the newsgroup20 corpus.")

In [19]:
# new_entry() also expects a pickle_type ("pipeline") and can accept an id kwarg
reg.new_entry(item, "pipeline")

In [20]:
reg._register

{'register': {'pickle0003': {'id': 'pickle0003',
   'payload': {'answer_key': {0: 'alt.atheism', 1: 'talk.religion.misc'},
    'components': ['CountVectorizer', 'TfidfTransformer', 'SGDClassifier'],
    'description': 'A pipeline tuned on a broad parameter space, trained on two categories in the newsgroup20 corpus.',
    'name': 'pickle0003',
    'score': 0.9369894982497082,
    'search_params': {'n_jobs': -1,
     'param_grid': {'clf__alpha': (1e-05, 1e-06),
      'tfidf__norm': ('l1', 'l2'),
      'vect__max_features': (None, 5000, 10000, 50000),
      'vect__ngram_range': ((1, 1), (1, 2))},
     'verbose': 1}},
   'pickletype': 'pipeline'}}}

In [16]:
reg._ids

{'all': []}

In [8]:
pickle_map = reg.load_pickles()

In [10]:
pickle_map

{'pickle0003': Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=1.0, max_features=50000, min_df=1,
         ngram_range=(1, 1), preprocessor=None, stop_words=None,
         stri...   penalty='l2', power_t=0.5, random_state=None, shuffle=True,
        verbose=0, warm_start=False))])}

In [18]:
reg._pickletypes

set()

In [11]:
with open("./app/pipeliner/register.json", "r") as registry:
    reg = json.load(registry)
reg

{'register': {'pickle0003': {'id': 'pickle0003',
   'payload': {'answer_key': {'0': 'alt.atheism', '1': 'talk.religion.misc'},
    'components': ['CountVectorizer', 'TfidfTransformer', 'SGDClassifier'],
    'description': 'A pipeline tuned on a broad parameter space, trained on two categories in the newsgroup20 corpus.',
    'name': 'pickle0003',
    'score': 0.9369894982497082,
    'search_params': {'n_jobs': -1,
     'param_grid': {'clf__alpha': [1e-05, 1e-06],
      'tfidf__norm': ['l1', 'l2'],
      'vect__max_features': [None, 5000, 10000, 50000],
      'vect__ngram_range': [[1, 1], [1, 2]]},
     'verbose': 1}},
   'pickletype': 'pipeline'}}}

In [12]:
regObject = reg["register"]["pickle0003"]
regObject["payload"]["description"] = 'A pipeline tuned on a broad parameter space, trained on all the categories in the newsgroup20 corpus.'

In [13]:
reg = rg.PickleRegister("./app/pipeliner/register.json", 
                  "./app/pipeliner/store/")
reg.update_entry(regObject)
pickle_map = reg.load_pickles()

In [14]:
reg._register

{'register': {'pickle0003': {'id': 'pickle0003',
   'payload': {'answer_key': {'0': 'alt.atheism', '1': 'talk.religion.misc'},
    'components': ['CountVectorizer', 'TfidfTransformer', 'SGDClassifier'],
    'description': 'A pipeline tuned on a broad parameter space, trained on all the categories in the newsgroup20 corpus.',
    'name': 'pickle0003',
    'score': 0.9369894982497082,
    'search_params': {'n_jobs': -1,
     'param_grid': {'clf__alpha': [1e-05, 1e-06],
      'tfidf__norm': ['l1', 'l2'],
      'vect__max_features': [None, 5000, 10000, 50000],
      'vect__ngram_range': [[1, 1], [1, 2]]},
     'verbose': 1}},
   'pickletype': 'pipeline'}}}

In [11]:
tp2 = piper.TextPipliner("pickle0002")
tp2.set_pipeline(Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
]))
tp2.set_categories([
    'alt.atheism',
    'talk.religion.misc',
])
tp2.set_search_params({
    "verbose": 1,
    "n_jobs": -1,
    "param_grid": {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    'clf__n_iter': (10, 50, 80),
}})
gsCV = tp2.run()

Fitting 3 folds for each of 1152 candidates, totalling 3456 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    9.9s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   43.4s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  6.8min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed:  9.2min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed: 12.0min
[Parallel(n_jobs=-1)]: Done 3456 out of 3456 | elapsed: 13.1min finished
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [12]:
item = tp2.as_registry("./app/pipeliner/store/pickle0002.json", "A"
                      " pipeline tuned on a broad parameter space, "
                      "trained on 2 categories ('alt.atheism', 'talk.religion.misc')"
                      "in the newsgroup20 corpus.")

In [14]:
reg = rg.PickleRegister("./app/pipeliner/register.json", 
                  "./app/pipeliner/store/")
reg.new_entry(item, "pipeline")

In [5]:
pickle_map = reg.load_pickles()
pickle_map

{'pickle0002': Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=0.75, max_features=None, min_df=1,
         ngram_range=(1, 2), preprocessor=None, stop_words=None,
         stri...   penalty='l2', power_t=0.5, random_state=None, shuffle=True,
        verbose=0, warm_start=False))]),
 'pickle0003': Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=1.0, max_features=50000, min_df=1,
         ngram_range=(1, 1), preprocessor=None, stop_words=None,
         stri...   penalty='l2', power_t=0.5, random_state=None, shuffle=True,
        verbose=0, warm_start=False))])}

In [21]:
import requests

In [26]:
resp = requests.get("http://localhost/api/pickleregister")
resp.content

b'"{\\"register\\": {\\"pickle0003\\": {\\"id\\": \\"pickle0003\\", \\"pickletype\\": \\"pipeline\\", \\"payload\\": {\\"name\\": \\"pickle0003\\", \\"answer_key\\": {\\"0\\": \\"alt.atheism\\", \\"1\\": \\"talk.religion.misc\\"}, \\"components\\": [\\"CountVectorizer\\", \\"TfidfTransformer\\", \\"SGDClassifier\\"], \\"score\\": 0.9369894982497082, \\"search_params\\": {\\"verbose\\": 1, \\"n_jobs\\": -1, \\"param_grid\\": {\\"vect__max_features\\": [null, 5000, 10000, 50000], \\"vect__ngram_range\\": [[1, 1], [1, 2]], \\"tfidf__norm\\": [\\"l1\\", \\"l2\\"], \\"clf__alpha\\": [1e-05, 1e-06]}}, \\"description\\": \\"A pipeline tuned on a broad parameter space, trained on two categories in the newsgroup20 corpus.\\"}}}}"'


In [25]:
pst = requests.post("http://localhost/api/predict", json= {"id": "pickle0003",
                                                               "text": article_tester})
pst.content

b'{"prediction": "talk.religion.misc", "register": {"id": "pickle0003", "pickletype": "pipeline", "payload": {"name": "pickle0003", "answer_key": {"0": "alt.atheism", "1": "talk.religion.misc"}, "components": ["CountVectorizer", "TfidfTransformer", "SGDClassifier"], "score": 0.9369894982497082, "search_params": {"verbose": 1, "n_jobs": -1, "param_grid": {"vect__max_features": [null, 5000, 10000, 50000], "vect__ngram_range": [[1, 1], [1, 2]], "tfidf__norm": ["l1", "l2"], "clf__alpha": [1e-05, 1e-06]}}, "description": "A pipeline tuned on a broad parameter space, trained on two categories in the newsgroup20 corpus."}}}'

In [15]:
reg.delete_entry("pickle0003")
reg._register


{'register': {}}

In [7]:
import simplejson as json
with open("./app/pipeliner/register.json", 'rb') as f:
    loaded = json.load(f)
loaded

{'id': 'pickle0001',
 'payload': {'answer_key': {'0': 'alt.atheism', '1': 'talk.religion.misc'},
  'components': ['CountVectorizer', 'TfidfTransformer', 'SGDClassifier'],
  'description': 'A pipeline tuned on a broad parameter space, trained on alt.atheism and talk.religion.misc.',
  'name': 'pickle0001',
  'score': 0.9381563593932322,
  'search_params': {'n_jobs': -1,
   'param_grid': {'clf__alpha': [1e-05, 1e-06],
    'clf__penalty': ['l2', 'elasticnet'],
    'tfidf__norm': ['l1', 'l2'],
    'vect__max_features': [None, 5000, 10000, 50000],
    'vect__ngram_range': [[1, 1], [1, 2]]},
   'verbose': 1}},
 'pickletype': 'pipeline'}

In [10]:
est = pickle_map["pickle0001"]
est.predict([article_tester])

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


array([0], dtype=int64)

In [14]:
reg._register

[{'id': 'pickle0001',
  'payload': {'answer_key': {0: 'alt.atheism', 1: 'talk.religion.misc'},
   'components': ['CountVectorizer', 'TfidfTransformer', 'SGDClassifier'],
   'description': 'A pipeline tuned on a broad parameter space, trained on alt.atheism and talk.religion.misc.',
   'name': 'pickle0001',
   'score': 0.9381563593932322,
   'search_params': {'n_jobs': -1,
    'param_grid': {'clf__alpha': (1e-05, 1e-06),
     'clf__penalty': ('l2', 'elasticnet'),
     'tfidf__norm': ('l1', 'l2'),
     'vect__max_features': (None, 5000, 10000, 50000),
     'vect__ngram_range': ((1, 1), (1, 2))},
    'verbose': 1}},
  'pickletype': 'pipeline'}]

In [30]:
with open("./app/pipeliner/store/pickle0001.json", 'w') as file:

    json.dump([
        {"id": 1, "pickletype": "pipeline", "payload": [{'answer_key': {'0': 'alt.atheism', '1': 'talk.religion.misc'},
     'components': ['CountVectorizer', 'TfidfTransformer', 'SGDClassifier'],
     'description': ['A dummy'],
     'name': ['pickle0001'],
     'score': [0.9463243873978997],
     'search_params': [{'n_jobs': -1,
       'param_grid': {'clf__n_iter': [10, 50, 80],
        'tfidf__norm': ['l1', 'l2'],
        'vect__max_df': [0.5, 0.75, 1.0],
        'vect__max_features': [None, 5000, 10000, 50000]},
       'verbose': 1}]}]},
    {"id": 2, "pickletype": "pipeline", "payload": {'answer_key': {'0': 'alt.atheism', '1': 'talk.religion.misc'},
     'components': ['CountVectorizer', 'TfidfTransformer', 'SGDClassifier'],
     'description': ['A dummy'],
     'name': ['pickle0001'],
     'score': [0.9463243873978997],
     'search_params': [{'n_jobs': -1,
       'param_grid': {'clf__n_iter': [10, 50, 80],
        'tfidf__norm': ['l1', 'l2'],
        'vect__max_df': [0.5, 0.75, 1.0],
        'vect__max_features': [None, 5000, 10000, 50000]},
       'verbose': 1}]}}], file)

with open("./app/pipeliner/store/pickle0001.json", 'rb') as f:
    loaded = json.load(f)
loaded

[{'id': 1,
  'payload': [{'answer_key': {'0': 'alt.atheism', '1': 'talk.religion.misc'},
    'components': ['CountVectorizer', 'TfidfTransformer', 'SGDClassifier'],
    'description': ['A dummy'],
    'name': ['pickle0001'],
    'score': [0.9463243873978997],
    'search_params': [{'n_jobs': -1,
      'param_grid': {'clf__n_iter': [10, 50, 80],
       'tfidf__norm': ['l1', 'l2'],
       'vect__max_df': [0.5, 0.75, 1.0],
       'vect__max_features': [None, 5000, 10000, 50000]},
      'verbose': 1}]}],
  'pickletype': 'pipeline'},
 {'id': 2,
  'payload': {'answer_key': {'0': 'alt.atheism', '1': 'talk.religion.misc'},
   'components': ['CountVectorizer', 'TfidfTransformer', 'SGDClassifier'],
   'description': ['A dummy'],
   'name': ['pickle0001'],
   'score': [0.9463243873978997],
   'search_params': [{'n_jobs': -1,
     'param_grid': {'clf__n_iter': [10, 50, 80],
      'tfidf__norm': ['l1', 'l2'],
      'vect__max_df': [0.5, 0.75, 1.0],
      'vect__max_features': [None, 5000, 10000, 50

In [31]:
set([0, 1, 0, 1, 0])

{0, 1}

In [32]:
!python ./app/pipeliner/registry.py

In [3]:



###############################################################################
# Load some categories from the training set
categories = [
    'alt.atheism',
    'talk.religion.misc',
]
# Uncomment the following to do the analysis on all the categories
categories = None

data = fetch_20newsgroups(subset='train', categories=categories)
print("%d documents" % len(data.filenames))
print("%d categories" % len(data.target_names))
print()

###############################################################################
# define a pipeline combining a text feature extractor with a simple
# classifier
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    'clf__n_iter': (10, 50, 80),
}

if __name__ == "__main__":
    # multiprocessing requires the fork to happen in a __main__ protected
    # block

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    grid_search.fit(data.data, data.target)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

11314 documents
20 categories

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'clf__alpha': (1e-05, 1e-06),
 'clf__n_iter': (10, 50, 80),
 'clf__penalty': ('l2', 'elasticnet'),
 'tfidf__norm': ('l1', 'l2'),
 'tfidf__use_idf': (True, False),
 'vect__max_df': (0.5, 0.75, 1.0),
 'vect__max_features': (None, 5000, 10000, 50000),
 'vect__ngram_range': ((1, 1), (1, 2))}
Fitting 3 folds for each of 1152 candidates, totalling 3456 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 10.2min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 26.2min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 51.9min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed: 109.3min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed: 193.7min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed: 234.3min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed: 320.1min
[Parallel(n_jobs=-1)]: Done 3456 out of 3456 | elapsed: 377.5min finished
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


done in 22760.330s

Best score: 0.925
Best parameters set:
	clf__alpha: 1e-05
	clf__n_iter: 50
	clf__penalty: 'elasticnet'
	tfidf__norm: 'l2'
	tfidf__use_idf: True
	vect__max_df: 0.5
	vect__max_features: None
	vect__ngram_range: (1, 2)


In [4]:
pipeline.set_params(**grid_search.best_estimator_.get_params())

Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip...ty='elasticnet', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False))])

In [5]:
pipeline.fit(data.data, data.target)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip...ty='elasticnet', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False))])

In [24]:
article_tester = """One of the surprises from AMD’s first year of the newest x86 Zen architecture was the launch of the Threadripper platform. Despite the mainstream Ryzen processors already taking a devastating stab into the high-end desktop market, AMD’s Threadripper offered more cores at a workstation-friendly price. For 2018, the next generation is going to be using AMD’s updated 12nm Zeppelin dies, as well as including a few new tweaks into the system including better boost and faster caches.

This article is still a work in progress, and will be updated as more news comes in.



AMD’s Zeppelin silicon has 8 cores, and the first generation Threadripper uses two of them to get to the top-SKU of 16-cores. Inside the CPU however, there are four pieces of silicon: two active and two inactive. For this second generation of Threadripper, called Threadripper 2 or the Threadripper 2000-series, AMD is going to make these inactive dies into active ones, and substantially increase the core count for the high-end desktop and workstation user.



At the AMD press event at Computex, it was revealed that these new processors would have up to 32 cores in total, mirroring the 32-core versions of EPYC. On EPYC, those processors have four active dies, with eight active cores on each die (four for each CCX). On EPYC however, there are eight memory channels, and AMD’s X399 platform only has support for four channels. For the first generation this meant that each of the two active die would have two memory channels attached – in the second generation Threadripper this is still the case: the two now ‘active’ parts of the chip do not have direct memory access.



This technically adds latency to the platform, however AMD is of the impression that for all but the most memory bound tasks, this should not be an issue (usually it is suggested to just go buy an EPYC for those workloads). While it does put more pressure on the internal Infinity Fabric, AMD ultimately designed Infinity Fabric for scalable scenarios like this between different silicon with different levels of cache and memory access.



Update: AMD has just published a full copy of their slide deck for the Threadripper 2 presentation. In it are a few interesting factoids.

AMD Threadripper CPUs
 	Threadripper
2
32-Core Sample	Threadripper
2
24-Core Sample	Threadripper
1950X	Threadripper
1920X
Socket	TR4 (LGA)
4094-pin
CPU Architecture	Zen+	Zen+	Zen	Zen
Cores/Threads	32 / 64	24 / 48	16 / 32	12 / 24
Base Frequency	3.0 GHz	3.0 GHz	3.4 GHz	3.5 GHz
Turbo Frequency	3.4 GHz (WIP)	3.4 GHz (WIP)	4.0 GHz	4.0 GHz
L3 Cache	64 MB ?	48 MB ?	32 MB	32 MB
TDP	250W	250W	180W	180W
PCIe 3.0 Lanes	60 + 4
Chipset Support	X399
Memory Channels	4
Both the 24-core and 32-core sample CPUs are clocked at 3.0GHz base and 3.4GHz all-core turbo, with the latter being a work-in-progress according to the company.
The 32-core system was equipped with DDR4-3200 memory. This is notable because the Ryzen processors based on the same 12nm Zeppelin dies officially max out at DDR4-2933.
The codename for the processor family is listed as "Colfax". This is the first we've heard this codename from AMD.
Despite the high TDP, both CPUs used in AMD's demos were air-cooled, using AMD's Wraith Ripper Air Cooler
Also announced at the presentation is the state of play of motherboards. According to the motherboard vendors These new Threadripper 2000-series processors will have a peak TDP rating of 250W, which is much higher than 180W we saw on the 1950X. We have been told by partners that the 250W rating is actually conservative, and users should expect lower power consumption in most scenarios. Nonetheless, it was stated by several motherboard vendors that some of the current X399 motherboards on the market might struggle with power delivery to the new parts, and so we are likely to see a motherboard refresh. That is not saying that the current X399 offerings will not work, however they might not offer overclocking to the level that users might expect. At Computex there are new X399 refresh motherboards being demonstrated by a few companies, and we will report on them in due course. Other specifications are expected to match the previous generation, such as PCIe lane counts, despite the newly active dies.


MSI's 19-phase X399 Refresh Motherboard

The launch for these new processors, according to our moles is in early August. This aligns with what AMD stated at the beginning of the year at CES, and is almost a year from the original Threadripper launch.

Pricing on the processors is set to be revealed either today or closer to the launch time. We will update this piece as more information comes in.

It will be interesting if AMD is going to go through the ‘unboxing’ embargo this time around, or just jump straight to full performance reviews. As always, come to AnandTech for the full story.


GIGABYTE's new X399 Refresh Motherboard"""

In [97]:
prediction = pipeline.predict([article_tester])

transform = pipeline.transform([article_tester])

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if np.issubdtype(mask.dtype, np.int):


In [37]:
{i: y for i, y in enumerate(data.target_names)}

{0: 'alt.atheism',
 1: 'comp.graphics',
 2: 'comp.os.ms-windows.misc',
 3: 'comp.sys.ibm.pc.hardware',
 4: 'comp.sys.mac.hardware',
 5: 'comp.windows.x',
 6: 'misc.forsale',
 7: 'rec.autos',
 8: 'rec.motorcycles',
 9: 'rec.sport.baseball',
 10: 'rec.sport.hockey',
 11: 'sci.crypt',
 12: 'sci.electronics',
 13: 'sci.med',
 14: 'sci.space',
 15: 'soc.religion.christian',
 16: 'talk.politics.guns',
 17: 'talk.politics.mideast',
 18: 'talk.politics.misc',
 19: 'talk.religion.misc'}

In [16]:
from sklearn.externals import joblib
joblib.dump(pipeline, filename="pickle0001")

['pickle0001']

In [47]:
pipeline = joblib.load(filename="./pickle0001")

In [31]:
[name[1].__class__.__name__ for name in pipeline.steps]

In [93]:
inc = pipeline.transform(data.data)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if np.issubdtype(mask.dtype, np.int):


In [102]:
vect = pipeline.named_steps['vect'].transform([article_tester])
tfi = pipeline.named_steps['tfidf'].transform(vect)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [105]:
vect

<1x1181775 sparse matrix of type '<class 'numpy.int64'>'
	with 658 stored elements in Compressed Sparse Row format>

In [117]:
np.array(tfi)

array(<1x1181775 sparse matrix of type '<class 'numpy.float64'>'
	with 658 stored elements in Compressed Sparse Row format>, dtype=object)

In [107]:
len(pipeline.named_steps['vect'].get_feature_names())

In [127]:
len(article_tester.split())

831

In [118]:
import numpy as np
tops = zip(np.array(pipeline.named_steps['vect'].get_feature_names()), tfi)

In [120]:
cl = pipeline.named_steps['clf']

In [125]:
transformed = pipeline.transform([article_tester])

In [126]:
transformed

<1x202335 sparse matrix of type '<class 'numpy.float64'>'
	with 505 stored elements in Compressed Sparse Column format>