# Mink usage examples

This notebook demonstrates some examples of using mink and how it interacts with sklearn.

## Imports

In [1]:
import pickle

In [2]:
import numpy as np
from sklearn.grid_search import GridSearchCV
from sklearn.datasets import make_classification
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error
import tensorflow as tf

In [3]:
from mink import NeuralNetClassifier
from mink import NeuralNetRegressor
from mink.layers import DenseLayer
from mink.layers import InputLayer
from mink.updates import SGD

## Classification task

### Data

In [4]:
# Generate a synthetic classification dataset with 5000 samples.
# NOTE(review): no random_state is passed, so the data (and every score
# below) will differ from run to run.
X, y = make_classification(n_samples=5000)

### Define neural net

Note: No need to specify the shape of the training data, number of classes, or to set softmax nonlinearity. The `NeuralNetClassifier` class takes care of all of that, as is expected from an sklearn estimator.

In [5]:
# Build the network as a chain of layers, each taking the previous one as
# its input: input layer -> dense layer with 200 units -> output dense layer.
# The output layer is given no num_units or nonlinearity here; the estimator
# fills them in when it is fit (see the repr printed after fitting).
l0 = InputLayer()
l1 = DenseLayer(l0, num_units=200)
l2 = DenseLayer(l1)

In [6]:
# Wrap the output layer in the sklearn-style classifier estimator.
# With verbose=1 the training run further down prints one loss line per epoch.
net = NeuralNetClassifier(l2, verbose=1)

If we want to change certain parameters after initialization, just use the `set_params` method and the double-underscore notation known from sklearn.

In [7]:
# Change the learning rate of the net's `update` object (SGD) to 0.5 using
# the double-underscore parameter path.
net.set_params(update__learning_rate=0.5)

NeuralNetClassifier(batch_iterator=<mink.nolearn.BatchIterator object at 0x7f67c4419fd0>,
          encoder=LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False),
          layer=DenseLayer(W=GlorotUniform(c01b=False, gain=1.0), b=Constant(val=0.0),
      incoming=DenseLayer(W=GlorotUniform(c01b=False, gain=1.0), b=Constant(val=0.0),
      incoming=InputLayer(Xs=None, name=None, ys=None), name=None,
      nonlinearity=None, num_units=200),
      name=None, nonlinearity=None, num_units=None),
          max_epochs=10, objective=CrossEntropy(eps=1e-12),
          session_kwargs=None, update=SGD(learning_rate=0.5), verbose=1)

### Train

In [8]:
# Fit with num_epochs=0. The repr printed below shows the output layer now
# has num_units=2 and a Softmax nonlinearity, i.e. the net has been set up
# for this data. Presumably no weight updates happen with zero epochs.
net.fit(X, y, num_epochs=0)

NeuralNetClassifier(batch_iterator=<mink.nolearn.BatchIterator object at 0x7f67c4419fd0>,
          encoder=LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False),
          layer=DenseLayer(W=GlorotUniform(c01b=False, gain=1.0), b=Constant(val=0.0),
      incoming=DenseLayer(W=GlorotUniform(c01b=False, gain=1.0), b=Constant(val=0.0),
      incoming=InputLayer(Xs=None, name=None, ys=None), name=None,
      nonlinearity=None, num_units=200),
      name=None, nonlinearity=Softmax(), num_units=2),
          max_epochs=10, objective=CrossEntropy(eps=1e-12),
          session_kwargs=None, update=SGD(learning_rate=0.5), verbose=1)

In [9]:
# Inspect the output layer's weight attribute and its type (the recorded
# output shows a TensorFlow Variable).
net.layer.W_, type(net.layer.W_)

(<tensorflow.python.ops.variables.Variable at 0x7f67c23b8080>,
 tensorflow.python.ops.variables.Variable)

In [10]:
# Create a stand-alone TensorFlow Variable holding a 5x5 array of ones,
# presumably to compare its type with the layer weights inspected above.
bla = tf.Variable(np.ones((5, 5)))

In [11]:
# Show the stand-alone variable and its type.
bla, type(bla)

(<tensorflow.python.ops.variables.Variable at 0x7f67c226c278>,
 tensorflow.python.ops.variables.Variable)

In [12]:
# Class probabilities for the training data, taken right after the
# zero-epoch fit (i.e. before the 20-epoch training run).
y_proba = net.predict_proba(X)

In [13]:
# Accuracy on the training data: fraction of rows whose most probable
# class equals the true label.
np.mean(y_proba.argmax(axis=1) == y)

0.60140000000000005

In [14]:
pdb on

Automatic pdb calling has been turned ON


In [15]:
# Train for 20 epochs; the per-epoch loss is printed because the net was
# created with verbose=1.
net.fit(X, y, num_epochs=20)

epochs:    1 | loss: 0.34045
epochs:    2 | loss: 0.31662
epochs:    3 | loss: 0.30037
epochs:    4 | loss: 0.28923
epochs:    5 | loss: 0.27974
epochs:    6 | loss: 0.27211
epochs:    7 | loss: 0.26499
epochs:    8 | loss: 0.25799
epochs:    9 | loss: 0.25164
epochs:   10 | loss: 0.24486
epochs:   11 | loss: 0.23801
epochs:   12 | loss: 0.23088
epochs:   13 | loss: 0.22395
epochs:   14 | loss: 0.21685
epochs:   15 | loss: 0.20923
epochs:   16 | loss: 0.20177
epochs:   17 | loss: 0.19424
epochs:   18 | loss: 0.18615
epochs:   19 | loss: 0.17842
epochs:   20 | loss: 0.17126


NeuralNetClassifier(batch_iterator=<mink.nolearn.BatchIterator object at 0x7f67c4419fd0>,
          encoder=LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False),
          layer=DenseLayer(W=GlorotUniform(c01b=False, gain=1.0), b=Constant(val=0.0),
      incoming=DenseLayer(W=GlorotUniform(c01b=False, gain=1.0), b=Constant(val=0.0),
      incoming=InputLayer(Xs=None, name=None, ys=None), name=None,
      nonlinearity=None, num_units=200),
      name=None, nonlinearity=Softmax(), num_units=2),
          max_epochs=10, objective=CrossEntropy(eps=1e-12),
          session_kwargs=None, update=SGD(learning_rate=0.5), verbose=1)

In [16]:
# NOTE(review): `y_proba` was computed before the 20-epoch fit and is only
# recomputed in a later cell, so the values shown here are the pre-training
# predictions.
y_proba

array([[ 0.146476  ,  0.85352403],
       [ 0.28080532,  0.71919465],
       [ 0.36376691,  0.63623309],
       ..., 
       [ 0.48769066,  0.51230937],
       [ 0.26301438,  0.73698562],
       [ 0.41127631,  0.58872366]], dtype=float32)

In [17]:
# Predicted class index (most probable column) for the first ten samples.
y_proba[:10].argmax(axis=1)

array([1, 1, 1, 0, 0, 0, 1, 1, 1, 1])

In [18]:
# Recompute the class probabilities, now with the trained net.
y_proba = net.predict_proba(X)

In [19]:
# Training accuracy after the 20-epoch fit: share of samples whose most
# probable class matches the (flattened) label vector.
np.mean(y_proba.argmax(axis=1) == y.ravel())

0.92859999999999998

### Grid search

The neural net estimators can be used in conjunction with other sklearn features, such as `GridSearchCV`.

In [20]:
# Same three-layer chain as in the classification example, except that the
# middle layer is named 'hidden' and has no fixed num_units, so the grid
# search can address it as `hidden__num_units`.
l0 = InputLayer()
l1 = DenseLayer(l0, name='hidden')
l2 = DenseLayer(l1)

In [21]:
# Classifier with an explicit SGD update object; its learning_rate is one of
# the parameters varied by the grid search below.
net = NeuralNetClassifier(l2, update=SGD())

Parameters are set using the known double-underscore notation, e.g.

`'update__learning_rate': [0.1, 0.5]`.

Note: Instead of having to write

`'layer__incoming__num_units': [50, 100]`

we can just write

`'hidden__num_units': [50, 100]`

because we have given the hidden layer a name, "hidden". This may save a lot of writing and confusion.

In [22]:
# Hyper-parameter grid for the search: two candidate values each for the
# SGD learning rate, the number of epochs, and the size of the layer named
# 'hidden' (2 x 2 x 2 = 8 combinations).
params = dict(
    update__learning_rate=[0.1, 0.5],
    max_epochs=[5, 10],
    hidden__num_units=[50, 100],
)

In [23]:
# Set up the grid search: candidates are scored by accuracy, refit=False
# skips retraining the best candidate on the full data afterwards, and
# verbose=3 logs every individual fit.
cv = GridSearchCV(net, params, scoring='accuracy', refit=False, verbose=3)

In [24]:
# Run the search. In the recorded run this was 8 candidates x 3 folds = 24 fits.
cv.fit(X, y)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV] max_epochs=5, hidden__num_units=50, update__learning_rate=0.1 ...
[CV]  max_epochs=5, hidden__num_units=50, update__learning_rate=0.1, score=0.872825 -   0.2s
[CV] max_epochs=5, hidden__num_units=50, update__learning_rate=0.1 ...
[CV]  max_epochs=5, hidden__num_units=50, update__learning_rate=0.1, score=0.865627 -   0.3s
[CV] max_epochs=5, hidden__num_units=50, update__learning_rate=0.1 ...
[CV]  max_epochs=5, hidden__num_units=50, update__learning_rate=0.1, score=0.870948 -   0.4s
[CV] max_epochs=5, hidden__num_units=50, update__learning_rate=0.5 ...
[CV]  max_epochs=5, hidden__num_units=50, update__learning_rate=0.5, score=0.874625 -   0.3s
[CV] max_epochs=5, hidden__num_units=50, update__learning_rate=0.5 ...
[CV]  max_epochs=5, hidden__num_units=50, update__learning_rate=0.5, score=0.866827 -   0.4s
[CV] max_epochs=5, hidden__num_units=50, update__learning_rate=0.5 ...
[CV]  max_epochs=5, hidden__num_units=50, update_

[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed:   16.4s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=NeuralNetClassifier(batch_iterator=<mink.nolearn.BatchIterator object at 0x7f67c4419fd0>,
          encoder=LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False),
          layer=DenseLayer(W=GlorotUniform(c01b=False, gain=1.0), b=Constant(val=0.0),
      incoming=DenseLayer(W=Glor...e=CrossEntropy(eps=1e-12),
          session_kwargs=None, update=SGD(learning_rate=0.01), verbose=0),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_epochs': [5, 10], 'hidden__num_units': [50, 100], 'update__learning_rate': [0.1, 0.5]},
       pre_dispatch='2*n_jobs', refit=False, scoring='accuracy', verbose=3)

In [25]:
# Best parameter combination found and its cross-validated accuracy.
cv.best_params_, cv.best_score_

({'hidden__num_units': 100, 'max_epochs': 10, 'update__learning_rate': 0.1},
 0.88139999999999996)

## Regression task

As is known from sklearn, we have separate estimators for classification and regression.

### Data

In [26]:
# Generate a synthetic regression dataset with 5000 samples; this replaces
# the classification X and y used so far.
# NOTE(review): no random_state is passed, so the data differs between runs.
X, y = make_regression(n_samples=5000)

Note that apart from using `NeuralNetRegressor` instead of `NeuralNetClassifier`, everything is the same. No need to adjust output nonlinearity or objective.

### Define neural network

In [27]:
# Same layer chain as for classification: input -> dense layer with 200
# units -> output dense layer. The output layer is again left unspecified;
# the repr printed after fitting shows it with num_units=1 and a Linear
# nonlinearity.
l0 = InputLayer()
l1 = DenseLayer(l0, num_units=200)
l2 = DenseLayer(l1)

In [28]:
# Regression counterpart of the classifier estimator; verbose=1 makes the
# fit below print the per-epoch loss.
net = NeuralNetRegressor(l2, verbose=1)

In [29]:
# Set the SGD learning rate to 0.001, much lower than the 0.5 used for the
# classifier.
net.set_params(update__learning_rate=0.001)

NeuralNetRegressor(batch_iterator=<mink.nolearn.BatchIterator object at 0x7f67c441f438>,
          encoder=None,
          layer=DenseLayer(W=GlorotUniform(c01b=False, gain=1.0), b=Constant(val=0.0),
      incoming=DenseLayer(W=GlorotUniform(c01b=False, gain=1.0), b=Constant(val=0.0),
      incoming=InputLayer(Xs=None, name=None, ys=None), name=None,
      nonlinearity=None, num_units=200),
      name=None, nonlinearity=None, num_units=None),
          max_epochs=10, objective=MeanSquaredError(), session_kwargs=None,
          update=SGD(learning_rate=0.001), verbose=1)

### Train

In [30]:
# Train the regressor for 20 epochs.
net.fit(X, y, num_epochs=20)

epochs:    1 | loss: 2580.11377
epochs:    2 | loss: 171.38367
epochs:    3 | loss: 132.39168
epochs:    4 | loss: 102.61388
epochs:    5 | loss: 79.46240
epochs:    6 | loss: 61.71824
epochs:    7 | loss: 48.12549
epochs:    8 | loss: 37.80685
epochs:    9 | loss: 29.83745
epochs:   10 | loss: 23.64183
epochs:   11 | loss: 18.82131
epochs:   12 | loss: 15.06531
epochs:   13 | loss: 12.16431
epochs:   14 | loss: 9.89314
epochs:   15 | loss: 8.15140
epochs:   16 | loss: 6.82924
epochs:   17 | loss: 5.79588
epochs:   18 | loss: 4.97318
epochs:   19 | loss: 4.33360
epochs:   20 | loss: 3.82461


NeuralNetRegressor(batch_iterator=<mink.nolearn.BatchIterator object at 0x7f67c441f438>,
          encoder=None,
          layer=DenseLayer(W=GlorotUniform(c01b=False, gain=1.0), b=Constant(val=0.0),
      incoming=DenseLayer(W=GlorotUniform(c01b=False, gain=1.0), b=Constant(val=0.0),
      incoming=InputLayer(Xs=None, name=None, ys=None), name=None,
      nonlinearity=None, num_units=200),
      name=None, nonlinearity=Linear(), num_units=1),
          max_epochs=10, objective=MeanSquaredError(), session_kwargs=None,
          update=SGD(learning_rate=0.001), verbose=1)

## Saving and restoring

### Save previous net

In [31]:
# Mean squared error of the trained net on the training data, kept so it can
# be compared with the score of the reloaded net.
score_before = mean_squared_error(y, net.predict(X))
print(score_before)

6.63134254573


In [32]:
# Serialize the whole fitted estimator to 'mynet.pkl' in the current
# working directory.
with open('mynet.pkl', 'wb') as f:
    pickle.dump(net, f)

### Restore the net from the pickle file

In [33]:
# Load the estimator back from disk into a separate object.
# pickle.load can execute arbitrary code, so only use it on trusted files;
# this file was written by this notebook.
with open('mynet.pkl', 'rb') as f:
    new_net = pickle.load(f)

In [34]:
# Initialize the unpickled net before predicting with it.
# NOTE(review): presumably this rebuilds the TensorFlow state and restores
# the saved weights, given that the score below matches; confirm against
# mink's implementation.
new_net.initialize(X, y)

#### Score after loading: the same as for the saved net

In [35]:
# Score the restored net on the same data and check that it reproduces the
# score of the original net (up to floating-point tolerance).
score_after = mean_squared_error(y, new_net.predict(X))
print(score_after)
assert np.isclose(score_before, score_after)

6.63134254573
