# Mink usage examples

This notebook demonstrates some examples of using mink and how it interacts with sklearn.

## Imports

In [1]:
import pickle

In [2]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error
import tensorflow as tf

# `sklearn.grid_search` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `GridSearchCV` now lives in `sklearn.model_selection`. Fall back to
# the old location only on releases that predate the move.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV

In [3]:
from mink import NeuralNetClassifier
from mink import NeuralNetRegressor
from mink.layers import DenseLayer
from mink.layers import InputLayer
from mink.updates import SGD

## Classification task

### Data

In [4]:
# Synthetic two-class data with a fixed seed for reproducibility; the layer
# table printed by `fit` shows 20 input features and 2 output units.
X, y = make_classification(n_samples=5000, random_state=0)

### Define neural net

Note: No need to specify the shape of the training data, number of classes, or to set softmax nonlinearity. The `NeuralNetClassifier` class takes care of all of that, as is expected from an sklearn estimator.

In [5]:
# Three-layer stack: input -> dense hidden layer (200 units) -> dense output.
# The output layer's `num_units` and `nonlinearity` are deliberately left
# unset; the classifier fills them in when the net is fitted.
l0 = InputLayer()
l1 = DenseLayer(l0, num_units=200)
l2 = DenseLayer(l1)

In [6]:
# Only the output layer is passed in; the earlier layers are reachable through
# its `incoming` chain (see the estimator repr).
# NOTE(review): verbose=1 presumably enables the layer/progress tables printed
# by `fit` — confirm against the mink documentation.
net = NeuralNetClassifier(l2, verbose=1)

If we want to change certain parameters after initialization, just use the `set_params` method and the double-underscore notation known from sklearn.

In [7]:
# Reach into the nested `update` object (SGD, per the repr) and set its
# learning rate without rebuilding the estimator.
net.set_params(update__learning_rate=0.5)

NeuralNetClassifier(batch_iterator_test=128, batch_iterator_train=128,
          encoder=LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False),
          layer=DenseLayer(W=GlorotUniform(c01b=False, gain=1.0), b=Constant(value=0.0),
      incoming=DenseLayer(W=GlorotUniform(c01b=False, gain=1.0), b=Constant(value=0.0),
      incoming=InputLayer(Xs=None, make_logs=False, name=None, ys=None),
      make_logs=False, name=None, nonlinearity=None, num_units=200),
      make_logs=False, name=None, nonlinearity=None, num_units=None),
          max_epochs=10, objective=CrossEntropy(eps=1e-12),
          on_epoch_finished=(<mink.handlers.PrintTrainProgress object at 0x7f900e758e48>,),
          on_training_started=(<mink.handlers.PrintLayerInfo object at 0x7f900e826710>,),
          session_kwargs=None, update=SGD(learning_rate=0.5), verbose=1)

### Train

In [8]:
# Fit for zero epochs: the net gets initialized (the repr shows the output
# layer resolved to 2 units with a softmax) but no training epoch is run.
# This gives an untrained baseline for the accuracy check that follows.
net.fit(X, y, epochs=0)

# Neural Network with 4602 learnable parameters

## Layer information

|   # | name   |   size |
|----:|:-------|-------:|
|   0 | input  |     20 |
|   1 | dense  |    200 |
|   2 | dense  |      2 |



NeuralNetClassifier(batch_iterator_test=128, batch_iterator_train=128,
          encoder=LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False),
          layer=DenseLayer(W=GlorotUniform(c01b=False, gain=1.0), b=Constant(value=0.0),
      incoming=DenseLayer(W=GlorotUniform(c01b=False, gain=1.0), b=Constant(value=0.0),
      incoming=InputLayer(Xs=None, make_logs=False, name=None, ys=None),
      make_logs=False, name=None, nonlinearity=None, num_units=200),
      make_logs=False, name=None, nonlinearity=Softmax(), num_units=2),
          max_epochs=10, objective=CrossEntropy(eps=1e-12),
          on_epoch_finished=(<mink.handlers.PrintTrainProgress object at 0x7f900e758e48>,),
          on_training_started=(<mink.handlers.PrintLayerInfo object at 0x7f900e826710>,),
          session_kwargs=None, update=SGD(learning_rate=0.5), verbose=1)

In [9]:
# Class probabilities predicted by the still-untrained net on the training data.
y_proba = net.predict_proba(X)

In [10]:
# Baseline accuracy of the untrained net: pick the most probable class per
# row and compare it against the true labels.
np.mean(np.argmax(y_proba, axis=1) == y)

0.40060000000000001

In [11]:
# Now train for real. `epochs` is passed explicitly here, while the
# estimator's own `max_epochs` setting is 10 (see the repr).
# NOTE(review): the layer info is printed again, so this call may
# re-initialize the net rather than continue from the current weights — confirm.
net.fit(X, y, epochs=20)

# Neural Network with 4602 learnable parameters

## Layer information

|   # | name   |   size |
|----:|:-------|-------:|
|   0 | input  |     20 |
|   1 | dense  |    200 |
|   2 | dense  |      2 |

|   epoch |   train loss |     dur |
|--------:|-------------:|--------:|
|       1 |      [36m0.17884[0m | 0.04433 |
|       2 |      [36m0.12588[0m | 0.03878 |
|       3 |      [36m0.11474[0m | 0.03564 |
|       4 |      [36m0.10970[0m | 0.03639 |
|       5 |      [36m0.10618[0m | 0.03578 |
|       6 |      [36m0.10345[0m | 0.03739 |
|       7 |      [36m0.10096[0m | 0.03449 |
|       8 |      [36m0.09875[0m | 0.03641 |
|       9 |      [36m0.09683[0m | 0.03886 |
|      10 |      [36m0.09497[0m | 0.03637 |
|      11 |      [36m0.09321[0m | 0.04120 |
|      12 |      [36m0.09151[0m | 0.03713 |
|      13 |      [36m0.08991[0m | 0.03788 |
|      14 |      [36m0.08840[0m | 0.03666 |
|      15 |      [36m0.08697[0m | 0.03804 |
|      16 |      [36m0.08563[0m 

NeuralNetClassifier(batch_iterator_test=128, batch_iterator_train=128,
          encoder=LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False),
          layer=DenseLayer(W=GlorotUniform(c01b=False, gain=1.0), b=Constant(value=0.0),
      incoming=DenseLayer(W=GlorotUniform(c01b=False, gain=1.0), b=Constant(value=0.0),
      incoming=InputLayer(Xs=None, make_logs=False, name=None, ys=None),
      make_logs=False, name=None, nonlinearity=None, num_units=200),
      make_logs=False, name=None, nonlinearity=Softmax(), num_units=2),
          max_epochs=10, objective=CrossEntropy(eps=1e-12),
          on_epoch_finished=(<mink.handlers.PrintTrainProgress object at 0x7f900e758e48>,),
          on_training_started=(<mink.handlers.PrintLayerInfo object at 0x7f900e826710>,),
          session_kwargs=None, update=SGD(learning_rate=0.5), verbose=1)

In [12]:
# Recompute the class probabilities, this time with the trained net.
y_proba = net.predict_proba(X)

In [13]:
# Training accuracy after fitting: predicted class = column with the highest
# probability. `y` from make_classification is already 1-D, so no flatten()
# is needed; this is the same comparison as in the pre-training accuracy cell.
np.mean(np.argmax(y_proba, axis=1) == y)

0.98040000000000005

### Grid search

The neural net estimators can be used in conjunction with other sklearn features, such as `GridSearchCV`.

In [14]:
# Same topology as before, but the hidden layer gets a name so the grid search
# can address it as `hidden__...`. Its `num_units` is left unset here because
# the parameter grid supplies it.
l0 = InputLayer()
l1 = DenseLayer(l0, name='hidden')
l2 = DenseLayer(l1)

In [15]:
# Pass the update rule explicitly. SGD() keeps its default learning rate
# (0.01 according to the estimator repr); the grid search overrides it.
net = NeuralNetClassifier(l2, update=SGD())

Parameters are set using the known double-underscore notation, e.g.

`'update__learning_rate': [0.1, 0.5]`.

Note: Instead of having to write

`'layer__incoming__num_units': [50, 100]`

we can just write

`'hidden__num_units': [50, 100]`

because we have given the hidden layer a name, "hidden". This may save a lot of writing and confusion.

In [16]:
# Search space for the grid search, keyed by estimator parameter path
# (double-underscore notation).
params = dict(
    update__learning_rate=[0.1, 0.5],
    max_epochs=[5, 10],
    hidden__num_units=[50, 100],
)

In [17]:
# refit=False: only score the candidates, do not retrain a final model on the
# full data afterwards. verbose=3 makes sklearn log every individual fit.
cv = GridSearchCV(net, params, scoring='accuracy', refit=False, verbose=3)

In [18]:
# 2 x 2 x 2 = 8 parameter combinations, each cross-validated on 3 folds
# (24 fits in total, as the log reports).
cv.fit(X, y)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV] hidden__num_units=50, max_epochs=5, update__learning_rate=0.1 ...
[CV]  hidden__num_units=50, max_epochs=5, update__learning_rate=0.1, score=0.947211 -   0.3s
[CV] hidden__num_units=50, max_epochs=5, update__learning_rate=0.1 ...
[CV]  hidden__num_units=50, max_epochs=5, update__learning_rate=0.1, score=0.944811 -   0.3s
[CV] hidden__num_units=50, max_epochs=5, update__learning_rate=0.1 ...
[CV]  hidden__num_units=50, max_epochs=5, update__learning_rate=0.1, score=0.962185 -   0.4s
[CV] hidden__num_units=50, max_epochs=5, update__learning_rate=0.5 ...
[CV]  hidden__num_units=50, max_epochs=5, update__learning_rate=0.5, score=0.960408 -   0.4s
[CV] hidden__num_units=50, max_epochs=5, update__learning_rate=0.5 ...
[CV]  hidden__num_units=50, max_epochs=5, update__learning_rate=0.5, score=0.955009 -   0.4s
[CV] hidden__num_units=50, max_epochs=5, update__learning_rate=0.5 ...
[CV]  hidden__num_units=50, max_epochs=5, update_

[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed:   18.0s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=NeuralNetClassifier(batch_iterator_test=128, batch_iterator_train=128,
          encoder=LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False),
          layer=DenseLayer(W=GlorotUniform(c01b=False, gain=1.0), b=Constant(value=0.0),
      incoming=DenseLayer(W=GlorotUniform(c01b=Fa...ject at 0x7f900e826710>,),
          session_kwargs=None, update=SGD(learning_rate=0.01), verbose=0),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'hidden__num_units': [50, 100], 'max_epochs': [5, 10], 'update__learning_rate': [0.1, 0.5]},
       pre_dispatch='2*n_jobs', refit=False, scoring='accuracy', verbose=3)

In [19]:
# Best parameter combination and its cross-validated accuracy.
cv.best_params_, cv.best_score_

({'hidden__num_units': 50, 'max_epochs': 10, 'update__learning_rate': 0.5},
 0.96060000000000001)

## Regression task

As is known from sklearn, we have separate estimators for classification and regression.

### Data

In [20]:
# Synthetic regression data with a fixed seed. This rebinds `X` and `y`,
# replacing the classification data used above. The layer table printed by
# `fit` shows 100 input features and a single output unit.
X, y = make_regression(n_samples=5000, random_state=0)

Note that apart from using `NeuralNetRegressor` instead of `NeuralNetClassifier`, everything is the same. No need to adjust output nonlinearity or objective.

### Define neural network

In [21]:
# Same layer stack as for classification: input -> 200-unit dense hidden
# layer -> dense output. The output layer is again left unconfigured; the
# repr after fitting shows it resolved to one unit with a linear nonlinearity.
l0 = InputLayer()
l1 = DenseLayer(l0, num_units=200)
l2 = DenseLayer(l1)

In [22]:
# Rebinds `net`: from here on the name refers to the regressor, not to the
# classifier used in the previous section.
net = NeuralNetRegressor(l2, verbose=1)

In [23]:
# Much smaller learning rate than the 0.5 used for the classifier.
# NOTE(review): presumably needed because the unscaled regression targets give
# very large squared-error losses (see the first epochs below) — confirm.
net.set_params(update__learning_rate=0.0001)

NeuralNetRegressor(batch_iterator_test=128, batch_iterator_train=128,
          encoder=None,
          layer=DenseLayer(W=GlorotUniform(c01b=False, gain=1.0), b=Constant(value=0.0),
      incoming=DenseLayer(W=GlorotUniform(c01b=False, gain=1.0), b=Constant(value=0.0),
      incoming=InputLayer(Xs=None, make_logs=False, name=None, ys=None),
      make_logs=False, name=None, nonlinearity=None, num_units=200),
      make_logs=False, name=None, nonlinearity=None, num_units=None),
          max_epochs=10, objective=MeanSquaredError(),
          on_epoch_finished=(<mink.handlers.PrintTrainProgress object at 0x7f900e76d128>,),
          on_training_started=(<mink.handlers.PrintLayerInfo object at 0x7f900e76d0f0>,),
          session_kwargs=None, update=SGD(learning_rate=0.0001), verbose=1)

### Train

In [24]:
# Train the regressor; `epochs` is passed explicitly, as in the
# classification example.
net.fit(X, y, epochs=20)

# Neural Network with 20401 learnable parameters

## Layer information

|   # | name   |   size |
|----:|:-------|-------:|
|   0 | input  |    100 |
|   1 | dense  |    200 |
|   2 | dense  |      1 |

|   epoch |   train loss |     dur |
|--------:|-------------:|--------:|
|       1 |  [36m30585.94141[0m | 0.05749 |
|       2 |  [36m29499.32422[0m | 0.02741 |
|       3 |  [36m25286.44727[0m | 0.02820 |
|       4 |  [36m13752.04688[0m | 0.02866 |
|       5 |   [36m3010.52661[0m | 0.02850 |
|       6 |    [36m534.34149[0m | 0.02807 |
|       7 |    [36m304.34503[0m | 0.02792 |
|       8 |    [36m264.94580[0m | 0.02905 |
|       9 |    [36m244.89995[0m | 0.02767 |
|      10 |    [36m229.84749[0m | 0.02902 |
|      11 |    [36m217.19238[0m | 0.02862 |
|      12 |    [36m205.98453[0m | 0.02843 |
|      13 |    [36m195.79178[0m | 0.02809 |
|      14 |    [36m186.34044[0m | 0.02901 |
|      15 |    [36m177.48831[0m | 0.02880 |
|      16 |    [36m169.15683[0m

NeuralNetRegressor(batch_iterator_test=128, batch_iterator_train=128,
          encoder=None,
          layer=DenseLayer(W=GlorotUniform(c01b=False, gain=1.0), b=Constant(value=0.0),
      incoming=DenseLayer(W=GlorotUniform(c01b=False, gain=1.0), b=Constant(value=0.0),
      incoming=InputLayer(Xs=None, make_logs=False, name=None, ys=None),
      make_logs=False, name=None, nonlinearity=None, num_units=200),
      make_logs=False, name=None, nonlinearity=Linear(), num_units=1),
          max_epochs=10, objective=MeanSquaredError(),
          on_epoch_finished=(<mink.handlers.PrintTrainProgress object at 0x7f900e76d128>,),
          on_training_started=(<mink.handlers.PrintLayerInfo object at 0x7f900e76d0f0>,),
          session_kwargs=None, update=SGD(learning_rate=0.0001), verbose=1)

## Saving and restoring

### Save previous net

In [25]:
# Training-set MSE of the fitted regressor, kept so it can be compared with
# the score of the net restored from disk.
score_before = mean_squared_error(y, net.predict(X))
print(score_before)

136.255227


In [26]:
# Serialize the whole fitted estimator with pickle. The relative path means
# the file is written to the current working directory.
with open('mynet.pkl', 'wb') as f:
    pickle.dump(net, f)

### Restore the net from the saved file

In [27]:
# Load the estimator back from disk into a separate object.
# Only unpickle files you trust: `pickle.load` can execute arbitrary code.
# Here the file was written by this notebook itself.
with open('mynet.pkl', 'rb') as f:
    new_net = pickle.load(f)

In [28]:
# The restored net must reproduce the original predictions: compare the MSE
# computed before pickling with the MSE of the reloaded net.
score_after = mean_squared_error(y, new_net.predict(X))
print(score_after)
assert np.isclose(score_before, score_after)

136.255227
