In [38]:
import os
import numpy as np
import glob

from __future__ import print_function
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K

import substratools as tools

# Generate dataset 

-> generate_data_samples.py

## Import dataset

In [39]:


# Load the MNIST data through Keras, already split between train and test sets.
(x_train, y_train), (x_test, y_test) = mnist.load_data()
# Unpacking order: train features, train labels, test features, test labels.


## Save into NumPy array files

In [40]:
# Every location used by the example is derived from the example root.
# NOTE: the root is specific to the author's checkout; adjust it if needed.
root_path = os.path.expanduser("~/python-projects/substra/substra/examples/mnist")
data_path, assets_path = (
    os.path.join(root_path, sub_folder) for sub_folder in ('data', 'assets')
)
train_data_path, test_data_path = (
    os.path.join(data_path, split) for split in ('train', 'test')
)

print(data_path, assets_path, sep="\n")

# Maps each output file (relative to the assets folder) to the array stored in it.
OUT_FILE = {
    os.path.join(folder, file_name): array
    for folder, file_name, array in (
        ("train_data", "x_train.npy", x_train),
        ("train_data", "y_train.npy", y_train),
        ("test_data", "x_test.npy", x_test),
        ("test_data", "y_test.npy", y_test),
    )
}

# Write every array to disk, creating the destination folder on first use.
for relative_path, array in OUT_FILE.items():
    target = os.path.join(assets_path, relative_path)
    os.makedirs(os.path.dirname(target), exist_ok=True)
    np.save(target, array)

/home/fabien/python-projects/substra/substra/examples/mnist/data
/home/fabien/python-projects/substra/substra/examples/mnist/assets


# Open Dataset

-> opener.py

## Define functions of the Opener

In [41]:

class MnistOpener(tools.Opener):
    """Opener reading MNIST features and labels stored as ``.npy`` files.

    Inside every data-sample folder, feature files are matched by the pattern
    ``x*.npy`` and label files by ``y*.npy``.
    """

    @classmethod
    def _get_files(cls, folders):
        """Return list of X and y file given a folder location

        :param folders: iterable of folder paths to search
        :return: tuple ``(X_files, y_files)`` of lists of file paths
        """
        X_files, y_files = [], []
        for folder in folders:
            # glob yields matches in arbitrary (filesystem) order; sort them so
            # the features and the labels of a folder are always concatenated
            # in the same, reproducible order.
            Xs = sorted(glob.glob(os.path.join(folder, 'x*.npy')))
            ys = sorted(glob.glob(os.path.join(folder, 'y*.npy')))

            X_files.extend(Xs)
            y_files.extend(ys)

        return X_files, y_files

    def get_X(self, folders):
        """Load every feature file found in ``folders`` into a single array.

        :param folders: iterable of folder paths to search
        :return: the loaded arrays concatenated along their first axis
        :raises ValueError: raised by ``np.concatenate`` when no feature file
            is found
        """
        print('Finding features file...')
        X_files, _ = self._get_files(folders)
        print(X_files)
        print('Loading features...')
        return np.concatenate([np.load(X_file) for X_file in X_files])

    def get_y(self, folders):
        """Load every label file found in ``folders`` into a single array.

        :param folders: iterable of folder paths to search
        :return: the loaded arrays concatenated along their first axis
        :raises ValueError: raised by ``np.concatenate`` when no label file
            is found
        """
        print('Finding label file...')
        _, y_files = self._get_files(folders)

        print('Loading labels...')
        return np.concatenate([np.load(y_file) for y_file in y_files])

    def fake_X(self):
        """Return 22 random float32 samples of shape 28x28."""
        return np.random.randn(22, 28, 28).astype(np.float32)

    def fake_y(self):
        """Return 22 random integer labels drawn from 0..9."""
        # `np.int` was only an alias of the builtin `int`; it was deprecated in
        # NumPy 1.20 and removed in 1.24, so use the builtin directly.
        return np.random.choice(np.arange(10), size=(22)).astype(int)


## Load dataset from local

Replacing part of the command : 
```
python assets/algo_random_forest/algo.py train \
  --debug \
  --opener-path assets/dataset/opener.py \
  --data-samples-path assets/train_data_samples \
  --output-model-path assets/model/model \
  --log-path assets/logs/train.log
```


In [42]:

# Folders holding the .npy files of each split inside the assets folder.
folder_train_data = os.path.join(assets_path,"train_data")
folder_test_data = os.path.join(assets_path,"test_data")

# Train split. The class object itself is passed in place of `self`: get_X and
# get_y only use `self` to reach the `_get_files` classmethod, so no instance
# is required (presumably this avoids instantiating the opener — confirm).
folders = [folder_train_data]
x_train = MnistOpener.get_X(MnistOpener, folders)
print('x_train shape:', x_train.shape)
y_train = MnistOpener.get_y(MnistOpener, folders)
print('y_train shape:', y_train.shape)

# Test split, loaded the same way.
folders = [folder_test_data]
x_test = MnistOpener.get_X(MnistOpener, folders)
print('x_test shape:', x_test.shape)
y_test = MnistOpener.get_y(MnistOpener, folders)
print('y_test shape:', y_test.shape)

Finding features file...
['/home/fabien/python-projects/substra/substra/examples/mnist/assets/train_data/x_train.npy']
Loading features...
x_train shape: (60000, 28, 28)
Finding label file...
Loading labels...
y_train shape: (60000,)
Finding features file...
['/home/fabien/python-projects/substra/substra/examples/mnist/assets/test_data/x_test.npy']
Loading features...
x_test shape: (10000, 28, 28)
Finding label file...
Loading labels...
y_test shape: (10000,)


# Algorithm

-> algo.py

## Normalize data


In [43]:
# input image dimensions
img_rows, img_cols = 28, 28

# Shape of one sample as fed to the first layer of the model. The position of
# the channel axis depends on the backend configuration, so it is derived from
# the same setting `_normalize_X` uses to reshape the data; a hard-coded
# channels-last shape would not match the data under a channels-first backend.
if K.image_data_format() == 'channels_first':
    input_shape = (1, img_rows, img_cols)
else:
    input_shape = (img_rows, img_cols, 1)


class Algo(tools.algo.Algo):
    """Algo pre-processing the image arrays before they are given to the model."""

    def _normalize_X(self, X):
        """Reshape ``X`` to 4 dimensions and rescale it as float32.

        A channel axis of size 1 is inserted where the backend image data
        format expects it, then the values are divided by 255 (which maps
        8-bit pixel values to the range [0, 1]).

        :param X: array reshapable to ``(n_samples, img_rows, img_cols)``
        :return: a new float32 array; ``X`` itself is not modified
        """
        if K.image_data_format() == 'channels_first':
            X = X.reshape(X.shape[0], 1, img_rows, img_cols)
        else:
            X = X.reshape(X.shape[0], img_rows, img_cols, 1)

        # astype returns a new array, so the in-place division below never
        # touches the data owned by the caller.
        X = X.astype('float32')
        X /= 255

        print('X shape:', X.shape)
        print(X.shape[0], ' samples')

        return X

In [44]:
# Normalize both splits. `Algo` itself is passed as `self` because
# `_normalize_X` does not read any instance state.
# NOTE: x_train / x_test are overwritten, so running this cell a second time
# divides the already-normalized data by 255 again.
x_train = Algo._normalize_X(Algo,x_train)
x_test = Algo._normalize_X(Algo,x_test)



X shape: (60000, 28, 28, 1)
60000  samples
X shape: (10000, 28, 28, 1)
10000  samples


## Create model

-> function train(...)

In [45]:

num_classes = 10

# Layers are listed in forward order: two convolutions, max-pooling and
# dropout, then a dense hidden layer and the softmax classification layer.
model = Sequential([
    Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=input_shape),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax'),
])

# Categorical cross-entropy expects one-hot encoded labels.
model.compile(
    loss=keras.losses.categorical_crossentropy,
    optimizer=keras.optimizers.Adadelta(),
    metrics=['accuracy'],
)



## Train model


In [46]:
# Training hyper-parameters.
epochs = 1
batch_size = 128
# Convert class vectors to binary class matrices (one-hot encoding).
# NOTE: y_train / y_test are overwritten, so running this cell a second time
# would apply to_categorical to labels that are already one-hot encoded.
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

# Train the model; the test split is used as validation data.
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_data=(x_test, y_test))
 

Train on 60000 samples, validate on 10000 samples
Epoch 1/1


<keras.callbacks.callbacks.History at 0x7f893fe7d5b0>

## Test model
-> no associated function in the example

In [127]:
         
# Evaluate on the test split; the result is indexed as [loss, accuracy] below.
score = model.evaluate(x_test, y_test, verbose=0) # TODO: split this function in two parts to compute the metric separately
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Test loss: 0.05513126526260748
Test accuracy: 0.9828000068664551


## Get model predictions

-> function Algo.predict()

In [128]:
def predict(X, model):
    """Return the predicted class of every sample in ``X``.

    ``Sequential.predict_classes`` no longer exists in recent Keras /
    TensorFlow releases, so the classes are derived from ``model.predict``
    with the same rule ``predict_classes`` applied.

    :param X: input samples, passed unchanged to ``model.predict``
    :param model: any object exposing ``predict(X, verbose=...)`` that returns
        per-sample scores
    :return: array of class indices (argmax over the last axis) when the model
        has several outputs, otherwise the single output thresholded at 0.5
        as an int32 array
    """
    proba = np.asarray(model.predict(X, verbose=0))
    if proba.shape[-1] > 1:
        # Multi-class output: the class is the index of the highest score.
        return proba.argmax(axis=-1)
    # Single output unit: binary decision.
    return (proba > 0.5).astype('int32')

In [129]:
# Predict the class of every test sample with the trained model.
y_pred = predict(x_test, model)

# Sanity check on the number of predictions returned.
print(y_pred.shape)

(10000,)


## Save and load predictions
( in opener.py )

In [130]:
def save_predictions(y_pred, path):
    """Save the predictions ``y_pred`` in NumPy format at exactly ``path``.

    :param y_pred: array of predictions to store
    :param path: destination file path, used as given
    """
    # Given a file name, np.save appends ".npy" when it is missing, so the
    # file would not be created at `path` and could not be read back from it.
    # Writing through an open file handle keeps the path untouched.
    with open(path, 'wb') as f:
        np.save(f, y_pred)

def get_predictions(path):
    """Load and return the predictions stored at ``path`` by ``save_predictions``."""
    predictions = np.load(path)
    return predictions

In [131]:
# Round trip: write the predictions into the assets folder, then read them back.
pred_path = os.path.join(assets_path,"y_pred.npy")
save_predictions(y_pred, pred_path)

# The reloaded array replaces y_pred; its shape is printed as a sanity check.
y_pred = get_predictions(pred_path)
print(y_pred.shape)

(10000,)


## Save and load model

In [132]:

def load_model(self, path):
    """Read back a Keras model from the file at ``path``.

    :param self: unused; kept so the function has the signature of a method
    :param path: path of the file holding the saved model
    :return: the object returned by ``keras.models.load_model``
    """
    with open(path, mode='rb') as model_file:
        restored = keras.models.load_model(model_file)
    return restored

def save_model(self, model, path):
    """Write ``model`` to the file at ``path``, creating parent folders.

    :param self: unused; kept so the function has the signature of a method
    :param model: object exposing ``save(file_object)``
    :param path: destination file path
    """
    folder = os.path.dirname(path)
    # A bare file name has an empty dirname, and os.makedirs('') raises
    # FileNotFoundError; only create the folder when there is one.
    if folder:
        os.makedirs(folder, exist_ok=True)
    with open(path, 'wb') as f:
        model.save(f)

In [133]:
# The model is stored as <assets>/model/model.
folder_model = os.path.join(assets_path,"model")

model_path = os.path.join(folder_model, "model")
# `Algo` only fills the unused `self` parameter of save_model / load_model.
save_model(Algo, model, model_path)

loaded_model = load_model(Algo, model_path)

# Evaluate the reloaded model to compare with the scores of the in-memory one.
score = loaded_model.evaluate(x_test, y_test, verbose=0) # TODO: split this function in two parts to compute the metric separately
print('Test loss:', score[0])
print('Test accuracy:', score[1])


Test loss: 0.05513126526260748
Test accuracy: 0.9828000068664551


# Metrics

-> metrics.py

In [134]:
from sklearn.metrics import accuracy_score

class MnistMetrics(tools.Metrics):
    def score(self, y_true, y_pred):
        """Returns the accuracy of the predictions

        The value is computed by ``sklearn.metrics.accuracy_score``.

        :param y_true: actual labels from test data
        :type y_true: array-like of class labels (this notebook passes NumPy arrays)
        :param y_pred: predicted labels for test data
        :type y_pred: array-like of class labels (this notebook passes NumPy arrays)
        :rtype: float
        """
        return accuracy_score(y_true, y_pred)


In [135]:
# y_test is one-hot encoded at this point whereas y_pred holds class indices,
# hence the different shapes.
print(y_test.shape)
print(y_pred.shape)

# argmax turns the one-hot labels back into class indices. The class itself
# stands in for `self`, which `score` does not use; it must be MnistMetrics,
# since `TitanicMetrics` is not defined anywhere in this notebook and raises
# NameError on a fresh kernel.
score = MnistMetrics.score(MnistMetrics, np.argmax(y_test, axis=1), y_pred)
print(score)

(10000, 10)
(10000,)
0.9828


# Add dataset and objective to substra

-> add_dataset_objective.py