# ->->->->->->->->->->->->->->->-> Welcome <-<-<-<-<-<-<-<-<-<-<-<-<-<-<-<-

![shinnosuke](https://ss0.bdstatic.com/70cFvHSh_Q1YnxGkpoWK1HF6hhy/it/u=1784511497,119911411&fm=26&gp=0.jpg)

<h1>Contents</h1>
<ol>
    <li><h2><a href="#layer">Layer</a></h2>
        <ul>
            <li><h3><a href="#input">Input</a></h3></li>
            <li><h3><a href="#dense">Dense</a></h3></li>
            <li><h3><a href="#flatten">Flatten</a></h3></li>
            <li><h3><a href="#pad">ZeroPadding2D</a></h3></li>
            <li><h3><a href="#conv">Conv2D</a></h3></li>
            <li><h3><a href="#maxpool">MaxPooling2D</a></h3></li>
            <li><h3><a href="#meanpool">MeanPooling2D</a></h3></li>
            <li><h3><a href="#act">Activation</a></h3></li>
            <li><h3><a href="#reshape">Reshape</a></h3></li>
            <li><h3><a href="#dropout">Dropout</a></h3></li>
            <li><h3><a href="#bn">Batch Normalization</a></h3></li>
            <li><h3><a href="#ln">Layer Normalization</a></h3></li>
            <li><h3><a href="#gn">Group Normalization</a></h3></li>
            <li><h3><a href="#embed">Embedding</a></h3></li>
            <li><h3><a href="#rnn">SimpleRNN</a></h3></li>
            <li><h3><a href="#lstm">LSTM</a></h3></li>
            <li><h3><a href="#timedist">TimeDistributed</a></h3></li>
        </ul>
    </li>
    <li><div></div><h2><a href="#node">Node</a></h2>
        <ul>
            <li><h3><a href="#variable">Variable</a></h3></li>
            <li><h3><a href="#constant">Constant</a></h3></li>
        </ul>
    </li>
    <li><div></div><h2><a href="#example">Example</a></h2>
        <ul>
            <li><h3><a href="#ex-cnn">CNN</a></h3></li>
            <li><h3><a href="#ex-lstm">LSTM</a></h3></li>
        </ul>
    </li>
</ol>

<h2 id="layer">Layer</h2>

<div style="color: red" id="input"><i>- Input(shape: Tuple,  value: ndarray = None, **kwargs)</i></div>

+ shape: input data's shape, for example, (None, C, H, W) or (None, features).
+ value: this layer's input and output tensor's value.

In [13]:
from shinnosuke.models import Model
from shinnosuke.layers import Input

# Declare the network input; the shape follows the (None, C, H, W) convention described above.
X = Input(shape=(None, 10, 5, 5))
# Use the Input node as both input and output, so the summary lists a single layer with 0 params.
model = Model(inputs=X, outputs=X)
model.compile(optimizer='sgd', loss='mse')
print(model)

***************************************************************************
Layer(type)          Output Shape         Param        Connected to   
###########################################################################
Input                (None, 10, 5, 5)     0            
              
---------------------------------------------------------------------------
***************************************************************************
Total params: 0
Trainable params: 0
Non-trainable params: 0



<div style="color: red" id="dense"><i>- Dense(n_out: int, n_in: int = None, initializer='glorot_uniform', activation='linear', kernel_regularizer=None, **kwargs)</i></div>

+ n_out: number of output features.

+ n_in: number of input features; must be specified when this is the first layer of the network.

+ initializer: kernel and bias initialize method. see details in <a href='#Initializers'>Initializers</a>

+ activation: activation function. see details in <a href='#Activations'>Activations</a>

+ kernel_regularizer: not implemented.

In [7]:
from shinnosuke.models import Sequential
from shinnosuke.layers import Dense

# Stack two Dense layers in a Sequential model; only the first layer is given n_in.
model = Sequential()
model.add(Dense(n_out=100, n_in=500, activation='relu', name='dense1'))  # must specify n_in if this is the first layer of network
model.add(Dense(n_out=10, name='dense2'))
model.compile(loss='mse', optimizer='adam')
# Printed param counts: 500*100 + 100 = 50100 for dense1 and 100*10 + 10 = 1010 for dense2.
print(model)

***************************************************************************
Layer(type)          Output Shape         Param        Connected to   
###########################################################################
dense1 (Dense)       (None, 100)          50100        
              
---------------------------------------------------------------------------
dense2 (Dense)       (None, 10)           1010         dense1         
---------------------------------------------------------------------------
***************************************************************************
Total params: 51110
Trainable params: 51110
Non-trainable params: 0



<div style="color: red" id="flatten"><i>- Flatten(out_dim: int = 2, **kwargs)</i></div>

+ out_dim: after flatten, the output data's dimension. for example, input data's shape is (N, C, H, W), out_dim = 2 will convert output data's shape to (N, $C \times H \times W$) and out_dim = 3 will convert output data's shape to (N, C, $H \times W$).

In [9]:
from shinnosuke.models import Model
from shinnosuke.layers import Flatten
from shinnosuke.layers import Input

X_input = Input(shape=(None, 10, 5, 8))
# out_dim=2 collapses every non-batch axis: 10 * 5 * 8 = 400 features per sample.
X = Flatten(out_dim=2)(X_input)
model = Model(inputs=X_input, outputs=X)
model.compile(optimizer='sgd', loss='mse')
print(model)

***************************************************************************
Layer(type)          Output Shape         Param        Connected to   
###########################################################################
Input                (None, 10, 5, 8)     0            
              
---------------------------------------------------------------------------
Flatten              (None, 400)          0            Input          
---------------------------------------------------------------------------
***************************************************************************
Total params: 0
Trainable params: 0
Non-trainable params: 0



<div style="color: red" id="pad"><i>- ZeroPadding2D(pad_size: Tuple, **kwargs)</i></div>

+ pad_size: for example, (1, 1), which means pad input(N, C, H, W) to (N, C, H+2, W+2).

In [10]:
from shinnosuke.models import Model
from shinnosuke.layers import ZeroPadding2D
from shinnosuke.layers import Input

X_input = Input(shape=(None, 10, 5, 5))
# Padding 2 on each side of H and W: 5 + 2*2 = 9, as shown in the printed summary.
X = ZeroPadding2D(pad_size=(2, 2))(X_input)
model = Model(inputs=X_input, outputs=X)
model.compile(optimizer='sgd', loss='mse')
print(model)

***************************************************************************
Layer(type)          Output Shape         Param        Connected to   
###########################################################################
Input                (None, 10, 5, 5)     0            
              
---------------------------------------------------------------------------
ZeroPadding2D        (None, 10, 9, 9)     0            Input          
---------------------------------------------------------------------------
***************************************************************************
Total params: 0
Trainable params: 0
Non-trainable params: 0



<div style="color: red" id="conv"><i>- Conv2D(filter_nums: int, filter_size: Tuple, input_shape: Tuple = None, stride: int = 1, padding: str = 'VALID', activation = 'linear',initializer = 'Normal', **kwargs)</i></div>

+ filter_nums: number of filters, i.e. the number of output channels.

+ filter_size: filter's size. for example, (3, 3) or 3.

+ input_shape: must specify if this is the first layer of network.

+ stride: convolution stride.

+ padding: 'SAME' or 'VALID', 'VALID' means no padding, 'SAME' means pad input to get the same output size as input.

+ activation: activation function. see details in <a href='#Activations'>Activations</a>

+ initializer: kernel and bias initialize method. see details in <a href='#Initializers'>Initializers</a>

In [11]:
from shinnosuke.models import Model
from shinnosuke.layers import Conv2D
from shinnosuke.layers import Input

X_input = Input(shape=(None, 3, 24, 24))
# 'VALID' 3x3 convolution with stride 1: 24 - 3 + 1 = 22 per spatial axis.
# Param count in the summary: 3*3*3*16 weights + 16 biases = 448.
X = Conv2D(filter_nums=16, filter_size=(3, 3), stride=1, padding='VALID', activation='relu')(X_input)
model = Model(inputs=X_input, outputs=X)
model.compile(optimizer='sgd', loss='bce')
print(model)

***************************************************************************
Layer(type)          Output Shape         Param        Connected to   
###########################################################################
Input                (None, 3, 24, 24)    0            
              
---------------------------------------------------------------------------
Conv2D               (None, 16, 22, 22)   448          Input          
---------------------------------------------------------------------------
***************************************************************************
Total params: 448
Trainable params: 448
Non-trainable params: 0



<div style="color: red" id="maxpool"><i>- MaxPooling2D(pool_size: Tuple, stride: int = None, **kwargs)</i></div>

+ pool_size: pooling kernel size, for example (2, 2) means apply max pooling in every 2 x 2 area.
+ stride: pooling stride.

<div style="color: red" id="meanpool"><i>- MeanPooling2D(pool_size: Tuple, stride: int = None, **kwargs)</i></div>

+ pool_size: pooling kernel size, for example (2, 2) means apply mean pooling in every 2 x 2 area.
+ stride: pooling stride.

In [12]:
from shinnosuke.models import Model
from shinnosuke.layers import MaxPooling2D, MeanPooling2D
from shinnosuke.layers import Input

X_input = Input(shape=(None, 3, 24, 24))
# Chain both pooling layers; with stride left unset, each 2x2 pool halves H and W (24 -> 12 -> 6 in the summary).
X = MaxPooling2D(pool_size=(2, 2))(X_input)
X = MeanPooling2D(pool_size=(2, 2))(X)
model = Model(inputs=X_input, outputs=X)
model.compile(optimizer='sgd', loss='bce')
print(model)

***************************************************************************
Layer(type)          Output Shape         Param        Connected to   
###########################################################################
Input                (None, 3, 24, 24)    0            
              
---------------------------------------------------------------------------
MaxPooling2D         (None, 3, 12, 12)    0            Input          
---------------------------------------------------------------------------
MeanPooling2D        (None, 3, 6, 6)      0            MaxPooling2D   
---------------------------------------------------------------------------
***************************************************************************
Total params: 0
Trainable params: 0
Non-trainable params: 0



<div style="color: red" id="act"><i>- Activation(act_name='relu')</i></div>

+ act_name: activation function name, support ReLU, Sigmoid, etc. see details in <a href='#Activations'>Activations</a>

In [14]:
from shinnosuke.models import Model
from shinnosuke.layers import Activation
from shinnosuke.layers import Input

X_input = Input(shape=(None, 3, 24, 24))
# Chain three activation layers; each keeps the input shape and adds no parameters.
X = Activation('relu')(X_input)
X = Activation('sigmoid')(X)
X = Activation('softmax')(X)
model = Model(inputs=X_input, outputs=X)
model.compile(optimizer='sgd', loss='cross_entropy')
print(model)

***************************************************************************
Layer(type)          Output Shape         Param        Connected to   
###########################################################################
Input                (None, 3, 24, 24)    0            
              
---------------------------------------------------------------------------
Activation           (None, 3, 24, 24)    0            Input          
---------------------------------------------------------------------------
Activation           (None, 3, 24, 24)    0            Activation     
---------------------------------------------------------------------------
Activation           (None, 3, 24, 24)    0            Activation     
---------------------------------------------------------------------------
***************************************************************************
Total params: 0
Trainable params: 0
Non-trainable params: 0



<div style="color: red" id="reshape"><i>- Reshape(output_shape: Tuple, **kwargs)</i></div>

+ output_shape: shape after reshape operations.

In [1]:
from shinnosuke.models import Model
from shinnosuke.layers import Reshape
from shinnosuke.layers import Input

X_input = Input(shape=(None, 3, 5, 4))
# The per-sample element count must match: 3 * 5 * 4 = 60 = 12 * 5.
X = Reshape(output_shape=(None, 12, 5))(X_input)
model = Model(inputs=X_input, outputs=X)
model.compile(optimizer='sgd', loss='cross_entropy')
print(model)

***************************************************************************
Layer(type)          Output Shape         Param        Connected to   
###########################################################################
Input                (None, 3, 5, 4)      0            
              
---------------------------------------------------------------------------
Reshape              (None, 12, 5)        0            Input          
---------------------------------------------------------------------------
***************************************************************************
Total params: 0
Trainable params: 0
Non-trainable params: 0



<div style="color: red" id="dropout"><i>- Dropout(keep_prob)</i></div>

+ keep_prob:  probability of keeping a unit active.

In [2]:
from shinnosuke.models import Model
from shinnosuke.layers import Dropout
from shinnosuke.layers import Input

X_input = Input(shape=(None, 500))
# keep_prob is the probability of keeping a unit active; the layer keeps the shape and has no parameters.
X = Dropout(keep_prob=0.5)(X_input)
model = Model(inputs=X_input, outputs=X)
model.compile(optimizer='sgd', loss='mse')
print(model)

***************************************************************************
Layer(type)          Output Shape         Param        Connected to   
###########################################################################
Input                (None, 500)          0            
              
---------------------------------------------------------------------------
Dropout              (None, 500)          0            Input          
---------------------------------------------------------------------------
***************************************************************************
Total params: 0
Trainable params: 0
Non-trainable params: 0



<div style="color: red" id="bn"><i>- Batch Normalization(epsilon=1e-6, momentum=0.9, axis=1, gamma_initializer='ones', beta_initializer='zeros', moving_mean_initializer='zeros', moving_variance_initializer='ones')</i></div>
$$
u_B = \frac{1}{m} \sum \limits_{i=1}^m x_i  \quad \quad mini-batch \quad mean
\\
\sigma_B^2 = \frac{1}{m} \sum \limits_{i=1}^m (x_i - u_B)^2  \quad \quad mini-batch \quad variance
\\
\hat x_i = \frac{x_i - u_B}{\sqrt{\sigma_B^2 + \epsilon}}   \quad \quad normalize
\\
y_i = \gamma \hat x_i + \beta  \quad \quad scale \quad and \quad shift
$$


+ epsilon:  $\epsilon$ value.
+ momentum: at training time, moving averages of the mini-batch statistics are maintained: the moving mean is updated as $moving\_u = momentum * moving\_u + (1 - momentum) * u_B$ and the moving variance as $moving\_\sigma = momentum * moving\_\sigma + (1 - momentum) * \sigma_B^2$.
+ axis: use normalization on which axis, for Dense Layer, it should be 1 or -1, for Convolution Layer, it should be 1.
+ gamma_initializer: initialize $\gamma$ method. see details in <a href='#Initializers'>Initializers</a>
+ beta_initializer: initialize $\beta$ method. see details in <a href='#Initializers'>Initializers</a>
+ moving_mean_initializer: initialize $moving\_u$ method. see details in <a href='#Initializers'>Initializers</a>
+ moving_variance_initializer: initialize $moving\_\sigma$ method. see details in <a href='#Initializers'>Initializers</a>

<div style="color: red" id="ln"><i>- Layer Normalization(epsilon=1e-10, gamma_initializer='ones', beta_initializer='zeros')</i></div>
$$
u = \frac{1}{CHW} \sum \limits_{i=1}^C \sum \limits_{j=1}^H \sum \limits_{k=1}^W x_{ijk}  \quad \quad sample \quad mean
\\
\sigma^2 = \frac{1}{CHW} \sum \limits_{i=1}^C \sum \limits_{j=1}^H \sum \limits_{k=1}^W (x_{ijk} - u)^2  \quad \quad sample \quad variance
\\
\hat x = \frac{x - u}{\sqrt{\sigma^2 + \epsilon}}   \quad \quad normalize
\\
y = \gamma \hat x + \beta  \quad \quad scale \quad and \quad shift
$$


+ epsilon:  $\epsilon$ value.
+ gamma_initializer: initialize $\gamma$ method. see details in <a href='#Initializers'>Initializers</a>
+ beta_initializer: initialize $\beta$ method. see details in <a href='#Initializers'>Initializers</a>

<div style="color: red" id="gn"><i>- Group Normalization(epsilon=1e-5, G=16,gamma_initializer='ones', beta_initializer='zeros')</i></div>
Split the channels into G groups and apply layer normalization to each group separately (in the formulas below, C denotes the number of channels in one group).
$$
u = \frac{1}{CHW} \sum \limits_{i=1}^C \sum \limits_{j=1}^H \sum \limits_{k=1}^W x_{ijk}  \quad \quad sample \quad mean
\\
\sigma^2 = \frac{1}{CHW} \sum \limits_{i=1}^C \sum \limits_{j=1}^H \sum \limits_{k=1}^W (x_{ijk} - u)^2  \quad \quad sample \quad variance
\\
\hat x = \frac{x - u}{\sqrt{\sigma^2 + \epsilon}}   \quad \quad normalize
\\
y = \gamma \hat x + \beta  \quad \quad scale \quad and \quad shift
$$


+ epsilon:  $\epsilon$ value.
+ G: group numbers.
+ gamma_initializer: initialize $\gamma$ method. see details in <a href='#Initializers'>Initializers</a>
+ beta_initializer: initialize $\beta$ method. see details in <a href='#Initializers'>Initializers</a>

In [3]:
from shinnosuke.models import Model
from shinnosuke.layers import BatchNormalization, LayerNormalization, GroupNormalization
from shinnosuke.layers import Input

# 16 channels, so the default G=16 of Group Normalization (see its signature above) divides the channel axis evenly.
X_input = Input(shape=(None, 16, 5, 5))
# All three layers use their default arguments and keep the input shape.
X = BatchNormalization()(X_input)
X = LayerNormalization()(X)
X = GroupNormalization()(X)
model = Model(inputs=X_input, outputs=X)
model.compile(optimizer='sgd', loss='mse')
# Printed param counts: 32 = 2 * 16 and 800 = 2 * 16*5*5 -- presumably one gamma/beta pair
# per channel (batch/group norm) and per element (layer norm); confirm against the library.
print(model)

***************************************************************************
Layer(type)          Output Shape         Param        Connected to   
###########################################################################
Input                (None, 16, 5, 5)     0            
              
---------------------------------------------------------------------------
BatchNormalization   (None, 16, 5, 5)     32           Input          
---------------------------------------------------------------------------
LayerNormalization   (None, 16, 5, 5)     800          BatchNormalization
---------------------------------------------------------------------------
GroupNormalization   (None, 16, 5, 5)     32           LayerNormalization
---------------------------------------------------------------------------
***************************************************************************
Total params: 864
Trainable params: 864
Non-trainable params: 0



<div style="color: red" id="embed"><i>- Embedding(input_dim, output_dim,embeddings_initializer='uniform', mask_zero=False, input_length=None, **kwargs)</i></div>

+ input_dim: size of the vocabulary; every integer index in the input data of shape (N, T) must be smaller than input_dim.
+ output_dim: size of each embedding vector; for example, with output_dim = E an input of shape (N, T) becomes (N, T, E) after embedding.
+ embeddings_initializer: embedding kernel initialize method. see details in <a href='#Initializers'>Initializers</a>
+ mask_zero: use masks.
+ input_length: must specify if this layer is first layer of network.

In [4]:
from shinnosuke.models import Sequential
from shinnosuke.layers import Embedding

model = Sequential()
# As the first layer, Embedding must be given input_length; the kernel holds 5000 * 200 = 1000000 params.
model.add(Embedding(input_dim=5000, output_dim=200, input_length=30))
model.compile(optimizer='sgd', loss='mse')
print(model)

***************************************************************************
Layer(type)          Output Shape         Param        Connected to   
###########################################################################
Embedding            (None, 30, 200)      1000000      
              
---------------------------------------------------------------------------
***************************************************************************
Total params: 1000000
Trainable params: 1000000
Non-trainable params: 0



<div style="color: red" id="rnn"><i>- SimpleRNN(units, activation='tanh', initializer='glorotuniform', recurrent_initializer='orthogonal', return_sequences=False, return_state=False, stateful=False, **kwargs)</i></div>

$$
z^t = W_{aa}\cdot a^{t-1} + W_{xa}\cdot x^t +b_a
\\
a^t = activation(z^t)
$$

+ units: number of rnn hidden units; for example, with units = a an input of shape (N, T, L) yields (N, a) by default, or (N, T, a) when return_sequences=True.
+ activation: activation method. see details in <a href='#Activations'>Activations</a>
+ initializer: $W_{xa}$ initialize method. see details in <a href='#Initializers'>Initializers</a>
+ recurrent_initializer: $W_{aa}$ initialize method. see details in <a href='#Initializers'>Initializers</a>
+ return_sequences: if True, return all timesteps a $\rightarrow$ $[a^1, a^2,..., a^t]$; if False, return the last timesteps $a^t$.
+ return_state: if True, return return_sequences' result and all timesteps a.
+ stateful: if True, use last time $a^t$ to initialize this time $a^1$; if False, use 0 to initialize this time $a^1$.

In [5]:
from shinnosuke.models import Model
from shinnosuke.layers import SimpleRNN
from shinnosuke.layers import Input

X_input = Input(shape=(None, 30, 200))
# return_sequences defaults to False, so only the last timestep is returned: (None, 50).
# Printed param count: 200*50 + 50*50 + 50 = 12550.
X = SimpleRNN(units=50)(X_input)
model = Model(inputs=X_input, outputs=X)
model.compile(optimizer='sgd', loss='mse')
print(model)

***************************************************************************
Layer(type)          Output Shape         Param        Connected to   
###########################################################################
Input                (None, 30, 200)      0            
              
---------------------------------------------------------------------------
SimpleRNN            (None, 50)           12550        Input          
---------------------------------------------------------------------------
***************************************************************************
Total params: 12550
Trainable params: 12550
Non-trainable params: 0



<div style="color: red" id="lstm"><i>- LSTM(units, activation='tanh', recurrent_activation='sigmoid', initializer='glorotuniform', recurrent_initializer='orthogonal', unit_forget_bias=True, return_sequences=False, return_state=False, stateful=False, **kwargs)</i></div>

At every timestep:

$$
i^t = recurrent\_activation(W_i[a^{t-1}, x^t] + b_i)
\\
f^t = recurrent\_activation(W_f[a^{t-1}, x^t] + b_f)
\\
\tilde c^t = activation(W_c[a^{t-1}, x^t] + b_c)
\\
c^t = f^t \cdot c^{t-1} + i^t \cdot \tilde c^t
\\
o^t = recurrent\_activation(W_o[a^{t-1}, x^t] + b_o)
\\
a^t = o^t \cdot tanh(c^t)
$$

+ units: lstm hidden unit numbers.
+ activation: activation method. see details in <a href='#Activations'>Activations</a>
+ recurrent_activation: activation method. see details in <a href='#Activations'>Activations</a>
+ initializer: $W_{xa}$ initialize method. see details in <a href='#Initializers'>Initializers</a>
+ recurrent_initializer: $W_{aa}$ initialize method. see details in <a href='#Initializers'>Initializers</a>
+ unit_forget_bias: if True, initialize $f^t$ bias $b_f$ as 1, else 0.
+ return_sequences: if True, return all timesteps a $\rightarrow$ $[a^1, a^2,..., a^t]$; if False, return the last timesteps $a^t$.
+ return_state: if True, return return_sequences' result and all timesteps a.
+ stateful: if True, use last time $a^t$ to initialize this time $a^1$; if False, use 0 to initialize this time $a^1$.

In [1]:
from shinnosuke.models import Model
from shinnosuke.layers import LSTM
from shinnosuke.layers import Input

X_input = Input(shape=(None, 30, 200))
# return_sequences=True keeps every timestep: (None, 30, 50).
# Printed param count: 4 * ((200 + 50) * 50 + 50) = 50200, i.e. four gates over [a, x] plus bias.
X = LSTM(units=50, return_sequences=True)(X_input)
model = Model(inputs=X_input, outputs=X)
model.compile(optimizer='sgd', loss='mse')
print(model)

***************************************************************************
Layer(type)          Output Shape         Param        Connected to   
###########################################################################
Input                (None, 30, 200)      0            
              
---------------------------------------------------------------------------
LSTM                 (None, 30, 50)       50200        Input          
---------------------------------------------------------------------------
***************************************************************************
Total params: 50200
Trainable params: 50200
Non-trainable params: 0



<div style="color: red" id="timedist"><i>- TimeDistributed(layer, **kwargs)</i></div>

+ layer: the layer to apply to every timestep of the input.


In [3]:
from shinnosuke.layers import Input, Dense, LSTM, TimeDistributed
from shinnosuke.models import Model

X_input = Input(shape=(None, 25, 97))
# The LSTM must return the full sequence so TimeDistributed has a time axis to iterate over.
X = LSTM(units=100, return_sequences=True, stateful=True)(X_input)
# Dense(50) is applied at each of the 25 timesteps: (None, 25, 100) -> (None, 25, 50), 100*50 + 50 = 5050 params.
X = TimeDistributed(Dense(50))(X)
model = Model(inputs=X_input, outputs=X)
model.compile(optimizer='sgd', loss='mse')
print(model)

***************************************************************************
Layer(type)          Output Shape         Param        Connected to   
###########################################################################
Input                (None, 25, 97)       0            
              
---------------------------------------------------------------------------
LSTM                 (None, 25, 100)      79200        Input          
---------------------------------------------------------------------------
TimeDistributed      (None, 25, 50)       5050         LSTM           
---------------------------------------------------------------------------
***************************************************************************
Total params: 84250
Trainable params: 84250
Non-trainable params: 0



<h2 id="node">Node</h2>

<div style="color: red" id="variable"><i>- Variable(initial_value=None, shape=None,name='variable')</i></div>

+ initial_value: initial value of this variable.
+ shape: this variable's shape.
+ name: this variable's name.

In [5]:
from shinnosuke import Variable

# A Variable is mutable: assigning to output_tensor replaces its value (3 -> 2 in the printed output).
a = Variable(initial_value=3)
print('before change: ', a)
# change variable's value
a.output_tensor = 2
print('after change: ',a)

before change:  3
after change:  2


<div style="color: red" id="constant"><i>- Constant(output_tensor, name='constant')</i></div>

+ output_tensor: the value of this constant; once initialized, it cannot be changed.
+ name: this constant's name.

In [6]:
from shinnosuke import Constant

a = Constant(3)
print('before change: ', a)
# try to overwrite the constant's value; the assignment has no effect (the printed value stays 3)
a.output_tensor = 2
print('after change: ',a)

before change:  3
after change:  3




<h2 id="example">Example</h2>

<h3 id="ex-cnn">CNN</h3>

In [5]:
from shinnosuke.layers import Conv2D, MaxPooling2D, Dense, Flatten, Dropout, BatchNormalization, Activation, Input
from shinnosuke.models import Model
from shinnosuke.utils import StochasticGradientDescent
# define model
def create_model():
    """Build the demo CNN: two Conv-BN-ReLU-MaxPool stages followed by a dense classifier head.

    Returns:
        Model: uncompiled functional model mapping a (None, 3, 64, 64) input
        to a 10-unit softmax output.
    """
    X_input = Input(shape=(None, 3, 64, 64))
    features = X_input
    # Convolutional stages: 8 filters, then 16, each followed by BN, ReLU and 2x2 max pooling.
    for filter_nums in (8, 16):
        features = Conv2D(filter_nums=filter_nums, filter_size=(3, 3), padding='SAME')(features)
        features = BatchNormalization()(features)
        features = Activation('relu')(features)
        features = MaxPooling2D(pool_size=(2, 2))(features)
    features = Flatten()(features)
    # Fully connected stages: 1000 then 500 ReLU units, each followed by dropout.
    for n_out in (1000, 500):
        features = Dense(n_out=n_out, activation='relu')(features)
        features = Dropout(keep_prob=0.8)(features)
    probabilities = Dense(n_out=10, activation='softmax')(features)
    return Model(inputs=X_input, outputs=probabilities)

# Build the CNN and compile it with an explicit SGD optimizer object instead of the 'sgd' string shortcut.
model = create_model()
sgd = StochasticGradientDescent(lr=0.01)
model.compile(loss='SparseCategoricalCrossEntropy', optimizer=sgd)
print(model)

***************************************************************************
Layer(type)          Output Shape         Param        Connected to   
###########################################################################
Input                (None, 3, 64, 64)    0            
              
---------------------------------------------------------------------------
Conv2D               (None, 8, 64, 64)    224          Input          
---------------------------------------------------------------------------
BatchNormalization   (None, 8, 64, 64)    16           Conv2D         
---------------------------------------------------------------------------
Activation           (None, 8, 64, 64)    0            BatchNormalization
---------------------------------------------------------------------------
MaxPooling2D         (None, 8, 32, 32)    0            Activation     
---------------------------------------------------------------------------
Conv2D               (None, 16, 32, 32)

In [6]:
import cupy as cp
from shinnosuke.utils import to_categorical
# randomly generate training data (seeded so the run is repeatable)
cp.random.seed(1)
X = cp.random.random((100, 3, 64, 64))
y = cp.random.randint(0, 10, size=(100, 1))
# to one-hot
y = to_categorical(y)

# `model` is the CNN compiled in the previous cell; validation_ratio=0. presumably holds out no validation data.
model.fit(X, y, batch_size=64, epochs=20, validation_ratio=0.)

Epoch[1/20]
Epoch[2/20]
Epoch[3/20]
Epoch[4/20]
Epoch[5/20]
Epoch[6/20]
Epoch[7/20]
Epoch[8/20]
Epoch[9/20]
Epoch[10/20]
Epoch[11/20]
Epoch[12/20]
Epoch[13/20]
Epoch[14/20]
Epoch[15/20]
Epoch[16/20]
Epoch[17/20]
Epoch[18/20]
Epoch[19/20]
Epoch[20/20]


<h3 id="ex-lstm">LSTM</h3>

In [21]:
from shinnosuke.models import Model
from shinnosuke.layers import Input, LSTM, TimeDistributed, Dense, Activation, Embedding
from shinnosuke.utils import StochasticGradientDescent

def create_model():
    """Build the demo sequence classifier: Embedding -> LSTM -> TimeDistributed(Dense) -> LSTM -> sigmoid unit.

    Returns:
        Model: uncompiled functional model mapping a (None, 20) input of
        integer indices to a single sigmoid output.
    """
    token_ids = Input(shape=(None, 20))
    embedded = Embedding(input_dim=500, output_dim=100)(token_ids)
    # The first LSTM keeps every timestep so the per-timestep Dense projection can follow it.
    sequence = LSTM(units=100, return_sequences=True)(embedded)
    projected = TimeDistributed(Dense(n_out=50))(sequence)
    # The second LSTM returns only its last timestep, which feeds the single-unit output head.
    last_state = LSTM(units=50)(projected)
    logit = Dense(n_out=1)(last_state)
    probability = Activation('sigmoid')(logit)
    return Model(inputs=token_ids, outputs=probability)

# Build the sequence classifier and compile it for binary classification (the network ends in a sigmoid unit).
model = create_model()
sgd = StochasticGradientDescent(lr=0.5)
model.compile(optimizer=sgd, loss='binary_cross_entropy')
print(model)

***************************************************************************
Layer(type)          Output Shape         Param        Connected to   
###########################################################################
Input                (None, 20)           0            
              
---------------------------------------------------------------------------
Embedding            (None, 20, 100)      50000        Input          
---------------------------------------------------------------------------
LSTM                 (None, 20, 100)      80400        Embedding      
---------------------------------------------------------------------------
TimeDistributed      (None, 20, 50)       5050         LSTM           
---------------------------------------------------------------------------
LSTM                 (None, 50)           20200        TimeDistributed
---------------------------------------------------------------------------
Dense                (None, 1)            

In [23]:
import cupy as cp
from shinnosuke.utils import to_categorical

# Seeded random data: integer indices in [0, 500) to match Embedding(input_dim=500), and 0/1 labels.
cp.random.seed(1)
X = cp.random.randint(0, 500, size=(100, 20))  # (N, timestep)
y = cp.random.randint(0, 2, size=(100, 1))


# `model` is the LSTM network compiled in the previous cell.
model.fit(X, y, batch_size=50, epochs=20, validation_ratio=0.)

Epoch[1/20]
Epoch[2/20]
Epoch[3/20]
Epoch[4/20]
Epoch[5/20]
Epoch[6/20]
Epoch[7/20]
Epoch[8/20]
Epoch[9/20]
Epoch[10/20]
Epoch[11/20]
Epoch[12/20]
Epoch[13/20]
Epoch[14/20]
Epoch[15/20]
Epoch[16/20]
Epoch[17/20]
Epoch[18/20]
Epoch[19/20]
Epoch[20/20]
