Several things here:

1) Running  a 1 D convol for text (comes from keras examples)
2) replicating the 2d convol from cs231n demo
3) See how to look at the output of each layer, to understand what is happening

<h2>Demonstrate using a 1-D convolution on text data</h2>

In [14]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

In [1]:
'''

Download the data

'''
import os
#to force tensorflow to run on CPU
#http://stackoverflow.com/questions/40690598/can-keras-with-tensorflow-backend-be-forced-to-use-cpu-or-gpu-at-will
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = ""

from __future__ import print_function
import numpy as np
np.random.seed(1337)  # for reproducibility

from keras.preprocessing import sequence
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Embedding
from keras.layers import Convolution1D, MaxPooling1D, Convolution2D, MaxPooling2D, GlobalAveragePooling1D, MaxPooling2D
from keras.datasets import imdb
from keras import backend as K
from keras.layers.convolutional import ZeroPadding2D


max_features = 5000

print('Loading data...')

#Top most frequent words to consider. Any less frequent word (than max_features) will appear as 0 in the sequence data
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=max_features)
print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')



Using TensorFlow backend.


Loading data...
25000 train sequences
25000 test sequences


In [7]:
#first review is a list of integers for each token and the length is 218. Length of second is 189

print(X_train[0])

print(len(X_train[0]))
print(len(X_train[1]))

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 2, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 2, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 2, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 2, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 2, 19, 178, 32]
218
189


In [10]:
maxlen = 400

#maxlen : maximum sequence length, longer sequences are truncated 
#and shorter sequences are padded with zeros at the end.

print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

print("\n\n")
print("So, we see above that each example is 400 numbers - these words are already preprocessed this way:\n")
print(X_train[0])

#We see the padding by default is on the left

#This changed the shape. It is now 2d numpy array where 
#each row is a review and each column is a 0 or integer indexing the word
print(type(X_train))

Pad sequences (samples x time)
X_train shape: (25000, 400)
X_test shape: (25000, 400)



So, we see above that each example is 400 numbers - these words are already preprocessed this way:

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0 

In [15]:
# set parameters:

embedding_dims = 50 

model = Sequential()



model.add(Embedding(input_dim=max_features+1, #Size of the vocabulary, 
                                            #ie. 1 + maximum integer index occurring in the input data to account for 0
                    output_dim=embedding_dims, #Dimension of the dense embedding (each token integer is mapped to a output_dim dimensional desne numeric vector)
                    input_length=maxlen, #Length of input sequences, when it is constant
                    mask_zero =False)) #convolution1d does not support masking: https://stackoverflow.com/questions/43392693/how-to-input-mask-value-to-convolution1d-layer



model.compile('rmsprop', 'mse')


In [26]:
#pass through the first example review
output_array = model.predict(X_train[0:1])
#note X_train[0].shape is be shape (400,1) and error results: ValueError: Error when checking : expected embedding_1_input to have shape (None, 400) but got array with shape (400, 1)
#While X_train[0:1] will be shape (1,400) which conforms to expected (batch,400) 



#result on first review:
output_array.shape #is (1, 400, 50) corrsponding to the single review (1), the sequence length (400) and tje word vector length (50). So, each sequence token gets a 50 dimension vector

(1, 400, 50)

In [93]:
#lets examime what a Convolution1D layer does with a small example.

np.random.seed(123)

#input is (3,4) integers. This would be like 3 examples where there are 4 words in each example and our vocabulary only consists of 3 words
X=np.array(([1,2,3,1],[1,2,3,3],[2,1,1,2]))
X.shape

mod_embed=Sequential()
mod_embed.add(Embedding(input_dim=3+1, #Size of the vocabulary, ie. 1 + maximum integer index occurring in the input data
                    output_dim=5, #Dimension of the dense embedding
                    input_length=4, #Length of input sequences, when it is constant
                    mask_zero =False)) 

print("\nThese are the embedding vectors:")
print(mod_embed.predict(X))
print("\nShape of embedding vectors:")
print(mod_embed.predict(X).shape) #3,4,5 each of the 4 words is mapped to a 4 dimensional row vector (Note that the same word (integer) is related to the same vector within and across examples - i.e. a token of '2' will be the same word vector within this example and all other examples)

#save out first embedding
np.savetxt('/home/jma/Documents/saved_embed.txt', mod_embed.predict(X)[0,:,:])

def my_init(shape, dtype=None):
    init=np.arange(30).reshape(3,5,2) #(filter height, length of word vectors, number of filters)
    return init 

mod_embed.add(Convolution1D(
                        filters=2, #just 2 filter
                        kernel_size=3, #filter height (how many words to cover at a time)
                        padding='valid', #no padding
                        strides=1 ,     
                        kernel_initializer=my_init 
))

#save out first filter
np.savetxt('/home/jma/Documents/saved_filter.txt',mod_embed.get_weights()[1][:,:,0])

print("\nThis is the filter:")
print(mod_embed.get_weights()[1])
print("\nAs expected its shape is (filter height, length of word vectors, number of filters): ")
print(mod_embed.get_weights()[1].shape)
print("\nfirst filter:")
print(mod_embed.get_weights()[1][:,:,0])


#this will result in 
print("\n\nShape of the output from the convolution")
print(mod_embed.predict(X[0:1]).shape)
print("\n\nThe output of the convolution- just two numbers from each filter")
print(mod_embed.predict(X[0:1]))
print("\n\nThe output of the convolution- for the first filter")
print(mod_embed.predict(X[0:1])[:,:,0])


These are the embedding vectors:
[[[-0.04300938  0.04198356 -0.0022753  -0.0404867   0.04836346]
  [-0.02434923 -0.00514651 -0.03838418 -0.00085073  0.02251348]
  [-0.00879489  0.03876739 -0.04208625 -0.03341036 -0.00847604]
  [-0.04300938  0.04198356 -0.0022753  -0.0404867   0.04836346]]

 [[-0.04300938  0.04198356 -0.0022753  -0.0404867   0.04836346]
  [-0.02434923 -0.00514651 -0.03838418 -0.00085073  0.02251348]
  [-0.00879489  0.03876739 -0.04208625 -0.03341036 -0.00847604]
  [-0.00879489  0.03876739 -0.04208625 -0.03341036 -0.00847604]]

 [[-0.02434923 -0.00514651 -0.03838418 -0.00085073  0.02251348]
  [-0.04300938  0.04198356 -0.0022753  -0.0404867   0.04836346]
  [-0.04300938  0.04198356 -0.0022753  -0.0404867   0.04836346]
  [-0.02434923 -0.00514651 -0.03838418 -0.00085073  0.02251348]]]

Shape of embedding vectors:
(3, 4, 5)

This is the filter:
[[[  0.   1.]
  [  2.   3.]
  [  4.   5.]
  [  6.   7.]
  [  8.   9.]]

 [[ 10.  11.]
  [ 12.  13.]
  [ 14.  15.]
  [ 16.  17.]
  [ 

Look at the spreadsheet new_cnnID. You will find the exact output as above

This (convolutional 1D on text) I guess can be thought of as 'kernel_size' - gram on text, becuase the dot product is 
over complete rows (the embedding vectors) for several words ('kernel size of them) at a time, then slides foward and so on.


Complete example in other notebook called Amazon_CNN_Text






<h2>Demonstrate using a 2-D convolution on image data</h2>


http://cs231n.github.io/convolutional-networks/#conv demo is the basis

In [10]:
N = 50
X = np.random.randn(N, 3,5, 5)  #creates the 3 channel data, 5x5 matrices
y = np.random.randint(1, size=N)
#print (X[0,:,:,:])

model = Sequential()

# number of convolutional filters, this is the number of "neurons"
n_filters = 2

# convolution filter size
# i.e. we will use a n_conv x n_conv filter
n_conv = 3

# pooling window size
# i.e. we will use a n_pool x n_pool pooling window
n_pool = 2

model.add(ZeroPadding2D(input_shape=(3, 5, 5),padding=(1,1)))  #this makes a 7x7 data input
model.add(Convolution2D(
        n_filters, n_conv, n_conv,

        # apply the filter to only full parts of the image
        # (i.e. do not "spill over" the border)
        # this is called a narrow convolution
        border_mode='valid',

        # we have a 5x5 RGB channel
        # so the input shape should be (3,5,5)
        #input_shape=(3, 5, 5),
        
        subsample=(2, 2) #this is STRIDE (left to right and top to bottom),
        
))

model.add(Activation('relu'))

model.add(MaxPooling2D(pool_size=(n_pool, n_pool)))

# flatten the data for the 1D layers
model.add(Flatten())

# Dense(n_outputs)
model.add(Dense(10))


# the softmax output layer gives us a probablity for each class
model.add(Dense(1))
model.add(Activation('linear'))

model.compile(loss='mse',
    optimizer='adam',
    metrics=['accuracy'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
zero_padding2d_2 (ZeroPaddin (None, 5, 7, 5)           0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 2, 3, 2)           92        
_________________________________________________________________
activation_2 (Activation)    (None, 2, 3, 2)           0         
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 1, 1, 2)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 2)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 10)                30        
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 11        
__________



Above we see that the zero padding creates a 7x7 matrix and the convolution outputs 2x3x3 just like the cs
231 example demo. 

Next we should see how many weights there are in Convolution. In the demo there are 2 sets (one for each filter) of 3x3x3 (size of the filter and number of channels)

In [None]:
# how many examples to look at during each training iteration
batch_size = 1

# how many times to run through the full set of examples
n_epochs = 1

model.fit(X,
          y,
          batch_size=batch_size,
          nb_epoch=n_epochs)


In [None]:
#SUCCESS!

#[0] is the padding, [1] is the convolution
#we see that indeed weights are 2x3x3x3!
weights=model.layers[1].get_weights()[0]
print (weights)
print (weights.shape)

#and the biases are 2x1!
biases=model.layers[1].get_weights()[1]
print (biases)
print (biases.shape)

Now interested in see the input and output to a given layer. 
This is interesteding to do for toy example for example to be able to 
see how the layers actually work on data

In [13]:
import numpy as np
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten,Merge,Input
from keras.layers import Embedding
from keras.layers import Convolution1D, MaxPooling1D, Convolution2D, MaxPooling2D
from keras import backend as K
from keras.layers.convolutional import ZeroPadding2D
from keras.models import Model

In [14]:
#FIT A SIMPLE MODEL

N = 50
X = np.random.randn(N, 3,5, 5)  #creates the 3 channel data, 5x5 matrices
y = np.random.randint(1, size=N)

model = Sequential()

# number of convolutional filters, this is the number of "neurons"
n_filters = 2

# convolution filter size
# i.e. we will use a n_conv x n_conv filter
n_conv = 3

# pooling window size
# i.e. we will use a n_pool x n_pool pooling window
n_pool = 2

# we have a 5x5 image with RGB channel
# so the input shape should be (3,5,5)
model.add(ZeroPadding2D(input_shape=(3, 5, 5),padding=(1,1)))  #this makes a 7x7 data input

model.add(Convolution2D(
        
        n_filters, n_conv, n_conv,

        # apply the filter to only full parts of the image
        # (i.e. do not "spill over" the border)
        # this is called a narrow convolution
        border_mode='valid',


        subsample=(2, 2) #this is STRIDE (left to right and top to bottom),
        
))

model.add(Activation('relu'))

model.add(MaxPooling2D(pool_size=(n_pool, n_pool)))

# flatten the data for the 1D layers
model.add(Flatten())

# Dense(n_outputs)
model.add(Dense(10))


# the softmax output layer gives us a probablity for each class
model.add(Dense(1))
model.add(Activation('linear'))

model.compile(loss='mse',
    optimizer='adam',
    metrics=['accuracy'])

print (model.summary())



# how many examples to look at during each training iteration
batch_size = 1

# how many times to run through the full set of examples
n_epochs = 1

model.fit(X,
          y,
          batch_size=batch_size,
          nb_epoch=n_epochs)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
zero_padding2d_4 (ZeroPaddin (None, 5, 7, 5)           0         
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 2, 3, 2)           92        
_________________________________________________________________
activation_6 (Activation)    (None, 2, 3, 2)           0         
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 1, 1, 2)           0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 2)                 0         
_________________________________________________________________
dense_5 (Dense)              (None, 10)                30        
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 11        
__________





<keras.callbacks.History at 0x7f802a669c50>

In [26]:
#Function to return the array passed into a layer and the output of the layer to examine how a layer actually works on it's input (X is the small data you pass into the layer of
#interest and the index is determined from the summary above (zero based of course):

def input_output (layer_index,X,model):
    get_layer_output = K.function([model.layers[layer_index].input], [model.layers[layer_index].output])
    layer_output = get_layer_output([X])[0]
    return (X,layer_output)
    

#Create small tensor replicating the shape of data coming into the Convolution2D (second layer,index =1)
x=np.random.randn(1,5,7, 5) #into the convol

input,output =input_output(1,x,model)

output.shape




IndexError: list index out of range

now try to see about embedding

In [23]:
N = 50
X = np.array(([0,2,3],[2,2,1],[2,2,1]))

model = Sequential()
#input length = Size of the vocabulary, ie. 1 + maximum integer index occurring in the input data.
#output_dim =Dimension of the dense embedding
#inut_length = length of input sequences, when it is constant
model.add(Embedding(input_dim=4, output_dim=4, input_length=3))

model.compile('rmsprop', 'mse')
  
output_array = model.predict(X)
print (output_array)


[[[-0.01539054 -0.02353692  0.02032514  0.03455081]
  [-0.03127869  0.04872128 -0.02925384 -0.04018258]
  [ 0.00702439 -0.03493382  0.0065612   0.03262452]]

 [[-0.03127869  0.04872128 -0.02925384 -0.04018258]
  [-0.03127869  0.04872128 -0.02925384 -0.04018258]
  [-0.04798755 -0.01212469 -0.01046811  0.04191699]]

 [[-0.03127869  0.04872128 -0.02925384 -0.04018258]
  [-0.03127869  0.04872128 -0.02925384 -0.04018258]
  [-0.04798755 -0.01212469 -0.01046811  0.04191699]]]
