In [1]:
import caffe
from caffe import layers as L
from caffe import params as P

We begin by defining the structure of LeNet in Python code, rather than editing the protobuf (prototxt) file directly. From there we can build the teacher-student model in the same way.

https://github.com/nigroup/nideep/blob/master/examples/parallel_train/verify_parallel_training.ipynb


In [212]:
# LMDB locations under the Caffe MNIST examples directory.
train_source, test_source = (
    "/opt/caffe/examples/mnist/mnist_train_lmdb",
    "/opt/caffe/examples/mnist/mnist_test_lmdb",
)

# Batch sizes used by the TRAIN-phase and TEST-phase data layers.
train_bs, test_bs = 64, 100

In [175]:
def le_net(train_lmdb, test_lmdb, batch_size = 64):
    """Describe LeNet with ``caffe.NetSpec`` and return the resulting proto.

    Args:
        train_lmdb: path of the LMDB read by the TRAIN-phase data layer.
        test_lmdb: path of the LMDB read by the TEST-phase data layer.
        batch_size: batch size of the TRAIN-phase data layer. The TEST-phase
            layer takes its batch size from the notebook-level ``test_bs``.

    Returns:
        The value of ``net.to_proto()``; ``str()`` of it is the prototxt text.

    Note:
        The TEST data layer is assigned to ``net.test_data``/``net.test_label``
        and is also given ``top=["data", "label"]``, so the generated prototxt
        lists four tops for it. The ``test_data`` and ``test_label`` tops have
        to be removed from the prototxt by hand.
    """
    net = caffe.NetSpec() # creates empty network
    # Honour the batch_size argument; previously the global train_bs was used
    # here and the argument was silently ignored.
    net.data, net.label = L.Data(batch_size=batch_size,
                                 backend=P.Data.LMDB,
                                 source=train_lmdb,
                                 transform_param={"scale":1.0/255.0},
                                 ntop=2,
                                 include={"phase":caffe.TRAIN}
                                )
    net.test_data, net.test_label = L.Data(top=["data", "label"],
                                           batch_size=test_bs,
                                           backend=P.Data.LMDB,
                                           source=test_lmdb,
                                           transform_param={"scale":1.0/255.0},
                                           ntop=2,
                                           include={"phase":caffe.TEST}
                                          )
    # The first convolution refers to its bottom by name ("data") so that the
    # same layer is fed by whichever data layer is active in the current phase.
    net.conv1 = L.Convolution(bottom = "data", # does a 5x5 convolution with stride 1 on net.data
                              kernel_h=5, # or use kernel_size
                              kernel_w=5,
                              stride_h=1,
                              stride_w=1,
                              num_output=20,
                              weight_filler={"type":"xavier"}, # use Xavier weight initializations
                              bias_filler={"type":"constant"}, # biases start off at 0
                              param=[{"lr_mult":1}, # weights have standard learning rate
                                     {"lr_mult":2}] # bias has double learning rate
                             )
    net.pool1 = L.Pooling(net.conv1, # 2x2 maxPooling with stride 2 on net.conv1
                          kernel_size=2,
                          stride_h=2, # or use stride
                          stride_w=2,
                          pool=P.Pooling.MAX
                         )
    net.conv2 = L.Convolution(net.pool1,
                              kernel_h=5,
                              kernel_w=5,
                              stride_h=1,
                              stride_w=1,
                              num_output=50,
                              weight_filler={"type":"xavier"},
                              bias_filler={"type":"constant"},
                              param=[{"lr_mult":1},
                                     {"lr_mult":2}]
                             )
    net.pool2 = L.Pooling(net.conv2,
                          kernel_h=2,
                          kernel_w=2,
                          stride=2,
                          pool=P.Pooling.MAX
                         )
    net.ip1 = L.InnerProduct(net.pool2,
                             num_output=500,
                             weight_filler={"type":"xavier"},
                             bias_filler={"type":"constant"},
                             param=[{"lr_mult":1},
                                    {"lr_mult":2}]
                            )
    net.relu1 = L.ReLU(net.ip1,
                       in_place=True
                      )
    # NOTE(review): 100 outputs although the data paths used in this notebook
    # point at MNIST (10 classes) - confirm this width is intended.
    net.ip2 = L.InnerProduct(net.relu1,
                             num_output=100,
                             weight_filler={"type":"xavier"},
                             bias_filler={"type":"constant"},
                             param=[{"lr_mult":1},
                                    {"lr_mult":2}]
                            )
    # Accuracy is only emitted for the TEST phase; the loss is present in both.
    net.accuracy = L.Accuracy(net.ip2, net.label,
                              include={"phase": caffe.TEST}
                             )
    net.loss = L.SoftmaxWithLoss(net.ip2, net.label)
    return net.to_proto()

In [235]:
def teacher_le_net(train_lmdb, test_lmdb, batch_size = 64):
    """Describe the teacher LeNet (layers suffixed ``_t``) and return its proto.

    The layer names (``conv_t1``, ``pool_t1``, ..., ``ip_t2``) are the same as
    those of the teacher branch built by ``ts_le_net``.

    Args:
        train_lmdb: path of the LMDB read by the TRAIN-phase data layer.
        test_lmdb: path of the LMDB read by the TEST-phase data layer.
        batch_size: batch size of the TRAIN-phase data layer. The TEST-phase
            layer takes its batch size from the notebook-level ``test_bs``.

    Returns:
        The value of ``net.to_proto()``; ``str()`` of it is the prototxt text.

    Note:
        The TEST data layer is emitted with four tops (``test_data``,
        ``test_label``, ``data``, ``label``); the first two have to be deleted
        from the generated prototxt by hand.
    """
    net = caffe.NetSpec() # creates empty network
    # data layers
    # Honour the batch_size argument; previously the global train_bs was used
    # here and the argument was silently ignored.
    net.data, net.label = L.Data(batch_size=batch_size, backend=P.Data.LMDB, source=train_lmdb,
                                 transform_param={"scale":1.0/255.0}, ntop=2, include={"phase":caffe.TRAIN})
    net.test_data, net.test_label = L.Data(top=["data", "label"], batch_size=test_bs, backend=P.Data.LMDB,
                                           source=test_lmdb, transform_param={"scale":1.0/255.0}, ntop=2,
                                           include={"phase":caffe.TEST})
    # conv -> pool -> conv -> pool -> ip -> relu -> ip; the first convolution
    # names its bottom ("data") so either phase's data layer can feed it.
    net.conv_t1 = L.Convolution(bottom = "data", kernel_size=5, stride_h=1, stride_w=1, num_output=20,
                              weight_filler={"type":"xavier"}, bias_filler={"type":"constant"},
                              param=[{"lr_mult":1},{"lr_mult":2}])
    net.pool_t1 = L.Pooling(net.conv_t1, kernel_size=2, stride=2, pool=P.Pooling.MAX)
    net.conv_t2 = L.Convolution(net.pool_t1, kernel_size=5, stride=1, num_output=50, weight_filler={"type":"xavier"},
                              bias_filler={"type":"constant"}, param=[{"lr_mult":1}, {"lr_mult":2}])
    net.pool_t2 = L.Pooling(net.conv_t2, kernel_size=2, stride=2, pool=P.Pooling.MAX)
    net.ip_t1 = L.InnerProduct(net.pool_t2, num_output=500, weight_filler={"type":"xavier"},
                             bias_filler={"type":"constant"}, param=[{"lr_mult":1},{"lr_mult":2}])
    net.relu_t1 = L.ReLU(net.ip_t1, in_place=True)
    # NOTE(review): 100 outputs although the data paths used in this notebook
    # point at MNIST (10 classes) - confirm this width is intended.
    net.ip_t2 = L.InnerProduct(net.relu_t1, num_output=100, weight_filler={"type":"xavier"},
                             bias_filler={"type":"constant"}, param=[{"lr_mult":1},{"lr_mult":2}])
    # Accuracy is only emitted for the TEST phase; the loss is present in both.
    net.accuracy = L.Accuracy(net.ip_t2, net.label, include={"phase": caffe.TEST})
    net.loss = L.SoftmaxWithLoss(net.ip_t2, net.label, loss_weight=1)
    return net.to_proto()

The Python definition converts to protobuf correctly, with one exception: the test data layer is written with 4 tops (test_data, test_label, data, and label). After generating the prototxt file, delete the test_data and test_label tops from that layer by hand.

In [236]:
# Render the teacher definition as prototxt text and save it next to the notebook.
with open("teacher_lenet.prototxt", "w") as fin:
    fin.write("{}".format(teacher_le_net(train_source, test_source)))
    
    layer {
        name: "sf_t1"
        type: "Softmax"
        bottom: "ip_t2"
        top: "sf_t1"
    }
    
    layer {
        name: "ts_loss"
        type: "SigmoidCrossEntropyLoss"
        bottom: "ip_s2"
        bottom: "sf_t1"
        top: "ts_loss"
        loss_weight: 0
    }

In [218]:
def ts_le_net(train_lmdb, test_lmdb, batch_size = 64):
    """Describe the combined teacher-student LeNet and return its proto.

    Two branches read the same "data" blob: a student branch (layers suffixed
    ``_s``, with weight fillers and non-zero ``lr_mult``) and a teacher branch
    (layers suffixed ``_t``, no fillers, ``lr_mult`` 0 on every parameter).

    Args:
        train_lmdb: path of the LMDB read by the TRAIN-phase data layer.
        test_lmdb: path of the LMDB read by the TEST-phase data layer.
        batch_size: batch size of the TRAIN-phase data layer. The TEST-phase
            layer takes its batch size from the notebook-level ``test_bs``.

    Returns:
        The value of ``net.to_proto()``; ``str()`` of it is the prototxt text.

    Note:
        The TEST data layer is emitted with four tops (``test_data``,
        ``test_label``, ``data``, ``label``); the first two have to be deleted
        from the generated prototxt by hand.
    """
    net = caffe.NetSpec() # creates empty network
    # data layers
    # Honour the batch_size argument; previously the global train_bs was used
    # here and the argument was silently ignored.
    net.data, net.label = L.Data(batch_size=batch_size, backend=P.Data.LMDB, source=train_lmdb,
                                 transform_param={"scale":1.0/255.0}, ntop=2, include={"phase":caffe.TRAIN})
    net.test_data, net.test_label = L.Data(top=["data", "label"], batch_size=test_bs, backend=P.Data.LMDB,
                                           source=test_lmdb, transform_param={"scale":1.0/255.0}, ntop=2,
                                           include={"phase":caffe.TEST})
    # student net
    net.conv_s1 = L.Convolution(bottom = "data", kernel_size=5, stride_h=1, stride_w=1, num_output=20,
                              weight_filler={"type":"xavier"}, bias_filler={"type":"constant"},
                              param=[{"lr_mult":1},{"lr_mult":2}])
    net.pool_s1 = L.Pooling(net.conv_s1, kernel_size=2, stride=2, pool=P.Pooling.MAX)
    net.conv_s2 = L.Convolution(net.pool_s1, kernel_size=5, stride=1, num_output=50, weight_filler={"type":"xavier"},
                              bias_filler={"type":"constant"}, param=[{"lr_mult":1}, {"lr_mult":2}])
    net.pool_s2 = L.Pooling(net.conv_s2, kernel_size=2, stride=2, pool=P.Pooling.MAX)
    net.ip_s1 = L.InnerProduct(net.pool_s2, num_output=500, weight_filler={"type":"xavier"},
                             bias_filler={"type":"constant"}, param=[{"lr_mult":1},{"lr_mult":2}])
    net.relu_s1 = L.ReLU(net.ip_s1, in_place=True)
    net.ip_s2 = L.InnerProduct(net.relu_s1, num_output=100, weight_filler={"type":"xavier"},
                             bias_filler={"type":"constant"}, param=[{"lr_mult":1},{"lr_mult":2}])
    # teacher net
    # lr_mult 0 on weights and biases keeps these layers fixed during training;
    # no fillers are given - presumably the weights are copied in by layer name
    # from a trained teacher model (confirm against the training command).
    net.conv_t1 = L.Convolution(bottom = "data", kernel_size=5, stride=1, num_output=20,
                              param=[{"lr_mult":0},{"lr_mult":0}])
    net.pool_t1 = L.Pooling(net.conv_t1, kernel_size=2, stride=2, pool=P.Pooling.MAX)
    net.conv_t2 = L.Convolution(net.pool_t1, kernel_size=5, stride=1, num_output=50, param=[{"lr_mult":0}, {"lr_mult":0}])
    net.pool_t2 = L.Pooling(net.conv_t2, kernel_size=2, stride=2, pool=P.Pooling.MAX)
    net.ip_t1 = L.InnerProduct(net.pool_t2, num_output=500, param=[{"lr_mult":0},{"lr_mult":0}])
    net.relu_t1 = L.ReLU(net.ip_t1, in_place=True)
    net.ip_t2 = L.InnerProduct(net.relu_t1, num_output=100, param=[{"lr_mult":0},{"lr_mult":0}])
    # accuracy and loss
    # Accuracy and the label loss are computed on the student output only.
    net.accuracy = L.Accuracy(net.ip_s2, net.label, include={"phase": caffe.TEST})
    net.loss = L.SoftmaxWithLoss(net.ip_s2, net.label, loss_weight=0.5)
    # NOTE(review): SoftmaxWithLoss takes class labels as its second bottom, but
    # here it receives the teacher output ip_t2. The hand-edited
    # ts_lenet.prototxt listed later in this notebook uses Softmax layers
    # feeding a SigmoidCrossEntropyLoss instead - confirm which is intended.
    net.ts_loss = L.SoftmaxWithLoss(net.ip_s2, net.ip_t2, loss_weight=0.5)
    return net.to_proto()

Here we prepare the teacher-student net by first loading in the trained student net. We then create the framework for the teacher-student network and then fill in the weights of the teacher section with the teacher model's weights. Note that the teacher model's layers have the same names as the teacher layers within the teacher-student model.

In [173]:
# NOTE(review): `weights` is not defined in any cell shown in this notebook, so this
# cell fails on a fresh kernel. Presumably it is the path of the trained student
# .caffemodel - define it explicitly before this line and confirm the path.
student_net = caffe.Net("student_lenet.prototxt", weights)

In [216]:
# Display the generated teacher-student definition to inspect it before writing it to disk.
ts_le_net(train_source, test_source)

layer {
  name: "data"
  type: "Data"
  top: "data"
  top: "label"
  include {
    phase: TRAIN
  }
  transform_param {
    scale: 0.00392156885937
  }
  data_param {
    source: "/opt/caffe/examples/mnist/mnist_train_lmdb"
    batch_size: 64
    backend: LMDB
  }
}
layer {
  name: "test_data"
  type: "Data"
  top: "test_data"
  top: "test_label"
  top: "data"
  top: "label"
  include {
    phase: TEST
  }
  transform_param {
    scale: 0.00392156885937
  }
  data_param {
    source: "/opt/caffe/examples/mnist/mnist_test_lmdb"
    batch_size: 100
    backend: LMDB
  }
}
layer {
  name: "conv_s1"
  type: "Convolution"
  bottom: "data"
  top: "conv_s1"
  param {
    lr_mult: 1.0
  }
  param {
    lr_mult: 2.0
  }
  convolution_param {
    num_output: 20
    kernel_size: 5
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
    }
    stride_h: 1
    stride_w: 1
  }
}
layer {
  name: "pool_s1"
  type: "Pooling"
  bottom: "conv_s1"
  top: "pool_s1"
  pool

In [219]:
# Render the teacher-student definition as prototxt text and save it next to the notebook.
with open("ts_lenet.prototxt", "w") as fin:
    fin.write("{}".format(ts_le_net(train_source, test_source)))

In [228]:
# Instantiate the teacher-student definition in TRAIN phase with the weights stored in
# the snapshot. Despite its name, this variable holds the combined teacher-student net.
teacher_net = caffe.Net("ts_lenet.prototxt", "snapshots/trained.caffemodel", caffe.TRAIN)

In [232]:
# Same definition and snapshot as above, instantiated in TEST phase so that the
# TEST-only layers (test data layer, accuracy) are part of the net.
teacher_net2 = caffe.Net("ts_lenet.prototxt", "snapshots/trained.caffemodel", caffe.TEST)

In [234]:
# Run one forward pass; the displayed result is a dict of the net's output blobs
# (accuracy, loss and ts_loss).
teacher_net2.forward()

{'accuracy': array(0.9900000095367432, dtype=float32),
 'loss': array(0.011848684400320053, dtype=float32),
 'ts_loss': array(106.17877960205078, dtype=float32)}

In [96]:
# Per-blob learning-rate multipliers, one dict per parameter blob.
# (The earlier form passed bare key:value pairs to list(), which is a syntax error.)
param = [{"lr_mult": 1},  # weights have standard learning rate
         {"lr_mult": 2}]  # bias has double learning rate

In [98]:
# A dict literal with a repeated key keeps only the last value, so the two
# lr_mult settings cannot share a single dict - they need one dict each.
{"lr_mult":1,
 "lr_mult":2}

{'lr_mult': 2}

In [99]:
# One settings dict per parameter blob: weights first, then bias.
param = [
    dict(lr_mult=1, decay_mult=1),
    dict(lr_mult=2, decay_mult=0),
]

In [100]:
param

[{'decay_mult': 1, 'lr_mult': 1}, {'decay_mult': 0, 'lr_mult': 2}]

In [105]:
# One settings dict per parameter blob: weights first, then bias.
param = [
    dict(lr_mult=1),
    dict(lr_mult=2),
]

In [106]:
param

[{'lr_mult': 1}, {'lr_mult': 2}]

In [121]:

# Experiment: give the second data layer an explicit top name together with
# ntop=0 and print the generated definition to see which tops are emitted.
ns = caffe.NetSpec()
ns.data = L.Data(name="data",
                 include={'phase':caffe.TEST})
ns.test_data = L.Data(name="data", ntop = 0, top='data',
                 include={'phase':caffe.TEST})
# Parenthesised print behaves the same as a Python 2 statement and a Python 3 call.
print('{}'.format(ns.to_proto()))

layer {
  name: "data"
  type: "Data"
  top: "data"
  include {
    phase: TEST
  }
}
layer {
  name: "data"
  type: "Data"
  top: "data"
  include {
    phase: TEST
  }
}



In [155]:
# Experiment: TRAIN/TEST data layers sharing the name "data". With ntop=1 plus an
# explicit top='data', the TEST layer is printed with two tops (test_data, data).
net = caffe.NetSpec()
net.data = L.Data(name='data',
                include={"phase":caffe.TRAIN},
                ntop=1)
net.test_data = L.Data(name='data',
                    include={"phase":caffe.TEST},
                    top='data',
                    ntop=1)
# Parenthesised print behaves the same as a Python 2 statement and a Python 3 call.
print('{}'.format(net.to_proto()))

layer {
  name: "data"
  type: "Data"
  top: "data"
  include {
    phase: TRAIN
  }
}
layer {
  name: "data"
  type: "Data"
  top: "test_data"
  top: "data"
  include {
    phase: TEST
  }
}



In [156]:
# Experiment: same as the previous cell but with ntop=0 on the TEST layer, which
# leaves only the explicitly named top ("data") in the printed definition.
net = caffe.NetSpec()
net.data = L.Data(name='data',
                include={"phase":caffe.TRAIN},
                ntop=1)
net.test_data = L.Data(name='data',
                    include={"phase":caffe.TEST},
                    top='data',
                    ntop=0)
# Parenthesised print behaves the same as a Python 2 statement and a Python 3 call.
print('{}'.format(net.to_proto()))

layer {
  name: "data"
  type: "Data"
  top: "data"
  include {
    phase: TRAIN
  }
}
layer {
  name: "data"
  type: "Data"
  top: "data"
  include {
    phase: TEST
  }
}



In [None]:
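# teacher_lenet.prototxt (output of teacher_le_net() with the extra test_data/test_label tops removed from the test data layer)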

layer {
  name: "data"
  type: "Data"
  top: "data"
  top: "label"
  include {
    phase: TRAIN
  }
  transform_param {
    scale: 0.00392156885937
  }
  data_param {
    source: "/opt/caffe/examples/mnist/mnist_train_lmdb"
    batch_size: 64
    backend: LMDB
  }
}
layer {
  name: "test_data"
  type: "Data"
  top: "data"
  top: "label"
  include {
    phase: TEST
  }
  transform_param {
    scale: 0.00392156885937
  }
  data_param {
    source: "/opt/caffe/examples/mnist/mnist_test_lmdb"
    batch_size: 100
    backend: LMDB
  }
}
layer {
  name: "conv_t1"
  type: "Convolution"
  bottom: "data"
  top: "conv_t1"
  param {
    lr_mult: 1.0
  }
  param {
    lr_mult: 2.0
  }
  convolution_param {
    num_output: 20
    kernel_size: 5
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
    }
    stride_h: 1
    stride_w: 1
  }
}
layer {
  name: "pool_t1"
  type: "Pooling"
  bottom: "conv_t1"
  top: "pool_t1"
  pooling_param {
    pool: MAX
    kernel_size: 2
    stride: 2
  }
}
layer {
  name: "conv_t2"
  type: "Convolution"
  bottom: "pool_t1"
  top: "conv_t2"
  param {
    lr_mult: 1.0
  }
  param {
    lr_mult: 2.0
  }
  convolution_param {
    num_output: 50
    kernel_size: 5
    stride: 1
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
    }
  }
}
layer {
  name: "pool_t2"
  type: "Pooling"
  bottom: "conv_t2"
  top: "pool_t2"
  pooling_param {
    pool: MAX
    kernel_size: 2
    stride: 2
  }
}
layer {
  name: "ip_t1"
  type: "InnerProduct"
  bottom: "pool_t2"
  top: "ip_t1"
  param {
    lr_mult: 1.0
  }
  param {
    lr_mult: 2.0
  }
  inner_product_param {
    num_output: 500
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
    }
  }
}
layer {
  name: "relu_t1"
  type: "ReLU"
  bottom: "ip_t1"
  top: "ip_t1"
}
layer {
  name: "ip_t2"
  type: "InnerProduct"
  bottom: "ip_t1"
  top: "ip_t2"
  param {
    lr_mult: 1.0
  }
  param {
    lr_mult: 2.0
  }
  inner_product_param {
    num_output: 100
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
    }
  }
}
layer {
  name: "accuracy"
  type: "Accuracy"
  bottom: "ip_t2"
  bottom: "label"
  top: "accuracy"
  include {
    phase: TEST
  }
}
layer {
  name: "loss"
  type: "SoftmaxWithLoss"
  bottom: "ip_t2"
  bottom: "label"
  top: "loss"
  loss_weight: 1.0
}

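# ts_lenet.prototxt (hand-edited; differs from the output of ts_le_net(): adds the sf_s1/sf_t1 Softmax layers, uses SigmoidCrossEntropyLoss for ts_loss, and sets the loss weights to 0.01 and 0.99)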

layer {
  name: "data"
  type: "Data"
  top: "data"
  top: "label"
  include {
    phase: TRAIN
  }
  transform_param {
    scale: 0.00392156885937
  }
  data_param {
    source: "/opt/caffe/examples/mnist/mnist_train_lmdb"
    batch_size: 64
    backend: LMDB
  }
}
layer {
  name: "test_data"
  type: "Data"
  top: "data"
  top: "label"
  include {
    phase: TEST
  }
  transform_param {
    scale: 0.00392156885937
  }
  data_param {
    source: "/opt/caffe/examples/mnist/mnist_test_lmdb"
    batch_size: 100
    backend: LMDB
  }
}
layer {
  name: "conv_s1"
  type: "Convolution"
  bottom: "data"
  top: "conv_s1"
  param {
    lr_mult: 1.0
  }
  param {
    lr_mult: 2.0
  }
  convolution_param {
    num_output: 20
    kernel_size: 5
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
    }
    stride_h: 1
    stride_w: 1
  }
}
layer {
  name: "pool_s1"
  type: "Pooling"
  bottom: "conv_s1"
  top: "pool_s1"
  pooling_param {
    pool: MAX
    kernel_size: 2
    stride: 2
  }
}
layer {
  name: "conv_s2"
  type: "Convolution"
  bottom: "pool_s1"
  top: "conv_s2"
  param {
    lr_mult: 1.0
  }
  param {
    lr_mult: 2.0
  }
  convolution_param {
    num_output: 50
    kernel_size: 5
    stride: 1
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
    }
  }
}
layer {
  name: "pool_s2"
  type: "Pooling"
  bottom: "conv_s2"
  top: "pool_s2"
  pooling_param {
    pool: MAX
    kernel_size: 2
    stride: 2
  }
}
layer {
  name: "ip_s1"
  type: "InnerProduct"
  bottom: "pool_s2"
  top: "ip_s1"
  param {
    lr_mult: 1.0
  }
  param {
    lr_mult: 2.0
  }
  inner_product_param {
    num_output: 500
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
    }
  }
}
layer {
  name: "relu_s1"
  type: "ReLU"
  bottom: "ip_s1"
  top: "ip_s1"
}
layer {
  name: "ip_s2"
  type: "InnerProduct"
  bottom: "ip_s1"
  top: "ip_s2"
  param {
    lr_mult: 1.0
  }
  param {
    lr_mult: 2.0
  }
  inner_product_param {
    num_output: 100
    weight_filler {
      type: "xavier"
    }
    bias_filler {
      type: "constant"
    }
  }
}
layer {
  name: "conv_t1"
  type: "Convolution"
  bottom: "data"
  top: "conv_t1"
  param {
    lr_mult: 0.0
  }
  param {
    lr_mult: 0.0
  }
  convolution_param {
    num_output: 20
    kernel_size: 5
    stride: 1
  }
}
layer {
  name: "pool_t1"
  type: "Pooling"
  bottom: "conv_t1"
  top: "pool_t1"
  pooling_param {
    pool: MAX
    kernel_size: 2
    stride: 2
  }
}
layer {
  name: "conv_t2"
  type: "Convolution"
  bottom: "pool_t1"
  top: "conv_t2"
  param {
    lr_mult: 0.0
  }
  param {
    lr_mult: 0.0
  }
  convolution_param {
    num_output: 50
    kernel_size: 5
    stride: 1
  }
}
layer {
  name: "pool_t2"
  type: "Pooling"
  bottom: "conv_t2"
  top: "pool_t2"
  pooling_param {
    pool: MAX
    kernel_size: 2
    stride: 2
  }
}
layer {
  name: "ip_t1"
  type: "InnerProduct"
  bottom: "pool_t2"
  top: "ip_t1"
  param {
    lr_mult: 0.0
  }
  param {
    lr_mult: 0.0
  }
  inner_product_param {
    num_output: 500
  }
}
layer {
  name: "relu_t1"
  type: "ReLU"
  bottom: "ip_t1"
  top: "ip_t1"
}
layer {
  name: "ip_t2"
  type: "InnerProduct"
  bottom: "ip_t1"
  top: "ip_t2"
  param {
    lr_mult: 0.0
  }
  param {
    lr_mult: 0.0
  }
  inner_product_param {
    num_output: 100
  }
}
layer {
  name: "sf_s1"
  type: "Softmax"
  bottom: "ip_s2"
  top: "sf_s1"
}
layer {
  name: "sf_t1"
  type: "Softmax"
  bottom: "ip_t2"
  top: "sf_t1"
}
layer {
  name: "accuracy"
  type: "Accuracy"
  bottom: "ip_s2"
  bottom: "label"
  top: "accuracy"
  include {
    phase: TEST
  }
}
layer {
  name: "loss"
  type: "SoftmaxWithLoss"
  bottom: "ip_s2"
  bottom: "label"
  top: "loss"
  loss_weight: 0.01
}
layer {
  name: "ts_loss"
  type: "SigmoidCrossEntropyLoss"
  bottom: "sf_s1"
  bottom: "sf_t1"
  top: "ts_loss"
  loss_weight: .99
}


# command to run on terminal
caffe train -gpu all -solver ts_lenet_solver.prototxt -weights snapshots/teacher.caffemodel 

# command to train the teacher net while also saving the terminal output (stdout and stderr) to norm.txt
caffe train -gpu all -solver teacher_lenet_solver.prototxt 2>&1 | tee norm.txt