Contrastive loss returns NaN loss at extreme values #1451
Comments
The current implementation of tanh in Caffe returns NaN if the input to that layer is too large or too small (below about -40 or above 40 on my machine). That could be where the NaNs are coming from.
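For illustration, a minimal C++ sketch of why a naive tanh formulation blows up at extreme inputs. This assumes the layer computes tanh via the (exp(2x) - 1) / (exp(2x) + 1) form; the exact Caffe code and the exact threshold depend on the floating-point type, so treat the numbers as indicative only.

#include <cmath>
#include <cstdio>

int main() {
  float x = 50.0f;  // past the roughly +/-40 saturation range reported above
  // Naive formulation: exp(2x) overflows a 32-bit float to inf,
  // and (inf - 1) / (inf + 1) evaluates to inf / inf = NaN.
  float exp2x = std::exp(2.0f * x);
  float naive = (exp2x - 1.0f) / (exp2x + 1.0f);
  // A numerically stable tanh saturates cleanly to +/-1 instead.
  float stable = std::tanh(x);
  std::printf("naive = %f, stable = %f\n", naive, stable);  // naive = nan, stable = 1.0
  return 0;
}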
@seanbell Thanks for your answer, I think you're right! But there doesn't seem to be a solution yet.
See #1455 -- this should fix the NaN problem from tanh. Note that if you're using tanh and giving it very large/small inputs, the gradient will be identically 0 and the network will not update weights for those inputs. You may want to try to scale your inputs better so that they have (approximately) mean 0 and standard deviation 1. Or you can use ReLU instead of tanh, which doesn't have the same saturation problems.
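As a concrete sketch of the scaling suggestion (not Caffe code; standardize is a hypothetical helper you would apply to each feature vector before writing it to the LevelDB):

#include <cmath>
#include <vector>

// Hypothetical preprocessing helper: rescale one feature vector in place to
// approximately zero mean and unit standard deviation, so the inputs reaching
// the tanh layers stay well away from the saturation range.
void standardize(std::vector<float>& v) {
  float mean = 0.0f;
  for (float x : v) mean += x;
  mean /= static_cast<float>(v.size());

  float var = 0.0f;
  for (float x : v) var += (x - mean) * (x - mean);
  float stddev = std::sqrt(var / static_cast<float>(v.size()));

  const float eps = 1e-8f;  // guard against constant features
  for (float& x : v) x = (x - mean) / (stddev + eps);
}

Alternatively, as noted above, changing the TANH layers in the network definition below to RELU sidesteps the saturation problem entirely.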
@seanbell I will try it. |
name: "face_siamese_train_test"
layers {
name: "pair_data"
type: DATA
top: "pair_data"
top: "sim"
data_param {
source: "lbph_siamese_train_leveldb"
batch_size: 10
}
include: { phase: TRAIN }
}
layers {
name: "pair_data"
type: DATA
top: "pair_data"
top: "sim"
data_param {
source: "features_siamese_test_leveldb"
batch_size: 100
}
include: { phase: TEST }
}
layers {
name: "slice_pair"
type: SLICE
bottom: "pair_data"
top: "data"
top: "data_p"
slice_param {
slice_dim: 1
slice_point: 1
}
}
layers {
name: "inner1"
type: INNER_PRODUCT
bottom: "data"
top: "inner1"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
inner_product_param {
num_output: 1440
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
}
}
param: "inner1_w"
param: "inner1_b"
}
layers {
name: "tanh1"
type: TANH
bottom: "inner1"
top: "tanh1"
}
layers {
name: "inner3"
type: INNER_PRODUCT
bottom: "tanh1"
top: "inner3"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
inner_product_param {
num_output: 2880
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
}
}
param: "inner3_w"
param: "inner3_b"
}
layers {
name: "tanh3"
type: TANH
bottom: "inner3"
top: "tanh3"
}
layers {
name: "inner1_p"
type: INNER_PRODUCT
bottom: "data_p"
top: "inner1_p"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
inner_product_param {
num_output: 1440
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
}
}
param: "inner1_w"
param: "inner1_b"
}
layers {
name: "tanh1_p"
type: TANH
bottom: "inner1_p"
top: "tanh1_p"
}
layers {
name: "inner3_p"
type: INNER_PRODUCT
bottom: "tanh1_p"
top: "inner3_p"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
inner_product_param {
num_output: 2880
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
}
}
param: "inner3_w"
param: "inner3_b"
}
layers {
name: "tanh3_p"
type: TANH
bottom: "inner3_p"
top: "tanh3_p"
}
layers {
name: "loss_c"
type: CONTRASTIVE_LOSS
contrastive_loss_param {
margin: 1.0
}
bottom: "tanh3"
bottom: "tanh3_p"
bottom: "sim"
top: "loss_c"
}