# readme

### # contrastive loss를 사용할 때 주의점
- distance를 기준으로, 거리가 가까우면 '같다'고 정의한다. 따라서 accuracy를 측정할 때는 아래와 같이 (거리가 threshold 0.5보다 작으면 '같다'로 예측하도록) 정의되어 있다는 점에 주의하자!


In [None]:
'''
def compute_accuracy(y_true, y_pred):
    #Compute classification accuracy with a fixed threshold on distances.
    pred = y_pred.ravel() < 0.5
    return np.mean(pred == y_true)
'''

---

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import svm
from sklearn.model_selection import train_test_split

import glob
import os
import multiprocessing

import pickle
import time
import random

---

In [2]:
def chunkIt(seq, num):
    """Split *seq* into consecutive slices of equal length.

    The slice length is ``ceil(len(seq) / int(num))``, so the last slice may
    be shorter and, when ``len(seq) < num``, fewer than ``num`` slices are
    returned.

    Args:
        seq: Any sliceable sequence supporting ``len()``.
        num: Requested number of chunks; converted with ``int()``.

    Returns:
        A list of slices of *seq*, in order. Empty when *seq* is empty.

    Raises:
        ZeroDivisionError: If ``int(num)`` is 0.
        ValueError: If ``int(num)`` is negative and *seq* is non-empty
            (a negative count could otherwise make the step non-positive
            and loop forever).
    """
    parts = int(num)
    total = len(seq)
    # Integer ceil division; rounding up is required so that at most
    # `parts` chunks are produced.
    size = -(-total // parts)
    if total == 0:
        return []
    if parts < 0:
        raise ValueError("num must be a positive chunk count, got %r" % (num,))
    return [seq[start:start + size] for start in range(0, total, size)]

---

# data load

## # Train data

In [3]:
# Load the pickled training features and labels.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
# NOTE(review): the feature file is spelled 'X_tain.pkl' - presumably this
# matches the name on disk; confirm before "fixing" the path.
with open('../train_within_0_35_balanced/X_tain.pkl', 'rb') as f:
    X_train = pickle.load( f )

with open('../train_within_0_35_balanced/y_train.pkl', 'rb') as f:
    y_train = pickle.load( f )

In [4]:
print(X_train.shape, y_train.shape)

(330519, 512) (330519,)


---

## ## Test data

In [5]:
# Load the pickled test features and labels. Both objects are used below as
# dicts indexed by the same keys.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open('../test_data(common)/TEST_within_035/test_pair_v2_035.pkl', 'rb') as f:
    X_test_dict = pickle.load( f )

with open('../test_data(common)/TEST_within_035/test_label_035.pkl', 'rb') as f:
    y_test_dict = pickle.load( f )

In [6]:
len(X_test_dict)

100

In [7]:
len(y_test_dict)

100

In [8]:
# Merge the per-key test groups (100 in this dataset) into single arrays.
# Both dicts are indexed with the same key, in X_test_dict iteration order,
# so row i of X_test lines up with y_test[i].
_test_keys = list(X_test_dict)
if _test_keys:
    # One concatenate per array instead of np.append per key: np.append
    # re-copies everything accumulated so far on every iteration.
    X_test = np.concatenate([X_test_dict[key] for key in _test_keys], axis=0)
    y_test = np.concatenate([y_test_dict[key] for key in _test_keys], axis=0)
else:
    # No groups at all: keep the empty placeholders the loop version left.
    X_test = np.array([])
    y_test = np.array([])

In [9]:
print(X_test.shape, y_test.shape)

(2183, 512) (2183,)


---
---

## ## Training

In [18]:
from __future__ import absolute_import
from __future__ import print_function
import numpy as np

import random
from keras.datasets import mnist
from keras.models import Model
from keras.layers import Input, Flatten, Dense, Dropout, Lambda
from keras.optimizers import RMSprop
from keras import backend as K

Using TensorFlow backend.


In [19]:
def euclidean_distance(vects):
    '''Euclidean distance between the two tensors in `vects`.

    `vects` is unpacked into exactly two tensors. The squared differences
    are summed along axis 1 (keeping that axis), and the sum is clamped to
    at least `K.epsilon()` before the square root is taken.
    '''
    left, right = vects
    squared_diff = K.square(left - right)
    total = K.sum(squared_diff, axis=1, keepdims=True)
    clamped = K.maximum(total, K.epsilon())
    return K.sqrt(clamped)


def eucl_dist_output_shape(shapes):
    '''Output shape of the distance layer: (first dim of the first shape, 1).

    `shapes` must hold exactly two shapes; only the first one is read.
    '''
    first_shape, _second_shape = shapes
    leading_dim = first_shape[0]
    return (leading_dim, 1)


def contrastive_loss(y_true, y_pred):
    '''Contrastive loss from Hadsell-et-al.'06
    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf

    With a fixed margin of 1, returns the mean of
    y_true * y_pred^2 + (1 - y_true) * max(margin - y_pred, 0)^2.
    '''
    margin = 1
    # Term weighted by y_true: the squared prediction itself.
    pull_term = y_true * K.square(y_pred)
    # Term weighted by (1 - y_true): squared shortfall below the margin.
    shortfall = K.maximum(margin - y_pred, 0)
    push_term = (1 - y_true) * K.square(shortfall)
    return K.mean(pull_term + push_term)


def create_pairs(x, digit_indices, num_classes=None):
    '''Positive and negative pair creation.
    Alternates between positive and negative pairs.

    The output is balanced: every class contributes the same number of
    pairs, limited by the size of the smallest class.

    Args:
        x: Indexable collection of samples.
        digit_indices: Per-class sequences of indices into `x`;
            `digit_indices[d]` lists the samples of class `d`.
        num_classes: Number of classes to use. Defaults to
            `len(digit_indices)`, so the function no longer depends on a
            module-level `num_classes` being defined.

    Returns:
        `(pairs, labels)` as numpy arrays. Pairs alternate positive
        (label 1, two consecutive samples of the same class) and negative
        (label 0, the same first sample with a sample of a randomly chosen
        different class).
    '''
    if num_classes is None:
        num_classes = len(digit_indices)

    pairs = []
    labels = []
    # One less than the smallest class size, so that index i + 1 below is
    # valid for every class.
    n = min(len(digit_indices[d]) for d in range(num_classes)) - 1
    for d in range(num_classes):
        for i in range(n):
            anchor = digit_indices[d][i]
            # Positive pair: two consecutive samples of class d.
            pairs.append([x[anchor], x[digit_indices[d][i + 1]]])
            # Negative pair: a non-zero offset modulo num_classes always
            # lands on a class other than d.
            inc = random.randrange(1, num_classes)
            dn = (d + inc) % num_classes
            pairs.append([x[anchor], x[digit_indices[dn][i]]])
            labels += [1, 0]
    return np.array(pairs), np.array(labels)


def create_base_network(input_shape):
    '''Base network to be shared (eq. to feature extraction).

    Stacks three 128-unit ReLU Dense layers, with Dropout(0.2) after the
    first and the second one, directly on the input (no Flatten layer),
    and returns the result as a Model.
    '''
    net_input = Input(shape=input_shape)

    features = Dense(128, activation='relu')(net_input)
    features = Dropout(0.2)(features)

    features = Dense(128, activation='relu')(features)
    features = Dropout(0.2)(features)

    embedding = Dense(128, activation='relu')(features)
    return Model(net_input, embedding)


def compute_accuracy(y_true, y_pred):
    '''Classification accuracy with a fixed 0.5 threshold on distances.

    A flattened prediction below 0.5 counts as a positive prediction; the
    result is the fraction of entries where that decision equals `y_true`.
    '''
    flat_distances = y_pred.ravel()
    is_positive = flat_distances < 0.5
    hits = is_positive == y_true
    return np.mean(hits)


def accuracy(y_true, y_pred):
    '''Backend version of the fixed-threshold accuracy on distances.

    Predictions below 0.5 are cast to the dtype of `y_true` and compared
    with it element-wise; the mean of the matches is returned.
    '''
    below_threshold = y_pred < 0.5
    predicted_label = K.cast(below_threshold, y_true.dtype)
    matches = K.equal(y_true, predicted_label)
    return K.mean(matches)


- train pair 형태로 변환

In [12]:
# Split every training row into a pair of two equal-width vectors:
# (N, 512) -> (N, 2, 256) for the data loaded above.
tr_pairs = X_train.reshape(len(X_train), 2, -1)
# Labels are used as-is (this is the same object as y_train, not a copy).
tr_y  = y_train

In [21]:
tr_y[:5]

array([0., 0., 0., 0., 0.])

In [23]:
# Same pair split for the test rows: axis 1 indexes the two vectors of a pair.
te_pairs = X_test.reshape(len(X_test), 2, -1)
# Test labels converted to a float array.
te_y  = np.array(y_test, dtype=float)

In [13]:
print(tr_pairs.shape, tr_y.shape) # 최신 new_diff

(330519, 2, 256) (330519,)


In [18]:
print(tr_pairs.shape, tr_y.shape)

(534002, 2, 256) (534002,)


In [16]:
print(te_pairs.shape, te_y.shape) # 최신 new_diff

(2183, 2, 256) (2183,)


In [26]:
# Shape of a single vector of a pair; used for both network inputs below.
input_shape = (256,)
# Number of epochs passed to model.fit.
epochs = 50

In [27]:
# network definition
base_network = create_base_network(input_shape)

# One input per element of a pair.
input_a = Input(shape=input_shape)
input_b = Input(shape=input_shape)

# because we re-use the same instance `base_network`,
# the weights of the network
# will be shared across the two branches
processed_a = base_network(input_a)
processed_b = base_network(input_b)

# The model output is the Euclidean distance between the two branch outputs.
distance = Lambda(euclidean_distance,
                  output_shape=eucl_dist_output_shape)([processed_a, processed_b])

model = Model([input_a, input_b], distance) # input, output

# train
rms = RMSprop()
# `accuracy` is the custom metric defined above (distance < 0.5 counts as a match).
model.compile(loss=contrastive_loss, optimizer=rms, metrics=[accuracy])
# NOTE(review): the test pairs are passed as validation data, so the
# per-epoch validation numbers are computed on the same data as the final
# test accuracy below.
model.fit([tr_pairs[:, 0], tr_pairs[:, 1]], tr_y,
          batch_size=500,
          epochs=epochs,
          validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y))

# compute final accuracy on training and test sets
# `y_pred` is reused: after this cell it holds the predictions for the test pairs.
y_pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]])
tr_acc = compute_accuracy(tr_y, y_pred)
y_pred = model.predict([te_pairs[:, 0], te_pairs[:, 1]])
te_acc = compute_accuracy(te_y, y_pred)

print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))
print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
* Accuracy on training set: 81.50%
* Accuracy on test set: 61.61%


In [28]:
te_acc

0.6161245991754466

In [29]:
y_pred

array([[1.114046  ],
       [1.2433821 ],
       [0.39067405],
       ...,
       [0.37693438],
       [0.01201613],
       [0.24476998]], dtype=float32)

In [30]:
( y_pred.ravel() < 0.5 )

array([False, False,  True, ...,  True,  True,  True])

In [31]:
te_y.shape

(2183,)

In [32]:
y_pred.reshape(len(y_pred)).shape

(2183,)

In [33]:
np.sum( 1*( y_pred.ravel() < 0.5 ) )/len(y_pred)

0.5483279890059551

In [159]:
( 1*( y_pred.ravel() < 0.5 ) ).shape

(7368000,)

In [163]:
( 1*( y_pred.ravel() < 0.5 ) ).reshape(100,-1).shape

(100, 73680)

In [164]:
te_y.reshape(100, -1).shape

(100, 73680)

---

In [37]:
from sklearn import metrics

# Per-group scores, appended in y_test_dict iteration order.
precision_100 = []
recall_100 = []

# Prediction / label slice of every group, keyed by its test key.
grp_pred_w_key = {}
grp_label_w_key = {}

test_label = y_test
# Distances below 0.5 count as a positive prediction, encoded as 1 (else 0).
# NOTE(review): assumes y_pred still holds the test-set predictions - confirm
# the cell execution order.
predicted = (y_pred.ravel() < 0.5) * 1

# First key of the label dict; its group starts at offset 0.
start_test_key = list(y_test_dict.keys())[0]
# Running offset of the current group inside the flat arrays. Keys are
# unique, so no reset is needed once the loop has started.
start_index = 0

for test_key, group_labels in y_test_dict.items():
    print(test_key)

    # Each group occupies a contiguous run of rows whose length is the
    # number of labels stored under its key.
    test_obj_grp_size = len(group_labels)
    group_rows = slice(start_index, start_index + test_obj_grp_size)
    grp_pred = predicted[group_rows]
    grp_label = test_label[group_rows]

    # Keep the slices for later inspection.
    grp_pred_w_key[test_key] = grp_pred
    grp_label_w_key[test_key] = grp_label

    print(start_index, test_obj_grp_size)

    start_index = start_index + test_obj_grp_size

    print('label_size : {}'.format(np.sum(grp_label)),
          'prediction_size : {}'.format(np.sum(grp_pred)))

    # Precision / recall of this group alone.
    precision_ = metrics.precision_score(grp_label, grp_pred)
    recall_ = metrics.recall_score(grp_label, grp_pred)

    print(precision_)
    print(recall_)
    print('\n')

    precision_100.append(precision_)
    recall_100.append(recall_)

40648
0 10
label_size : 2 prediction_size : 8
0.25
1.0


66052
10 17
label_size : 2 prediction_size : 17
0.11764705882352941
1.0


37783
27 5
label_size : 2 prediction_size : 2
1.0
1.0


59761
32 5
label_size : 2 prediction_size : 2
1.0
1.0


16639
37 4
label_size : 4 prediction_size : 4
1.0
1.0


17958
41 5
label_size : 5 prediction_size : 5
1.0
1.0


10918
46 18
label_size : 5 prediction_size : 18
0.2777777777777778
1.0


26542
64 4
label_size : 4 prediction_size : 4
1.0
1.0


71651
68 5
label_size : 2 prediction_size : 2
1.0
1.0


45696
73 2
label_size : 1 prediction_size : 2
0.5
1.0


4253
75 12
label_size : 8 prediction_size : 12
0.6666666666666666
1.0


63461
87 96
label_size : 2 prediction_size : 81
0.024691358024691357
1.0


572
183 17
label_size : 14 prediction_size : 17
0.8235294117647058
1.0


66460
200 1
label_size : 0 prediction_size : 1
0.0
0.0


70828
201 24
label_size : 1 prediction_size : 19
0.05263157894736842
1.0


34741
225 170
label_size : 4 prediction_size : 46
0.

In [38]:
np.mean(precision_100)

0.6602213317569411

In [39]:
np.mean(recall_100)

0.9441666666666666