# # readme

- metric learning을 위한 훈련데이터 생성
    - 기준 데이터 : 가공식품 이미지의 feature extraction 데이터(unit단위인 normalized vector)
        - 일정 distance 기준에 의한 positive/negative set 생성 (balanced/imbalanced)
        - cf. negative 기준 : 0.35 기준에 들고, positive 개수와 맞춰진 negative sample(가까운순)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import svm
from sklearn.model_selection import train_test_split

import glob
import os
import multiprocessing

In [2]:
import pickle
import time
import random

---

In [3]:
def chunkIt(seq, num):
    if len(seq) % int(num) == 0:
        avg = int(len(seq) / int(num))
    else:
        avg = int(len(seq) / int(num)) + 1 # 이렇게 해야하네!
    out = []
    last = 0.0

    while last < len(seq):
        out.append(seq[int(last):int(last + avg)])
        last += avg

    return out       

---

# data load

In [4]:
data = pd.read_csv('new_feat_df.csv')

In [5]:
data.shape

(305999, 266)

In [6]:
data.head(3)

Unnamed: 0,prod_id,title,cleaned_title,model_id,weight,group_no,group_cnt,0,1,2,...,249,250,251,252,253,254,255,top3_word,top2_word,word_cnt
0,107751656,산고추 고추절임 업소용식자재 (500gX10개) 한푸드,산고추 고추절임 업소용식자재 한푸드,,500G,124012401,38,0.095858,0.027997,0.027619,...,0.062841,0.019664,0.095381,0.000538,0.01662,0.030399,0.011093,한푸드 고추절임 산고추,한푸드 고추절임,4
1,107751669,산고추 고추절임 업소용식자재 (500gX20개) 한푸드,산고추 고추절임 업소용식자재 한푸드,,500G,124012401,38,0.095858,0.027998,0.027619,...,0.062841,0.019664,0.095381,0.000538,0.01662,0.030399,0.011093,한푸드 고추절임 산고추,한푸드 고추절임,4
2,108218728,한푸드 업소용 식자재 고추절임 산고추 500g 10EA,한푸드 업소용 식자재 고추절임 산고추,,500G,124012401,38,0.095858,0.027997,0.027619,...,0.062841,0.019664,0.095381,0.000538,0.01662,0.030399,0.011093,한푸드 고추절임 산고추,한푸드 고추절임,5


---

In [7]:
group_no_df = data.groupby('group_no').size().reset_index()

In [8]:
group_no_df.columns = ['group_no', 'size_']

In [9]:
group_no_df.size_

0         2
1         2
2         2
3         3
4         5
         ..
222426    1
222427    1
222428    1
222429    2
222430    1
Name: size_, Length: 222431, dtype: int64

---

# filtering data
- 최근 기준 : 그룹 크기 2이상

In [10]:
group_no_df[group_no_df.size_ > 4].group_no.tolist()[:5]

[100000254, 100003111, 100003607, 100003821, 100033511]

In [11]:
filtering_grp = group_no_df[group_no_df.size_ > 2].group_no.tolist()

In [12]:
data_filter = data[data.group_no.isin( filtering_grp )]

In [13]:
data_filter.shape

(73681, 266)

In [14]:
data_filter.shape[0]/data.shape[0] # 24% 정도의 prod_id 남음

0.24078836858943983

---

In [15]:
data_renew = data_filter.drop(
            columns = ['title', 'cleaned_title', 'model_id', 'weight', 'top3_word', 'top2_word', 'word_cnt'])

In [16]:
data_renew

Unnamed: 0,prod_id,group_no,group_cnt,0,1,2,3,4,5,6,...,246,247,248,249,250,251,252,253,254,255
0,107751656,124012401,38,0.095858,0.027997,0.027619,0.000000,0.000000,0.041419,0.000223,...,0.039504,0.118242,0.016577,0.062841,0.019664,0.095381,0.000538,0.016620,0.030399,0.011093
1,107751669,124012401,38,0.095858,0.027998,0.027619,0.000000,0.000000,0.041419,0.000223,...,0.039504,0.118242,0.016577,0.062841,0.019664,0.095381,0.000538,0.016620,0.030399,0.011093
2,108218728,124012401,38,0.095858,0.027997,0.027619,0.000000,0.000000,0.041419,0.000223,...,0.039504,0.118242,0.016577,0.062841,0.019664,0.095381,0.000538,0.016620,0.030399,0.011093
3,124011122,124012401,38,0.092860,0.032256,0.029344,0.000000,0.000578,0.040821,0.000427,...,0.042548,0.116558,0.012958,0.068049,0.018716,0.096427,0.000931,0.014353,0.036047,0.011458
4,124011144,124012401,38,0.092860,0.032256,0.029344,0.000000,0.000578,0.040821,0.000427,...,0.042548,0.116558,0.012958,0.068049,0.018716,0.096427,0.000931,0.014353,0.036047,0.011458
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82471,152799569,152799569,3,0.169476,0.021603,0.032281,0.007862,0.001533,0.262756,0.007244,...,0.148308,0.063719,0.000000,0.050773,0.001786,0.115408,0.043102,0.000000,0.024779,0.000776
82472,152799776,152799569,3,0.137807,0.029813,0.011102,0.002626,0.000000,0.220677,0.007591,...,0.151050,0.066073,0.001325,0.104724,0.006390,0.120621,0.034829,0.000000,0.008911,0.000552
92330,125008424,125008424,2,0.094371,0.006952,0.034753,0.040075,0.061682,0.121551,0.087817,...,0.083730,0.014073,0.000047,0.073444,0.075622,0.199791,0.009518,0.000000,0.001497,0.040486
92331,125008425,125008424,2,0.105567,0.004273,0.003390,0.037910,0.020251,0.146470,0.115562,...,0.092375,0.034384,0.000000,0.133222,0.082380,0.214659,0.010539,0.000000,0.003605,0.042854


----

# distance matrix

In [17]:
from sklearn.metrics import pairwise_distances

In [19]:
data_renew_distances = pairwise_distances( data_renew.iloc[:, 3:].values, metric='euclidean', n_jobs=-1 )

In [20]:
data_renew_distances.shape

(73681, 73681)

---

# prod index

In [22]:
data_renew.shape

(73681, 259)

In [23]:
# i -> prod 사전
# prod -> i 사전
index_to_prod = {}
prod_to_index = {}

for k in data_renew.prod_id.values: # distinct 하니깐 set() 으로 안한다
    i = len(index_to_prod)
    index_to_prod[i] = k
    prod_to_index[k] = i

In [24]:
data_renew.prod_id.values

array([107751656, 107751669, 108218728, ..., 125008424, 125008425,
       125009331])

In [25]:
prod_to_index[107751656]

0

In [26]:
prod_to_index[107751669]

1

----

# group index data

In [28]:
# grp에 속하는 prod list
grp_prod = dict(data_renew.groupby(['group_no'])['prod_id'].apply(list))

In [30]:
# prod별 grp
prod_grp = {v_e : k for k, v in grp_prod.items() for v_e in v}

---
---
---

# # pair-data EDA

### ### step1 : make pair index data

In [32]:
index_list = list(index_to_prod.keys())

In [33]:
from sklearn.model_selection import train_test_split

train_index_list, test_index_list = train_test_split(index_list, test_size = 0.2 , random_state = 37)

In [34]:
len(train_index_list)/len(index_list)

0.799989142384061

---

- 기준 : distance 0.3 이하인 애들만 pair 훈련데이터로 생성한다

In [104]:
print('{}, {}'.format(1,2))

1, 2


In [38]:
set([1,2]) & set([2,3])

{2}

In [None]:
%%time
### sampling-dict(pair) ###
sampling_dict_wo_balance = {}
#####################

positive_length = []
negative_length = []
sampling_length = []

start_time = time.time()

for _,i in enumerate(train_index_list):
    #if _ %5000 == 0:
    #    elapsed_time = time.time() - start_time
    #    print( _, ',', _/len(train_index_list) )
    #    print( time.strftime("%H:%M:%S", time.gmtime(elapsed_time)), '\n' )
    
    
    ### 자기자신을 뺀, positive + semi_hard_negative ###
    sampling_index = set(np.where( data_renew_distances[ i ] <= 0.35 )[0]) - set([i])
    
    positive_sample = set(grp_prod[ prod_grp[index_to_prod[i]] ]) - set( [index_to_prod[i] ] )
    positive_index_list = []
    for p in positive_sample:
        positive_index_list.append( prod_to_index[p] )
    
    # positive_index : 0.35 거리 안에 존재하는, positive
    positive_index = set(positive_index_list) & set(sampling_index)
    
    # negative_index : 0.35 거리 안에 존재하는, negative
    negative_index = set(sampling_index) - set(positive_index)
    
    # negative 와 전체 sampling 규모
    print( 'negative_size : {}, total_sampling_size : {}'.format(len(negative_index), len(sampling_index)) )
    
    
    positive_length.append( len(positive_index) )
    negative_length.append( len(negative_index) )
    sampling_length.append( len(sampling_index) )
    
    
    ### pair 쌍 만들어주기 ###
    #sampling_dict_wo_balance[ index_to_prod[i] ] = []
    
    #for s in sampling_index:
    #    sampling_dict_wo_balance[ index_to_prod[i] ].append( [index_to_prod[i] , index_to_prod[s] ] )

In [43]:
ratio_ = []
for i in range(len(positive_length)):
    try:
        r = negative_length[i]/sampling_length[i]
    except:
        r = 0
    ratio_.append(r)

In [45]:
# mean of ratio of negative_length

np.mean(ratio_)

0.4039883871036081

In [55]:
ratio_array = np.array(ratio_)

In [58]:
# 0.35 이내 sampling 없는 경우 제외하고 ratio_mean
ratio_array[ratio_array!=0].mean()

0.6985242443366114

In [60]:
# 이정도 비율로만 0.35 sampling 군집 존재해
len(ratio_array[ratio_array!=0])/len(ratio_array)

0.5783455483170467

---

### 0.35 이내의 positive의 최장 거리, 그리고 이 최장 거리 이내에 존재하는 negative sample 개수

In [None]:
sampling_index = set(np.argsort(data_renew_distances[ i ])) - set( grp_prod[ prod_grp[index_to_prod[i]] ] )
sampling_index = list(sampling_index)[ :positive_length]

In [65]:
a = np.array([1,2,3,4,5])

In [66]:
a[[0,1]]

array([1, 2])

In [67]:
a[a>3]

array([4, 5])

In [None]:
%%time
### sampling-dict(pair) ###
sampling_dict_wo_balance = {}
#####################

negative_within_ratio = []

start_time = time.time()

for _,i in enumerate(train_index_list):
    #if _ %5000 == 0:
    #    elapsed_time = time.time() - start_time
    #    print( _, ',', _/len(train_index_list) )
    #    print( time.strftime("%H:%M:%S", time.gmtime(elapsed_time)), '\n' )
    
    
    ### 자기자신을 뺀, positive + semi_hard_negative ###
    sampling_index = set(np.where( data_renew_distances[ i ] <= 0.35 )[0]) - set([i])
    
    positive_sample = set(grp_prod[ prod_grp[index_to_prod[i]] ]) - set( [index_to_prod[i] ] )
    positive_index_list = []
    for p in positive_sample:
        positive_index_list.append( prod_to_index[p] )
    
    # positive_index : 0.35 거리 안에 존재하는, positive
    positive_index = set(positive_index_list) & set(sampling_index)
    
    # negative_index : 0.35 거리 안에 존재하는, negative
    negative_index = set(sampling_index) - set(positive_index)
    
    #
    try:
        positive_max = np.max(data_renew_distances[ i ][list(positive_index)] )
        print('positive_max : {}'.format(positive_max))
        negative_distance = data_renew_distances[i][list(negative_index)]
        print(len( negative_distance[ negative_distance < positive_max])/len(negative_index))
        print('\n')
        negative_within_ratio.append( len( negative_distance[ negative_distance < positive_max])/len(negative_index) )
    except:
        negative_within_ratio.append(-1)
    # negative 와 전체 sampling 규모
    #print( 'negative_size : {}, total_sampling_size : {}'.format(len(negative_index), len(sampling_index)) )
    
    
    #positive_length.append( len(positive_index) )
    #negative_length.append( len(negative_index) )
    #sampling_length.append( len(sampling_index) )
    
    
    ### pair 쌍 만들어주기 ###
    #sampling_dict_wo_balance[ index_to_prod[i] ] = []
    
    #for s in sampling_index:
    #    sampling_dict_wo_balance[ index_to_prod[i] ].append( [index_to_prod[i] , index_to_prod[s] ] )

In [83]:
negative_within_ratio_array = np.array(negative_within_ratio)

In [84]:
# 0.35 이내에 존재하는 negative 중, 25% 정도가 positive_max 거리 안에 존재한다
negative_within_ratio_array[negative_within_ratio_array>=0].mean()

0.25953993225504185

---

# make balanced data
- 0.35 기준에 들고, positive 개수와 맞춰진 negative sample(가까운순)

In [104]:
_/len(train_index_list)

0.2625203583061889

In [123]:
%%time
### sampling-dict(pair) ###
sampling_dict = {}
#####################

negative_within_ratio = []

start_time = time.time()

for _,i in enumerate(train_index_list):
    if _ %5000 == 0:
        elapsed_time = time.time() - start_time
        print( _, ',', _/len(train_index_list) )
        print( time.strftime("%H:%M:%S", time.gmtime(elapsed_time)), '\n' )
    
    #################################################
    ### 자기자신을 뺀, positive + semi_hard_negative ###
    sampling_index = set(np.where( data_renew_distances[ i ] <= 0.35 )[0]) - set([i])
    
    positive_sample = set(grp_prod[ prod_grp[index_to_prod[i]] ]) - set( [index_to_prod[i] ] )
    positive_index_list = []
    for p in positive_sample:
        positive_index_list.append( prod_to_index[p] )
    
    # positive_index : 0.35 거리 안에 존재하는, positive
    positive_index = set(positive_index_list) & set(sampling_index)
    
    # negative_index : 0.35 거리 안에 존재하는, negative
    negative_index = set(sampling_index) - set(positive_index)
    
    
    #sampling - negative_index : positive_index 개수에 맞춰준다(그 중에서도 가까운 negative sample)
    sampling_index = set(np.argsort(data_renew_distances[ i ])) - set( positive_index ) # 거리순 index에서 positive 제외
    negative_sampling_index = list(set(sampling_index) & set(negative_index))[:len(positive_index)]
    
    
    #################################################
    ### pair 쌍 만들어주기 - negative / positive ###
    sampling_dict[ index_to_prod[i] ] = []
    
    # negative 데이터 붙여주기
    for s in negative_sampling_index:
        sampling_dict[ index_to_prod[i] ].append( [index_to_prod[i] , index_to_prod[s] ] )
    # positive 데이터 붙여주기
    for p in positive_index:
        sampling_dict[ index_to_prod[i] ].append( [index_to_prod[i] , index_to_prod[p] ] )

0 , 0.0
00:00:00 

5000 , 0.08482627578718784
00:01:28 

10000 , 0.16965255157437567
00:02:57 

15000 , 0.2544788273615635
00:04:26 

20000 , 0.33930510314875134
00:05:54 

25000 , 0.4241313789359392
00:07:23 

30000 , 0.508957654723127
00:08:51 

35000 , 0.5937839305103149
00:10:20 

40000 , 0.6786102062975027
00:11:48 

45000 , 0.7634364820846905
00:13:15 

50000 , 0.8482627578718784
00:14:43 

55000 , 0.9330890336590663
00:16:11 

CPU times: user 17min 21s, sys: 328 ms, total: 17min 21s
Wall time: 17min 21s


----

- save pair index

In [124]:
# save train_pair, valid_pair, test_index
with open('../train_within_0_35_balanced/train_sampling_dict.pkl', 'wb') as f:
    pickle.dump(sampling_dict, f)


---

- sampling_dict 펼치기 : train_pair

In [125]:
train_pair = []

for k,v in sampling_dict.items():
    train_pair.extend(v)

In [126]:
len(train_pair) # 60만에서 30만으로..!

330519

In [127]:
train_pair[:5]

[[155410538, 155398714],
 [155410538, 155399143],
 [155410538, 155399731],
 [155410538, 155407379],
 [155410538, 155407810]]

---
---

### ### step2 : make pair array data

### ### step2.1 : train

### *** multiprocessing

In [129]:
train_pair_chunk = chunkIt(train_pair, 36)

In [130]:
len(train_pair_chunk)

36

---

In [139]:
def parallelize_dataframe(chunk_pair, func):
    #
    #ctx = multiprocessing.get_context("spawn")
    #pool  = ctx.Pool(num_cores)
    pool = multiprocessing.Pool( num_cores )
    stats_list = pool.map(func, chunk_pair) # 리스트 형태로 떨궈준다
    pool.close()
    pool.join()
    print('parallel finished !')
    return stats_list

In [140]:
def make_stats( chunk_pair ):
            
    # input 데이터 정의
    train_pair  = chunk_pair
    
    for i,pair in enumerate( train_pair ):
    
        ###### X_train ######
        # row 생성
        ttt = np.array([])
        ttt = np.append( ttt, data_renew[data_renew.prod_id == pair[0]].iloc[:,3:].values )
        ttt = np.append( ttt, data_renew[data_renew.prod_id == pair[1]].iloc[:,3:].values )
        ttt = ttt.reshape(1, -1)
        
        # row 붙이기
        if i == 0:
            X_train = ttt
            y_train = np.array([])
        else:
            X_train = np.append( X_train, ttt, axis=0 )
        
        ###### y_train ######
        left_grp  = data_renew[data_renew.prod_id == pair[0]].group_no.values[0]
        right_grp = data_renew[data_renew.prod_id == pair[1]].group_no.values[0]
        
        # row 붙이기
        
        if left_grp == right_grp:
            y_train = np.append( y_train, [1] , axis=0 ) # 같다면 True
        else:
            y_train = np.append( y_train, [0] , axis=0 )
    
    return (X_train, y_train)

In [142]:
%%time

num_cores = len(train_pair_chunk)  # 

# df_split은 위에 정의했다
data_parallel = parallelize_dataframe( train_pair_chunk , make_stats )

parallel finished !
CPU times: user 5.59 s, sys: 11.3 s, total: 16.9 s
Wall time: 3min 27s


In [143]:
len(data_parallel)

36

---

- 펴주기

In [144]:
X_train = np.array([])
y_train = np.array([])
for i,(x,y) in enumerate(data_parallel):
    if i == 0:
        X_train = x
        y_train = y
    else:
        X_train = np.append( X_train, x, axis=0 )
        y_train = np.append( y_train, y, axis=0 )

In [145]:
X_train.shape

(330519, 512)

In [146]:
y_train.shape

(330519,)

In [147]:
np.sum(y_train)/y_train.shape[0] # 비율 63% 맞췄다 -> positive가 조금 더 많은 상태

0.6322753003609475

---

- save

In [148]:
os.getcwd()

'/mnt/sda1/myeonggyulee/myeonggyulee/VISION_AI/metric learning/2_svm2+data'

In [149]:
### save array data ###
# train
with open('../train_within_0_35_balanced/X_tain.pkl', 'wb') as f:
    pickle.dump(X_train, f)

with open('../train_within_0_35_balanced/y_train.pkl', 'wb') as f:
    pickle.dump(y_train, f)
