# Dual Autoencoder Collaborative Metric Learning

## 模型基本介绍
Dual Autoencoder Collaborative Metric Learning (DaeCML) 是基于双自编码器的协同度量学习。其主要原理为：将用户数据向量$\vec{v}_{user}$和商品数据向量$\vec{v}_{item}$通过2个不同的编码器同时映射到同一个嵌入空间中。在这个空间中定义一个明确的度量$d$，基于该度量以及训练集中用户和商品的正负关系来优化2个编码器，使其映射出的用户、商品嵌入向量$\vec{e}_{user},\vec{e}_{item}$满足（其中$p$为距离阈值）：  
* 若用户喜欢商品，则$d(\vec{e}_{user},\vec{e}_{item}) < p$.  
* 若用户不喜欢商品，则$d(\vec{e}_{user},\vec{e}_{item}) > p$.  

在此之后，$\vec{e}_{user},\vec{e}_{item}$也会分别通过2个不同的解码器，被映射回$\vec{v}_{user},\vec{v}_{item}$。这一部分可以用于模型的解读，但在工业应用中并没有特别大的必要，因此可以省略（即在损失函数中将重建损失的权重设为0，使其不产生损失）  

## 模型结构
![](./DaeCML.png)

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import math
import os


def sparseEmbed(df, name, num, colIdx):
    """Multi-hot encode a '|'-separated categorical column of ``df`` in place.

    ``df[name]`` is split on '|' into exactly ``num`` token slots. Every
    distinct token gets one indicator column ``<name>_<k>`` (k follows the
    sorted token order), and a row's indicator is 1.0 when the token occurs
    in any of its slots. The new columns are inserted at position ``colIdx``
    and the original column is deleted.

    Args:
        df: DataFrame to modify; it is mutated and also returned.
        name: name of the string column to encode.
        num: number of token slots the split must produce per row.
        colIdx: position at which the first indicator column is inserted.

    Returns:
        The same ``df`` object, with ``name`` replaced by indicator columns.

    Raises:
        ValueError: if the split does not yield exactly ``num`` columns
            (raised by pandas on the column assignment).

    Notes:
        A literal 'nan' token keeps its (all-zero) column but never sets an
        indicator. Rows with fewer than ``num`` tokens are supported: the
        None/NaN padding is ignored.
    """
    embedName = [name + "_" + str(i) for i in range(num)]
    Emptydf = pd.DataFrame()
    Emptydf[embedName] = df[name].str.split('|', expand=True)

    # Build the token matrix once. Reading ``Emptydf.values`` inside the row
    # loop re-assembled the whole array for every single cell.
    tokens = np.asarray(Emptydf[embedName].values, dtype=object)

    # ``expand=True`` pads short rows with None/NaN; np.unique cannot sort an
    # object array that mixes those with strings, so drop the padding first.
    values = np.unique(tokens[pd.notna(tokens)])

    dic = {token: col for col, token in enumerate(values)}
    dic.pop('nan', None)

    appendValue = np.zeros([tokens.shape[0], len(values)])
    for row, row_tokens in enumerate(tokens):
        for token in row_tokens:
            col = dic.get(token)
            if col is not None:
                appendValue[row, col] = 1

    # Insert right-to-left at a fixed position so the columns end up in
    # ascending order starting at colIdx.
    for col in reversed(range(appendValue.shape[1])):
        df.insert(colIdx, name + "_" + str(col), appendValue[:, col])

    del df[name]
    return df

def toDummy(df, name, colIdx):
    """One-hot encode an integer-coded column of ``df`` in place.

    The number of indicator columns equals the number of distinct values of
    ``df[name]`` after conversion to ``str`` (so NaN counts as one value).
    A row whose value equals the integer code ``k`` (0 <= k < width) gets a
    1.0 in column ``<name>_<k>``; any other value (NaN, strings, codes out
    of range) leaves the row all zeros.

    Args:
        df: DataFrame to modify; it is mutated and also returned.
        name: name of the column to encode.
        colIdx: position at which the first indicator column is inserted.

    Returns:
        The same ``df`` object, with ``name`` replaced by indicator columns.
    """
    column = df[name]
    width = len(np.unique(column.values.astype(str)))

    # Lookup keyed by the integer codes themselves, not by observed values.
    code_to_col = {code: code for code in range(width)}

    onehot = np.zeros([column.size, width])
    for row, key in enumerate(column.values):
        if key in code_to_col:
            onehot[row][code_to_col[key]] = 1

    # Insert right-to-left at a fixed position to end up in ascending order.
    for col in reversed(range(width)):
        df.insert(colIdx, name + "_" + str(col), onehot[:, col])

    del df[name]
    return df

def genderDummy(df, name, colIdx):
    """One-hot encode an integer-coded column (e.g. gender) of ``df`` in place.

    The number of indicator columns equals the number of distinct values of
    ``df[name]`` compared by their ``str()`` form, so all NaNs count as a
    single value. A row whose value equals the integer code ``k``
    (0 <= k < width) gets a 1.0 in column ``<name>_<k>``; any other value
    (NaN, strings, codes out of range) leaves the row all zeros.

    Args:
        df: DataFrame to modify; it is mutated and also returned.
        name: name of the column to encode.
        colIdx: position at which the first indicator column is inserted.

    Returns:
        The same ``df`` object, with ``name`` replaced by indicator columns.
    """
    column = df[name]

    # Distinct values by string form. The former extra np.unique() pass was
    # dead code (its result was overwritten) and could raise on columns that
    # mix strings with NaN.
    width = len({str(value) for value in column.values})

    # Lookup keyed by the integer codes themselves, not by observed values.
    code_to_col = {code: code for code in range(width)}

    onehot = np.zeros([column.size, width])
    for row, key in enumerate(column.values):
        if key in code_to_col:
            onehot[row][code_to_col[key]] = 1

    # Insert right-to-left at a fixed position to end up in ascending order.
    for col in reversed(range(width)):
        df.insert(colIdx, name + "_" + str(col), onehot[:, col])

    del df[name]
    return df

## 数据结构以及数据清理
## Importing data and transforming to categorical binary input data form
模型对数据结构的要求是：每1条样本（1行数据），称之为$\vec{v}$，要包括一个用户的所有特征，我们称之为$\vec{v}_{user}$，是$\vec{v}$的一个子向量；同时，还要包括一个商品的所有特征，我们称之为$\vec{v}_{item}$，是$\vec{v}$的另一个子向量；最后还要包括一个label，这个label要么是0要么是1，用于表示这个用户是否喜欢这个商品。因此$\vec{v}$由$\vec{v}_{user},\vec{v}_{item}$和label组成。

In [2]:
# Column layout of the raw file: user features, then item features, then label.
head = [
    "user_age", "user_gender", "user_7_hero", "user_30_hero",
    "user_7_keyword", "user_7_author",
    "item_rate", "item_keyword", "item_author",
    "item_avgTime", "item_numReader", "item_numTime",
    "label",
]
raw = pd.read_csv("./thing.txt", names=head, sep=",", index_col=False)

# Single-valued categorical columns -> one-hot indicator columns.
colIdx = list(raw.columns).index("user_gender")
raw = genderDummy(raw, "user_gender", colIdx)
colIdx = list(raw.columns).index("item_keyword")
raw = toDummy(raw, "item_keyword", colIdx)

# Number of '|'-separated slots per multi-valued column.
numDic = {"user_gender": 1, "user_7_hero": 5, "user_30_hero": 5, "user_7_keyword": 3, "user_7_author": 3, "item_keyword": 1, "item_author": 3}
for i in ("user_7_hero", "user_30_hero", "user_7_keyword", "user_7_author", "item_author"):
    colIdx = list(raw.columns).index(i)
    raw = sparseEmbed(raw, i, numDic[i], colIdx)
    print("finished with", i)

# Rescale each numerical feature into the interval [0, 1].
for i in ("user_age", "item_rate", "item_avgTime", "item_numReader", "item_numTime"):
    min_max_scaler = MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(raw[i].values.astype(float).reshape(-1, 1))
    raw[i] = pd.DataFrame(x_scaled)

# Work on a random subset of 200000 rows (no seed is set here).
raw = raw.sample(200000)

raw.head()

finished with user_7_hero
finished with user_30_hero
finished with user_7_keyword
finished with user_7_author
finished with item_author


Unnamed: 0,user_age,user_gender_0,user_gender_1,user_gender_2,user_gender_3,user_7_hero_0,user_7_hero_1,user_7_hero_2,user_7_hero_3,user_7_hero_4,...,item_author_519,item_author_520,item_author_521,item_author_522,item_author_523,item_author_524,item_avgTime,item_numReader,item_numTime,label
123562,0.266667,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.071777,0.689978,0.344623,1.0
374081,0.28,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.013386,0.798574,0.074388,0.0
151357,0.293333,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.323478,0.259139,0.583314,0.0
377594,0.373333,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.040071,0.148939,0.04153,0.0
197279,0.32,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.113396,0.476278,0.375822,0.0


## 模型部分代码
代码使用Keras编写：因为模型中大多数网络的构型都是MLP，所以Keras已经足够。

### 模型Graph的代码

In [None]:
from keras.models import Model
from keras.layers import Input, Dense, Dropout, Subtract, Lambda, Concatenate, multiply
import keras.backend as K
from keras.losses import mean_squared_error
from sklearn.metrics import roc_auc_score

batch = 2048

# Use the full prepared frame; append .sample(50000) for a quicker experiment.
data = raw

# Separate the label from the features.
dataY = data['label'].values
dataX = data.drop(columns='label').values

# 20% held out for testing, then 25% of the remainder for validation,
# i.e. a 60/20/20 train/validation/test split.
X, Xtest, Y, Ytest = train_test_split(dataX, dataY, test_size=0.2, random_state=42)
Xtrain, Xval, Ytrain, Yval = train_test_split(X, Y, test_size=0.25, random_state=42)

# Position of the first item feature: columns [0, break_index) are user
# features and columns from break_index onwards are item features.
break_index = list(data.columns).index("item_rate")
# Read the width from the frame's shape; ``data.values`` would copy the whole
# frame into a new array just to count columns.
length_total = data.shape[1]
length_p = break_index  # number of user features
length_g = length_total - length_p - 1  # number of item features (label excluded)


def pgSplit(data, idx):
    """Split a 2-D array column-wise at ``idx``.

    Returns a tuple ``(left, right)`` where ``left`` holds the columns before
    ``idx`` (person/user features in this notebook) and ``right`` holds the
    columns from ``idx`` onwards (goods/item features).
    """
    return data[:, :idx], data[:, idx:]

# Person (user) / goods (item) feature blocks for each data split.
Xtrain_p, Xtrain_g = pgSplit(Xtrain, break_index)
Xval_p, Xval_g = pgSplit(Xval, break_index)
Xtest_p, Xtest_g = pgSplit(Xtest, break_index)

# Network hyperparameters. These are plain module-level names; the former
# ``global`` statements had no effect at module scope and were removed.

# Width multiplier applied to every hidden layer.
a = 1

num_encode_1 = int(256 * a)
num_encode_2 = int(128 * a)
num_encode_3 = int(64 * a)
num_neck = 3  # dimension of the shared embedding (bottleneck) space
# The decoder mirrors the encoder.
num_decode_1 = num_encode_3
num_decode_2 = num_encode_2
num_decode_3 = num_encode_1
# Reconstruction layers restore the original feature widths.
num_output_to_p = length_p
num_output_to_g = length_g
# Distance threshold p used by the distance loss and the metrics.
threshold = 0.5

def _dropout_dense(units, tensor, rate=0.2):
    """Apply dropout to ``tensor`` and feed it to a new ReLU Dense layer."""
    return Dense(units, activation='relu')(Dropout(rate)(tensor))


label = Input(shape=(1,))

## person autoencoder
main_p_input = Input(shape=(length_p,))
encode_p_1 = Dense(num_encode_1, activation='relu')(main_p_input)
encode_p_2 = _dropout_dense(num_encode_2, encode_p_1)
encode_p_3 = _dropout_dense(num_encode_3, encode_p_2)
# Sigmoid keeps every embedding coordinate inside (0, 1).
encode_p_neck = Dense(num_neck, activation='sigmoid')(encode_p_3)
decode_p_1 = _dropout_dense(num_decode_1, encode_p_neck)
decode_p_2 = _dropout_dense(num_decode_2, decode_p_1)
decode_p_3 = _dropout_dense(num_decode_3, decode_p_2)

## goods autoencoder (same layout, separate weights)
main_g_input = Input(shape=(length_g,))
encode_g_1 = Dense(num_encode_1, activation='relu')(main_g_input)
encode_g_2 = _dropout_dense(num_encode_2, encode_g_1)
encode_g_3 = _dropout_dense(num_encode_3, encode_g_2)
encode_g_neck = Dense(num_neck, activation='sigmoid')(encode_g_3)
decode_g_1 = _dropout_dense(num_decode_1, encode_g_neck)
decode_g_2 = _dropout_dense(num_decode_2, decode_g_1)
decode_g_3 = _dropout_dense(num_decode_3, decode_g_2)

###### Output layers (2 of 4: the reconstruction heads)
# Reconstruction layer, person
output_p_out = Dense(num_output_to_p, activation='sigmoid', name="p")(decode_p_3)

# Reconstruction layer, goods
output_g_out = Dense(num_output_to_g, activation='sigmoid', name="g")(decode_g_3)

# Covariance Layer
def CovLayer(X):
    """Return the decorrelation penalty of a batch of embeddings.

    ``X`` is a 2-D float tensor whose rows are samples and whose columns are
    embedding dimensions. The columns are mean-centred, the covariance matrix
    is formed, its diagonal (the variances) is zeroed, and the mean of the
    *squared* remaining entries is returned as a tensor of shape ``[1]``.

    The previous version averaged the signed off-diagonal covariances, so
    positive and negative entries cancelled each other and the value could
    be pushed below zero. Squaring makes the penalty non-negative and zero
    exactly when all pairs of dimensions are uncorrelated.
    """
    n_rows = tf.cast(tf.shape(X)[0], tf.float32)
    centered = X - tf.reduce_mean(X, axis=0)
    cov = tf.matmul(centered, centered, transpose_a=True) / n_rows
    # Zero the variances. The diagonal length is taken from ``cov`` itself,
    # so the function no longer depends on the global ``num_neck``.
    off_diag = tf.matrix_set_diag(cov, tf.zeros_like(tf.matrix_diag_part(cov)))
    return tf.reshape(tf.reduce_mean(tf.square(off_diag)), [1])

# Stack user and item embeddings along the batch axis so the covariance is
# computed over all embedded points together.
concat_layer = Concatenate(axis=0)([encode_p_neck, encode_g_neck])
covLayer = Lambda(CovLayer, name="cov")(concat_layer) # Just a scalar layer

# Distance Layer (the sign is applied later, inside the distance loss)
def DisLayer(distance):
    """Return the per-row L-infinity norm of ``distance`` as a column.

    ``distance`` is the element-wise difference between user and item
    embeddings; the result has one value per row, reshaped to ``(-1, 1)``.
    """
    linf_per_row = tf.norm(distance, ord=np.inf, axis=1)
    return tf.reshape(linf_per_row, (-1, 1))

distance = Subtract()([encode_p_neck, encode_g_neck])
disLayer = Lambda(DisLayer, name="dist")(distance)





## 损失方程 Loss function:
这个模型一共有3个损失方程：  

$$
\begin{aligned}
    \mathcal{L}_{reconstruct} &= ||\vec{v}_{user,item}-\hat{\vec{v}}_{user,item}||_2^2\\
    \mathcal{L}_{cov} &= \frac{1}{N}\left(||\mathrm{Cov}(E)||_F^2 - ||\mathrm{diag}(\mathrm{Cov}(E))||_2^2\right)\\
    \mathcal{L}_{neck} &= \max\{0,\ \lambda_{margin}\,p+(2y_{label}-1)(||\vec{e}_{user}-\vec{e}_{item}||_\infty-p)\}\\
    \mathcal{L} &= \alpha\mathcal{L}_{neck}+\beta\mathcal{L}_{reconstruct}+\gamma\mathcal{L}_{cov}
\end{aligned}
$$

* 重建损失 (reconstruct loss，第一个)

这一损失很好理解，就是经过编码器编码解码后再次还原回来的数据和原来的数据相差多少。此处，我们使用的是标准的MSE损失即可。

* 协方差损失（covariance loss，第二个)

该损失旨在让$\vec{v}_{user},\vec{v}_{item}$在低维度量空间中的嵌入表示$\vec{e}_{user},\vec{e}_{item}$更好地利用空间。假设嵌入空间的维度为2（实际上维度比这个会高不少，但为了方便展示原理，此处以2维作图示范），那么嵌入的用户和商品可能会出现图3.4. i.2a的情况。从这种情况不难看出，嵌入的表示并没有很好地充满整个2维空间，由于巧合而导致的高共线性的嵌入会极度浪费空间，因而导致调参时不断调高嵌入空间的维度，以获得更大的嵌入空间，最终一定会造成严重的过拟合状况。因此，此处的损失方程，就是为了使更多的嵌入点能够尽可能均匀分布，呈现下左图的状况。其中，$E$为嵌入后所有的$\vec{e}_{user},\vec{e}_{item}$组成的全嵌入数据矩阵，其协方差矩阵$\mathrm{Cov}(E)$通过先对各维度做均值中心化，再用中心化矩阵的转置乘以其自身并除以样本数得到。  
![](./covLoss.png)

* 嵌入距离损失 (bottleneck loss，第三个)

这一部分损失发生于嵌入的低维度量空间。既然是度量空间，在这一空间中我们就需要相应地定义一个度量，来测量各个$\vec{e}_{user},\vec{e}_{item}$之间的距离关系。此时，定义一个适合该任务的度量对于模型取得更好的表现是非常关键的，首先考虑到以下几点：  
1) 定义的度量必须要满足正式数学中对度量的各个要求
2) 定义的度量要尽可能在低维空间中，在保证目标点与点距离关系的前提下，嵌入更多的点
3) 定义的度量不可是图Graph类的距离（比如说汉明距离），其必须由欧式度量通过计算得到（神经网络计算向量的像位于欧式空间），因为需要计算深度学习反向传播过程中的梯度数值。因此，这里对于设计合适度量的要求就是非常苛刻的了。传统的$l_1$,$l_2$,甚至$l_\infty$,$l_p$距离会面临一个非常严峻的问题，那就是只有在高维度的时候，其才能够嵌入更多的点（例如：使用最简单的$l_2$欧式距离，在1维上只能嵌入2个相互距离相等的点，在2维上能嵌入3个相互距离相等的点，在3维上能嵌入4个相互距离相等的点……嵌入能力随维度增加而线性增加），然而就像之前所提到的，在增大维度的时候会遇到严峻的过拟合问题，于是导致模型无法正常工作。相比之下，一般图算法中所提到的“最短路径距离”可以直接突破维度的限制，因此才能有非常好的嵌入能力。

因此这里使用uniform metric：
$$d'(\vec{x},\vec{y}) = \min\{a, d(\vec{x},\vec{y})\},(a>0)$$
同时$d(\vec{x},\vec{y})$为$l_\infty$距离：
$$d(\vec{x},\vec{y}) = \sup_i\{|x_i-y_i|\}$$

之后通过一系列推导可以得到，优化uniform metric的MSE其实等价于优化$l_\infty$距离的hinge loss，因此这里不过多赘述。

In [3]:
margin = 0.6 ## Tunable hyperparameter \lambda_{margin}; the distance loss multiplies it by `threshold`

###### Define the 4 losses (one per model output)
#loss 1: reconstruction loss for person
   ## MSE
    
#loss 2: reconstruction loss for goods
   ## MSE
    
#loss 3: covariance loss for Covariance Layer
def covarianceLoss(zeroCovariance, Cov_Layer):
    """Loss for the covariance output: the layer's own value.

    Args:
        zeroCovariance: the target passed for this output; it is not used.
        Cov_Layer: the covariance layer output, which is already the penalty.

    Returns:
        ``Cov_Layer - 0``, i.e. the layer output unchanged in value.
    """
    return Cov_Layer - 0

#loss 4: hinge loss on the embedding distance, for the Distance Layer
def distanceLoss(label, dis_Layer):
    """Mean hinge loss that pulls liked pairs together and pushes others apart.

    ``label`` holds 0/1 targets and ``dis_Layer`` the embedding distances.
    With ``direction = 2*label - 1`` (+1 for label 1, -1 for label 0), each
    sample contributes
    ``max(0, margin*threshold + direction*(distance - threshold))``:
    label-1 pairs are penalised until their distance is below
    ``threshold*(1 - margin)``, label-0 pairs until it is above
    ``threshold*(1 + margin)``.
    """
    direction = 2 * label - 1
    signed_gap = tf.multiply(direction, dis_Layer - threshold)
    hinge = tf.maximum(0.0, margin * threshold + signed_gap)
    return tf.reduce_mean(hinge)



## 度量
评估指标采用基本的AUC和Accuracy。

In [None]:
## Metric 
def AUC(label, disLayer):
    """Streaming AUC of the thresholded distance prediction.

    A pair is predicted positive (1.0) when its distance is at most
    ``threshold``. The second element returned by ``tf.metrics.auc`` is
    returned, after the TF local variables have been initialised in the
    Keras session.

    Args:
        label: ground-truth 0/1 labels.
        disLayer: output of the distance layer.
    """
    output = K.cast(tf.less_equal(disLayer, threshold), tf.float32)
    # tf.metrics.auc expects (labels, predictions); the two were previously
    # passed the other way round. Keywords make the order explicit.
    auc = tf.metrics.auc(labels=label, predictions=output)[1]
    K.get_session().run(tf.local_variables_initializer())
    # NOTE(review): the AUC is computed on hard 0/1 predictions, so it
    # reflects a single operating point; consider scoring with a continuous
    # value derived from the distance instead.
    return auc

def Accuracy(label, disLayer):
    """Streaming accuracy of the thresholded distance prediction.

    A pair is predicted positive (1.0) when its distance is at most
    ``threshold``. The second element returned by ``tf.metrics.accuracy`` is
    returned, after the TF local variables have been initialised in the
    Keras session.

    Args:
        label: ground-truth 0/1 labels.
        disLayer: output of the distance layer.
    """
    output = K.cast(tf.less_equal(disLayer, threshold), tf.float32)
    # tf.metrics.accuracy expects (labels, predictions). The value is the
    # same either way round, but keywords keep the call unambiguous.
    accuracy = tf.metrics.accuracy(labels=label, predictions=output)[1]
    K.get_session().run(tf.local_variables_initializer())
    return accuracy

## 训练！

In [None]:
# One loss per named output layer.
losses = {
    "p": 'mse',
    "g": 'mse',
    "cov": covarianceLoss,
    "dist": distanceLoss,
}

# Only the distance loss contributes to the total with these weights.
weights = {
    "p": 0,
    "g": 0,
    "cov": 0,
    "dist": 1,
}

metric = {"dist": [AUC, Accuracy]}

# Zero targets for the covariance output, one entry per sample.
zero_train = np.zeros((Xtrain_p.shape[0],))
zero_val = np.zeros((Xval_p.shape[0],))

model = Model(
    inputs=[main_p_input, main_g_input, label],
    outputs=[output_p_out, output_g_out, covLayer, disLayer],
)
model.compile(optimizer='RMSProp', loss=losses, loss_weights=weights, metrics=metric)

# Targets follow the order of the model outputs: p, g, cov, dist.
train_inputs = [Xtrain_p, Xtrain_g, Ytrain]
train_targets = [Xtrain_p, Xtrain_g, zero_train, Ytrain]
val_inputs = [Xval_p, Xval_g, Yval]
val_targets = [Xval_p, Xval_g, zero_val, Yval]

model.fit(
    train_inputs,
    train_targets,
    validation_data=(val_inputs, val_targets),
    epochs=50,
    batch_size=batch,
    verbose=2,
)