# Dual-Collaborative Filtering Autoencoder Metric Network

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import math
import os


def sparseEmbed(df, name, num, colIdx):
    embedName = [ name+"_"+str(i) for i in range(num)] 
    Emptydf = pd.DataFrame()
    Emptydf[embedName] = df[name].str.split('|',expand=True)
    values = np.unique(Emptydf[embedName].values)
    
    dic = {}
    a = 0
    for i in values:
        dic[i] = a
        a += 1
    dic.pop('nan', None)
    
    
    appendValue = np.zeros([Emptydf.values.shape[0], len(values)])
    for i in range(Emptydf.values.shape[0]):
        for j in range(num):
            key = Emptydf.values[i][j]
            if key in dic:
                appendValue[i][dic[key]] = 1
    
    for i in range(appendValue.shape[1], 0, -1):
        df.insert(colIdx, name+"_"+str(i-1), appendValue[:, i-1])
    
    del df[name]
    return df

def toDummy(df, name, colIdx):
    num = len(np.unique(df[name].values.astype(str)))-1
    embedName = [ name+"_"+str(i) for i in range(num)]  # don't need nan value
        
    dic = {}
    a = 0
    for i in range(num+1):
        dic[i] = a
        a += 1
    dic.pop('nan', None)
        
    appendValue = np.zeros([df[name].size, a])
    for i in range(df[name].size):
        key = df[name].values[i]
        if key in dic:
            appendValue[i][dic[key]] = 1
    
    for i in range(appendValue.shape[1], 0, -1):
        df.insert(colIdx, name+"_"+str(i-1), appendValue[:, i-1])
    
    del df[name]
    return df

def genderDummy(df, name, colIdx):
    pool = set()
    num = len(np.unique(df[name].values))-1
    for i in df[name].values:
        pool.add(str(i))
    num = len(list(pool))-1
    embedName = [ name+"_"+str(i) for i in range(num)]  # don't need nan value
        
    dic = {}
    a = 0
    for i in range(num+1):
        dic[i] = a
        a += 1
    dic.pop('nan', None)
        
    appendValue = np.zeros([df[name].size, a])
    for i in range(df[name].size):
        key = df[name].values[i]
        if key in dic:
            appendValue[i][dic[key]] = 1
    
    for i in range(appendValue.shape[1], 0, -1):
        df.insert(colIdx, name+"_"+str(i-1), appendValue[:, i-1])
    
    del df[name]
    return df

# Importing data and transforming to categorical binary input data form

In [2]:
head = ["user_age", "user_gender", "user_7_hero", "user_30_hero", "user_7_keyword", "user_7_author", "item_rate", "item_keyword", "item_author", "item_avgTime", "item_numReader", "item_numTime", "label"]
raw = pd.read_csv("./thing.txt", names=head, sep=",", index_col = False)

"""
raw_1 = pd.read_csv("./zhangmeng_1.txt", names=head, sep=",", index_col = False)
raw_2 = pd.read_csv("./zhangmeng_2.txt", names=head, sep=",", index_col = False)
raw_3 = pd.read_csv("./zhangmeng_3.txt", names=head, sep=",", index_col = False)
raw_4 = pd.read_csv("./zhangmeng_4.txt", names=head, sep=",", index_col = False)
raw_5 = pd.read_csv("./zhangmeng_5.txt", names=head, sep=",", index_col = False)
raw = pd.concat([raw_1, raw_2, raw_3, raw_4, raw_5], ignore_index=True)

"""

colIdx = raw.columns.values.tolist().index("user_gender")
raw = genderDummy(raw, "user_gender", colIdx)
colIdx = raw.columns.values.tolist().index("item_keyword")
raw = toDummy(raw, "item_keyword", colIdx)

numDic = {"user_gender": 1, "user_7_hero": 5, "user_30_hero": 5, "user_7_keyword": 3, "user_7_author": 3, "item_keyword": 1, "item_author": 3}
for i in ["user_7_hero", "user_30_hero", "user_7_keyword", "user_7_author", "item_author"]:
    colIdx = raw.columns.values.tolist().index(i)
    raw = sparseEmbed(raw, i, numDic[i], colIdx)
    print("finished with", i)

# normalize numerical features into interval [0, 1]
for i in ["user_age", "item_rate", "item_avgTime", "item_numReader", "item_numTime"]:
    r = raw[i].values.astype(float)
    min_max_scaler = MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(r.reshape(-1,1))
    raw_normalized = pd.DataFrame(x_scaled)
    raw[i] = raw_normalized

raw = raw.sample(200000)
    
raw.head()

finished with user_7_hero
finished with user_30_hero
finished with user_7_keyword
finished with user_7_author
finished with item_author


Unnamed: 0,user_age,user_gender_0,user_gender_1,user_gender_2,user_gender_3,user_7_hero_0,user_7_hero_1,user_7_hero_2,user_7_hero_3,user_7_hero_4,...,item_author_519,item_author_520,item_author_521,item_author_522,item_author_523,item_author_524,item_avgTime,item_numReader,item_numTime,label
367932,0.266667,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0477,0.592828,0.196775,0.0
423931,0.28,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.075289,0.135103,0.070782,0.0
95450,0.44,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.047768,0.144867,0.048154,1.0
24667,0.253333,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.076805,0.004367,0.002334,0.0
407330,0.346667,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.056155,0.148625,0.058077,1.0


In [8]:
from keras.models import Model
from keras.layers import Input, Dense, Dropout, Subtract, Lambda, Concatenate, multiply
import keras.backend as K
from keras.losses import mean_squared_error
from sklearn.metrics import roc_auc_score

batch = 1024

data = raw#.sample(50000)

# Splitting dataframe into train, validation, and testing
dataY = data['label'].values
dataX = data.drop(columns = 'label').values


X, Xtest, Y, Ytest = train_test_split(dataX, dataY, test_size = 0.2, random_state = 42)
Xtrain, Xval, Ytrain, Yval = train_test_split(X, Y, test_size = 0.25, random_state = 42)


break_index = data.columns.values.tolist().index("item_rate") # first item index-1 is the break index
length_total = data.values.shape[1]
length_p = break_index # index of last user feature into length of the user feature
length_g = length_total-length_p-1


def pgSplit(data, idx):
    data_p = data[:, :idx]
    data_g = data[:, idx:]
    return data_p, data_g

Xtrain_p, Xtrain_g = pgSplit(Xtrain, break_index)
Xval_p, Xval_g = pgSplit(Xval, break_index)
Xtest_p, Xtest_g = pgSplit(Xtest, break_index)

a = 4.5
global num_encode_1
global num_encode_2
global num_encode_3
global num_neck
global num_decode_1
global num_decode_2
global num_decode_3
global num_output_to_p
global num_output_to_g
global threshold

num_encode_1 = int(256 *a)
num_encode_2 = int(128 *a)
num_encode_3 = int(64 *a)
num_neck = 30
num_decode_1 = num_encode_3
num_decode_2 = num_encode_2
num_decode_3 = num_encode_1
num_output_to_p = length_p
num_output_to_g = length_g
threshold = 0.5 * math.sqrt(num_neck)

label = Input(shape=(1,))

## person autoencoder
main_p_input = Input(shape=(length_p,))
encode_p_1 = Dense(num_encode_1, activation='relu')(main_p_input)
encode_p_2 = Dense(num_encode_2, activation='relu')(Dropout(0.05)(encode_p_1))
encode_p_3 = Dense(num_encode_3, activation='relu')(Dropout(0.05)(encode_p_2))
encode_p_neck = Dense(num_neck, activation= 'sigmoid')(encode_p_3) ###
decode_p_1 = Dense(num_decode_1, activation='relu')(Dropout(0.05)(encode_p_neck))
decode_p_2 = Dense(num_decode_2, activation='relu')(Dropout(0.05)(decode_p_1))
decode_p_3 = Dense(num_decode_3, activation='relu')(Dropout(0.05)(decode_p_2))

## goods autoencoder
main_g_input = Input(shape=(length_g,))
encode_g_1 = Dense(num_encode_1, activation='relu')(main_g_input)
encode_g_2 = Dense(num_encode_2, activation='relu')(Dropout(0.05)(encode_g_1))
encode_g_3 = Dense(num_encode_3, activation='relu')(Dropout(0.05)(encode_g_2))
encode_g_neck = Dense(num_neck, activation= 'sigmoid')(encode_g_3) ###
decode_g_1 = Dense(num_decode_1, activation='relu')(Dropout(0.05)(encode_g_neck))
decode_g_2 = Dense(num_decode_2, activation='relu')(Dropout(0.05)(decode_g_1))
decode_g_3 = Dense(num_decode_3, activation='relu')(Dropout(0.05)(decode_g_2))



###### Define 4 output layers
# Reconstruction Layer person
output_p_out = Dense(num_output_to_p, activation= 'sigmoid', name = "p")(decode_p_3)

# Reconstruction Layer goods
output_g_out = Dense(num_output_to_g, activation= 'sigmoid', name = "g")(decode_g_3)

# Covariance Layer
def CovLayer(X):
    n_rows = tf.cast(tf.shape(X)[0], tf.float32)
    X = X - (tf.reduce_mean(X, axis=0))
    cov = tf.matmul(X, X, transpose_a=True) / n_rows
    return tf.reshape(tf.reduce_mean(tf.matrix_set_diag(cov, tf.zeros(num_neck, tf.float32))), [1])

concat_layer = Concatenate(axis=0)([encode_p_neck, encode_g_neck])
covLayer = Lambda(CovLayer, name="cov")(concat_layer) # Just a scalar layer

# Signed Distance Layer
def DisLayer(distance):
    return tf.reshape(tf.norm(distance, axis=1), (-1,1))

distance = Subtract()([encode_p_neck, encode_g_neck])
disLayer = Lambda(DisLayer, name="dist")(distance)


###### Define 3 loss
#loss 1: reconstruction loss for person
   ## MSE
    
#loss 2: reconstruction loss for goods
   ## MSE
    
#loss 3: covariance loss for Covariance Layer
def covarianceLoss(zeroCovariance, Cov_Layer):
    return Cov_Layer - 0

#loss 4: distance loss for Distance Layer
def distanceLoss(label, dis_Layer):
    sign = 2*label-1
    return tf.reduce_mean(tf.maximum(0.0, 0.6*threshold+tf.multiply(sign, dis_Layer-threshold)))

## Metric 
def AUC(label, disLayer):
    output = K.cast(tf.less_equal(disLayer, threshold), tf.float32)
    auc = tf.metrics.auc(output, label)[1]
    K.get_session().run(tf.local_variables_initializer())
    return auc

def Accuracy(label, disLayer):
    output = K.cast(tf.less_equal(disLayer, threshold), tf.float32)
    accuracy = tf.metrics.accuracy(output, label)[1]
    K.get_session().run(tf.local_variables_initializer())
    return accuracy



losses = {"p": 'mse',
          "g": 'mse',
          "cov": covarianceLoss,
          "dist": distanceLoss}

weights = {"p": 0.25,
          "g": 0.25,
          "cov": 0.2,
          "dist": 1}

metric = {"dist": [AUC, Accuracy]}


zero_train = np.zeros((Xtrain_p.shape[0],))
zero_val = np.zeros((Xval_p.shape[0],))

model = Model(inputs= [main_p_input, main_g_input, label], outputs = [output_p_out, output_g_out, covLayer, disLayer])
model.compile(optimizer='RMSProp', loss=losses, loss_weights=weights, metrics = metric)
model.fit([Xtrain_p, Xtrain_g, Ytrain], [Xtrain_p, Xtrain_g, zero_train, Ytrain], validation_data=([Xval_p, Xval_g, Yval], [Xval_p, Xval_g, zero_val, Yval]), epochs=100, batch_size=batch)


Train on 120000 samples, validate on 40000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/

<keras.callbacks.History at 0x14810c4f470>

In [33]:
zero_train = np.zeros((Xtrain_p.shape[0],))
Ytrain_raw = model.predict([Xtrain_p, Xtrain_g, zero_train])[3]
Ytrain_predicted = np.less(Ytrain_raw, threshold)
print(roc_auc_score(Ytrain, Ytrain_predicted))


0.6374261508744934

In [9]:
model.save_weights("model_a=4dot5_neck=30.h5")

In [30]:
threshold

2.7386127875258306

array([1., 1., 0., ..., 0., 0., 1.])