# xDeepFM
Combining Explicit and Implicit Feature Interactions for RecSys (KDD 2018)

## 1. Set-up
import dependent packages and declare consts

In [1]:
# data handling
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

# evaluation metrics & dataset processing
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

# DeepCTR
from deepctr_torch.inputs import SparseFeat, VarLenSparseFeat, get_feature_names, combined_dnn_input
from deepctr_torch.models import xDeepFM
from deepctr_torch.models.basemodel import BaseModel
from deepctr_torch.layers import DNN

In [2]:
# setup computing device for pytorch
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [3]:
# consts

# dataset path
DATASET1 = '../../data/extracted/LON-A/London_Attractions_Complete_Review.csv'
DATASET2 = '../../data/extracted/NYC-R/New_York_City_Restaurant_Complete_Review.csv'

OCCURENCE_THRESHOLD = 5

* Variable length columns * 3
  - ustyle, iattribute, itag
* Sparse columns * 8/9
  - uage, ugender, ucity, ucountry, uid_index, ulevel, iid, irating, iprice(NYC only)

In [4]:
# dataset columns: user, item, rating
user_columns = ['uage', 'ugender', 'ucity', 'ucountry', 'uid_index', 'ulevel', 'ustyle']
LON_item_columns = ['iid', 'iattribute', 'irating', 'itag']
NYC_item_columns = ['iid', 'iattribute', 'iprice', 'irating', 'itag']
rating_columns = ['rrate', 'rid']

# dataset columns: sparse (one value) / variable length (multiple values)
LON_sparse_features = ["uage", "ugender", "ucity", "ucountry", "uid_index", "ulevel", 'iid', 'irating']
NYC_sparse_features = ["uage", "ugender", "ucity", "ucountry", "uid_index", "ulevel", 'iid', 'irating', 'iprice']
var_sparse_features = ['ustyle', 'iattribute', 'itag']

## 2. Data Preprocessing

* Retain users/items with at least five ratings only
* Data splitting
  - the latest 20% interactions (by time)
  - Randomly split the remaining data into training (70%) and validation (10%) sets
* Transform the ratings into binary implicit feedback as ground truth, indicating whether the user has interacted with the specific item
* Transform SparseFeat(single value) into categorical data
* Transform VarLenSparseFeat(multiple values) columns into muliple columns of categorical data
* Then embed categorical data

In [5]:
def sort_by_time(df):
    return df.sort_values(by=['rid'], ascending=True)

def filter_by_occurrence(df, column, threshold):
    return df.groupby(column).filter(lambda x: len(x) >= OCCURENCE_THRESHOLD)

def convert_binary(df):
    df.loc[df['rrate'] != "None", 'rrate'] = 1
    df.loc[df['rrate'] == "None", 'rrate'] = 0
    return df

def add_var_column(df, column):
    key2index = {}
    
    def split(x):
        key_ans = x.split(',')
        for key in key_ans:
            if key not in key2index:
                key2index[key] = len(key2index) + 1 # index starts from 1
        return list(map(lambda x: key2index[x], key_ans))
    
    # remove unnecessary characters
    df[column] = df[column].str.replace('[', '').str.replace(']', '').str.replace(', ', ',')
    column_list = list(map(split, df[column].values))
    column_length = np.array(list(map(len, column_list)))
    column_maxlen = max(column_length)
    column_list = pad_sequences(column_list, maxlen=column_maxlen, padding='post', )
    df = pd.concat([df, pd.DataFrame(column_list).add_prefix(str(column))], axis=1)
    
    return df, key2index, column_maxlen

def split_df(df):
    df['rating_cumcounts'] = df.groupby(['uid_index'])['rid'].rank(method='first', ascending=True)
    tmp = df.groupby('uid_index').size().rename('total_counts')
    df = df.join(tmp, on='uid_index', rsuffix='_r')
    train_df = df.loc[df['rating_cumcounts'] < (df['total_counts']*0.8)]
    test_df = df.loc[df['rating_cumcounts'] >= (df['total_counts']*0.8)]
    train_df, validation_df = train_test_split(train_df, test_size=0.1, random_state=1)
    
    return train_df, validation_df, test_df

In [6]:
def dataset(DATASET = 'LON'):
    
    if DATASET == 'LON':
        df = pd.read_csv(DATASET1, sep='\t')[user_columns + LON_item_columns + rating_columns].fillna('NaN')
        sparse_features = LON_sparse_features
    else:
        df = pd.read_csv(DATASET2, sep='\t')[user_columns + NYC_item_columns + rating_columns].fillna('NaN')
        sparse_features = NYC_sparse_features
    
    # sort by time (ascending order)
    df = sort_by_time(df)
    
    # retain users/items with at least five ratings only
    df = filter_by_occurrence(df, 'uid_index', 5)
    df = filter_by_occurrence(df, 'iid', 5)
    
    # convert ratings into binarys
    df = convert_binary(df)
    df = df.reset_index(drop=True)
    
    # transform columns with single values into category. e.g. 'male' => 1, 'female' => 2
    for feat in sparse_features:
        lbe = LabelEncoder()
        df[feat] = lbe.fit_transform(df[feat].astype('str'))

    # add variable length categorical columns to dataframe
    column_dict_list, column_maxlen_list = [], []
    for column in var_sparse_features:
        df, column_dict, column_maxlen = add_var_column(df, column)
        column_dict_list.append(column_dict)
        column_maxlen_list.append(column_maxlen)
        
    
    fixlen_feature_columns = [SparseFeat(feat, df[feat].nunique(), embedding_dim=4)
                              for feat in LON_sparse_features]
    
    # note: vocabulary need to add 1: 0 cannot be used
    varlen_feature_columns = [VarLenSparseFeat(SparseFeat(feat, vocabulary_size=len(column_dict_list[i]) + 1,
                                  embedding_dim=4), maxlen=column_maxlen_list[i], combiner='mean',) 
                                  for i, feat in enumerate(var_sparse_features)]  
    linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
    #feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
    
    train_df, val_df, test_df = split_df(df)
    
    
    # train set
    train_model_input = {name: train_df[name] for name in sparse_features} 
    for i, feat in enumerate(var_sparse_features):
        train_model_input[feat] = train_df.filter(regex='^'+feat+'.+',axis=1).values
    
    # validation set
    val_model_input = {name: val_df[name] for name in sparse_features} 
    for i, feat in enumerate(var_sparse_features):
        val_model_input[feat] = val_df.filter(regex='^'+feat+'.+',axis=1).values
    
    # test set
    test_model_input = {name: test_df[name] for name in sparse_features}
    for i, feat in enumerate(var_sparse_features):
        test_model_input[feat] = test_df.filter(regex='^'+feat+'.+',axis=1).values
    
    train_y, val_y, test_y = train_df['rrate'].values, val_df['rrate'].values, test_df['rrate'].values
        
    return train_model_input, train_y, val_model_input, val_y, test_model_input, test_y, linear_feature_columns, dnn_feature_columns

In [13]:
# data = dataset('LON')
data = dataset('NYC')

train_model_input, train_y, val_model_input, val_y, test_model_input, test_y = data[:6]
linear_feature_columns, dnn_feature_columns = data[6:8]

train_y.shape, val_y.shape, test_y.shape

((79986,), (8888,), (36017,))

## 3. Define xDeepFM Model

to tuning model parameters, check [DeepCTR-Torch](https://github.com/shenweichen/DeepCTR-Torch/tree/master/deepctr_torch/models)

In [8]:
model = xDeepFM(linear_feature_columns, dnn_feature_columns, task='binary', device=device)
model.compile("adam", "binary_crossentropy", metrics=['logloss'], )


## 4. Training

In [9]:
history = model.fit(train_model_input, train_y, batch_size=512, epochs=3, verbose=2,
                    validation_data=(val_model_input, val_y), use_double=True)

cuda:0
Train on 87440 samples, validate on 9716 samples, 171 steps per epoch
Epoch 1/3
10s - loss:  0.1968 - logloss:  0.1966 - val_logloss:  0.0350
Epoch 2/3
9s - loss:  0.0279 - logloss:  0.0279 - val_logloss:  0.0348
Epoch 3/3
9s - loss:  0.0164 - logloss:  0.0164 - val_logloss:  nan


  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


## 5. Prediction & Evaluation

In [8]:
def evaluate_auc(z, y):
    return metrics.roc_auc_score(y, z)

# assume parameters z & y are ndarray
def evaluate_logloss(z, y):
    zf = z.flatten()
    zz = np.ones((zf.shape[0], 2))
    zz[:, 0] -= zf
    zz[:, 1] = zf
    return metrics.log_loss(y, zz)

# assume parameters z & y are ndarray
def evaluate_ndcg(z, y):
    return metrics.ndcg_score(np.expand_dims(y, axis=0), z.flatten().reshape((1, -1)), k=5)

In [14]:
preds = model.predict(test_model_input)

In [15]:
print('Testing AUC: ', evaluate_auc(preds, test_y))
print('Testing LogLoss: ', evaluate_logloss(preds.astype('float64'), test_y))
print("Testing NDCG@5: ", evaluate_ndcg(preds, test_y))

Testing AUC:  0.9975543182155988
Testing LogLoss:  0.05779955043344312
Testing NDCG@5:  0.9999999999999999


## Experiments

In [9]:
EPOCH = 3

In [10]:
def train_cin(cin_list):
    
    history = []
    for layer_size in cin_list:
        
        print("Using CIN layer sizes: ", repr(layer_size))
        model = xDeepFM(linear_feature_columns, dnn_feature_columns, task='binary', device=device, cin_layer_size=layer_size)
        model.compile("adam", "binary_crossentropy", metrics=['logloss'], )
        model.fit(train_model_input, train_y, batch_size=512, epochs=EPOCH, verbose=2,
                            validation_data=(val_model_input, val_y), use_double=True)
        
        test_preds = model.predict(test_model_input)
        val_preds = model.predict(val_model_input)
        
        history.append({
            'cin_layer_size': layer_size,
            'val_auc': evaluate_auc(val_preds, val_y),
            'test_auc': evaluate_auc(test_preds, test_y),
            'val_logloss': evaluate_logloss(val_preds.astype('float64'), val_y),
            'test_logloss': evaluate_logloss(test_preds.astype('float64'), test_y),
            'val_ndcg': evaluate_ndcg(val_preds, val_y),
            'test_ndcg': evaluate_ndcg(test_preds, test_y)
        })
        
    return history

In [14]:
history = train_cin([
    (256, 128,),
    (256, 128, 64),
    (128, 64,),
    (128, 64, 32),
])

Using CIN layer sizes:  (256, 128)
cuda:0
Train on 79986 samples, validate on 8888 samples, 157 steps per epoch
Epoch 1/3


  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


8s - loss:  0.2935 - logloss:  0.2921 - val_logloss:  nan
Epoch 2/3


  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


9s - loss:  0.0091 - logloss:  0.0091 - val_logloss:  nan
Epoch 3/3


  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


8s - loss:  0.0030 - logloss:  0.0030 - val_logloss:  nan
Using CIN layer sizes:  (256, 128, 64)
cuda:0
Train on 79986 samples, validate on 8888 samples, 157 steps per epoch
Epoch 1/3


  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


9s - loss:  0.2845 - logloss:  0.2831 - val_logloss:  nan
Epoch 2/3


  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


9s - loss:  0.0090 - logloss:  0.0091 - val_logloss:  nan
Epoch 3/3


  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


9s - loss:  0.0031 - logloss:  0.0032 - val_logloss:  nan
Using CIN layer sizes:  (128, 64)
cuda:0
Train on 79986 samples, validate on 8888 samples, 157 steps per epoch
Epoch 1/3
9s - loss:  0.3174 - logloss:  0.3161 - val_logloss:  0.0236
Epoch 2/3


  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


8s - loss:  0.0108 - logloss:  0.0108 - val_logloss:  nan
Epoch 3/3


  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


8s - loss:  0.0037 - logloss:  0.0037 - val_logloss:  nan
Using CIN layer sizes:  (128, 64, 32)
cuda:0
Train on 79986 samples, validate on 8888 samples, 157 steps per epoch
Epoch 1/3


  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


9s - loss:  0.3174 - logloss:  0.3159 - val_logloss:  nan
Epoch 2/3


  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


9s - loss:  0.0099 - logloss:  0.0101 - val_logloss:  nan
Epoch 3/3


  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


9s - loss:  0.0035 - logloss:  0.0037 - val_logloss:  nan


In [15]:
print("| settings | validation AUC | validation LogLoss | validation NDCG@5 | testing AUC | testing LogLoss | testing NDCG@5 |")
print("|:-- | -- | -- | -- | -- | -- | -- |")
for his in history:
    print("| CIN layer size={} | {:.5f} | {:.5f} | {:.5f} | {:.5f} | {:.5f} | {:.5f} |".format(
        his['cin_layer_size'],
        his['val_auc'],
        his['val_logloss'],
        his['val_ndcg'],
        his['test_auc'],
        his['test_logloss'],
        his['test_ndcg'],
    ))

| settings | validation AUC | validation LogLoss | validation NDCG@5 | testing AUC | testing LogLoss | testing NDCG@5 |
|:-- | -- | -- | -- | -- | -- | -- |
| CIN layer size=(256, 128) | 0.99968 | 0.01142 | 1.00000 | 0.99949 | 0.01310 | 0.99974 |
| CIN layer size=(256, 128, 64) | 0.99975 | 0.01117 | 1.00000 | 0.99957 | 0.01281 | 0.99979 |
| CIN layer size=(128, 64) | 0.99987 | 0.01079 | 1.00000 | 0.99949 | 0.01250 | 1.00000 |
| CIN layer size=(128, 64, 32) | 0.99968 | 0.01097 | 1.00000 | 0.99955 | 0.01277 | 0.99987 |


## Experiment Results

LON-A dataset:  

| settings | validation AUC | validation LogLoss | validation NDCG@5 | testing AUC | testing LogLoss | testing NDCG@5 |
|:-- | -- | -- | -- | -- | -- | -- |
| CIN layer size=(256, 128) | 0.99787 | 0.04936 | 1.00000 | 0.99743 | 0.05961 | 1.00000 |
| **CIN layer size=(256, 128, 64)*** | 0.99854 | 0.04536 | 1.00000 | 0.99776 | 0.05771 | 1.00000 |
| CIN layer size=(128, 64) | 0.99816 | 0.04629 | 1.00000 | 0.99756 | 0.05748 | 1.00000 |
| CIN layer size=(128, 64, 32) | 0.99810 | 0.04706 | 1.00000 | 0.99760 | 0.05471 | 1.00000 |

NYC-R dataset:  

| settings | validation AUC | validation LogLoss | validation NDCG@5 | testing AUC | testing LogLoss | testing NDCG@5 |
|:-- | -- | -- | -- | -- | -- | -- |
| CIN layer size=(256, 128) | 0.99968 | 0.01142 | 1.00000 | 0.99949 | 0.01310 | 0.99974 |
| CIN layer size=(256, 128, 64) | 0.99975 | 0.01117 | 1.00000 | 0.99957 | 0.01281 | 0.99979 |
| **CIN layer size=(128, 64)*** | 0.99987 | 0.01079 | 1.00000 | 0.99949 | 0.01250 | 1.00000 |
| CIN layer size=(128, 64, 32) | 0.99968 | 0.01097 | 1.00000 | 0.99955 | 0.01277 | 0.99987 |