# FNN - FM‐supported Neural Networks

## 1. Set-up
import dependent packages and declare consts

In [1]:
# data handling
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

# evaluation metrics & dataset processing
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

# DeepCTR
from deepctr_torch.inputs import SparseFeat, VarLenSparseFeat, get_feature_names, combined_dnn_input
from deepctr_torch.models import DeepFM, CCPM, WDL, DCN, NFM
from deepctr_torch.models.basemodel import BaseModel
from deepctr_torch.layers import DNN

In [2]:
# setup computing device for pytorch
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [3]:
# consts

# dataset path
DATASET1 = '../../data/extracted/LON-A/London_Attractions_Complete_Review.csv'
DATASET2 = '../../data/extracted/NYC-R/New_York_City_Restaurant_Complete_Review.csv'

OCCURENCE_THRESHOLD = 5

* Variable length columns * 3
  - ustyle, iattribute, itag
* Sparse columns * 8/9
  - uage, ugender, ucity, ucountry, uid_index, ulevel, iid, irating, iprice(NYC only)

In [4]:
# dataset columns: user, item, rating
user_columns = ['uage', 'ugender', 'ucity', 'ucountry', 'uid_index', 'ulevel', 'ustyle']
LON_item_columns = ['iid', 'iattribute', 'irating', 'itag']
NYC_item_columns = ['iid', 'iattribute', 'iprice', 'irating', 'itag']
rating_columns = ['rrate', 'rid']

# dataset columns: sparse (one value) / variable length (multiple values)
LON_sparse_features = ["uage", "ugender", "ucity", "ucountry", "uid_index", "ulevel", 'iid', 'irating']
NYC_sparse_features = ["uage", "ugender", "ucity", "ucountry", "uid_index", "ulevel", 'iid', 'irating', 'iprice']
var_sparse_features = ['ustyle', 'iattribute', 'itag']

## 2. Data Processing

* Retain users/items with at least five ratings only
* Data splitting
  - the latest 20% interactions (by time)
  - Randomly split the remaining data into training (70%) and validation (10%) sets
* Transform the ratings into binary implicit feedback as ground truth, indicating whether the user has interacted with the specific item
* Transform SparseFeat(single value) into categorical data
* Transform VarLenSparseFeat(multiple values) columns into muliple columns of categorical data
* Then embed categorical data

In [7]:
def sort_by_time(df):
    return df.sort_values(by=['rid'], ascending=True)

def filter_by_occurrence(df, column, threshold):
    return df.groupby(column).filter(lambda x: len(x) >= OCCURENCE_THRESHOLD)

def convert_binary(df):
    df.loc[df['rrate'] != "None", 'rrate'] = 1
    df.loc[df['rrate'] == "None", 'rrate'] = 0
    return df

def add_var_column(df, column):
    key2index = {}
    
    def split(x):
        key_ans = x.split(',')
        for key in key_ans:
            if key not in key2index:
                key2index[key] = len(key2index) + 1 # index starts from 1
        return list(map(lambda x: key2index[x], key_ans))
    
    # remove unnecessary characters
    df[column] = df[column].str.replace('[', '').str.replace(']', '').str.replace(', ', ',')
    column_list = list(map(split, df[column].values))
    column_length = np.array(list(map(len, column_list)))
    column_maxlen = max(column_length)
    column_list = pad_sequences(column_list, maxlen=column_maxlen, padding='post', )
    df = pd.concat([df, pd.DataFrame(column_list).add_prefix(str(column))], axis=1)
    
    return df, key2index, column_maxlen

def split_df(df):
    df['rating_cumcounts'] = df.groupby(['uid_index'])['rid'].rank(method='first', ascending=True)
    tmp = df.groupby('uid_index').size().rename('total_counts')
    df = df.join(tmp, on='uid_index', rsuffix='_r')
    train_df = df.loc[df['rating_cumcounts'] < (df['total_counts']*0.8)]
    test_df = df.loc[df['rating_cumcounts'] >= (df['total_counts']*0.8)]
    train_df, validation_df = train_test_split(train_df, test_size=0.1, random_state=1)
    
    return train_df, validation_df, test_df

In [8]:
def dataset(DATASET = 'LON'):
    
    if DATASET == 'LON':
        df = pd.read_csv(DATASET1, sep='\t')[user_columns + LON_item_columns + rating_columns].fillna('NaN')
        sparse_features = LON_sparse_features
    else:
        df = pd.read_csv(DATASET2, sep='\t')[user_columns + NYC_item_columns + rating_columns].fillna('NaN')
        sparse_features = NYC_sparse_features
    
    # sort by time (ascending order)
    df = sort_by_time(df)
    
    # retain users/items with at least five ratings only
    df = filter_by_occurrence(df, 'uid_index', 5)
    df = filter_by_occurrence(df, 'iid', 5)
    
    # convert ratings into binarys
    df = convert_binary(df)
    df = df.reset_index(drop=True)
    
    # transform columns with single values into category. e.g. 'male' => 1, 'female' => 2
    for feat in sparse_features:
        lbe = LabelEncoder()
        df[feat] = lbe.fit_transform(df[feat].astype('str'))

    # add variable length categorical columns to dataframe
    column_dict_list, column_maxlen_list = [], []
    for column in var_sparse_features:
        df, column_dict, column_maxlen = add_var_column(df, column)
        column_dict_list.append(column_dict)
        column_maxlen_list.append(column_maxlen)
        
    
    fixlen_feature_columns = [SparseFeat(feat, df[feat].nunique(), embedding_dim=4)
                              for feat in LON_sparse_features]
    
    # note: vocabulary need to add 1: 0 cannot be used
    varlen_feature_columns = [VarLenSparseFeat(SparseFeat(feat, vocabulary_size=len(column_dict_list[i]) + 1,
                                  embedding_dim=4), maxlen=column_maxlen_list[i], combiner='mean',) 
                                  for i, feat in enumerate(var_sparse_features)]  
    linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
    #feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
    
    train_df, val_df, test_df = split_df(df)
    
    
    # train set
    train_model_input = {name: train_df[name] for name in sparse_features} 
    for i, feat in enumerate(var_sparse_features):
        train_model_input[feat] = train_df.filter(regex='^'+feat+'.+',axis=1).values
    
    # validation set
    val_model_input = {name: val_df[name] for name in sparse_features} 
    for i, feat in enumerate(var_sparse_features):
        val_model_input[feat] = val_df.filter(regex='^'+feat+'.+',axis=1).values
    
    # test set
    test_model_input = {name: test_df[name] for name in sparse_features}
    for i, feat in enumerate(var_sparse_features):
        test_model_input[feat] = test_df.filter(regex='^'+feat+'.+',axis=1).values
    
    train_y, val_y, test_y = train_df['rrate'].values, val_df['rrate'].values, test_df['rrate'].values
        
    return train_model_input, train_y, val_model_input, val_y, test_model_input, test_y, linear_feature_columns, dnn_feature_columns

In [30]:
# data = dataset('LON')
data = dataset('NYC')

train_model_input, train_y, val_model_input, val_y, test_model_input, test_y = data[:6]
linear_feature_columns, dnn_feature_columns = data[6:8]

train_y.shape, val_y.shape, test_y.shape

((79986,), (8888,), (36017,))

## 3. Define FNN Model

to tuning model parameters, check [DeepCTR-Torch](https://github.com/shenweichen/DeepCTR-Torch/tree/master/deepctr_torch/models)

Parameters:
- dnn_hidden_units - list, DNN layer architecture, defaults to (128, 128)
- l2_reg_linear - regularization factor for linear, defaults to 0.00001
- l2_reg_embedding - regularization factor for embedding, defaults to 0.00001
- l2_reg_dnn - regularization factor for embedding, defaults to 0

In [17]:
class FNN(BaseModel):
    
    # here we use default values from source code as default value
    def __init__(self,linear_feature_columns, dnn_feature_columns,
                 dnn_hidden_units=(128, 128), l2_reg_linear=0.00001,
                 l2_reg_embedding=0.00001, l2_reg_dnn=0, init_std=0.0001, seed=1024, dnn_dropout=0,
                 dnn_activation='relu', dnn_use_bn=False, task='binary', device='cpu'):

        super(FNN, self).__init__(linear_feature_columns=linear_feature_columns,
                                  dnn_feature_columns=dnn_feature_columns,
                                  dnn_hidden_units=dnn_hidden_units,
                                  l2_reg_embedding=l2_reg_embedding, l2_reg_dnn=l2_reg_dnn, init_std=init_std,
                                  seed=seed,dnn_dropout=dnn_dropout, dnn_activation=dnn_activation,
                                  task=task, device=device)
        
        self.dnn_hidden_units = dnn_hidden_units
        self.dnn = DNN(self.compute_input_dim(dnn_feature_columns), dnn_hidden_units,
                       activation=dnn_activation, use_bn=dnn_use_bn, l2_reg=l2_reg_dnn, dropout_rate=dnn_dropout,
                       init_std=init_std, device=device)

        dnn_linear_in_feature = dnn_hidden_units[-1]

        self.dnn_linear = nn.Linear(dnn_linear_in_feature, 1, bias=False).to(device)
        self.add_regularization_loss(
            filter(lambda x: 'weight' in x[0] and 'bn' not in x[0], self.dnn.named_parameters()), l2_reg_dnn)
        self.add_regularization_loss(self.dnn_linear.weight, l2_reg_linear)
        self.to(device)

    def forward(self, X):
        logit = self.linear_model(X)
        sparse_embedding_list, dense_value_list = self.input_from_feature_columns(X, self.dnn_feature_columns,
                                                                                  self.embedding_dict)
        dnn_input = combined_dnn_input(sparse_embedding_list, dense_value_list)

        deep_out = self.dnn(dnn_input)
        logit += self.dnn_linear(deep_out)
    
        y_pred = self.out(logit)
        return y_pred

## 4. Training

In [11]:
model = FNN(linear_feature_columns, dnn_feature_columns, task='binary', device=device)
model.compile("adam", "binary_crossentropy", metrics=['logloss'], )
history = model.fit(train_model_input, train_y, batch_size=512, epochs=3, verbose=2,
                    validation_data=(val_model_input, val_y), use_double=True)

cuda:0
Train on 87440 samples, validate on 9716 samples, 171 steps per epoch
Epoch 1/3
8s - loss:  0.3034 - logloss:  0.3031 - val_logloss:  0.0844
Epoch 2/3
9s - loss:  0.0466 - logloss:  0.0466 - val_logloss:  0.0374
Epoch 3/3
8s - loss:  0.0241 - logloss:  0.0241 - val_logloss:  0.0388


## 5. Prediction & Evaluation

In [26]:
def evaluate_auc(z, y):
    return metrics.roc_auc_score(y, z)

# assume parameters z & y are ndarray
def evaluate_logloss(z, y):
    zf = z.flatten()
    zz = np.ones((zf.shape[0], 2))
    zz[:, 0] -= zf
    zz[:, 1] = zf
    return metrics.log_loss(y, zz)

# assume parameters z & y are ndarray
def evaluate_ndcg(z, y):
    return metrics.ndcg_score(np.expand_dims(y, axis=0), z.flatten().reshape((1, -1)), k=5)

In [28]:
preds = model.predict(test_model_input)

In [44]:
print('Testing AUC: ', evaluate_auc(preds, test_y))
print('Testing LogLoss: ', evaluate_logloss(preds.astype('float64'), test_y))
print("Testing NDCG@5: ", evaluate_ndcg(preds, test_y))

Testing AUC:  0.9970333252499437
Testing LogLoss:  0.05606685813109742
Testing NDCG@5:  0.9999999999999999


## Experiments

In [23]:
# consts
DNN_UNITS = (128, 128)
REG_LINEAR = 0.00001
REG_EMBEDDING = 0.00001
REG_DNN = 0
EPOCH = 3

In [27]:
def train_units(unit_list):
    
    history = []
    for hidden_unit in unit_list:
        
        print("Using archtect ", repr(hidden_unit))
        model = FNN(linear_feature_columns, dnn_feature_columns,
                    dnn_hidden_units=hidden_unit, l2_reg_linear=REG_LINEAR,
                    l2_reg_embedding=REG_EMBEDDING, l2_reg_dnn=REG_DNN, task='binary', device=device)
        model.compile("adam", "binary_crossentropy", metrics=['logloss'], )
        model.fit(train_model_input, train_y, batch_size=512, epochs=EPOCH, verbose=2,
                            validation_data=(val_model_input, val_y), use_double=True)
        
        test_preds = model.predict(test_model_input)
        val_preds = model.predict(val_model_input)
        
        history.append({
            'units': repr(hidden_unit),
            'val_auc': evaluate_auc(val_preds, val_y),
            'test_auc': evaluate_auc(test_preds, test_y),
            'val_logloss': evaluate_logloss(val_preds.astype('float64'), val_y),
            'test_logloss': evaluate_logloss(test_preds.astype('float64'), test_y),
            'val_ndcg': evaluate_ndcg(val_preds, val_y),
            'test_ndcg': evaluate_ndcg(test_preds, test_y)
        })
        
    return history

In [31]:
history = train_units([
    (32,32),
    (32,32,32),
    (64,64),
    (64,64,64),
    (96,96),
    (96,96,96),
    (128,128),
    (128,128,128)
])

Using archtect  (32, 32)
cuda:0
Train on 79986 samples, validate on 8888 samples, 157 steps per epoch
Epoch 1/3
8s - loss:  0.5363 - logloss:  0.5352 - val_logloss:  0.3436
Epoch 2/3
8s - loss:  0.1449 - logloss:  0.1444 - val_logloss:  0.0591
Epoch 3/3
9s - loss:  0.0310 - logloss:  0.0309 - val_logloss:  0.0268
Using archtect  (32, 32, 32)
cuda:0
Train on 79986 samples, validate on 8888 samples, 157 steps per epoch
Epoch 1/3
9s - loss:  0.5172 - logloss:  0.5160 - val_logloss:  0.2542
Epoch 2/3
9s - loss:  0.1134 - logloss:  0.1130 - val_logloss:  0.0569
Epoch 3/3


  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


9s - loss:  0.0286 - logloss:  0.0285 - val_logloss:  nan
Using archtect  (64, 64)
cuda:0
Train on 79986 samples, validate on 8888 samples, 157 steps per epoch
Epoch 1/3
9s - loss:  0.4849 - logloss:  0.4836 - val_logloss:  0.2133
Epoch 2/3
9s - loss:  0.0890 - logloss:  0.0887 - val_logloss:  0.0423
Epoch 3/3
9s - loss:  0.0220 - logloss:  0.0220 - val_logloss:  0.0205
Using archtect  (64, 64, 64)
cuda:0
Train on 79986 samples, validate on 8888 samples, 157 steps per epoch
Epoch 1/3
9s - loss:  0.4780 - logloss:  0.4765 - val_logloss:  0.1786
Epoch 2/3


  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


9s - loss:  0.0810 - logloss:  0.0809 - val_logloss:  nan
Epoch 3/3


  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


10s - loss:  0.0188 - logloss:  0.0188 - val_logloss:  nan
Using archtect  (96, 96)
cuda:0
Train on 79986 samples, validate on 8888 samples, 157 steps per epoch
Epoch 1/3
8s - loss:  0.4572 - logloss:  0.4560 - val_logloss:  0.1600
Epoch 2/3
8s - loss:  0.0608 - logloss:  0.0606 - val_logloss:  0.0279
Epoch 3/3
8s - loss:  0.0132 - logloss:  0.0131 - val_logloss:  0.0155
Using archtect  (96, 96, 96)
cuda:0
Train on 79986 samples, validate on 8888 samples, 157 steps per epoch
Epoch 1/3
8s - loss:  0.4215 - logloss:  0.4198 - val_logloss:  0.1193
Epoch 2/3


  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


10s - loss:  0.0488 - logloss:  0.0486 - val_logloss:  nan
Epoch 3/3


  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


9s - loss:  0.0157 - logloss:  0.0156 - val_logloss:  nan
Using archtect  (128, 128)
cuda:0
Train on 79986 samples, validate on 8888 samples, 157 steps per epoch
Epoch 1/3
8s - loss:  0.4421 - logloss:  0.4407 - val_logloss:  0.1312
Epoch 2/3
7s - loss:  0.0461 - logloss:  0.0460 - val_logloss:  0.0224
Epoch 3/3


  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


9s - loss:  0.0097 - logloss:  0.0096 - val_logloss:  nan
Using archtect  (128, 128, 128)
cuda:0
Train on 79986 samples, validate on 8888 samples, 157 steps per epoch
Epoch 1/3


  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


8s - loss:  0.4016 - logloss:  0.4002 - val_logloss:  nan
Epoch 2/3


  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


9s - loss:  0.0440 - logloss:  0.0438 - val_logloss:  nan
Epoch 3/3


  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


8s - loss:  0.0145 - logloss:  0.0145 - val_logloss:  nan


In [33]:
print("| settings | validation AUC | validation LogLoss | validation NDCG@5 | testing AUC | testing LogLoss | testing NDCG@5 |")
print("|:-- | -- | -- | -- | -- | -- | -- |")
for his in history:
    print("| DNN architect={} | {:.5f} | {:.5f} | {:.5f} | {:.5f} | {:.5f} | {:.5f} |".format(
        his['units'],
        his['val_auc'],
        his['val_logloss'],
        his['val_ndcg'],
        his['test_auc'],
        his['test_logloss'],
        his['test_ndcg'],
    ))

| settings | validation AUC | validation LogLoss | validation NDCG@5 | testing AUC | testing LogLoss | testing NDCG@5 |
|:-- | -- | -- | -- | -- | -- | -- |
| DNN architect=(32, 32) | 0.99962 | 0.02681 | 1.00000 | 0.99942 | 0.02774 | 1.00000 |
| DNN architect=(32, 32, 32) | 0.99961 | 0.02188 | 1.00000 | 0.99949 | 0.02285 | 1.00000 |
| DNN architect=(64, 64) | 0.99968 | 0.02048 | 1.00000 | 0.99949 | 0.02152 | 1.00000 |
| DNN architect=(64, 64, 64) | 0.99969 | 0.01696 | 1.00000 | 0.99952 | 0.01756 | 1.00000 |
| DNN architect=(96, 96) | 0.99975 | 0.01547 | 1.00000 | 0.99946 | 0.01713 | 1.00000 |
| DNN architect=(96, 96, 96) | 0.99980 | 0.01763 | 1.00000 | 0.99953 | 0.02030 | 1.00000 |
| DNN architect=(128, 128) | 0.99976 | 0.01351 | 1.00000 | 0.99952 | 0.01477 | 1.00000 |
| DNN architect=(128, 128, 128) | 0.99968 | 0.01804 | 1.00000 | 0.99946 | 0.02156 | 1.00000 |


## Experiment Results

LON-A dataset:  

| settings | validation AUC | validation LogLoss | validation NDCG@5 | testing AUC | testing LogLoss | testing NDCG@5 |
|:-- | -- | -- | -- | -- | -- | -- |
| DNN architect=(32, 32) | 0.99775 | 0.04568 | 1.00000 | 0.99647 | 0.05671 | 1.00000 |
| DNN architect=(32, 32, 32) | 0.99790 | 0.04657 | 1.00000 | 0.99639 | 0.05830 | 1.00000 |
| DNN architect=(64, 64) | 0.99807 | 0.04089 | 1.00000 | 0.99676 | 0.05516 | 1.00000 |
| DNN architect=(64, 64, 64) | 0.99813 | 0.04229 | 1.00000 | 0.99669 | 0.05547 | 1.00000 |
| DNN architect=(96, 96) | 0.99819 | 0.03794 | 1.00000 | 0.99690 | 0.05491 | 1.00000 |
| **DNN architect=(96, 96, 96)*** | 0.99825 | 0.04304 | 1.00000 | 0.99714 | 0.06077 | 1.00000 |
| DNN architect=(128, 128) | 0.99822 | 0.04063 | 1.00000 | 0.99704 | 0.05880 | 1.00000 |
| DNN architect=(128, 128, 128) | 0.99817 | 0.04251 | 1.00000 | 0.99718 | 0.05730 | 1.00000 |

NYC-R dataset:  

| settings | validation AUC | validation LogLoss | validation NDCG@5 | testing AUC | testing LogLoss | testing NDCG@5 |
|:-- | -- | -- | -- | -- | -- | -- |
| DNN architect=(32, 32) | 0.99962 | 0.02681 | 1.00000 | 0.99942 | 0.02774 | 1.00000 |
| DNN architect=(32, 32, 32) | 0.99961 | 0.02188 | 1.00000 | 0.99949 | 0.02285 | 1.00000 |
| DNN architect=(64, 64) | 0.99968 | 0.02048 | 1.00000 | 0.99949 | 0.02152 | 1.00000 |
| DNN architect=(64, 64, 64) | 0.99969 | 0.01696 | 1.00000 | 0.99952 | 0.01756 | 1.00000 |
| DNN architect=(96, 96) | 0.99975 | 0.01547 | 1.00000 | 0.99946 | 0.01713 | 1.00000 |
| **DNN architect=(96, 96, 96)*** | 0.99980 | 0.01763 | 1.00000 | 0.99953 | 0.02030 | 1.00000 |
| DNN architect=(128, 128) | 0.99976 | 0.01351 | 1.00000 | 0.99952 | 0.01477 | 1.00000 |
| DNN architect=(128, 128, 128) | 0.99968 | 0.01804 | 1.00000 | 0.99946 | 0.02156 | 1.00000 |