## NN Model

#### 1. Set-up
Import the required packages and declare constants.

In [1]:
# data handling
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

# evaluation metrics & dataset processing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss, ndcg_score
from sklearn.preprocessing import LabelEncoder
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

# DeepCTR
from deepctr_torch.inputs import SparseFeat, VarLenSparseFeat, get_feature_names, combined_dnn_input
from deepctr_torch.models import DeepFM, CCPM, WDL, DCN, NFM
from deepctr_torch.models.basemodel import BaseModel
from deepctr_torch.layers import DNN

In [2]:
# Pick the compute device for PyTorch: the first CUDA GPU when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

In [3]:
# --- dataset locations (paths are relative to the notebook) ---
DATASET1 = '../../data/extracted/LON-A/London_Attractions_Complete_Review.csv'
DATASET2 = '../../data/extracted/NYC-R/New_York_City_Restaurant_Complete_Review.csv'

# --- raw columns kept from each file, grouped by user / item / rating ---
user_columns = [
    'uage', 'ugender', 'ucity', 'ucountry',
    'uid_index', 'ulevel', 'ustyle',
]
LON_item_columns = ['iid', 'iattribute', 'irating', 'itag']
NYC_item_columns = ['iid', 'iattribute', 'iprice', 'irating', 'itag']
rating_columns = ['rrate', 'rid']

# --- single-valued categorical features; NYC has one extra feature, 'iprice' ---
LON_sparse_features = [
    'uage', 'ugender', 'ucity', 'ucountry',
    'uid_index', 'ulevel', 'iid', 'irating',
]
NYC_sparse_features = LON_sparse_features + ['iprice']

# --- multi-valued (variable-length) categorical features ---
var_sparse_features = ['ustyle', 'iattribute', 'itag']

In [4]:
def sort_by_time(df):
    """Return `df` with its rows ordered by ascending 'rid'.

    The 'rid' column is the only ordering key; presumably it increases with
    review time (TODO confirm against the data source). The original index
    labels are kept, and the input frame is not modified.
    """
    chronological = df.sort_values('rid')
    return chronological

def filter_by_occurrence(df, column, threshold):
    """Keep only the rows whose `column` value occurs at least `threshold` times.

    Args:
        df: input DataFrame (not modified).
        column: grouping key; normally a single column label.
        threshold: minimum number of rows a group needs in order to be kept.

    Returns:
        A new DataFrame holding the surviving rows in their original order,
        with their original index labels.
    """
    try:
        is_single_label = column in df.columns
    except TypeError:
        # Unhashable key (e.g. a list of labels).
        is_single_label = False

    if not is_single_label:
        # Generic grouping keys keep the original per-group callback path.
        return df.groupby(column).filter(lambda group: len(group) >= threshold)

    # Vectorised path: broadcast each group's size back onto its rows instead
    # of invoking a Python callback once per group.
    group_sizes = df.groupby(column)[column].transform('size')
    # Positional boolean mask, so no index alignment is involved.
    return df[(group_sizes >= threshold).to_numpy()]

def add_var_column(df, column):
    """Integer-encode a list-like string column and append it as padded columns.

    Each cell of `df[column]` is a string such as "[a, b, c]". The brackets are
    stripped, the string is split on commas, and every distinct token is given
    an integer id starting at 1 (0 is reserved for padding). The encoded
    sequences are right-padded with 0 to the longest sequence and appended as
    int32 columns named ``<column>0``, ``<column>1``, ...

    Args:
        df: input DataFrame; it is not modified.
        column: name of the string column to encode.

    Returns:
        (new_df, key2index, column_maxlen) where
        - new_df is `df` with `column` replaced by its cleaned text, plus the
          padded id columns,
        - key2index maps each token to its integer id,
        - column_maxlen is the length of the longest sequence (0 if `df` is empty).
    """
    key2index = {}

    def encode(raw_value):
        tokens = raw_value.split(',')
        for token in tokens:
            if token not in key2index:
                # 0 is the padding value for sequence inputs, so real ids start at 1.
                key2index[token] = len(key2index) + 1
        return [key2index[token] for token in tokens]

    # Strip list punctuation. regex=False forces literal matching: '[' on its
    # own is not a valid regular expression, and pandas' default for `regex`
    # differs between versions.
    cleaned = (df[column]
               .str.replace('[', '', regex=False)
               .str.replace(']', '', regex=False)
               .str.replace(', ', ',', regex=False))

    encoded = [encode(value) for value in cleaned.values]
    column_maxlen = max((len(sequence) for sequence in encoded), default=0)

    # Post-pad with zeros using NumPy, so this helper does not need TensorFlow.
    padded = np.zeros((len(encoded), column_maxlen), dtype='int32')
    for row, sequence in enumerate(encoded):
        padded[row, :len(sequence)] = sequence

    # Reuse df's index so the concat stays row-aligned even without a default RangeIndex.
    padded_df = pd.DataFrame(padded, index=df.index).add_prefix(str(column))

    result = df.copy()
    result[column] = cleaned
    result = pd.concat([result, padded_df], axis=1)

    return result, key2index, column_maxlen

def split_df(df):
    """Split interactions into train / validation / test frames.

    For every user ('uid_index') the rows are ranked by 'rid' (rank 1 is the
    smallest 'rid'). Rows whose rank is below 80% of that user's row count go
    to the training pool and the remaining rows form the test set. The
    training pool is then split at random (10% validation, random_state=1).

    Note: a 'rating_cumcounts' column is written onto the frame passed in.
    The returned frames also carry a 'total_counts' column.

    Returns:
        (train_df, validation_df, test_df)
    """
    # Position of each row inside its user's history, ordered by 'rid'.
    per_user_rid = df.groupby(['uid_index'])['rid']
    df['rating_cumcounts'] = per_user_rid.rank(method='first', ascending=True)

    # Attach the number of rows each user has.
    user_totals = df.groupby('uid_index').size().rename('total_counts')
    df = df.join(user_totals, on='uid_index', rsuffix='_r')

    cutoff = df['total_counts'] * 0.8
    in_train_pool = df['rating_cumcounts'] < cutoff
    in_test = df['rating_cumcounts'] >= cutoff

    train_pool = df.loc[in_train_pool]
    test_df = df.loc[in_test]
    train_df, validation_df = train_test_split(train_pool, test_size=0.1, random_state=1)

    return train_df, validation_df, test_df

def preprocessing(df):
    """Sort, filter and binarise the raw review frame.

    Steps, in order:
      1. sort rows with `sort_by_time`;
      2. keep users ('uid_index') with at least 5 rows;
      3. keep items ('iid') with at least 5 rows among the remaining rows;
      4. replace 'rrate' by a 0/1 label: 0 where the value is the string
         'None', 1 otherwise;
      5. reset the index to 0..n-1.

    Returns:
        The processed DataFrame.
    """
    ordered = sort_by_time(df)
    frequent_users = filter_by_occurrence(ordered, 'uid_index', 5)
    kept = filter_by_occurrence(frequent_users, 'iid', 5)

    def to_label(value):
        # Only the literal string 'None' maps to the negative class.
        if value != 'None':
            return 1
        return 0

    kept['rrate'] = kept['rrate'].apply(to_label)
    return kept.reset_index(drop=True)

def _build_model_input(split_df_, sparse_features, var_features):
    """Build the DeepCTR input dict (feature name -> values) for one data split."""
    model_input = {name: split_df_[name] for name in sparse_features}
    for feat in var_features:
        # Padded id columns are named '<feat>0', '<feat>1', ... by add_var_column.
        model_input[feat] = split_df_.filter(regex='^'+feat+'.+', axis=1).values
    return model_input


def get_data(DATASET = 'LON'):
    """Load one dataset and prepare model inputs, labels and feature columns.

    Args:
        DATASET: 'LON' (reads DATASET1) or 'NYC' (reads DATASET2).

    Returns:
        An 8-tuple:
        (train_model_input, train_y, val_model_input, val_y,
         test_model_input, test_y, linear_feature_columns, dnn_feature_columns)
        where each ``*_model_input`` is a dict mapping feature name to values and
        each ``*_y`` is the array of binarised 'rrate' labels for that split.

    Raises:
        AssertionError: if `DATASET` is not 'LON' or 'NYC'.
    """
    assert DATASET in ['LON', 'NYC'], "DATASET must be 'LON' or 'NYC'"

    if DATASET == 'LON':
        path, item_columns, sparse_features = DATASET1, LON_item_columns, LON_sparse_features
    else:
        path, item_columns, sparse_features = DATASET2, NYC_item_columns, NYC_sparse_features

    # Files are tab-separated; missing values become the literal string 'NaN'.
    df = pd.read_csv(path, sep='\t')[user_columns + item_columns + rating_columns].fillna('NaN')

    # sort, filter, binarize
    df = preprocessing(df)

    # Label-encode the single-valued categorical features.
    for feat in sparse_features:
        lbe = LabelEncoder()
        df[feat] = lbe.fit_transform(df[feat].astype('str'))

    # Encode and pad the variable-length categorical columns.
    column_dict_list, column_maxlen_list = [], []
    for column in var_sparse_features:
        df, column_dict, column_maxlen = add_var_column(df, column)
        column_dict_list.append(column_dict)
        column_maxlen_list.append(column_maxlen)

    # Use the dataset-specific feature list here: with the LON list hard-coded,
    # NYC's 'iprice' was fed to the model without a feature column.
    fixlen_feature_columns = [SparseFeat(feat, df[feat].nunique(), embedding_dim=4)
                              for feat in sparse_features]
    # Value 0 is the padding id for sequence features, hence the +1 on vocabulary size.
    varlen_feature_columns = [
        VarLenSparseFeat(SparseFeat(feat, vocabulary_size=len(column_dict) + 1, embedding_dim=4),
                         maxlen=column_maxlen, combiner='mean')
        for feat, column_dict, column_maxlen
        in zip(var_sparse_features, column_dict_list, column_maxlen_list)
    ]
    linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns

    train_df, val_df, test_df = split_df(df)

    # Generate the input dict for each split.
    train_model_input = _build_model_input(train_df, sparse_features, var_sparse_features)
    val_model_input = _build_model_input(val_df, sparse_features, var_sparse_features)
    test_model_input = _build_model_input(test_df, sparse_features, var_sparse_features)

    train_y, val_y, test_y = train_df['rrate'].values, val_df['rrate'].values, test_df['rrate'].values

    return (train_model_input, train_y, val_model_input, val_y,
            test_model_input, test_y, linear_feature_columns, dnn_feature_columns)

### Data
* Variable length columns * 3
  - ustyle, iattribute, itag
* Sparse columns * 8 (LON) or 9 (NYC)
  - uage, ugender, ucity, ucountry, uid_index, ulevel, iid, irating, plus iprice (NYC only, i.e. DATASET2)

### DeepCTR uses a dictionary as the model input format
* There are three types of feature column in DeepCTR:
  - SparseFeat : for simple categorical data
  - VarLenSparseFeat : for variable length categorical data (e.g. ustyle)
  - DenseFeat : for numerical data
* SparseFeat & VarLenSparseFeat will go through embedding layer

In [5]:
# Build the London ('LON') splits together with the DeepCTR feature-column definitions.
data = get_data('LON')

# get_data returns the six split arrays/dicts first, then the two feature-column lists.
(train_model_input, train_y,
 val_model_input, val_y,
 test_model_input, test_y,
 linear_feature_columns, dnn_feature_columns) = data

# Label-array shapes of the train / validation / test splits.
train_y.shape, val_y.shape, test_y.shape

((87440,), (9716,), (39339,))

### DeepCTR model detail parameters:
https://github.com/shenweichen/DeepCTR-Torch/tree/master/deepctr_torch/models

### FFN

In [6]:
class FFN(BaseModel):
    """Feed-forward network model built on DeepCTR-Torch's ``BaseModel``.

    The output logit is the sum of two parts:
      * the logit of ``self.linear_model`` (provided by the base class), and
      * a DNN tower over the combined feature embeddings, projected to a single
        value by ``self.dnn_linear``.
    The summed logit is passed through ``self.out`` (provided by the base class).

    Args:
        linear_feature_columns: feature columns used by the linear part.
        dnn_feature_columns: feature columns used by the DNN part.
        dnn_hidden_units: sizes of the DNN hidden layers; the last entry is the
            input width of the final projection.
        l2_reg_linear: L2 strength forwarded to the base class and also applied
            to the final projection weights.
        l2_reg_embedding: L2 strength forwarded to the base class.
        l2_reg_dnn: L2 strength for the DNN weights (batch-norm weights excluded).
        init_std: initialisation std forwarded to the base class and the DNN.
        seed: random seed forwarded to the base class.
        dnn_dropout: dropout rate of the DNN.
        dnn_activation: activation name used in the DNN.
        dnn_use_bn: whether the DNN uses batch normalisation.
        task: task type forwarded to the base class (e.g. 'binary').
        device: device the sub-modules are moved to.
    """

    def __init__(self, linear_feature_columns, dnn_feature_columns,
                 dnn_hidden_units=(128, 128), l2_reg_linear=0.00001,
                 l2_reg_embedding=0.00001, l2_reg_dnn=0, init_std=0.0001, seed=1024, dnn_dropout=0,
                 dnn_activation='relu', dnn_use_bn=False, task='binary', device='cpu'):

        # l2_reg_linear is forwarded as well; previously it was dropped here, so
        # the base class always fell back to its own default for the linear part.
        super(FFN, self).__init__(linear_feature_columns=linear_feature_columns,
                                  dnn_feature_columns=dnn_feature_columns,
                                  dnn_hidden_units=dnn_hidden_units,
                                  l2_reg_linear=l2_reg_linear,
                                  l2_reg_embedding=l2_reg_embedding, l2_reg_dnn=l2_reg_dnn, init_std=init_std,
                                  seed=seed, dnn_dropout=dnn_dropout, dnn_activation=dnn_activation,
                                  task=task, device=device)
        self.dnn_hidden_units = dnn_hidden_units
        self.dnn = DNN(self.compute_input_dim(dnn_feature_columns), dnn_hidden_units,
                       activation=dnn_activation, use_bn=dnn_use_bn, l2_reg=l2_reg_dnn, dropout_rate=dnn_dropout,
                       init_std=init_std, device=device)

        # The final projection maps the last hidden layer to a single logit.
        dnn_linear_in_feature = dnn_hidden_units[-1]

        self.dnn_linear = nn.Linear(dnn_linear_in_feature, 1, bias=False).to(
            device)
        # Regularise the DNN weight matrices, skipping parameters whose name contains 'bn'.
        self.add_regularization_loss(
            filter(lambda x: 'weight' in x[0] and 'bn' not in x[0], self.dnn.named_parameters()), l2_reg_dnn)
        self.add_regularization_loss(self.dnn_linear.weight, l2_reg_linear)
        self.to(device)

    def forward(self, X):
        """Return the model prediction for the input batch `X`."""
        # Linear part.
        logit = self.linear_model(X)

        # Deep part: look up embeddings / dense values and flatten them into one DNN input.
        sparse_embedding_list, dense_value_list = self.input_from_feature_columns(X, self.dnn_feature_columns,
                                                                                  self.embedding_dict)
        dnn_input = combined_dnn_input(sparse_embedding_list, dense_value_list)

        deep_out = self.dnn(dnn_input)
        logit += self.dnn_linear(deep_out)

        y_pred = self.out(logit)
        return y_pred

In [7]:
# Train the FFN model for 3 epochs (batch size 512), monitoring log-loss on the validation split.
model = FFN(linear_feature_columns, dnn_feature_columns, task='binary', device=device)
model.compile("adam", "binary_crossentropy", metrics=['logloss'])
history = model.fit(train_model_input, train_y, batch_size=512, epochs=3, verbose=2,
                    validation_data=(val_model_input, val_y), use_double=True)

# Score the test split; predictions are cast to float64 before computing log-loss.
preds = model.predict(test_model_input)
print('Testing AUC scores: ', roc_auc_score(test_y, preds))
print('Testing log_loss scores: ', log_loss(test_y, preds.astype('float64')))

cuda:0
Train on 87440 samples, validate on 9716 samples, 171 steps per epoch
Epoch 1/3
9s - loss:  0.3145 - logloss:  0.3142 - val_logloss:  0.0897
Epoch 2/3
8s - loss:  0.0476 - logloss:  0.0476 - val_logloss:  0.0376
Epoch 3/3
8s - loss:  0.0235 - logloss:  0.0235 - val_logloss:  0.0393
Testing AUC scores:  0.9970451299644998
Testing los_loss scores:  0.05842061716549185


### CCPM

In [8]:
# Train the CCPM model for 3 epochs (batch size 512), monitoring log-loss on the validation split.
model = CCPM(linear_feature_columns, dnn_feature_columns, task='binary', device=device)
model.compile("adam", "binary_crossentropy", metrics=['logloss'])
history = model.fit(train_model_input, train_y, batch_size=512, epochs=3, verbose=2,
                    validation_data=(val_model_input, val_y), use_double=True)

# Score the test split; predictions are cast to float64 before computing log-loss.
preds = model.predict(test_model_input)
print('Testing AUC scores: ', roc_auc_score(test_y, preds))
print('Testing log_loss scores: ', log_loss(test_y, preds.astype('float64')))

cuda:0
Train on 87440 samples, validate on 9716 samples, 171 steps per epoch
Epoch 1/3
9s - loss:  0.3986 - logloss:  0.3982 - val_logloss:  0.0886
Epoch 2/3
9s - loss:  0.0340 - logloss:  0.0339 - val_logloss:  0.0358
Epoch 3/3
9s - loss:  0.0186 - logloss:  0.0186 - val_logloss:  0.0392
Testing AUC scores:  0.9977941283900754
Testing los_loss scores:  0.047862328554676946


### WD (Wide & Deep)

In [9]:
# Train the Wide & Deep (WDL) model for 3 epochs (batch size 512), monitoring log-loss on the validation split.
model = WDL(linear_feature_columns, dnn_feature_columns, task='binary', device=device)
model.compile("adam", "binary_crossentropy", metrics=['logloss'])
history = model.fit(train_model_input, train_y, batch_size=512, epochs=3, verbose=2,
                    validation_data=(val_model_input, val_y), use_double=True)

# Score the test split; predictions are cast to float64 before computing log-loss.
preds = model.predict(test_model_input)
print('Testing AUC scores: ', roc_auc_score(test_y, preds))
print('Testing log_loss scores: ', log_loss(test_y, preds.astype('float64')))

cuda:0
Train on 87440 samples, validate on 9716 samples, 171 steps per epoch
Epoch 1/3
8s - loss:  0.2900 - logloss:  0.2897 - val_logloss:  0.0769
Epoch 2/3
8s - loss:  0.0422 - logloss:  0.0422 - val_logloss:  0.0375
Epoch 3/3
8s - loss:  0.0202 - logloss:  0.0202 - val_logloss:  0.0444
Testing AUC scores:  0.9970358596415562
Testing los_loss scores:  0.06604857508723155


### DCN (Deep & Cross)

In [10]:
# Train the Deep & Cross (DCN) model for 3 epochs (batch size 512), monitoring log-loss on the validation split.
model = DCN(linear_feature_columns, dnn_feature_columns, task='binary', device=device)
model.compile("adam", "binary_crossentropy", metrics=['logloss'])
history = model.fit(train_model_input, train_y, batch_size=512, epochs=3, verbose=2,
                    validation_data=(val_model_input, val_y), use_double=True)

# Score the test split; predictions are cast to float64 before computing log-loss.
preds = model.predict(test_model_input)
print('Testing AUC scores: ', roc_auc_score(test_y, preds))
print('Testing log_loss scores: ', log_loss(test_y, preds.astype('float64')))

cuda:0
Train on 87440 samples, validate on 9716 samples, 171 steps per epoch
Epoch 1/3
10s - loss:  0.2972 - logloss:  0.2969 - val_logloss:  0.0551
Epoch 2/3
10s - loss:  0.0344 - logloss:  0.0344 - val_logloss:  0.0346
Epoch 3/3
9s - loss:  0.0228 - logloss:  0.0228 - val_logloss:  0.0356
Testing AUC scores:  0.9978341807560748
Testing los_loss scores:  0.0499696159088416


### NFM (Neural Factorization Machine)

In [11]:
# Train the NFM model for 3 epochs (batch size 512), monitoring log-loss on the validation split.
model = NFM(linear_feature_columns, dnn_feature_columns, task='binary', device=device)
model.compile("adam", "binary_crossentropy", metrics=['logloss'])
history = model.fit(train_model_input, train_y, batch_size=512, epochs=3, verbose=2,
                    validation_data=(val_model_input, val_y), use_double=True)

# Score the test split; predictions are cast to float64 before computing log-loss.
preds = model.predict(test_model_input)
print('Testing AUC scores: ', roc_auc_score(test_y, preds))
print('Testing log_loss scores: ', log_loss(test_y, preds.astype('float64')))

cuda:0
Train on 87440 samples, validate on 9716 samples, 171 steps per epoch
Epoch 1/3
8s - loss:  0.3556 - logloss:  0.3553 - val_logloss:  0.0982
Epoch 2/3
8s - loss:  0.0457 - logloss:  0.0457 - val_logloss:  0.0403
Epoch 3/3
8s - loss:  0.0225 - logloss:  0.0225 - val_logloss:  0.0399
Testing AUC scores:  0.9970965564841615
Testing los_loss scores:  0.05822759422259144


### DeepFM

In [12]:
# Train the DeepFM model for 3 epochs (batch size 512), monitoring log-loss on the validation split.
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary', device=device)
model.compile("adam", "binary_crossentropy", metrics=['logloss'])
history = model.fit(train_model_input, train_y, batch_size=512, epochs=3, verbose=2,
                    validation_data=(val_model_input, val_y), use_double=True)

# Score the test split; predictions are cast to float64 before computing log-loss.
preds = model.predict(test_model_input)
print('Testing AUC scores: ', roc_auc_score(test_y, preds))
print('Testing log_loss scores: ', log_loss(test_y, preds.astype('float64')))

cuda:0
Train on 87440 samples, validate on 9716 samples, 171 steps per epoch
Epoch 1/3
8s - loss:  0.2695 - logloss:  0.2692 - val_logloss:  0.0677
Epoch 2/3
8s - loss:  0.0398 - logloss:  0.0398 - val_logloss:  0.0371
Epoch 3/3
8s - loss:  0.0199 - logloss:  0.0199 - val_logloss:  0.0415
Testing AUC scores:  0.9972663382759003
Testing los_loss scores:  0.05990438047291399
