In [4]:
import numpy as np
import pandas as pd
import torch
import csv
import re
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from deepctr_torch.inputs import SparseFeat, DenseFeat, VarLenSparseFeat, get_feature_names
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss, ndcg_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from deepctr_torch.inputs import SparseFeat, get_feature_names
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from deepctr_torch.models.pnn import PNN

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 
London_data = 'London_Attractions_Complete_Review.csv'
NewYork_data = 'New_York_City_Restaurant_Complete_Review.csv'
device

device(type='cuda', index=0)

In [3]:
LD_feature = ['uage','ugender','ucity','uid_index','ulevel','ustyle','iid','iattribute','irating','itag','rrate','rid']
NY_feature = ['uage','ugender','ucity','ucountry','uid_index','ulevel','ustyle','iid','iattribute','iprice','irating','itag','rrate','rid']
att_feature = ['ustyle', 'iattribute', 'itag']
key2index = {}

In [5]:
def sort_by_time(df):
    return df.sort_values(by='rid', ascending=True)

def filter_by_occurrence(df, column, threshold):
    return df.groupby(column).filter(lambda x: len(x) >= threshold)

def filtering(df, threshold) :
    df_i = filter_by_occurrence(df, 'iid', threshold)
    df_u = filter_by_occurrence(df_i, 'uid_index', threshold)
    if len(df_i) == len(df_u) :
        print('Completing deleting the number of user/item above %d' %threshold)
        return df_u
    else :
        return filtering(df_u, threshold)

def preprocessing(df, threshold) :
    df = df.fillna('NaN')
    df = filtering(df, threshold)
    df['rrate'] = df['rrate'].apply(lambda x: 1 if x != 'None' else 0)
    df = sort_by_time(df)
    df = df.reset_index(drop=True)
    df['index'] = df.index
    return df

'''
def spliting(df, valid_rate=0.1, test_rate=0.2) :
    test_df = pd.DataFrame()
    left_df = pd.DataFrame()   
    #train_df = pd.DataFrame()
    #valid_df = pd.DataFrame()
    uid = df.groupby('uid_index')
    for id in uid.size().to_dict().keys():
        user = uid.get_group(id)
        test_df = test_df.append(user.iloc[int(len(user)*(1-test_rate)):])
        left_df = left_df.append(user.iloc[:int(len(user)*(1-test_rate))])
        print(id)
    ind = (len(test_df)*valid_rate/test_rate)/len(left_df)
    train_df, valid_df = train_test_split(left_df, test_size=ind, random_state=0)
    test_df = shuffle(test_df, random_state=0)
    return train_df, valid_df, test_df'''

def split_df(df):
    df['rating_cumcounts'] = df.groupby(['uid_index'])['rid'].rank(method='first', ascending=True)
    tmp = df.groupby('uid_index').size().rename('total_counts')
    df = df.join(tmp, on='uid_index', rsuffix='_r')
    train_df = df.loc[df['rating_cumcounts'] < (df['total_counts']*0.8)]
    test_df = df.loc[df['rating_cumcounts'] >= (df['total_counts']*0.8)]
    train_df, validation_df = train_test_split(train_df, test_size=0.1, random_state=1)
    #((len(test_df)/2)/len(train_df))
    return train_df, validation_df, test_df

def split(x):
    key_ans = x.split(',')
    for key in key_ans:
        if key not in key2index:
            # Notice : input value 0 is a special "padding",so we do not use 0 to encode valid feature for sequence input
            key2index[key] = len(key2index) + 1
    return list(map(lambda x: key2index[x], key_ans))

def var_feature(df, var) :
    df[var] = df[var].str.replace('[', '').str.replace(']', '').str.replace(' ', '')
    key2index.clear()
    var_list = list(map(split, df[var].values))
    var_length = np.array(list(map(len, var_list)))
    max_len = max(var_length)
    # Notice : padding=`post`
    var_list = pad_sequences(var_list, maxlen = max_len, padding='post', )
    return var_list, key2index, max_len

def generate_var(df, var) :
    var_out = {}
    dict_len, len_list = [], []
    for i in var :
        var_list, key_dict, max_len = var_feature(df, i)
        #print(key_dict)
        var_out[i] = var_list
        dict_len.append(len(key_dict))
        len_list.append(max_len)
    varlen_feature_columns = [VarLenSparseFeat(SparseFeat( feat, vocabulary_size=dict_len[i] + 1
                                , embedding_dim=4), maxlen=len_list[i], combiner='mean') for i, feat in enumerate(var)]
    return var_out, varlen_feature_columns

def input_data(data, var, threshold) :
    df = pd.read_csv(data, sep='\t')[var]
    df = preprocessing(df, threshold)
    
    sparse_feature = [l for l in var if l not in att_feature][:-2]
    for feat in sparse_feature:
        lbe = LabelEncoder()
        df[feat] = lbe.fit_transform(df[feat].astype('str'))
        
    var_input, varlen_feature_columns = generate_var(df, att_feature)
    fixlen_feature_columns = [SparseFeat(feat, df[feat].nunique(), embedding_dim=4)
                             for feat in sparse_feature]
    
    linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
    
    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
    
    train_df, valid_df, test_df = split_df(df)
    
    train_input = {name: train_df[name] for name in sparse_feature}
    valid_input = {name: valid_df[name] for name in sparse_feature}
    test_input = {name: test_df[name] for name in sparse_feature}
    for feat in att_feature:
        train_input[feat] = var_input[feat][train_df['index'].values]
        valid_input[feat] = var_input[feat][valid_df['index'].values]
        test_input[feat] = var_input[feat][test_df['index'].values]
    train_output, val_output, test_output = train_df['rrate'].values, valid_df ['rrate'].values, test_df['rrate'].values
    
    return train_input, train_output, valid_input, val_output, test_input, test_output, linear_feature_columns, dnn_feature_columns

In [6]:
def NeuMF_data(df) :
    out = pd.DataFrame({
        "userID" : df['uid_index'].values,
        "itemID" : df['iid'].values,
        "rating" : df['rrate'].values,
        "timestamp" : df['rid'].values
    })
    return out

def split_NCF_df(df):
    df['rating_cumcounts'] = df.groupby(['uid_index'])['rid'].rank(method='first', ascending=True)
    tmp = df.groupby('uid_index').size().rename('total_counts')
    df = df.join(tmp, on='uid_index', rsuffix='_r')
    train_df = df.loc[df['rating_cumcounts'] < (df['total_counts']*0.8)]
    test_df = df.loc[df['rating_cumcounts'] >= (df['total_counts']*0.8)]
        
    return train_df, test_df    

def NMF_preprocess(filename, feature, threshold) :
    df = pd.read_csv(filename, sep='\t')[feature]
    df = preprocessing(df, threshold)
    train_df, test_df = split_NCF_df(df)
    return train_df, test_df

### Generate Data for PNN with deepCTR

In [7]:
data_CTR = input_data(London_data, LD_feature, 5)
#data_CTR = input_data(NewYork_data, NY_feature, 5)

Completing deleting the number of user/item above 5


In [8]:
train_x, train_y, val_x, val_y, test_x, test_y, linear_feature_columns, dnn_feature_columns =  data_CTR

In [9]:
len(train_y), len(val_y), len(test_y)

(87282, 9699, 39282)

### Generate Data for NCF and save to csv

-  uid_index -> userID
-  iid_index -> itemID
-  rrate -> rating
-  rid -> timestamp

In [10]:
LD_train_NCF, LD_test_NCF = NMF_preprocess(London_data, LD_feature, 5)
NY_train_NCF, NY_test_NCF = NMF_preprocess(NewYork_data, NY_feature, 5)

Completing deleting the number of user/item above 5
Completing deleting the number of user/item above 5


In [11]:
pLD_train_NCF = NeuMF_data(LD_train_NCF)
pLD_test_NCF = NeuMF_data(LD_test_NCF)
pNY_train_NCF = NeuMF_data(NY_train_NCF)
pNY_test_NCF = NeuMF_data(NY_test_NCF)

In [12]:
pLD_train_NCF.to_csv('LD_train.csv', index=False)
pLD_test_NCF.to_csv('LD_test.csv', index=False)
pNY_train_NCF.to_csv('NY_train.csv', index=False)
pNY_test_NCF.to_csv('NY_test.csv', index=False)

## DeepCTR PNN

### IPNN

In [70]:
model = PNN(dnn_feature_columns, task='binary', device=device, use_inner=True, use_outter=False)
model.compile("adam", "binary_crossentropy", metrics=['acc'], )

In [71]:
history = model.fit(train_x, train_y, batch_size = 512, epochs=3, verbose=2, validation_data=(val_x, val_y))

cuda:0
Train on 87282 samples, validate on 9699 samples, 171 steps per epoch
Epoch 1/3
9s - loss:  0.3264 - acc:  0.8433 - val_acc:  0.9842
Epoch 2/3
10s - loss:  0.0619 - acc:  0.9882 - val_acc:  0.9889
Epoch 3/3
10s - loss:  0.0270 - acc:  0.9929 - val_acc:  0.9874


In [72]:
preds = model.predict(test_x, use_double=False)

In [73]:
print('Testing AUC scores: ', roc_auc_score(test_y, preds))
print('Testing log_loss scores: ', log_loss(test_y, preds))
print('NDCG@5 : ', ndcg_score(np.expand_dims(test_y, axis=0), preds.reshape(1,-1)))

Testing AUC scores:  0.9955130976131802
Testing log_loss scores:  0.055174912171581825
NDCG@5 :  0.9999102623398726


### OPNN

In [74]:
model = PNN(dnn_feature_columns, task='binary', device=device, use_inner=False, use_outter=True)
model.compile("adam", "binary_crossentropy", metrics=['acc'], )

In [75]:
history = model.fit(train_x, train_y, batch_size = 512, epochs=3, verbose=2, validation_data=(val_x, val_y))

cuda:0
Train on 87282 samples, validate on 9699 samples, 171 steps per epoch
Epoch 1/3
11s - loss:  0.3298 - acc:  0.8413 - val_acc:  0.9832
Epoch 2/3
10s - loss:  0.0722 - acc:  0.9878 - val_acc:  0.9887
Epoch 3/3
10s - loss:  0.0289 - acc:  0.9932 - val_acc:  0.9880


In [76]:
preds = model.predict(test_x, use_double=False)

In [77]:
print('Testing AUC scores: ', roc_auc_score(test_y, preds))
print('Testing log_loss scores: ', log_loss(test_y, preds))
print('NDCG@5 : ', ndcg_score(np.expand_dims(test_y, axis=0), preds.reshape(1,-1)))

Testing AUC scores:  0.9954970069757177
Testing log_loss scores:  0.056984476439332625
NDCG@5 :  0.9999100005974142
