## LVXNN

In [None]:
import time
import numpy as np
import pandas as pd 
import tensorflow as tf
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from gaminet.utils import local_visualize
from gaminet.utils import global_visualize_density
from gaminet.utils import feature_importance_visualize
from gaminet.utils import plot_trajectory
from gaminet.utils import plot_regularization
from sklearn.linear_model import LinearRegression
from collections import OrderedDict
import sys
sys.path.append('../../')
from lvxnn.LVXNN import LV_XNN
from lvxnn.DataReader import data_initialize

# Load the train / test tables from disk.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

# Every column starts out tagged as a categorical item feature; the entries
# below then override the columns that need a different type or source.
list1 = train.columns
meta_info = OrderedDict(
    (column, {'type': 'categorical', 'source': 'item'}) for column in list1
)
# NOTE(review): "continues" looks like a typo for "continuous" -- confirm which
# spelling data_initialize expects before changing it.
meta_info.update([
    ('Occupation', {"type": "categorical", 'source': 'user'}),
    ('Gender', {"type": "categorical", 'source': 'user'}),
    ('Age', {"type": "continues", 'source': 'user'}),
    ('user_id', {"type": "id", 'source': 'user'}),
    ('item_id', {"type": "id", 'source': 'item'}),
    ('target', {"type": "target", 'source': ''}),
])


# data_initialize returns the train/test features, id columns and targets,
# plus the (re-bound) meta_info and the model_info consumed by LV_XNN below.
(tr_x, tr_Xi, tr_y,
 te_x, te_Xi, te_y,
 meta_info, model_info) = data_initialize(train, test, meta_info, "Regression")


def auto_test():
    """Train LV_XNN with ten different seeds and summarise test errors.

    For each seed (0..9) a fresh LV_XNN model is fitted on the module-level
    training arrays (tr_x, tr_Xi, tr_y) and evaluated on the test arrays
    (te_x, te_Xi, te_y). Test rows whose user id or item id column equals
    'cold' are scored separately from the remaining ("warm") rows. The
    GAM-only sub-model (``model.final_gam_model``) is scored on the whole
    test set.

    Returns:
        (result, g_result): two one-row DataFrames. ``result`` has columns
        cold_mae, cold_rmse, warm_mae, warm_rmse; ``g_result`` has columns
        mae, rmse. Each value is the mean over the ten runs.
    """
    cold_mae = []
    cold_rmse = []
    warm_mae = []
    warm_rmse = []
    gami_mae = []
    gami_rmse = []

    # The test id columns do not change between runs, so build the masks once.
    # A row is "cold" when either id column carries the 'cold' marker; the
    # warm rows are exactly the complement.
    cold_mask = (te_Xi[:, 1] == 'cold') | (te_Xi[:, 0] == 'cold')
    warm_mask = np.logical_not(cold_mask)
    cold_y = te_y[cold_mask]
    warm_y = te_y[warm_mask]

    for times in range(10):

        print(times)

        model = LV_XNN(
            model_info=model_info, meta_info=meta_info,
            subnet_arch=[8, 16], interact_arch=[20, 10],
            activation_func=tf.tanh, batch_size=1000, lr_bp=0.01,
            auto_tune=False,
            interaction_epochs=20, main_effect_epochs=20, tuning_epochs=20,
            loss_threshold_main=0.01, loss_threshold_inter=0.01, alpha=0.5,
            verbose=True, val_ratio=0.125, early_stop_thres=100,
            interact_num=10, u_group_num=30, i_group_num=50, scale_ratio=1,
            n_power_iterations=5, n_oversamples=0,
            mf_training_iters=1, mf_tuning_iters=300, change_mode=True,
            convergence_threshold=0.001, max_rank=3, shrinkage_value=20,
            random_state=times,  # the run index doubles as the seed
        )
        model.fit(tr_x, tr_Xi, tr_y)

        pred = model.predict(te_x, te_Xi)
        cold_pred = pred[cold_mask]
        warm_pred = pred[warm_mask]

        cold_mae.append(mean_absolute_error(cold_y, cold_pred))
        cold_rmse.append(mean_squared_error(cold_y, cold_pred) ** 0.5)
        warm_mae.append(mean_absolute_error(warm_y, warm_pred))
        warm_rmse.append(mean_squared_error(warm_y, warm_pred) ** 0.5)

        # Predict with the GAM-only sub-model once and reuse it for both metrics.
        gam_pred = model.final_gam_model.predict(te_x)
        gami_mae.append(mean_absolute_error(te_y, gam_pred))
        gami_rmse.append(mean_squared_error(te_y, gam_pred) ** 0.5)

    i_result = np.array([np.mean(cold_mae), np.mean(cold_rmse),
                         np.mean(warm_mae), np.mean(warm_rmse)]).reshape(1, -1)
    result = pd.DataFrame(i_result, columns=['cold_mae', 'cold_rmse', 'warm_mae', 'warm_rmse'])

    g_result = np.array([np.mean(gami_mae), np.mean(gami_rmse)]).reshape(1, -1)
    g_result = pd.DataFrame(g_result, columns=['mae', 'rmse'])

    return result, g_result

# Run the LVXNN benchmark and write both summary tables (without an index column).
results, g_result = auto_test()
results.to_csv('result/LVXNN_result.csv', index=None)
g_result.to_csv('result/gami_result.csv', index=None)

Memory usage of dataframe is 146.52 MB
Memory usage after optimization is: 19.84 MB
Decreased by 86.5%
Memory usage of dataframe is 36.63 MB
Memory usage after optimization is: 4.96 MB
Decreased by 86.5%
cold start user: 3086
cold start item: 1964
0
ListWrapper(['Gender', 'Age', 'Occupation', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'])
####################GAMI-Net training start.####################
##########Stage 1: main effect training start.##########
Main effects training epoch: 1, train loss: 0.89224, val loss: 0.88902
Main effects training epoch: 2, train loss: 0.87938, val loss: 0.87646
Main effects training epoch: 3, train loss: 0.87468, val loss: 0.87161
Main effects training epoch: 4, train loss: 0.87453, val loss: 0.87135
Main effects training epoch: 5, train loss: 0.86936, val loss: 0.86602
Main effects training epoch:

[SoftImpute] Iter 38: observed MAE=0.703574 validation MAE=0.721410,rank=3
[SoftImpute] Iter 39: observed MAE=0.703069 validation MAE=0.721027,rank=3
[SoftImpute] Iter 40: observed MAE=0.702581 validation MAE=0.720657,rank=3
[SoftImpute] Iter 41: observed MAE=0.702111 validation MAE=0.720302,rank=3
[SoftImpute] Iter 42: observed MAE=0.701656 validation MAE=0.719959,rank=3
[SoftImpute] Iter 43: observed MAE=0.701216 validation MAE=0.719629,rank=3
[SoftImpute] Iter 44: observed MAE=0.700789 validation MAE=0.719310,rank=3
[SoftImpute] Iter 45: observed MAE=0.700376 validation MAE=0.719001,rank=3
[SoftImpute] Iter 46: observed MAE=0.699976 validation MAE=0.718702,rank=3
[SoftImpute] Iter 47: observed MAE=0.699587 validation MAE=0.718413,rank=3
[SoftImpute] Iter 48: observed MAE=0.699210 validation MAE=0.718131,rank=3
[SoftImpute] Iter 49: observed MAE=0.698843 validation MAE=0.717857,rank=3
[SoftImpute] Iter 50: observed MAE=0.698486 validation MAE=0.717590,rank=3
[SoftImpute] Iter 51: obs

[SoftImpute] Iter 147: observed MAE=0.681964 validation MAE=0.704863,rank=3
[SoftImpute] Iter 148: observed MAE=0.681875 validation MAE=0.704789,rank=3
[SoftImpute] Iter 149: observed MAE=0.681788 validation MAE=0.704716,rank=3
[SoftImpute] Iter 150: observed MAE=0.681701 validation MAE=0.704643,rank=3
[SoftImpute] Iter 151: observed MAE=0.681615 validation MAE=0.704570,rank=3
[SoftImpute] Iter 152: observed MAE=0.681530 validation MAE=0.704498,rank=3
[SoftImpute] Iter 153: observed MAE=0.681445 validation MAE=0.704427,rank=3
[SoftImpute] Iter 154: observed MAE=0.681362 validation MAE=0.704356,rank=3
[SoftImpute] Iter 155: observed MAE=0.681279 validation MAE=0.704286,rank=3
[SoftImpute] Iter 156: observed MAE=0.681197 validation MAE=0.704217,rank=3
[SoftImpute] Iter 157: observed MAE=0.681116 validation MAE=0.704148,rank=3
[SoftImpute] Iter 158: observed MAE=0.681036 validation MAE=0.704080,rank=3
[SoftImpute] Iter 159: observed MAE=0.680956 validation MAE=0.704013,rank=3
[SoftImpute]

[SoftImpute] Iter 255: observed MAE=0.676006 validation MAE=0.699718,rank=3
[SoftImpute] Iter 256: observed MAE=0.675974 validation MAE=0.699690,rank=3
[SoftImpute] Iter 257: observed MAE=0.675942 validation MAE=0.699661,rank=3
[SoftImpute] Iter 258: observed MAE=0.675911 validation MAE=0.699633,rank=3
[SoftImpute] Iter 259: observed MAE=0.675879 validation MAE=0.699605,rank=3
[SoftImpute] Iter 260: observed MAE=0.675848 validation MAE=0.699578,rank=3
[SoftImpute] Iter 261: observed MAE=0.675817 validation MAE=0.699550,rank=3
[SoftImpute] Iter 262: observed MAE=0.675787 validation MAE=0.699523,rank=3
[SoftImpute] Iter 263: observed MAE=0.675757 validation MAE=0.699497,rank=3
[SoftImpute] Iter 264: observed MAE=0.675727 validation MAE=0.699470,rank=3
[SoftImpute] Iter 265: observed MAE=0.675697 validation MAE=0.699444,rank=3
[SoftImpute] Iter 266: observed MAE=0.675668 validation MAE=0.699418,rank=3
[SoftImpute] Iter 267: observed MAE=0.675639 validation MAE=0.699392,rank=3
[SoftImpute]

## xgboost

In [4]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
# Hold out 12.5% of the training rows; only the `data` part is used for fitting below.
data, val = train_test_split(train, test_size=0.125)

# Every column except the last is a feature; the last column is the regression target.
x = data.iloc[:, :-1].values
y = data.iloc[:, -1].values
x_t = test.iloc[:, :-1].values
y_t = test.iloc[:, -1].values

# Fit the scaler on the training features only and reuse the learned min/max
# for the test features. Refitting on the test set would scale the two splits
# differently, so the model would be evaluated on inconsistently scaled inputs.
enc = MinMaxScaler()
x = enc.fit_transform(x)
x_t = enc.transform(x_t)

def auto_test():
    """Fit XGBRegressor ten times and return the mean test MAE / RMSE.

    Uses the module-level arrays x, y (training) and x_t, y_t (test).

    Returns:
        A one-row DataFrame with columns 'mae' and 'rmse', each the mean
        over the ten fits.
    """
    mae_runs, rmse_runs = [], []
    for _ in range(10):
        booster = XGBRegressor(n_jobs=-1)
        booster.fit(x, y)
        y_hat = booster.predict(x_t)

        mae_runs.append(mean_absolute_error(y_t, y_hat))
        rmse_runs.append(mean_squared_error(y_t, y_hat) ** 0.5)

    # One row: [mean MAE, mean RMSE].
    summary = np.array([np.mean(mae_runs), np.mean(rmse_runs)]).reshape(1, -1)
    return pd.DataFrame(summary, columns=['mae', 'rmse'])

# Run the XGBoost benchmark and write the summary table (without an index column).
results = auto_test()
results.to_csv('result/xgboost_result.csv', index=None)

## SVD

In [5]:
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from sklearn.model_selection import train_test_split

train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
# Hold out 12.5% of the training rows; only the `data` part is used for fitting below.
data, val = train_test_split(train, test_size=0.125)

# The two columns just before the last one are used as the (user, item) ids;
# the last column is the rating.
Xi = data.iloc[:, -3:-1].values
y = data.iloc[:, -1].values
Xi_t = test.iloc[:, -3:-1].values
y_t = test.iloc[:, -1].values

tr_ratings_dict = {
    'itemID': Xi[:, 1].tolist(),
    'userID': Xi[:, 0].tolist(),
    'rating': y.tolist(),
}
tr_df = pd.DataFrame(tr_ratings_dict)

# A reader is still needed, but only the rating_scale parameter is required.
reader = Reader(rating_scale=(y.min(), y.max()))

# The columns must correspond to user id, item id and ratings (in that order).
tr_data = Dataset.load_from_df(tr_df[['userID', 'itemID', 'rating']], reader)

# Use every loaded rating for training (no internal split).
tr_data = tr_data.build_full_trainset()

def auto_test():
    """Fit a rank-3 Surprise SVD ten times and return the mean test MAE / RMSE.

    Uses the module-level trainset ``tr_data`` for fitting and the test id
    pairs ``Xi_t`` (column 0 = user id, column 1 = item id) with targets
    ``y_t`` for evaluation.

    Returns:
        A one-row DataFrame with columns 'mae' and 'rmse', each the mean
        over the ten fits.
    """
    mae = []
    rmse = []
    for _ in range(10):
        model = SVD(n_factors=3)
        model.fit(tr_data)

        # Only the estimate is needed, so the optional true-rating argument
        # (r_ui) is left out; the original passed the user id in that slot.
        pred = [model.predict(uid, iid).est for uid, iid in Xi_t]
        pred2 = np.asarray(pred).ravel()

        mae.append(mean_absolute_error(y_t, pred2))
        rmse.append(mean_squared_error(y_t, pred2) ** 0.5)

    i_result = np.array([np.mean(mae), np.mean(rmse)]).reshape(1, -1)
    result = pd.DataFrame(i_result, columns=['mae', 'rmse'])

    return result

# Run the SVD benchmark and write the summary table (without an index column).
results = auto_test()
results.to_csv('result/svd_result.csv', index=None)

## deepfm

In [1]:
class config:
    """Static settings for the DeepFM benchmark: file paths, CV setup, column roles."""

    # --- paths ---
    TRAIN_FILE = "./data/train.csv"
    TEST_FILE = "./data/test.csv"
    SUB_DIR = "./output"

    # --- cross-validation ---
    NUM_SPLITS = 3
    RANDOM_SEED = 2017

    # --- column roles in the dataset dataframe ---
    CATEGORICAL_COLS = ['Occupation']
    NUMERIC_COLS = [
        'Gender',
        'Age',
        'Action',
        'Adventure',
        'Animation',
        "Children's",
        'Comedy',
        'Crime',
        'Documentary',
        'Drama',
        'Fantasy',
        'Film-Noir',
        'Horror',
        'Musical',
        'Mystery',
        'Romance',
        'Sci-Fi',
        'Thriller',
        'War',
        'Western',
    ]
    IGNORE_COLS = ["target"]
    
import os
import sys
import numpy as np
import pandas as pd
import tensorflow as tf
from matplotlib import pyplot as plt
from sklearn.metrics import make_scorer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error,mean_squared_error
from sklearn.model_selection import train_test_split
sys.path.append('../benchmark/deepfm/')
from DataReader import FeatureDictionary, DataParser
from DeepFM import DeepFM


def _load_data():
    """Read the train / test CSVs named in ``config`` and split out arrays.

    Every column except "target" is treated as a feature column.

    Returns:
        A tuple ``(dfTrain, dfTest, X_train, y_train, X_test, ids_test,
        idv_test, y_test)`` where the two DataFrames are the files as read,
        ``X_*`` are the feature values, ``y_*`` are the "target" values, and
        ``ids_test`` / ``idv_test`` are the test "user_id" / "item_id" values.
    """
    dfTrain = pd.read_csv(config.TRAIN_FILE)
    dfTest = pd.read_csv(config.TEST_FILE)

    # The former nested `preprocess` helper returned its input unchanged, so
    # the frames are used exactly as read.
    cols = [c for c in dfTrain.columns if c != "target"]

    X_train = dfTrain[cols].values
    y_train = dfTrain["target"].values
    X_test = dfTest[cols].values

    ids_test = dfTest["user_id"].values
    idv_test = dfTest["item_id"].values
    y_test = dfTest['target'].values

    return dfTrain, dfTest, X_train, y_train, X_test, ids_test, idv_test, y_test


def _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params):
    """Train one DeepFM per CV fold and return out-of-fold and test predictions.

    Args:
        dfTrain: training DataFrame (with labels).
        dfTest: test DataFrame.
        folds: sequence of ``(train_idx, valid_idx)`` index pairs.
        dfm_params: keyword arguments for ``DeepFM``. This dict is updated
            in place with "feature_size" and "field_size" and then printed.

    Returns:
        ``(y_train_meta, y_test_meta)``: column vectors holding, respectively,
        the out-of-fold predictions for the training rows and the test
        predictions averaged over the folds.
    """
    feat_dict = FeatureDictionary(
        dfTrain=dfTrain,
        dfTest=dfTest,
        numeric_cols=config.NUMERIC_COLS,
        ignore_cols=config.IGNORE_COLS,
    )
    parser = DataParser(feat_dict=feat_dict)
    Xi_train, Xv_train, y_train = parser.parse(df=dfTrain, has_label=True)
    Xi_test, Xv_test, ids_test, idv_test = parser.parse(df=dfTest)

    # The model dimensions depend on the parsed data, so they are filled in here.
    dfm_params["feature_size"] = feat_dict.feat_dim
    dfm_params["field_size"] = len(Xi_train[0])
    print(dfm_params)

    def _take(rows, indices):
        # Pick the given positions out of a plain Python sequence.
        return [rows[k] for k in indices]

    n_folds = len(folds)
    n_epochs = dfm_params["epoch"]

    y_train_meta = np.zeros((dfTrain.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)

    # Per-fold bookkeeping; filled in below but not returned.
    fold_mae = np.zeros(n_folds, dtype=float)
    epoch_train_results = np.zeros((n_folds, n_epochs), dtype=float)
    epoch_valid_results = np.zeros((n_folds, n_epochs), dtype=float)

    for fold_no, (train_idx, valid_idx) in enumerate(folds):
        Xi_fit = _take(Xi_train, train_idx)
        Xv_fit = _take(Xv_train, train_idx)
        y_fit = _take(y_train, train_idx)

        Xi_val = _take(Xi_train, valid_idx)
        Xv_val = _take(Xv_train, valid_idx)
        y_val = _take(y_train, valid_idx)

        dfm = DeepFM(**dfm_params)
        dfm.fit(Xi_fit, Xv_fit, y_fit, Xi_val, Xv_val, y_val)

        # Out-of-fold predictions go into the validation rows; test predictions
        # are accumulated and averaged after the loop.
        y_train_meta[valid_idx, 0] = dfm.predict(Xi_val, Xv_val)
        y_test_meta[:, 0] += dfm.predict(Xi_test, Xv_test)

        fold_mae[fold_no] = mean_absolute_error(y_val, y_train_meta[valid_idx])
        epoch_train_results[fold_no] = dfm.train_result
        epoch_valid_results[fold_no] = dfm.valid_result

    y_test_meta /= float(n_folds)

    return y_train_meta, y_test_meta

# Load the raw frames and the arrays derived from them.
(dfTrain, dfTest,
 X_train, y_train,
 X_test, ids_test, idv_test, y_test) = _load_data()

# Fixed, shuffled K-fold split of the training rows, materialised as a list
# so the same folds can be reused across runs.
folds = list(
    KFold(
        n_splits=config.NUM_SPLITS,
        shuffle=True,
        random_state=config.RANDOM_SEED,
    ).split(X_train, y_train)
)


# ------------------ DeepFM Model ------------------
# Keyword arguments passed to DeepFM; "use_deep" and "random_seed" are
# overwritten by auto_test before each run.
dfm_params = {
    "embedding_size": 3,
    "deep_layers": [32, 32],
    "use_deep": True,
    "use_fm": True,
    "deep_layers_activation": tf.nn.relu,
    "loss_type": "mse",
    "epoch": 50,
    "batch_size": 1024,
    "learning_rate": 0.01,
    "optimizer_type": "adam",
    "batch_norm": 0,
    "batch_norm_decay": 0.995,
    "l2_reg": 0.01,
    "verbose": False,
    "eval_metric": mean_absolute_error,
    "random_seed": config.RANDOM_SEED,
}

def auto_test(deep):
    """Run the cross-validated DeepFM five times and return mean test MAE / RMSE.

    Args:
        deep: value written to ``dfm_params['use_deep']`` before the runs.

    Returns:
        A one-row DataFrame with columns 'mae' and 'rmse', each the mean
        over the five seeded runs.

    Note:
        The module-level ``dfm_params`` dict is modified in place
        ('use_deep' and 'random_seed').
    """
    dfm_params['use_deep'] = deep

    mae_runs, rmse_runs = [], []
    for seed in range(5):
        dfm_params['random_seed'] = seed
        _, test_pred = _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params)
        mae_runs.append(mean_absolute_error(y_test, test_pred))
        rmse_runs.append(mean_squared_error(y_test, test_pred) ** 0.5)

    # One row: [mean MAE, mean RMSE].
    summary = np.array([np.mean(mae_runs), np.mean(rmse_runs)]).reshape(1, -1)
    return pd.DataFrame(summary, columns=['mae', 'rmse'])
# use_deep=True is saved as the DeepFM result, use_deep=False as the FM result.
result_1 = auto_test(True)
result_2 = auto_test(False)
result_1.to_csv('result/deepfm_result.csv', index=None)
result_2.to_csv('result/fm_result.csv', index=None)

W0613 14:05:39.747494  9740 deprecation.py:323] From C:\Users\64161\Anaconda3\lib\site-packages\tensorflow_core\python\compat\v2_compat.py:65: disable_resource_variables (from tensorflow.python.ops.variable_scope) is deprecated and will be removed in a future version.
Instructions for updating:
non-resource variables are not supported in the long term


{'embedding_size': 3, 'deep_layers': [32, 32], 'use_deep': True, 'use_fm': True, 'deep_layers_activation': <function relu at 0x00000261068156A8>, 'loss_type': 'mse', 'epoch': 50, 'batch_size': 1024, 'learning_rate': 0.01, 'optimizer_type': 'adam', 'batch_norm': 0, 'batch_norm_decay': 0.995, 'l2_reg': 0.01, 'verbose': False, 'eval_metric': <function mean_absolute_error at 0x000002611A8EB378>, 'random_seed': 0, 'feature_size': 9787, 'field_size': 23}


W0613 14:05:51.088824  9740 deprecation.py:506] From ../benchmark/deepfm\DeepFM.py:93: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


{'embedding_size': 3, 'deep_layers': [32, 32], 'use_deep': True, 'use_fm': True, 'deep_layers_activation': <function relu at 0x00000261068156A8>, 'loss_type': 'mse', 'epoch': 50, 'batch_size': 1024, 'learning_rate': 0.01, 'optimizer_type': 'adam', 'batch_norm': 0, 'batch_norm_decay': 0.995, 'l2_reg': 0.01, 'verbose': False, 'eval_metric': <function mean_absolute_error at 0x000002611A8EB378>, 'random_seed': 1, 'feature_size': 9787, 'field_size': 23}
{'embedding_size': 3, 'deep_layers': [32, 32], 'use_deep': True, 'use_fm': True, 'deep_layers_activation': <function relu at 0x00000261068156A8>, 'loss_type': 'mse', 'epoch': 50, 'batch_size': 1024, 'learning_rate': 0.01, 'optimizer_type': 'adam', 'batch_norm': 0, 'batch_norm_decay': 0.995, 'l2_reg': 0.01, 'verbose': False, 'eval_metric': <function mean_absolute_error at 0x000002611A8EB378>, 'random_seed': 2, 'feature_size': 9787, 'field_size': 23}
{'embedding_size': 3, 'deep_layers': [32, 32], 'use_deep': True, 'use_fm': True, 'deep_layers_