## Pre‐training via XGBoost for LR

In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.externals import joblib
from scipy.sparse import hstack
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_curve, auc, accuracy_score, roc_auc_score, log_loss, ndcg_score



### Read Data

In [84]:
DATASET_LON = '../data/LON-A/London_Attractions_Complete_Review.csv'
DATASET_NYC = '../data/NYC-R/New_York_City_Restaurant_Complete_Review.csv'

user_columns = ['uage', 'ugender', 'ucity', 'ucountry', 'uid_index', 'ulevel', 'ustyle']
LON_item_columns = ['iid', 'iattribute', 'irating', 'itag']
NYC_item_columns = ['iid', 'iattribute', 'iprice', 'irating', 'itag']
rating_columns = ['rtime', 'rquote', 'rrate', 'rid']

In [85]:
LON_sparse_features = ["uage", "ugender", "ucity", "ucountry", "uid_index", "ulevel", 'iid', 'irating']
NYC_sparse_features = ["uage", "ugender", "ucity", "ucountry", "uid_index", "ulevel", 'iid', 'irating', 'iprice']
var_sparse_features = ['ustyle', 'iattribute', 'itag']

def sort_by_time(df):
    return df.sort_values(by=['rid'], ascending=True)

def filter_by_occurrence(df, column, threshold):
    return df.groupby(column).filter(lambda x: len(x) >= threshold)

def split_df(df):
    df['rating_cumcounts'] = df.groupby(['uid_index'])['rid'].rank(method='first', ascending=True)
    tmp = df.groupby('uid_index').size().rename('total_counts')
    df = df.join(tmp, on='uid_index', rsuffix='_r')
    train_df = df.loc[df['rating_cumcounts'] < (df['total_counts']*0.8)]
    test_df = df.loc[df['rating_cumcounts'] >= (df['total_counts']*0.8)]
    train_df, validation_df = train_test_split(train_df, test_size=0.1, random_state=1)
    
    return train_df, validation_df, test_df

def preprocessing(df):
    df = sort_by_time(df)
    df = filter_by_occurrence(df, 'uid_index', 5)
    df = filter_by_occurrence(df, 'iid', 5)
    df['rrate'] = df['rrate'].apply(lambda x: 1 if x != 'None' else 0)
    df = df.reset_index(drop=True)
    return df

def get_data(DATASET = 'LON'):
    assert DATASET in ['LON', 'NYC']
    
    print('Read data...')
    if DATASET == 'LON':
        df = pd.read_csv(DATASET_LON, sep='\t')[user_columns + LON_item_columns + rating_columns].fillna('NaN')
        sparse_features = LON_sparse_features
    else:
        df = pd.read_csv(DATASET_NYC, sep='\t')[user_columns + NYC_item_columns + rating_columns].fillna('NaN')
        sparse_features = NYC_sparse_features
    
    # sort, filter, binarize
    df = preprocessing(df)
    
    #Label encode categorical features
    for feat in sparse_features:
        lbe = LabelEncoder()
        df[feat] = lbe.fit_transform(df[feat].astype('str'))

    train_df, val_df, test_df = split_df(df)      
    
    train_y, val_y, test_y = train_df[['rrate']], val_df[['rrate']], test_df[['rrate']]
    
    train_df, val_df, test_df = train_df[sparse_features], val_df[sparse_features], test_df[sparse_features]
    
    return train_df, val_df, test_df, train_y, val_y, test_y


### Data
 - user : uage, ugender, ucity, ucountry, uid_index, ulevel
 - item : iid, irating

In [86]:
train_df, val_df, test_df, train_y, val_y, test_y = get_data('LON')
print("\ntrain shape: \n",train_df.shape)
print("\nvalidation  shape: \n",val_df.shape)
print("\ntest shape: \n",test_df.shape)

Read data...

train shape: 
 (87440, 8)

validation  shape: 
 (9716, 8)

test shape: 
 (39339, 8)


### XGB‐LR

In [87]:
model = xgb.XGBClassifier(nthread=4,
                          learning_rate=0.08,
                          n_estimators=50,
                          max_depth=5,
                          gamma=0,
                          subsample=0.9,
                          colsample_bytree=0.5)


model.fit(train_df.values, train_y.values.ravel())

y_pred_val = model.predict_proba(val_df.values)
y_pred_test = model.predict_proba(test_df.values)

xgb_val_auc = roc_auc_score(np.array(val_y), np.array(y_pred_val)[:,1])
xgb_test_auc = roc_auc_score(np.array(test_y), np.array(y_pred_test)[:,1])
print('xgboost val auc: %.5f' % xgb_val_auc)
print('xgboost test auc: %.5f' % xgb_test_auc)

xgboost val auc: 0.97092
xgboost test auc: 0.95323


In [88]:
# xgboost encoding
xgboost = model

#apply()get leaf indices
X_train_leaves = xgboost.apply(train_df.values)
X_val_leaves = xgboost.apply(val_df.values)
X_test_leaves = xgboost.apply(test_df.values)
#Return the predicted leaf every tree for each sample.



train_rows = X_train_leaves.shape[0]
X_leaves = np.concatenate((X_train_leaves, X_val_leaves), axis=0)

val_rows = X_leaves.shape[0]
X_leaves = np.concatenate((X_leaves, X_test_leaves), axis=0)

X_leaves = X_leaves.astype(np.int32)
(rows, cols) = X_leaves.shape

In [89]:
# feature OneHotEncoding
xgbenc = OneHotEncoder()
X_trans = xgbenc.fit_transform(X_leaves)

#### Encoded feature

In [90]:
lr = LogisticRegression()

lr.fit(X_trans[:train_rows, :], train_y.values.ravel())

y_pred_val_xgblr1 = lr.predict_proba(X_trans[train_rows:val_rows, :])
y_pred_test_xgblr1 = lr.predict_proba(X_trans[val_rows:, :])
verbose=0

xgb_val_auc = roc_auc_score(np.array(val_y), np.array(y_pred_val_xgblr1)[:,1])
xgb_test_auc = roc_auc_score(np.array(test_y), np.array(y_pred_test_xgblr1)[:,1])

print('Encoded xgboost validation   auc: %.5f' % xgb_val_auc)
print('Encoded xgboost test auc: %.5f' % xgb_test_auc)

Encoded xgboost validation   auc: 0.97825
Encoded xgboost test auc: 0.96541


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [91]:
logloss_val = log_loss(val_y, y_pred_val_xgblr1.astype('float64'))
logloss_test = log_loss(test_y, y_pred_test_xgblr1.astype('float64'))

print('validation logloss scores:', logloss_val)
print('test logloss scores:', logloss_test)

validation logloss scores: 0.13155689423996186
test logloss scores: 0.16697542751644873


#### Combined feature

In [92]:
lr = LogisticRegression(n_jobs=-1)

X_train_ext = hstack([X_trans[:train_rows, :], train_df])
X_val_ext = hstack([X_trans[train_rows:val_rows, :], val_df])
X_test_ext = hstack([X_trans[val_rows:, :], test_df])

lr.fit(X_train_ext, train_y.values.ravel())

y_pred_val_xgblr2 = lr.predict_proba(X_val_ext)
y_pred_test_xgblr2 = lr.predict_proba(X_test_ext)

xgb_val_auc = roc_auc_score(np.array(val_y).T[0], np.array(y_pred_val_xgblr2)[:,1])
xgb_test_auc = roc_auc_score(np.array(test_y).T[0], np.array(y_pred_test_xgblr2)[:,1])

print('Combined feature validation LR AUC: %.5f' % xgb_val_auc)
print('Combined feature test LR AUC: %.5f' % xgb_test_auc)

Combined feature validation LR AUC: 0.94127
Combined feature test LR AUC: 0.90435


In [93]:
logloss_val = log_loss(val_y, y_pred_val_xgblr2.astype('float64'))
logloss_test = log_loss(test_y, y_pred_test_xgblr2.astype('float64'))

print('test logloss validation scores:', logloss_val)
print('test logloss test scores:', logloss_test)

test logloss validation scores: 0.2075886060175333
test logloss test scores: 0.2621061507190821


In [94]:
ndcg_val = ndcg_score(np.expand_dims(np.array(val_y).T[0], axis=0), np.expand_dims(np.array(y_pred_val_xgblr2)[:,1], axis=0), k=5)
ndcg_test = ndcg_score(np.expand_dims(np.array(test_y).T[0], axis=0), np.expand_dims(np.array(y_pred_test_xgblr2)[:,1], axis=0), k=5)

print('validation NDCG@5 scores:', ndcg_val)
print('test NDCG@5 scores:', ndcg_test)

validation NDCG@5 scores: 0.9999999999999999
test NDCG@5 scores: 0.9999999999999999
