## Pre‐training via GBDT for LR

In [136]:
from __future__ import division
import json
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from sklearn.datasets.samples_generator import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss, ndcg_score

### Read Data

In [137]:
DATASET_LON = '../data/LON-A/London_Attractions_Complete_Review.csv'
DATASET_NYC = '../data/NYC-R/New_York_City_Restaurant_Complete_Review.csv'

user_columns = ['uage', 'ugender', 'ucity', 'ucountry', 'uid_index', 'ulevel', 'ustyle']
LON_item_columns = ['iid', 'iattribute', 'irating', 'itag']
NYC_item_columns = ['iid', 'iattribute', 'iprice', 'irating', 'itag']
rating_columns = ['rtime', 'rquote', 'rrate', 'rid']

In [138]:
LON_sparse_features = ["uage", "ugender", "ucity", "ucountry", "uid_index", "ulevel", 'iid', 'irating']
NYC_sparse_features = ["uage", "ugender", "ucity", "ucountry", "uid_index", "ulevel", 'iid', 'irating', 'iprice']
var_sparse_features = ['ustyle', 'iattribute', 'itag']

def sort_by_time(df):
    return df.sort_values(by=['rid'], ascending=True)

def filter_by_occurrence(df, column, threshold):
    return df.groupby(column).filter(lambda x: len(x) >= threshold)

def split_df(df):
    df['rating_cumcounts'] = df.groupby(['uid_index'])['rid'].rank(method='first', ascending=True)
    tmp = df.groupby('uid_index').size().rename('total_counts')
    df = df.join(tmp, on='uid_index', rsuffix='_r')
    train_df = df.loc[df['rating_cumcounts'] < (df['total_counts']*0.8)]
    test_df = df.loc[df['rating_cumcounts'] >= (df['total_counts']*0.8)]
    train_df, validation_df = train_test_split(train_df, test_size=0.1, random_state=1)
    
    return train_df, validation_df, test_df

def preprocessing(df):
    df = sort_by_time(df)
    df = filter_by_occurrence(df, 'uid_index', 5)
    df = filter_by_occurrence(df, 'iid', 5)
    df['rrate'] = df['rrate'].apply(lambda x: 1 if x != 'None' else 0)
    df = df.reset_index(drop=True)
    return df

def get_data(DATASET = 'LON'):
    assert DATASET in ['LON', 'NYC']
    
    print('Read data...')
    if DATASET == 'LON':
        df = pd.read_csv(DATASET_LON, sep='\t')[user_columns + LON_item_columns + rating_columns].fillna('NaN')
        sparse_features = LON_sparse_features
    else:
        df = pd.read_csv(DATASET_NYC, sep='\t')[user_columns + NYC_item_columns + rating_columns].fillna('NaN')
        sparse_features = NYC_sparse_features
    
    # sort, filter, binarize
    df = preprocessing(df)
    
    #Label encode categorical features
    for feat in sparse_features:
        lbe = LabelEncoder()
        df[feat] = lbe.fit_transform(df[feat].astype('str'))
  
    train_df, val_df, test_df = split_df(df)      
    
    train_y, val_y, test_y = train_df[['rrate']], val_df[['rrate']], test_df[['rrate']]
    
    train_df, val_df, test_df = train_df[sparse_features], val_df[sparse_features], test_df[sparse_features]
    
    return train_df, val_df, test_df, train_y, val_y, test_y


### Data
 - user : uage, ugender, ucity, ucountry, uid_index, ulevel
 - item : iid, irating

In [139]:
train_df, val_df, test_df, train_y, val_y, test_y = get_data('LON')
print("\ntrain shape: \n",train_df.shape)
print("\nvalidation  shape: \n",val_df.shape)
print("\ntest shape: \n",test_df.shape)

Read data...

train shape: 
 (87440, 8)

validation  shape: 
 (9716, 8)

test shape: 
 (39339, 8)


### GBDT‐LR

In [140]:
class GBDTLR(BaseEstimator, ClassifierMixin):
    def __init__(self, n_estimators=100, max_depth=3,
                 min_samples_leaf=1, max_leaf_nodes=None,
                 subsample=1.0, learning_rate=0.1,
                 max_iter=100, C=1.0, random_state=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.max_leaf_nodes = max_leaf_nodes
        self.subsample = subsample
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        self.C = C
        self.random_state = random_state

        self.gbdt_params = {
            'n_estimators': self.n_estimators,
            'max_depth': self.max_depth,
            'min_samples_leaf': self.min_samples_leaf,
            'max_leaf_nodes': self.max_leaf_nodes,
            'subsample': self.subsample,
            'learning_rate': self.learning_rate
        }

        self.lr_params = {
            'C': self.C,
            'max_iter': self.max_iter
        }

        self.GBDT = GradientBoostingClassifier(**self.gbdt_params, random_state=random_state)
        self.LR = LogisticRegression(**self.lr_params, random_state=random_state)
        self.ENC = OneHotEncoder(categories='auto')

    def fit(self, X, y):
        X_gbdt, X_lr, Y_gbdt, Y_lr = train_test_split(X, y, test_size=0.5)
        self.GBDT.fit(X_gbdt, Y_gbdt)
        tree_feature = self.GBDT.apply(X_gbdt)[:, :, 0]
        self.ENC.fit(tree_feature)

        X = self.ENC.transform(self.GBDT.apply(X_lr)[:, :, 0])
        y = Y_lr
        return self.LR.fit(X, y)


    def predict(self,X):
        X = self.ENC.transform(self.GBDT.apply(X)[:, :, 0])
        return self.LR.predict(X)


    def predict_proba(self, X):
        X = self.ENC.transform(self.GBDT.apply(X)[:, :, 0])
        return self.LR.predict_proba(X)

    def predict_log_proba(self,X):
        X = self.ENC.transform(self.GBDT.apply(X)[:, :, 0])
        return self.LR.predict_log_proba(X)

In [141]:
params = {
        # 这些是GBDT的超参数
        'n_estimators': 300, 'max_depth': 7,
        'min_samples_leaf': 45, 'max_leaf_nodes': 4,
        'subsample': 0.8, 'learning_rate': 0.1,
        # 这些是LR的超参数
        'max_iter': 2770, 'C': 0.8,
        # random_state是公共参数
        'random_state': 1234
    }

In [142]:
model = GBDTLR(**params)
print(model)

model.fit(train_df, train_y)
val_pred_proba = model.predict_proba(val_df)
test_pred_proba = model.predict_proba(test_df)

GBDTLR(C=0.8, learning_rate=0.1, max_depth=7, max_iter=2770, max_leaf_nodes=4,
       min_samples_leaf=45, n_estimators=300, random_state=1234, subsample=0.8)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [143]:
roc_auc = metrics.roc_auc_score(np.array(val_y), np.array(val_pred_proba)[:,1])
logloss = metrics.log_loss(val_y, val_pred_proba.astype('float64'))
ndcg_val = ndcg_score(np.expand_dims(np.array(val_y).T[0], axis=0), np.expand_dims(np.array(val_pred_proba)[:,1], axis=0), k=5)
print("validation roc_auc:",roc_auc)
print('validation log_loss scores:', logloss)
print('validation NDCG@5 scores:', ndcg_val)

roc_auc = metrics.roc_auc_score(np.array(test_y), np.array(test_pred_proba)[:,1])
logloss = metrics.log_loss(test_y, test_pred_proba.astype('float64'))
ndcg_test = ndcg_score(np.expand_dims(np.array(test_y).T[0], axis=0), np.expand_dims(np.array(test_pred_proba)[:,1], axis=0), k=5)
print("\ntest roc_auc:",roc_auc)
print('test log_loss scores:', logloss)
print('test NDCG@5 scores:', ndcg_test)

validation roc_auc: 0.9914227350528894
validation log_loss scores: 0.08460939214818365
validation NDCG@5 scores: 0.9999999999999999

test roc_auc: 0.9857000626292651
test log_loss scores: 0.10890890146234473
test NDCG@5 scores: 0.9999999999999999
